Grcov report - script.rs

1

//! Unicode script detection and language identification for text shaping

2

//!

3

// Taken from: https://github.com/greyblake/whatlang-rs/blob/master/src/scripts/detect.rs

4

//

5

// See: https://github.com/greyblake/whatlang-rs/pull/67

6

7

// License:

8

//

9

// (The MIT License)

10

//

11

// Copyright (c) 2017 Sergey Potapov <blake131313@gmail.com>

12

// Copyright (c) 2014 Titus Wormer <tituswormer@gmail.com>

13

// Copyright (c) 2008 Kent S Johnson

14

// Copyright (c) 2006 Jacob R Rideout <kde@jacobrideout.net>

15

// Copyright (c) 2004 Maciej Ceglowski

16

//

17

// Permission is hereby granted, free of charge, to any person obtaining

18

// a copy of this software and associated documentation files (the

19

// 'Software'), to deal in the Software without restriction, including

20

// without limitation the rights to use, copy, modify, merge, publish,

21

// distribute, sublicense, and/or sell copies of the Software, and to

22

// permit persons to whom the Software is furnished to do so, subject to

23

// the following conditions:

24

//

25

// The above copyright notice and this permission notice shall be

26

// included in all copies or substantial portions of the Software.

27

//

28

// THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,

29

// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF

30

// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.

31

// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY

32

// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,

33

// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE

34

// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

35

36

#[cfg(feature = "text_layout_hyphenation")]

37

use hyphenation::Language as HyphenationLanguage;

38

#[cfg(feature = "text_layout_hyphenation")]

39

pub use hyphenation::Language;

40

41

/// Stub `Language` enum for builds without the `text_layout_hyphenation` feature.
///
/// When the feature is enabled, `hyphenation::Language` is re-exported under the same
/// name instead. This stub mirrors the variants used by the script and language
/// detection functions in this module.
#[cfg(not(feature = "text_layout_hyphenation"))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[allow(dead_code)]
pub enum Language {
    // Latin script languages
    EnglishUS,
    French,
    German1996,
    Spanish,
    Portuguese,
    Estonian,
    Hungarian,
    Polish,
    Czech,
    Slovak,
    Latvian,
    Lithuanian,
    Romanian,
    Turkish,
    Croatian,
    Icelandic,
    Welsh,
    NorwegianBokmal,
    Swedish,
    // Cyrillic script languages
    Russian,
    Ukrainian,
    Belarusian,
    Bulgarian,
    Macedonian,
    SerbianCyrillic,
    Mongolian,
    SlavonicChurch,
    // Greek script languages
    GreekMono,
    GreekPoly,
    Coptic,
    // Indic script languages
    Hindi,
    Bengali,
    Assamese,
    Marathi,
    Sanskrit,
    Gujarati,
    Panjabi,
    Kannada,
    Malayalam,
    Oriya,
    Tamil,
    Telugu,
    // Other scripts
    Georgian,
    Ethiopic,
    Thai,
    Chinese,
}

99

100

use rust_fontconfig::UnicodeRange;

101

102

/// Writing systems that the detection functions in this module can report.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub enum Script {
    // Keep this in alphabetic order (for C bindings)
    Arabic,
    Bengali,
    Cyrillic,
    Devanagari,
    Ethiopic,
    Georgian,
    Greek,
    Gujarati,
    Gurmukhi,
    Hangul,
    Hebrew,
    Hiragana,
    Kannada,
    Katakana,
    Khmer,
    Latin,
    Malayalam,
    Mandarin,
    Myanmar,
    Oriya,
    Sinhala,
    Tamil,
    Telugu,
    Thai,
}

130

131

impl Script {

132

    /// Maps a Script to a vector of its representative Unicode character ranges.

133

///

134

    /// The ranges are extracted from the `is_*` functions in the provided source code.

135

    pub fn get_unicode_ranges(&self) -> Vec<UnicodeRange> {

136

        match self {

137

            Script::Arabic => vec![

138

                UnicodeRange {

139

                    start: 0x0600,

140

                    end: 0x06FF,

141

},

142

                UnicodeRange {

143

                    start: 0x0750,

144

                    end: 0x07FF,

145

},

146

                UnicodeRange {

147

                    start: 0x08A0,

148

                    end: 0x08FF,

149

},

150

                UnicodeRange {

151

                    start: 0xFB50,

152

                    end: 0xFDFF,

153

},

154

                UnicodeRange {

155

                    start: 0xFE70,

156

                    end: 0xFEFF,

157

},

158

                UnicodeRange {

159

                    start: 0x10E60,

160

                    end: 0x10E7F,

161

},

162

                UnicodeRange {

163

                    start: 0x1EE00,

164

                    end: 0x1EEFF,

165

},

166

],

167

            Script::Bengali => vec![UnicodeRange {

168

                start: 0x0980,

169

                end: 0x09FF,

170

}],

171

            Script::Cyrillic => vec![

172

                UnicodeRange {

173

                    start: 0x0400,

174

                    end: 0x0484,

175

},

176

                UnicodeRange {

177

                    start: 0x0487,

178

                    end: 0x052F,

179

},

180

                UnicodeRange {

181

                    start: 0x2DE0,

182

                    end: 0x2DFF,

183

},

184

                UnicodeRange {

185

                    start: 0xA640,

186

                    end: 0xA69D,

187

},

188

                UnicodeRange {

189

                    start: 0x1D2B,

190

                    end: 0x1D2B,

191

},

192

                UnicodeRange {

193

                    start: 0x1D78,

194

                    end: 0x1D78,

195

},

196

                UnicodeRange {

197

                    start: 0xA69F,

198

                    end: 0xA69F,

199

},

200

],

201

            Script::Devanagari => vec![

202

                UnicodeRange {

203

                    start: 0x0900,

204

                    end: 0x097F,

205

},

206

                UnicodeRange {

207

                    start: 0xA8E0,

208

                    end: 0xA8FF,

209

},

210

                UnicodeRange {

211

                    start: 0x1CD0,

212

                    end: 0x1CFF,

213

},

214

],

215

            Script::Ethiopic => vec![

216

                UnicodeRange {

217

                    start: 0x1200,

218

                    end: 0x139F,

219

},

220

                UnicodeRange {

221

                    start: 0x2D80,

222

                    end: 0x2DDF,

223

},

224

                UnicodeRange {

225

                    start: 0xAB00,

226

                    end: 0xAB2F,

227

},

228

],

229

            Script::Georgian => vec![UnicodeRange {

230

                start: 0x10A0,

231

                end: 0x10FF,

232

}],

233

            Script::Greek => vec![UnicodeRange {

234

                start: 0x0370,

235

                end: 0x03FF,

236

}],

237

            Script::Gujarati => vec![UnicodeRange {

238

                start: 0x0A80,

239

                end: 0x0AFF,

240

}],

241

            Script::Gurmukhi => vec![UnicodeRange {

242

                start: 0x0A00,

243

                end: 0x0A7F,

244

}],

245

            Script::Hangul => vec![

246

                UnicodeRange {

247

                    start: 0xAC00,

248

                    end: 0xD7AF,

249

},

250

                UnicodeRange {

251

                    start: 0x1100,

252

                    end: 0x11FF,

253

},

254

                UnicodeRange {

255

                    start: 0x3130,

256

                    end: 0x318F,

257

},

258

                UnicodeRange {

259

                    start: 0x3200,

260

                    end: 0x32FF,

261

},

262

                UnicodeRange {

263

                    start: 0xA960,

264

                    end: 0xA97F,

265

},

266

                UnicodeRange {

267

                    start: 0xD7B0,

268

                    end: 0xD7FF,

269

},

270

                UnicodeRange {

271

                    start: 0xFF00,

272

                    end: 0xFFEF,

273

},

274

],

275

            Script::Hebrew => vec![UnicodeRange {

276

                start: 0x0590,

277

                end: 0x05FF,

278

}],

279

            Script::Hiragana => vec![UnicodeRange {

280

                start: 0x3040,

281

                end: 0x309F,

282

}],

283

            Script::Kannada => vec![UnicodeRange {

284

                start: 0x0C80,

285

                end: 0x0CFF,

286

}],

287

            Script::Katakana => vec![UnicodeRange {

288

                start: 0x30A0,

289

                end: 0x30FF,

290

}],

291

            Script::Khmer => vec![

292

                UnicodeRange {

293

                    start: 0x1780,

294

                    end: 0x17FF,

295

},

296

                UnicodeRange {

297

                    start: 0x19E0,

298

                    end: 0x19FF,

299

},

300

],

301

            Script::Latin => vec![

302

                UnicodeRange {

303

                    start: 0x0041,

304

                    end: 0x005A,

305

                }, // A-Z

306

                UnicodeRange {

307

                    start: 0x0061,

308

                    end: 0x007A,

309

                }, // a-z

310

                UnicodeRange {

311

                    start: 0x0080,

312

                    end: 0x00FF,

313

},

314

                UnicodeRange {

315

                    start: 0x0100,

316

                    end: 0x017F,

317

},

318

                UnicodeRange {

319

                    start: 0x0180,

320

                    end: 0x024F,

321

},

322

                UnicodeRange {

323

                    start: 0x0250,

324

                    end: 0x02AF,

325

},

326

                UnicodeRange {

327

                    start: 0x1D00,

328

                    end: 0x1D7F,

329

},

330

                UnicodeRange {

331

                    start: 0x1D80,

332

                    end: 0x1DBF,

333

},

334

                UnicodeRange {

335

                    start: 0x1E00,

336

                    end: 0x1EFF,

337

},

338

                UnicodeRange {

339

                    start: 0x2100,

340

                    end: 0x214F,

341

},

342

                UnicodeRange {

343

                    start: 0x2C60,

344

                    end: 0x2C7F,

345

},

346

                UnicodeRange {

347

                    start: 0xA720,

348

                    end: 0xA7FF,

349

},

350

                UnicodeRange {

351

                    start: 0xAB30,

352

                    end: 0xAB6F,

353

},

354

],

355

            Script::Malayalam => vec![UnicodeRange {

356

                start: 0x0D00,

357

                end: 0x0D7F,

358

}],

359

            Script::Mandarin => vec![

360

                UnicodeRange {

361

                    start: 0x2E80,

362

                    end: 0x2E99,

363

},

364

                UnicodeRange {

365

                    start: 0x2E9B,

366

                    end: 0x2EF3,

367

},

368

                UnicodeRange {

369

                    start: 0x2F00,

370

                    end: 0x2FD5,

371

},

372

                UnicodeRange {

373

                    start: 0x3005,

374

                    end: 0x3005,

375

},

376

                UnicodeRange {

377

                    start: 0x3007,

378

                    end: 0x3007,

379

},

380

                UnicodeRange {

381

                    start: 0x3021,

382

                    end: 0x3029,

383

},

384

                UnicodeRange {

385

                    start: 0x3038,

386

                    end: 0x303B,

387

},

388

                UnicodeRange {

389

                    start: 0x3400,

390

                    end: 0x4DB5,

391

},

392

                UnicodeRange {

393

                    start: 0x4E00,

394

                    end: 0x9FCC,

395

},

396

                UnicodeRange {

397

                    start: 0xF900,

398

                    end: 0xFA6D,

399

},

400

                UnicodeRange {

401

                    start: 0xFA70,

402

                    end: 0xFAD9,

403

},

404

],

405

            Script::Myanmar => vec![UnicodeRange {

406

                start: 0x1000,

407

                end: 0x109F,

408

}],

409

            Script::Oriya => vec![UnicodeRange {

410

                start: 0x0B00,

411

                end: 0x0B7F,

412

}],

413

            Script::Sinhala => vec![UnicodeRange {

414

                start: 0x0D80,

415

                end: 0x0DFF,

416

}],

417

            Script::Tamil => vec![UnicodeRange {

418

                start: 0x0B80,

419

                end: 0x0BFF,

420

}],

421

            Script::Telugu => vec![UnicodeRange {

422

                start: 0x0C00,

423

                end: 0x0C7F,

424

}],

425

            Script::Thai => vec![UnicodeRange {

426

                start: 0x0E00,

427

                end: 0x0E7F,

428

}],

429

430

431

432

433

/// Is it space, punctuation or digit?
///
/// A stop character is a character that does not give any value for script
/// or language detection. The check covers U+0000..=U+0040, U+005B..=U+0060 and
/// U+007B..=U+007E, i.e. the ASCII range up to `~` minus the letters `A-Z` and `a-z`.
#[inline]
pub fn is_stop_char(ch: char) -> bool {
    matches!(ch, '\u{0000}'..='\u{0040}' | '\u{005B}'..='\u{0060}' | '\u{007B}'..='\u{007E}')
}

440

441

/// One entry of a script detection table: the script, the predicate that recognises
/// its characters, and the number of characters credited to that script so far.
type ScriptCounter = (Script, fn(char) -> bool, usize);

442

443

/// Detect only a script by a given text

444

18040

pub fn detect_script(text: &str) -> Option<Script> {

445

18040

    let mut script_counters: [ScriptCounter; 24] = [

446

18040

        (Script::Latin, is_latin, 0),

447

18040

        (Script::Cyrillic, is_cyrillic, 0),

448

18040

        (Script::Arabic, is_arabic, 0),

449

18040

        (Script::Mandarin, is_mandarin, 0),

450

18040

        (Script::Devanagari, is_devanagari, 0),

451

18040

        (Script::Hebrew, is_hebrew, 0),

452

18040

        (Script::Ethiopic, is_ethiopic, 0),

453

18040

        (Script::Georgian, is_georgian, 0),

454

18040

        (Script::Bengali, is_bengali, 0),

455

18040

        (Script::Hangul, is_hangul, 0),

456

18040

        (Script::Hiragana, is_hiragana, 0),

457

18040

        (Script::Katakana, is_katakana, 0),

458

18040

        (Script::Greek, is_greek, 0),

459

18040

        (Script::Kannada, is_kannada, 0),

460

18040

        (Script::Tamil, is_tamil, 0),

461

18040

        (Script::Thai, is_thai, 0),

462

18040

        (Script::Gujarati, is_gujarati, 0),

463

18040

        (Script::Gurmukhi, is_gurmukhi, 0),

464

18040

        (Script::Telugu, is_telugu, 0),

465

18040

        (Script::Malayalam, is_malayalam, 0),

466

18040

        (Script::Oriya, is_oriya, 0),

467

18040

        (Script::Myanmar, is_myanmar, 0),

468

18040

        (Script::Sinhala, is_sinhala, 0),

469

18040

        (Script::Khmer, is_khmer, 0),

470

18040

];

471

472

18040

    let half = text.chars().count() / 2;

473

474

121308

    for ch in text.chars() {

475

121308

        if is_stop_char(ch) {

476

42372

            continue;

477

78936

478

479

        // For performance reasons, we need to mutate script_counters by calling

480

        // `swap` function, it would not be possible to do using normal iterator.

481

229724

        for i in 0..script_counters.len() {

482

218900

            let found = {

483

229724

                let (script, check_fn, ref mut count) = script_counters[i];

484

229724

                if check_fn(ch) {

485

72380

                    *count += 1;

486

72380

                    if *count > half {

487

10824

                        return Some(script);

488

61556

489

61556

                    true

490

                } else {

491

157344

                    false

492

493

};

494

            // Have to let borrow of count fall out of scope before doing swapping, or we could

495

            // do this above.

496

218900

            if found {

497

                // If script was found, move it closer to the front.

498

                // If the text contains largely 1 or 2 scripts, this will

499

                // cause these scripts to be eventually checked first.

500

61556

                if i > 0 {

501

                    script_counters.swap(i - 1, i);

502

61556

503

61556

                break;

504

157344

505

506

507

508

7216

    let (script, _, count) = script_counters

509

7216

        .iter()

510

7216

        .cloned()

511

7216

        .max_by_key(|&(_, _, count)| count)

512

7216

        .unwrap();

513

7216

    if count != 0 {

514

4796

        Some(script)

515

    } else {

516

2420

        None

517

518

18040

519

520

pub fn detect_char_script(ch: char) -> Option<Script> {

521

    let script_counters: [ScriptCounter; 24] = [

522

        (Script::Latin, is_latin, 0),

523

        (Script::Cyrillic, is_cyrillic, 0),

524

        (Script::Arabic, is_arabic, 0),

525

        (Script::Mandarin, is_mandarin, 0),

526

        (Script::Devanagari, is_devanagari, 0),

527

        (Script::Hebrew, is_hebrew, 0),

528

        (Script::Ethiopic, is_ethiopic, 0),

529

        (Script::Georgian, is_georgian, 0),

530

        (Script::Bengali, is_bengali, 0),

531

        (Script::Hangul, is_hangul, 0),

532

        (Script::Hiragana, is_hiragana, 0),

533

        (Script::Katakana, is_katakana, 0),

534

        (Script::Greek, is_greek, 0),

535

        (Script::Kannada, is_kannada, 0),

536

        (Script::Tamil, is_tamil, 0),

537

        (Script::Thai, is_thai, 0),

538

        (Script::Gujarati, is_gujarati, 0),

539

        (Script::Gurmukhi, is_gurmukhi, 0),

540

        (Script::Telugu, is_telugu, 0),

541

        (Script::Malayalam, is_malayalam, 0),

542

        (Script::Oriya, is_oriya, 0),

543

        (Script::Myanmar, is_myanmar, 0),

544

        (Script::Sinhala, is_sinhala, 0),

545

        (Script::Khmer, is_khmer, 0),

546

];

547

548

    for i in 0..script_counters.len() {

549

        let (script, check_fn, _) = script_counters[i];

550

        if check_fn(ch) {

551

            return Some(script);

552

553

554

    None

555

556

557

/// Iterates through the text once and returns as soon as an Assamese-specific character is found.

558

fn detect_bengali_language(text: &str) -> Language {

559

    for c in text.chars() {

560

        // These characters are specific to Assamese in the Bengali script block.

561

        // We can return immediately as this is the highest priority check.

562

        if matches!(c, '\u{09F0}' | '\u{09F1}') {

563

            // ৰ, ৱ

564

            return Language::Assamese;

565

566

567

    // If we finish the loop without finding any Assamese characters, it's Bengali.

568

    Language::Bengali

569

570

571

fn detect_cyrillic_language(text: &str) -> Language {

572

    for c in text.chars() {

573

        match c {

574

            // Highest priority: Old Cyrillic characters for Slavonic Church. Return immediately.

575

            '\u{0460}'..='\u{047F}' => return Language::SlavonicChurch,

576

            // Set flags for other languages. We don't return yet because a higher-priority

577

            // character (like the one above) could still appear.

578

            'ѓ' | 'ќ' | 'ѕ' => return Language::Macedonian,

579

            'ў' => return Language::Belarusian,

580

            'є' | 'і' | 'ї' | 'ґ' => return Language::Ukrainian,

581

            'ө' | 'ү' | 'һ' => return Language::Mongolian,

582

            'ј' | 'љ' | 'њ' | 'ћ' | 'ђ' | 'џ' => return Language::SerbianCyrillic,

583

            // Bulgarian 'ъ' is also in Russian, but 'щ' is a stronger indicator.

584

            // The logic implies that if either is present, it might be Bulgarian.

585

            'щ' => return Language::Bulgarian,

586

            _ => {}

587

588

589

590

    Language::Russian

591

592

593

fn detect_devanagari_language(text: &str) -> Language {

594

    for c in text.chars() {

595

        match c {

596

            // Marathi has higher priority in the original logic. Return immediately.

597

            '\u{0933}' => return Language::Marathi, // ळ

598

            // Flag for Sanskrit Vedic extensions.

599

            '\u{1CD0}'..='\u{1CFF}' => return Language::Sanskrit,

600

            _ => (),

601

602

603

604

    Language::Hindi

605

606

607

fn detect_greek_language(text: &str) -> Language {

608

    for c in text.chars() {

609

        match c {

610

            // Coptic has higher priority. Return immediately.

611

            '\u{2C80}'..='\u{2CFF}' => return Language::Coptic,

612

            // Flag for Greek Extended (Polytonic) characters.

613

            '\u{1F00}'..='\u{1FFF}' => return Language::GreekPoly,

614

            _ => {}

615

616

617

618

    Language::GreekMono

619

620

621

15224

fn detect_latin_language(text: &str) -> Language {

622

    // Flags for languages checked near the end of the original if-else chain.

623

15224

    let mut has_french_c = false;

624

15224

    let mut has_portuguese_o = false;

625

15224

    let mut has_portuguese_a = false;

626

627

140756

    for c in text.chars() {

628

140756

        match c {

629

            // --- Early Return Cases (in order of priority) ---

630

            'ß' => return Language::German1996,

631

            'ő' | 'ű' => return Language::Hungarian,

632

            'ł' => return Language::Polish,

633

            'ř' | 'ů' => return Language::Czech,

634

            'ľ' | 'ĺ' | 'ŕ' => return Language::Slovak,

635

            'ā' | 'ē' | 'ģ' | 'ī' | 'ķ' | 'ļ' | 'ņ' | 'ō' | 'ū' => {

636

                return Language::Latvian

637

638

            'ą' | 'ę' | 'ė' | 'į' | 'ų' => return Language::Lithuanian,

639

            'ă' | 'ș' | 'ț' => return Language::Romanian,

640

            'ğ' | 'ı' | 'ş' => return Language::Turkish,

641

            'đ' => return Language::Croatian, /* Also used in Vietnamese, but Croatian is the */

642

            // original's intent

643

            'þ' | 'ð' => return Language::Icelandic,

644

            'ŵ' | 'ŷ' => return Language::Welsh,

645

            'æ' | 'ø' => return Language::NorwegianBokmal, // And Danish

646

            'å' => return Language::Swedish,               // And Norwegian, Finnish

647

            'ñ' => return Language::Spanish,

648

            'ä' | 'ö' | 'ü' => return Language::German1996,

649

650

            // NOTE: 'õ' is used by both Estonian and Portuguese

651

            // Since Estonian is checked first, it takes precedence.

652

            'õ' => has_portuguese_o = true,

653

            'ã' => has_portuguese_a = true,

654

655

            // --- Flag-setting Cases ---

656

            'ç' => has_french_c = true, // Also in Portuguese

657

            'á' | 'é' | 'í' | 'ó' | 'ú' => return Language::Spanish,

658

659

140756

            _ => (),

660

661

662

663

    // decide between portuguese, estonian and french

664

665

15224

    if has_french_c && !has_portuguese_o && !has_portuguese_a {

666

        return Language::French;

667

15224

668

669

15224

    if has_portuguese_o && !has_french_c && !has_portuguese_a {

670

        return Language::Estonian;

671

15224

672

673

15224

    if has_portuguese_o || has_portuguese_a || has_french_c {

674

        return Language::Portuguese;

675

15224

676

677

15224

    Language::EnglishUS

678

15224

679

680

15224

pub fn script_to_language(script: Script, text: &str) -> Language {

681

15224

    match script {

682

        Script::Ethiopic => Language::Ethiopic,

683

        Script::Georgian => Language::Georgian,

684

        Script::Gujarati => Language::Gujarati,

685

        Script::Gurmukhi => Language::Panjabi,

686

        Script::Kannada => Language::Kannada,

687

        Script::Malayalam => Language::Malayalam,

688

        Script::Mandarin => Language::Chinese,

689

        Script::Oriya => Language::Oriya,

690

        Script::Tamil => Language::Tamil,

691

        Script::Telugu => Language::Telugu,

692

        Script::Thai => Language::Thai,

693

        Script::Bengali => detect_bengali_language(text),

694

        Script::Cyrillic => detect_cyrillic_language(text),

695

        Script::Devanagari => detect_devanagari_language(text),

696

        Script::Greek => detect_greek_language(text),

697

15224

        Script::Latin => detect_latin_language(text),

698

699

        // not directly matchable

700

        Script::Myanmar => Language::Thai,

701

        Script::Khmer => Language::Thai,

702

        Script::Sinhala => Language::Hindi,

703

704

        // no classical hyphenation behaviour

705

        Script::Arabic => Language::Chinese,

706

        Script::Hebrew => Language::Chinese,

707

        Script::Hangul => Language::Chinese,

708

        Script::Hiragana => Language::Chinese,

709

        Script::Katakana => Language::Chinese,

710

711

15224

712

713

6556

/// Returns `true` if `ch` lies in one of the code point ranges (or is one of the
/// single code points) this module treats as Cyrillic.
pub fn is_cyrillic(ch: char) -> bool {
    matches!(ch,
        '\u{0400}'..='\u{0484}'
        | '\u{0487}'..='\u{052F}'
        | '\u{2DE0}'..='\u{2DFF}'
        | '\u{A640}'..='\u{A69D}'
        | '\u{1D2B}'
        | '\u{1D78}'
        | '\u{A69F}'
    )
}

724

725

/// Returns `true` if `ch` is an ASCII letter or lies in one of the further code point
/// ranges this module treats as Latin.
///
/// See: https://en.wikipedia.org/wiki/Latin_script_in_Unicode
pub fn is_latin(ch: char) -> bool {
    matches!(ch,
        'a'..='z'
        | 'A'..='Z'
        | '\u{0080}'..='\u{00FF}'
        | '\u{0100}'..='\u{017F}'
        | '\u{0180}'..='\u{024F}'
        | '\u{0250}'..='\u{02AF}'
        | '\u{1D00}'..='\u{1D7F}'
        | '\u{1D80}'..='\u{1DBF}'
        | '\u{1E00}'..='\u{1EFF}'
        | '\u{2100}'..='\u{214F}'
        | '\u{2C60}'..='\u{2C7F}'
        | '\u{A720}'..='\u{A7FF}'
        | '\u{AB30}'..='\u{AB6F}'
    )
}

743

744

/// Returns `true` if `ch` lies in one of the code point ranges this module treats as
/// Arabic.
///
/// Based on https://en.wikipedia.org/wiki/Arabic_script_in_Unicode
pub fn is_arabic(ch: char) -> bool {
    matches!(ch,
        '\u{0600}'..='\u{06FF}'
        | '\u{0750}'..='\u{07FF}'
        | '\u{08A0}'..='\u{08FF}'
        | '\u{FB50}'..='\u{FDFF}'
        | '\u{FE70}'..='\u{FEFF}'
        | '\u{10E60}'..='\u{10E7F}'
        | '\u{1EE00}'..='\u{1EEFF}'
    )
}

756

757

/// Returns `true` if `ch` lies in one of the code point ranges this module treats as
/// Devanagari.
///
/// Based on https://en.wikipedia.org/wiki/Devanagari#Unicode
pub fn is_devanagari(ch: char) -> bool {
    matches!(ch, '\u{0900}'..='\u{097F}' | '\u{A8E0}'..='\u{A8FF}' | '\u{1CD0}'..='\u{1CFF}')
}

761

762

/// Returns `true` if `ch` lies in one of the code point ranges this module treats as
/// Ethiopic.
///
/// Based on https://www.key-shortcut.com/en/writing-systems/ethiopian-script/
pub fn is_ethiopic(ch: char) -> bool {
    matches!(ch, '\u{1200}'..='\u{139F}' | '\u{2D80}'..='\u{2DDF}' | '\u{AB00}'..='\u{AB2F}')
}

766

767

/// Returns `true` if `ch` lies in U+0590..=U+05FF, the range this module treats as
/// Hebrew.
///
/// Based on https://en.wikipedia.org/wiki/Hebrew_(Unicode_block)
pub fn is_hebrew(ch: char) -> bool {
    matches!(ch, '\u{0590}'..='\u{05FF}')
}

771

772

6556

/// Returns `true` if `ch` lies in U+10A0..=U+10FF, the range this module treats as
/// Georgian.
pub fn is_georgian(ch: char) -> bool {
    matches!(ch, '\u{10A0}'..='\u{10FF}')
}

775

776

6556

/// Returns `true` if `ch` lies in one of the code point ranges (or is one of the
/// single code points) this module treats as Mandarin.
pub fn is_mandarin(ch: char) -> bool {
    matches!(ch,
        '\u{2E80}'..='\u{2E99}'
        | '\u{2E9B}'..='\u{2EF3}'
        | '\u{2F00}'..='\u{2FD5}'
        | '\u{3005}'
        | '\u{3007}'
        | '\u{3021}'..='\u{3029}'
        | '\u{3038}'..='\u{303B}'
        | '\u{3400}'..='\u{4DB5}'
        | '\u{4E00}'..='\u{9FCC}'
        | '\u{F900}'..='\u{FA6D}'
        | '\u{FA70}'..='\u{FAD9}'
    )
}

791

792

6556

/// Returns `true` if `ch` lies in U+0980..=U+09FF, the range this module treats as
/// Bengali.
pub fn is_bengali(ch: char) -> bool {
    matches!(ch, '\u{0980}'..='\u{09FF}')
}

795

796

6556

/// Returns `true` if `ch` lies in U+3040..=U+309F, the range this module treats as
/// Hiragana.
pub fn is_hiragana(ch: char) -> bool {
    matches!(ch, '\u{3040}'..='\u{309F}')
}

799

800

6556

/// Returns `true` if `ch` lies in U+30A0..=U+30FF, the range this module treats as
/// Katakana.
pub fn is_katakana(ch: char) -> bool {
    matches!(ch, '\u{30A0}'..='\u{30FF}')
}

803

804

/// Returns `true` if `ch` lies in one of the code point ranges this module treats as
/// Hangul.
///
/// Hangul is Korean Alphabet. Unicode ranges are taken from:
/// https://en.wikipedia.org/wiki/Hangul
pub fn is_hangul(ch: char) -> bool {
    matches!(ch,
        '\u{AC00}'..='\u{D7AF}'
        | '\u{1100}'..='\u{11FF}'
        | '\u{3130}'..='\u{318F}'
        | '\u{3200}'..='\u{32FF}'
        | '\u{A960}'..='\u{A97F}'
        | '\u{D7B0}'..='\u{D7FF}'
        | '\u{FF00}'..='\u{FFEF}'
    )
}

816

817

/// Returns `true` if `ch` lies in U+0370..=U+03FF, the range this module treats as
/// Greek.
///
/// Taken from: https://en.wikipedia.org/wiki/Greek_and_Coptic
pub fn is_greek(ch: char) -> bool {
    matches!(ch, '\u{0370}'..='\u{03FF}')
}

821

822

/// Returns `true` if `ch` lies in U+0C80..=U+0CFF, the range this module treats as
/// Kannada.
///
/// Based on: https://en.wikipedia.org/wiki/Kannada_(Unicode_block)
pub fn is_kannada(ch: char) -> bool {
    matches!(ch, '\u{0C80}'..='\u{0CFF}')
}

826

827

/// Returns `true` if `ch` lies in U+0B80..=U+0BFF, the range this module treats as
/// Tamil.
///
/// Based on: https://en.wikipedia.org/wiki/Tamil_(Unicode_block)
pub fn is_tamil(ch: char) -> bool {
    matches!(ch, '\u{0B80}'..='\u{0BFF}')
}

831

832

/// Returns `true` if `ch` lies in U+0E00..=U+0E7F, the range this module treats as
/// Thai.
///
/// Based on: https://en.wikipedia.org/wiki/Thai_(Unicode_block)
pub fn is_thai(ch: char) -> bool {
    matches!(ch, '\u{0E00}'..='\u{0E7F}')
}

836

837

/// Returns `true` if `ch` lies in U+0A80..=U+0AFF, the range this module treats as
/// Gujarati.
///
/// Based on: https://en.wikipedia.org/wiki/Gujarati_(Unicode_block)
pub fn is_gujarati(ch: char) -> bool {
    matches!(ch, '\u{0A80}'..='\u{0AFF}')
}

841

842

/// Returns `true` if `ch` lies in U+0A00..=U+0A7F, the range this module treats as
/// Gurmukhi.
///
/// Gurmukhi is the script for Punjabi language.
/// Based on: https://en.wikipedia.org/wiki/Gurmukhi_(Unicode_block)
pub fn is_gurmukhi(ch: char) -> bool {
    matches!(ch, '\u{0A00}'..='\u{0A7F}')
}

847

848

6556

/// Returns `true` if `ch` lies in U+0C00..=U+0C7F, the range this module treats as
/// Telugu.
pub fn is_telugu(ch: char) -> bool {
    matches!(ch, '\u{0C00}'..='\u{0C7F}')
}

851

852

/// Returns `true` if `ch` lies in U+0D00..=U+0D7F, the range this module treats as
/// Malayalam.
///
/// Based on: https://en.wikipedia.org/wiki/Malayalam_(Unicode_block)
pub fn is_malayalam(ch: char) -> bool {
    matches!(ch, '\u{0D00}'..='\u{0D7F}')
}

856

857

/// Returns `true` if `ch` lies in U+0B00..=U+0B7F, the range this module treats as
/// Oriya.
///
/// Based on: https://en.wikipedia.org/wiki/Oriya_(Unicode_block)
pub fn is_oriya(ch: char) -> bool {
    matches!(ch, '\u{0B00}'..='\u{0B7F}')
}

861

862

/// Returns `true` if `ch` lies in U+1000..=U+109F, the range this module treats as
/// Myanmar.
///
/// Based on: https://en.wikipedia.org/wiki/Myanmar_(Unicode_block)
pub fn is_myanmar(ch: char) -> bool {
    matches!(ch, '\u{1000}'..='\u{109F}')
}

866

867

/// Returns `true` if `ch` lies in U+0D80..=U+0DFF, the range this module treats as
/// Sinhala.
///
/// Based on: https://en.wikipedia.org/wiki/Sinhala_(Unicode_block)
pub fn is_sinhala(ch: char) -> bool {
    matches!(ch, '\u{0D80}'..='\u{0DFF}')
}

871

872

/// Returns `true` if `ch` lies in U+1780..=U+17FF or U+19E0..=U+19FF, the ranges this
/// module treats as Khmer.
///
/// Based on: https://en.wikipedia.org/wiki/Khmer_alphabet
pub fn is_khmer(ch: char) -> bool {
    matches!(ch, '\u{1780}'..='\u{17FF}' | '\u{19E0}'..='\u{19FF}')
}