Grcov report - script.rs

1

//! Unicode script detection and language identification for text shaping

2

//!

3

// Taken from: https://github.com/greyblake/whatlang-rs/blob/master/src/scripts/detect.rs

4

//

5

// See: https://github.com/greyblake/whatlang-rs/pull/67

6

7

// License:

8

//

9

// (The MIT License)

10

//

11

// Copyright (c) 2017 Sergey Potapov <blake131313@gmail.com>

12

// Copyright (c) 2014 Titus Wormer <tituswormer@gmail.com>

13

// Copyright (c) 2008 Kent S Johnson

14

// Copyright (c) 2006 Jacob R Rideout <kde@jacobrideout.net>

15

// Copyright (c) 2004 Maciej Ceglowski

16

//

17

// Permission is hereby granted, free of charge, to any person obtaining

18

// a copy of this software and associated documentation files (the

19

// 'Software'), to deal in the Software without restriction, including

20

// without limitation the rights to use, copy, modify, merge, publish,

21

// distribute, sublicense, and/or sell copies of the Software, and to

22

// permit persons to whom the Software is furnished to do so, subject to

23

// the following conditions:

24

//

25

// The above copyright notice and this permission notice shall be

26

// included in all copies or substantial portions of the Software.

27

//

28

// THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,

29

// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF

30

// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.

31

// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY

32

// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,

33

// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE

34

// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

35

36

#[cfg(feature = "text_layout_hyphenation")]

37

use hyphenation::Language as HyphenationLanguage;

38

#[cfg(feature = "text_layout_hyphenation")]

39

pub use hyphenation::Language;

40

41

/// Stand-in for `hyphenation::Language` when the `text_layout_hyphenation`
/// feature is disabled.
///
/// It lists the variants that the script and language detection functions in
/// this module return, so that code compiles the same way with or without the
/// feature.
#[cfg(not(feature = "text_layout_hyphenation"))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
// NOTE(review): presumably kept because not every variant is constructed in
// every build configuration - confirm before removing.
#[allow(dead_code)]
pub enum Language {
    // Latin script languages
    EnglishUS,
    French,
    German1996,
    Spanish,
    Portuguese,
    Estonian,
    Hungarian,
    Polish,
    Czech,
    Slovak,
    Latvian,
    Lithuanian,
    Romanian,
    Turkish,
    Croatian,
    Icelandic,
    Welsh,
    NorwegianBokmal,
    Swedish,
    // Cyrillic script languages
    Russian,
    Ukrainian,
    Belarusian,
    Bulgarian,
    Macedonian,
    SerbianCyrillic,
    Mongolian,
    SlavonicChurch,
    // Greek script languages
    GreekMono,
    GreekPoly,
    Coptic,
    // Indic script languages
    Hindi,
    Bengali,
    Assamese,
    Marathi,
    Sanskrit,
    Gujarati,
    Panjabi,
    Kannada,
    Malayalam,
    Oriya,
    Tamil,
    Telugu,
    // Other scripts
    Georgian,
    Ethiopic,
    Thai,
    Chinese,
}

99

100

use rust_fontconfig::UnicodeRange;

101

102

/// Writing systems that the detection functions in this module can report.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub enum Script {
    // Keep this in alphabetic order (for C bindings)
    Arabic,
    Bengali,
    Cyrillic,
    Devanagari,
    Ethiopic,
    Georgian,
    Greek,
    Gujarati,
    Gurmukhi,
    Hangul,
    Hebrew,
    Hiragana,
    Kannada,
    Katakana,
    Khmer,
    Latin,
    Malayalam,
    Mandarin,
    Myanmar,
    Oriya,
    Sinhala,
    Tamil,
    Telugu,
    Thai,
}

130

131

impl Script {

132

    /// Maps a Script to a vector of its representative Unicode character ranges.

133

///

134

    /// The ranges are extracted from the `is_*` functions in the provided source code.

135

    pub fn get_unicode_ranges(&self) -> Vec<UnicodeRange> {

136

        match self {

137

            Script::Arabic => vec![

138

                UnicodeRange {

139

                    start: 0x0600,

140

                    end: 0x06FF,

141

},

142

                UnicodeRange {

143

                    start: 0x0750,

144

                    end: 0x07FF,

145

},

146

                UnicodeRange {

147

                    start: 0x08A0,

148

                    end: 0x08FF,

149

},

150

                UnicodeRange {

151

                    start: 0xFB50,

152

                    end: 0xFDFF,

153

},

154

                UnicodeRange {

155

                    start: 0xFE70,

156

                    end: 0xFEFF,

157

},

158

                UnicodeRange {

159

                    start: 0x10E60,

160

                    end: 0x10E7F,

161

},

162

                UnicodeRange {

163

                    start: 0x1EE00,

164

                    end: 0x1EEFF,

165

},

166

],

167

            Script::Bengali => vec![UnicodeRange {

168

                start: 0x0980,

169

                end: 0x09FF,

170

}],

171

            Script::Cyrillic => vec![

172

                UnicodeRange {

173

                    start: 0x0400,

174

                    end: 0x0484,

175

},

176

                UnicodeRange {

177

                    start: 0x0487,

178

                    end: 0x052F,

179

},

180

                UnicodeRange {

181

                    start: 0x2DE0,

182

                    end: 0x2DFF,

183

},

184

                UnicodeRange {

185

                    start: 0xA640,

186

                    end: 0xA69D,

187

},

188

                UnicodeRange {

189

                    start: 0x1D2B,

190

                    end: 0x1D2B,

191

},

192

                UnicodeRange {

193

                    start: 0x1D78,

194

                    end: 0x1D78,

195

},

196

                UnicodeRange {

197

                    start: 0xA69F,

198

                    end: 0xA69F,

199

},

200

],

201

            Script::Devanagari => vec![

202

                UnicodeRange {

203

                    start: 0x0900,

204

                    end: 0x097F,

205

},

206

                UnicodeRange {

207

                    start: 0xA8E0,

208

                    end: 0xA8FF,

209

},

210

                UnicodeRange {

211

                    start: 0x1CD0,

212

                    end: 0x1CFF,

213

},

214

],

215

            Script::Ethiopic => vec![

216

                UnicodeRange {

217

                    start: 0x1200,

218

                    end: 0x139F,

219

},

220

                UnicodeRange {

221

                    start: 0x2D80,

222

                    end: 0x2DDF,

223

},

224

                UnicodeRange {

225

                    start: 0xAB00,

226

                    end: 0xAB2F,

227

},

228

],

229

            Script::Georgian => vec![UnicodeRange {

230

                start: 0x10A0,

231

                end: 0x10FF,

232

}],

233

            Script::Greek => vec![UnicodeRange {

234

                start: 0x0370,

235

                end: 0x03FF,

236

}],

237

            Script::Gujarati => vec![UnicodeRange {

238

                start: 0x0A80,

239

                end: 0x0AFF,

240

}],

241

            Script::Gurmukhi => vec![UnicodeRange {

242

                start: 0x0A00,

243

                end: 0x0A7F,

244

}],

245

            Script::Hangul => vec![

246

                UnicodeRange {

247

                    start: 0xAC00,

248

                    end: 0xD7AF,

249

},

250

                UnicodeRange {

251

                    start: 0x1100,

252

                    end: 0x11FF,

253

},

254

                UnicodeRange {

255

                    start: 0x3130,

256

                    end: 0x318F,

257

},

258

                UnicodeRange {

259

                    start: 0x3200,

260

                    end: 0x32FF,

261

},

262

                UnicodeRange {

263

                    start: 0xA960,

264

                    end: 0xA97F,

265

},

266

                UnicodeRange {

267

                    start: 0xD7B0,

268

                    end: 0xD7FF,

269

},

270

                UnicodeRange {

271

                    start: 0xFF00,

272

                    end: 0xFFEF,

273

},

274

],

275

            Script::Hebrew => vec![UnicodeRange {

276

                start: 0x0590,

277

                end: 0x05FF,

278

}],

279

            Script::Hiragana => vec![UnicodeRange {

280

                start: 0x3040,

281

                end: 0x309F,

282

}],

283

            Script::Kannada => vec![UnicodeRange {

284

                start: 0x0C80,

285

                end: 0x0CFF,

286

}],

287

            Script::Katakana => vec![UnicodeRange {

288

                start: 0x30A0,

289

                end: 0x30FF,

290

}],

291

            Script::Khmer => vec![

292

                UnicodeRange {

293

                    start: 0x1780,

294

                    end: 0x17FF,

295

},

296

                UnicodeRange {

297

                    start: 0x19E0,

298

                    end: 0x19FF,

299

},

300

],

301

            Script::Latin => vec![

302

                UnicodeRange {

303

                    start: 0x0041,

304

                    end: 0x005A,

305

                }, // A-Z

306

                UnicodeRange {

307

                    start: 0x0061,

308

                    end: 0x007A,

309

                }, // a-z

310

                UnicodeRange {

311

                    start: 0x0080,

312

                    end: 0x00FF,

313

},

314

                UnicodeRange {

315

                    start: 0x0100,

316

                    end: 0x017F,

317

},

318

                UnicodeRange {

319

                    start: 0x0180,

320

                    end: 0x024F,

321

},

322

                UnicodeRange {

323

                    start: 0x0250,

324

                    end: 0x02AF,

325

},

326

                UnicodeRange {

327

                    start: 0x1D00,

328

                    end: 0x1D7F,

329

},

330

                UnicodeRange {

331

                    start: 0x1D80,

332

                    end: 0x1DBF,

333

},

334

                UnicodeRange {

335

                    start: 0x1E00,

336

                    end: 0x1EFF,

337

},

338

                UnicodeRange {

339

                    start: 0x2100,

340

                    end: 0x214F,

341

},

342

                UnicodeRange {

343

                    start: 0x2C60,

344

                    end: 0x2C7F,

345

},

346

                UnicodeRange {

347

                    start: 0xA720,

348

                    end: 0xA7FF,

349

},

350

                UnicodeRange {

351

                    start: 0xAB30,

352

                    end: 0xAB6F,

353

},

354

],

355

            Script::Malayalam => vec![UnicodeRange {

356

                start: 0x0D00,

357

                end: 0x0D7F,

358

}],

359

            Script::Mandarin => vec![

360

                UnicodeRange {

361

                    start: 0x2E80,

362

                    end: 0x2E99,

363

},

364

                UnicodeRange {

365

                    start: 0x2E9B,

366

                    end: 0x2EF3,

367

},

368

                UnicodeRange {

369

                    start: 0x2F00,

370

                    end: 0x2FD5,

371

},

372

                UnicodeRange {

373

                    start: 0x3005,

374

                    end: 0x3005,

375

},

376

                UnicodeRange {

377

                    start: 0x3007,

378

                    end: 0x3007,

379

},

380

                UnicodeRange {

381

                    start: 0x3021,

382

                    end: 0x3029,

383

},

384

                UnicodeRange {

385

                    start: 0x3038,

386

                    end: 0x303B,

387

},

388

                UnicodeRange {

389

                    start: 0x3400,

390

                    end: 0x4DB5,

391

},

392

                UnicodeRange {

393

                    start: 0x4E00,

394

                    end: 0x9FCC,

395

},

396

                UnicodeRange {

397

                    start: 0xF900,

398

                    end: 0xFA6D,

399

},

400

                UnicodeRange {

401

                    start: 0xFA70,

402

                    end: 0xFAD9,

403

},

404

],

405

            Script::Myanmar => vec![UnicodeRange {

406

                start: 0x1000,

407

                end: 0x109F,

408

}],

409

            Script::Oriya => vec![UnicodeRange {

410

                start: 0x0B00,

411

                end: 0x0B7F,

412

}],

413

            Script::Sinhala => vec![UnicodeRange {

414

                start: 0x0D80,

415

                end: 0x0DFF,

416

}],

417

            Script::Tamil => vec![UnicodeRange {

418

                start: 0x0B80,

419

                end: 0x0BFF,

420

}],

421

            Script::Telugu => vec![UnicodeRange {

422

                start: 0x0C00,

423

                end: 0x0C7F,

424

}],

425

            Script::Thai => vec![UnicodeRange {

426

                start: 0x0E00,

427

                end: 0x0E7F,

428

}],

429

430

431

432

433

/// Returns `true` if `ch` is a stop character: an ASCII control character,
/// space, digit or ASCII punctuation mark.
///
/// A stop character does not give any value for script or language
/// detection. ASCII letters and everything from U+007F upwards are not stop
/// characters.
#[inline]
pub fn is_stop_char(ch: char) -> bool {
    matches!(ch,
        '\u{0000}'..='\u{0040}'   // controls, space, digits, punctuation up to '@'
        | '\u{005B}'..='\u{0060}' // '[' through '`'
        | '\u{007B}'..='\u{007E}' // '{' through '~'
    )
}

440

441

/// One row of the detection table: a script, the predicate that recognises
/// its characters, and the number of characters attributed to it so far.
type ScriptCounter = (Script, fn(char) -> bool, usize);

442

443

/// Detect only a script by a given text

444

7700

pub fn detect_script(text: &str) -> Option<Script> {

445

7700

    let mut script_counters: [ScriptCounter; 24] = [

446

7700

        (Script::Latin, is_latin, 0),

447

7700

        (Script::Cyrillic, is_cyrillic, 0),

448

7700

        (Script::Arabic, is_arabic, 0),

449

7700

        (Script::Mandarin, is_mandarin, 0),

450

7700

        (Script::Devanagari, is_devanagari, 0),

451

7700

        (Script::Hebrew, is_hebrew, 0),

452

7700

        (Script::Ethiopic, is_ethiopic, 0),

453

7700

        (Script::Georgian, is_georgian, 0),

454

7700

        (Script::Bengali, is_bengali, 0),

455

7700

        (Script::Hangul, is_hangul, 0),

456

7700

        (Script::Hiragana, is_hiragana, 0),

457

7700

        (Script::Katakana, is_katakana, 0),

458

7700

        (Script::Greek, is_greek, 0),

459

7700

        (Script::Kannada, is_kannada, 0),

460

7700

        (Script::Tamil, is_tamil, 0),

461

7700

        (Script::Thai, is_thai, 0),

462

7700

        (Script::Gujarati, is_gujarati, 0),

463

7700

        (Script::Gurmukhi, is_gurmukhi, 0),

464

7700

        (Script::Telugu, is_telugu, 0),

465

7700

        (Script::Malayalam, is_malayalam, 0),

466

7700

        (Script::Oriya, is_oriya, 0),

467

7700

        (Script::Myanmar, is_myanmar, 0),

468

7700

        (Script::Sinhala, is_sinhala, 0),

469

7700

        (Script::Khmer, is_khmer, 0),

470

7700

];

471

472

7700

    let half = text.chars().count() / 2;

473

474

49770

    for ch in text.chars() {

475

49770

        if is_stop_char(ch) {

476

7210

            continue;

477

42560

478

479

        // For performance reasons, we need to mutate script_counters by calling

480

        // `swap` function, it would not be possible to do using normal iterator.

481

57855

        for i in 0..script_counters.len() {

482

51135

            let found = {

483

57855

                let (script, check_fn, ref mut count) = script_counters[i];

484

57855

                if check_fn(ch) {

485

41895

                    *count += 1;

486

41895

                    if *count > half {

487

6720

                        return Some(script);

488

35175

489

35175

                    true

490

                } else {

491

15960

                    false

492

493

};

494

            // Have to let borrow of count fall out of scope before doing swapping, or we could

495

            // do this above.

496

51135

            if found {

497

                // If script was found, move it closer to the front.

498

                // If the text contains largely 1 or 2 scripts, this will

499

                // cause these scripts to be eventually checked first.

500

35175

                if i > 0 {

501

                    script_counters.swap(i - 1, i);

502

35175

503

35175

                break;

504

15960

505

506

507

508

980

    let (script, _, count) = script_counters

509

980

        .iter()

510

980

        .cloned()

511

980

        .max_by_key(|&(_, _, count)| count)

512

980

        .unwrap();

513

980

    if count != 0 {

514

140

        Some(script)

515

    } else {

516

840

        None

517

518

7700

519

520

pub fn detect_char_script(ch: char) -> Option<Script> {

521

    let script_counters: [ScriptCounter; 24] = [

522

        (Script::Latin, is_latin, 0),

523

        (Script::Cyrillic, is_cyrillic, 0),

524

        (Script::Arabic, is_arabic, 0),

525

        (Script::Mandarin, is_mandarin, 0),

526

        (Script::Devanagari, is_devanagari, 0),

527

        (Script::Hebrew, is_hebrew, 0),

528

        (Script::Ethiopic, is_ethiopic, 0),

529

        (Script::Georgian, is_georgian, 0),

530

        (Script::Bengali, is_bengali, 0),

531

        (Script::Hangul, is_hangul, 0),

532

        (Script::Hiragana, is_hiragana, 0),

533

        (Script::Katakana, is_katakana, 0),

534

        (Script::Greek, is_greek, 0),

535

        (Script::Kannada, is_kannada, 0),

536

        (Script::Tamil, is_tamil, 0),

537

        (Script::Thai, is_thai, 0),

538

        (Script::Gujarati, is_gujarati, 0),

539

        (Script::Gurmukhi, is_gurmukhi, 0),

540

        (Script::Telugu, is_telugu, 0),

541

        (Script::Malayalam, is_malayalam, 0),

542

        (Script::Oriya, is_oriya, 0),

543

        (Script::Myanmar, is_myanmar, 0),

544

        (Script::Sinhala, is_sinhala, 0),

545

        (Script::Khmer, is_khmer, 0),

546

];

547

548

    for i in 0..script_counters.len() {

549

        let (script, check_fn, _) = script_counters[i];

550

        if check_fn(ch) {

551

            return Some(script);

552

553

554

    None

555

556

557

/// Iterates through the text once and returns as soon as an Assamese-specific character is found.

558

fn detect_bengali_language(text: &str) -> Language {

559

    for c in text.chars() {

560

        // These characters are specific to Assamese in the Bengali script block.

561

        // We can return immediately as this is the highest priority check.

562

        if matches!(c, '\u{09F0}' | '\u{09F1}') {

563

            // ৰ, ৱ

564

            return Language::Assamese;

565

566

567

    // If we finish the loop without finding any Assamese characters, it's Bengali.

568

    Language::Bengali

569

570

571

fn detect_cyrillic_language(text: &str) -> Language {

572

    for c in text.chars() {

573

        match c {

574

            // Highest priority: Old Cyrillic characters for Slavonic Church. Return immediately.

575

            '\u{0460}'..='\u{047F}' => return Language::SlavonicChurch,

576

            // Set flags for other languages. We don't return yet because a higher-priority

577

            // character (like the one above) could still appear.

578

            'ѓ' | 'ќ' | 'ѕ' => return Language::Macedonian,

579

            'ў' => return Language::Belarusian,

580

            'є' | 'і' | 'ї' | 'ґ' => return Language::Ukrainian,

581

            'ө' | 'ү' | 'һ' => return Language::Mongolian,

582

            'ј' | 'љ' | 'њ' | 'ћ' | 'ђ' | 'џ' => return Language::SerbianCyrillic,

583

            // Bulgarian 'ъ' is also in Russian, but 'щ' is a stronger indicator.

584

            // The logic implies that if either is present, it might be Bulgarian.

585

            'щ' => return Language::Bulgarian,

586

            _ => {}

587

588

589

590

    Language::Russian

591

592

593

fn detect_devanagari_language(text: &str) -> Language {

594

    for c in text.chars() {

595

        match c {

596

            // Marathi has higher priority in the original logic. Return immediately.

597

            '\u{0933}' => return Language::Marathi, // ळ

598

            // Flag for Sanskrit Vedic extensions.

599

            '\u{1CD0}'..='\u{1CFF}' => return Language::Sanskrit,

600

            _ => (),

601

602

603

604

    Language::Hindi

605

606

607

fn detect_greek_language(text: &str) -> Language {

608

    for c in text.chars() {

609

        match c {

610

            // Coptic has higher priority. Return immediately.

611

            '\u{2C80}'..='\u{2CFF}' => return Language::Coptic,

612

            // Flag for Greek Extended (Polytonic) characters.

613

            '\u{1F00}'..='\u{1FFF}' => return Language::GreekPoly,

614

            _ => {}

615

616

617

618

    Language::GreekMono

619

620

621

5495

fn detect_latin_language(text: &str) -> Language {

622

    // Flags for languages checked near the end of the original if-else chain.

623

5495

    let mut has_french_c = false;

624

5495

    let mut has_portuguese_o = false;

625

5495

    let mut has_portuguese_a = false;

626

627

61040

    for c in text.chars() {

628

61040

        match c {

629

            // --- Early Return Cases (in order of priority) ---

630

            'ß' => return Language::German1996,

631

            'ő' | 'ű' => return Language::Hungarian,

632

            'ł' => return Language::Polish,

633

            'ř' | 'ů' => return Language::Czech,

634

            'ľ' | 'ĺ' | 'ŕ' => return Language::Slovak,

635

            'ā' | 'ē' | 'ģ' | 'ī' | 'ķ' | 'ļ' | 'ņ' | 'ō' | 'ū' => {

636

                return Language::Latvian

637

638

            'ą' | 'ę' | 'ė' | 'į' | 'ų' => return Language::Lithuanian,

639

            'ă' | 'ș' | 'ț' => return Language::Romanian,

640

            'ğ' | 'ı' | 'ş' => return Language::Turkish,

641

            'đ' => return Language::Croatian, /* Also used in Vietnamese, but Croatian is the */

642

            // original's intent

643

            'þ' | 'ð' => return Language::Icelandic,

644

            'ŵ' | 'ŷ' => return Language::Welsh,

645

            'æ' | 'ø' => return Language::NorwegianBokmal, // And Danish

646

            'å' => return Language::Swedish,               // And Norwegian, Finnish

647

            'ñ' => return Language::Spanish,

648

            'ä' | 'ö' | 'ü' => return Language::German1996,

649

650

            // NOTE: 'õ' is used by both Estonian and Portuguese

651

            // Since Estonian is checked first, it takes precedence.

652

            'õ' => has_portuguese_o = true,

653

            'ã' => has_portuguese_a = true,

654

655

            // --- Flag-setting Cases ---

656

            'ç' => has_french_c = true, // Also in Portuguese

657

            'á' | 'é' | 'í' | 'ó' | 'ú' => return Language::Spanish,

658

659

61040

            _ => (),

660

661

662

663

    // decide between portuguese, estonian and french

664

665

5495

    if has_french_c && !has_portuguese_o && !has_portuguese_a {

666

        return Language::French;

667

5495

668

669

5495

    if has_portuguese_o && !has_french_c && !has_portuguese_a {

670

        return Language::Estonian;

671

5495

672

673

5495

    if has_portuguese_o || has_portuguese_a || has_french_c {

674

        return Language::Portuguese;

675

5495

676

677

5495

    Language::EnglishUS

678

5495

679

680

5495

pub fn script_to_language(script: Script, text: &str) -> Language {

681

5495

    match script {

682

        Script::Ethiopic => Language::Ethiopic,

683

        Script::Georgian => Language::Georgian,

684

        Script::Gujarati => Language::Gujarati,

685

        Script::Gurmukhi => Language::Panjabi,

686

        Script::Kannada => Language::Kannada,

687

        Script::Malayalam => Language::Malayalam,

688

        Script::Mandarin => Language::Chinese,

689

        Script::Oriya => Language::Oriya,

690

        Script::Tamil => Language::Tamil,

691

        Script::Telugu => Language::Telugu,

692

        Script::Thai => Language::Thai,

693

        Script::Bengali => detect_bengali_language(text),

694

        Script::Cyrillic => detect_cyrillic_language(text),

695

        Script::Devanagari => detect_devanagari_language(text),

696

        Script::Greek => detect_greek_language(text),

697

5495

        Script::Latin => detect_latin_language(text),

698

699

        // not directly matchable

700

        Script::Myanmar => Language::Thai,

701

        Script::Khmer => Language::Thai,

702

        Script::Sinhala => Language::Hindi,

703

704

        // no classical hyphenation behaviour

705

        Script::Arabic => Language::Chinese,

706

        Script::Hebrew => Language::Chinese,

707

        Script::Hangul => Language::Chinese,

708

        Script::Hiragana => Language::Chinese,

709

        Script::Katakana => Language::Chinese,

710

711

5495

712

713

665

/// Returns `true` if `ch` falls in one of the code point ranges this module
/// treats as Cyrillic.
///
/// U+0485 and U+0486 are deliberately not part of the first two ranges.
pub fn is_cyrillic(ch: char) -> bool {
    matches!(ch,
        '\u{0400}'..='\u{0484}'
        | '\u{0487}'..='\u{052F}'
        | '\u{2DE0}'..='\u{2DFF}'
        | '\u{A640}'..='\u{A69D}'
        | '\u{1D2B}'
        | '\u{1D78}'
        | '\u{A69F}'
    )
}

724

725

/// Returns `true` if `ch` is an ASCII letter or falls in one of the further
/// code point ranges this module treats as Latin.
///
/// See <https://en.wikipedia.org/wiki/Latin_script_in_Unicode>.
pub fn is_latin(ch: char) -> bool {
    matches!(ch,
        'a'..='z'
        | 'A'..='Z'
        | '\u{0080}'..='\u{00FF}'
        | '\u{0100}'..='\u{017F}'
        | '\u{0180}'..='\u{024F}'
        | '\u{0250}'..='\u{02AF}'
        | '\u{1D00}'..='\u{1D7F}'
        | '\u{1D80}'..='\u{1DBF}'
        | '\u{1E00}'..='\u{1EFF}'
        | '\u{2100}'..='\u{214F}'
        | '\u{2C60}'..='\u{2C7F}'
        | '\u{A720}'..='\u{A7FF}'
        | '\u{AB30}'..='\u{AB6F}'
    )
}

743

744

/// Returns `true` if `ch` falls in one of the code point ranges this module
/// treats as Arabic.
///
/// Based on <https://en.wikipedia.org/wiki/Arabic_script_in_Unicode>.
pub fn is_arabic(ch: char) -> bool {
    matches!(ch,
        '\u{0600}'..='\u{06FF}'
        | '\u{0750}'..='\u{07FF}'
        | '\u{08A0}'..='\u{08FF}'
        | '\u{FB50}'..='\u{FDFF}'
        | '\u{FE70}'..='\u{FEFF}'
        | '\u{10E60}'..='\u{10E7F}'
        | '\u{1EE00}'..='\u{1EEFF}'
    )
}

756

757

/// Returns `true` if `ch` falls in one of the code point ranges this module
/// treats as Devanagari.
///
/// Based on <https://en.wikipedia.org/wiki/Devanagari#Unicode>.
pub fn is_devanagari(ch: char) -> bool {
    matches!(ch, '\u{0900}'..='\u{097F}' | '\u{A8E0}'..='\u{A8FF}' | '\u{1CD0}'..='\u{1CFF}')
}

761

762

/// Returns `true` if `ch` falls in one of the code point ranges this module
/// treats as Ethiopic.
///
/// Based on <https://www.key-shortcut.com/en/writing-systems/ethiopian-script/>.
pub fn is_ethiopic(ch: char) -> bool {
    matches!(ch, '\u{1200}'..='\u{139F}' | '\u{2D80}'..='\u{2DDF}' | '\u{AB00}'..='\u{AB2F}')
}

766

767

/// Returns `true` if `ch` lies in U+0590..=U+05FF, the range this module
/// treats as Hebrew.
///
/// Based on <https://en.wikipedia.org/wiki/Hebrew_(Unicode_block)>.
pub fn is_hebrew(ch: char) -> bool {
    matches!(ch, '\u{0590}'..='\u{05FF}')
}

771

772

665

/// Returns `true` if `ch` lies in U+10A0..=U+10FF, the range this module
/// treats as Georgian.
pub fn is_georgian(ch: char) -> bool {
    matches!(ch, '\u{10A0}'..='\u{10FF}')
}

775

776

665

/// Returns `true` if `ch` falls in one of the code point ranges this module
/// reports as [`Script::Mandarin`].
pub fn is_mandarin(ch: char) -> bool {
    matches!(ch,
        '\u{2E80}'..='\u{2E99}'
        | '\u{2E9B}'..='\u{2EF3}'
        | '\u{2F00}'..='\u{2FD5}'
        | '\u{3005}'
        | '\u{3007}'
        | '\u{3021}'..='\u{3029}'
        | '\u{3038}'..='\u{303B}'
        | '\u{3400}'..='\u{4DB5}'
        | '\u{4E00}'..='\u{9FCC}'
        | '\u{F900}'..='\u{FA6D}'
        | '\u{FA70}'..='\u{FAD9}'
    )
}

791

792

665

/// Returns `true` if `ch` lies in U+0980..=U+09FF, the range this module
/// treats as Bengali.
pub fn is_bengali(ch: char) -> bool {
    matches!(ch, '\u{0980}'..='\u{09FF}')
}

795

796

665

/// Returns `true` if `ch` lies in U+3040..=U+309F, the range this module
/// treats as Hiragana.
pub fn is_hiragana(ch: char) -> bool {
    matches!(ch, '\u{3040}'..='\u{309F}')
}

799

800

665

/// Returns `true` if `ch` lies in U+30A0..=U+30FF, the range this module
/// treats as Katakana.
pub fn is_katakana(ch: char) -> bool {
    matches!(ch, '\u{30A0}'..='\u{30FF}')
}

803

804

/// Returns `true` if `ch` falls in one of the code point ranges this module
/// treats as Hangul, the Korean alphabet.
///
/// Unicode ranges are taken from <https://en.wikipedia.org/wiki/Hangul>.
pub fn is_hangul(ch: char) -> bool {
    matches!(ch,
        '\u{AC00}'..='\u{D7AF}'
        | '\u{1100}'..='\u{11FF}'
        | '\u{3130}'..='\u{318F}'
        | '\u{3200}'..='\u{32FF}'
        | '\u{A960}'..='\u{A97F}'
        | '\u{D7B0}'..='\u{D7FF}'
        | '\u{FF00}'..='\u{FFEF}'
    )
}

816

817

/// Returns `true` if `ch` lies in U+0370..=U+03FF, the range this module
/// treats as Greek.
///
/// Taken from <https://en.wikipedia.org/wiki/Greek_and_Coptic>.
pub fn is_greek(ch: char) -> bool {
    matches!(ch, '\u{0370}'..='\u{03FF}')
}

821

822

/// Returns `true` if `ch` lies in U+0C80..=U+0CFF, the range this module
/// treats as Kannada.
///
/// Based on <https://en.wikipedia.org/wiki/Kannada_(Unicode_block)>.
pub fn is_kannada(ch: char) -> bool {
    matches!(ch, '\u{0C80}'..='\u{0CFF}')
}

826

827

/// Returns `true` if `ch` lies in U+0B80..=U+0BFF, the range this module
/// treats as Tamil.
///
/// Based on <https://en.wikipedia.org/wiki/Tamil_(Unicode_block)>.
pub fn is_tamil(ch: char) -> bool {
    matches!(ch, '\u{0B80}'..='\u{0BFF}')
}

831

832

/// Returns `true` if `ch` lies in U+0E00..=U+0E7F, the range this module
/// treats as Thai.
///
/// Based on <https://en.wikipedia.org/wiki/Thai_(Unicode_block)>.
pub fn is_thai(ch: char) -> bool {
    matches!(ch, '\u{0E00}'..='\u{0E7F}')
}

836

837

/// Returns `true` if `ch` lies in U+0A80..=U+0AFF, the range this module
/// treats as Gujarati.
///
/// Based on <https://en.wikipedia.org/wiki/Gujarati_(Unicode_block)>.
pub fn is_gujarati(ch: char) -> bool {
    matches!(ch, '\u{0A80}'..='\u{0AFF}')
}

841

842

/// Returns `true` if `ch` lies in U+0A00..=U+0A7F, the range this module
/// treats as Gurmukhi.
///
/// Gurmukhi is the script for the Punjabi language.
/// Based on <https://en.wikipedia.org/wiki/Gurmukhi_(Unicode_block)>.
pub fn is_gurmukhi(ch: char) -> bool {
    matches!(ch, '\u{0A00}'..='\u{0A7F}')
}

847

848

665

/// Returns `true` if `ch` lies in U+0C00..=U+0C7F, the range this module
/// treats as Telugu.
pub fn is_telugu(ch: char) -> bool {
    matches!(ch, '\u{0C00}'..='\u{0C7F}')
}

851

852

/// Returns `true` if `ch` lies in U+0D00..=U+0D7F, the range this module
/// treats as Malayalam.
///
/// Based on <https://en.wikipedia.org/wiki/Malayalam_(Unicode_block)>.
pub fn is_malayalam(ch: char) -> bool {
    matches!(ch, '\u{0D00}'..='\u{0D7F}')
}

856

857

/// Returns `true` if `ch` lies in U+0B00..=U+0B7F, the range this module
/// treats as Oriya.
///
/// Based on <https://en.wikipedia.org/wiki/Oriya_(Unicode_block)>.
pub fn is_oriya(ch: char) -> bool {
    matches!(ch, '\u{0B00}'..='\u{0B7F}')
}

861

862

/// Returns `true` if `ch` lies in U+1000..=U+109F, the range this module
/// treats as Myanmar.
///
/// Based on <https://en.wikipedia.org/wiki/Myanmar_(Unicode_block)>.
pub fn is_myanmar(ch: char) -> bool {
    matches!(ch, '\u{1000}'..='\u{109F}')
}

866

867

/// Returns `true` if `ch` lies in U+0D80..=U+0DFF, the range this module
/// treats as Sinhala.
///
/// Based on <https://en.wikipedia.org/wiki/Sinhala_(Unicode_block)>.
pub fn is_sinhala(ch: char) -> bool {
    matches!(ch, '\u{0D80}'..='\u{0DFF}')
}

871

872

/// Returns `true` if `ch` lies in U+1780..=U+17FF or U+19E0..=U+19FF, the
/// ranges this module treats as Khmer.
///
/// Based on <https://en.wikipedia.org/wiki/Khmer_alphabet>.
pub fn is_khmer(ch: char) -> bool {
    matches!(ch, '\u{1780}'..='\u{17FF}' | '\u{19E0}'..='\u{19FF}')
}