1
//! Unicode script detection and language identification for text shaping
2
//!
3
// Taken from: https://github.com/greyblake/whatlang-rs/blob/master/src/scripts/detect.rs
4
//
5
// See: https://github.com/greyblake/whatlang-rs/pull/67
6

            
7
// License:
8
//
9
// (The MIT License)
10
//
11
// Copyright (c) 2017 Sergey Potapov <blake131313@gmail.com>
12
// Copyright (c) 2014 Titus Wormer <tituswormer@gmail.com>
13
// Copyright (c) 2008 Kent S Johnson
14
// Copyright (c) 2006 Jacob R Rideout <kde@jacobrideout.net>
15
// Copyright (c) 2004 Maciej Ceglowski
16
//
17
// Permission is hereby granted, free of charge, to any person obtaining
18
// a copy of this software and associated documentation files (the
19
// 'Software'), to deal in the Software without restriction, including
20
// without limitation the rights to use, copy, modify, merge, publish,
21
// distribute, sublicense, and/or sell copies of the Software, and to
22
// permit persons to whom the Software is furnished to do so, subject to
23
// the following conditions:
24
//
25
// The above copyright notice and this permission notice shall be
26
// included in all copies or substantial portions of the Software.
27
//
28
// THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
29
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
30
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
31
// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
32
// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
33
// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
34
// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
35

            
36
#[cfg(feature = "text_layout_hyphenation")]
37
use hyphenation::Language as HyphenationLanguage;
38
#[cfg(feature = "text_layout_hyphenation")]
39
pub use hyphenation::Language;
40

            
41
/// Stand-in `Language` enum for builds without the `text_layout_hyphenation` feature.
///
/// When the feature is enabled, `hyphenation::Language` is re-exported under
/// the same name instead. This enum mirrors the variants used by the script
/// and language detection functions in this module.
#[cfg(not(feature = "text_layout_hyphenation"))]
#[allow(dead_code)]
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum Language {
    // ---- Latin script ----
    EnglishUS,
    French,
    German1996,
    Spanish,
    Portuguese,
    Estonian,
    Hungarian,
    Polish,
    Czech,
    Slovak,
    Latvian,
    Lithuanian,
    Romanian,
    Turkish,
    Croatian,
    Icelandic,
    Welsh,
    NorwegianBokmal,
    Swedish,

    // ---- Cyrillic script ----
    Russian,
    Ukrainian,
    Belarusian,
    Bulgarian,
    Macedonian,
    SerbianCyrillic,
    Mongolian,
    SlavonicChurch,

    // ---- Greek script ----
    GreekMono,
    GreekPoly,
    Coptic,

    // ---- Indic scripts ----
    Hindi,
    Bengali,
    Assamese,
    Marathi,
    Sanskrit,
    Gujarati,
    Panjabi,
    Kannada,
    Malayalam,
    Oriya,
    Tamil,
    Telugu,

    // ---- Everything else ----
    Georgian,
    Ethiopic,
    Thai,
    Chinese,
}
99

            
100
use rust_fontconfig::UnicodeRange;
101

            
102
/// Writing systems distinguished by the detection functions in this module.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Script {
    // NOTE: the variants must stay in alphabetical order (for C bindings).
    Arabic,
    Bengali,
    Cyrillic,
    Devanagari,
    Ethiopic,
    Georgian,
    Greek,
    Gujarati,
    Gurmukhi,
    Hangul,
    Hebrew,
    Hiragana,
    Kannada,
    Katakana,
    Khmer,
    Latin,
    Malayalam,
    Mandarin,
    Myanmar,
    Oriya,
    Sinhala,
    Tamil,
    Telugu,
    Thai,
}
130

            
131
impl Script {
132
    /// Maps a Script to a vector of its representative Unicode character ranges.
133
    ///
134
    /// The ranges are extracted from the `is_*` functions in the provided source code.
135
    pub fn get_unicode_ranges(&self) -> Vec<UnicodeRange> {
136
        match self {
137
            Script::Arabic => vec![
138
                UnicodeRange {
139
                    start: 0x0600,
140
                    end: 0x06FF,
141
                },
142
                UnicodeRange {
143
                    start: 0x0750,
144
                    end: 0x07FF,
145
                },
146
                UnicodeRange {
147
                    start: 0x08A0,
148
                    end: 0x08FF,
149
                },
150
                UnicodeRange {
151
                    start: 0xFB50,
152
                    end: 0xFDFF,
153
                },
154
                UnicodeRange {
155
                    start: 0xFE70,
156
                    end: 0xFEFF,
157
                },
158
                UnicodeRange {
159
                    start: 0x10E60,
160
                    end: 0x10E7F,
161
                },
162
                UnicodeRange {
163
                    start: 0x1EE00,
164
                    end: 0x1EEFF,
165
                },
166
            ],
167
            Script::Bengali => vec![UnicodeRange {
168
                start: 0x0980,
169
                end: 0x09FF,
170
            }],
171
            Script::Cyrillic => vec![
172
                UnicodeRange {
173
                    start: 0x0400,
174
                    end: 0x0484,
175
                },
176
                UnicodeRange {
177
                    start: 0x0487,
178
                    end: 0x052F,
179
                },
180
                UnicodeRange {
181
                    start: 0x2DE0,
182
                    end: 0x2DFF,
183
                },
184
                UnicodeRange {
185
                    start: 0xA640,
186
                    end: 0xA69D,
187
                },
188
                UnicodeRange {
189
                    start: 0x1D2B,
190
                    end: 0x1D2B,
191
                },
192
                UnicodeRange {
193
                    start: 0x1D78,
194
                    end: 0x1D78,
195
                },
196
                UnicodeRange {
197
                    start: 0xA69F,
198
                    end: 0xA69F,
199
                },
200
            ],
201
            Script::Devanagari => vec![
202
                UnicodeRange {
203
                    start: 0x0900,
204
                    end: 0x097F,
205
                },
206
                UnicodeRange {
207
                    start: 0xA8E0,
208
                    end: 0xA8FF,
209
                },
210
                UnicodeRange {
211
                    start: 0x1CD0,
212
                    end: 0x1CFF,
213
                },
214
            ],
215
            Script::Ethiopic => vec![
216
                UnicodeRange {
217
                    start: 0x1200,
218
                    end: 0x139F,
219
                },
220
                UnicodeRange {
221
                    start: 0x2D80,
222
                    end: 0x2DDF,
223
                },
224
                UnicodeRange {
225
                    start: 0xAB00,
226
                    end: 0xAB2F,
227
                },
228
            ],
229
            Script::Georgian => vec![UnicodeRange {
230
                start: 0x10A0,
231
                end: 0x10FF,
232
            }],
233
            Script::Greek => vec![UnicodeRange {
234
                start: 0x0370,
235
                end: 0x03FF,
236
            }],
237
            Script::Gujarati => vec![UnicodeRange {
238
                start: 0x0A80,
239
                end: 0x0AFF,
240
            }],
241
            Script::Gurmukhi => vec![UnicodeRange {
242
                start: 0x0A00,
243
                end: 0x0A7F,
244
            }],
245
            Script::Hangul => vec![
246
                UnicodeRange {
247
                    start: 0xAC00,
248
                    end: 0xD7AF,
249
                },
250
                UnicodeRange {
251
                    start: 0x1100,
252
                    end: 0x11FF,
253
                },
254
                UnicodeRange {
255
                    start: 0x3130,
256
                    end: 0x318F,
257
                },
258
                UnicodeRange {
259
                    start: 0x3200,
260
                    end: 0x32FF,
261
                },
262
                UnicodeRange {
263
                    start: 0xA960,
264
                    end: 0xA97F,
265
                },
266
                UnicodeRange {
267
                    start: 0xD7B0,
268
                    end: 0xD7FF,
269
                },
270
                UnicodeRange {
271
                    start: 0xFF00,
272
                    end: 0xFFEF,
273
                },
274
            ],
275
            Script::Hebrew => vec![UnicodeRange {
276
                start: 0x0590,
277
                end: 0x05FF,
278
            }],
279
            Script::Hiragana => vec![UnicodeRange {
280
                start: 0x3040,
281
                end: 0x309F,
282
            }],
283
            Script::Kannada => vec![UnicodeRange {
284
                start: 0x0C80,
285
                end: 0x0CFF,
286
            }],
287
            Script::Katakana => vec![UnicodeRange {
288
                start: 0x30A0,
289
                end: 0x30FF,
290
            }],
291
            Script::Khmer => vec![
292
                UnicodeRange {
293
                    start: 0x1780,
294
                    end: 0x17FF,
295
                },
296
                UnicodeRange {
297
                    start: 0x19E0,
298
                    end: 0x19FF,
299
                },
300
            ],
301
            Script::Latin => vec![
302
                UnicodeRange {
303
                    start: 0x0041,
304
                    end: 0x005A,
305
                }, // A-Z
306
                UnicodeRange {
307
                    start: 0x0061,
308
                    end: 0x007A,
309
                }, // a-z
310
                UnicodeRange {
311
                    start: 0x0080,
312
                    end: 0x00FF,
313
                },
314
                UnicodeRange {
315
                    start: 0x0100,
316
                    end: 0x017F,
317
                },
318
                UnicodeRange {
319
                    start: 0x0180,
320
                    end: 0x024F,
321
                },
322
                UnicodeRange {
323
                    start: 0x0250,
324
                    end: 0x02AF,
325
                },
326
                UnicodeRange {
327
                    start: 0x1D00,
328
                    end: 0x1D7F,
329
                },
330
                UnicodeRange {
331
                    start: 0x1D80,
332
                    end: 0x1DBF,
333
                },
334
                UnicodeRange {
335
                    start: 0x1E00,
336
                    end: 0x1EFF,
337
                },
338
                UnicodeRange {
339
                    start: 0x2100,
340
                    end: 0x214F,
341
                },
342
                UnicodeRange {
343
                    start: 0x2C60,
344
                    end: 0x2C7F,
345
                },
346
                UnicodeRange {
347
                    start: 0xA720,
348
                    end: 0xA7FF,
349
                },
350
                UnicodeRange {
351
                    start: 0xAB30,
352
                    end: 0xAB6F,
353
                },
354
            ],
355
            Script::Malayalam => vec![UnicodeRange {
356
                start: 0x0D00,
357
                end: 0x0D7F,
358
            }],
359
            Script::Mandarin => vec![
360
                UnicodeRange {
361
                    start: 0x2E80,
362
                    end: 0x2E99,
363
                },
364
                UnicodeRange {
365
                    start: 0x2E9B,
366
                    end: 0x2EF3,
367
                },
368
                UnicodeRange {
369
                    start: 0x2F00,
370
                    end: 0x2FD5,
371
                },
372
                UnicodeRange {
373
                    start: 0x3005,
374
                    end: 0x3005,
375
                },
376
                UnicodeRange {
377
                    start: 0x3007,
378
                    end: 0x3007,
379
                },
380
                UnicodeRange {
381
                    start: 0x3021,
382
                    end: 0x3029,
383
                },
384
                UnicodeRange {
385
                    start: 0x3038,
386
                    end: 0x303B,
387
                },
388
                UnicodeRange {
389
                    start: 0x3400,
390
                    end: 0x4DB5,
391
                },
392
                UnicodeRange {
393
                    start: 0x4E00,
394
                    end: 0x9FCC,
395
                },
396
                UnicodeRange {
397
                    start: 0xF900,
398
                    end: 0xFA6D,
399
                },
400
                UnicodeRange {
401
                    start: 0xFA70,
402
                    end: 0xFAD9,
403
                },
404
            ],
405
            Script::Myanmar => vec![UnicodeRange {
406
                start: 0x1000,
407
                end: 0x109F,
408
            }],
409
            Script::Oriya => vec![UnicodeRange {
410
                start: 0x0B00,
411
                end: 0x0B7F,
412
            }],
413
            Script::Sinhala => vec![UnicodeRange {
414
                start: 0x0D80,
415
                end: 0x0DFF,
416
            }],
417
            Script::Tamil => vec![UnicodeRange {
418
                start: 0x0B80,
419
                end: 0x0BFF,
420
            }],
421
            Script::Telugu => vec![UnicodeRange {
422
                start: 0x0C00,
423
                end: 0x0C7F,
424
            }],
425
            Script::Thai => vec![UnicodeRange {
426
                start: 0x0E00,
427
                end: 0x0E7F,
428
            }],
429
        }
430
    }
431
}
432

            
433
/// Returns `true` if `ch` is a stop character: any character up to and
/// including `~` (U+007E) that is not an ASCII letter, i.e. ASCII control
/// characters, space, digits and punctuation.
///
/// A stop character does not give any value for script or language detection.
#[inline]
pub fn is_stop_char(ch: char) -> bool {
    ch <= '\u{007E}' && !ch.is_ascii_alphabetic()
}
440

            
441
/// Bookkeeping entry used while detecting scripts: the script, the predicate
/// that tells whether a character belongs to it, and a running count of the
/// characters matched so far.
type ScriptCounter = (Script, fn(char) -> bool, usize);
442

            
443
/// Detect only a script by a given text
444
7700
pub fn detect_script(text: &str) -> Option<Script> {
445
7700
    let mut script_counters: [ScriptCounter; 24] = [
446
7700
        (Script::Latin, is_latin, 0),
447
7700
        (Script::Cyrillic, is_cyrillic, 0),
448
7700
        (Script::Arabic, is_arabic, 0),
449
7700
        (Script::Mandarin, is_mandarin, 0),
450
7700
        (Script::Devanagari, is_devanagari, 0),
451
7700
        (Script::Hebrew, is_hebrew, 0),
452
7700
        (Script::Ethiopic, is_ethiopic, 0),
453
7700
        (Script::Georgian, is_georgian, 0),
454
7700
        (Script::Bengali, is_bengali, 0),
455
7700
        (Script::Hangul, is_hangul, 0),
456
7700
        (Script::Hiragana, is_hiragana, 0),
457
7700
        (Script::Katakana, is_katakana, 0),
458
7700
        (Script::Greek, is_greek, 0),
459
7700
        (Script::Kannada, is_kannada, 0),
460
7700
        (Script::Tamil, is_tamil, 0),
461
7700
        (Script::Thai, is_thai, 0),
462
7700
        (Script::Gujarati, is_gujarati, 0),
463
7700
        (Script::Gurmukhi, is_gurmukhi, 0),
464
7700
        (Script::Telugu, is_telugu, 0),
465
7700
        (Script::Malayalam, is_malayalam, 0),
466
7700
        (Script::Oriya, is_oriya, 0),
467
7700
        (Script::Myanmar, is_myanmar, 0),
468
7700
        (Script::Sinhala, is_sinhala, 0),
469
7700
        (Script::Khmer, is_khmer, 0),
470
7700
    ];
471

            
472
7700
    let half = text.chars().count() / 2;
473

            
474
49770
    for ch in text.chars() {
475
49770
        if is_stop_char(ch) {
476
7210
            continue;
477
42560
        }
478

            
479
        // For performance reasons, we need to mutate script_counters by calling
480
        // `swap` function, it would not be possible to do using normal iterator.
481
57855
        for i in 0..script_counters.len() {
482
51135
            let found = {
483
57855
                let (script, check_fn, ref mut count) = script_counters[i];
484
57855
                if check_fn(ch) {
485
41895
                    *count += 1;
486
41895
                    if *count > half {
487
6720
                        return Some(script);
488
35175
                    }
489
35175
                    true
490
                } else {
491
15960
                    false
492
                }
493
            };
494
            // Have to let borrow of count fall out of scope before doing swapping, or we could
495
            // do this above.
496
51135
            if found {
497
                // If script was found, move it closer to the front.
498
                // If the text contains largely 1 or 2 scripts, this will
499
                // cause these scripts to be eventually checked first.
500
35175
                if i > 0 {
501
                    script_counters.swap(i - 1, i);
502
35175
                }
503
35175
                break;
504
15960
            }
505
        }
506
    }
507

            
508
980
    let (script, _, count) = script_counters
509
980
        .iter()
510
980
        .cloned()
511
980
        .max_by_key(|&(_, _, count)| count)
512
980
        .unwrap();
513
980
    if count != 0 {
514
140
        Some(script)
515
    } else {
516
840
        None
517
    }
518
7700
}
519

            
520
/// Returns the script the single character `ch` belongs to.
///
/// The scripts are tried in a fixed order and the first predicate that
/// accepts `ch` decides; `None` is returned if no predicate accepts it.
pub fn detect_char_script(ch: char) -> Option<Script> {
    // The counter slot of each entry is unused here; only script and predicate matter.
    let candidates: [ScriptCounter; 24] = [
        (Script::Latin, is_latin, 0),
        (Script::Cyrillic, is_cyrillic, 0),
        (Script::Arabic, is_arabic, 0),
        (Script::Mandarin, is_mandarin, 0),
        (Script::Devanagari, is_devanagari, 0),
        (Script::Hebrew, is_hebrew, 0),
        (Script::Ethiopic, is_ethiopic, 0),
        (Script::Georgian, is_georgian, 0),
        (Script::Bengali, is_bengali, 0),
        (Script::Hangul, is_hangul, 0),
        (Script::Hiragana, is_hiragana, 0),
        (Script::Katakana, is_katakana, 0),
        (Script::Greek, is_greek, 0),
        (Script::Kannada, is_kannada, 0),
        (Script::Tamil, is_tamil, 0),
        (Script::Thai, is_thai, 0),
        (Script::Gujarati, is_gujarati, 0),
        (Script::Gurmukhi, is_gurmukhi, 0),
        (Script::Telugu, is_telugu, 0),
        (Script::Malayalam, is_malayalam, 0),
        (Script::Oriya, is_oriya, 0),
        (Script::Myanmar, is_myanmar, 0),
        (Script::Sinhala, is_sinhala, 0),
        (Script::Khmer, is_khmer, 0),
    ];

    candidates
        .iter()
        .find(|entry| (entry.1)(ch))
        .map(|entry| entry.0)
}
556

            
557
/// Iterates through the text once and returns as soon as an Assamese-specific character is found.
558
fn detect_bengali_language(text: &str) -> Language {
559
    for c in text.chars() {
560
        // These characters are specific to Assamese in the Bengali script block.
561
        // We can return immediately as this is the highest priority check.
562
        if matches!(c, '\u{09F0}' | '\u{09F1}') {
563
            // ৰ, ৱ
564
            return Language::Assamese;
565
        }
566
    }
567
    // If we finish the loop without finding any Assamese characters, it's Bengali.
568
    Language::Bengali
569
}
570

            
571
fn detect_cyrillic_language(text: &str) -> Language {
572
    for c in text.chars() {
573
        match c {
574
            // Highest priority: Old Cyrillic characters for Slavonic Church. Return immediately.
575
            '\u{0460}'..='\u{047F}' => return Language::SlavonicChurch,
576
            // Set flags for other languages. We don't return yet because a higher-priority
577
            // character (like the one above) could still appear.
578
            'ѓ' | 'ќ' | 'ѕ' => return Language::Macedonian,
579
            'ў' => return Language::Belarusian,
580
            'є' | 'і' | 'ї' | 'ґ' => return Language::Ukrainian,
581
            'ө' | 'ү' | 'һ' => return Language::Mongolian,
582
            'ј' | 'љ' | 'њ' | 'ћ' | 'ђ' | 'џ' => return Language::SerbianCyrillic,
583
            // Bulgarian 'ъ' is also in Russian, but 'щ' is a stronger indicator.
584
            // The logic implies that if either is present, it might be Bulgarian.
585
            'щ' => return Language::Bulgarian,
586
            _ => {}
587
        }
588
    }
589

            
590
    Language::Russian
591
}
592

            
593
fn detect_devanagari_language(text: &str) -> Language {
594
    for c in text.chars() {
595
        match c {
596
            // Marathi has higher priority in the original logic. Return immediately.
597
            '\u{0933}' => return Language::Marathi, // ळ
598
            // Flag for Sanskrit Vedic extensions.
599
            '\u{1CD0}'..='\u{1CFF}' => return Language::Sanskrit,
600
            _ => (),
601
        }
602
    }
603

            
604
    Language::Hindi
605
}
606

            
607
fn detect_greek_language(text: &str) -> Language {
608
    for c in text.chars() {
609
        match c {
610
            // Coptic has higher priority. Return immediately.
611
            '\u{2C80}'..='\u{2CFF}' => return Language::Coptic,
612
            // Flag for Greek Extended (Polytonic) characters.
613
            '\u{1F00}'..='\u{1FFF}' => return Language::GreekPoly,
614
            _ => {}
615
        }
616
    }
617

            
618
    Language::GreekMono
619
}
620

            
621
5495
fn detect_latin_language(text: &str) -> Language {
622
    // Flags for languages checked near the end of the original if-else chain.
623
5495
    let mut has_french_c = false;
624
5495
    let mut has_portuguese_o = false;
625
5495
    let mut has_portuguese_a = false;
626

            
627
61040
    for c in text.chars() {
628
61040
        match c {
629
            // --- Early Return Cases (in order of priority) ---
630
            'ß' => return Language::German1996,
631
            'ő' | 'ű' => return Language::Hungarian,
632
            'ł' => return Language::Polish,
633
            'ř' | 'ů' => return Language::Czech,
634
            'ľ' | 'ĺ' | 'ŕ' => return Language::Slovak,
635
            'ā' | 'ē' | 'ģ' | 'ī' | 'ķ' | 'ļ' | 'ņ' | 'ō' | 'ū' => {
636
                return Language::Latvian
637
            }
638
            'ą' | 'ę' | 'ė' | 'į' | 'ų' => return Language::Lithuanian,
639
            'ă' | 'ș' | 'ț' => return Language::Romanian,
640
            'ğ' | 'ı' | 'ş' => return Language::Turkish,
641
            'đ' => return Language::Croatian, /* Also used in Vietnamese, but Croatian is the */
642
            // original's intent
643
            'þ' | 'ð' => return Language::Icelandic,
644
            'ŵ' | 'ŷ' => return Language::Welsh,
645
            'æ' | 'ø' => return Language::NorwegianBokmal, // And Danish
646
            'å' => return Language::Swedish,               // And Norwegian, Finnish
647
            'ñ' => return Language::Spanish,
648
            'ä' | 'ö' | 'ü' => return Language::German1996,
649

            
650
            // NOTE: 'õ' is used by both Estonian and Portuguese
651
            // Since Estonian is checked first, it takes precedence.
652
            'õ' => has_portuguese_o = true,
653
            'ã' => has_portuguese_a = true,
654

            
655
            // --- Flag-setting Cases ---
656
            'ç' => has_french_c = true, // Also in Portuguese
657
            'á' | 'é' | 'í' | 'ó' | 'ú' => return Language::Spanish,
658

            
659
61040
            _ => (),
660
        }
661
    }
662

            
663
    // decide between portuguese, estonian and french
664

            
665
5495
    if has_french_c && !has_portuguese_o && !has_portuguese_a {
666
        return Language::French;
667
5495
    }
668

            
669
5495
    if has_portuguese_o && !has_french_c && !has_portuguese_a {
670
        return Language::Estonian;
671
5495
    }
672

            
673
5495
    if has_portuguese_o || has_portuguese_a || has_french_c {
674
        return Language::Portuguese;
675
5495
    }
676

            
677
5495
    Language::EnglishUS
678
5495
}
679

            
680
5495
pub fn script_to_language(script: Script, text: &str) -> Language {
681
5495
    match script {
682
        Script::Ethiopic => Language::Ethiopic,
683
        Script::Georgian => Language::Georgian,
684
        Script::Gujarati => Language::Gujarati,
685
        Script::Gurmukhi => Language::Panjabi,
686
        Script::Kannada => Language::Kannada,
687
        Script::Malayalam => Language::Malayalam,
688
        Script::Mandarin => Language::Chinese,
689
        Script::Oriya => Language::Oriya,
690
        Script::Tamil => Language::Tamil,
691
        Script::Telugu => Language::Telugu,
692
        Script::Thai => Language::Thai,
693
        Script::Bengali => detect_bengali_language(text),
694
        Script::Cyrillic => detect_cyrillic_language(text),
695
        Script::Devanagari => detect_devanagari_language(text),
696
        Script::Greek => detect_greek_language(text),
697
5495
        Script::Latin => detect_latin_language(text),
698

            
699
        // not directly matchable
700
        Script::Myanmar => Language::Thai,
701
        Script::Khmer => Language::Thai,
702
        Script::Sinhala => Language::Hindi,
703

            
704
        // no classical hyphenation behaviour
705
        Script::Arabic => Language::Chinese,
706
        Script::Hebrew => Language::Chinese,
707
        Script::Hangul => Language::Chinese,
708
        Script::Hiragana => Language::Chinese,
709
        Script::Katakana => Language::Chinese,
710
    }
711
5495
}
712

            
713
665
/// Returns `true` if `ch` lies in one of the code point ranges this module
/// treats as Cyrillic.
pub fn is_cyrillic(ch: char) -> bool {
    // Inclusive (first, last) bounds; single code points use first == last.
    const RANGES: [(char, char); 7] = [
        ('\u{0400}', '\u{0484}'),
        ('\u{0487}', '\u{052F}'),
        ('\u{2DE0}', '\u{2DFF}'),
        ('\u{A640}', '\u{A69D}'),
        ('\u{1D2B}', '\u{1D2B}'),
        ('\u{1D78}', '\u{1D78}'),
        ('\u{A69F}', '\u{A69F}'),
    ];

    RANGES
        .iter()
        .any(|&(first, last)| first <= ch && ch <= last)
}
724

            
725
/// Returns `true` if `ch` lies in one of the code point ranges this module
/// treats as Latin.
///
/// See: https://en.wikipedia.org/wiki/Latin_script_in_Unicode
pub fn is_latin(ch: char) -> bool {
    // Inclusive (first, last) bounds, most common ranges first.
    const RANGES: [(char, char); 13] = [
        ('a', 'z'),
        ('A', 'Z'),
        ('\u{0080}', '\u{00FF}'),
        ('\u{0100}', '\u{017F}'),
        ('\u{0180}', '\u{024F}'),
        ('\u{0250}', '\u{02AF}'),
        ('\u{1D00}', '\u{1D7F}'),
        ('\u{1D80}', '\u{1DBF}'),
        ('\u{1E00}', '\u{1EFF}'),
        ('\u{2100}', '\u{214F}'),
        ('\u{2C60}', '\u{2C7F}'),
        ('\u{A720}', '\u{A7FF}'),
        ('\u{AB30}', '\u{AB6F}'),
    ];

    RANGES
        .iter()
        .any(|&(first, last)| first <= ch && ch <= last)
}
743

            
744
/// Returns `true` if `ch` lies in one of the code point ranges this module
/// treats as Arabic.
///
/// Based on https://en.wikipedia.org/wiki/Arabic_script_in_Unicode
pub fn is_arabic(ch: char) -> bool {
    // Inclusive (first, last) bounds.
    const RANGES: [(char, char); 7] = [
        ('\u{0600}', '\u{06FF}'),
        ('\u{0750}', '\u{07FF}'),
        ('\u{08A0}', '\u{08FF}'),
        ('\u{FB50}', '\u{FDFF}'),
        ('\u{FE70}', '\u{FEFF}'),
        ('\u{10E60}', '\u{10E7F}'),
        ('\u{1EE00}', '\u{1EEFF}'),
    ];

    RANGES
        .iter()
        .any(|&(first, last)| first <= ch && ch <= last)
}
756

            
757
/// Returns `true` if `ch` lies in one of the code point ranges this module
/// treats as Devanagari.
///
/// Based on https://en.wikipedia.org/wiki/Devanagari#Unicode
pub fn is_devanagari(ch: char) -> bool {
    ('\u{0900}'..='\u{097F}').contains(&ch)
        || ('\u{A8E0}'..='\u{A8FF}').contains(&ch)
        || ('\u{1CD0}'..='\u{1CFF}').contains(&ch)
}
761

            
762
/// Returns `true` if `ch` lies in one of the code point ranges this module
/// treats as Ethiopic.
///
/// Based on https://www.key-shortcut.com/en/writing-systems/ethiopian-script/
pub fn is_ethiopic(ch: char) -> bool {
    ('\u{1200}'..='\u{139F}').contains(&ch)
        || ('\u{2D80}'..='\u{2DDF}').contains(&ch)
        || ('\u{AB00}'..='\u{AB2F}').contains(&ch)
}
766

            
767
/// Returns `true` if `ch` lies in U+0590..=U+05FF, the range this module
/// treats as Hebrew.
///
/// Based on https://en.wikipedia.org/wiki/Hebrew_(Unicode_block)
pub fn is_hebrew(ch: char) -> bool {
    ('\u{0590}'..='\u{05FF}').contains(&ch)
}
771

            
772
665
/// Returns `true` if `ch` lies in U+10A0..=U+10FF, the range this module
/// treats as Georgian.
pub fn is_georgian(ch: char) -> bool {
    ('\u{10A0}'..='\u{10FF}').contains(&ch)
}
775

            
776
665
/// Returns `true` if `ch` lies in one of the code point ranges this module
/// treats as Mandarin (Chinese characters).
pub fn is_mandarin(ch: char) -> bool {
    // Inclusive (first, last) bounds; single code points use first == last.
    const RANGES: [(char, char); 11] = [
        ('\u{2E80}', '\u{2E99}'),
        ('\u{2E9B}', '\u{2EF3}'),
        ('\u{2F00}', '\u{2FD5}'),
        ('\u{3005}', '\u{3005}'),
        ('\u{3007}', '\u{3007}'),
        ('\u{3021}', '\u{3029}'),
        ('\u{3038}', '\u{303B}'),
        ('\u{3400}', '\u{4DB5}'),
        ('\u{4E00}', '\u{9FCC}'),
        ('\u{F900}', '\u{FA6D}'),
        ('\u{FA70}', '\u{FAD9}'),
    ];

    RANGES
        .iter()
        .any(|&(first, last)| first <= ch && ch <= last)
}
791

            
792
665
/// Returns `true` if `ch` lies in U+0980..=U+09FF, the range this module
/// treats as Bengali.
pub fn is_bengali(ch: char) -> bool {
    ('\u{0980}'..='\u{09FF}').contains(&ch)
}
795

            
796
665
/// Returns `true` if `ch` lies in U+3040..=U+309F, the range this module
/// treats as Hiragana.
pub fn is_hiragana(ch: char) -> bool {
    ('\u{3040}'..='\u{309F}').contains(&ch)
}
799

            
800
665
/// Returns `true` if `ch` lies in U+30A0..=U+30FF, the range this module
/// treats as Katakana.
pub fn is_katakana(ch: char) -> bool {
    ('\u{30A0}'..='\u{30FF}').contains(&ch)
}
803

            
804
/// Returns `true` if `ch` lies in one of the code point ranges this module
/// treats as Hangul, the Korean alphabet.
///
/// Unicode ranges are taken from: https://en.wikipedia.org/wiki/Hangul
pub fn is_hangul(ch: char) -> bool {
    // Inclusive (first, last) bounds.
    const RANGES: [(char, char); 7] = [
        ('\u{AC00}', '\u{D7AF}'),
        ('\u{1100}', '\u{11FF}'),
        ('\u{3130}', '\u{318F}'),
        ('\u{3200}', '\u{32FF}'),
        ('\u{A960}', '\u{A97F}'),
        ('\u{D7B0}', '\u{D7FF}'),
        ('\u{FF00}', '\u{FFEF}'),
    ];

    RANGES
        .iter()
        .any(|&(first, last)| first <= ch && ch <= last)
}
816

            
817
/// Returns `true` if `ch` lies in U+0370..=U+03FF, the range this module
/// treats as Greek.
///
/// Taken from: https://en.wikipedia.org/wiki/Greek_and_Coptic
pub fn is_greek(ch: char) -> bool {
    ('\u{0370}'..='\u{03FF}').contains(&ch)
}
821

            
822
/// Returns `true` if `ch` lies in U+0C80..=U+0CFF, the range this module
/// treats as Kannada.
///
/// Based on: https://en.wikipedia.org/wiki/Kannada_(Unicode_block)
pub fn is_kannada(ch: char) -> bool {
    ('\u{0C80}'..='\u{0CFF}').contains(&ch)
}
826

            
827
/// Returns `true` if `ch` lies in U+0B80..=U+0BFF, the range this module
/// treats as Tamil.
///
/// Based on: https://en.wikipedia.org/wiki/Tamil_(Unicode_block)
pub fn is_tamil(ch: char) -> bool {
    ('\u{0B80}'..='\u{0BFF}').contains(&ch)
}
831

            
832
/// Returns `true` if `ch` lies in U+0E00..=U+0E7F, the range this module
/// treats as Thai.
///
/// Based on: https://en.wikipedia.org/wiki/Thai_(Unicode_block)
pub fn is_thai(ch: char) -> bool {
    ('\u{0E00}'..='\u{0E7F}').contains(&ch)
}
836

            
837
/// Returns `true` if `ch` lies in U+0A80..=U+0AFF, the range this module
/// treats as Gujarati.
///
/// Based on: https://en.wikipedia.org/wiki/Gujarati_(Unicode_block)
pub fn is_gujarati(ch: char) -> bool {
    ('\u{0A80}'..='\u{0AFF}').contains(&ch)
}
841

            
842
/// Returns `true` if `ch` lies in U+0A00..=U+0A7F, the range this module
/// treats as Gurmukhi, the script for the Punjabi language.
///
/// Based on: https://en.wikipedia.org/wiki/Gurmukhi_(Unicode_block)
pub fn is_gurmukhi(ch: char) -> bool {
    ('\u{0A00}'..='\u{0A7F}').contains(&ch)
}
847

            
848
665
/// Returns `true` if `ch` lies in U+0C00..=U+0C7F, the range this module
/// treats as Telugu.
pub fn is_telugu(ch: char) -> bool {
    ('\u{0C00}'..='\u{0C7F}').contains(&ch)
}
851

            
852
/// Returns `true` if `ch` lies in U+0D00..=U+0D7F, the range this module
/// treats as Malayalam.
///
/// Based on: https://en.wikipedia.org/wiki/Malayalam_(Unicode_block)
pub fn is_malayalam(ch: char) -> bool {
    ('\u{0D00}'..='\u{0D7F}').contains(&ch)
}
856

            
857
/// Returns `true` if `ch` lies in U+0B00..=U+0B7F, the range this module
/// treats as Oriya.
///
/// Based on: https://en.wikipedia.org/wiki/Oriya_(Unicode_block)
pub fn is_oriya(ch: char) -> bool {
    ('\u{0B00}'..='\u{0B7F}').contains(&ch)
}
861

            
862
/// Returns `true` if `ch` lies in U+1000..=U+109F, the range this module
/// treats as Myanmar.
///
/// Based on: https://en.wikipedia.org/wiki/Myanmar_(Unicode_block)
pub fn is_myanmar(ch: char) -> bool {
    ('\u{1000}'..='\u{109F}').contains(&ch)
}
866

            
867
/// Returns `true` if `ch` lies in U+0D80..=U+0DFF, the range this module
/// treats as Sinhala.
///
/// Based on: https://en.wikipedia.org/wiki/Sinhala_(Unicode_block)
pub fn is_sinhala(ch: char) -> bool {
    ('\u{0D80}'..='\u{0DFF}').contains(&ch)
}
871

            
872
/// Returns `true` if `ch` lies in U+1780..=U+17FF or U+19E0..=U+19FF, the
/// ranges this module treats as Khmer.
///
/// Based on: https://en.wikipedia.org/wiki/Khmer_alphabet
pub fn is_khmer(ch: char) -> bool {
    ('\u{1780}'..='\u{17FF}').contains(&ch) || ('\u{19E0}'..='\u{19FF}').contains(&ch)
}