1
//! Unicode script detection and language identification for text shaping
2
//!
3
// Taken from: https://github.com/greyblake/whatlang-rs/blob/master/src/scripts/detect.rs
4
//
5
// See: https://github.com/greyblake/whatlang-rs/pull/67
6

            
7
// License:
8
//
9
// (The MIT License)
10
//
11
// Copyright (c) 2017 Sergey Potapov <blake131313@gmail.com>
12
// Copyright (c) 2014 Titus Wormer <tituswormer@gmail.com>
13
// Copyright (c) 2008 Kent S Johnson
14
// Copyright (c) 2006 Jacob R Rideout <kde@jacobrideout.net>
15
// Copyright (c) 2004 Maciej Ceglowski
16
//
17
// Permission is hereby granted, free of charge, to any person obtaining
18
// a copy of this software and associated documentation files (the
19
// 'Software'), to deal in the Software without restriction, including
20
// without limitation the rights to use, copy, modify, merge, publish,
21
// distribute, sublicense, and/or sell copies of the Software, and to
22
// permit persons to whom the Software is furnished to do so, subject to
23
// the following conditions:
24
//
25
// The above copyright notice and this permission notice shall be
26
// included in all copies or substantial portions of the Software.
27
//
28
// THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
29
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
30
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
31
// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
32
// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
33
// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
34
// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
35

            
36
#[cfg(feature = "text_layout_hyphenation")]
37
use hyphenation::Language as HyphenationLanguage;
38
#[cfg(feature = "text_layout_hyphenation")]
39
pub use hyphenation::Language;
40

            
41
/// Stand-in for `hyphenation::Language`, compiled only when the
/// `text_layout_hyphenation` feature is disabled.
///
/// It declares the variants that the script-detection functions in this
/// module return, so those functions still compile without the
/// `hyphenation` crate.
#[cfg(not(feature = "text_layout_hyphenation"))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
// NOTE(review): kept from the original; presumably silences unused-variant
// warnings in some build configurations — confirm it is still needed.
#[allow(dead_code)]
pub enum Language {
    // Latin script languages
    EnglishUS,
    French,
    German1996,
    Spanish,
    Portuguese,
    Estonian,
    Hungarian,
    Polish,
    Czech,
    Slovak,
    Latvian,
    Lithuanian,
    Romanian,
    Turkish,
    Croatian,
    Icelandic,
    Welsh,
    NorwegianBokmal,
    Swedish,
    // Cyrillic script languages
    Russian,
    Ukrainian,
    Belarusian,
    Bulgarian,
    Macedonian,
    SerbianCyrillic,
    Mongolian,
    SlavonicChurch,
    // Greek script languages
    GreekMono,
    GreekPoly,
    Coptic,
    // Indic script languages
    Hindi,
    Bengali,
    Assamese,
    Marathi,
    Sanskrit,
    Gujarati,
    Panjabi,
    Kannada,
    Malayalam,
    Oriya,
    Tamil,
    Telugu,
    // Other scripts
    Georgian,
    Ethiopic,
    Thai,
    Chinese,
}
99

            
100
use rust_fontconfig::UnicodeRange;
101

            
102
/// Writing systems that the detection functions in this module can tell
/// apart.
///
/// Each variant has a matching `is_*` predicate in this file that decides
/// whether a single character belongs to the script.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub enum Script {
    // Keep this in alphabetic order (for C bindings)
    Arabic,
    Bengali,
    Cyrillic,
    Devanagari,
    Ethiopic,
    Georgian,
    Greek,
    Gujarati,
    Gurmukhi,
    Hangul,
    Hebrew,
    Hiragana,
    Kannada,
    Katakana,
    Khmer,
    Latin,
    Malayalam,
    Mandarin,
    Myanmar,
    Oriya,
    Sinhala,
    Tamil,
    Telugu,
    Thai,
}
130

            
131
impl Script {
132
    /// Maps a Script to a vector of its representative Unicode character ranges.
133
    ///
134
    /// The ranges are extracted from the `is_*` functions in the provided source code.
135
    pub fn get_unicode_ranges(&self) -> Vec<UnicodeRange> {
136
        match self {
137
            Script::Arabic => vec![
138
                UnicodeRange {
139
                    start: 0x0600,
140
                    end: 0x06FF,
141
                },
142
                UnicodeRange {
143
                    start: 0x0750,
144
                    end: 0x07FF,
145
                },
146
                UnicodeRange {
147
                    start: 0x08A0,
148
                    end: 0x08FF,
149
                },
150
                UnicodeRange {
151
                    start: 0xFB50,
152
                    end: 0xFDFF,
153
                },
154
                UnicodeRange {
155
                    start: 0xFE70,
156
                    end: 0xFEFF,
157
                },
158
                UnicodeRange {
159
                    start: 0x10E60,
160
                    end: 0x10E7F,
161
                },
162
                UnicodeRange {
163
                    start: 0x1EE00,
164
                    end: 0x1EEFF,
165
                },
166
            ],
167
            Script::Bengali => vec![UnicodeRange {
168
                start: 0x0980,
169
                end: 0x09FF,
170
            }],
171
            Script::Cyrillic => vec![
172
                UnicodeRange {
173
                    start: 0x0400,
174
                    end: 0x0484,
175
                },
176
                UnicodeRange {
177
                    start: 0x0487,
178
                    end: 0x052F,
179
                },
180
                UnicodeRange {
181
                    start: 0x2DE0,
182
                    end: 0x2DFF,
183
                },
184
                UnicodeRange {
185
                    start: 0xA640,
186
                    end: 0xA69D,
187
                },
188
                UnicodeRange {
189
                    start: 0x1D2B,
190
                    end: 0x1D2B,
191
                },
192
                UnicodeRange {
193
                    start: 0x1D78,
194
                    end: 0x1D78,
195
                },
196
                UnicodeRange {
197
                    start: 0xA69F,
198
                    end: 0xA69F,
199
                },
200
            ],
201
            Script::Devanagari => vec![
202
                UnicodeRange {
203
                    start: 0x0900,
204
                    end: 0x097F,
205
                },
206
                UnicodeRange {
207
                    start: 0xA8E0,
208
                    end: 0xA8FF,
209
                },
210
                UnicodeRange {
211
                    start: 0x1CD0,
212
                    end: 0x1CFF,
213
                },
214
            ],
215
            Script::Ethiopic => vec![
216
                UnicodeRange {
217
                    start: 0x1200,
218
                    end: 0x139F,
219
                },
220
                UnicodeRange {
221
                    start: 0x2D80,
222
                    end: 0x2DDF,
223
                },
224
                UnicodeRange {
225
                    start: 0xAB00,
226
                    end: 0xAB2F,
227
                },
228
            ],
229
            Script::Georgian => vec![UnicodeRange {
230
                start: 0x10A0,
231
                end: 0x10FF,
232
            }],
233
            Script::Greek => vec![UnicodeRange {
234
                start: 0x0370,
235
                end: 0x03FF,
236
            }],
237
            Script::Gujarati => vec![UnicodeRange {
238
                start: 0x0A80,
239
                end: 0x0AFF,
240
            }],
241
            Script::Gurmukhi => vec![UnicodeRange {
242
                start: 0x0A00,
243
                end: 0x0A7F,
244
            }],
245
            Script::Hangul => vec![
246
                UnicodeRange {
247
                    start: 0xAC00,
248
                    end: 0xD7AF,
249
                },
250
                UnicodeRange {
251
                    start: 0x1100,
252
                    end: 0x11FF,
253
                },
254
                UnicodeRange {
255
                    start: 0x3130,
256
                    end: 0x318F,
257
                },
258
                UnicodeRange {
259
                    start: 0x3200,
260
                    end: 0x32FF,
261
                },
262
                UnicodeRange {
263
                    start: 0xA960,
264
                    end: 0xA97F,
265
                },
266
                UnicodeRange {
267
                    start: 0xD7B0,
268
                    end: 0xD7FF,
269
                },
270
                UnicodeRange {
271
                    start: 0xFF00,
272
                    end: 0xFFEF,
273
                },
274
            ],
275
            Script::Hebrew => vec![UnicodeRange {
276
                start: 0x0590,
277
                end: 0x05FF,
278
            }],
279
            Script::Hiragana => vec![UnicodeRange {
280
                start: 0x3040,
281
                end: 0x309F,
282
            }],
283
            Script::Kannada => vec![UnicodeRange {
284
                start: 0x0C80,
285
                end: 0x0CFF,
286
            }],
287
            Script::Katakana => vec![UnicodeRange {
288
                start: 0x30A0,
289
                end: 0x30FF,
290
            }],
291
            Script::Khmer => vec![
292
                UnicodeRange {
293
                    start: 0x1780,
294
                    end: 0x17FF,
295
                },
296
                UnicodeRange {
297
                    start: 0x19E0,
298
                    end: 0x19FF,
299
                },
300
            ],
301
            Script::Latin => vec![
302
                UnicodeRange {
303
                    start: 0x0041,
304
                    end: 0x005A,
305
                }, // A-Z
306
                UnicodeRange {
307
                    start: 0x0061,
308
                    end: 0x007A,
309
                }, // a-z
310
                UnicodeRange {
311
                    start: 0x0080,
312
                    end: 0x00FF,
313
                },
314
                UnicodeRange {
315
                    start: 0x0100,
316
                    end: 0x017F,
317
                },
318
                UnicodeRange {
319
                    start: 0x0180,
320
                    end: 0x024F,
321
                },
322
                UnicodeRange {
323
                    start: 0x0250,
324
                    end: 0x02AF,
325
                },
326
                UnicodeRange {
327
                    start: 0x1D00,
328
                    end: 0x1D7F,
329
                },
330
                UnicodeRange {
331
                    start: 0x1D80,
332
                    end: 0x1DBF,
333
                },
334
                UnicodeRange {
335
                    start: 0x1E00,
336
                    end: 0x1EFF,
337
                },
338
                UnicodeRange {
339
                    start: 0x2100,
340
                    end: 0x214F,
341
                },
342
                UnicodeRange {
343
                    start: 0x2C60,
344
                    end: 0x2C7F,
345
                },
346
                UnicodeRange {
347
                    start: 0xA720,
348
                    end: 0xA7FF,
349
                },
350
                UnicodeRange {
351
                    start: 0xAB30,
352
                    end: 0xAB6F,
353
                },
354
            ],
355
            Script::Malayalam => vec![UnicodeRange {
356
                start: 0x0D00,
357
                end: 0x0D7F,
358
            }],
359
            Script::Mandarin => vec![
360
                UnicodeRange {
361
                    start: 0x2E80,
362
                    end: 0x2E99,
363
                },
364
                UnicodeRange {
365
                    start: 0x2E9B,
366
                    end: 0x2EF3,
367
                },
368
                UnicodeRange {
369
                    start: 0x2F00,
370
                    end: 0x2FD5,
371
                },
372
                UnicodeRange {
373
                    start: 0x3005,
374
                    end: 0x3005,
375
                },
376
                UnicodeRange {
377
                    start: 0x3007,
378
                    end: 0x3007,
379
                },
380
                UnicodeRange {
381
                    start: 0x3021,
382
                    end: 0x3029,
383
                },
384
                UnicodeRange {
385
                    start: 0x3038,
386
                    end: 0x303B,
387
                },
388
                UnicodeRange {
389
                    start: 0x3400,
390
                    end: 0x4DB5,
391
                },
392
                UnicodeRange {
393
                    start: 0x4E00,
394
                    end: 0x9FCC,
395
                },
396
                UnicodeRange {
397
                    start: 0xF900,
398
                    end: 0xFA6D,
399
                },
400
                UnicodeRange {
401
                    start: 0xFA70,
402
                    end: 0xFAD9,
403
                },
404
            ],
405
            Script::Myanmar => vec![UnicodeRange {
406
                start: 0x1000,
407
                end: 0x109F,
408
            }],
409
            Script::Oriya => vec![UnicodeRange {
410
                start: 0x0B00,
411
                end: 0x0B7F,
412
            }],
413
            Script::Sinhala => vec![UnicodeRange {
414
                start: 0x0D80,
415
                end: 0x0DFF,
416
            }],
417
            Script::Tamil => vec![UnicodeRange {
418
                start: 0x0B80,
419
                end: 0x0BFF,
420
            }],
421
            Script::Telugu => vec![UnicodeRange {
422
                start: 0x0C00,
423
                end: 0x0C7F,
424
            }],
425
            Script::Thai => vec![UnicodeRange {
426
                start: 0x0E00,
427
                end: 0x0E7F,
428
            }],
429
        }
430
    }
431
}
432

            
433
/// Reports whether `ch` is a "stop character": an ASCII control character,
/// space, digit or punctuation mark, i.e. a character that gives no value for
/// script or language detection.
///
/// The accepted set is U+0000..=U+0040, U+005B..=U+0060 and U+007B..=U+007E.
/// ASCII letters, DEL (U+007F) and all non-ASCII characters return `false`.
#[inline]
pub fn is_stop_char(ch: char) -> bool {
    // All of ASCII except the letters and DEL.
    ch.is_ascii() && !ch.is_ascii_alphabetic() && ch != '\u{007F}'
}
440

            
441
/// One row of the detection table: the script, the predicate that recognises
/// its characters, and the number of characters credited to it so far.
type ScriptCounter = (Script, fn(char) -> bool, usize);
442

            
443
/// Detect only a script by a given text
444
18040
pub fn detect_script(text: &str) -> Option<Script> {
445
18040
    let mut script_counters: [ScriptCounter; 24] = [
446
18040
        (Script::Latin, is_latin, 0),
447
18040
        (Script::Cyrillic, is_cyrillic, 0),
448
18040
        (Script::Arabic, is_arabic, 0),
449
18040
        (Script::Mandarin, is_mandarin, 0),
450
18040
        (Script::Devanagari, is_devanagari, 0),
451
18040
        (Script::Hebrew, is_hebrew, 0),
452
18040
        (Script::Ethiopic, is_ethiopic, 0),
453
18040
        (Script::Georgian, is_georgian, 0),
454
18040
        (Script::Bengali, is_bengali, 0),
455
18040
        (Script::Hangul, is_hangul, 0),
456
18040
        (Script::Hiragana, is_hiragana, 0),
457
18040
        (Script::Katakana, is_katakana, 0),
458
18040
        (Script::Greek, is_greek, 0),
459
18040
        (Script::Kannada, is_kannada, 0),
460
18040
        (Script::Tamil, is_tamil, 0),
461
18040
        (Script::Thai, is_thai, 0),
462
18040
        (Script::Gujarati, is_gujarati, 0),
463
18040
        (Script::Gurmukhi, is_gurmukhi, 0),
464
18040
        (Script::Telugu, is_telugu, 0),
465
18040
        (Script::Malayalam, is_malayalam, 0),
466
18040
        (Script::Oriya, is_oriya, 0),
467
18040
        (Script::Myanmar, is_myanmar, 0),
468
18040
        (Script::Sinhala, is_sinhala, 0),
469
18040
        (Script::Khmer, is_khmer, 0),
470
18040
    ];
471

            
472
18040
    let half = text.chars().count() / 2;
473

            
474
121308
    for ch in text.chars() {
475
121308
        if is_stop_char(ch) {
476
42372
            continue;
477
78936
        }
478

            
479
        // For performance reasons, we need to mutate script_counters by calling
480
        // `swap` function, it would not be possible to do using normal iterator.
481
229724
        for i in 0..script_counters.len() {
482
218900
            let found = {
483
229724
                let (script, check_fn, ref mut count) = script_counters[i];
484
229724
                if check_fn(ch) {
485
72380
                    *count += 1;
486
72380
                    if *count > half {
487
10824
                        return Some(script);
488
61556
                    }
489
61556
                    true
490
                } else {
491
157344
                    false
492
                }
493
            };
494
            // Have to let borrow of count fall out of scope before doing swapping, or we could
495
            // do this above.
496
218900
            if found {
497
                // If script was found, move it closer to the front.
498
                // If the text contains largely 1 or 2 scripts, this will
499
                // cause these scripts to be eventually checked first.
500
61556
                if i > 0 {
501
                    script_counters.swap(i - 1, i);
502
61556
                }
503
61556
                break;
504
157344
            }
505
        }
506
    }
507

            
508
7216
    let (script, _, count) = script_counters
509
7216
        .iter()
510
7216
        .cloned()
511
7216
        .max_by_key(|&(_, _, count)| count)
512
7216
        .unwrap();
513
7216
    if count != 0 {
514
4796
        Some(script)
515
    } else {
516
2420
        None
517
    }
518
18040
}
519

            
520
/// Returns the script of a single character, or `None` when no predicate in
/// the table accepts it.
///
/// The predicates are tried in the order listed below and the first match
/// wins.
pub fn detect_char_script(ch: char) -> Option<Script> {
    let checks: [(Script, fn(char) -> bool); 24] = [
        (Script::Latin, is_latin),
        (Script::Cyrillic, is_cyrillic),
        (Script::Arabic, is_arabic),
        (Script::Mandarin, is_mandarin),
        (Script::Devanagari, is_devanagari),
        (Script::Hebrew, is_hebrew),
        (Script::Ethiopic, is_ethiopic),
        (Script::Georgian, is_georgian),
        (Script::Bengali, is_bengali),
        (Script::Hangul, is_hangul),
        (Script::Hiragana, is_hiragana),
        (Script::Katakana, is_katakana),
        (Script::Greek, is_greek),
        (Script::Kannada, is_kannada),
        (Script::Tamil, is_tamil),
        (Script::Thai, is_thai),
        (Script::Gujarati, is_gujarati),
        (Script::Gurmukhi, is_gurmukhi),
        (Script::Telugu, is_telugu),
        (Script::Malayalam, is_malayalam),
        (Script::Oriya, is_oriya),
        (Script::Myanmar, is_myanmar),
        (Script::Sinhala, is_sinhala),
        (Script::Khmer, is_khmer),
    ];

    checks
        .iter()
        .find(|&&(_, accepts)| accepts(ch))
        .map(|&(script, _)| script)
}
556

            
557
/// Iterates through the text once and returns as soon as an Assamese-specific character is found.
558
fn detect_bengali_language(text: &str) -> Language {
559
    for c in text.chars() {
560
        // These characters are specific to Assamese in the Bengali script block.
561
        // We can return immediately as this is the highest priority check.
562
        if matches!(c, '\u{09F0}' | '\u{09F1}') {
563
            // ৰ, ৱ
564
            return Language::Assamese;
565
        }
566
    }
567
    // If we finish the loop without finding any Assamese characters, it's Bengali.
568
    Language::Bengali
569
}
570

            
571
fn detect_cyrillic_language(text: &str) -> Language {
572
    for c in text.chars() {
573
        match c {
574
            // Highest priority: Old Cyrillic characters for Slavonic Church. Return immediately.
575
            '\u{0460}'..='\u{047F}' => return Language::SlavonicChurch,
576
            // Set flags for other languages. We don't return yet because a higher-priority
577
            // character (like the one above) could still appear.
578
            'ѓ' | 'ќ' | 'ѕ' => return Language::Macedonian,
579
            'ў' => return Language::Belarusian,
580
            'є' | 'і' | 'ї' | 'ґ' => return Language::Ukrainian,
581
            'ө' | 'ү' | 'һ' => return Language::Mongolian,
582
            'ј' | 'љ' | 'њ' | 'ћ' | 'ђ' | 'џ' => return Language::SerbianCyrillic,
583
            // Bulgarian 'ъ' is also in Russian, but 'щ' is a stronger indicator.
584
            // The logic implies that if either is present, it might be Bulgarian.
585
            'щ' => return Language::Bulgarian,
586
            _ => {}
587
        }
588
    }
589

            
590
    Language::Russian
591
}
592

            
593
fn detect_devanagari_language(text: &str) -> Language {
594
    for c in text.chars() {
595
        match c {
596
            // Marathi has higher priority in the original logic. Return immediately.
597
            '\u{0933}' => return Language::Marathi, // ळ
598
            // Flag for Sanskrit Vedic extensions.
599
            '\u{1CD0}'..='\u{1CFF}' => return Language::Sanskrit,
600
            _ => (),
601
        }
602
    }
603

            
604
    Language::Hindi
605
}
606

            
607
fn detect_greek_language(text: &str) -> Language {
608
    for c in text.chars() {
609
        match c {
610
            // Coptic has higher priority. Return immediately.
611
            '\u{2C80}'..='\u{2CFF}' => return Language::Coptic,
612
            // Flag for Greek Extended (Polytonic) characters.
613
            '\u{1F00}'..='\u{1FFF}' => return Language::GreekPoly,
614
            _ => {}
615
        }
616
    }
617

            
618
    Language::GreekMono
619
}
620

            
621
15224
fn detect_latin_language(text: &str) -> Language {
622
    // Flags for languages checked near the end of the original if-else chain.
623
15224
    let mut has_french_c = false;
624
15224
    let mut has_portuguese_o = false;
625
15224
    let mut has_portuguese_a = false;
626

            
627
140756
    for c in text.chars() {
628
140756
        match c {
629
            // --- Early Return Cases (in order of priority) ---
630
            'ß' => return Language::German1996,
631
            'ő' | 'ű' => return Language::Hungarian,
632
            'ł' => return Language::Polish,
633
            'ř' | 'ů' => return Language::Czech,
634
            'ľ' | 'ĺ' | 'ŕ' => return Language::Slovak,
635
            'ā' | 'ē' | 'ģ' | 'ī' | 'ķ' | 'ļ' | 'ņ' | 'ō' | 'ū' => {
636
                return Language::Latvian
637
            }
638
            'ą' | 'ę' | 'ė' | 'į' | 'ų' => return Language::Lithuanian,
639
            'ă' | 'ș' | 'ț' => return Language::Romanian,
640
            'ğ' | 'ı' | 'ş' => return Language::Turkish,
641
            'đ' => return Language::Croatian, /* Also used in Vietnamese, but Croatian is the */
642
            // original's intent
643
            'þ' | 'ð' => return Language::Icelandic,
644
            'ŵ' | 'ŷ' => return Language::Welsh,
645
            'æ' | 'ø' => return Language::NorwegianBokmal, // And Danish
646
            'å' => return Language::Swedish,               // And Norwegian, Finnish
647
            'ñ' => return Language::Spanish,
648
            'ä' | 'ö' | 'ü' => return Language::German1996,
649

            
650
            // NOTE: 'õ' is used by both Estonian and Portuguese
651
            // Since Estonian is checked first, it takes precedence.
652
            'õ' => has_portuguese_o = true,
653
            'ã' => has_portuguese_a = true,
654

            
655
            // --- Flag-setting Cases ---
656
            'ç' => has_french_c = true, // Also in Portuguese
657
            'á' | 'é' | 'í' | 'ó' | 'ú' => return Language::Spanish,
658

            
659
140756
            _ => (),
660
        }
661
    }
662

            
663
    // decide between portuguese, estonian and french
664

            
665
15224
    if has_french_c && !has_portuguese_o && !has_portuguese_a {
666
        return Language::French;
667
15224
    }
668

            
669
15224
    if has_portuguese_o && !has_french_c && !has_portuguese_a {
670
        return Language::Estonian;
671
15224
    }
672

            
673
15224
    if has_portuguese_o || has_portuguese_a || has_french_c {
674
        return Language::Portuguese;
675
15224
    }
676

            
677
15224
    Language::EnglishUS
678
15224
}
679

            
680
15224
pub fn script_to_language(script: Script, text: &str) -> Language {
681
15224
    match script {
682
        Script::Ethiopic => Language::Ethiopic,
683
        Script::Georgian => Language::Georgian,
684
        Script::Gujarati => Language::Gujarati,
685
        Script::Gurmukhi => Language::Panjabi,
686
        Script::Kannada => Language::Kannada,
687
        Script::Malayalam => Language::Malayalam,
688
        Script::Mandarin => Language::Chinese,
689
        Script::Oriya => Language::Oriya,
690
        Script::Tamil => Language::Tamil,
691
        Script::Telugu => Language::Telugu,
692
        Script::Thai => Language::Thai,
693
        Script::Bengali => detect_bengali_language(text),
694
        Script::Cyrillic => detect_cyrillic_language(text),
695
        Script::Devanagari => detect_devanagari_language(text),
696
        Script::Greek => detect_greek_language(text),
697
15224
        Script::Latin => detect_latin_language(text),
698

            
699
        // not directly matchable
700
        Script::Myanmar => Language::Thai,
701
        Script::Khmer => Language::Thai,
702
        Script::Sinhala => Language::Hindi,
703

            
704
        // no classical hyphenation behaviour
705
        Script::Arabic => Language::Chinese,
706
        Script::Hebrew => Language::Chinese,
707
        Script::Hangul => Language::Chinese,
708
        Script::Hiragana => Language::Chinese,
709
        Script::Katakana => Language::Chinese,
710
    }
711
15224
}
712

            
713
6556
/// Reports whether `ch` lies in one of the Cyrillic ranges used by this
/// module.
///
/// U+0485 and U+0486 (between the first two ranges) and U+A69E are
/// deliberately not included, matching the original range list.
pub fn is_cyrillic(ch: char) -> bool {
    matches!(
        u32::from(ch),
        0x0400..=0x0484
            | 0x0487..=0x052F
            | 0x2DE0..=0x2DFF
            | 0xA640..=0xA69D
            | 0x1D2B
            | 0x1D78
            | 0xA69F
    )
}
724

            
725
/// Reports whether `ch` lies in one of the Latin ranges used by this module:
/// the ASCII letters plus a list of extended-Latin ranges.
///
/// Range list based on <https://en.wikipedia.org/wiki/Latin_script_in_Unicode>.
pub fn is_latin(ch: char) -> bool {
    if ch.is_ascii_alphabetic() {
        return true;
    }

    matches!(
        u32::from(ch),
        // U+0080..U+00FF, U+0100..U+017F, U+0180..U+024F and U+0250..U+02AF
        // are contiguous and therefore tested as one range.
        0x0080..=0x02AF
            // U+1D00..U+1D7F and U+1D80..U+1DBF, likewise contiguous.
            | 0x1D00..=0x1DBF
            | 0x1E00..=0x1EFF
            | 0x2100..=0x214F
            | 0x2C60..=0x2C7F
            | 0xA720..=0xA7FF
            | 0xAB30..=0xAB6F
    )
}
743

            
744
/// Reports whether `ch` lies in one of the Arabic ranges used by this module.
///
/// Range list based on <https://en.wikipedia.org/wiki/Arabic_script_in_Unicode>.
pub fn is_arabic(ch: char) -> bool {
    const RANGES: [(char, char); 7] = [
        ('\u{0600}', '\u{06FF}'),
        ('\u{0750}', '\u{07FF}'),
        ('\u{08A0}', '\u{08FF}'),
        ('\u{FB50}', '\u{FDFF}'),
        ('\u{FE70}', '\u{FEFF}'),
        ('\u{10E60}', '\u{10E7F}'),
        ('\u{1EE00}', '\u{1EEFF}'),
    ];

    RANGES.iter().any(|&(first, last)| first <= ch && ch <= last)
}
756

            
757
/// Reports whether `ch` lies in one of the Devanagari ranges used by this
/// module: U+0900..=U+097F, U+A8E0..=U+A8FF or U+1CD0..=U+1CFF.
///
/// Based on <https://en.wikipedia.org/wiki/Devanagari#Unicode>.
pub fn is_devanagari(ch: char) -> bool {
    matches!(
        u32::from(ch),
        0x0900..=0x097F | 0xA8E0..=0xA8FF | 0x1CD0..=0x1CFF
    )
}
761

            
762
/// Reports whether `ch` lies in one of the Ethiopic ranges used by this
/// module: U+1200..=U+139F, U+2D80..=U+2DDF or U+AB00..=U+AB2F.
///
/// Based on <https://www.key-shortcut.com/en/writing-systems/ethiopian-script/>.
pub fn is_ethiopic(ch: char) -> bool {
    matches!(
        u32::from(ch),
        0x1200..=0x139F | 0x2D80..=0x2DDF | 0xAB00..=0xAB2F
    )
}
766

            
767
/// Reports whether `ch` lies in U+0590..=U+05FF.
///
/// Based on <https://en.wikipedia.org/wiki/Hebrew_(Unicode_block)>.
pub fn is_hebrew(ch: char) -> bool {
    ('\u{0590}'..='\u{05FF}').contains(&ch)
}
771

            
772
6556
/// Reports whether `ch` lies in U+10A0..=U+10FF, the range this module uses
/// for the Georgian script.
pub fn is_georgian(ch: char) -> bool {
    ('\u{10A0}'..='\u{10FF}').contains(&ch)
}
775

            
776
6556
/// Reports whether `ch` lies in one of the ranges this module attributes to
/// `Script::Mandarin`.
///
/// Single code points (U+3005, U+3007) are listed as ranges of length one.
pub fn is_mandarin(ch: char) -> bool {
    const RANGES: [(char, char); 11] = [
        ('\u{2E80}', '\u{2E99}'),
        ('\u{2E9B}', '\u{2EF3}'),
        ('\u{2F00}', '\u{2FD5}'),
        ('\u{3005}', '\u{3005}'),
        ('\u{3007}', '\u{3007}'),
        ('\u{3021}', '\u{3029}'),
        ('\u{3038}', '\u{303B}'),
        ('\u{3400}', '\u{4DB5}'),
        ('\u{4E00}', '\u{9FCC}'),
        ('\u{F900}', '\u{FA6D}'),
        ('\u{FA70}', '\u{FAD9}'),
    ];

    RANGES.iter().any(|&(first, last)| first <= ch && ch <= last)
}
791

            
792
6556
/// Reports whether `ch` lies in U+0980..=U+09FF, the range this module uses
/// for the Bengali script.
pub fn is_bengali(ch: char) -> bool {
    ('\u{0980}'..='\u{09FF}').contains(&ch)
}
795

            
796
6556
/// Reports whether `ch` lies in U+3040..=U+309F, the range this module uses
/// for Hiragana.
pub fn is_hiragana(ch: char) -> bool {
    ('\u{3040}'..='\u{309F}').contains(&ch)
}
799

            
800
6556
/// Reports whether `ch` lies in U+30A0..=U+30FF, the range this module uses
/// for Katakana.
pub fn is_katakana(ch: char) -> bool {
    ('\u{30A0}'..='\u{30FF}').contains(&ch)
}
803

            
804
/// Reports whether `ch` lies in one of the Hangul (Korean alphabet) ranges
/// used by this module.
///
/// Range list taken from <https://en.wikipedia.org/wiki/Hangul>.
pub fn is_hangul(ch: char) -> bool {
    const RANGES: [(char, char); 7] = [
        ('\u{AC00}', '\u{D7AF}'),
        ('\u{1100}', '\u{11FF}'),
        ('\u{3130}', '\u{318F}'),
        ('\u{3200}', '\u{32FF}'),
        ('\u{A960}', '\u{A97F}'),
        ('\u{D7B0}', '\u{D7FF}'),
        ('\u{FF00}', '\u{FFEF}'),
    ];

    RANGES.iter().any(|&(first, last)| first <= ch && ch <= last)
}
816

            
817
/// Reports whether `ch` lies in U+0370..=U+03FF.
///
/// Taken from <https://en.wikipedia.org/wiki/Greek_and_Coptic>.
pub fn is_greek(ch: char) -> bool {
    ('\u{0370}'..='\u{03FF}').contains(&ch)
}
821

            
822
/// Reports whether `ch` lies in U+0C80..=U+0CFF.
///
/// Based on <https://en.wikipedia.org/wiki/Kannada_(Unicode_block)>.
pub fn is_kannada(ch: char) -> bool {
    ('\u{0C80}'..='\u{0CFF}').contains(&ch)
}
826

            
827
/// Reports whether `ch` lies in U+0B80..=U+0BFF.
///
/// Based on <https://en.wikipedia.org/wiki/Tamil_(Unicode_block)>.
pub fn is_tamil(ch: char) -> bool {
    ('\u{0B80}'..='\u{0BFF}').contains(&ch)
}
831

            
832
/// Reports whether `ch` lies in U+0E00..=U+0E7F.
///
/// Based on <https://en.wikipedia.org/wiki/Thai_(Unicode_block)>.
pub fn is_thai(ch: char) -> bool {
    ('\u{0E00}'..='\u{0E7F}').contains(&ch)
}
836

            
837
/// Reports whether `ch` lies in U+0A80..=U+0AFF.
///
/// Based on <https://en.wikipedia.org/wiki/Gujarati_(Unicode_block)>.
pub fn is_gujarati(ch: char) -> bool {
    ('\u{0A80}'..='\u{0AFF}').contains(&ch)
}
841

            
842
/// Reports whether `ch` lies in U+0A00..=U+0A7F. Gurmukhi is the script of
/// the Punjabi language.
///
/// Based on <https://en.wikipedia.org/wiki/Gurmukhi_(Unicode_block)>.
pub fn is_gurmukhi(ch: char) -> bool {
    ('\u{0A00}'..='\u{0A7F}').contains(&ch)
}
847

            
848
6556
/// Reports whether `ch` lies in U+0C00..=U+0C7F, the range this module uses
/// for the Telugu script.
pub fn is_telugu(ch: char) -> bool {
    ('\u{0C00}'..='\u{0C7F}').contains(&ch)
}
851

            
852
/// Reports whether `ch` lies in U+0D00..=U+0D7F.
///
/// Based on <https://en.wikipedia.org/wiki/Malayalam_(Unicode_block)>.
pub fn is_malayalam(ch: char) -> bool {
    ('\u{0D00}'..='\u{0D7F}').contains(&ch)
}
856

            
857
/// Reports whether `ch` lies in U+0B00..=U+0B7F.
///
/// Based on <https://en.wikipedia.org/wiki/Oriya_(Unicode_block)>.
pub fn is_oriya(ch: char) -> bool {
    ('\u{0B00}'..='\u{0B7F}').contains(&ch)
}
861

            
862
/// Reports whether `ch` lies in U+1000..=U+109F.
///
/// Based on <https://en.wikipedia.org/wiki/Myanmar_(Unicode_block)>.
pub fn is_myanmar(ch: char) -> bool {
    ('\u{1000}'..='\u{109F}').contains(&ch)
}
866

            
867
/// Reports whether `ch` lies in U+0D80..=U+0DFF.
///
/// Based on <https://en.wikipedia.org/wiki/Sinhala_(Unicode_block)>.
pub fn is_sinhala(ch: char) -> bool {
    ('\u{0D80}'..='\u{0DFF}').contains(&ch)
}
871

            
872
/// Reports whether `ch` lies in U+1780..=U+17FF or U+19E0..=U+19FF.
///
/// Based on <https://en.wikipedia.org/wiki/Khmer_alphabet>.
pub fn is_khmer(ch: char) -> bool {
    let in_main_range = ('\u{1780}'..='\u{17FF}').contains(&ch);
    let in_second_range = ('\u{19E0}'..='\u{19FF}').contains(&ch);
    in_main_range || in_second_range
}