aboutsummaryrefslogtreecommitdiff
path: root/Emby.Server.Implementations/TextEncoding/NLangDetect/Utils/NGram.cs
diff options
context:
space:
mode:
Diffstat (limited to 'Emby.Server.Implementations/TextEncoding/NLangDetect/Utils/NGram.cs')
-rw-r--r--Emby.Server.Implementations/TextEncoding/NLangDetect/Utils/NGram.cs330
1 files changed, 330 insertions, 0 deletions
diff --git a/Emby.Server.Implementations/TextEncoding/NLangDetect/Utils/NGram.cs b/Emby.Server.Implementations/TextEncoding/NLangDetect/Utils/NGram.cs
new file mode 100644
index 000000000..b1738f7ca
--- /dev/null
+++ b/Emby.Server.Implementations/TextEncoding/NLangDetect/Utils/NGram.cs
@@ -0,0 +1,330 @@
+// TODO IMM HI: check which classes can be made internal?
+
+using System.Collections.Generic;
+using System.Text;
+using NLangDetect.Core.Extensions;
+
+namespace NLangDetect.Core.Utils
+{
+ public class NGram
+ {
+ public const int GramsCount = 3;
+
+ private static readonly string Latin1Excluded = Messages.getString("NGram.LATIN1_EXCLUDE");
+
+ private static readonly string[] CjkClass =
+ {
+ #region CJK classes
+
+ Messages.getString("NGram.KANJI_1_0"),
+ Messages.getString("NGram.KANJI_1_2"),
+ Messages.getString("NGram.KANJI_1_4"),
+ Messages.getString("NGram.KANJI_1_8"),
+ Messages.getString("NGram.KANJI_1_11"),
+ Messages.getString("NGram.KANJI_1_12"),
+ Messages.getString("NGram.KANJI_1_13"),
+ Messages.getString("NGram.KANJI_1_14"),
+ Messages.getString("NGram.KANJI_1_16"),
+ Messages.getString("NGram.KANJI_1_18"),
+ Messages.getString("NGram.KANJI_1_22"),
+ Messages.getString("NGram.KANJI_1_27"),
+ Messages.getString("NGram.KANJI_1_29"),
+ Messages.getString("NGram.KANJI_1_31"),
+ Messages.getString("NGram.KANJI_1_35"),
+ Messages.getString("NGram.KANJI_2_0"),
+ Messages.getString("NGram.KANJI_2_1"),
+ Messages.getString("NGram.KANJI_2_4"),
+ Messages.getString("NGram.KANJI_2_9"),
+ Messages.getString("NGram.KANJI_2_10"),
+ Messages.getString("NGram.KANJI_2_11"),
+ Messages.getString("NGram.KANJI_2_12"),
+ Messages.getString("NGram.KANJI_2_13"),
+ Messages.getString("NGram.KANJI_2_15"),
+ Messages.getString("NGram.KANJI_2_16"),
+ Messages.getString("NGram.KANJI_2_18"),
+ Messages.getString("NGram.KANJI_2_21"),
+ Messages.getString("NGram.KANJI_2_22"),
+ Messages.getString("NGram.KANJI_2_23"),
+ Messages.getString("NGram.KANJI_2_28"),
+ Messages.getString("NGram.KANJI_2_29"),
+ Messages.getString("NGram.KANJI_2_30"),
+ Messages.getString("NGram.KANJI_2_31"),
+ Messages.getString("NGram.KANJI_2_32"),
+ Messages.getString("NGram.KANJI_2_35"),
+ Messages.getString("NGram.KANJI_2_36"),
+ Messages.getString("NGram.KANJI_2_37"),
+ Messages.getString("NGram.KANJI_2_38"),
+ Messages.getString("NGram.KANJI_3_1"),
+ Messages.getString("NGram.KANJI_3_2"),
+ Messages.getString("NGram.KANJI_3_3"),
+ Messages.getString("NGram.KANJI_3_4"),
+ Messages.getString("NGram.KANJI_3_5"),
+ Messages.getString("NGram.KANJI_3_8"),
+ Messages.getString("NGram.KANJI_3_9"),
+ Messages.getString("NGram.KANJI_3_11"),
+ Messages.getString("NGram.KANJI_3_12"),
+ Messages.getString("NGram.KANJI_3_13"),
+ Messages.getString("NGram.KANJI_3_15"),
+ Messages.getString("NGram.KANJI_3_16"),
+ Messages.getString("NGram.KANJI_3_18"),
+ Messages.getString("NGram.KANJI_3_19"),
+ Messages.getString("NGram.KANJI_3_22"),
+ Messages.getString("NGram.KANJI_3_23"),
+ Messages.getString("NGram.KANJI_3_27"),
+ Messages.getString("NGram.KANJI_3_29"),
+ Messages.getString("NGram.KANJI_3_30"),
+ Messages.getString("NGram.KANJI_3_31"),
+ Messages.getString("NGram.KANJI_3_32"),
+ Messages.getString("NGram.KANJI_3_35"),
+ Messages.getString("NGram.KANJI_3_36"),
+ Messages.getString("NGram.KANJI_3_37"),
+ Messages.getString("NGram.KANJI_3_38"),
+ Messages.getString("NGram.KANJI_4_0"),
+ Messages.getString("NGram.KANJI_4_9"),
+ Messages.getString("NGram.KANJI_4_10"),
+ Messages.getString("NGram.KANJI_4_16"),
+ Messages.getString("NGram.KANJI_4_17"),
+ Messages.getString("NGram.KANJI_4_18"),
+ Messages.getString("NGram.KANJI_4_22"),
+ Messages.getString("NGram.KANJI_4_24"),
+ Messages.getString("NGram.KANJI_4_28"),
+ Messages.getString("NGram.KANJI_4_34"),
+ Messages.getString("NGram.KANJI_4_39"),
+ Messages.getString("NGram.KANJI_5_10"),
+ Messages.getString("NGram.KANJI_5_11"),
+ Messages.getString("NGram.KANJI_5_12"),
+ Messages.getString("NGram.KANJI_5_13"),
+ Messages.getString("NGram.KANJI_5_14"),
+ Messages.getString("NGram.KANJI_5_18"),
+ Messages.getString("NGram.KANJI_5_26"),
+ Messages.getString("NGram.KANJI_5_29"),
+ Messages.getString("NGram.KANJI_5_34"),
+ Messages.getString("NGram.KANJI_5_39"),
+ Messages.getString("NGram.KANJI_6_0"),
+ Messages.getString("NGram.KANJI_6_3"),
+ Messages.getString("NGram.KANJI_6_9"),
+ Messages.getString("NGram.KANJI_6_10"),
+ Messages.getString("NGram.KANJI_6_11"),
+ Messages.getString("NGram.KANJI_6_12"),
+ Messages.getString("NGram.KANJI_6_16"),
+ Messages.getString("NGram.KANJI_6_18"),
+ Messages.getString("NGram.KANJI_6_20"),
+ Messages.getString("NGram.KANJI_6_21"),
+ Messages.getString("NGram.KANJI_6_22"),
+ Messages.getString("NGram.KANJI_6_23"),
+ Messages.getString("NGram.KANJI_6_25"),
+ Messages.getString("NGram.KANJI_6_28"),
+ Messages.getString("NGram.KANJI_6_29"),
+ Messages.getString("NGram.KANJI_6_30"),
+ Messages.getString("NGram.KANJI_6_32"),
+ Messages.getString("NGram.KANJI_6_34"),
+ Messages.getString("NGram.KANJI_6_35"),
+ Messages.getString("NGram.KANJI_6_37"),
+ Messages.getString("NGram.KANJI_6_39"),
+ Messages.getString("NGram.KANJI_7_0"),
+ Messages.getString("NGram.KANJI_7_3"),
+ Messages.getString("NGram.KANJI_7_6"),
+ Messages.getString("NGram.KANJI_7_7"),
+ Messages.getString("NGram.KANJI_7_9"),
+ Messages.getString("NGram.KANJI_7_11"),
+ Messages.getString("NGram.KANJI_7_12"),
+ Messages.getString("NGram.KANJI_7_13"),
+ Messages.getString("NGram.KANJI_7_16"),
+ Messages.getString("NGram.KANJI_7_18"),
+ Messages.getString("NGram.KANJI_7_19"),
+ Messages.getString("NGram.KANJI_7_20"),
+ Messages.getString("NGram.KANJI_7_21"),
+ Messages.getString("NGram.KANJI_7_23"),
+ Messages.getString("NGram.KANJI_7_25"),
+ Messages.getString("NGram.KANJI_7_28"),
+ Messages.getString("NGram.KANJI_7_29"),
+ Messages.getString("NGram.KANJI_7_32"),
+ Messages.getString("NGram.KANJI_7_33"),
+ Messages.getString("NGram.KANJI_7_35"),
+ Messages.getString("NGram.KANJI_7_37"),
+
+ #endregion
+ };
+
+ private static readonly Dictionary<char, char> _cjkMap;
+
+ private StringBuilder _grams;
+ private bool _capitalword;
+
+ #region Constructor(s)
+
+ static NGram()
+ {
+ _cjkMap = new Dictionary<char, char>();
+
+ foreach (string cjk_list in CjkClass)
+ {
+ char representative = cjk_list[0];
+
+ for (int i = 0; i < cjk_list.Length; i++)
+ {
+ _cjkMap.Add(cjk_list[i], representative);
+ }
+ }
+ }
+
+ public NGram()
+ {
+ _grams = new StringBuilder(" ");
+ _capitalword = false;
+ }
+
+ #endregion
+
+ #region Public methods
+
+ public static char Normalize(char ch)
+ {
+ UnicodeBlock? unicodeBlock = ch.GetUnicodeBlock();
+
+ if (!unicodeBlock.HasValue)
+ {
+ return ch;
+ }
+
+ switch (unicodeBlock.Value)
+ {
+ case UnicodeBlock.BasicLatin:
+ {
+ if (ch < 'A' || (ch < 'a' && ch > 'Z') || ch > 'z')
+ {
+ return ' ';
+ }
+
+ break;
+ }
+
+ case UnicodeBlock.Latin1Supplement:
+ {
+ if (Latin1Excluded.IndexOf(ch) >= 0)
+ {
+ return ' ';
+ }
+
+ break;
+ }
+
+ case UnicodeBlock.GeneralPunctuation:
+ {
+ return ' ';
+ }
+
+ case UnicodeBlock.Arabic:
+ {
+ if (ch == '\u06cc')
+ {
+ return '\u064a';
+ }
+
+ break;
+ }
+
+ case UnicodeBlock.LatinExtendedAdditional:
+ {
+ if (ch >= '\u1ea0')
+ {
+ return '\u1ec3';
+ }
+
+ break;
+ }
+
+ case UnicodeBlock.Hiragana:
+ {
+ return '\u3042';
+ }
+
+ case UnicodeBlock.Katakana:
+ {
+ return '\u30a2';
+ }
+
+ case UnicodeBlock.Bopomofo:
+ case UnicodeBlock.BopomofoExtended:
+ {
+ return '\u3105';
+ }
+
+ case UnicodeBlock.CjkUnifiedIdeographs:
+ {
+ if (_cjkMap.ContainsKey(ch))
+ {
+ return _cjkMap[ch];
+ }
+
+ break;
+ }
+
+ case UnicodeBlock.HangulSyllables:
+ {
+ return '\uac00';
+ }
+ }
+
+ return ch;
+ }
+
+ public void AddChar(char ch)
+ {
+ ch = Normalize(ch);
+ char lastchar = _grams[_grams.Length - 1];
+ if (lastchar == ' ')
+ {
+ _grams = new StringBuilder(" ");
+ _capitalword = false;
+ if (ch == ' ') return;
+ }
+ else if (_grams.Length >= GramsCount)
+ {
+ _grams.Remove(0, 1);
+ }
+ _grams.Append(ch);
+
+ if (char.IsUpper(ch))
+ {
+ if (char.IsUpper(lastchar)) _capitalword = true;
+ }
+ else
+ {
+ _capitalword = false;
+ }
+ }
+
+ public string Get(int n)
+ {
+ if (_capitalword)
+ {
+ return null;
+ }
+
+ int len = _grams.Length;
+
+ if (n < 1 || n > 3 || len < n)
+ {
+ return null;
+ }
+
+ if (n == 1)
+ {
+ char ch = _grams[len - 1];
+
+ if (ch == ' ')
+ {
+ return null;
+ }
+
+ return ch.ToString();
+ }
+
+ // TODO IMM HI: is ToString() here effective?
+ return _grams.ToString().SubSequence(len - n, len);
+ }
+
+ #endregion
+ }
+}