diff options
| author | cvium <clausvium@gmail.com> | 2019-01-16 20:50:40 +0100 |
|---|---|---|
| committer | Claus Vium <clausvium@gmail.com> | 2019-01-20 06:30:50 +0100 |
| commit | b35dcbb9f02c27d2d84ee3281a60d654a3fb1259 (patch) | |
| tree | 12049e17ac9e6cdbe2e7fec5ec7a886cff8a8c6d /Emby.Server.Implementations/TextEncoding/NLangDetect/Utils/NGram.cs | |
| parent | 3a5e3ade01ab4b4fccca03ba2da969f03d5564f1 (diff) | |
Remove MediaBrowser.Text since it violates licenses and is overall hacky
Diffstat (limited to 'Emby.Server.Implementations/TextEncoding/NLangDetect/Utils/NGram.cs')
| -rw-r--r-- | Emby.Server.Implementations/TextEncoding/NLangDetect/Utils/NGram.cs | 330 |
1 files changed, 0 insertions, 330 deletions
// TODO IMM HI: check which classes can be made internal?

using System.Collections.Generic;
using System.Text;
using NLangDetect.Core.Extensions;

namespace NLangDetect.Core.Utils
{
    /// <summary>
    /// Keeps a sliding window of the most recently added, normalized characters
    /// (at most <see cref="GramsCount"/> of them) and hands out the trailing
    /// n-grams of that window.
    /// </summary>
    public class NGram
    {
        /// <summary>Maximum n-gram length kept in the window.</summary>
        public const int GramsCount = 3;

        // Latin-1 Supplement characters that Normalize() replaces with a space.
        private static readonly string Latin1Excluded = Messages.getString("NGram.LATIN1_EXCLUDE");

        // Each entry is a string of CJK characters; Normalize() maps every
        // character of an entry to the entry's first character.
        private static readonly string[] CjkClass =
        {
            #region CJK classes

            Messages.getString("NGram.KANJI_1_0"),
            Messages.getString("NGram.KANJI_1_2"),
            Messages.getString("NGram.KANJI_1_4"),
            Messages.getString("NGram.KANJI_1_8"),
            Messages.getString("NGram.KANJI_1_11"),
            Messages.getString("NGram.KANJI_1_12"),
            Messages.getString("NGram.KANJI_1_13"),
            Messages.getString("NGram.KANJI_1_14"),
            Messages.getString("NGram.KANJI_1_16"),
            Messages.getString("NGram.KANJI_1_18"),
            Messages.getString("NGram.KANJI_1_22"),
            Messages.getString("NGram.KANJI_1_27"),
            Messages.getString("NGram.KANJI_1_29"),
            Messages.getString("NGram.KANJI_1_31"),
            Messages.getString("NGram.KANJI_1_35"),
            Messages.getString("NGram.KANJI_2_0"),
            Messages.getString("NGram.KANJI_2_1"),
            Messages.getString("NGram.KANJI_2_4"),
            Messages.getString("NGram.KANJI_2_9"),
            Messages.getString("NGram.KANJI_2_10"),
            Messages.getString("NGram.KANJI_2_11"),
            Messages.getString("NGram.KANJI_2_12"),
            Messages.getString("NGram.KANJI_2_13"),
            Messages.getString("NGram.KANJI_2_15"),
            Messages.getString("NGram.KANJI_2_16"),
            Messages.getString("NGram.KANJI_2_18"),
            Messages.getString("NGram.KANJI_2_21"),
            Messages.getString("NGram.KANJI_2_22"),
            Messages.getString("NGram.KANJI_2_23"),
            Messages.getString("NGram.KANJI_2_28"),
            Messages.getString("NGram.KANJI_2_29"),
            Messages.getString("NGram.KANJI_2_30"),
            Messages.getString("NGram.KANJI_2_31"),
            Messages.getString("NGram.KANJI_2_32"),
            Messages.getString("NGram.KANJI_2_35"),
            Messages.getString("NGram.KANJI_2_36"),
            Messages.getString("NGram.KANJI_2_37"),
            Messages.getString("NGram.KANJI_2_38"),
            Messages.getString("NGram.KANJI_3_1"),
            Messages.getString("NGram.KANJI_3_2"),
            Messages.getString("NGram.KANJI_3_3"),
            Messages.getString("NGram.KANJI_3_4"),
            Messages.getString("NGram.KANJI_3_5"),
            Messages.getString("NGram.KANJI_3_8"),
            Messages.getString("NGram.KANJI_3_9"),
            Messages.getString("NGram.KANJI_3_11"),
            Messages.getString("NGram.KANJI_3_12"),
            Messages.getString("NGram.KANJI_3_13"),
            Messages.getString("NGram.KANJI_3_15"),
            Messages.getString("NGram.KANJI_3_16"),
            Messages.getString("NGram.KANJI_3_18"),
            Messages.getString("NGram.KANJI_3_19"),
            Messages.getString("NGram.KANJI_3_22"),
            Messages.getString("NGram.KANJI_3_23"),
            Messages.getString("NGram.KANJI_3_27"),
            Messages.getString("NGram.KANJI_3_29"),
            Messages.getString("NGram.KANJI_3_30"),
            Messages.getString("NGram.KANJI_3_31"),
            Messages.getString("NGram.KANJI_3_32"),
            Messages.getString("NGram.KANJI_3_35"),
            Messages.getString("NGram.KANJI_3_36"),
            Messages.getString("NGram.KANJI_3_37"),
            Messages.getString("NGram.KANJI_3_38"),
            Messages.getString("NGram.KANJI_4_0"),
            Messages.getString("NGram.KANJI_4_9"),
            Messages.getString("NGram.KANJI_4_10"),
            Messages.getString("NGram.KANJI_4_16"),
            Messages.getString("NGram.KANJI_4_17"),
            Messages.getString("NGram.KANJI_4_18"),
            Messages.getString("NGram.KANJI_4_22"),
            Messages.getString("NGram.KANJI_4_24"),
            Messages.getString("NGram.KANJI_4_28"),
            Messages.getString("NGram.KANJI_4_34"),
            Messages.getString("NGram.KANJI_4_39"),
            Messages.getString("NGram.KANJI_5_10"),
            Messages.getString("NGram.KANJI_5_11"),
            Messages.getString("NGram.KANJI_5_12"),
            Messages.getString("NGram.KANJI_5_13"),
            Messages.getString("NGram.KANJI_5_14"),
            Messages.getString("NGram.KANJI_5_18"),
            Messages.getString("NGram.KANJI_5_26"),
            Messages.getString("NGram.KANJI_5_29"),
            Messages.getString("NGram.KANJI_5_34"),
            Messages.getString("NGram.KANJI_5_39"),
            Messages.getString("NGram.KANJI_6_0"),
            Messages.getString("NGram.KANJI_6_3"),
            Messages.getString("NGram.KANJI_6_9"),
            Messages.getString("NGram.KANJI_6_10"),
            Messages.getString("NGram.KANJI_6_11"),
            Messages.getString("NGram.KANJI_6_12"),
            Messages.getString("NGram.KANJI_6_16"),
            Messages.getString("NGram.KANJI_6_18"),
            Messages.getString("NGram.KANJI_6_20"),
            Messages.getString("NGram.KANJI_6_21"),
            Messages.getString("NGram.KANJI_6_22"),
            Messages.getString("NGram.KANJI_6_23"),
            Messages.getString("NGram.KANJI_6_25"),
            Messages.getString("NGram.KANJI_6_28"),
            Messages.getString("NGram.KANJI_6_29"),
            Messages.getString("NGram.KANJI_6_30"),
            Messages.getString("NGram.KANJI_6_32"),
            Messages.getString("NGram.KANJI_6_34"),
            Messages.getString("NGram.KANJI_6_35"),
            Messages.getString("NGram.KANJI_6_37"),
            Messages.getString("NGram.KANJI_6_39"),
            Messages.getString("NGram.KANJI_7_0"),
            Messages.getString("NGram.KANJI_7_3"),
            Messages.getString("NGram.KANJI_7_6"),
            Messages.getString("NGram.KANJI_7_7"),
            Messages.getString("NGram.KANJI_7_9"),
            Messages.getString("NGram.KANJI_7_11"),
            Messages.getString("NGram.KANJI_7_12"),
            Messages.getString("NGram.KANJI_7_13"),
            Messages.getString("NGram.KANJI_7_16"),
            Messages.getString("NGram.KANJI_7_18"),
            Messages.getString("NGram.KANJI_7_19"),
            Messages.getString("NGram.KANJI_7_20"),
            Messages.getString("NGram.KANJI_7_21"),
            Messages.getString("NGram.KANJI_7_23"),
            Messages.getString("NGram.KANJI_7_25"),
            Messages.getString("NGram.KANJI_7_28"),
            Messages.getString("NGram.KANJI_7_29"),
            Messages.getString("NGram.KANJI_7_32"),
            Messages.getString("NGram.KANJI_7_33"),
            Messages.getString("NGram.KANJI_7_35"),
            Messages.getString("NGram.KANJI_7_37"),

            #endregion
        };

        // Maps a CJK character to the representative (first) character of its class.
        private static readonly Dictionary<char, char> _cjkMap;

        // Sliding window; always holds at least one character (starts as " ").
        private readonly StringBuilder _grams;

        // True once two consecutive upper-case characters have been added;
        // reset by the next character that is not upper-case.
        private bool _capitalWord;

        #region Constructor(s)

        static NGram()
        {
            _cjkMap = new Dictionary<char, char>();

            foreach (string cjkList in CjkClass)
            {
                // An empty entry has no representative character to map to.
                if (string.IsNullOrEmpty(cjkList))
                {
                    continue;
                }

                char representative = cjkList[0];

                foreach (char member in cjkList)
                {
                    // Indexer assignment instead of Add(): a character that is
                    // listed more than once (within one class or across classes)
                    // must not throw and break type initialization; the last
                    // class listing it wins.
                    _cjkMap[member] = representative;
                }
            }
        }

        /// <summary>
        /// Creates an empty n-gram window (a single space, no capital word seen).
        /// </summary>
        public NGram()
        {
            _grams = new StringBuilder(" ");
            _capitalWord = false;
        }

        #endregion

        #region Public methods

        /// <summary>
        /// Folds a character into its normalized form based on its Unicode block.
        /// </summary>
        /// <param name="ch">The character to normalize.</param>
        /// <returns>
        /// A space for non-letter Basic Latin characters, excluded Latin-1
        /// Supplement characters and General Punctuation; a fixed representative
        /// character for Hiragana, Katakana, Bopomofo, Hangul syllables, mapped
        /// CJK ideographs and part of Latin Extended Additional; otherwise
        /// <paramref name="ch"/> itself.
        /// </returns>
        public static char Normalize(char ch)
        {
            UnicodeBlock? unicodeBlock = ch.GetUnicodeBlock();

            if (!unicodeBlock.HasValue)
            {
                return ch;
            }

            switch (unicodeBlock.Value)
            {
                case UnicodeBlock.BasicLatin:
                    {
                        // Anything outside A-Z / a-z becomes a separator.
                        if (ch < 'A' || (ch < 'a' && ch > 'Z') || ch > 'z')
                        {
                            return ' ';
                        }

                        break;
                    }

                case UnicodeBlock.Latin1Supplement:
                    {
                        if (Latin1Excluded.IndexOf(ch) >= 0)
                        {
                            return ' ';
                        }

                        break;
                    }

                case UnicodeBlock.GeneralPunctuation:
                    {
                        return ' ';
                    }

                case UnicodeBlock.Arabic:
                    {
                        // U+06CC is folded onto U+064A.
                        if (ch == '\u06cc')
                        {
                            return '\u064a';
                        }

                        break;
                    }

                case UnicodeBlock.LatinExtendedAdditional:
                    {
                        // Everything from U+1EA0 upwards collapses to U+1EC3.
                        if (ch >= '\u1ea0')
                        {
                            return '\u1ec3';
                        }

                        break;
                    }

                case UnicodeBlock.Hiragana:
                    {
                        return '\u3042';
                    }

                case UnicodeBlock.Katakana:
                    {
                        return '\u30a2';
                    }

                case UnicodeBlock.Bopomofo:
                case UnicodeBlock.BopomofoExtended:
                    {
                        return '\u3105';
                    }

                case UnicodeBlock.CjkUnifiedIdeographs:
                    {
                        // Single lookup instead of ContainsKey + indexer.
                        char representative;

                        if (_cjkMap.TryGetValue(ch, out representative))
                        {
                            return representative;
                        }

                        break;
                    }

                case UnicodeBlock.HangulSyllables:
                    {
                        return '\uac00';
                    }
            }

            return ch;
        }

        /// <summary>
        /// Normalizes <paramref name="ch"/> and appends it to the window,
        /// dropping the oldest character once the window holds
        /// <see cref="GramsCount"/> characters.
        /// </summary>
        /// <param name="ch">The raw character to add.</param>
        public void AddChar(char ch)
        {
            ch = Normalize(ch);
            char lastChar = _grams[_grams.Length - 1];

            if (lastChar == ' ')
            {
                // The previous character was a separator: restart the window
                // with a single leading space (reusing the existing builder).
                _grams.Clear().Append(' ');
                _capitalWord = false;

                // Consecutive separators collapse into one.
                if (ch == ' ')
                {
                    return;
                }
            }
            else if (_grams.Length >= GramsCount)
            {
                _grams.Remove(0, 1);
            }

            _grams.Append(ch);

            if (char.IsUpper(ch))
            {
                // Two upper-case characters in a row flag a capital word.
                if (char.IsUpper(lastChar))
                {
                    _capitalWord = true;
                }
            }
            else
            {
                _capitalWord = false;
            }
        }

        /// <summary>
        /// Returns the trailing n-gram of the current window.
        /// </summary>
        /// <param name="n">Requested n-gram length, 1 to <see cref="GramsCount"/>.</param>
        /// <returns>
        /// The last <paramref name="n"/> characters of the window, or
        /// <c>null</c> when a capital word is in progress, when
        /// <paramref name="n"/> is out of range or larger than the window, or
        /// when <paramref name="n"/> is 1 and the last character is a space.
        /// </returns>
        public string Get(int n)
        {
            if (_capitalWord)
            {
                return null;
            }

            int len = _grams.Length;

            if (n < 1 || n > GramsCount || len < n)
            {
                return null;
            }

            if (n == 1)
            {
                char ch = _grams[len - 1];

                if (ch == ' ')
                {
                    return null;
                }

                return ch.ToString();
            }

            // Copy only the requested tail instead of materializing the whole
            // builder and slicing the resulting string.
            return _grams.ToString(len - n, n);
        }

        #endregion
    }
}
