diff options
Diffstat (limited to 'Emby.Server.Implementations/TextEncoding/NLangDetect/Utils/LangProfile.cs')
| -rw-r--r-- | Emby.Server.Implementations/TextEncoding/NLangDetect/Utils/LangProfile.cs | 118 |
1 files changed, 118 insertions, 0 deletions
diff --git a/Emby.Server.Implementations/TextEncoding/NLangDetect/Utils/LangProfile.cs b/Emby.Server.Implementations/TextEncoding/NLangDetect/Utils/LangProfile.cs new file mode 100644 index 000000000..0413edfad --- /dev/null +++ b/Emby.Server.Implementations/TextEncoding/NLangDetect/Utils/LangProfile.cs @@ -0,0 +1,118 @@ +using System.Collections.Generic; +using System.Text.RegularExpressions; + +namespace NLangDetect.Core.Utils +{ + public class LangProfile + { + private const int MinimumFreq = 2; + private const int LessFreqRatio = 100000; + + public string name { get; set; } + + public Dictionary<string, int> freq { get; set; } + public int[] n_words { get; set; } + + #region Constructor(s) + + public LangProfile() + { + freq = new Dictionary<string, int>(); + n_words = new int[NGram.GramsCount]; + } + + public LangProfile(string name) + { + this.name = name; + freq = new Dictionary<string, int>(); + n_words = new int[NGram.GramsCount]; + } + + #endregion + + #region Public methods + + public void Add(string gram) + { + if (name == null || gram == null) return; // Illegal + int len = gram.Length; + if (len < 1 || len > NGram.GramsCount) return; // Illegal + + n_words[len - 1]++; + + if (freq.ContainsKey(gram)) + { + freq[gram] = freq[gram] + 1; + } + else + { + freq.Add(gram, 1); + } + } + + public void OmitLessFreq() + { + if (name == null) return; // Illegal + int threshold = n_words[0] / LessFreqRatio; + if (threshold < MinimumFreq) threshold = MinimumFreq; + + ICollection<string> keys = freq.Keys; + int roman = 0; + // TODO IMM HI: move up? + Regex regex1 = new Regex("^[A-Za-z]$", RegexOptions.Compiled); + List<string> keysToRemove = new List<string>(); + + foreach (string key in keys) + { + int count = freq[key]; + + if (count <= threshold) + { + n_words[key.Length - 1] -= count; + keysToRemove.Add(key); + } + else + { + if (regex1.IsMatch(key)) + { + roman += count; + } + } + } + + foreach (string keyToRemove in keysToRemove) + { + freq.Remove(keyToRemove); + } + + // roman check + keysToRemove = new List<string>(); + + if (roman < n_words[0] / 3) + { + ICollection<string> keys2 = freq.Keys; + + // TODO IMM HI: move up? + Regex regex2 = new Regex(".*[A-Za-z].*", RegexOptions.Compiled); + + foreach (string key in keys2) + { + int count = freq[key]; + + if (regex2.IsMatch(key)) + { + n_words[key.Length - 1] -= count; + keysToRemove.Add(key); + } + } + + foreach (string keyToRemove in keysToRemove) + { + freq.Remove(keyToRemove); + } + } + } + + #endregion + } +} |
