aboutsummaryrefslogtreecommitdiff
path: root/Emby.Server.Implementations/TextEncoding/NLangDetect/Utils/LangProfile.cs
diff options
context:
space:
mode:
Diffstat (limited to 'Emby.Server.Implementations/TextEncoding/NLangDetect/Utils/LangProfile.cs')
-rw-r--r--Emby.Server.Implementations/TextEncoding/NLangDetect/Utils/LangProfile.cs118
1 files changed, 118 insertions, 0 deletions
diff --git a/Emby.Server.Implementations/TextEncoding/NLangDetect/Utils/LangProfile.cs b/Emby.Server.Implementations/TextEncoding/NLangDetect/Utils/LangProfile.cs
new file mode 100644
index 000000000..0413edfad
--- /dev/null
+++ b/Emby.Server.Implementations/TextEncoding/NLangDetect/Utils/LangProfile.cs
@@ -0,0 +1,118 @@
+using System.Collections.Generic;
+using System.Text.RegularExpressions;
+
+namespace NLangDetect.Core.Utils
+{
+    /// <summary>
+    /// Holds n-gram occurrence counts collected for a single, named language profile.
+    /// </summary>
+    /// <remarks>
+    /// The lower-case member names (<c>name</c>, <c>freq</c>, <c>n_words</c>) are part of the
+    /// public surface of this type and are kept as-is.
+    /// NOTE(review): they presumably mirror field names of serialized profile data - confirm
+    /// before renaming.
+    /// </remarks>
+    public class LangProfile
+    {
+        // Lower bound for the pruning threshold used by OmitLessFreq.
+        private const int MinimumFreq = 2;
+
+        // Divisor applied to the 1-gram total to derive the pruning threshold.
+        private const int LessFreqRatio = 100000;
+
+        // Built once per process: constructing a RegexOptions.Compiled regex on every
+        // OmitLessFreq call repeats the (expensive) compilation for no benefit.
+        private static readonly Regex SingleRomanLetterRegex = new Regex("^[A-Za-z]$", RegexOptions.Compiled);
+        private static readonly Regex ContainsRomanLetterRegex = new Regex(".*[A-Za-z].*", RegexOptions.Compiled);
+
+        /// <summary>
+        /// Profile name. While this is <c>null</c>, <see cref="Add"/> and
+        /// <see cref="OmitLessFreq"/> do nothing.
+        /// </summary>
+        public string name { get; set; }
+
+        /// <summary>
+        /// Occurrence count for each n-gram that has been added.
+        /// </summary>
+        public Dictionary<string, int> freq { get; set; }
+
+        /// <summary>
+        /// Total occurrence count per n-gram length; index <c>i</c> holds the total for
+        /// grams of length <c>i + 1</c>.
+        /// </summary>
+        public int[] n_words { get; set; }
+
+        #region Constructor(s)
+
+        /// <summary>
+        /// Creates an unnamed profile with an empty frequency table and zeroed totals.
+        /// </summary>
+        public LangProfile()
+        {
+            freq = new Dictionary<string, int>();
+            n_words = new int[NGram.GramsCount];
+        }
+
+        /// <summary>
+        /// Creates an empty profile carrying the given name.
+        /// </summary>
+        /// <param name="name">Value assigned to <see cref="name"/>.</param>
+        public LangProfile(string name)
+            : this()
+        {
+            this.name = name;
+        }
+
+        #endregion
+
+        #region Public methods
+
+        /// <summary>
+        /// Records one occurrence of <paramref name="gram"/>.
+        /// </summary>
+        /// <param name="gram">
+        /// The n-gram to count. The call is silently ignored when the profile has no name,
+        /// when the gram is <c>null</c>, or when its length is outside
+        /// 1..<c>NGram.GramsCount</c>.
+        /// </param>
+        public void Add(string gram)
+        {
+            if (name == null || gram == null)
+            {
+                return; // Illegal
+            }
+
+            int len = gram.Length;
+            if (len < 1 || len > NGram.GramsCount)
+            {
+                return; // Illegal
+            }
+
+            n_words[len - 1]++;
+
+            // Single lookup; 'current' is 0 when the gram has not been seen yet.
+            int current;
+            freq.TryGetValue(gram, out current);
+            freq[gram] = current + 1;
+        }
+
+        /// <summary>
+        /// Prunes the frequency table in two passes. Does nothing when the profile has no name.
+        /// </summary>
+        /// <remarks>
+        /// Pass 1 removes every gram whose count is less than or equal to a threshold of
+        /// <c>n_words[0] / LessFreqRatio</c> (but at least <c>MinimumFreq</c>), and sums the
+        /// counts of the surviving grams that are a single ASCII letter.
+        /// Pass 2 runs only when that sum is below one third of the remaining 1-gram total;
+        /// it removes every gram that contains an ASCII letter.
+        /// Each removal also subtracts the gram's count from the matching <see cref="n_words"/> slot.
+        /// </remarks>
+        public void OmitLessFreq()
+        {
+            if (name == null)
+            {
+                return; // Illegal
+            }
+
+            // The threshold is fixed before pruning starts, so it is based on the unpruned total.
+            int threshold = n_words[0] / LessFreqRatio;
+            if (threshold < MinimumFreq)
+            {
+                threshold = MinimumFreq;
+            }
+
+            int roman = 0;
+            List<string> keysToRemove = new List<string>();
+
+            // Removals are deferred: the dictionary must not be modified while it is enumerated.
+            foreach (KeyValuePair<string, int> entry in freq)
+            {
+                if (entry.Value <= threshold)
+                {
+                    n_words[entry.Key.Length - 1] -= entry.Value;
+                    keysToRemove.Add(entry.Key);
+                }
+                else if (SingleRomanLetterRegex.IsMatch(entry.Key))
+                {
+                    roman += entry.Value;
+                }
+            }
+
+            RemoveKeys(keysToRemove);
+
+            // roman check (uses the 1-gram total as it stands after the first pass)
+            if (roman >= n_words[0] / 3)
+            {
+                return;
+            }
+
+            keysToRemove.Clear();
+
+            foreach (KeyValuePair<string, int> entry in freq)
+            {
+                if (ContainsRomanLetterRegex.IsMatch(entry.Key))
+                {
+                    n_words[entry.Key.Length - 1] -= entry.Value;
+                    keysToRemove.Add(entry.Key);
+                }
+            }
+
+            RemoveKeys(keysToRemove);
+        }
+
+        #endregion
+
+        #region Private methods
+
+        // Removes each of the given keys from the frequency table.
+        private void RemoveKeys(List<string> keysToRemove)
+        {
+            foreach (string keyToRemove in keysToRemove)
+            {
+                freq.Remove(keyToRemove);
+            }
+        }
+
+        #endregion
+    }
+}