aboutsummaryrefslogtreecommitdiff
path: root/Emby.Server.Implementations/TextEncoding/NLangDetect/Detector.cs
diff options
context:
space:
mode:
Diffstat (limited to 'Emby.Server.Implementations/TextEncoding/NLangDetect/Detector.cs')
-rw-r--r--Emby.Server.Implementations/TextEncoding/NLangDetect/Detector.cs371
1 files changed, 371 insertions, 0 deletions
diff --git a/Emby.Server.Implementations/TextEncoding/NLangDetect/Detector.cs b/Emby.Server.Implementations/TextEncoding/NLangDetect/Detector.cs
new file mode 100644
index 000000000..507dd5e42
--- /dev/null
+++ b/Emby.Server.Implementations/TextEncoding/NLangDetect/Detector.cs
@@ -0,0 +1,371 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Text.RegularExpressions;
+using NLangDetect.Core.Extensions;
+using NLangDetect.Core.Utils;
+
+namespace NLangDetect.Core
+{
+ public class Detector
+ {
+ private const double _AlphaDefault = 0.5;
+ private const double _AlphaWidth = 0.05;
+
+ private const int _IterationLimit = 1000;
+ private const double _ProbThreshold = 0.1;
+ private const double _ConvThreshold = 0.99999;
+ private const int _BaseFreq = 10000;
+
+ private static readonly Regex _UrlRegex = new Regex("https?://[-_.?&~;+=/#0-9A-Za-z]+", RegexOptions.Compiled);
+ private static readonly Regex _MailRegex = new Regex("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+", RegexOptions.Compiled);
+
+ private readonly Dictionary<string, ProbVector> _wordLangProbMap;
+ private readonly List<string> _langlist;
+
+ private StringBuilder _text;
+ private double[] _langprob;
+
+ private double _alpha = _AlphaDefault;
+ private const int _trialsCount = 7;
+ private int _maxTextLength = 10000;
+ private double[] _priorMap;
+ private int? _seed;
+
+ #region Constructor(s)
+
+ public Detector(DetectorFactory factory)
+ {
+ _wordLangProbMap = factory.WordLangProbMap;
+ _langlist = factory.Langlist;
+ _text = new StringBuilder();
+ _seed = factory.Seed;
+ }
+
+ #endregion
+
+ #region Public methods
+
+ public void SetAlpha(double alpha)
+ {
+ _alpha = alpha;
+ }
+
+ public void SetPriorMap(Dictionary<string, double> priorMap)
+ {
+ _priorMap = new double[_langlist.Count];
+
+ double sump = 0;
+
+ for (int i = 0; i < _priorMap.Length; i++)
+ {
+ string lang = _langlist[i];
+
+ if (priorMap.ContainsKey(lang))
+ {
+ double p = priorMap[lang];
+
+ if (p < 0)
+ {
+ throw new NLangDetectException("Prior probability must be non-negative.", ErrorCode.InitParamError);
+ }
+
+ _priorMap[i] = p;
+ sump += p;
+ }
+ }
+
+ if (sump <= 0)
+ {
+ throw new NLangDetectException("More one of prior probability must be non-zero.", ErrorCode.InitParamError);
+ }
+
+ for (int i = 0; i < _priorMap.Length; i++)
+ {
+ _priorMap[i] /= sump;
+ }
+ }
+
+ public void SetMaxTextLength(int max_text_length)
+ {
+ _maxTextLength = max_text_length;
+ }
+
+ // TODO IMM HI: TextReader?
+ public void Append(StreamReader streamReader)
+ {
+ var buf = new char[_maxTextLength / 2];
+
+ while (_text.Length < _maxTextLength && !streamReader.EndOfStream)
+ {
+ int length = streamReader.Read(buf, 0, buf.Length);
+
+ Append(new string(buf, 0, length));
+ }
+ }
+
+ public void Append(string text)
+ {
+ text = _UrlRegex.Replace(text, " ");
+ text = _MailRegex.Replace(text, " ");
+
+ char pre = '\0';
+
+ for (int i = 0; i < text.Length && i < _maxTextLength; i++)
+ {
+ char c = NGram.Normalize(text[i]);
+
+ if (c != ' ' || pre != ' ')
+ {
+ _text.Append(c);
+ }
+
+ pre = c;
+ }
+ }
+
+ private void CleanText()
+ {
+ int latinCount = 0, nonLatinCount = 0;
+
+ for (int i = 0; i < _text.Length; i++)
+ {
+ char c = _text[i];
+
+ if (c <= 'z' && c >= 'A')
+ {
+ latinCount++;
+ }
+ else if (c >= '\u0300' && c.GetUnicodeBlock() != UnicodeBlock.LatinExtendedAdditional)
+ {
+ nonLatinCount++;
+ }
+ }
+
+ if (latinCount * 2 < nonLatinCount)
+ {
+ var textWithoutLatin = new StringBuilder();
+
+ for (int i = 0; i < _text.Length; i++)
+ {
+ char c = _text[i];
+
+ if (c > 'z' || c < 'A')
+ {
+ textWithoutLatin.Append(c);
+ }
+ }
+
+ _text = textWithoutLatin;
+ }
+ }
+
+ public string Detect()
+ {
+ List<Language> probabilities = GetProbabilities();
+
+ return
+ probabilities.Count > 0
+ ? probabilities[0].Name
+ : null;
+ }
+
+ public List<Language> GetProbabilities()
+ {
+ if (_langprob == null)
+ {
+ DetectBlock();
+ }
+
+ List<Language> list = SortProbability(_langprob);
+
+ return list;
+ }
+
+ #endregion
+
+ #region Private helper methods
+
+ private static double NormalizeProb(double[] probs)
+ {
+ double maxp = 0, sump = 0;
+
+ sump += probs.Sum();
+
+ for (int i = 0; i < probs.Length; i++)
+ {
+ double p = probs[i] / sump;
+
+ if (maxp < p)
+ {
+ maxp = p;
+ }
+
+ probs[i] = p;
+ }
+
+ return maxp;
+ }
+
+ private static string UnicodeEncode(string word)
+ {
+ var resultSb = new StringBuilder();
+
+ foreach (char ch in word)
+ {
+ if (ch >= '\u0080')
+ {
+ string st = string.Format("{0:x}", 0x10000 + ch);
+
+ while (st.Length < 4)
+ {
+ st = "0" + st;
+ }
+
+ resultSb
+ .Append("\\u")
+ .Append(st.SubSequence(1, 5));
+ }
+ else
+ {
+ resultSb.Append(ch);
+ }
+ }
+
+ return resultSb.ToString();
+ }
+
+ private void DetectBlock()
+ {
+ CleanText();
+
+ List<string> ngrams = ExtractNGrams();
+
+ if (ngrams.Count == 0)
+ {
+ throw new NLangDetectException("no features in text", ErrorCode.CantDetectError);
+ }
+
+ _langprob = new double[_langlist.Count];
+
+ Random rand = (_seed.HasValue ? new Random(_seed.Value) : new Random());
+
+ for (int t = 0; t < _trialsCount; t++)
+ {
+ double[] prob = InitProbability();
+
+ // TODO IMM HI: verify it works
+ double alpha = _alpha + rand.NextGaussian() * _AlphaWidth;
+
+ for (int i = 0; ; i++)
+ {
+ int r = rand.Next(ngrams.Count);
+
+ UpdateLangProb(prob, ngrams[r], alpha);
+
+ if (i % 5 == 0)
+ {
+ if (NormalizeProb(prob) > _ConvThreshold || i >= _IterationLimit)
+ {
+ break;
+ }
+ }
+ }
+
+ for (int j = 0; j < _langprob.Length; j++)
+ {
+ _langprob[j] += prob[j] / _trialsCount;
+ }
+ }
+ }
+
+ private double[] InitProbability()
+ {
+ var prob = new double[_langlist.Count];
+
+ if (_priorMap != null)
+ {
+ for (int i = 0; i < prob.Length; i++)
+ {
+ prob[i] = _priorMap[i];
+ }
+ }
+ else
+ {
+ for (int i = 0; i < prob.Length; i++)
+ {
+ prob[i] = 1.0 / _langlist.Count;
+ }
+ }
+ return prob;
+ }
+
+ private List<string> ExtractNGrams()
+ {
+ var list = new List<string>();
+ NGram ngram = new NGram();
+
+ for (int i = 0; i < _text.Length; i++)
+ {
+ ngram.AddChar(_text[i]);
+
+ for (int n = 1; n <= NGram.GramsCount; n++)
+ {
+ string w = ngram.Get(n);
+
+ if (w != null && _wordLangProbMap.ContainsKey(w))
+ {
+ list.Add(w);
+ }
+ }
+ }
+
+ return list;
+ }
+
+ private void UpdateLangProb(double[] prob, string word, double alpha)
+ {
+ if (word == null || !_wordLangProbMap.ContainsKey(word))
+ {
+ return;
+ }
+
+ ProbVector langProbMap = _wordLangProbMap[word];
+ double weight = alpha / _BaseFreq;
+
+ for (int i = 0; i < prob.Length; i++)
+ {
+ prob[i] *= weight + langProbMap[i];
+ }
+ }
+
+ private List<Language> SortProbability(double[] prob)
+ {
+ var list = new List<Language>();
+
+ for (int j = 0; j < prob.Length; j++)
+ {
+ double p = prob[j];
+
+ if (p > _ProbThreshold)
+ {
+ for (int i = 0; i <= list.Count; i++)
+ {
+ if (i == list.Count || list[i].Probability < p)
+ {
+ list.Insert(i, new Language(_langlist[j], p));
+
+ break;
+ }
+ }
+ }
+ }
+
+ return list;
+ }
+
+ #endregion
+ }
+}