diff options
Diffstat (limited to 'Emby.Server.Implementations/TextEncoding/NLangDetect/Detector.cs')
| -rw-r--r-- | Emby.Server.Implementations/TextEncoding/NLangDetect/Detector.cs | 371 |
1 files changed, 0 insertions, 371 deletions
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using NLangDetect.Core.Extensions;
using NLangDetect.Core.Utils;

namespace NLangDetect.Core
{
    /// <summary>
    /// Accumulates normalized text and scores it against the n-gram/language
    /// probability table supplied by a <see cref="DetectorFactory"/>.
    /// </summary>
    /// <remarks>
    /// The result is computed once, on the first call to <see cref="Detect"/> or
    /// <see cref="GetProbabilities"/>, and cached in the instance afterwards.
    /// </remarks>
    public class Detector
    {
        private const double _AlphaDefault = 0.5;
        private const double _AlphaWidth = 0.05;

        private const int _IterationLimit = 1000;
        private const double _ProbThreshold = 0.1;
        private const double _ConvThreshold = 0.99999;
        private const int _BaseFreq = 10000;

        private static readonly Regex _UrlRegex = new Regex("https?://[-_.?&~;+=/#0-9A-Za-z]+", RegexOptions.Compiled);
        private static readonly Regex _MailRegex = new Regex("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+", RegexOptions.Compiled);

        private readonly Dictionary<string, ProbVector> _wordLangProbMap;
        private readonly List<string> _langlist;

        private StringBuilder _text;
        private double[] _langprob;

        private double _alpha = _AlphaDefault;
        private const int _trialsCount = 7;
        private int _maxTextLength = 10000;
        private double[] _priorMap;
        private readonly int? _seed;

        #region Constructor(s)

        /// <summary>
        /// Creates a detector that shares the word/language probability map,
        /// the language list and the optional random seed of <paramref name="factory"/>.
        /// </summary>
        /// <param name="factory">Source of the language profiles and seed.</param>
        public Detector(DetectorFactory factory)
        {
            _wordLangProbMap = factory.WordLangProbMap;
            _langlist = factory.Langlist;
            _text = new StringBuilder();
            _seed = factory.Seed;
        }

        #endregion

        #region Public methods

        /// <summary>
        /// Sets the base smoothing value; each trial perturbs it with Gaussian noise
        /// scaled by <c>_AlphaWidth</c>.
        /// </summary>
        /// <param name="alpha">New smoothing value.</param>
        public void SetAlpha(double alpha)
        {
            _alpha = alpha;
        }

        /// <summary>
        /// Sets per-language prior probabilities. Languages missing from
        /// <paramref name="priorMap"/> get a prior of zero; the values are
        /// normalized so that they sum to one.
        /// </summary>
        /// <param name="priorMap">Language name to non-negative weight.</param>
        /// <exception cref="ArgumentNullException"><paramref name="priorMap"/> is null.</exception>
        /// <exception cref="NLangDetectException">
        /// A weight is negative, or no known language has a positive weight.
        /// </exception>
        public void SetPriorMap(Dictionary<string, double> priorMap)
        {
            if (priorMap == null)
            {
                throw new ArgumentNullException("priorMap");
            }

            // Build into a local so that a validation failure leaves the
            // previously configured prior untouched.
            var newPriorMap = new double[_langlist.Count];

            double sump = 0;

            for (int i = 0; i < newPriorMap.Length; i++)
            {
                double p;

                if (priorMap.TryGetValue(_langlist[i], out p))
                {
                    if (p < 0)
                    {
                        throw new NLangDetectException("Prior probability must be non-negative.", ErrorCode.InitParamError);
                    }

                    newPriorMap[i] = p;
                    sump += p;
                }
            }

            if (sump <= 0)
            {
                throw new NLangDetectException("More one of prior probability must be non-zero.", ErrorCode.InitParamError);
            }

            for (int i = 0; i < newPriorMap.Length; i++)
            {
                newPriorMap[i] /= sump;
            }

            _priorMap = newPriorMap;
        }

        /// <summary>
        /// Sets the limit used by both <c>Append</c> overloads: the string overload
        /// looks at no more than this many characters per call, and the reader
        /// overload stops reading once the buffered text reaches this length.
        /// </summary>
        /// <param name="max_text_length">Maximum number of characters.</param>
        public void SetMaxTextLength(int max_text_length)
        {
            _maxTextLength = max_text_length;
        }

        // TODO IMM HI: TextReader?
        /// <summary>
        /// Reads from <paramref name="streamReader"/> in chunks and appends them until
        /// the buffered text reaches the maximum length or the reader is exhausted.
        /// </summary>
        /// <param name="streamReader">Reader to consume text from.</param>
        /// <exception cref="ArgumentNullException"><paramref name="streamReader"/> is null.</exception>
        public void Append(StreamReader streamReader)
        {
            if (streamReader == null)
            {
                throw new ArgumentNullException("streamReader");
            }

            if (_text.Length >= _maxTextLength)
            {
                return;
            }

            // At least one char: a zero-length buffer makes Read return 0 without
            // consuming anything, which would spin forever on a non-empty stream.
            var buf = new char[Math.Max(1, _maxTextLength / 2)];

            while (_text.Length < _maxTextLength)
            {
                int length = streamReader.Read(buf, 0, buf.Length);

                if (length <= 0)
                {
                    // End of stream.
                    break;
                }

                Append(new string(buf, 0, length));
            }
        }

        /// <summary>
        /// Appends <paramref name="text"/> after replacing URLs and e-mail addresses
        /// with a space, normalizing every character through <c>NGram.Normalize</c>
        /// and collapsing runs of spaces into one.
        /// </summary>
        /// <param name="text">Text to add; only the first <c>_maxTextLength</c>
        /// characters (after URL/e-mail removal) are considered.</param>
        public void Append(string text)
        {
            text = _UrlRegex.Replace(text, " ");
            text = _MailRegex.Replace(text, " ");

            char pre = '\0';

            for (int i = 0; i < text.Length && i < _maxTextLength; i++)
            {
                char c = NGram.Normalize(text[i]);

                // Skip a space that directly follows another space.
                if (c != ' ' || pre != ' ')
                {
                    _text.Append(c);
                }

                pre = c;
            }
        }

        /// <summary>
        /// Returns the name of the most probable language.
        /// </summary>
        /// <returns>
        /// The best language name, or <c>null</c> when no language exceeds the
        /// probability threshold.
        /// </returns>
        /// <exception cref="NLangDetectException">The text yields no known n-grams.</exception>
        public string Detect()
        {
            List<Language> probabilities = GetProbabilities();

            return probabilities.Count > 0
                ? probabilities[0].Name
                : null;
        }

        /// <summary>
        /// Returns the languages whose probability exceeds the threshold, ordered
        /// from most to least probable. Runs the detection on first use.
        /// </summary>
        /// <returns>Candidate languages, best first; possibly empty.</returns>
        /// <exception cref="NLangDetectException">The text yields no known n-grams.</exception>
        public List<Language> GetProbabilities()
        {
            if (_langprob == null)
            {
                DetectBlock();
            }

            return SortProbability(_langprob);
        }

        #endregion

        #region Private helper methods

        /// <summary>
        /// Drops the characters in the 'A'..'z' range from the buffered text when
        /// they are outnumbered more than two to one by characters at or above
        /// U+0300 that are outside the Latin Extended Additional block.
        /// </summary>
        private void CleanText()
        {
            int latinCount = 0, nonLatinCount = 0;

            for (int i = 0; i < _text.Length; i++)
            {
                char c = _text[i];

                if (c <= 'z' && c >= 'A')
                {
                    latinCount++;
                }
                else if (c >= '\u0300' && c.GetUnicodeBlock() != UnicodeBlock.LatinExtendedAdditional)
                {
                    nonLatinCount++;
                }
            }

            if (latinCount * 2 < nonLatinCount)
            {
                var textWithoutLatin = new StringBuilder();

                for (int i = 0; i < _text.Length; i++)
                {
                    char c = _text[i];

                    if (c > 'z' || c < 'A')
                    {
                        textWithoutLatin.Append(c);
                    }
                }

                _text = textWithoutLatin;
            }
        }

        /// <summary>
        /// Scales <paramref name="probs"/> in place so that it sums to one.
        /// </summary>
        /// <param name="probs">Values to normalize.</param>
        /// <returns>The largest normalized value, or 0 when the sum is zero.</returns>
        private static double NormalizeProb(double[] probs)
        {
            double sump = probs.Sum();

            if (sump == 0)
            {
                // Dividing would fill the array with NaN, which would then poison
                // the averaged result of every trial. Leave the values as they are.
                return 0;
            }

            double maxp = 0;

            for (int i = 0; i < probs.Length; i++)
            {
                double p = probs[i] / sump;

                if (maxp < p)
                {
                    maxp = p;
                }

                probs[i] = p;
            }

            return maxp;
        }

        /// <summary>
        /// Replaces every character at or above U+0080 with a <c>\u</c> escape and
        /// copies all other characters unchanged. Not called from within this class.
        /// </summary>
        /// <param name="word">Text to escape.</param>
        /// <returns>The escaped text.</returns>
        private static string UnicodeEncode(string word)
        {
            var resultSb = new StringBuilder();

            foreach (char ch in word)
            {
                if (ch >= '\u0080')
                {
                    // 0x10000 + ch always formats to five hex digits, so no padding
                    // is needed before the leading '1' is cut off.
                    string st = string.Format("{0:x}", 0x10000 + ch);

                    resultSb
                        .Append("\\u")
                        .Append(st.SubSequence(1, 5));
                }
                else
                {
                    resultSb.Append(ch);
                }
            }

            return resultSb.ToString();
        }

        /// <summary>
        /// Runs <c>_trialsCount</c> randomized trials over the n-grams of the buffered
        /// text and stores the averaged per-language probabilities in <c>_langprob</c>.
        /// </summary>
        /// <exception cref="NLangDetectException">The text yields no known n-grams.</exception>
        private void DetectBlock()
        {
            CleanText();

            List<string> ngrams = ExtractNGrams();

            if (ngrams.Count == 0)
            {
                throw new NLangDetectException("no features in text", ErrorCode.CantDetectError);
            }

            _langprob = new double[_langlist.Count];

            Random rand = _seed.HasValue ? new Random(_seed.Value) : new Random();

            for (int t = 0; t < _trialsCount; t++)
            {
                double[] prob = InitProbability();

                // TODO IMM HI: verify it works
                double alpha = _alpha + rand.NextGaussian() * _AlphaWidth;

                for (int i = 0; ; i++)
                {
                    int r = rand.Next(ngrams.Count);

                    UpdateLangProb(prob, ngrams[r], alpha);

                    // Normalize and test for convergence only every fifth iteration;
                    // the loop always exits right after a normalization.
                    if (i % 5 == 0)
                    {
                        if (NormalizeProb(prob) > _ConvThreshold || i >= _IterationLimit)
                        {
                            break;
                        }
                    }
                }

                for (int j = 0; j < _langprob.Length; j++)
                {
                    _langprob[j] += prob[j] / _trialsCount;
                }
            }
        }

        /// <summary>
        /// Creates the starting distribution for one trial: a copy of the configured
        /// prior, or a uniform distribution when no prior has been set.
        /// </summary>
        /// <returns>A new array with one entry per language.</returns>
        private double[] InitProbability()
        {
            var prob = new double[_langlist.Count];

            if (_priorMap != null)
            {
                for (int i = 0; i < prob.Length; i++)
                {
                    prob[i] = _priorMap[i];
                }
            }
            else
            {
                for (int i = 0; i < prob.Length; i++)
                {
                    prob[i] = 1.0 / _langlist.Count;
                }
            }

            return prob;
        }

        /// <summary>
        /// Collects every n-gram (lengths 1 to <c>NGram.GramsCount</c>) of the buffered
        /// text that has an entry in the word/language probability map.
        /// </summary>
        /// <returns>The known n-grams in the order they occur.</returns>
        private List<string> ExtractNGrams()
        {
            var list = new List<string>();
            NGram ngram = new NGram();

            for (int i = 0; i < _text.Length; i++)
            {
                ngram.AddChar(_text[i]);

                for (int n = 1; n <= NGram.GramsCount; n++)
                {
                    string w = ngram.Get(n);

                    if (w != null && _wordLangProbMap.ContainsKey(w))
                    {
                        list.Add(w);
                    }
                }
            }

            return list;
        }

        /// <summary>
        /// Multiplies each language probability by the smoothed probability of
        /// <paramref name="word"/> for that language. Unknown words are ignored.
        /// </summary>
        /// <param name="prob">Per-language probabilities, updated in place.</param>
        /// <param name="word">N-gram to apply; may be null.</param>
        /// <param name="alpha">Smoothing value for this trial.</param>
        private void UpdateLangProb(double[] prob, string word, double alpha)
        {
            ProbVector langProbMap;

            if (word == null || !_wordLangProbMap.TryGetValue(word, out langProbMap))
            {
                return;
            }

            double weight = alpha / _BaseFreq;

            for (int i = 0; i < prob.Length; i++)
            {
                prob[i] *= weight + langProbMap[i];
            }
        }

        /// <summary>
        /// Builds the list of languages whose probability exceeds
        /// <c>_ProbThreshold</c>, ordered from most to least probable.
        /// </summary>
        /// <param name="prob">Per-language probabilities, indexed like the language list.</param>
        /// <returns>The qualifying languages, best first.</returns>
        private List<Language> SortProbability(double[] prob)
        {
            var list = new List<Language>();

            for (int j = 0; j < prob.Length; j++)
            {
                double p = prob[j];

                if (p > _ProbThreshold)
                {
                    // Insert before the first entry with a strictly lower probability,
                    // so ties keep language-list order.
                    for (int i = 0; i <= list.Count; i++)
                    {
                        if (i == list.Count || list[i].Probability < p)
                        {
                            list.Insert(i, new Language(_langlist[j], p));

                            break;
                        }
                    }
                }
            }

            return list;
        }

        #endregion
    }
}
