aboutsummaryrefslogtreecommitdiff
path: root/Emby.Server.Implementations/TextEncoding/NLangDetect/Utils/LangProfile.cs
blob: 78b44e1fcc416ba9d1357f998b542ca64196310b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace NLangDetect.Core.Utils
{
    /// <summary>
    /// Holds n-gram occurrence counts for one named language profile and
    /// provides pruning of infrequent or out-of-script grams.
    /// </summary>
    public class LangProfile
    {
        // Lower bound for the pruning threshold used by OmitLessFreq.
        private const int MinimumFreq = 2;

        // The pruning threshold is n_words[0] divided by this ratio (floored at MinimumFreq).
        private const int LessFreqRatio = 100000;

        // Compiled once for the whole type instead of on every OmitLessFreq call;
        // Regex matching methods are safe to share across threads.
        // Matches a gram that consists of exactly one ASCII letter.
        private static readonly Regex SingleRomanLetterRegex = new Regex("^[A-Za-z]$", RegexOptions.Compiled);

        // Matches a gram that contains at least one ASCII letter.
        private static readonly Regex ContainsRomanLetterRegex = new Regex(".*[A-Za-z].*", RegexOptions.Compiled);

        /// <summary>
        /// Profile name. While it is null, <see cref="Add"/> and
        /// <see cref="OmitLessFreq"/> do nothing.
        /// </summary>
        public string name { get; set; }

        /// <summary>
        /// Occurrence count per gram.
        /// </summary>
        public Dictionary<string, int> freq { get; set; }

        /// <summary>
        /// Total occurrences per gram length; the entry at index <c>length - 1</c>
        /// counts grams of that length.
        /// </summary>
        public int[] n_words { get; set; }

        #region Constructor(s)

        /// <summary>
        /// Creates an unnamed profile with empty counters.
        /// </summary>
        public LangProfile()
        {
            freq = new Dictionary<string, int>();
            n_words = new int[NGram.GramsCount];
        }

        /// <summary>
        /// Creates an empty profile with the given name.
        /// </summary>
        /// <param name="name">Name to assign to the profile.</param>
        public LangProfile(string name)
            : this()
        {
            this.name = name;
        }

        #endregion

        #region Public methods

        /// <summary>
        /// Records one occurrence of <paramref name="gram"/>.
        /// The call is ignored when the profile has no name, when the gram is null,
        /// or when its length is outside 1..<c>NGram.GramsCount</c>.
        /// </summary>
        /// <param name="gram">The n-gram to count.</param>
        public void Add(string gram)
        {
            if (name == null || gram == null)
            {
                return; // Illegal
            }

            int len = gram.Length;
            if (len < 1 || len > NGram.GramsCount)
            {
                return; // Illegal
            }

            n_words[len - 1]++;

            // TryGetValue leaves 'current' at 0 for an unseen gram, so a single
            // lookup plus one indexer write covers both the insert and the increment.
            int current;
            freq.TryGetValue(gram, out current);
            freq[gram] = current + 1;
        }

        /// <summary>
        /// Prunes the profile in two passes. First, every gram whose count is at or
        /// below the threshold (<c>n_words[0] / LessFreqRatio</c>, but at least
        /// <c>MinimumFreq</c>) is removed. Second, if the surviving single ASCII-letter
        /// grams account for less than a third of <c>n_words[0]</c>, every remaining
        /// gram containing an ASCII letter is removed as well. Removed counts are
        /// subtracted from <see cref="n_words"/>. Does nothing when the profile has no name.
        /// </summary>
        public void OmitLessFreq()
        {
            if (name == null)
            {
                return; // Illegal
            }

            int threshold = n_words[0] / LessFreqRatio;
            if (threshold < MinimumFreq)
            {
                threshold = MinimumFreq;
            }

            int roman = 0;
            var keysToRemove = new List<string>();

            // Removals are deferred: a dictionary must not be modified while enumerated.
            foreach (KeyValuePair<string, int> entry in freq)
            {
                if (entry.Value <= threshold)
                {
                    n_words[entry.Key.Length - 1] -= entry.Value;
                    keysToRemove.Add(entry.Key);
                }
                else if (SingleRomanLetterRegex.IsMatch(entry.Key))
                {
                    roman += entry.Value;
                }
            }

            RemoveKeys(keysToRemove);

            // roman check: n_words[0] here already reflects the pruning above.
            if (roman >= n_words[0] / 3)
            {
                return;
            }

            keysToRemove.Clear();

            foreach (KeyValuePair<string, int> entry in freq)
            {
                if (ContainsRomanLetterRegex.IsMatch(entry.Key))
                {
                    n_words[entry.Key.Length - 1] -= entry.Value;
                    keysToRemove.Add(entry.Key);
                }
            }

            RemoveKeys(keysToRemove);
        }

        #endregion

        // Deletes each of the given grams from the frequency table.
        private void RemoveKeys(List<string> keysToRemove)
        {
            foreach (string keyToRemove in keysToRemove)
            {
                freq.Remove(keyToRemove);
            }
        }
    }
}