4 files changed, 581 insertions, 0 deletions
diff --git a/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/CharExtensions.cs b/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/CharExtensions.cs
new file mode 100644
index 0000000000..59076bd664
--- /dev/null
+++ b/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/CharExtensions.cs
@@ -0,0 +1,374 @@
+using System;
+
+namespace NLangDetect.Core.Extensions
+{
+  public static class CharExtensions
+  {
+    private const int MIN_CODE_POINT = 0x000000; // lowest value accepted by IsValidCodePoint (inclusive)
+    private const int MAX_CODE_POINT = 0x10ffff; // highest value accepted by IsValidCodePoint (inclusive)
+
+    private static readonly int[] _unicodeBlockStarts =
+      {
+        #region Unicode block starts
+        // Sorted ascending. Entry i is the first code point of the range described by _unicodeBlocks[i]; rows commented "unassigned" pair with a null entry there.
+        0x0000, // Basic Latin
+        0x0080, // Latin-1 Supplement
+        0x0100, // Latin Extended-A
+        0x0180, // Latin Extended-B
+        0x0250, // IPA Extensions
+        0x02B0, // Spacing Modifier Letters
+        0x0300, // Combining Diacritical Marks
+        0x0370, // Greek and Coptic
+        0x0400, // Cyrillic
+        0x0500, // Cyrillic Supplementary
+        0x0530, // Armenian
+        0x0590, // Hebrew
+        0x0600, // Arabic
+        0x0700, // Syriac
+        0x0750, // unassigned
+        0x0780, // Thaana
+        0x07C0, // unassigned
+        0x0900, // Devanagari
+        0x0980, // Bengali
+        0x0A00, // Gurmukhi
+        0x0A80, // Gujarati
+        0x0B00, // Oriya
+        0x0B80, // Tamil
+        0x0C00, // Telugu
+        0x0C80, // Kannada
+        0x0D00, // Malayalam
+        0x0D80, // Sinhala
+        0x0E00, // Thai
+        0x0E80, // Lao
+        0x0F00, // Tibetan
+        0x1000, // Myanmar
+        0x10A0, // Georgian
+        0x1100, // Hangul Jamo
+        0x1200, // Ethiopic
+        0x1380, // unassigned
+        0x13A0, // Cherokee
+        0x1400, // Unified Canadian Aboriginal Syllabics
+        0x1680, // Ogham
+        0x16A0, // Runic
+        0x1700, // Tagalog
+        0x1720, // Hanunoo
+        0x1740, // Buhid
+        0x1760, // Tagbanwa
+        0x1780, // Khmer
+        0x1800, // Mongolian
+        0x18B0, // unassigned
+        0x1900, // Limbu
+        0x1950, // Tai Le
+        0x1980, // unassigned
+        0x19E0, // Khmer Symbols
+        0x1A00, // unassigned
+        0x1D00, // Phonetic Extensions
+        0x1D80, // unassigned
+        0x1E00, // Latin Extended Additional
+        0x1F00, // Greek Extended
+        0x2000, // General Punctuation
+        0x2070, // Superscripts and Subscripts
+        0x20A0, // Currency Symbols
+        0x20D0, // Combining Diacritical Marks for Symbols
+        0x2100, // Letterlike Symbols
+        0x2150, // Number Forms
+        0x2190, // Arrows
+        0x2200, // Mathematical Operators
+        0x2300, // Miscellaneous Technical
+        0x2400, // Control Pictures
+        0x2440, // Optical Character Recognition
+        0x2460, // Enclosed Alphanumerics
+        0x2500, // Box Drawing
+        0x2580, // Block Elements
+        0x25A0, // Geometric Shapes
+        0x2600, // Miscellaneous Symbols
+        0x2700, // Dingbats
+        0x27C0, // Miscellaneous Mathematical Symbols-A
+        0x27F0, // Supplemental Arrows-A
+        0x2800, // Braille Patterns
+        0x2900, // Supplemental Arrows-B
+        0x2980, // Miscellaneous Mathematical Symbols-B
+        0x2A00, // Supplemental Mathematical Operators
+        0x2B00, // Miscellaneous Symbols and Arrows
+        0x2C00, // unassigned
+        0x2E80, // CJK Radicals Supplement
+        0x2F00, // Kangxi Radicals
+        0x2FE0, // unassigned
+        0x2FF0, // Ideographic Description Characters
+        0x3000, // CJK Symbols and Punctuation
+        0x3040, // Hiragana
+        0x30A0, // Katakana
+        0x3100, // Bopomofo
+        0x3130, // Hangul Compatibility Jamo
+        0x3190, // Kanbun
+        0x31A0, // Bopomofo Extended
+        0x31C0, // unassigned
+        0x31F0, // Katakana Phonetic Extensions
+        0x3200, // Enclosed CJK Letters and Months
+        0x3300, // CJK Compatibility
+        0x3400, // CJK Unified Ideographs Extension A
+        0x4DC0, // Yijing Hexagram Symbols
+        0x4E00, // CJK Unified Ideographs
+        0xA000, // Yi Syllables
+        0xA490, // Yi Radicals
+        0xA4D0, // unassigned
+        0xAC00, // Hangul Syllables
+        0xD7B0, // unassigned
+        0xD800, // High Surrogates
+        0xDB80, // High Private Use Surrogates
+        0xDC00, // Low Surrogates
+        0xE000, // Private Use
+        0xF900, // CJK Compatibility Ideographs
+        0xFB00, // Alphabetic Presentation Forms
+        0xFB50, // Arabic Presentation Forms-A
+        0xFE00, // Variation Selectors
+        0xFE10, // unassigned
+        0xFE20, // Combining Half Marks
+        0xFE30, // CJK Compatibility Forms
+        0xFE50, // Small Form Variants
+        0xFE70, // Arabic Presentation Forms-B
+        0xFF00, // Halfwidth and Fullwidth Forms
+        0xFFF0, // Specials
+        0x10000, // Linear B Syllabary
+        0x10080, // Linear B Ideograms
+        0x10100, // Aegean Numbers
+        0x10140, // unassigned
+        0x10300, // Old Italic
+        0x10330, // Gothic
+        0x10350, // unassigned
+        0x10380, // Ugaritic
+        0x103A0, // unassigned
+        0x10400, // Deseret
+        0x10450, // Shavian
+        0x10480, // Osmanya
+        0x104B0, // unassigned
+        0x10800, // Cypriot Syllabary
+        0x10840, // unassigned
+        0x1D000, // Byzantine Musical Symbols
+        0x1D100, // Musical Symbols
+        0x1D200, // unassigned
+        0x1D300, // Tai Xuan Jing Symbols
+        0x1D360, // unassigned
+        0x1D400, // Mathematical Alphanumeric Symbols
+        0x1D800, // unassigned
+        0x20000, // CJK Unified Ideographs Extension B
+        0x2A6E0, // unassigned
+        0x2F800, // CJK Compatibility Ideographs Supplement
+        0x2FA20, // unassigned
+        0xE0000, // Tags
+        0xE0080, // unassigned
+        0xE0100, // Variation Selectors Supplement
+        0xE01F0, // unassigned
+        0xF0000, // Supplementary Private Use Area-A
+        0x100000, // Supplementary Private Use Area-B
+
+        #endregion
+      };
+
+    private static readonly UnicodeBlock?[] _unicodeBlocks = // parallel to _unicodeBlockStarts: entry i is the block beginning at _unicodeBlockStarts[i]; null marks a range that has no block
+      {
+        #region Unicode blocks
+        UnicodeBlock.BasicLatin,
+        UnicodeBlock.Latin1Supplement,
+        UnicodeBlock.LatinExtendedA,
+        UnicodeBlock.LatinExtendedB,
+        UnicodeBlock.IpaExtensions,
+        UnicodeBlock.SpacingModifierLetters,
+        UnicodeBlock.CombiningDiacriticalMarks,
+        UnicodeBlock.Greek,
+        UnicodeBlock.Cyrillic,
+        UnicodeBlock.CyrillicSupplementary,
+        UnicodeBlock.Armenian,
+        UnicodeBlock.Hebrew,
+        UnicodeBlock.Arabic,
+        UnicodeBlock.Syriac,
+        null,
+        UnicodeBlock.Thaana,
+        null,
+        UnicodeBlock.Devanagari,
+        UnicodeBlock.Bengali,
+        UnicodeBlock.Gurmukhi,
+        UnicodeBlock.Gujarati,
+        UnicodeBlock.Oriya,
+        UnicodeBlock.Tamil,
+        UnicodeBlock.Telugu,
+        UnicodeBlock.Kannada,
+        UnicodeBlock.Malayalam,
+        UnicodeBlock.Sinhala,
+        UnicodeBlock.Thai,
+        UnicodeBlock.Lao,
+        UnicodeBlock.Tibetan,
+        UnicodeBlock.Myanmar,
+        UnicodeBlock.Georgian,
+        UnicodeBlock.HangulJamo,
+        UnicodeBlock.Ethiopic,
+        null,
+        UnicodeBlock.Cherokee,
+        UnicodeBlock.UnifiedCanadianAboriginalSyllabics,
+        UnicodeBlock.Ogham,
+        UnicodeBlock.Runic,
+        UnicodeBlock.Tagalog,
+        UnicodeBlock.Hanunoo,
+        UnicodeBlock.Buhid,
+        UnicodeBlock.Tagbanwa,
+        UnicodeBlock.Khmer,
+        UnicodeBlock.Mongolian,
+        null,
+        UnicodeBlock.Limbu,
+        UnicodeBlock.TaiLe,
+        null,
+        UnicodeBlock.KhmerSymbols,
+        null,
+        UnicodeBlock.PhoneticExtensions,
+        null,
+        UnicodeBlock.LatinExtendedAdditional,
+        UnicodeBlock.GreekExtended,
+        UnicodeBlock.GeneralPunctuation,
+        UnicodeBlock.SuperscriptsAndSubscripts,
+        UnicodeBlock.CurrencySymbols,
+        UnicodeBlock.CombiningMarksForSymbols,
+        UnicodeBlock.LetterlikeSymbols,
+        UnicodeBlock.NumberForms,
+        UnicodeBlock.Arrows,
+        UnicodeBlock.MathematicalOperators,
+        UnicodeBlock.MiscellaneousTechnical,
+        UnicodeBlock.ControlPictures,
+        UnicodeBlock.OpticalCharacterRecognition,
+        UnicodeBlock.EnclosedAlphanumerics,
+        UnicodeBlock.BoxDrawing,
+        UnicodeBlock.BlockElements,
+        UnicodeBlock.GeometricShapes,
+        UnicodeBlock.MiscellaneousSymbols,
+        UnicodeBlock.Dingbats,
+        UnicodeBlock.MiscellaneousMathematicalSymbolsA,
+        UnicodeBlock.SupplementalArrowsA,
+        UnicodeBlock.BraillePatterns,
+        UnicodeBlock.SupplementalArrowsB,
+        UnicodeBlock.MiscellaneousMathematicalSymbolsB,
+        UnicodeBlock.SupplementalMathematicalOperators,
+        UnicodeBlock.MiscellaneousSymbolsAndArrows,
+        null,
+        UnicodeBlock.CjkRadicalsSupplement,
+        UnicodeBlock.KangxiRadicals,
+        null,
+        UnicodeBlock.IdeographicDescriptionCharacters,
+        UnicodeBlock.CjkSymbolsAndPunctuation,
+        UnicodeBlock.Hiragana,
+        UnicodeBlock.Katakana,
+        UnicodeBlock.Bopomofo,
+        UnicodeBlock.HangulCompatibilityJamo,
+        UnicodeBlock.Kanbun,
+        UnicodeBlock.BopomofoExtended,
+        null,
+        UnicodeBlock.KatakanaPhoneticExtensions,
+        UnicodeBlock.EnclosedCjkLettersAndMonths,
+        UnicodeBlock.CjkCompatibility,
+        UnicodeBlock.CjkUnifiedIdeographsExtensionA,
+        UnicodeBlock.YijingHexagramSymbols,
+        UnicodeBlock.CjkUnifiedIdeographs,
+        UnicodeBlock.YiSyllables,
+        UnicodeBlock.YiRadicals,
+        null,
+        UnicodeBlock.HangulSyllables,
+        null,
+        UnicodeBlock.HighSurrogates,
+        UnicodeBlock.HighPrivateUseSurrogates,
+        UnicodeBlock.LowSurrogates,
+        UnicodeBlock.PrivateUseArea,
+        UnicodeBlock.CjkCompatibilityIdeographs,
+        UnicodeBlock.AlphabeticPresentationForms,
+        UnicodeBlock.ArabicPresentationFormsA,
+        UnicodeBlock.VariationSelectors,
+        null,
+        UnicodeBlock.CombiningHalfMarks,
+        UnicodeBlock.CjkCompatibilityForms,
+        UnicodeBlock.SmallFormVariants,
+        UnicodeBlock.ArabicPresentationFormsB,
+        UnicodeBlock.HalfwidthAndFullwidthForms,
+        UnicodeBlock.Specials,
+        UnicodeBlock.LinearBSyllabary,
+        UnicodeBlock.LinearBIdeograms,
+        UnicodeBlock.AegeanNumbers,
+        null,
+        UnicodeBlock.OldItalic,
+        UnicodeBlock.Gothic,
+        null,
+        UnicodeBlock.Ugaritic,
+        null,
+        UnicodeBlock.Deseret,
+        UnicodeBlock.Shavian,
+        UnicodeBlock.Osmanya,
+        null,
+        UnicodeBlock.CypriotSyllabary,
+        null,
+        UnicodeBlock.ByzantineMusicalSymbols,
+        UnicodeBlock.MusicalSymbols,
+        null,
+        UnicodeBlock.TaiXuanJingSymbols,
+        null,
+        UnicodeBlock.MathematicalAlphanumericSymbols,
+        null,
+        UnicodeBlock.CjkUnifiedIdeographsExtensionB,
+        null,
+        UnicodeBlock.CjkCompatibilityIdeographsSupplement,
+        null,
+        UnicodeBlock.Tags,
+        null,
+        UnicodeBlock.VariationSelectorsSupplement,
+        null,
+        UnicodeBlock.SupplementaryPrivateUseAreaA,
+        UnicodeBlock.SupplementaryPrivateUseAreaB,
+
+        #endregion
+      };
+
+    #region Public methods
+
+    /// <summary>
+    /// Finds the Unicode block whose range contains the given UTF-16 code unit.
+    /// </summary>
+    /// <param name="ch">the character to classify</param>
+    /// <returns>
+    /// the containing block, or null when the character falls in a range that has no block in the lookup table
+    /// </returns>
+    /// <exception cref="ArgumentException">if the value lies outside the valid code point range (a char never exceeds 0xFFFF, so this guard is purely defensive)</exception>
+    /// <remarks>
+    /// Taken from JDK source: http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b14/java/lang/Character.java#Character.UnicodeBlock.0LATIN_EXTENDED_ADDITIONAL
+    /// </remarks>
+    public static UnicodeBlock? GetUnicodeBlock(this char ch)
+    {
+      int codePoint = ch;
+
+      if (!IsValidCodePoint(codePoint))
+      {
+        throw new ArgumentException("Argument is not a valid code point.", "ch");
+      }
+
+      // The block starts are sorted, so a binary search locates the range:
+      //  - a non-negative result means codePoint is exactly the first code point of a range;
+      //  - a negative result is the bitwise complement of the index of the first start that is
+      //    greater than codePoint, so the containing range is the entry just before it.
+      int position = Array.BinarySearch(_unicodeBlockStarts, codePoint);
+
+      if (position < 0)
+      {
+        position = (~position) - 1;
+      }
+
+      // codePoint is never below _unicodeBlockStarts[0] (0x0000), so position cannot be negative here.
+      return _unicodeBlocks[position];
+    }
+
+
+    #endregion
+
+    #region Private helper methods
+
+    private static bool IsValidCodePoint(int codePoint)
+    {
+      return MIN_CODE_POINT <= codePoint && codePoint <= MAX_CODE_POINT; // true when inside [MIN_CODE_POINT, MAX_CODE_POINT], both bounds included
+    }
+
+    #endregion
+  }
+}
diff --git a/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/RandomExtensions.cs b/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/RandomExtensions.cs
new file mode 100644
index 0000000000..d55ca80df6
--- /dev/null
+++ b/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/RandomExtensions.cs
@@ -0,0 +1,51 @@
+using System;
+
+namespace NLangDetect.Core.Extensions
+{
+  public static class RandomExtensions
+  {
+    private const double _Epsilon = 2.22044604925031E-15;
+
+    private sealed class SpareGaussian { public double Value; public bool HasValue; } // cached second value of a generated pair, one holder per Random
+
+    // Weakly keyed by generator: every Random owns its spare value, so one generator's leftover never leaks into another generator's sequence.
+    private static readonly System.Runtime.CompilerServices.ConditionalWeakTable<Random, SpareGaussian> _spares = new System.Runtime.CompilerServices.ConditionalWeakTable<Random, SpareGaussian>();
+
+    /// <summary>
+    /// Returns the next pseudorandom, Gaussian ("normally") distributed double value with mean 0.0 and standard deviation 1.0 from this random number generator's sequence.
+    /// Values are generated in pairs; the second value of a pair is cached for the same <paramref name="random"/> instance and returned by its next call.
+    /// </summary>
+    /// <param name="random">the generator supplying the uniform samples; must not be null</param>
+    /// <returns>the next Gaussian value derived from <paramref name="random"/></returns>
+    /// <exception cref="ArgumentNullException">if random is null</exception>
+    /// <remarks>Taken from: http://download.oracle.com/javase/6/docs/api/java/util/Random.html (nextGaussian())</remarks>
+    public static double NextGaussian(this Random random)
+    {
+      if (random == null) throw new ArgumentNullException("random");
+      SpareGaussian spare = _spares.GetOrCreateValue(random);
+
+      lock (spare)
+      {
+        if (spare.HasValue)
+        {
+          spare.HasValue = false;
+          return spare.Value;
+        }
+
+        double v1, v2, s;
+        do
+        {
+          v1 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
+          v2 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
+          s = v1 * v1 + v2 * v2;
+        }
+        while (s >= 1.0 || s < _Epsilon); // retry unless the point lies inside the unit circle and clear of the origin (s is a sum of squares, so never negative)
+
+        double multiplier = Math.Sqrt(-2.0 * Math.Log(s) / s);
+        spare.Value = v2 * multiplier;
+        spare.HasValue = true;
+        return v1 * multiplier;
+      }
+    }
+  }
+}
diff --git a/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/StringExtensions.cs b/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/StringExtensions.cs
new file mode 100644
index 0000000000..fc6c58a95b
--- /dev/null
+++ b/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/StringExtensions.cs
@@ -0,0 +1,25 @@
+using System;
+
+namespace NLangDetect.Core.Extensions
+{
+  public static class StringExtensions
+  {
+    /// <summary>Returns the part of this string that starts at index start and ends at index end - 1. The length of the result is end - start, so if start == end an empty string is returned.</summary>
+    /// <param name="s">the source string; must not be null</param>
+    /// <param name="start">the start index, inclusive</param>
+    /// <param name="end">the end index, exclusive</param>
+    /// <returns>the specified subsequence</returns>
+    /// <exception cref="ArgumentNullException">if s is null</exception>
+    /// <exception cref="ArgumentOutOfRangeException">if start or end are negative, if end is greater than the string's length, or if start is greater than end</exception>
+    public static string SubSequence(this string s, int start, int end)
+    {
+      if (s == null) throw new ArgumentNullException("s");
+      if (start < 0) throw new ArgumentOutOfRangeException("start", "Argument must not be negative.");
+      if (end < 0) throw new ArgumentOutOfRangeException("end", "Argument must not be negative.");
+      if (end > s.Length) throw new ArgumentOutOfRangeException("end", "Argument must not be greater than the input string's length.");
+      if (start > end) throw new ArgumentOutOfRangeException("start", "Argument must not be greater than the 'end' argument.");
+
+      return s.Substring(start, end - start);
+    }
+  }
+}
diff --git a/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/UnicodeBlock.cs b/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/UnicodeBlock.cs
new file mode 100644
index 0000000000..71b5de75e1
--- /dev/null
+++ b/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/UnicodeBlock.cs
@@ -0,0 +1,131 @@
+namespace NLangDetect.Core.Extensions
+{
+  public enum UnicodeBlock // named Unicode blocks; the value type returned (as a nullable) by CharExtensions.GetUnicodeBlock
+  {
+    BasicLatin,
+    Latin1Supplement,
+    LatinExtendedA,
+    LatinExtendedB,
+    IpaExtensions,
+    SpacingModifierLetters,
+    CombiningDiacriticalMarks,
+    Greek, // listed as "Greek and Coptic" in the CharExtensions lookup table
+    Cyrillic,
+    CyrillicSupplementary,
+    Armenian,
+    Hebrew,
+    Arabic,
+    Syriac,
+    Thaana,
+    Devanagari,
+    Bengali,
+    Gurmukhi,
+    Gujarati,
+    Oriya,
+    Tamil,
+    Telugu,
+    Kannada,
+    Malayalam,
+    Sinhala,
+    Thai,
+    Lao,
+    Tibetan,
+    Myanmar,
+    Georgian,
+    HangulJamo,
+    Ethiopic,
+    Cherokee,
+    UnifiedCanadianAboriginalSyllabics,
+    Ogham,
+    Runic,
+    Tagalog,
+    Hanunoo,
+    Buhid,
+    Tagbanwa,
+    Khmer,
+    Mongolian,
+    Limbu,
+    TaiLe,
+    KhmerSymbols,
+    PhoneticExtensions,
+    LatinExtendedAdditional,
+    GreekExtended,
+    GeneralPunctuation,
+    SuperscriptsAndSubscripts,
+    CurrencySymbols,
+    CombiningMarksForSymbols, // listed as "Combining Diacritical Marks for Symbols" in the CharExtensions lookup table
+    LetterlikeSymbols,
+    NumberForms,
+    Arrows,
+    MathematicalOperators,
+    MiscellaneousTechnical,
+    ControlPictures,
+    OpticalCharacterRecognition,
+    EnclosedAlphanumerics,
+    BoxDrawing,
+    BlockElements,
+    GeometricShapes,
+    MiscellaneousSymbols,
+    Dingbats,
+    MiscellaneousMathematicalSymbolsA,
+    SupplementalArrowsA,
+    BraillePatterns,
+    SupplementalArrowsB,
+    MiscellaneousMathematicalSymbolsB,
+    SupplementalMathematicalOperators,
+    MiscellaneousSymbolsAndArrows,
+    CjkRadicalsSupplement,
+    KangxiRadicals,
+    IdeographicDescriptionCharacters,
+    CjkSymbolsAndPunctuation,
+    Hiragana,
+    Katakana,
+    Bopomofo,
+    HangulCompatibilityJamo,
+    Kanbun,
+    BopomofoExtended,
+    KatakanaPhoneticExtensions,
+    EnclosedCjkLettersAndMonths,
+    CjkCompatibility,
+    CjkUnifiedIdeographsExtensionA,
+    YijingHexagramSymbols,
+    CjkUnifiedIdeographs,
+    YiSyllables,
+    YiRadicals,
+    HangulSyllables,
+    HighSurrogates,
+    HighPrivateUseSurrogates,
+    LowSurrogates,
+    PrivateUseArea, // listed as "Private Use" in the CharExtensions lookup table
+    CjkCompatibilityIdeographs,
+    AlphabeticPresentationForms,
+    ArabicPresentationFormsA,
+    VariationSelectors,
+    CombiningHalfMarks,
+    CjkCompatibilityForms,
+    SmallFormVariants,
+    ArabicPresentationFormsB,
+    HalfwidthAndFullwidthForms,
+    Specials,
+    LinearBSyllabary,
+    LinearBIdeograms,
+    AegeanNumbers,
+    OldItalic,
+    Gothic,
+    Ugaritic,
+    Deseret,
+    Shavian,
+    Osmanya,
+    CypriotSyllabary,
+    ByzantineMusicalSymbols,
+    MusicalSymbols,
+    TaiXuanJingSymbols,
+    MathematicalAlphanumericSymbols,
+    CjkUnifiedIdeographsExtensionB,
+    CjkCompatibilityIdeographsSupplement,
+    Tags,
+    VariationSelectorsSupplement,
+    SupplementaryPrivateUseAreaA,
+    SupplementaryPrivateUseAreaB,
+  }
+}