aboutsummaryrefslogtreecommitdiff
path: root/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions
diff options
context:
space:
mode:
Diffstat (limited to 'Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions')
-rw-r--r-- Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/CharExtensions.cs | 374
-rw-r--r-- Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/RandomExtensions.cs | 51
-rw-r--r-- Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/StringExtensions.cs | 25
-rw-r--r-- Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/UnicodeBlock.cs | 131
4 files changed, 581 insertions, 0 deletions
diff --git a/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/CharExtensions.cs b/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/CharExtensions.cs
new file mode 100644
index 000000000..59076bd66
--- /dev/null
+++ b/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/CharExtensions.cs
@@ -0,0 +1,374 @@
+using System;
+
+namespace NLangDetect.Core.Extensions
+{
+ public static class CharExtensions
+ {
+ // Inclusive lower bound accepted by IsValidCodePoint.
+ private const int MIN_CODE_POINT = 0x000000;
+ // Inclusive upper bound accepted by IsValidCodePoint (U+10FFFF).
+ private const int MAX_CODE_POINT = 0x10ffff;
+
+ // First code point of each range, in ascending order. GetUnicodeBlock binary-searches this
+ // table and uses the resulting index to read _unicodeBlocks, so both arrays must keep the
+ // same length and order. Ranges commented "unassigned" have a null entry in _unicodeBlocks.
+ // The entries from 0x10000 upwards cannot be reached through GetUnicodeBlock(char), because
+ // a char never exceeds 0xFFFF.
+ // NOTE(review): the ranges appear to follow the JDK 6 source linked in GetUnicodeBlock's
+ // remarks, so "unassigned" reflects that table rather than the current Unicode standard.
+ private static readonly int[] _unicodeBlockStarts =
+ {
+ #region Unicode block starts
+
+ 0x0000, // Basic Latin
+ 0x0080, // Latin-1 Supplement
+ 0x0100, // Latin Extended-A
+ 0x0180, // Latin Extended-B
+ 0x0250, // IPA Extensions
+ 0x02B0, // Spacing Modifier Letters
+ 0x0300, // Combining Diacritical Marks
+ 0x0370, // Greek and Coptic
+ 0x0400, // Cyrillic
+ 0x0500, // Cyrillic Supplementary
+ 0x0530, // Armenian
+ 0x0590, // Hebrew
+ 0x0600, // Arabic
+ 0x0700, // Syriac
+ 0x0750, // unassigned
+ 0x0780, // Thaana
+ 0x07C0, // unassigned
+ 0x0900, // Devanagari
+ 0x0980, // Bengali
+ 0x0A00, // Gurmukhi
+ 0x0A80, // Gujarati
+ 0x0B00, // Oriya
+ 0x0B80, // Tamil
+ 0x0C00, // Telugu
+ 0x0C80, // Kannada
+ 0x0D00, // Malayalam
+ 0x0D80, // Sinhala
+ 0x0E00, // Thai
+ 0x0E80, // Lao
+ 0x0F00, // Tibetan
+ 0x1000, // Myanmar
+ 0x10A0, // Georgian
+ 0x1100, // Hangul Jamo
+ 0x1200, // Ethiopic
+ 0x1380, // unassigned
+ 0x13A0, // Cherokee
+ 0x1400, // Unified Canadian Aboriginal Syllabics
+ 0x1680, // Ogham
+ 0x16A0, // Runic
+ 0x1700, // Tagalog
+ 0x1720, // Hanunoo
+ 0x1740, // Buhid
+ 0x1760, // Tagbanwa
+ 0x1780, // Khmer
+ 0x1800, // Mongolian
+ 0x18B0, // unassigned
+ 0x1900, // Limbu
+ 0x1950, // Tai Le
+ 0x1980, // unassigned
+ 0x19E0, // Khmer Symbols
+ 0x1A00, // unassigned
+ 0x1D00, // Phonetic Extensions
+ 0x1D80, // unassigned
+ 0x1E00, // Latin Extended Additional
+ 0x1F00, // Greek Extended
+ 0x2000, // General Punctuation
+ 0x2070, // Superscripts and Subscripts
+ 0x20A0, // Currency Symbols
+ 0x20D0, // Combining Diacritical Marks for Symbols
+ 0x2100, // Letterlike Symbols
+ 0x2150, // Number Forms
+ 0x2190, // Arrows
+ 0x2200, // Mathematical Operators
+ 0x2300, // Miscellaneous Technical
+ 0x2400, // Control Pictures
+ 0x2440, // Optical Character Recognition
+ 0x2460, // Enclosed Alphanumerics
+ 0x2500, // Box Drawing
+ 0x2580, // Block Elements
+ 0x25A0, // Geometric Shapes
+ 0x2600, // Miscellaneous Symbols
+ 0x2700, // Dingbats
+ 0x27C0, // Miscellaneous Mathematical Symbols-A
+ 0x27F0, // Supplemental Arrows-A
+ 0x2800, // Braille Patterns
+ 0x2900, // Supplemental Arrows-B
+ 0x2980, // Miscellaneous Mathematical Symbols-B
+ 0x2A00, // Supplemental Mathematical Operators
+ 0x2B00, // Miscellaneous Symbols and Arrows
+ 0x2C00, // unassigned
+ 0x2E80, // CJK Radicals Supplement
+ 0x2F00, // Kangxi Radicals
+ 0x2FE0, // unassigned
+ 0x2FF0, // Ideographic Description Characters
+ 0x3000, // CJK Symbols and Punctuation
+ 0x3040, // Hiragana
+ 0x30A0, // Katakana
+ 0x3100, // Bopomofo
+ 0x3130, // Hangul Compatibility Jamo
+ 0x3190, // Kanbun
+ 0x31A0, // Bopomofo Extended
+ 0x31C0, // unassigned
+ 0x31F0, // Katakana Phonetic Extensions
+ 0x3200, // Enclosed CJK Letters and Months
+ 0x3300, // CJK Compatibility
+ 0x3400, // CJK Unified Ideographs Extension A
+ 0x4DC0, // Yijing Hexagram Symbols
+ 0x4E00, // CJK Unified Ideographs
+ 0xA000, // Yi Syllables
+ 0xA490, // Yi Radicals
+ 0xA4D0, // unassigned
+ 0xAC00, // Hangul Syllables
+ 0xD7B0, // unassigned
+ 0xD800, // High Surrogates
+ 0xDB80, // High Private Use Surrogates
+ 0xDC00, // Low Surrogates
+ 0xE000, // Private Use
+ 0xF900, // CJK Compatibility Ideographs
+ 0xFB00, // Alphabetic Presentation Forms
+ 0xFB50, // Arabic Presentation Forms-A
+ 0xFE00, // Variation Selectors
+ 0xFE10, // unassigned
+ 0xFE20, // Combining Half Marks
+ 0xFE30, // CJK Compatibility Forms
+ 0xFE50, // Small Form Variants
+ 0xFE70, // Arabic Presentation Forms-B
+ 0xFF00, // Halfwidth and Fullwidth Forms
+ 0xFFF0, // Specials
+ 0x10000, // Linear B Syllabary
+ 0x10080, // Linear B Ideograms
+ 0x10100, // Aegean Numbers
+ 0x10140, // unassigned
+ 0x10300, // Old Italic
+ 0x10330, // Gothic
+ 0x10350, // unassigned
+ 0x10380, // Ugaritic
+ 0x103A0, // unassigned
+ 0x10400, // Deseret
+ 0x10450, // Shavian
+ 0x10480, // Osmanya
+ 0x104B0, // unassigned
+ 0x10800, // Cypriot Syllabary
+ 0x10840, // unassigned
+ 0x1D000, // Byzantine Musical Symbols
+ 0x1D100, // Musical Symbols
+ 0x1D200, // unassigned
+ 0x1D300, // Tai Xuan Jing Symbols
+ 0x1D360, // unassigned
+ 0x1D400, // Mathematical Alphanumeric Symbols
+ 0x1D800, // unassigned
+ 0x20000, // CJK Unified Ideographs Extension B
+ 0x2A6E0, // unassigned
+ 0x2F800, // CJK Compatibility Ideographs Supplement
+ 0x2FA20, // unassigned
+ 0xE0000, // Tags
+ 0xE0080, // unassigned
+ 0xE0100, // Variation Selectors Supplement
+ 0xE01F0, // unassigned
+ 0xF0000, // Supplementary Private Use Area-A
+ 0x100000, // Supplementary Private Use Area-B
+
+ #endregion
+ };
+
+ // Block reported for each range, indexed in parallel with _unicodeBlockStarts: entry i names
+ // the range that begins at _unicodeBlockStarts[i]. A null entry marks a range the start table
+ // labels "unassigned"; GetUnicodeBlock hands that null back to its caller unchanged.
+ private static readonly UnicodeBlock?[] _unicodeBlocks =
+ {
+ #region Unicode blocks
+ UnicodeBlock.BasicLatin,
+ UnicodeBlock.Latin1Supplement,
+ UnicodeBlock.LatinExtendedA,
+ UnicodeBlock.LatinExtendedB,
+ UnicodeBlock.IpaExtensions,
+ UnicodeBlock.SpacingModifierLetters,
+ UnicodeBlock.CombiningDiacriticalMarks,
+ UnicodeBlock.Greek,
+ UnicodeBlock.Cyrillic,
+ UnicodeBlock.CyrillicSupplementary,
+ UnicodeBlock.Armenian,
+ UnicodeBlock.Hebrew,
+ UnicodeBlock.Arabic,
+ UnicodeBlock.Syriac,
+ null,
+ UnicodeBlock.Thaana,
+ null,
+ UnicodeBlock.Devanagari,
+ UnicodeBlock.Bengali,
+ UnicodeBlock.Gurmukhi,
+ UnicodeBlock.Gujarati,
+ UnicodeBlock.Oriya,
+ UnicodeBlock.Tamil,
+ UnicodeBlock.Telugu,
+ UnicodeBlock.Kannada,
+ UnicodeBlock.Malayalam,
+ UnicodeBlock.Sinhala,
+ UnicodeBlock.Thai,
+ UnicodeBlock.Lao,
+ UnicodeBlock.Tibetan,
+ UnicodeBlock.Myanmar,
+ UnicodeBlock.Georgian,
+ UnicodeBlock.HangulJamo,
+ UnicodeBlock.Ethiopic,
+ null,
+ UnicodeBlock.Cherokee,
+ UnicodeBlock.UnifiedCanadianAboriginalSyllabics,
+ UnicodeBlock.Ogham,
+ UnicodeBlock.Runic,
+ UnicodeBlock.Tagalog,
+ UnicodeBlock.Hanunoo,
+ UnicodeBlock.Buhid,
+ UnicodeBlock.Tagbanwa,
+ UnicodeBlock.Khmer,
+ UnicodeBlock.Mongolian,
+ null,
+ UnicodeBlock.Limbu,
+ UnicodeBlock.TaiLe,
+ null,
+ UnicodeBlock.KhmerSymbols,
+ null,
+ UnicodeBlock.PhoneticExtensions,
+ null,
+ UnicodeBlock.LatinExtendedAdditional,
+ UnicodeBlock.GreekExtended,
+ UnicodeBlock.GeneralPunctuation,
+ UnicodeBlock.SuperscriptsAndSubscripts,
+ UnicodeBlock.CurrencySymbols,
+ UnicodeBlock.CombiningMarksForSymbols,
+ UnicodeBlock.LetterlikeSymbols,
+ UnicodeBlock.NumberForms,
+ UnicodeBlock.Arrows,
+ UnicodeBlock.MathematicalOperators,
+ UnicodeBlock.MiscellaneousTechnical,
+ UnicodeBlock.ControlPictures,
+ UnicodeBlock.OpticalCharacterRecognition,
+ UnicodeBlock.EnclosedAlphanumerics,
+ UnicodeBlock.BoxDrawing,
+ UnicodeBlock.BlockElements,
+ UnicodeBlock.GeometricShapes,
+ UnicodeBlock.MiscellaneousSymbols,
+ UnicodeBlock.Dingbats,
+ UnicodeBlock.MiscellaneousMathematicalSymbolsA,
+ UnicodeBlock.SupplementalArrowsA,
+ UnicodeBlock.BraillePatterns,
+ UnicodeBlock.SupplementalArrowsB,
+ UnicodeBlock.MiscellaneousMathematicalSymbolsB,
+ UnicodeBlock.SupplementalMathematicalOperators,
+ UnicodeBlock.MiscellaneousSymbolsAndArrows,
+ null,
+ UnicodeBlock.CjkRadicalsSupplement,
+ UnicodeBlock.KangxiRadicals,
+ null,
+ UnicodeBlock.IdeographicDescriptionCharacters,
+ UnicodeBlock.CjkSymbolsAndPunctuation,
+ UnicodeBlock.Hiragana,
+ UnicodeBlock.Katakana,
+ UnicodeBlock.Bopomofo,
+ UnicodeBlock.HangulCompatibilityJamo,
+ UnicodeBlock.Kanbun,
+ UnicodeBlock.BopomofoExtended,
+ null,
+ UnicodeBlock.KatakanaPhoneticExtensions,
+ UnicodeBlock.EnclosedCjkLettersAndMonths,
+ UnicodeBlock.CjkCompatibility,
+ UnicodeBlock.CjkUnifiedIdeographsExtensionA,
+ UnicodeBlock.YijingHexagramSymbols,
+ UnicodeBlock.CjkUnifiedIdeographs,
+ UnicodeBlock.YiSyllables,
+ UnicodeBlock.YiRadicals,
+ null,
+ UnicodeBlock.HangulSyllables,
+ null,
+ UnicodeBlock.HighSurrogates,
+ UnicodeBlock.HighPrivateUseSurrogates,
+ UnicodeBlock.LowSurrogates,
+ UnicodeBlock.PrivateUseArea,
+ UnicodeBlock.CjkCompatibilityIdeographs,
+ UnicodeBlock.AlphabeticPresentationForms,
+ UnicodeBlock.ArabicPresentationFormsA,
+ UnicodeBlock.VariationSelectors,
+ null,
+ UnicodeBlock.CombiningHalfMarks,
+ UnicodeBlock.CjkCompatibilityForms,
+ UnicodeBlock.SmallFormVariants,
+ UnicodeBlock.ArabicPresentationFormsB,
+ UnicodeBlock.HalfwidthAndFullwidthForms,
+ UnicodeBlock.Specials,
+ UnicodeBlock.LinearBSyllabary,
+ UnicodeBlock.LinearBIdeograms,
+ UnicodeBlock.AegeanNumbers,
+ null,
+ UnicodeBlock.OldItalic,
+ UnicodeBlock.Gothic,
+ null,
+ UnicodeBlock.Ugaritic,
+ null,
+ UnicodeBlock.Deseret,
+ UnicodeBlock.Shavian,
+ UnicodeBlock.Osmanya,
+ null,
+ UnicodeBlock.CypriotSyllabary,
+ null,
+ UnicodeBlock.ByzantineMusicalSymbols,
+ UnicodeBlock.MusicalSymbols,
+ null,
+ UnicodeBlock.TaiXuanJingSymbols,
+ null,
+ UnicodeBlock.MathematicalAlphanumericSymbols,
+ null,
+ UnicodeBlock.CjkUnifiedIdeographsExtensionB,
+ null,
+ UnicodeBlock.CjkCompatibilityIdeographsSupplement,
+ null,
+ UnicodeBlock.Tags,
+ null,
+ UnicodeBlock.VariationSelectorsSupplement,
+ null,
+ UnicodeBlock.SupplementaryPrivateUseAreaA,
+ UnicodeBlock.SupplementaryPrivateUseAreaB,
+
+ #endregion
+ };
+
+ #region Public methods
+
+ /// <summary>
+ /// Looks up the Unicode block whose range contains the given character.
+ /// </summary>
+ /// <param name="ch">The character (a single UTF-16 code unit) to classify.</param>
+ /// <returns>
+ /// The containing block, or <c>null</c> when the character falls in a range that the
+ /// block table has no entry for.
+ /// </returns>
+ /// <exception cref="ArgumentException">The value is rejected by the code point range check.</exception>
+ /// <remarks>
+ /// Taken from JDK source: http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b14/java/lang/Character.java#Character.UnicodeBlock.0LATIN_EXTENDED_ADDITIONAL
+ /// </remarks>
+ public static UnicodeBlock? GetUnicodeBlock(this char ch)
+ {
+     int codePoint = ch;
+
+     if (!IsValidCodePoint(codePoint))
+     {
+         throw new ArgumentException("Argument is not a valid code point.", "ch");
+     }
+
+     // The start table is sorted, so search it directly. An exact hit is the index of the
+     // range that begins at codePoint; a miss yields the bitwise complement of the index of
+     // the first larger start, and the containing range is the entry just before that one.
+     // Index 0 starts at 0x0000, so a miss can never resolve to a negative index.
+     int searchResult = Array.BinarySearch(_unicodeBlockStarts, codePoint);
+     int rangeIndex = searchResult >= 0 ? searchResult : ~searchResult - 1;
+
+     return _unicodeBlocks[rangeIndex];
+ }
+
+ #endregion
+
+ #region Private helper methods
+
+ /// <summary>
+ /// Tells whether a value lies between MIN_CODE_POINT and MAX_CODE_POINT, both inclusive.
+ /// </summary>
+ private static bool IsValidCodePoint(int codePoint)
+ {
+     if (codePoint < MIN_CODE_POINT)
+     {
+         return false;
+     }
+
+     return codePoint <= MAX_CODE_POINT;
+ }
+
+ #endregion
+ }
+}
diff --git a/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/RandomExtensions.cs b/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/RandomExtensions.cs
new file mode 100644
index 000000000..d55ca80df
--- /dev/null
+++ b/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/RandomExtensions.cs
@@ -0,0 +1,51 @@
+using System;
+
+namespace NLangDetect.Core.Extensions
+{
+ public static class RandomExtensions
+ {
+     // Candidate points whose squared radius is below this threshold are treated as zero and
+     // rejected, because the multiplier below divides by that radius.
+     private const double _Epsilon = 2.22044604925031E-15;
+
+     // Each pass of the algorithm yields two values, and the second one is kept for the next
+     // call. It is cached per generator: with a single shared cache, a value drawn from one
+     // Random would be handed out for a different Random, which breaks the reproducibility of
+     // seeded generators. The table holds its keys weakly, so the cache does not keep a
+     // generator alive.
+     private static readonly System.Runtime.CompilerServices.ConditionalWeakTable<Random, SpareGaussian> _spares =
+         new System.Runtime.CompilerServices.ConditionalWeakTable<Random, SpareGaussian>();
+
+     /// <summary>
+     /// Returns the next pseudorandom, Gaussian ("normally") distributed double value with mean 0.0 and standard deviation 1.0 from this random number generator's sequence.
+     /// The general contract of nextGaussian is that one double value, chosen from (approximately) the usual normal distribution with mean 0.0 and standard deviation 1.0, is pseudorandomly generated and returned.
+     /// </summary>
+     /// <param name="random">The generator that supplies the underlying uniform values.</param>
+     /// <returns>A value drawn from the standard normal distribution.</returns>
+     /// <exception cref="ArgumentNullException"><paramref name="random"/> is null.</exception>
+     /// <remarks>
+     /// Taken from: http://download.oracle.com/javase/6/docs/api/java/util/Random.html (nextGaussian())
+     /// </remarks>
+     public static double NextGaussian(this Random random)
+     {
+         if (random == null)
+         {
+             throw new ArgumentNullException("random");
+         }
+
+         SpareGaussian spare = _spares.GetValue(random, key => new SpareGaussian());
+
+         // Calls for the same generator are serialised, as they were under the former global lock.
+         lock (spare)
+         {
+             if (spare.HasValue)
+             {
+                 spare.HasValue = false;
+
+                 return spare.Value;
+             }
+
+             double v1, v2, s;
+
+             // Draw points in the square [-1, 1) x [-1, 1) until one falls strictly inside the
+             // unit circle and is not (numerically) the origin.
+             do
+             {
+                 v1 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
+                 v2 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
+                 s = v1 * v1 + v2 * v2;
+             }
+             while (s >= 1.0 || Math.Abs(s - 0.0) < _Epsilon);
+
+             double multiplier = Math.Sqrt(-2.0 * Math.Log(s) / s);
+
+             spare.Value = v2 * multiplier;
+             spare.HasValue = true;
+
+             return v1 * multiplier;
+         }
+     }
+
+     // Holds the second value of the most recent pass for one generator.
+     private sealed class SpareGaussian
+     {
+         public double Value;
+         public bool HasValue;
+     }
+ }
+}
diff --git a/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/StringExtensions.cs b/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/StringExtensions.cs
new file mode 100644
index 000000000..fc6c58a95
--- /dev/null
+++ b/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/StringExtensions.cs
@@ -0,0 +1,25 @@
+using System;
+
+namespace NLangDetect.Core.Extensions
+{
+ public static class StringExtensions
+ {
+     /// <summary>
+     /// Returns a new character sequence that is a subsequence of this sequence. The subsequence starts with the character at the specified index and ends with the character at index end - 1. The length of the returned sequence is end - start, so if start == end then an empty sequence is returned.
+     /// </summary>
+     /// <param name="s">the string to take the subsequence from</param>
+     /// <param name="start">the start index, inclusive</param>
+     /// <param name="end">the end index, exclusive</param>
+     /// <returns>the specified subsequence</returns>
+     /// <exception cref="ArgumentNullException">if <paramref name="s"/> is null</exception>
+     /// <exception cref="ArgumentOutOfRangeException">if start or end are negative, if end is greater than the string's length, or if start is greater than end</exception>
+     public static string SubSequence(this string s, int start, int end)
+     {
+         // An extension method can be invoked on a null receiver; report that as an argument
+         // error instead of letting s.Length fail with a NullReferenceException.
+         if (s == null) throw new ArgumentNullException("s");
+         if (start < 0) throw new ArgumentOutOfRangeException("start", "Argument must not be negative.");
+         if (end < 0) throw new ArgumentOutOfRangeException("end", "Argument must not be negative.");
+         if (end > s.Length) throw new ArgumentOutOfRangeException("end", "Argument must not be greater than the input string's length.");
+         if (start > end) throw new ArgumentOutOfRangeException("start", "Argument must not be greater than the 'end' argument.");
+
+         // Substring takes a length, whereas this API takes an exclusive end index.
+         return s.Substring(start, end - start);
+     }
+ }
+}
diff --git a/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/UnicodeBlock.cs b/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/UnicodeBlock.cs
new file mode 100644
index 000000000..71b5de75e
--- /dev/null
+++ b/Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/UnicodeBlock.cs
@@ -0,0 +1,131 @@
+namespace NLangDetect.Core.Extensions
+{
+ /// <summary>
+ /// Names of the Unicode blocks that <c>CharExtensions.GetUnicodeBlock</c> can report.
+ /// </summary>
+ /// <remarks>
+ /// Members carry no explicit values, so each member's numeric value is its position in this
+ /// list; inserting or reordering members changes those values.
+ /// </remarks>
+ public enum UnicodeBlock
+ {
+ BasicLatin,
+ Latin1Supplement,
+ LatinExtendedA,
+ LatinExtendedB,
+ IpaExtensions,
+ SpacingModifierLetters,
+ CombiningDiacriticalMarks,
+ Greek,
+ Cyrillic,
+ CyrillicSupplementary,
+ Armenian,
+ Hebrew,
+ Arabic,
+ Syriac,
+ Thaana,
+ Devanagari,
+ Bengali,
+ Gurmukhi,
+ Gujarati,
+ Oriya,
+ Tamil,
+ Telugu,
+ Kannada,
+ Malayalam,
+ Sinhala,
+ Thai,
+ Lao,
+ Tibetan,
+ Myanmar,
+ Georgian,
+ HangulJamo,
+ Ethiopic,
+ Cherokee,
+ UnifiedCanadianAboriginalSyllabics,
+ Ogham,
+ Runic,
+ Tagalog,
+ Hanunoo,
+ Buhid,
+ Tagbanwa,
+ Khmer,
+ Mongolian,
+ Limbu,
+ TaiLe,
+ KhmerSymbols,
+ PhoneticExtensions,
+ LatinExtendedAdditional,
+ GreekExtended,
+ GeneralPunctuation,
+ SuperscriptsAndSubscripts,
+ CurrencySymbols,
+ CombiningMarksForSymbols,
+ LetterlikeSymbols,
+ NumberForms,
+ Arrows,
+ MathematicalOperators,
+ MiscellaneousTechnical,
+ ControlPictures,
+ OpticalCharacterRecognition,
+ EnclosedAlphanumerics,
+ BoxDrawing,
+ BlockElements,
+ GeometricShapes,
+ MiscellaneousSymbols,
+ Dingbats,
+ MiscellaneousMathematicalSymbolsA,
+ SupplementalArrowsA,
+ BraillePatterns,
+ SupplementalArrowsB,
+ MiscellaneousMathematicalSymbolsB,
+ SupplementalMathematicalOperators,
+ MiscellaneousSymbolsAndArrows,
+ CjkRadicalsSupplement,
+ KangxiRadicals,
+ IdeographicDescriptionCharacters,
+ CjkSymbolsAndPunctuation,
+ Hiragana,
+ Katakana,
+ Bopomofo,
+ HangulCompatibilityJamo,
+ Kanbun,
+ BopomofoExtended,
+ KatakanaPhoneticExtensions,
+ EnclosedCjkLettersAndMonths,
+ CjkCompatibility,
+ CjkUnifiedIdeographsExtensionA,
+ YijingHexagramSymbols,
+ CjkUnifiedIdeographs,
+ YiSyllables,
+ YiRadicals,
+ HangulSyllables,
+ HighSurrogates,
+ HighPrivateUseSurrogates,
+ LowSurrogates,
+ PrivateUseArea,
+ CjkCompatibilityIdeographs,
+ AlphabeticPresentationForms,
+ ArabicPresentationFormsA,
+ VariationSelectors,
+ CombiningHalfMarks,
+ CjkCompatibilityForms,
+ SmallFormVariants,
+ ArabicPresentationFormsB,
+ HalfwidthAndFullwidthForms,
+ Specials,
+ LinearBSyllabary,
+ LinearBIdeograms,
+ AegeanNumbers,
+ OldItalic,
+ Gothic,
+ Ugaritic,
+ Deseret,
+ Shavian,
+ Osmanya,
+ CypriotSyllabary,
+ ByzantineMusicalSymbols,
+ MusicalSymbols,
+ TaiXuanJingSymbols,
+ MathematicalAlphanumericSymbols,
+ CjkUnifiedIdeographsExtensionB,
+ CjkCompatibilityIdeographsSupplement,
+ Tags,
+ VariationSelectorsSupplement,
+ SupplementaryPrivateUseAreaA,
+ SupplementaryPrivateUseAreaB,
+ }
+}