diff options
Diffstat (limited to 'Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions')
4 files changed, 581 insertions, 0 deletions
// -----------------------------------------------------------------------------
// File: Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/CharExtensions.cs
// -----------------------------------------------------------------------------
namespace NLangDetect.Core.Extensions
{
    using System;

    /// <summary>
    /// Unicode block lookup helpers for characters and code points.
    /// </summary>
    public static class CharExtensions
    {
        private const int MinCodePoint = 0x000000;
        private const int MaxCodePoint = 0x10ffff;

        // First code point of every range, in ascending order. Entry i of this
        // table pairs with entry i of _unicodeBlocks; a range marked
        // "unassigned" maps to null there.
        private static readonly int[] _unicodeBlockStarts =
        {
            #region Unicode block starts

            0x0000, // Basic Latin
            0x0080, // Latin-1 Supplement
            0x0100, // Latin Extended-A
            0x0180, // Latin Extended-B
            0x0250, // IPA Extensions
            0x02B0, // Spacing Modifier Letters
            0x0300, // Combining Diacritical Marks
            0x0370, // Greek and Coptic
            0x0400, // Cyrillic
            0x0500, // Cyrillic Supplementary
            0x0530, // Armenian
            0x0590, // Hebrew
            0x0600, // Arabic
            0x0700, // Syriac
            0x0750, // unassigned
            0x0780, // Thaana
            0x07C0, // unassigned
            0x0900, // Devanagari
            0x0980, // Bengali
            0x0A00, // Gurmukhi
            0x0A80, // Gujarati
            0x0B00, // Oriya
            0x0B80, // Tamil
            0x0C00, // Telugu
            0x0C80, // Kannada
            0x0D00, // Malayalam
            0x0D80, // Sinhala
            0x0E00, // Thai
            0x0E80, // Lao
            0x0F00, // Tibetan
            0x1000, // Myanmar
            0x10A0, // Georgian
            0x1100, // Hangul Jamo
            0x1200, // Ethiopic
            0x1380, // unassigned
            0x13A0, // Cherokee
            0x1400, // Unified Canadian Aboriginal Syllabics
            0x1680, // Ogham
            0x16A0, // Runic
            0x1700, // Tagalog
            0x1720, // Hanunoo
            0x1740, // Buhid
            0x1760, // Tagbanwa
            0x1780, // Khmer
            0x1800, // Mongolian
            0x18B0, // unassigned
            0x1900, // Limbu
            0x1950, // Tai Le
            0x1980, // unassigned
            0x19E0, // Khmer Symbols
            0x1A00, // unassigned
            0x1D00, // Phonetic Extensions
            0x1D80, // unassigned
            0x1E00, // Latin Extended Additional
            0x1F00, // Greek Extended
            0x2000, // General Punctuation
            0x2070, // Superscripts and Subscripts
            0x20A0, // Currency Symbols
            0x20D0, // Combining Diacritical Marks for Symbols
            0x2100, // Letterlike Symbols
            0x2150, // Number Forms
            0x2190, // Arrows
            0x2200, // Mathematical Operators
            0x2300, // Miscellaneous Technical
            0x2400, // Control Pictures
            0x2440, // Optical Character Recognition
            0x2460, // Enclosed Alphanumerics
            0x2500, // Box Drawing
            0x2580, // Block Elements
            0x25A0, // Geometric Shapes
            0x2600, // Miscellaneous Symbols
            0x2700, // Dingbats
            0x27C0, // Miscellaneous Mathematical Symbols-A
            0x27F0, // Supplemental Arrows-A
            0x2800, // Braille Patterns
            0x2900, // Supplemental Arrows-B
            0x2980, // Miscellaneous Mathematical Symbols-B
            0x2A00, // Supplemental Mathematical Operators
            0x2B00, // Miscellaneous Symbols and Arrows
            0x2C00, // unassigned
            0x2E80, // CJK Radicals Supplement
            0x2F00, // Kangxi Radicals
            0x2FE0, // unassigned
            0x2FF0, // Ideographic Description Characters
            0x3000, // CJK Symbols and Punctuation
            0x3040, // Hiragana
            0x30A0, // Katakana
            0x3100, // Bopomofo
            0x3130, // Hangul Compatibility Jamo
            0x3190, // Kanbun
            0x31A0, // Bopomofo Extended
            0x31C0, // unassigned
            0x31F0, // Katakana Phonetic Extensions
            0x3200, // Enclosed CJK Letters and Months
            0x3300, // CJK Compatibility
            0x3400, // CJK Unified Ideographs Extension A
            0x4DC0, // Yijing Hexagram Symbols
            0x4E00, // CJK Unified Ideographs
            0xA000, // Yi Syllables
            0xA490, // Yi Radicals
            0xA4D0, // unassigned
            0xAC00, // Hangul Syllables
            0xD7B0, // unassigned
            0xD800, // High Surrogates
            0xDB80, // High Private Use Surrogates
            0xDC00, // Low Surrogates
            0xE000, // Private Use
            0xF900, // CJK Compatibility Ideographs
            0xFB00, // Alphabetic Presentation Forms
            0xFB50, // Arabic Presentation Forms-A
            0xFE00, // Variation Selectors
            0xFE10, // unassigned
            0xFE20, // Combining Half Marks
            0xFE30, // CJK Compatibility Forms
            0xFE50, // Small Form Variants
            0xFE70, // Arabic Presentation Forms-B
            0xFF00, // Halfwidth and Fullwidth Forms
            0xFFF0, // Specials
            0x10000, // Linear B Syllabary
            0x10080, // Linear B Ideograms
            0x10100, // Aegean Numbers
            0x10140, // unassigned
            0x10300, // Old Italic
            0x10330, // Gothic
            0x10350, // unassigned
            0x10380, // Ugaritic
            0x103A0, // unassigned
            0x10400, // Deseret
            0x10450, // Shavian
            0x10480, // Osmanya
            0x104B0, // unassigned
            0x10800, // Cypriot Syllabary
            0x10840, // unassigned
            0x1D000, // Byzantine Musical Symbols
            0x1D100, // Musical Symbols
            0x1D200, // unassigned
            0x1D300, // Tai Xuan Jing Symbols
            0x1D360, // unassigned
            0x1D400, // Mathematical Alphanumeric Symbols
            0x1D800, // unassigned
            0x20000, // CJK Unified Ideographs Extension B
            0x2A6E0, // unassigned
            0x2F800, // CJK Compatibility Ideographs Supplement
            0x2FA20, // unassigned
            0xE0000, // Tags
            0xE0080, // unassigned
            0xE0100, // Variation Selectors Supplement
            0xE01F0, // unassigned
            0xF0000, // Supplementary Private Use Area-A
            0x100000, // Supplementary Private Use Area-B

            #endregion
        };

        // Block for each range in _unicodeBlockStarts (same index); null marks
        // a range that has no block assigned in this table.
        private static readonly UnicodeBlock?[] _unicodeBlocks =
        {
            #region Unicode blocks

            UnicodeBlock.BasicLatin,
            UnicodeBlock.Latin1Supplement,
            UnicodeBlock.LatinExtendedA,
            UnicodeBlock.LatinExtendedB,
            UnicodeBlock.IpaExtensions,
            UnicodeBlock.SpacingModifierLetters,
            UnicodeBlock.CombiningDiacriticalMarks,
            UnicodeBlock.Greek,
            UnicodeBlock.Cyrillic,
            UnicodeBlock.CyrillicSupplementary,
            UnicodeBlock.Armenian,
            UnicodeBlock.Hebrew,
            UnicodeBlock.Arabic,
            UnicodeBlock.Syriac,
            null,
            UnicodeBlock.Thaana,
            null,
            UnicodeBlock.Devanagari,
            UnicodeBlock.Bengali,
            UnicodeBlock.Gurmukhi,
            UnicodeBlock.Gujarati,
            UnicodeBlock.Oriya,
            UnicodeBlock.Tamil,
            UnicodeBlock.Telugu,
            UnicodeBlock.Kannada,
            UnicodeBlock.Malayalam,
            UnicodeBlock.Sinhala,
            UnicodeBlock.Thai,
            UnicodeBlock.Lao,
            UnicodeBlock.Tibetan,
            UnicodeBlock.Myanmar,
            UnicodeBlock.Georgian,
            UnicodeBlock.HangulJamo,
            UnicodeBlock.Ethiopic,
            null,
            UnicodeBlock.Cherokee,
            UnicodeBlock.UnifiedCanadianAboriginalSyllabics,
            UnicodeBlock.Ogham,
            UnicodeBlock.Runic,
            UnicodeBlock.Tagalog,
            UnicodeBlock.Hanunoo,
            UnicodeBlock.Buhid,
            UnicodeBlock.Tagbanwa,
            UnicodeBlock.Khmer,
            UnicodeBlock.Mongolian,
            null,
            UnicodeBlock.Limbu,
            UnicodeBlock.TaiLe,
            null,
            UnicodeBlock.KhmerSymbols,
            null,
            UnicodeBlock.PhoneticExtensions,
            null,
            UnicodeBlock.LatinExtendedAdditional,
            UnicodeBlock.GreekExtended,
            UnicodeBlock.GeneralPunctuation,
            UnicodeBlock.SuperscriptsAndSubscripts,
            UnicodeBlock.CurrencySymbols,
            UnicodeBlock.CombiningMarksForSymbols,
            UnicodeBlock.LetterlikeSymbols,
            UnicodeBlock.NumberForms,
            UnicodeBlock.Arrows,
            UnicodeBlock.MathematicalOperators,
            UnicodeBlock.MiscellaneousTechnical,
            UnicodeBlock.ControlPictures,
            UnicodeBlock.OpticalCharacterRecognition,
            UnicodeBlock.EnclosedAlphanumerics,
            UnicodeBlock.BoxDrawing,
            UnicodeBlock.BlockElements,
            UnicodeBlock.GeometricShapes,
            UnicodeBlock.MiscellaneousSymbols,
            UnicodeBlock.Dingbats,
            UnicodeBlock.MiscellaneousMathematicalSymbolsA,
            UnicodeBlock.SupplementalArrowsA,
            UnicodeBlock.BraillePatterns,
            UnicodeBlock.SupplementalArrowsB,
            UnicodeBlock.MiscellaneousMathematicalSymbolsB,
            UnicodeBlock.SupplementalMathematicalOperators,
            UnicodeBlock.MiscellaneousSymbolsAndArrows,
            null,
            UnicodeBlock.CjkRadicalsSupplement,
            UnicodeBlock.KangxiRadicals,
            null,
            UnicodeBlock.IdeographicDescriptionCharacters,
            UnicodeBlock.CjkSymbolsAndPunctuation,
            UnicodeBlock.Hiragana,
            UnicodeBlock.Katakana,
            UnicodeBlock.Bopomofo,
            UnicodeBlock.HangulCompatibilityJamo,
            UnicodeBlock.Kanbun,
            UnicodeBlock.BopomofoExtended,
            null,
            UnicodeBlock.KatakanaPhoneticExtensions,
            UnicodeBlock.EnclosedCjkLettersAndMonths,
            UnicodeBlock.CjkCompatibility,
            UnicodeBlock.CjkUnifiedIdeographsExtensionA,
            UnicodeBlock.YijingHexagramSymbols,
            UnicodeBlock.CjkUnifiedIdeographs,
            UnicodeBlock.YiSyllables,
            UnicodeBlock.YiRadicals,
            null,
            UnicodeBlock.HangulSyllables,
            null,
            UnicodeBlock.HighSurrogates,
            UnicodeBlock.HighPrivateUseSurrogates,
            UnicodeBlock.LowSurrogates,
            UnicodeBlock.PrivateUseArea,
            UnicodeBlock.CjkCompatibilityIdeographs,
            UnicodeBlock.AlphabeticPresentationForms,
            UnicodeBlock.ArabicPresentationFormsA,
            UnicodeBlock.VariationSelectors,
            null,
            UnicodeBlock.CombiningHalfMarks,
            UnicodeBlock.CjkCompatibilityForms,
            UnicodeBlock.SmallFormVariants,
            UnicodeBlock.ArabicPresentationFormsB,
            UnicodeBlock.HalfwidthAndFullwidthForms,
            UnicodeBlock.Specials,
            UnicodeBlock.LinearBSyllabary,
            UnicodeBlock.LinearBIdeograms,
            UnicodeBlock.AegeanNumbers,
            null,
            UnicodeBlock.OldItalic,
            UnicodeBlock.Gothic,
            null,
            UnicodeBlock.Ugaritic,
            null,
            UnicodeBlock.Deseret,
            UnicodeBlock.Shavian,
            UnicodeBlock.Osmanya,
            null,
            UnicodeBlock.CypriotSyllabary,
            null,
            UnicodeBlock.ByzantineMusicalSymbols,
            UnicodeBlock.MusicalSymbols,
            null,
            UnicodeBlock.TaiXuanJingSymbols,
            null,
            UnicodeBlock.MathematicalAlphanumericSymbols,
            null,
            UnicodeBlock.CjkUnifiedIdeographsExtensionB,
            null,
            UnicodeBlock.CjkCompatibilityIdeographsSupplement,
            null,
            UnicodeBlock.Tags,
            null,
            UnicodeBlock.VariationSelectorsSupplement,
            null,
            UnicodeBlock.SupplementaryPrivateUseAreaA,
            UnicodeBlock.SupplementaryPrivateUseAreaB,

            #endregion
        };

        #region Public methods

        /// <summary>
        /// Returns the Unicode block containing the given UTF-16 code unit.
        /// </summary>
        /// <param name="ch">The character to classify.</param>
        /// <returns>
        /// The block that contains <paramref name="ch"/>, or <c>null</c> when the
        /// character falls in a range the lookup table marks as unassigned.
        /// </returns>
        /// <remarks>
        /// Taken from JDK source: http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b14/java/lang/Character.java#Character.UnicodeBlock.0LATIN_EXTENDED_ADDITIONAL
        /// </remarks>
        public static UnicodeBlock? GetUnicodeBlock(this char ch)
        {
            // A char is always within 0..0xFFFF, so the range check in the
            // code-point overload can never fail on this path.
            return GetUnicodeBlock((int)ch);
        }

        /// <summary>
        /// Returns the Unicode block containing the given code point. Unlike the
        /// <see cref="char"/> overload this can reach the table rows at and above 0x10000.
        /// </summary>
        /// <param name="codePoint">A code point in the range 0x000000..0x10FFFF.</param>
        /// <returns>
        /// The block that contains <paramref name="codePoint"/>, or <c>null</c> when it
        /// falls in a range the lookup table marks as unassigned.
        /// </returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="codePoint"/> is outside 0x000000..0x10FFFF.
        /// </exception>
        public static UnicodeBlock? GetUnicodeBlock(int codePoint)
        {
            if (!IsValidCodePoint(codePoint))
            {
                throw new ArgumentException("Argument is not a valid code point.", "codePoint");
            }

            int index = Array.BinarySearch(_unicodeBlockStarts, codePoint);

            if (index < 0)
            {
                // Not an exact block start: ~index is the position of the first
                // start greater than codePoint, so the containing range is the
                // one just before it. The table begins at 0 and codePoint >= 0,
                // hence ~index >= 1 here.
                index = ~index - 1;
            }

            return _unicodeBlocks[index];
        }

        #endregion

        #region Private helper methods

        // True when codePoint lies inside the inclusive range MinCodePoint..MaxCodePoint.
        private static bool IsValidCodePoint(int codePoint)
        {
            return codePoint >= MinCodePoint && codePoint <= MaxCodePoint;
        }

        #endregion
    }
}

// -----------------------------------------------------------------------------
// File: Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/RandomExtensions.cs
// -----------------------------------------------------------------------------
namespace NLangDetect.Core.Extensions
{
    using System;
    using System.Runtime.CompilerServices;

    /// <summary>
    /// Extension methods for <see cref="Random"/>.
    /// </summary>
    public static class RandomExtensions
    {
        // Candidate points whose squared radius is below this threshold are
        // rejected so that Math.Log(s) / s is never evaluated at (or next to) zero.
        private const double Epsilon = 2.22044604925031E-15;

        // Each call to the polar method yields two deviates; the spare one is
        // remembered per Random instance so that values produced from one
        // generator are never handed to callers of another. The weak table
        // does not keep the generators alive.
        private static readonly ConditionalWeakTable<Random, GaussianCache> _caches =
            new ConditionalWeakTable<Random, GaussianCache>();

        /// <summary>
        /// Returns the next pseudorandom, Gaussian ("normally") distributed double value with mean 0.0 and standard deviation 1.0 from this random number generator's sequence.
        /// The general contract of nextGaussian is that one double value, chosen from (approximately) the usual normal distribution with mean 0.0 and standard deviation 1.0, is pseudorandomly generated and returned.
        /// </summary>
        /// <param name="random">The generator supplying the underlying uniform values.</param>
        /// <returns>A normally distributed pseudorandom value.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="random"/> is <c>null</c>.</exception>
        /// <remarks>
        /// Taken from: http://download.oracle.com/javase/6/docs/api/java/util/Random.html (nextGaussian())
        /// </remarks>
        public static double NextGaussian(this Random random)
        {
            if (random == null)
            {
                throw new ArgumentNullException("random");
            }

            GaussianCache cache = _caches.GetValue(random, CreateCache);

            // Serialises callers that share one generator, mirroring the
            // locking the method has always done around its cached state.
            lock (cache)
            {
                if (cache.HasValue)
                {
                    cache.HasValue = false;

                    return cache.Value;
                }

                double v1, v2, s;

                do
                {
                    v1 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
                    v2 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
                    s = v1 * v1 + v2 * v2;
                }
                while (s >= 1.0 || s < Epsilon); // s is a sum of squares, so never negative

                double multiplier = Math.Sqrt(-2.0 * Math.Log(s) / s);

                cache.Value = v2 * multiplier;
                cache.HasValue = true;

                return v1 * multiplier;
            }
        }

        // Factory used by the weak table the first time a generator is seen.
        private static GaussianCache CreateCache(Random key)
        {
            return new GaussianCache();
        }

        // Holds the spare deviate belonging to a single Random instance.
        private sealed class GaussianCache
        {
            public double Value;
            public bool HasValue;
        }
    }
}

// -----------------------------------------------------------------------------
// File: Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/StringExtensions.cs
// -----------------------------------------------------------------------------
namespace NLangDetect.Core.Extensions
{
    using System;

    /// <summary>
    /// Extension methods for <see cref="string"/>.
    /// </summary>
    public static class StringExtensions
    {
        /// <summary>
        /// Returns a new character sequence that is a subsequence of this sequence. The subsequence starts with the character at the specified index and ends with the character at index end - 1. The length of the returned sequence is end - start, so if start == end then an empty sequence is returned.
        /// </summary>
        /// <param name="s">the source string</param>
        /// <param name="start">the start index, inclusive</param>
        /// <param name="end">the end index, exclusive</param>
        /// <returns>the specified subsequence</returns>
        /// <exception cref="ArgumentNullException">if <paramref name="s"/> is null</exception>
        /// <exception cref="ArgumentOutOfRangeException">if start or end are negative, if end is greater than the string's length, or if start is greater than end</exception>
        public static string SubSequence(this string s, int start, int end)
        {
            if (s == null)
            {
                throw new ArgumentNullException("s");
            }

            if (start < 0)
            {
                throw new ArgumentOutOfRangeException("start", "Argument must not be negative.");
            }

            if (end < 0)
            {
                throw new ArgumentOutOfRangeException("end", "Argument must not be negative.");
            }

            if (end > s.Length)
            {
                throw new ArgumentOutOfRangeException("end", "Argument must not be greater than the input string's length.");
            }

            if (start > end)
            {
                throw new ArgumentOutOfRangeException("start", "Argument must not be greater than the 'end' argument.");
            }

            return s.Substring(start, end - start);
        }
    }
}

// -----------------------------------------------------------------------------
// File: Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/UnicodeBlock.cs
// -----------------------------------------------------------------------------
namespace NLangDetect.Core.Extensions
{
    /// <summary>
    /// Unicode blocks that <see cref="CharExtensions.GetUnicodeBlock(char)"/> can report.
    /// </summary>
    /// <remarks>
    /// Members carry implicit numeric values, so the declaration order determines
    /// each member's value; append new members rather than reordering.
    /// </remarks>
    public enum UnicodeBlock
    {
        BasicLatin,
        Latin1Supplement,
        LatinExtendedA,
        LatinExtendedB,
        IpaExtensions,
        SpacingModifierLetters,
        CombiningDiacriticalMarks,
        Greek,
        Cyrillic,
        CyrillicSupplementary,
        Armenian,
        Hebrew,
        Arabic,
        Syriac,
        Thaana,
        Devanagari,
        Bengali,
        Gurmukhi,
        Gujarati,
        Oriya,
        Tamil,
        Telugu,
        Kannada,
        Malayalam,
        Sinhala,
        Thai,
        Lao,
        Tibetan,
        Myanmar,
        Georgian,
        HangulJamo,
        Ethiopic,
        Cherokee,
        UnifiedCanadianAboriginalSyllabics,
        Ogham,
        Runic,
        Tagalog,
        Hanunoo,
        Buhid,
        Tagbanwa,
        Khmer,
        Mongolian,
        Limbu,
        TaiLe,
        KhmerSymbols,
        PhoneticExtensions,
        LatinExtendedAdditional,
        GreekExtended,
        GeneralPunctuation,
        SuperscriptsAndSubscripts,
        CurrencySymbols,
        CombiningMarksForSymbols,
        LetterlikeSymbols,
        NumberForms,
        Arrows,
        MathematicalOperators,
        MiscellaneousTechnical,
        ControlPictures,
        OpticalCharacterRecognition,
        EnclosedAlphanumerics,
        BoxDrawing,
        BlockElements,
        GeometricShapes,
        MiscellaneousSymbols,
        Dingbats,
        MiscellaneousMathematicalSymbolsA,
        SupplementalArrowsA,
        BraillePatterns,
        SupplementalArrowsB,
        MiscellaneousMathematicalSymbolsB,
        SupplementalMathematicalOperators,
        MiscellaneousSymbolsAndArrows,
        CjkRadicalsSupplement,
        KangxiRadicals,
        IdeographicDescriptionCharacters,
        CjkSymbolsAndPunctuation,
        Hiragana,
        Katakana,
        Bopomofo,
        HangulCompatibilityJamo,
        Kanbun,
        BopomofoExtended,
        KatakanaPhoneticExtensions,
        EnclosedCjkLettersAndMonths,
        CjkCompatibility,
        CjkUnifiedIdeographsExtensionA,
        YijingHexagramSymbols,
        CjkUnifiedIdeographs,
        YiSyllables,
        YiRadicals,
        HangulSyllables,
        HighSurrogates,
        HighPrivateUseSurrogates,
        LowSurrogates,
        PrivateUseArea,
        CjkCompatibilityIdeographs,
        AlphabeticPresentationForms,
        ArabicPresentationFormsA,
        VariationSelectors,
        CombiningHalfMarks,
        CjkCompatibilityForms,
        SmallFormVariants,
        ArabicPresentationFormsB,
        HalfwidthAndFullwidthForms,
        Specials,
        LinearBSyllabary,
        LinearBIdeograms,
        AegeanNumbers,
        OldItalic,
        Gothic,
        Ugaritic,
        Deseret,
        Shavian,
        Osmanya,
        CypriotSyllabary,
        ByzantineMusicalSymbols,
        MusicalSymbols,
        TaiXuanJingSymbols,
        MathematicalAlphanumericSymbols,
        CjkUnifiedIdeographsExtensionB,
        CjkCompatibilityIdeographsSupplement,
        Tags,
        VariationSelectorsSupplement,
        SupplementaryPrivateUseAreaA,
        SupplementaryPrivateUseAreaB,
    }
}
