From b35dcbb9f02c27d2d84ee3281a60d654a3fb1259 Mon Sep 17 00:00:00 2001 From: cvium Date: Wed, 16 Jan 2019 20:50:40 +0100 Subject: Remove MediaBrowser.Text since it violates licenses and is overall hacky --- .../TextEncoding/TextEncoding.cs | 271 --------------------- 1 file changed, 271 deletions(-) delete mode 100644 Emby.Server.Implementations/TextEncoding/TextEncoding.cs (limited to 'Emby.Server.Implementations/TextEncoding/TextEncoding.cs') diff --git a/Emby.Server.Implementations/TextEncoding/TextEncoding.cs b/Emby.Server.Implementations/TextEncoding/TextEncoding.cs deleted file mode 100644 index 09705d381..000000000 --- a/Emby.Server.Implementations/TextEncoding/TextEncoding.cs +++ /dev/null @@ -1,271 +0,0 @@ -using System; -using System.Text; -using MediaBrowser.Model.IO; -using MediaBrowser.Model.Serialization; -using MediaBrowser.Model.Text; -using Microsoft.Extensions.Logging; -using NLangDetect.Core; -using UniversalDetector; - -namespace Emby.Server.Implementations.TextEncoding -{ - public class TextEncoding : ITextEncoding - { - private readonly IFileSystem _fileSystem; - private readonly ILogger _logger; - private IJsonSerializer _json; - - public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json) - { - _fileSystem = fileSystem; - _logger = logger; - _json = json; - } - - public Encoding GetASCIIEncoding() - { - return Encoding.ASCII; - } - - private static Encoding GetInitialEncoding(byte[] buffer, int count) - { - if (count >= 3) - { - if (buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf) - return Encoding.UTF8; - } - - if (count >= 2) - { - if (buffer[0] == 0xfe && buffer[1] == 0xff) - return Encoding.Unicode; - } - - if (count >= 4) - { - if (buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff) - return Encoding.UTF32; - } - - if (count >= 3) - { - if (buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76) - return Encoding.UTF7; - } - - var result = new TextEncodingDetect().DetectEncoding(buffer, count); - - switch (result) - { - case TextEncodingDetect.CharacterEncoding.Ansi: - return Encoding.ASCII; - case TextEncodingDetect.CharacterEncoding.Ascii: - return Encoding.ASCII; - case TextEncodingDetect.CharacterEncoding.Utf16BeBom: - return Encoding.UTF32; - case TextEncodingDetect.CharacterEncoding.Utf16BeNoBom: - return Encoding.UTF32; - case TextEncodingDetect.CharacterEncoding.Utf16LeBom: - return Encoding.UTF32; - case TextEncodingDetect.CharacterEncoding.Utf16LeNoBom: - return Encoding.UTF32; - case TextEncodingDetect.CharacterEncoding.Utf8Bom: - return Encoding.UTF8; - case TextEncodingDetect.CharacterEncoding.Utf8Nobom: - return Encoding.UTF8; - default: - return null; - } - } - - private bool _langDetectInitialized; - public string GetDetectedEncodingName(byte[] bytes, int count, string language, bool enableLanguageDetection) - { - var index = 0; - - var encoding = GetInitialEncoding(bytes, count); - - if (encoding != null && encoding.Equals(Encoding.UTF8)) - { - return "utf-8"; - } - - if (string.IsNullOrWhiteSpace(language) && enableLanguageDetection) - { - if (!_langDetectInitialized) - { - _langDetectInitialized = true; - LanguageDetector.Initialize(_json); - } - - language = DetectLanguage(bytes, index, count); - - if (!string.IsNullOrWhiteSpace(language)) - { - _logger.LogDebug("Text language detected as {0}", language); - } - } - - var charset = DetectCharset(bytes, index, count, language); - - if (!string.IsNullOrWhiteSpace(charset)) - { - if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase)) - { - return "utf-8"; - } - - if (!string.Equals(charset, "windows-1252", StringComparison.OrdinalIgnoreCase)) - { - return charset; - } - } - - if (!string.IsNullOrWhiteSpace(language)) - { - return GetFileCharacterSetFromLanguage(language); - } - - return null; - } - - private string DetectLanguage(byte[] bytes, int index, int count) - { - try - { - return LanguageDetector.DetectLanguage(Encoding.UTF8.GetString(bytes, index, count)); - } - catch (NLangDetectException ex) - { - _logger.LogDebug(ex, "LanguageDetector.DetectLanguage threw a NLangDetectException."); - } - - try - { - return LanguageDetector.DetectLanguage(Encoding.ASCII.GetString(bytes, index, count)); - } - catch (NLangDetectException ex) - { - _logger.LogDebug(ex, "LanguageDetector.DetectLanguage threw a NLangDetectException."); - } - - try - { - return LanguageDetector.DetectLanguage(Encoding.Unicode.GetString(bytes, index, count)); - } - catch (NLangDetectException ex) - { - _logger.LogDebug(ex, "LanguageDetector.DetectLanguage threw a NLangDetectException."); - } - - return null; - } - - public Encoding GetEncodingFromCharset(string charset) - { - if (string.IsNullOrWhiteSpace(charset)) - { - throw new ArgumentNullException(nameof(charset)); - } - - _logger.LogDebug("Getting encoding object for character set: {0}", charset); - - try - { - return Encoding.GetEncoding(charset); - } - catch (ArgumentException) - { - charset = charset.Replace("-", string.Empty); - _logger.LogDebug("Getting encoding object for character set: {0}", charset); - - return Encoding.GetEncoding(charset); - } - } - - public Encoding GetDetectedEncoding(byte[] bytes, int size, string language, bool enableLanguageDetection) - { - var charset = GetDetectedEncodingName(bytes, size, language, enableLanguageDetection); - - return GetEncodingFromCharset(charset); - } - - private static string GetFileCharacterSetFromLanguage(string language) - { - // https://developer.xamarin.com/api/type/System.Text.Encoding/ - - switch (language.ToLower()) - { - case "tha": - return "windows-874"; - case "hun": - return "windows-1252"; - case "pol": - case "cze": - case "ces": - case "slo": - case "srp": - case "hrv": - case "rum": - case "ron": - case "rom": - case "rup": - return "windows-1250"; - // albanian - case "alb": - case "sqi": - return "windows-1250"; - // slovak - case "slk": - case "slv": - return "windows-1250"; - case "ara": - return "windows-1256"; - case "heb": - return "windows-1255"; - case "grc": - return "windows-1253"; - // greek - case "gre": - case "ell": - return "windows-1253"; - case "crh": - case "ota": - case "tur": - return "windows-1254"; - // bulgarian - case "bul": - case "bgr": - return "windows-1251"; - case "rus": - return "windows-1251"; - case "vie": - return "windows-1258"; - case "kor": - return "cp949"; - default: - return "windows-1252"; - } - } - - private static string DetectCharset(byte[] bytes, int index, int count, string language) - { - var detector = new CharsetDetector(); - detector.Feed(bytes, index, count); - detector.DataEnd(); - - var charset = detector.Charset; - - // This is often incorrectly indetected. If this happens, try to use other techniques instead - if (string.Equals("x-mac-cyrillic", charset, StringComparison.OrdinalIgnoreCase)) - { - if (!string.IsNullOrWhiteSpace(language)) - { - return null; - } - } - - return charset; - } - } -} -- cgit v1.2.3