diff options
Diffstat (limited to 'Emby.Common.Implementations/TextEncoding/TextEncoding.cs')
| -rw-r--r-- | Emby.Common.Implementations/TextEncoding/TextEncoding.cs | 252 |
1 files changed, 0 insertions, 252 deletions
// Emby.Common.Implementations/TextEncoding/TextEncoding.cs
// (blob 54c47d62c; the surrounding diff records this 252-line file as deleted)
using System;
using System.Text;
using MediaBrowser.Model.IO;
using MediaBrowser.Model.Text;
using System.IO;
using System.Threading;
using System.Threading.Tasks;
using MediaBrowser.Model.MediaInfo;
using MediaBrowser.Model.Logging;
using UniversalDetector;
using NLangDetect.Core;
using MediaBrowser.Model.Serialization;

namespace Emby.Common.Implementations.TextEncoding
{
    /// <summary>
    /// Detects the character set of raw text bytes by combining byte-order-mark
    /// sniffing, statistical charset detection and, optionally, language detection.
    /// </summary>
    public class TextEncoding : ITextEncoding
    {
        private readonly IFileSystem _fileSystem;
        private readonly ILogger _logger;
        private readonly IJsonSerializer _json;

        // NOTE(review): this flag is read and written without synchronisation;
        // confirm whether instances are ever used from several threads at once.
        private bool _langDetectInitialized;

        /// <summary>
        /// Initializes a new instance of the <see cref="TextEncoding"/> class.
        /// </summary>
        /// <param name="fileSystem">File system abstraction (stored, not used by this class).</param>
        /// <param name="logger">Logger that receives debug output.</param>
        /// <param name="json">Serializer handed to the language detector when it is initialised.</param>
        public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json)
        {
            _fileSystem = fileSystem;
            _logger = logger;
            _json = json;
        }

        /// <summary>
        /// Returns the ASCII encoding.
        /// </summary>
        /// <returns><see cref="Encoding.ASCII"/>.</returns>
        public Encoding GetASCIIEncoding()
        {
            return Encoding.ASCII;
        }

        /// <summary>
        /// Makes a first guess at the encoding from a byte order mark, falling back
        /// to <see cref="TextEncodingDetect"/> when no mark is present.
        /// </summary>
        /// <param name="buffer">The raw bytes to inspect.</param>
        /// <returns>The guessed encoding, or <c>null</c> when nothing could be determined.</returns>
        private Encoding GetInitialEncoding(byte[] buffer)
        {
            if (buffer == null || buffer.Length == 0)
            {
                return null;
            }

            var length = buffer.Length;

            // Every BOM test is guarded by a length check so that very short
            // inputs no longer throw IndexOutOfRangeException.
            if (length >= 3 && buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf)
            {
                return Encoding.UTF8;
            }

            // The 4-byte UTF-32 marks are tested before the 2-byte UTF-16 marks
            // because FF FE 00 00 also starts with the UTF-16 LE mark FF FE.
            if (length >= 4 && buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff)
            {
                // 00 00 FE FF is big-endian UTF-32; Encoding.UTF32 is little-endian.
                return new UTF32Encoding(true, true);
            }

            if (length >= 4 && buffer[0] == 0xff && buffer[1] == 0xfe && buffer[2] == 0 && buffer[3] == 0)
            {
                return Encoding.UTF32;
            }

            if (length >= 2 && buffer[0] == 0xfe && buffer[1] == 0xff)
            {
                // FE FF is big-endian UTF-16; Encoding.Unicode is little-endian.
                return Encoding.BigEndianUnicode;
            }

            if (length >= 2 && buffer[0] == 0xff && buffer[1] == 0xfe)
            {
                return Encoding.Unicode;
            }

            if (length >= 3 && buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76)
            {
                return Encoding.UTF7;
            }

            var result = new TextEncodingDetect().DetectEncoding(buffer, buffer.Length);

            switch (result)
            {
                case TextEncodingDetect.CharacterEncoding.Ansi:
                case TextEncodingDetect.CharacterEncoding.Ascii:
                    return Encoding.ASCII;
                case TextEncodingDetect.CharacterEncoding.Utf16BeBom:
                case TextEncodingDetect.CharacterEncoding.Utf16BeNoBom:
                    return Encoding.BigEndianUnicode;
                case TextEncodingDetect.CharacterEncoding.Utf16LeBom:
                case TextEncodingDetect.CharacterEncoding.Utf16LeNoBom:
                    return Encoding.Unicode;
                case TextEncodingDetect.CharacterEncoding.Utf8Bom:
                case TextEncodingDetect.CharacterEncoding.Utf8Nobom:
                    return Encoding.UTF8;
                default:
                    return null;
            }
        }

        /// <summary>
        /// Determines the name of the character set used by the supplied bytes.
        /// </summary>
        /// <param name="bytes">The raw text bytes.</param>
        /// <param name="language">Optional language code of the text; may be null or empty.</param>
        /// <param name="enableLanguageDetection">
        /// When true and <paramref name="language"/> is blank, the language is detected from the content.
        /// </param>
        /// <returns>
        /// A charset name such as "utf-8" or "windows-1250", or <c>null</c> when neither
        /// a trustworthy charset nor a language is available.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
        public string GetDetectedEncodingName(byte[] bytes, string language, bool enableLanguageDetection)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException("bytes");
            }

            var encoding = GetInitialEncoding(bytes);

            if (encoding != null && encoding.Equals(Encoding.UTF8))
            {
                return "utf-8";
            }

            if (string.IsNullOrWhiteSpace(language) && enableLanguageDetection)
            {
                if (!_langDetectInitialized)
                {
                    _langDetectInitialized = true;
                    LanguageDetector.Initialize(_json);
                }

                language = DetectLanguage(bytes);

                if (!string.IsNullOrWhiteSpace(language))
                {
                    _logger.Debug("Text language detected as {0}", language);
                }
            }

            var charset = DetectCharset(bytes, language);

            if (!string.IsNullOrWhiteSpace(charset))
            {
                if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase))
                {
                    return "utf-8";
                }

                // A windows-1252 result is not returned directly; the language-based
                // table below decides instead.
                if (!string.Equals(charset, "windows-1252", StringComparison.OrdinalIgnoreCase))
                {
                    return charset;
                }
            }

            if (!string.IsNullOrWhiteSpace(language))
            {
                return GetFileCharacterSetFromLanguage(language);
            }

            return null;
        }

        /// <summary>
        /// Tries to detect the language of the text by decoding the bytes as UTF-8,
        /// ASCII and UTF-16 (little-endian) in turn.
        /// </summary>
        /// <param name="bytes">The raw text bytes.</param>
        /// <returns>The detected language, or <c>null</c> when every attempt failed.</returns>
        private string DetectLanguage(byte[] bytes)
        {
            var candidates = new[] { Encoding.UTF8, Encoding.ASCII, Encoding.Unicode };

            foreach (var candidate in candidates)
            {
                try
                {
                    return LanguageDetector.DetectLanguage(candidate.GetString(bytes));
                }
                catch (NLangDetectException ex)
                {
                    // Best effort: a failed attempt just means the next decoding is tried.
                    _logger.Debug("Language detection failed using {0}: {1}", candidate.WebName, ex.Message);
                }
            }

            return null;
        }

        /// <summary>
        /// Resolves a charset name to an <see cref="Encoding"/> instance.
        /// </summary>
        /// <param name="charset">The charset name, e.g. "windows-1250".</param>
        /// <returns>The matching encoding.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="charset"/> is null or blank.</exception>
        /// <exception cref="ArgumentException">
        /// The name is not recognised, even after its dashes have been removed.
        /// </exception>
        public Encoding GetEncodingFromCharset(string charset)
        {
            if (string.IsNullOrWhiteSpace(charset))
            {
                throw new ArgumentNullException("charset");
            }

            _logger.Debug("Getting encoding object for character set: {0}", charset);

            try
            {
                return Encoding.GetEncoding(charset);
            }
            catch (ArgumentException)
            {
                // Second attempt with the dashes stripped from the name.
                charset = charset.Replace("-", string.Empty);
                _logger.Debug("Getting encoding object for character set: {0}", charset);

                return Encoding.GetEncoding(charset);
            }
        }

        /// <summary>
        /// Detects the encoding of the supplied bytes and returns it as an
        /// <see cref="Encoding"/> instance.
        /// </summary>
        /// <param name="bytes">The raw text bytes.</param>
        /// <param name="language">Optional language code of the text.</param>
        /// <param name="enableLanguageDetection">Whether the language may be detected from the content.</param>
        /// <returns>The detected encoding.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="bytes"/> is null, or no charset could be determined.
        /// </exception>
        public Encoding GetDetectedEncoding(byte[] bytes, string language, bool enableLanguageDetection)
        {
            var charset = GetDetectedEncodingName(bytes, language, enableLanguageDetection);

            // NOTE(review): charset can be null here, in which case
            // GetEncodingFromCharset throws ArgumentNullException. Kept as-is so
            // callers see the same exception as before; confirm this is intended.
            return GetEncodingFromCharset(charset);
        }

        /// <summary>
        /// Maps a three-letter language code to the Windows code page assumed for
        /// text in that language.
        /// </summary>
        /// <param name="language">The language code; matched case-insensitively.</param>
        /// <returns>A charset name; "windows-1252" for any code not listed.</returns>
        private string GetFileCharacterSetFromLanguage(string language)
        {
            // https://developer.xamarin.com/api/type/System.Text.Encoding/

            // ToLowerInvariant instead of ToLower: culture-sensitive lowering maps
            // 'I' to a dotless 'ı' under Turkish cultures, so "VIE" would not match.
            switch (language.ToLowerInvariant())
            {
                case "hun":
                    // NOTE(review): Hungarian text is usually windows-1250; the
                    // original mapping is kept - confirm before changing.
                    return "windows-1252";
                case "pol":
                case "cze":
                case "ces":
                case "slo":
                case "srp":
                case "hrv":
                case "rum":
                case "ron":
                case "rup":
                // Albanian
                case "alb":
                case "sqi":
                // Slovak, Slovenian
                case "slk":
                case "slv":
                    return "windows-1250";
                case "ara":
                    return "windows-1256";
                case "heb":
                    return "windows-1255";
                // Greek
                case "grc":
                case "gre":
                case "ell":
                    return "windows-1253";
                case "crh":
                case "ota":
                case "tur":
                    return "windows-1254";
                // Bulgarian, Russian
                case "bul":
                case "bgr":
                case "rus":
                    return "windows-1251";
                case "vie":
                    return "windows-1258";
                case "kor":
                    return "cp949";
                default:
                    return "windows-1252";
            }
        }

        /// <summary>
        /// Runs the statistical charset detector over the bytes.
        /// </summary>
        /// <param name="bytes">The raw text bytes.</param>
        /// <param name="language">Optional language code; used only to distrust an x-mac-cyrillic result.</param>
        /// <returns>The detected charset name, or <c>null</c> when none is reported or the result is rejected.</returns>
        private string DetectCharset(byte[] bytes, string language)
        {
            var detector = new CharsetDetector();
            detector.Feed(bytes, 0, bytes.Length);
            detector.DataEnd();

            var charset = detector.Charset;

            // x-mac-cyrillic is often detected incorrectly. When a language is known,
            // discard the result so the caller falls back to the language-based table.
            if (string.Equals("x-mac-cyrillic", charset, StringComparison.OrdinalIgnoreCase)
                && !string.IsNullOrWhiteSpace(language))
            {
                return null;
            }

            return charset;
        }
    }
}
