diff options
Diffstat (limited to 'Emby.Server.Implementations/TextEncoding/TextEncoding.cs')
| -rw-r--r-- | Emby.Server.Implementations/TextEncoding/TextEncoding.cs | 265 |
1 files changed, 265 insertions, 0 deletions
using System;
using System.Text;
using MediaBrowser.Model.IO;
using MediaBrowser.Model.Logging;
using MediaBrowser.Model.Serialization;
using MediaBrowser.Model.Text;
using NLangDetect.Core;
using UniversalDetector;

namespace Emby.Server.Implementations.TextEncoding
{
    /// <summary>
    /// Detects the character set of raw text bytes by combining byte-order-mark checks,
    /// <c>TextEncodingDetect</c>, <c>CharsetDetector</c> and an optional language hint.
    /// </summary>
    public class TextEncoding : ITextEncoding
    {
        private readonly IFileSystem _fileSystem;
        private readonly ILogger _logger;
        private readonly IJsonSerializer _json;

        // Set the first time language detection is requested so that
        // LanguageDetector.Initialize is only invoked once per instance.
        private bool _langDetectInitialized;

        /// <summary>
        /// Creates a new <see cref="TextEncoding"/>.
        /// </summary>
        /// <param name="fileSystem">File system abstraction (stored, not used by the methods in this class).</param>
        /// <param name="logger">Logger used for debug output.</param>
        /// <param name="json">Serializer handed to <c>LanguageDetector.Initialize</c>.</param>
        public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json)
        {
            _fileSystem = fileSystem;
            _logger = logger;
            _json = json;
        }

        /// <summary>
        /// Returns the ASCII encoding.
        /// </summary>
        /// <returns><see cref="Encoding.ASCII"/>.</returns>
        public Encoding GetASCIIEncoding()
        {
            return Encoding.ASCII;
        }

        /// <summary>
        /// Makes a first guess at the encoding from a leading byte order mark and, failing that,
        /// from <c>TextEncodingDetect.DetectEncoding</c>.
        /// </summary>
        /// <param name="buffer">The bytes to inspect.</param>
        /// <param name="count">The number of valid bytes in <paramref name="buffer"/>.</param>
        /// <returns>The guessed encoding, or <c>null</c> when the detector result is not mapped.</returns>
        private Encoding GetInitialEncoding(byte[] buffer, int count)
        {
            // EF BB BF: UTF-8 byte order mark.
            if (count >= 3 && buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf)
            {
                return Encoding.UTF8;
            }

            // FE FF: UTF-16 big-endian byte order mark (Encoding.Unicode would be little-endian).
            if (count >= 2 && buffer[0] == 0xfe && buffer[1] == 0xff)
            {
                return Encoding.BigEndianUnicode;
            }

            // 00 00 FE FF: UTF-32 big-endian byte order mark (Encoding.UTF32 would be little-endian).
            if (count >= 4 && buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff)
            {
                return new UTF32Encoding(true, true);
            }

            // 2B 2F 76: UTF-7 signature.
            if (count >= 3 && buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76)
            {
                return Encoding.UTF7;
            }

            switch (new TextEncodingDetect().DetectEncoding(buffer, count))
            {
                // NOTE(review): "Ansi" is reported as ASCII here; confirm that is intended.
                case TextEncodingDetect.CharacterEncoding.Ansi:
                case TextEncodingDetect.CharacterEncoding.Ascii:
                    return Encoding.ASCII;
                case TextEncodingDetect.CharacterEncoding.Utf16BeBom:
                case TextEncodingDetect.CharacterEncoding.Utf16BeNoBom:
                    return Encoding.BigEndianUnicode;
                case TextEncodingDetect.CharacterEncoding.Utf16LeBom:
                case TextEncodingDetect.CharacterEncoding.Utf16LeNoBom:
                    return Encoding.Unicode;
                case TextEncodingDetect.CharacterEncoding.Utf8Bom:
                case TextEncodingDetect.CharacterEncoding.Utf8Nobom:
                    return Encoding.UTF8;
                default:
                    return null;
            }
        }

        /// <summary>
        /// Determines the name of the character set used by the supplied bytes.
        /// </summary>
        /// <param name="bytes">The raw text bytes.</param>
        /// <param name="count">The number of bytes of <paramref name="bytes"/> to examine.</param>
        /// <param name="language">Optional language code used as a hint; may be null or empty.</param>
        /// <param name="enableLanguageDetection">
        /// When true and no <paramref name="language"/> is supplied, the language is detected from the text.
        /// </param>
        /// <returns>
        /// "utf-8" when UTF-8 is detected; otherwise the detected charset name, the charset mapped from
        /// the language, or <c>null</c> when neither a usable charset nor a language is available.
        /// </returns>
        public string GetDetectedEncodingName(byte[] bytes, int count, string language, bool enableLanguageDetection)
        {
            var index = 0;

            var encoding = GetInitialEncoding(bytes, count);

            if (encoding != null && encoding.Equals(Encoding.UTF8))
            {
                return "utf-8";
            }

            if (string.IsNullOrWhiteSpace(language) && enableLanguageDetection)
            {
                if (!_langDetectInitialized)
                {
                    _langDetectInitialized = true;
                    LanguageDetector.Initialize(_json);
                }

                language = DetectLanguage(bytes, index, count);

                if (!string.IsNullOrWhiteSpace(language))
                {
                    _logger.Debug("Text language detected as {0}", language);
                }
            }

            var charset = DetectCharset(bytes, index, count, language);

            if (!string.IsNullOrWhiteSpace(charset))
            {
                if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase))
                {
                    return "utf-8";
                }

                // A windows-1252 result is not returned directly; the language mapping below
                // gets the chance to pick a more specific code page first.
                if (!string.Equals(charset, "windows-1252", StringComparison.OrdinalIgnoreCase))
                {
                    return charset;
                }
            }

            if (!string.IsNullOrWhiteSpace(language))
            {
                return GetFileCharacterSetFromLanguage(language);
            }

            return null;
        }

        /// <summary>
        /// Tries to detect the language of the text by decoding the bytes as UTF-8, ASCII and
        /// UTF-16 (little-endian) in turn and returning the first successful detection.
        /// </summary>
        /// <param name="bytes">The raw text bytes.</param>
        /// <param name="index">Offset of the first byte to decode.</param>
        /// <param name="count">Number of bytes to decode.</param>
        /// <returns>The detected language, or <c>null</c> when every attempt throws.</returns>
        private string DetectLanguage(byte[] bytes, int index, int count)
        {
            var candidates = new Encoding[] { Encoding.UTF8, Encoding.ASCII, Encoding.Unicode };

            foreach (var candidate in candidates)
            {
                try
                {
                    return LanguageDetector.DetectLanguage(candidate.GetString(bytes, index, count));
                }
                catch (NLangDetectException)
                {
                    // Best effort: detection failed for this decoding, fall through to the next one.
                }
            }

            return null;
        }

        /// <summary>
        /// Resolves an <see cref="Encoding"/> from a character set name.
        /// </summary>
        /// <param name="charset">The character set name, e.g. "windows-1250".</param>
        /// <returns>The matching encoding.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="charset"/> is null, empty or whitespace.</exception>
        /// <exception cref="ArgumentException">
        /// The name is not recognised, even after removing the hyphens from it.
        /// </exception>
        public Encoding GetEncodingFromCharset(string charset)
        {
            if (string.IsNullOrWhiteSpace(charset))
            {
                throw new ArgumentNullException("charset");
            }

            _logger.Debug("Getting encoding object for character set: {0}", charset);

            try
            {
                return Encoding.GetEncoding(charset);
            }
            catch (ArgumentException)
            {
                // Retry with the hyphens stripped from the name.
                charset = charset.Replace("-", string.Empty);
                _logger.Debug("Getting encoding object for character set: {0}", charset);

                return Encoding.GetEncoding(charset);
            }
        }

        /// <summary>
        /// Detects the character set of the supplied bytes and returns the matching <see cref="Encoding"/>.
        /// </summary>
        /// <param name="bytes">The raw text bytes.</param>
        /// <param name="size">The number of bytes of <paramref name="bytes"/> to examine.</param>
        /// <param name="language">Optional language code used as a hint.</param>
        /// <param name="enableLanguageDetection">Whether the language may be detected from the text.</param>
        /// <returns>The detected encoding.</returns>
        /// <exception cref="ArgumentNullException">
        /// No character set name could be determined (<see cref="GetDetectedEncodingName"/> returned null).
        /// </exception>
        public Encoding GetDetectedEncoding(byte[] bytes, int size, string language, bool enableLanguageDetection)
        {
            var charset = GetDetectedEncodingName(bytes, size, language, enableLanguageDetection);

            return GetEncodingFromCharset(charset);
        }

        /// <summary>
        /// Maps a three-letter language code to the character set name assumed for that language.
        /// </summary>
        /// <param name="language">The language code; matched case-insensitively.</param>
        /// <returns>The mapped charset name; "windows-1252" for any unlisted language.</returns>
        private string GetFileCharacterSetFromLanguage(string language)
        {
            // https://developer.xamarin.com/api/type/System.Text.Encoding/

            // Invariant lowering: a culture-sensitive ToLower() breaks codes containing 'I'
            // (e.g. "VIE") when the current culture is Turkish.
            switch (language.ToLowerInvariant())
            {
                // NOTE(review): Hungarian is commonly encoded as windows-1250; confirm 1252 is intentional.
                case "hun":
                    return "windows-1252";
                case "pol":
                case "cze":
                case "ces":
                case "slo":
                case "srp":
                case "hrv":
                case "rum":
                case "ron":
                case "rup":
                    return "windows-1250";
                // albanian
                case "alb":
                case "sqi":
                    return "windows-1250";
                // slovak, slovenian
                case "slk":
                case "slv":
                    return "windows-1250";
                case "ara":
                    return "windows-1256";
                case "heb":
                    return "windows-1255";
                case "grc":
                    return "windows-1253";
                // greek
                case "gre":
                case "ell":
                    return "windows-1253";
                case "crh":
                case "ota":
                case "tur":
                    return "windows-1254";
                // bulgarian
                case "bul":
                case "bgr":
                    return "windows-1251";
                case "rus":
                    return "windows-1251";
                case "vie":
                    return "windows-1258";
                case "kor":
                    return "cp949";
                default:
                    return "windows-1252";
            }
        }

        /// <summary>
        /// Runs <c>CharsetDetector</c> over the bytes and returns the charset it reports.
        /// </summary>
        /// <param name="bytes">The raw text bytes.</param>
        /// <param name="index">Offset of the first byte to feed to the detector.</param>
        /// <param name="count">Number of bytes to feed to the detector.</param>
        /// <param name="language">Optional language code; only used to decide whether to discard a doubtful result.</param>
        /// <returns>
        /// The detected charset name, or <c>null</c> when the detector reports "x-mac-cyrillic"
        /// while a language is known.
        /// </returns>
        private string DetectCharset(byte[] bytes, int index, int count, string language)
        {
            var detector = new CharsetDetector();
            detector.Feed(bytes, index, count);
            detector.DataEnd();

            var charset = detector.Charset;

            // x-mac-cyrillic is often detected incorrectly. When a language is known, discard the
            // result so the caller can fall back to the language-based mapping instead.
            if (string.Equals("x-mac-cyrillic", charset, StringComparison.OrdinalIgnoreCase)
                && !string.IsNullOrWhiteSpace(language))
            {
                return null;
            }

            return charset;
        }
    }
}
