1 files changed, 265 insertions, 0 deletions
diff --git a/Emby.Server.Implementations/TextEncoding/TextEncoding.cs b/Emby.Server.Implementations/TextEncoding/TextEncoding.cs
new file mode 100644
index 000000000..9eb9be7ea
--- /dev/null
+++ b/Emby.Server.Implementations/TextEncoding/TextEncoding.cs
@@ -0,0 +1,265 @@
+using System;
+using System.Text;
+using MediaBrowser.Model.IO;
+using MediaBrowser.Model.Logging;
+using MediaBrowser.Model.Serialization;
+using MediaBrowser.Model.Text;
+using NLangDetect.Core;
+using UniversalDetector;
+
+namespace Emby.Server.Implementations.TextEncoding
+{
+    public class TextEncoding : ITextEncoding
+    {
+        private readonly IFileSystem _fileSystem;
+        private readonly ILogger _logger;
+        private IJsonSerializer _json;
+
+        public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json)
+        {
+            _fileSystem = fileSystem;
+            _logger = logger;
+            _json = json;
+        }
+
+        public Encoding GetASCIIEncoding()
+        {
+            return Encoding.ASCII;
+        }
+
+        private Encoding GetInitialEncoding(byte[] buffer, int count)
+        {
+            if (count >= 3)
+            {
+                if (buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf)
+                    return Encoding.UTF8;
+            }
+
+            if (count >= 2)
+            {
+                if (buffer[0] == 0xfe && buffer[1] == 0xff)
+                    return Encoding.Unicode;
+            }
+
+            if (count >= 4)
+            {
+                if (buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff)
+                    return Encoding.UTF32;
+            }
+
+            if (count >= 3)
+            {
+                if (buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76)
+                    return Encoding.UTF7;
+            }
+
+            var result = new TextEncodingDetect().DetectEncoding(buffer, count);
+
+            switch (result)
+            {
+                case TextEncodingDetect.CharacterEncoding.Ansi:
+                    return Encoding.ASCII;
+                case TextEncodingDetect.CharacterEncoding.Ascii:
+                    return Encoding.ASCII;
+                case TextEncodingDetect.CharacterEncoding.Utf16BeBom:
+                    return Encoding.UTF32;
+                case TextEncodingDetect.CharacterEncoding.Utf16BeNoBom:
+                    return Encoding.UTF32;
+                case TextEncodingDetect.CharacterEncoding.Utf16LeBom:
+                    return Encoding.UTF32;
+                case TextEncodingDetect.CharacterEncoding.Utf16LeNoBom:
+                    return Encoding.UTF32;
+                case TextEncodingDetect.CharacterEncoding.Utf8Bom:
+                    return Encoding.UTF8;
+                case TextEncodingDetect.CharacterEncoding.Utf8Nobom:
+                    return Encoding.UTF8;
+                default:
+                    return null;
+            }
+        }
+
+        private bool _langDetectInitialized;
+        public string GetDetectedEncodingName(byte[] bytes, int count, string language, bool enableLanguageDetection)
+        {
+            var index = 0;
+
+            var encoding = GetInitialEncoding(bytes, count);
+
+            if (encoding != null && encoding.Equals(Encoding.UTF8))
+            {
+                return "utf-8";
+            }
+
+            if (string.IsNullOrWhiteSpace(language) && enableLanguageDetection)
+            {
+                if (!_langDetectInitialized)
+                {
+                    _langDetectInitialized = true;
+                    LanguageDetector.Initialize(_json);
+                }
+
+                language = DetectLanguage(bytes, index, count);
+
+                if (!string.IsNullOrWhiteSpace(language))
+                {
+                    _logger.Debug("Text language detected as {0}", language);
+                }
+            }
+
+            var charset = DetectCharset(bytes, index, count, language);
+
+            if (!string.IsNullOrWhiteSpace(charset))
+            {
+                if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase))
+                {
+                    return "utf-8";
+                }
+
+                if (!string.Equals(charset, "windows-1252", StringComparison.OrdinalIgnoreCase))
+                {
+                    return charset;
+                }
+            }
+
+            if (!string.IsNullOrWhiteSpace(language))
+            {
+                return GetFileCharacterSetFromLanguage(language);
+            }
+
+            return null;
+        }
+
+        private string DetectLanguage(byte[] bytes, int index, int count)
+        {
+            try
+            {
+                return LanguageDetector.DetectLanguage(Encoding.UTF8.GetString(bytes, index, count));
+            }
+            catch (NLangDetectException ex)
+            {
+            }
+
+            try
+            {
+                return LanguageDetector.DetectLanguage(Encoding.ASCII.GetString(bytes, index, count));
+            }
+            catch (NLangDetectException ex)
+            {
+            }
+
+            try
+            {
+                return LanguageDetector.DetectLanguage(Encoding.Unicode.GetString(bytes, index, count));
+            }
+            catch (NLangDetectException ex)
+            {
+            }
+
+            return null;
+        }
+
+        public Encoding GetEncodingFromCharset(string charset)
+        {
+            if (string.IsNullOrWhiteSpace(charset))
+            {
+                throw new ArgumentNullException("charset");
+            }
+
+            _logger.Debug("Getting encoding object for character set: {0}", charset);
+
+            try
+            {
+                return Encoding.GetEncoding(charset);
+            }
+            catch (ArgumentException)
+            {
+                charset = charset.Replace("-", string.Empty);
+                _logger.Debug("Getting encoding object for character set: {0}", charset);
+
+                return Encoding.GetEncoding(charset);
+            }
+        }
+
+        public Encoding GetDetectedEncoding(byte[] bytes, int size, string language, bool enableLanguageDetection)
+        {
+            var charset = GetDetectedEncodingName(bytes, size, language, enableLanguageDetection);
+
+            return GetEncodingFromCharset(charset);
+        }
+
+        private string GetFileCharacterSetFromLanguage(string language)
+        {
+            // https://developer.xamarin.com/api/type/System.Text.Encoding/
+
+            switch (language.ToLower())
+            {
+                case "hun":
+                    return "windows-1252";
+                case "pol":
+                case "cze":
+                case "ces":
+                case "slo":
+                case "srp":
+                case "hrv":
+                case "rum":
+                case "ron":
+                case "rup":
+                    return "windows-1250";
+                // albanian
+                case "alb":
+                case "sqi":
+                    return "windows-1250";
+                // slovak
+                case "slk":
+                case "slv":
+                    return "windows-1250";
+                case "ara":
+                    return "windows-1256";
+                case "heb":
+                    return "windows-1255";
+                case "grc":
+                    return "windows-1253";
+                // greek
+                case "gre":
+                case "ell":
+                    return "windows-1253";
+                case "crh":
+                case "ota":
+                case "tur":
+                    return "windows-1254";
+                // bulgarian
+                case "bul":
+                case "bgr":
+                    return "windows-1251";
+                case "rus":
+                    return "windows-1251";
+                case "vie":
+                    return "windows-1258";
+                case "kor":
+                    return "cp949";
+                default:
+                    return "windows-1252";
+            }
+        }
+
+        private string DetectCharset(byte[] bytes, int index, int count, string language)
+        {
+            var detector = new CharsetDetector();
+            detector.Feed(bytes, index, count);
+            detector.DataEnd();
+
+            var charset = detector.Charset;
+
+            // This is often incorrectly indetected. If this happens, try to use other techniques instead
+            if (string.Equals("x-mac-cyrillic", charset, StringComparison.OrdinalIgnoreCase))
+            {
+                if (!string.IsNullOrWhiteSpace(language))
+                {
+                    return null;
+                }
+            }
+
+            return charset;
+        }
+    }
+}