diff options
Diffstat (limited to 'Emby.Common.Implementations/TextEncoding/TextEncoding.cs')
| -rw-r--r-- | Emby.Common.Implementations/TextEncoding/TextEncoding.cs | 72 |
1 file changed, 66 insertions, 6 deletions
diff --git a/Emby.Common.Implementations/TextEncoding/TextEncoding.cs b/Emby.Common.Implementations/TextEncoding/TextEncoding.cs
index 49b424d5f..54c47d62c 100644
--- a/Emby.Common.Implementations/TextEncoding/TextEncoding.cs
+++ b/Emby.Common.Implementations/TextEncoding/TextEncoding.cs
@@ -8,6 +8,8 @@ using System.Threading.Tasks;
 using MediaBrowser.Model.MediaInfo;
 using MediaBrowser.Model.Logging;
 using UniversalDetector;
+using NLangDetect.Core;
+using MediaBrowser.Model.Serialization;
 
 namespace Emby.Common.Implementations.TextEncoding
 {
@@ -15,11 +17,13 @@ namespace Emby.Common.Implementations.TextEncoding
     {
         private readonly IFileSystem _fileSystem;
         private readonly ILogger _logger;
+        private IJsonSerializer _json;
 
-        public TextEncoding(IFileSystem fileSystem, ILogger logger)
+        public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json)
         {
             _fileSystem = fileSystem;
             _logger = logger;
+            _json = json;
         }
 
         public Encoding GetASCIIEncoding()
@@ -63,7 +67,8 @@ namespace Emby.Common.Implementations.TextEncoding
             }
         }
 
-        public string GetDetectedEncodingName(byte[] bytes, string language)
+        private bool _langDetectInitialized;
+        public string GetDetectedEncodingName(byte[] bytes, string language, bool enableLanguageDetection)
         {
             var encoding = GetInitialEncoding(bytes);
 
@@ -72,6 +77,22 @@ namespace Emby.Common.Implementations.TextEncoding
                 return "utf-8";
             }
 
+            if (string.IsNullOrWhiteSpace(language) && enableLanguageDetection)
+            {
+                if (!_langDetectInitialized)
+                {
+                    _langDetectInitialized = true;
+                    LanguageDetector.Initialize(_json);
+                }
+
+                language = DetectLanguage(bytes);
+
+                if (!string.IsNullOrWhiteSpace(language))
+                {
+                    _logger.Debug("Text language detected as {0}", language);
+                }
+            }
+
             var charset = DetectCharset(bytes, language);
 
             if (!string.IsNullOrWhiteSpace(charset))
@@ -95,6 +116,35 @@ namespace Emby.Common.Implementations.TextEncoding
             return null;
         }
 
+        private string DetectLanguage(byte[] bytes)
+        {
+            try
+            {
+                return LanguageDetector.DetectLanguage(Encoding.UTF8.GetString(bytes));
+            }
+            catch (NLangDetectException ex)
+            {
+            }
+
+            try
+            {
+                return LanguageDetector.DetectLanguage(Encoding.ASCII.GetString(bytes));
+            }
+            catch (NLangDetectException ex)
+            {
+            }
+
+            try
+            {
+                return LanguageDetector.DetectLanguage(Encoding.Unicode.GetString(bytes));
+            }
+            catch (NLangDetectException ex)
+            {
+            }
+
+            return null;
+        }
+
         public Encoding GetEncodingFromCharset(string charset)
         {
             if (string.IsNullOrWhiteSpace(charset))
@@ -117,9 +167,9 @@ namespace Emby.Common.Implementations.TextEncoding
             }
         }
 
-        public Encoding GetDetectedEncoding(byte[] bytes, string language)
+        public Encoding GetDetectedEncoding(byte[] bytes, string language, bool enableLanguageDetection)
         {
-            var charset = GetDetectedEncodingName(bytes, language);
+            var charset = GetDetectedEncodingName(bytes, language, enableLanguageDetection);
 
             return GetEncodingFromCharset(charset);
         }
@@ -136,28 +186,38 @@ namespace Emby.Common.Implementations.TextEncoding
                 case "cze":
                 case "ces":
                 case "slo":
-                case "slk":
-                case "slv":
                 case "srp":
                 case "hrv":
                 case "rum":
                 case "ron":
                 case "rup":
+                    return "windows-1250";
+                // albanian
                 case "alb":
                 case "sqi":
                     return "windows-1250";
+                // slovak
+                case "slk":
+                case "slv":
+                    return "windows-1250";
                 case "ara":
                     return "windows-1256";
                 case "heb":
                     return "windows-1255";
                 case "grc":
+                    return "windows-1253";
+                // greek
                 case "gre":
+                case "ell":
                     return "windows-1253";
                 case "crh":
                 case "ota":
                 case "tur":
                     return "windows-1254";
+                // bulgarian
+                case "bul":
                 case "bgr":
+                    return "windows-1251";
                 case "rus":
                     return "windows-1251";
                 case "vie":
