diff options
| author | Luke Pulverenti <luke.pulverenti@gmail.com> | 2017-06-17 18:59:17 -0400 |
|---|---|---|
| committer | Luke Pulverenti <luke.pulverenti@gmail.com> | 2017-06-17 18:59:17 -0400 |
| commit | 0e7cbb04651bd9b65668ca1635a4625640639560 (patch) | |
| tree | 664fa5a66abb0d492479353da08a95ce6553125f /Emby.Common.Implementations/TextEncoding/TextEncoding.cs | |
| parent | c9d7eb9b0482ab26c2d288c460cd5fba3c993e7a (diff) | |
add subtitle language detection
Diffstat (limited to 'Emby.Common.Implementations/TextEncoding/TextEncoding.cs')
| -rw-r--r-- | Emby.Common.Implementations/TextEncoding/TextEncoding.cs | 63 |
1 file changed, 60 insertions, 3 deletions
diff --git a/Emby.Common.Implementations/TextEncoding/TextEncoding.cs b/Emby.Common.Implementations/TextEncoding/TextEncoding.cs index 49b424d5f..a5caae391 100644 --- a/Emby.Common.Implementations/TextEncoding/TextEncoding.cs +++ b/Emby.Common.Implementations/TextEncoding/TextEncoding.cs @@ -8,6 +8,8 @@ using System.Threading.Tasks; using MediaBrowser.Model.MediaInfo; using MediaBrowser.Model.Logging; using UniversalDetector; +using NLangDetect.Core; +using MediaBrowser.Model.Serialization; namespace Emby.Common.Implementations.TextEncoding { @@ -15,11 +17,13 @@ namespace Emby.Common.Implementations.TextEncoding { private readonly IFileSystem _fileSystem; private readonly ILogger _logger; + private IJsonSerializer _json; - public TextEncoding(IFileSystem fileSystem, ILogger logger) + public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json) { _fileSystem = fileSystem; _logger = logger; + _json = json; } public Encoding GetASCIIEncoding() @@ -63,6 +67,7 @@ namespace Emby.Common.Implementations.TextEncoding } } + private bool _langDetectInitialized; public string GetDetectedEncodingName(byte[] bytes, string language) { var encoding = GetInitialEncoding(bytes); @@ -72,6 +77,22 @@ namespace Emby.Common.Implementations.TextEncoding return "utf-8"; } + if (!_langDetectInitialized) + { + _langDetectInitialized = true; + LanguageDetector.Initialize(_json); + } + + if (string.IsNullOrWhiteSpace(language)) + { + language = DetectLanguage(bytes); + + if (!string.IsNullOrWhiteSpace(language)) + { + _logger.Debug("Text language detected as {0}", language); + } + } + var charset = DetectCharset(bytes, language); if (!string.IsNullOrWhiteSpace(charset)) @@ -95,6 +116,35 @@ namespace Emby.Common.Implementations.TextEncoding return null; } + private string DetectLanguage(byte[] bytes) + { + try + { + return LanguageDetector.DetectLanguage(Encoding.UTF8.GetString(bytes)); + } + catch (NLangDetectException ex) + { + } + + try + { + return 
LanguageDetector.DetectLanguage(Encoding.ASCII.GetString(bytes)); + } + catch (NLangDetectException ex) + { + } + + try + { + return LanguageDetector.DetectLanguage(Encoding.Unicode.GetString(bytes)); + } + catch (NLangDetectException ex) + { + } + + return null; + } + public Encoding GetEncodingFromCharset(string charset) { if (string.IsNullOrWhiteSpace(charset)) @@ -136,22 +186,29 @@ namespace Emby.Common.Implementations.TextEncoding case "cze": case "ces": case "slo": - case "slk": - case "slv": case "srp": case "hrv": case "rum": case "ron": case "rup": + return "windows-1250"; + // albanian case "alb": case "sqi": return "windows-1250"; + // slovak + case "slk": + case "slv": + return "windows-1250"; case "ara": return "windows-1256"; case "heb": return "windows-1255"; case "grc": + return "windows-1253"; + // greek case "gre": + case "ell": return "windows-1253"; case "crh": case "ota": |
