about | summary | refs | log | tree | commit | diff
path: root/Emby.Common.Implementations/TextEncoding/TextEncoding.cs
diff options
context:
space:
mode:
author: Luke Pulverenti <luke.pulverenti@gmail.com> 2017-06-17 18:59:17 -0400
committer: Luke Pulverenti <luke.pulverenti@gmail.com> 2017-06-17 18:59:17 -0400
commit: 0e7cbb04651bd9b65668ca1635a4625640639560 (patch)
tree: 664fa5a66abb0d492479353da08a95ce6553125f /Emby.Common.Implementations/TextEncoding/TextEncoding.cs
parent: c9d7eb9b0482ab26c2d288c460cd5fba3c993e7a (diff)
add subtitle language detection
Diffstat (limited to 'Emby.Common.Implementations/TextEncoding/TextEncoding.cs')
-rw-r--r-- Emby.Common.Implementations/TextEncoding/TextEncoding.cs 63
1 files changed, 60 insertions, 3 deletions
diff --git a/Emby.Common.Implementations/TextEncoding/TextEncoding.cs b/Emby.Common.Implementations/TextEncoding/TextEncoding.cs
index 49b424d5f..a5caae391 100644
--- a/Emby.Common.Implementations/TextEncoding/TextEncoding.cs
+++ b/Emby.Common.Implementations/TextEncoding/TextEncoding.cs
@@ -8,6 +8,8 @@ using System.Threading.Tasks;
using MediaBrowser.Model.MediaInfo;
using MediaBrowser.Model.Logging;
using UniversalDetector;
+using NLangDetect.Core;
+using MediaBrowser.Model.Serialization;
namespace Emby.Common.Implementations.TextEncoding
{
@@ -15,11 +17,13 @@ namespace Emby.Common.Implementations.TextEncoding
{
private readonly IFileSystem _fileSystem;
private readonly ILogger _logger;
+ private IJsonSerializer _json;
- public TextEncoding(IFileSystem fileSystem, ILogger logger)
+ public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json) // new signature: a JSON serializer is now required in addition to the file system and logger
{
_fileSystem = fileSystem;
_logger = logger;
+ _json = json; // kept so GetDetectedEncodingName can pass it to LanguageDetector.Initialize on first use
}
public Encoding GetASCIIEncoding()
@@ -63,6 +67,7 @@ namespace Emby.Common.Implementations.TextEncoding
}
}
+ private bool _langDetectInitialized;
public string GetDetectedEncodingName(byte[] bytes, string language)
{
var encoding = GetInitialEncoding(bytes);
@@ -72,6 +77,22 @@ namespace Emby.Common.Implementations.TextEncoding
return "utf-8";
}
+ if (!_langDetectInitialized)
+ {
+ _langDetectInitialized = true;
+ LanguageDetector.Initialize(_json);
+ }
+
+ if (string.IsNullOrWhiteSpace(language))
+ {
+ language = DetectLanguage(bytes);
+
+ if (!string.IsNullOrWhiteSpace(language))
+ {
+ _logger.Debug("Text language detected as {0}", language);
+ }
+ }
+
var charset = DetectCharset(bytes, language);
if (!string.IsNullOrWhiteSpace(charset))
@@ -95,6 +116,35 @@ namespace Emby.Common.Implementations.TextEncoding
return null;
}
+ private string DetectLanguage(byte[] bytes) // Runs LanguageDetector over the bytes decoded as UTF-8, then ASCII, then Encoding.Unicode; returns the first result obtained without an NLangDetectException.
+ {
+ try
+ {
+ return LanguageDetector.DetectLanguage(Encoding.UTF8.GetString(bytes)); // first attempt: interpret the bytes as UTF-8
+ }
+ catch (NLangDetectException ex) // intentionally swallowed (ex is unused) so the ASCII attempt below still runs
+ {
+ }
+
+ try
+ {
+ return LanguageDetector.DetectLanguage(Encoding.ASCII.GetString(bytes)); // second attempt: interpret the bytes as ASCII
+ }
+ catch (NLangDetectException ex) // intentionally swallowed (ex is unused) so the Encoding.Unicode attempt below still runs
+ {
+ }
+
+ try
+ {
+ return LanguageDetector.DetectLanguage(Encoding.Unicode.GetString(bytes)); // last attempt: interpret the bytes with Encoding.Unicode
+ }
+ catch (NLangDetectException ex) // intentionally swallowed (ex is unused); falls through to the null result
+ {
+ }
+
+ return null; // reached only when all three attempts threw NLangDetectException
+ }
+
public Encoding GetEncodingFromCharset(string charset)
{
if (string.IsNullOrWhiteSpace(charset))
@@ -136,22 +186,29 @@ namespace Emby.Common.Implementations.TextEncoding
case "cze":
case "ces":
case "slo":
- case "slk":
- case "slv":
case "srp":
case "hrv":
case "rum":
case "ron":
case "rup":
+ return "windows-1250";
+ // albanian
case "alb":
case "sqi":
return "windows-1250";
+ // slovak (slk) and slovenian (slv)
+ case "slk":
+ case "slv":
+ return "windows-1250";
case "ara":
return "windows-1256";
case "heb":
return "windows-1255";
case "grc":
+ return "windows-1253";
+ // greek
case "gre":
+ case "ell":
return "windows-1253";
case "crh":
case "ota":