aboutsummaryrefslogtreecommitdiff
path: root/Emby.Common.Implementations/TextEncoding/TextEncoding.cs
diff options
context:
space:
mode:
Diffstat (limited to 'Emby.Common.Implementations/TextEncoding/TextEncoding.cs')
-rw-r--r--Emby.Common.Implementations/TextEncoding/TextEncoding.cs72
1 files changed, 66 insertions, 6 deletions
diff --git a/Emby.Common.Implementations/TextEncoding/TextEncoding.cs b/Emby.Common.Implementations/TextEncoding/TextEncoding.cs
index 49b424d5f..54c47d62c 100644
--- a/Emby.Common.Implementations/TextEncoding/TextEncoding.cs
+++ b/Emby.Common.Implementations/TextEncoding/TextEncoding.cs
@@ -8,6 +8,8 @@ using System.Threading.Tasks;
using MediaBrowser.Model.MediaInfo;
using MediaBrowser.Model.Logging;
using UniversalDetector;
+using NLangDetect.Core;
+using MediaBrowser.Model.Serialization;
namespace Emby.Common.Implementations.TextEncoding
{
@@ -15,11 +17,13 @@ namespace Emby.Common.Implementations.TextEncoding
{
private readonly IFileSystem _fileSystem;
private readonly ILogger _logger;
+ private IJsonSerializer _json;
- public TextEncoding(IFileSystem fileSystem, ILogger logger)
+ public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json)
{
_fileSystem = fileSystem;
_logger = logger;
+ _json = json;
}
public Encoding GetASCIIEncoding()
@@ -63,7 +67,8 @@ namespace Emby.Common.Implementations.TextEncoding
}
}
- public string GetDetectedEncodingName(byte[] bytes, string language)
+ private bool _langDetectInitialized;
+ public string GetDetectedEncodingName(byte[] bytes, string language, bool enableLanguageDetection)
{
var encoding = GetInitialEncoding(bytes);
@@ -72,6 +77,22 @@ namespace Emby.Common.Implementations.TextEncoding
return "utf-8";
}
+ if (string.IsNullOrWhiteSpace(language) && enableLanguageDetection)
+ {
+ if (!_langDetectInitialized)
+ {
+ _langDetectInitialized = true;
+ LanguageDetector.Initialize(_json);
+ }
+
+ language = DetectLanguage(bytes);
+
+ if (!string.IsNullOrWhiteSpace(language))
+ {
+ _logger.Debug("Text language detected as {0}", language);
+ }
+ }
+
var charset = DetectCharset(bytes, language);
if (!string.IsNullOrWhiteSpace(charset))
@@ -95,6 +116,35 @@ namespace Emby.Common.Implementations.TextEncoding
return null;
}
+ /// <summary>
+ /// Tries to detect the language of the supplied text bytes.
+ /// The bytes are decoded as UTF-8, then ASCII, then UTF-16 (Encoding.Unicode), and the
+ /// result of the first decoding the language detector accepts is returned.
+ /// </summary>
+ /// <param name="bytes">The raw text content to inspect.</param>
+ /// <returns>
+ /// The value reported by the language detector, or null when detection throws
+ /// an NLangDetectException for every candidate encoding.
+ /// </returns>
+ private string DetectLanguage(byte[] bytes)
+ {
+     // Order matters: the first decoding for which detection does not throw wins.
+     var candidateEncodings = new[] { Encoding.UTF8, Encoding.ASCII, Encoding.Unicode };
+
+     foreach (var candidate in candidateEncodings)
+     {
+         try
+         {
+             return LanguageDetector.DetectLanguage(candidate.GetString(bytes));
+         }
+         catch (NLangDetectException ex)
+         {
+             // Detection is best-effort: note why this decoding failed, then try the next one.
+             _logger.Debug("Language detection failed using {0}: {1}", candidate.WebName, ex.Message);
+         }
+     }
+
+     return null;
+ }
+
+
public Encoding GetEncodingFromCharset(string charset)
{
if (string.IsNullOrWhiteSpace(charset))
@@ -117,9 +167,9 @@ namespace Emby.Common.Implementations.TextEncoding
}
}
- public Encoding GetDetectedEncoding(byte[] bytes, string language)
+ public Encoding GetDetectedEncoding(byte[] bytes, string language, bool enableLanguageDetection)
{
- var charset = GetDetectedEncodingName(bytes, language);
+ var charset = GetDetectedEncodingName(bytes, language, enableLanguageDetection);
return GetEncodingFromCharset(charset);
}
@@ -136,28 +186,38 @@ namespace Emby.Common.Implementations.TextEncoding
case "cze":
case "ces":
case "slo":
- case "slk":
- case "slv":
case "srp":
case "hrv":
case "rum":
case "ron":
case "rup":
+ return "windows-1250";
+ // albanian
case "alb":
case "sqi":
return "windows-1250";
+ // slovak, slovenian
+ case "slk":
+ case "slv":
+ return "windows-1250";
case "ara":
return "windows-1256";
case "heb":
return "windows-1255";
case "grc":
+ return "windows-1253";
+ // greek
case "gre":
+ case "ell":
return "windows-1253";
case "crh":
case "ota":
case "tur":
return "windows-1254";
+ // bulgarian
+ case "bul":
case "bgr":
+ return "windows-1251";
case "rus":
return "windows-1251";
case "vie":