aboutsummaryrefslogtreecommitdiff
path: root/MediaBrowser.MediaEncoding/Subtitles/SubtitleEncoder.cs
diff options
context:
space:
mode:
author Luke Pulverenti <luke.pulverenti@gmail.com> 2017-04-18 01:53:39 -0400
committer Luke Pulverenti <luke.pulverenti@gmail.com> 2017-04-18 01:53:39 -0400
commit 4d7d8961b44aadd1d1c8f84bedd5d5ff9508d876 (patch)
tree 2d6390aa6429d6ef37191235f2140611262186da /MediaBrowser.MediaEncoding/Subtitles/SubtitleEncoder.cs
parent 6a66aef608a0caa567f603edbd9a4e3466fda469 (diff)
update encoding detection
Diffstat (limited to 'MediaBrowser.MediaEncoding/Subtitles/SubtitleEncoder.cs')
-rw-r--r-- MediaBrowser.MediaEncoding/Subtitles/SubtitleEncoder.cs 151
1 files changed, 19 insertions, 132 deletions
diff --git a/MediaBrowser.MediaEncoding/Subtitles/SubtitleEncoder.cs b/MediaBrowser.MediaEncoding/Subtitles/SubtitleEncoder.cs
index 77b976206..231a2ae85 100644
--- a/MediaBrowser.MediaEncoding/Subtitles/SubtitleEncoder.cs
+++ b/MediaBrowser.MediaEncoding/Subtitles/SubtitleEncoder.cs
@@ -21,7 +21,6 @@ using MediaBrowser.Model.IO;
using MediaBrowser.Model.Diagnostics;
using MediaBrowser.Model.Dto;
using MediaBrowser.Model.Text;
-using UniversalDetector;
namespace MediaBrowser.MediaEncoding.Subtitles
{
@@ -197,17 +196,20 @@ namespace MediaBrowser.MediaEncoding.Subtitles
{
if (requiresCharset)
{
- var charset = await GetSubtitleFileCharacterSet(path, language, protocol, cancellationToken).ConfigureAwait(false);
+ var bytes = await GetBytes(path, protocol, cancellationToken).ConfigureAwait(false);
+
+ var charset = _textEncoding.GetDetectedEncodingName(bytes, language);
+ _logger.Debug("charset {0} detected for {1}", charset ?? "null", path);
if (!string.IsNullOrEmpty(charset))
{
- using (var fs = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false))
+ using (var inputStream = _memoryStreamProvider.CreateNew(bytes))
{
- using (var reader = new StreamReader(fs, GetEncoding(charset)))
+ using (var reader = new StreamReader(inputStream, _textEncoding.GetEncodingFromCharset(charset)))
{
var text = await reader.ReadToEndAsync().ConfigureAwait(false);
- var bytes = Encoding.UTF8.GetBytes(text);
+ bytes = Encoding.UTF8.GetBytes(text);
return _memoryStreamProvider.CreateNew(bytes);
}
@@ -218,28 +220,6 @@ namespace MediaBrowser.MediaEncoding.Subtitles
return _fileSystem.OpenRead(path);
}
- private Encoding GetEncoding(string charset)
- {
- if (string.IsNullOrWhiteSpace(charset))
- {
- throw new ArgumentNullException("charset");
- }
-
- _logger.Debug("Getting encoding object for character set: {0}", charset);
-
- try
- {
- return Encoding.GetEncoding(charset);
- }
- catch (ArgumentException)
- {
- charset = charset.Replace("-", string.Empty);
- _logger.Debug("Getting encoding object for character set: {0}", charset);
-
- return Encoding.GetEncoding(charset);
- }
- }
-
private async Task<Tuple<string, MediaProtocol, string, bool>> GetReadableFile(string mediaPath,
string[] inputFiles,
MediaProtocol protocol,
@@ -724,126 +704,33 @@ namespace MediaBrowser.MediaEncoding.Subtitles
public async Task<string> GetSubtitleFileCharacterSet(string path, string language, MediaProtocol protocol, CancellationToken cancellationToken)
{
- if (protocol == MediaProtocol.File)
- {
- var fileEncoding = _textEncoding.GetFileEncoding(path);
-
- if (fileEncoding != null && fileEncoding.Equals(Encoding.UTF8))
- {
- return string.Empty;
- }
- }
-
- var charset = await DetectCharset(path, language, protocol, cancellationToken).ConfigureAwait(false);
-
- if (!string.IsNullOrWhiteSpace(charset))
- {
- if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase))
- {
- return null;
- }
-
- return charset;
- }
+ var bytes = await GetBytes(path, protocol, cancellationToken).ConfigureAwait(false);
- if (!string.IsNullOrWhiteSpace(language))
- {
- return GetSubtitleFileCharacterSetFromLanguage(language);
- }
+ var charset = _textEncoding.GetDetectedEncodingName(bytes, language);
- return null;
- }
+ _logger.Debug("charset {0} detected for {1}", charset ?? "null", path);
- public string GetSubtitleFileCharacterSetFromLanguage(string language)
- {
- // https://developer.xamarin.com/api/type/System.Text.Encoding/
-
- switch (language.ToLower())
- {
- case "hun":
- return "windows-1252";
- case "pol":
- case "cze":
- case "ces":
- case "slo":
- case "slk":
- case "slv":
- case "srp":
- case "hrv":
- case "rum":
- case "ron":
- case "rup":
- case "alb":
- case "sqi":
- return "windows-1250";
- case "ara":
- return "windows-1256";
- case "heb":
- return "windows-1255";
- case "grc":
- case "gre":
- return "windows-1253";
- case "crh":
- case "ota":
- case "tur":
- return "windows-1254";
- case "rus":
- return "windows-1251";
- case "vie":
- return "windows-1258";
- case "kor":
- return "cp949";
- default:
- return "windows-1252";
- }
+ return charset;
}
- private async Task<string> DetectCharset(string path, string language, MediaProtocol protocol, CancellationToken cancellationToken)
+ private async Task<byte[]> GetBytes(string path, MediaProtocol protocol, CancellationToken cancellationToken)
{
- try
+ if (protocol == MediaProtocol.Http)
{
- using (var file = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false))
+ using (var file = await _httpClient.Get(path, cancellationToken).ConfigureAwait(false))
{
- var detector = new CharsetDetector();
- detector.Feed(file);
- detector.DataEnd();
-
- var charset = detector.Charset;
-
- if (!string.IsNullOrWhiteSpace(charset))
+ using (var memoryStream = new MemoryStream())
{
- _logger.Info("UniversalDetector detected charset {0} for {1}", charset, path);
- }
+ await file.CopyToAsync(memoryStream).ConfigureAwait(false);
+ memoryStream.Position = 0;
- // This is often incorrectly indetected. If this happens, try to use other techniques instead
- if (string.Equals("x-mac-cyrillic", charset, StringComparison.OrdinalIgnoreCase))
- {
- if (!string.IsNullOrWhiteSpace(language))
- {
- return null;
- }
+ return memoryStream.ToArray();
}
-
- return charset;
}
}
- catch (IOException ex)
- {
- _logger.ErrorException("Error attempting to determine subtitle charset from {0}", ex, path);
- }
-
- return null;
- }
-
- private async Task<Stream> GetStream(string path, MediaProtocol protocol, CancellationToken cancellationToken)
- {
- if (protocol == MediaProtocol.Http)
- {
- return await _httpClient.Get(path, cancellationToken).ConfigureAwait(false);
- }
if (protocol == MediaProtocol.File)
{
- return _fileSystem.GetFileStream(path, FileOpenMode.Open, FileAccessMode.Read, FileShareMode.ReadWrite);
+ return _fileSystem.ReadAllBytes(path);
}
throw new ArgumentOutOfRangeException("protocol");