diff options
| author | Luke Pulverenti <luke.pulverenti@gmail.com> | 2017-04-18 01:53:39 -0400 |
|---|---|---|
| committer | Luke Pulverenti <luke.pulverenti@gmail.com> | 2017-04-18 01:53:39 -0400 |
| commit | 4d7d8961b44aadd1d1c8f84bedd5d5ff9508d876 (patch) | |
| tree | 2d6390aa6429d6ef37191235f2140611262186da /MediaBrowser.MediaEncoding/Subtitles/SubtitleEncoder.cs | |
| parent | 6a66aef608a0caa567f603edbd9a4e3466fda469 (diff) | |
update encoding detection
Diffstat (limited to 'MediaBrowser.MediaEncoding/Subtitles/SubtitleEncoder.cs')
| -rw-r--r-- | MediaBrowser.MediaEncoding/Subtitles/SubtitleEncoder.cs | 151 |
1 files changed, 19 insertions, 132 deletions
diff --git a/MediaBrowser.MediaEncoding/Subtitles/SubtitleEncoder.cs b/MediaBrowser.MediaEncoding/Subtitles/SubtitleEncoder.cs index 77b976206..231a2ae85 100644 --- a/MediaBrowser.MediaEncoding/Subtitles/SubtitleEncoder.cs +++ b/MediaBrowser.MediaEncoding/Subtitles/SubtitleEncoder.cs @@ -21,7 +21,6 @@ using MediaBrowser.Model.IO; using MediaBrowser.Model.Diagnostics; using MediaBrowser.Model.Dto; using MediaBrowser.Model.Text; -using UniversalDetector; namespace MediaBrowser.MediaEncoding.Subtitles { @@ -197,17 +196,20 @@ namespace MediaBrowser.MediaEncoding.Subtitles { if (requiresCharset) { - var charset = await GetSubtitleFileCharacterSet(path, language, protocol, cancellationToken).ConfigureAwait(false); + var bytes = await GetBytes(path, protocol, cancellationToken).ConfigureAwait(false); + + var charset = _textEncoding.GetDetectedEncodingName(bytes, language); + _logger.Debug("charset {0} detected for {1}", charset ?? "null", path); if (!string.IsNullOrEmpty(charset)) { - using (var fs = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false)) + using (var inputStream = _memoryStreamProvider.CreateNew(bytes)) { - using (var reader = new StreamReader(fs, GetEncoding(charset))) + using (var reader = new StreamReader(inputStream, _textEncoding.GetEncodingFromCharset(charset))) { var text = await reader.ReadToEndAsync().ConfigureAwait(false); - var bytes = Encoding.UTF8.GetBytes(text); + bytes = Encoding.UTF8.GetBytes(text); return _memoryStreamProvider.CreateNew(bytes); } @@ -218,28 +220,6 @@ namespace MediaBrowser.MediaEncoding.Subtitles return _fileSystem.OpenRead(path); } - private Encoding GetEncoding(string charset) - { - if (string.IsNullOrWhiteSpace(charset)) - { - throw new ArgumentNullException("charset"); - } - - _logger.Debug("Getting encoding object for character set: {0}", charset); - - try - { - return Encoding.GetEncoding(charset); - } - catch (ArgumentException) - { - charset = charset.Replace("-", string.Empty); - _logger.Debug("Getting encoding object for character set: {0}", charset); - - return Encoding.GetEncoding(charset); - } - } - private async Task<Tuple<string, MediaProtocol, string, bool>> GetReadableFile(string mediaPath, string[] inputFiles, MediaProtocol protocol, @@ -724,126 +704,33 @@ namespace MediaBrowser.MediaEncoding.Subtitles public async Task<string> GetSubtitleFileCharacterSet(string path, string language, MediaProtocol protocol, CancellationToken cancellationToken) { - if (protocol == MediaProtocol.File) - { - var fileEncoding = _textEncoding.GetFileEncoding(path); - - if (fileEncoding != null && fileEncoding.Equals(Encoding.UTF8)) - { - return string.Empty; - } - } - - var charset = await DetectCharset(path, language, protocol, cancellationToken).ConfigureAwait(false); - - if (!string.IsNullOrWhiteSpace(charset)) - { - if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase)) - { - return null; - } - - return charset; - } + var bytes = await GetBytes(path, protocol, cancellationToken).ConfigureAwait(false); - if (!string.IsNullOrWhiteSpace(language)) - { - return GetSubtitleFileCharacterSetFromLanguage(language); - } + var charset = _textEncoding.GetDetectedEncodingName(bytes, language); - return null; - } + _logger.Debug("charset {0} detected for {1}", charset ?? "null", path); - public string GetSubtitleFileCharacterSetFromLanguage(string language) - { - // https://developer.xamarin.com/api/type/System.Text.Encoding/ - - switch (language.ToLower()) - { - case "hun": - return "windows-1252"; - case "pol": - case "cze": - case "ces": - case "slo": - case "slk": - case "slv": - case "srp": - case "hrv": - case "rum": - case "ron": - case "rup": - case "alb": - case "sqi": - return "windows-1250"; - case "ara": - return "windows-1256"; - case "heb": - return "windows-1255"; - case "grc": - case "gre": - return "windows-1253"; - case "crh": - case "ota": - case "tur": - return "windows-1254"; - case "rus": - return "windows-1251"; - case "vie": - return "windows-1258"; - case "kor": - return "cp949"; - default: - return "windows-1252"; - } + return charset; } - private async Task<string> DetectCharset(string path, string language, MediaProtocol protocol, CancellationToken cancellationToken) + private async Task<byte[]> GetBytes(string path, MediaProtocol protocol, CancellationToken cancellationToken) { - try + if (protocol == MediaProtocol.Http) { - using (var file = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false)) + using (var file = await _httpClient.Get(path, cancellationToken).ConfigureAwait(false)) { - var detector = new CharsetDetector(); - detector.Feed(file); - detector.DataEnd(); - - var charset = detector.Charset; - - if (!string.IsNullOrWhiteSpace(charset)) + using (var memoryStream = new MemoryStream()) { - _logger.Info("UniversalDetector detected charset {0} for {1}", charset, path); - } + await file.CopyToAsync(memoryStream).ConfigureAwait(false); + memoryStream.Position = 0; - // This is often incorrectly indetected. If this happens, try to use other techniques instead - if (string.Equals("x-mac-cyrillic", charset, StringComparison.OrdinalIgnoreCase)) - { - if (!string.IsNullOrWhiteSpace(language)) - { - return null; - } + return memoryStream.ToArray(); } - - return charset; } } - catch (IOException ex) - { - _logger.ErrorException("Error attempting to determine subtitle charset from {0}", ex, path); - } - - return null; - } - - private async Task<Stream> GetStream(string path, MediaProtocol protocol, CancellationToken cancellationToken) - { - if (protocol == MediaProtocol.Http) - { - return await _httpClient.Get(path, cancellationToken).ConfigureAwait(false); - } if (protocol == MediaProtocol.File) { - return _fileSystem.GetFileStream(path, FileOpenMode.Open, FileAccessMode.Read, FileShareMode.ReadWrite); + return _fileSystem.ReadAllBytes(path); } throw new ArgumentOutOfRangeException("protocol"); |
