diff options
Diffstat (limited to 'Emby.Server.Implementations/TextEncoding/TextEncodingDetect.cs')
| -rw-r--r-- | Emby.Server.Implementations/TextEncoding/TextEncodingDetect.cs | 409 |
1 files changed, 0 insertions, 409 deletions
namespace Emby.Server.Implementations.TextEncoding
{
    // Copyright 2015-2016 Jonathan Bennett <jon@autoitscript.com>
    //
    // https://www.autoitscript.com
    //
    // Licensed under the Apache License, Version 2.0 (the "License");
    // you may not use this file except in compliance with the License.
    // You may obtain a copy of the License at
    //
    // http://www.apache.org/licenses/LICENSE-2.0
    //
    // Unless required by applicable law or agreed to in writing, software
    // distributed under the License is distributed on an "AS IS" BASIS,
    // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    // See the License for the specific language governing permissions and
    // limitations under the License.

    /// <summary>
    /// Heuristically classifies a byte buffer as ASCII, ANSI, UTF-8 or UTF-16 text, or as binary data.
    /// Credit: https://github.com/AutoIt/text-encoding-detect
    /// </summary>
    public class TextEncodingDetect
    {
        // Byte order marks. These are constant tables, so they are shared by all instances.
        private static readonly byte[] _utf16BeBom = { 0xFE, 0xFF };
        private static readonly byte[] _utf16LeBom = { 0xFF, 0xFE };
        private static readonly byte[] _utf8Bom = { 0xEF, 0xBB, 0xBF };

        private bool _nullSuggestsBinary = true;
        private double _utf16ExpectedNullPercent = 70;
        private double _utf16UnexpectedNullPercent = 10;

        /// <summary>
        /// The encodings that the detector can report.
        /// </summary>
        public enum CharacterEncoding
        {
            /// <summary>Unknown or binary.</summary>
            None,

            /// <summary>Single-byte text using the full 0-255 range.</summary>
            Ansi,

            /// <summary>Single-byte text restricted to the 0-127 range.</summary>
            Ascii,

            /// <summary>UTF-8 with BOM.</summary>
            Utf8Bom,

            /// <summary>UTF-8 without BOM.</summary>
            Utf8Nobom,

            /// <summary>UTF-16 little endian with BOM.</summary>
            Utf16LeBom,

            /// <summary>UTF-16 little endian without BOM.</summary>
            Utf16LeNoBom,

            /// <summary>UTF-16 big endian with BOM.</summary>
            Utf16BeBom,

            /// <summary>UTF-16 big endian without BOM.</summary>
            Utf16BeNoBom
        }

        /// <summary>
        /// Gets or sets a value indicating whether the presence of nulls in a buffer
        /// indicates that the buffer is binary data rather than text. Defaults to <c>true</c>.
        /// </summary>
        public bool NullSuggestsBinary
        {
            get
            {
                return _nullSuggestsBinary;
            }

            set
            {
                _nullSuggestsBinary = value;
            }
        }

        /// <summary>
        /// Gets or sets the percentage of null bytes, in the byte position where UTF-16 ASCII-range text
        /// carries nulls, that must be exceeded for a buffer to be treated as UTF-16. Defaults to 70.
        /// Values outside the open interval (0, 100) are ignored by the setter.
        /// </summary>
        public double Utf16ExpectedNullPercent
        {
            get
            {
                return _utf16ExpectedNullPercent;
            }

            set
            {
                if (value > 0 && value < 100)
                {
                    _utf16ExpectedNullPercent = value;
                }
            }
        }

        /// <summary>
        /// Gets or sets the percentage of null bytes, in the opposite byte position, that a buffer must
        /// stay below to be treated as UTF-16. Defaults to 10.
        /// Values outside the open interval (0, 100) are ignored by the setter.
        /// </summary>
        public double Utf16UnexpectedNullPercent
        {
            get
            {
                return _utf16UnexpectedNullPercent;
            }

            set
            {
                if (value > 0 && value < 100)
                {
                    _utf16UnexpectedNullPercent = value;
                }
            }
        }

        /// <summary>
        /// Gets the BOM length for a given Encoding mode.
        /// </summary>
        /// <param name="encoding">The encoding to look up.</param>
        /// <returns>The BOM length in bytes, or 0 for encodings without a BOM.</returns>
        public static int GetBomLengthFromEncodingMode(CharacterEncoding encoding)
        {
            switch (encoding)
            {
                case CharacterEncoding.Utf16BeBom:
                case CharacterEncoding.Utf16LeBom:
                    return 2;

                case CharacterEncoding.Utf8Bom:
                    return 3;

                default:
                    return 0;
            }
        }

        /// <summary>
        /// Checks for a BOM sequence in a byte buffer.
        /// </summary>
        /// <param name="buffer">The byte buffer.</param>
        /// <param name="size">The number of bytes of <paramref name="buffer"/> to inspect.</param>
        /// <returns>The BOM encoding type, or <see cref="CharacterEncoding.None"/> if no BOM is present.</returns>
        public CharacterEncoding CheckBom(byte[] buffer, int size)
        {
            if (size >= 2)
            {
                if (buffer[0] == _utf16LeBom[0] && buffer[1] == _utf16LeBom[1])
                {
                    return CharacterEncoding.Utf16LeBom;
                }

                if (buffer[0] == _utf16BeBom[0] && buffer[1] == _utf16BeBom[1])
                {
                    return CharacterEncoding.Utf16BeBom;
                }
            }

            if (size >= 3 && buffer[0] == _utf8Bom[0] && buffer[1] == _utf8Bom[1] && buffer[2] == _utf8Bom[2])
            {
                return CharacterEncoding.Utf8Bom;
            }

            return CharacterEncoding.None;
        }

        /// <summary>
        /// Automatically detects the Encoding type of a given byte buffer.
        /// </summary>
        /// <remarks>
        /// The checks run in this order: BOM, UTF-8 validity, UTF-16 newline scan, UTF-16 null
        /// distribution, and finally ANSI versus binary based on the presence of nulls.
        /// </remarks>
        /// <param name="buffer">The byte buffer.</param>
        /// <param name="size">The size of the byte buffer.</param>
        /// <returns>The Encoding type or <see cref="CharacterEncoding.None"/> if unknown.</returns>
        public CharacterEncoding DetectEncoding(byte[] buffer, int size)
        {
            // First check if we have a BOM and return that if so
            CharacterEncoding encoding = CheckBom(buffer, size);
            if (encoding != CharacterEncoding.None)
            {
                return encoding;
            }

            // Now check for valid UTF8
            encoding = CheckUtf8(buffer, size);
            if (encoding != CharacterEncoding.None)
            {
                return encoding;
            }

            // Now try UTF16
            encoding = CheckUtf16NewlineChars(buffer, size);
            if (encoding != CharacterEncoding.None)
            {
                return encoding;
            }

            encoding = CheckUtf16Ascii(buffer, size);
            if (encoding != CharacterEncoding.None)
            {
                return encoding;
            }

            // ANSI or None (binary) then
            if (!DoesContainNulls(buffer, size))
            {
                return CharacterEncoding.Ansi;
            }

            // Found a null, return based on the NullSuggestsBinary preference
            return _nullSuggestsBinary ? CharacterEncoding.None : CharacterEncoding.Ansi;
        }

        /// <summary>
        /// Checks if a buffer contains text that looks like utf16 by scanning for
        /// newline chars that would be present even in non-english text.
        /// </summary>
        /// <param name="buffer">The byte buffer.</param>
        /// <param name="size">The size of the byte buffer.</param>
        /// <returns>
        /// <see cref="CharacterEncoding.None"/>, <see cref="CharacterEncoding.Utf16LeNoBom"/> or
        /// <see cref="CharacterEncoding.Utf16BeNoBom"/>.
        /// </returns>
        private static CharacterEncoding CheckUtf16NewlineChars(byte[] buffer, int size)
        {
            if (size < 2)
            {
                return CharacterEncoding.None;
            }

            var leControlChars = 0;
            var beControlChars = 0;

            // Stop one byte early so that every iteration can read a complete pair;
            // a trailing odd byte is ignored.
            int pairLimit = size - 1;
            for (var pos = 0; pos < pairLimit; pos += 2)
            {
                byte ch1 = buffer[pos];
                byte ch2 = buffer[pos + 1];

                if (ch1 == 0)
                {
                    if (ch2 == 0x0a || ch2 == 0x0d)
                    {
                        ++beControlChars;
                    }
                }
                else if (ch2 == 0)
                {
                    if (ch1 == 0x0a || ch1 == 0x0d)
                    {
                        ++leControlChars;
                    }
                }

                // If we are getting both LE and BE control chars then this file is not utf16
                if (leControlChars > 0 && beControlChars > 0)
                {
                    return CharacterEncoding.None;
                }
            }

            if (leControlChars > 0)
            {
                return CharacterEncoding.Utf16LeNoBom;
            }

            return beControlChars > 0 ? CharacterEncoding.Utf16BeNoBom : CharacterEncoding.None;
        }

        /// <summary>
        /// Checks if a buffer contains any nulls. Used to check for binary vs text data.
        /// </summary>
        /// <param name="buffer">The byte buffer.</param>
        /// <param name="size">The size of the byte buffer.</param>
        /// <returns><c>true</c> if any of the first <paramref name="size"/> bytes is zero.</returns>
        private static bool DoesContainNulls(byte[] buffer, int size)
        {
            for (var pos = 0; pos < size; pos++)
            {
                if (buffer[pos] == 0)
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Checks if a buffer contains text that looks like utf16. This is done based
        /// on the use of nulls which in ASCII/script like text can be useful to identify.
        /// </summary>
        /// <param name="buffer">The byte buffer.</param>
        /// <param name="size">The size of the byte buffer.</param>
        /// <returns>
        /// <see cref="CharacterEncoding.None"/>, <see cref="CharacterEncoding.Utf16LeNoBom"/> or
        /// <see cref="CharacterEncoding.Utf16BeNoBom"/>.
        /// </returns>
        private CharacterEncoding CheckUtf16Ascii(byte[] buffer, int size)
        {
            var numOddNulls = 0;
            var numEvenNulls = 0;

            // Count nulls at even offsets (0, 2, 4, ...)
            for (var pos = 0; pos < size; pos += 2)
            {
                if (buffer[pos] == 0)
                {
                    numEvenNulls++;
                }
            }

            // Count nulls at odd offsets (1, 3, 5, ...)
            for (var pos = 1; pos < size; pos += 2)
            {
                if (buffer[pos] == 0)
                {
                    numOddNulls++;
                }
            }

            // Each parity covers roughly half the buffer, hence the factor of two.
            // For an empty buffer both ratios are NaN, every comparison below is false
            // and the result is None.
            double evenNullThreshold = numEvenNulls * 2.0 / size;
            double oddNullThreshold = numOddNulls * 2.0 / size;
            double expectedNullThreshold = _utf16ExpectedNullPercent / 100.0;
            double unexpectedNullThreshold = _utf16UnexpectedNullPercent / 100.0;

            // Lots of odd nulls, low number of even nulls
            if (evenNullThreshold < unexpectedNullThreshold && oddNullThreshold > expectedNullThreshold)
            {
                return CharacterEncoding.Utf16LeNoBom;
            }

            // Lots of even nulls, low number of odd nulls
            if (oddNullThreshold < unexpectedNullThreshold && evenNullThreshold > expectedNullThreshold)
            {
                return CharacterEncoding.Utf16BeNoBom;
            }

            // Don't know
            return CharacterEncoding.None;
        }

        /// <summary>
        /// Checks if a buffer contains valid utf8.
        /// </summary>
        /// <remarks>
        /// A multi-byte sequence that is cut off by the end of the buffer is accepted, but it still
        /// counts as non-ASCII content, so such a buffer is reported as UTF-8 rather than ASCII.
        /// </remarks>
        /// <param name="buffer">The byte buffer.</param>
        /// <param name="size">The size of the byte buffer.</param>
        /// <returns>
        /// <see cref="CharacterEncoding.None"/> (invalid UTF8, or a null byte while nulls suggest binary),
        /// <see cref="CharacterEncoding.Utf8Nobom"/> (valid utf8 multibyte strings) or
        /// <see cref="CharacterEncoding.Ascii"/> (data in 0-127 range).
        /// </returns>
        private CharacterEncoding CheckUtf8(byte[] buffer, int size)
        {
            // UTF8 Valid sequences
            // 0xxxxxxx  ASCII
            // 110xxxxx 10xxxxxx  2-byte
            // 1110xxxx 10xxxxxx 10xxxxxx  3-byte
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  4-byte
            //
            // Width in UTF8
            // Decimal      Width
            // 0-127        1 byte
            // 194-223      2 bytes
            // 224-239      3 bytes
            // 240-244      4 bytes
            //
            // Subsequent chars are in the range 128-191
            var onlySawAsciiRange = true;
            var pos = 0;

            while (pos < size)
            {
                byte ch = buffer[pos++];

                if (ch == 0 && _nullSuggestsBinary)
                {
                    return CharacterEncoding.None;
                }

                int moreChars;
                if (ch <= 127)
                {
                    // 1 byte
                    moreChars = 0;
                }
                else if (ch >= 194 && ch <= 223)
                {
                    // 2 Byte
                    moreChars = 1;
                }
                else if (ch >= 224 && ch <= 239)
                {
                    // 3 Byte
                    moreChars = 2;
                }
                else if (ch >= 240 && ch <= 244)
                {
                    // 4 Byte
                    moreChars = 3;
                }
                else
                {
                    return CharacterEncoding.None; // Not utf8
                }

                if (moreChars > 0)
                {
                    // The lead byte itself is outside the ASCII range. Record that here rather than
                    // in the loop below, which never runs when the lead byte is the last byte of
                    // the buffer.
                    onlySawAsciiRange = false;
                }

                // Check secondary chars are in range if we are expecting any
                while (moreChars > 0 && pos < size)
                {
                    ch = buffer[pos++];
                    if (ch < 128 || ch > 191)
                    {
                        return CharacterEncoding.None; // Not utf8
                    }

                    --moreChars;
                }
            }

            // If we get to here then only valid UTF-8 sequences have been processed

            // If we only saw chars in the range 0-127 then we can't assume UTF8 (the caller will need to decide)
            return onlySawAsciiRange ? CharacterEncoding.Ascii : CharacterEncoding.Utf8Nobom;
        }
    }
}
