aboutsummaryrefslogtreecommitdiff
path: root/Emby.Server.Implementations/TextEncoding/TextEncodingDetect.cs
diff options
context:
space:
mode:
Diffstat (limited to 'Emby.Server.Implementations/TextEncoding/TextEncodingDetect.cs')
-rw-r--r--Emby.Server.Implementations/TextEncoding/TextEncodingDetect.cs409
1 files changed, 409 insertions, 0 deletions
diff --git a/Emby.Server.Implementations/TextEncoding/TextEncodingDetect.cs b/Emby.Server.Implementations/TextEncoding/TextEncodingDetect.cs
new file mode 100644
index 000000000..a0395a21b
--- /dev/null
+++ b/Emby.Server.Implementations/TextEncoding/TextEncodingDetect.cs
@@ -0,0 +1,409 @@
+namespace Emby.Server.Implementations.TextEncoding
+{
+ // Copyright 2015-2016 Jonathan Bennett <jon@autoitscript.com>
+ //
+ // https://www.autoitscript.com
+ //
+ // Licensed under the Apache License, Version 2.0 (the "License");
+ // you may not use this file except in compliance with the License.
+ // You may obtain a copy of the License at
+ //
+ // http://www.apache.org/licenses/LICENSE-2.0
+ //
+ // Unless required by applicable law or agreed to in writing, software
+ // distributed under the License is distributed on an "AS IS" BASIS,
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ // See the License for the specific language governing permissions and
+ // limitations under the License.
+    /// <summary>
+    /// Heuristically classifies a raw byte buffer as ASCII, ANSI, UTF-8, UTF-16 (LE/BE) or binary
+    /// by sniffing byte-order marks, validating UTF-8 sequences and examining null-byte patterns.
+    /// Credit: https://github.com/AutoIt/text-encoding-detect
+    /// </summary>
+    public class TextEncodingDetect
+    {
+        // The BOM signatures are constant, so they are shared by all instances
+        // instead of being re-allocated for every detector that gets created.
+        private static readonly byte[] _utf16BeBom = { 0xFE, 0xFF };
+
+        private static readonly byte[] _utf16LeBom = { 0xFF, 0xFE };
+
+        private static readonly byte[] _utf8Bom = { 0xEF, 0xBB, 0xBF };
+
+        private bool _nullSuggestsBinary = true;
+        private double _utf16ExpectedNullPercent = 70;
+        private double _utf16UnexpectedNullPercent = 10;
+
+        /// <summary>
+        /// The encodings this detector can report.
+        /// </summary>
+        public enum CharacterEncoding
+        {
+            None, // Unknown or binary
+            Ansi, // 0-255
+            Ascii, // 0-127
+            Utf8Bom, // UTF8 with BOM
+            Utf8Nobom, // UTF8 without BOM
+            Utf16LeBom, // UTF16 LE with BOM
+            Utf16LeNoBom, // UTF16 LE without BOM
+            Utf16BeBom, // UTF16-BE with BOM
+            Utf16BeNoBom // UTF16-BE without BOM
+        }
+
+        /// <summary>
+        /// Gets or sets a value indicating whether the presence of nulls in a buffer indicates
+        /// that the buffer is binary data rather than text. Defaults to <c>true</c>.
+        /// </summary>
+        public bool NullSuggestsBinary
+        {
+            get
+            {
+                return _nullSuggestsBinary;
+            }
+
+            set
+            {
+                _nullSuggestsBinary = value;
+            }
+        }
+
+        /// <summary>
+        /// Gets or sets the percentage of byte positions of one parity (even or odd) that must be
+        /// null before BOM-less data is treated as UTF-16. Defaults to 70.
+        /// Values outside the open interval (0, 100) are ignored.
+        /// </summary>
+        public double Utf16ExpectedNullPercent
+        {
+            get
+            {
+                return _utf16ExpectedNullPercent;
+            }
+
+            set
+            {
+                if (value > 0 && value < 100)
+                {
+                    _utf16ExpectedNullPercent = value;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Gets or sets the percentage of nulls tolerated at the byte positions of the other
+        /// parity before the UTF-16 guess is abandoned. Defaults to 10.
+        /// Values outside the open interval (0, 100) are ignored.
+        /// </summary>
+        public double Utf16UnexpectedNullPercent
+        {
+            get
+            {
+                return _utf16UnexpectedNullPercent;
+            }
+
+            set
+            {
+                if (value > 0 && value < 100)
+                {
+                    _utf16UnexpectedNullPercent = value;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Gets the BOM length for a given Encoding mode.
+        /// </summary>
+        /// <param name="encoding">The encoding to look up.</param>
+        /// <returns>The BOM length in bytes, or 0 for encodings without a BOM.</returns>
+        public static int GetBomLengthFromEncodingMode(CharacterEncoding encoding)
+        {
+            switch (encoding)
+            {
+                case CharacterEncoding.Utf16BeBom:
+                case CharacterEncoding.Utf16LeBom:
+                    return 2;
+
+                case CharacterEncoding.Utf8Bom:
+                    return 3;
+
+                default:
+                    return 0;
+            }
+        }
+
+        /// <summary>
+        /// Checks for a BOM sequence in a byte buffer.
+        /// </summary>
+        /// <param name="buffer">The byte buffer.</param>
+        /// <param name="size">The number of bytes of <paramref name="buffer"/> to examine.</param>
+        /// <returns>The BOM encoding type, or <see cref="CharacterEncoding.None"/> if no BOM was found.</returns>
+        public CharacterEncoding CheckBom(byte[] buffer, int size)
+        {
+            if (size >= 2)
+            {
+                if (buffer[0] == _utf16LeBom[0] && buffer[1] == _utf16LeBom[1])
+                {
+                    return CharacterEncoding.Utf16LeBom;
+                }
+
+                if (buffer[0] == _utf16BeBom[0] && buffer[1] == _utf16BeBom[1])
+                {
+                    return CharacterEncoding.Utf16BeBom;
+                }
+            }
+
+            if (size >= 3 && buffer[0] == _utf8Bom[0] && buffer[1] == _utf8Bom[1] && buffer[2] == _utf8Bom[2])
+            {
+                return CharacterEncoding.Utf8Bom;
+            }
+
+            return CharacterEncoding.None;
+        }
+
+        /// <summary>
+        /// Automatically detects the Encoding type of a given byte buffer.
+        /// </summary>
+        /// <param name="buffer">The byte buffer.</param>
+        /// <param name="size">The size of the byte buffer.</param>
+        /// <returns>The Encoding type, or <see cref="CharacterEncoding.None"/> if unknown or binary.</returns>
+        public CharacterEncoding DetectEncoding(byte[] buffer, int size)
+        {
+            // A BOM is the strongest signal, so it wins outright.
+            CharacterEncoding encoding = CheckBom(buffer, size);
+            if (encoding != CharacterEncoding.None)
+            {
+                return encoding;
+            }
+
+            // Next try strict UTF-8 (this also classifies pure ASCII).
+            encoding = CheckUtf8(buffer, size);
+            if (encoding != CharacterEncoding.None)
+            {
+                return encoding;
+            }
+
+            // Then the two UTF-16 heuristics: newline pairs first, null distribution second.
+            encoding = CheckUtf16NewlineChars(buffer, size);
+            if (encoding != CharacterEncoding.None)
+            {
+                return encoding;
+            }
+
+            encoding = CheckUtf16Ascii(buffer, size);
+            if (encoding != CharacterEncoding.None)
+            {
+                return encoding;
+            }
+
+            // ANSI or None (binary) then
+            if (!DoesContainNulls(buffer, size))
+            {
+                return CharacterEncoding.Ansi;
+            }
+
+            // Found a null: the NullSuggestsBinary preference decides between binary and ANSI.
+            return _nullSuggestsBinary ? CharacterEncoding.None : CharacterEncoding.Ansi;
+        }
+
+        /// <summary>
+        /// Checks if a buffer contains text that looks like utf16 by scanning for
+        /// newline chars that would be present even in non-english text.
+        /// </summary>
+        /// <param name="buffer">The byte buffer.</param>
+        /// <param name="size">The size of the byte buffer.</param>
+        /// <returns>Encoding.None, Encoding.Utf16LeNoBom or Encoding.Utf16BeNoBom.</returns>
+        private static CharacterEncoding CheckUtf16NewlineChars(byte[] buffer, int size)
+        {
+            if (size < 2)
+            {
+                return CharacterEncoding.None;
+            }
+
+            var leControlChars = 0;
+            var beControlChars = 0;
+
+            // Walk the buffer in 2-byte units; a trailing unpaired byte is ignored.
+            for (var pos = 0; pos + 1 < size; pos += 2)
+            {
+                byte ch1 = buffer[pos];
+                byte ch2 = buffer[pos + 1];
+
+                if (ch1 == 0)
+                {
+                    // 00 0A / 00 0D: CR or LF in big-endian order
+                    if (ch2 == 0x0a || ch2 == 0x0d)
+                    {
+                        ++beControlChars;
+                    }
+                }
+                else if (ch2 == 0)
+                {
+                    // 0A 00 / 0D 00: CR or LF in little-endian order
+                    if (ch1 == 0x0a || ch1 == 0x0d)
+                    {
+                        ++leControlChars;
+                    }
+                }
+
+                // If we are getting both LE and BE control chars then this file is not utf16
+                if (leControlChars > 0 && beControlChars > 0)
+                {
+                    return CharacterEncoding.None;
+                }
+            }
+
+            if (leControlChars > 0)
+            {
+                return CharacterEncoding.Utf16LeNoBom;
+            }
+
+            return beControlChars > 0 ? CharacterEncoding.Utf16BeNoBom : CharacterEncoding.None;
+        }
+
+        /// <summary>
+        /// Checks if a buffer contains any nulls. Used to check for binary vs text data.
+        /// </summary>
+        /// <param name="buffer">The byte buffer.</param>
+        /// <param name="size">The size of the byte buffer.</param>
+        /// <returns><c>true</c> if any of the first <paramref name="size"/> bytes is zero.</returns>
+        private static bool DoesContainNulls(byte[] buffer, int size)
+        {
+            for (var pos = 0; pos < size; pos++)
+            {
+                if (buffer[pos] == 0)
+                {
+                    return true;
+                }
+            }
+
+            return false;
+        }
+
+        /// <summary>
+        /// Checks if a buffer contains text that looks like utf16. This is done based
+        /// on the use of nulls which in ASCII/script like text can be useful to identify.
+        /// </summary>
+        /// <param name="buffer">The byte buffer.</param>
+        /// <param name="size">The size of the byte buffer.</param>
+        /// <returns>Encoding.None, Encoding.Utf16LeNoBom or Encoding.Utf16BeNoBom.</returns>
+        private CharacterEncoding CheckUtf16Ascii(byte[] buffer, int size)
+        {
+            // Nothing to measure; this also avoids dividing by zero below.
+            if (size <= 0)
+            {
+                return CharacterEncoding.None;
+            }
+
+            var numOddNulls = 0;
+            var numEvenNulls = 0;
+
+            // Count the nulls at even and at odd offsets in a single pass.
+            for (var pos = 0; pos < size; pos++)
+            {
+                if (buffer[pos] != 0)
+                {
+                    continue;
+                }
+
+                if ((pos & 1) == 0)
+                {
+                    numEvenNulls++;
+                }
+                else
+                {
+                    numOddNulls++;
+                }
+            }
+
+            // Each parity covers about half of the buffer, hence the factor of two.
+            double evenNullThreshold = numEvenNulls * 2.0 / size;
+            double oddNullThreshold = numOddNulls * 2.0 / size;
+            double expectedNullThreshold = _utf16ExpectedNullPercent / 100.0;
+            double unexpectedNullThreshold = _utf16UnexpectedNullPercent / 100.0;
+
+            // Lots of odd nulls, low number of even nulls
+            if (evenNullThreshold < unexpectedNullThreshold && oddNullThreshold > expectedNullThreshold)
+            {
+                return CharacterEncoding.Utf16LeNoBom;
+            }
+
+            // Lots of even nulls, low number of odd nulls
+            if (oddNullThreshold < unexpectedNullThreshold && evenNullThreshold > expectedNullThreshold)
+            {
+                return CharacterEncoding.Utf16BeNoBom;
+            }
+
+            // Don't know
+            return CharacterEncoding.None;
+        }
+
+        /// <summary>
+        /// Checks if a buffer contains valid utf8.
+        /// </summary>
+        /// <param name="buffer">The byte buffer.</param>
+        /// <param name="size">The size of the byte buffer.</param>
+        /// <returns>
+        /// Encoding.None (invalid UTF8, or a null was found while nulls suggest binary),
+        /// Encoding.Utf8Nobom (valid utf8 multibyte strings) or Encoding.Ascii (data in 0-127 range).
+        /// </returns>
+        private CharacterEncoding CheckUtf8(byte[] buffer, int size)
+        {
+            // UTF8 Valid sequences
+            // 0xxxxxxx ASCII
+            // 110xxxxx 10xxxxxx 2-byte
+            // 1110xxxx 10xxxxxx 10xxxxxx 3-byte
+            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 4-byte
+            //
+            // Width in UTF8
+            // Decimal Width
+            // 0-127 1 byte
+            // 194-223 2 bytes
+            // 224-239 3 bytes
+            // 240-244 4 bytes
+            //
+            // Subsequent chars are in the range 128-191
+            var onlySawAsciiRange = true;
+            var pos = 0;
+
+            while (pos < size)
+            {
+                byte ch = buffer[pos++];
+
+                if (ch == 0 && _nullSuggestsBinary)
+                {
+                    return CharacterEncoding.None;
+                }
+
+                int moreChars;
+                if (ch <= 127)
+                {
+                    // 1 byte
+                    moreChars = 0;
+                }
+                else if (ch >= 194 && ch <= 223)
+                {
+                    // 2 Byte
+                    moreChars = 1;
+                }
+                else if (ch >= 224 && ch <= 239)
+                {
+                    // 3 Byte
+                    moreChars = 2;
+                }
+                else if (ch >= 240 && ch <= 244)
+                {
+                    // 4 Byte
+                    moreChars = 3;
+                }
+                else
+                {
+                    return CharacterEncoding.None; // Not utf8
+                }
+
+                if (moreChars > 0)
+                {
+                    // The lead byte itself is outside 0-127. Record that here rather than in the
+                    // continuation loop, so a sequence cut off by the end of the buffer is not
+                    // misreported as plain ASCII.
+                    onlySawAsciiRange = false;
+                }
+
+                // Check secondary chars are in range if we are expecting any.
+                // A sequence truncated by the end of the buffer is tolerated.
+                while (moreChars > 0 && pos < size)
+                {
+                    ch = buffer[pos++];
+                    if (ch < 128 || ch > 191)
+                    {
+                        return CharacterEncoding.None; // Not utf8
+                    }
+
+                    --moreChars;
+                }
+            }
+
+            // If we get to here then only valid UTF-8 sequences have been processed
+
+            // If we only saw chars in the range 0-127 then we can't assume UTF8 (the caller will need to decide)
+            return onlySawAsciiRange ? CharacterEncoding.Ascii : CharacterEncoding.Utf8Nobom;
+        }
+    }
+}