merge common implementations and server implementations

author: Luke Pulverenti <luke.pulverenti@gmail.com> 2017-08-16 02:43:41 -0400
committer: Luke Pulverenti <luke.pulverenti@gmail.com> 2017-08-16 02:43:41 -0400
commit: bfcd1b520fd79b893e721ba916ae5e1656407d2f (patch)
tree: 6a05119800484435fb384da25c6390054a27c3c3 /Emby.Server.Implementations/TextEncoding/TextEncodingDetect.cs
parent: e3531534b85aeaaa3e4aaf462d5e77ea142dc762 (diff)
1 files changed, 409 insertions, 0 deletions
diff --git a/Emby.Server.Implementations/TextEncoding/TextEncodingDetect.cs b/Emby.Server.Implementations/TextEncoding/TextEncodingDetect.cs
new file mode 100644
index 000000000..a0395a21b
--- /dev/null
+++ b/Emby.Server.Implementations/TextEncoding/TextEncodingDetect.cs
@@ -0,0 +1,409 @@
+namespace Emby.Server.Implementations.TextEncoding
+{
+    // Copyright 2015-2016 Jonathan Bennett <jon@autoitscript.com>
+    // 
+    // https://www.autoitscript.com 
+    //
+    // Licensed under the Apache License, Version 2.0 (the "License");
+    // you may not use this file except in compliance with the License.
+    // You may obtain a copy of the License at
+    //
+    //    http://www.apache.org/licenses/LICENSE-2.0
+    // 
+    // Unless required by applicable law or agreed to in writing, software
+    // distributed under the License is distributed on an "AS IS" BASIS,
+    // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    // See the License for the specific language governing permissions and
+    // limitations under the License.
+    /// <summary>
+    ///     Heuristic detector for the text encoding of a raw byte buffer. It looks for a BOM first,
+    ///     then for valid UTF-8, then for BOM-less UTF-16, and finally falls back to ANSI or binary.
+    ///     Credit: https://github.com/AutoIt/text-encoding-detect
+    /// </summary>
+    public class TextEncodingDetect
+    {
+        // Byte order marks. Static because the values are the same for every instance.
+        private static readonly byte[] Utf16BeBomBytes = { 0xFE, 0xFF };
+        private static readonly byte[] Utf16LeBomBytes = { 0xFF, 0xFE };
+        private static readonly byte[] Utf8BomBytes = { 0xEF, 0xBB, 0xBF };
+
+        // Detection settings, exposed through the properties below.
+        private bool _nullSuggestsBinary = true;
+        private double _utf16ExpectedNullPercent = 70;
+        private double _utf16UnexpectedNullPercent = 10;
+
+        /// <summary>The encodings that the detector can report.</summary>
+        public enum CharacterEncoding
+        {
+            None, // Unknown or binary
+            Ansi, // 0-255
+            Ascii, // 0-127
+            Utf8Bom, // UTF8 with BOM
+            Utf8Nobom, // UTF8 without BOM
+            Utf16LeBom, // UTF16 LE with BOM
+            Utf16LeNoBom, // UTF16 LE without BOM
+            Utf16BeBom, // UTF16-BE with BOM
+            Utf16BeNoBom // UTF16-BE without BOM
+        }
+
+        /// <summary>
+        ///     Gets or sets a value indicating whether the presence of nulls in a buffer indicates
+        ///     that the buffer is binary data rather than text. Defaults to true.
+        /// </summary>
+        public bool NullSuggestsBinary
+        {
+            get { return _nullSuggestsBinary; }
+            set { _nullSuggestsBinary = value; }
+        }
+
+        /// <summary>
+        ///     Gets or sets the percentage of nulls, relative to the number of byte pairs, that one
+        ///     byte position must exceed for a buffer to look like UTF-16. Defaults to 70.
+        ///     Assigned values outside the open range (0, 100) are ignored.
+        /// </summary>
+        public double Utf16ExpectedNullPercent
+        {
+            get { return _utf16ExpectedNullPercent; }
+
+            set
+            {
+                if (value > 0 && value < 100)
+                {
+                    _utf16ExpectedNullPercent = value;
+                }
+            }
+        }
+
+        /// <summary>
+        ///     Gets or sets the percentage of nulls, relative to the number of byte pairs, that the
+        ///     other byte position must stay below for a buffer to look like UTF-16. Defaults to 10.
+        ///     Assigned values outside the open range (0, 100) are ignored.
+        /// </summary>
+        public double Utf16UnexpectedNullPercent
+        {
+            get { return _utf16UnexpectedNullPercent; }
+
+            set
+            {
+                if (value > 0 && value < 100)
+                {
+                    _utf16UnexpectedNullPercent = value;
+                }
+            }
+        }
+
+        /// <summary>
+        ///     Gets the BOM length for a given Encoding mode.
+        /// </summary>
+        /// <param name="encoding">The encoding to look up.</param>
+        /// <returns>The BOM length in bytes, or 0 for encodings without a BOM.</returns>
+        public static int GetBomLengthFromEncodingMode(CharacterEncoding encoding)
+        {
+            switch (encoding)
+            {
+                case CharacterEncoding.Utf16BeBom:
+                case CharacterEncoding.Utf16LeBom:
+                    return 2;
+
+                case CharacterEncoding.Utf8Bom:
+                    return 3;
+
+                default:
+                    return 0;
+            }
+        }
+
+        /// <summary>
+        ///     Checks for a BOM sequence in a byte buffer.
+        /// </summary>
+        /// <param name="buffer">The byte buffer.</param>
+        /// <param name="size">The number of bytes of <paramref name="buffer" /> to examine.</param>
+        /// <returns>The BOM encoding type, or CharacterEncoding.None if there is no BOM.</returns>
+        public CharacterEncoding CheckBom(byte[] buffer, int size)
+        {
+            if (size >= 2)
+            {
+                if (buffer[0] == Utf16LeBomBytes[0] && buffer[1] == Utf16LeBomBytes[1])
+                {
+                    return CharacterEncoding.Utf16LeBom;
+                }
+
+                if (buffer[0] == Utf16BeBomBytes[0] && buffer[1] == Utf16BeBomBytes[1])
+                {
+                    return CharacterEncoding.Utf16BeBom;
+                }
+            }
+
+            if (size >= 3 && buffer[0] == Utf8BomBytes[0] && buffer[1] == Utf8BomBytes[1] && buffer[2] == Utf8BomBytes[2])
+            {
+                return CharacterEncoding.Utf8Bom;
+            }
+
+            return CharacterEncoding.None;
+        }
+
+        /// <summary>
+        ///     Automatically detects the Encoding type of a given byte buffer.
+        /// </summary>
+        /// <param name="buffer">The byte buffer.</param>
+        /// <param name="size">The number of bytes of <paramref name="buffer" /> to examine.</param>
+        /// <returns>The Encoding type, or CharacterEncoding.None if unknown (possibly binary).</returns>
+        public CharacterEncoding DetectEncoding(byte[] buffer, int size)
+        {
+            // First check if we have a BOM and return that if so
+            CharacterEncoding encoding = CheckBom(buffer, size);
+            if (encoding != CharacterEncoding.None)
+            {
+                return encoding;
+            }
+
+            // Now check for valid UTF8 (this is also what reports pure ASCII)
+            encoding = CheckUtf8(buffer, size);
+            if (encoding != CharacterEncoding.None)
+            {
+                return encoding;
+            }
+
+            // Now try UTF16: newline code units first, then the distribution of nulls
+            encoding = CheckUtf16NewlineChars(buffer, size);
+            if (encoding != CharacterEncoding.None)
+            {
+                return encoding;
+            }
+
+            encoding = CheckUtf16Ascii(buffer, size);
+            if (encoding != CharacterEncoding.None)
+            {
+                return encoding;
+            }
+
+            // ANSI or None (binary) then
+            if (!DoesContainNulls(buffer, size))
+            {
+                return CharacterEncoding.Ansi;
+            }
+
+            // Found a null, return based on the NullSuggestsBinary preference
+            return _nullSuggestsBinary ? CharacterEncoding.None : CharacterEncoding.Ansi;
+        }
+
+        /// <summary>
+        ///     Checks if a buffer contains text that looks like utf16 by scanning for
+        ///     newline chars that would be present even in non-english text.
+        /// </summary>
+        /// <param name="buffer">The byte buffer.</param>
+        /// <param name="size">The size of the byte buffer.</param>
+        /// <returns>CharacterEncoding.None, Utf16LeNoBom or Utf16BeNoBom.</returns>
+        private static CharacterEncoding CheckUtf16NewlineChars(byte[] buffer, int size)
+        {
+            if (size < 2)
+            {
+                return CharacterEncoding.None;
+            }
+
+            var leControlChars = 0;
+            var beControlChars = 0;
+
+            // Walk whole byte pairs only; a trailing odd byte is ignored
+            for (int pos = 0; pos + 1 < size; pos += 2)
+            {
+                byte ch1 = buffer[pos];
+                byte ch2 = buffer[pos + 1];
+
+                if (ch1 == 0)
+                {
+                    if (ch2 == 0x0a || ch2 == 0x0d)
+                    {
+                        ++beControlChars;
+                    }
+                }
+                else if (ch2 == 0)
+                {
+                    if (ch1 == 0x0a || ch1 == 0x0d)
+                    {
+                        ++leControlChars;
+                    }
+                }
+
+                // If we are getting both LE and BE control chars then this file is not utf16
+                if (leControlChars > 0 && beControlChars > 0)
+                {
+                    return CharacterEncoding.None;
+                }
+            }
+
+            if (leControlChars > 0)
+            {
+                return CharacterEncoding.Utf16LeNoBom;
+            }
+
+            return beControlChars > 0 ? CharacterEncoding.Utf16BeNoBom : CharacterEncoding.None;
+        }
+
+        /// <summary>
+        /// Checks if a buffer contains any nulls. Used to check for binary vs text data.
+        /// </summary>
+        /// <param name="buffer">The byte buffer.</param>
+        /// <param name="size">The size of the byte buffer.</param>
+        /// <returns>True if any of the first <paramref name="size" /> bytes is zero.</returns>
+        private static bool DoesContainNulls(byte[] buffer, int size)
+        {
+            for (int pos = 0; pos < size; pos++)
+            {
+                if (buffer[pos] == 0)
+                {
+                    return true;
+                }
+            }
+
+            return false;
+        }
+
+        /// <summary>
+        ///     Checks if a buffer contains text that looks like utf16. This is done based
+        ///     on the use of nulls which in ASCII/script like text can be useful to identify.
+        /// </summary>
+        /// <param name="buffer">The byte buffer.</param>
+        /// <param name="size">The size of the byte buffer.</param>
+        /// <returns>CharacterEncoding.None, Utf16LeNoBom or Utf16BeNoBom.</returns>
+        private CharacterEncoding CheckUtf16Ascii(byte[] buffer, int size)
+        {
+            // A UTF-16 code unit is two bytes wide, so a shorter buffer cannot be UTF-16. This
+            // also avoids a 0/0 ratio and reporting a lone null byte as UTF-16 BE.
+            if (size < 2)
+            {
+                return CharacterEncoding.None;
+            }
+
+            var numOddNulls = 0;
+            var numEvenNulls = 0;
+
+            // Count the nulls at even and at odd offsets in a single pass
+            for (int pos = 0; pos < size; pos++)
+            {
+                if (buffer[pos] != 0)
+                {
+                    continue;
+                }
+
+                if ((pos & 1) == 0)
+                {
+                    numEvenNulls++;
+                }
+                else
+                {
+                    numOddNulls++;
+                }
+            }
+
+            double evenNullRatio = numEvenNulls * 2.0 / size;
+            double oddNullRatio = numOddNulls * 2.0 / size;
+            double expectedNullRatio = _utf16ExpectedNullPercent / 100.0;
+            double unexpectedNullRatio = _utf16UnexpectedNullPercent / 100.0;
+
+            // Lots of odd nulls, low number of even nulls
+            if (evenNullRatio < unexpectedNullRatio && oddNullRatio > expectedNullRatio)
+            {
+                return CharacterEncoding.Utf16LeNoBom;
+            }
+
+            // Lots of even nulls, low number of odd nulls
+            if (oddNullRatio < unexpectedNullRatio && evenNullRatio > expectedNullRatio)
+            {
+                return CharacterEncoding.Utf16BeNoBom;
+            }
+
+            // Don't know
+            return CharacterEncoding.None;
+        }
+
+        /// <summary>
+        ///     Checks if a buffer contains valid utf8.
+        /// </summary>
+        /// <param name="buffer">The byte buffer.</param>
+        /// <param name="size">The size of the byte buffer.</param>
+        /// <returns>
+        ///     CharacterEncoding.None (invalid UTF8, or a null byte while nulls suggest binary),
+        ///     Utf8Nobom (a multibyte lead byte was seen) or Ascii (all data in the 0-127 range).
+        /// </returns>
+        private CharacterEncoding CheckUtf8(byte[] buffer, int size)
+        {
+            // UTF8 Valid sequences
+            // 0xxxxxxx  ASCII
+            // 110xxxxx 10xxxxxx  2-byte
+            // 1110xxxx 10xxxxxx 10xxxxxx  3-byte
+            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  4-byte
+            //
+            // Width in UTF8
+            // Decimal      Width
+            // 0-127        1 byte
+            // 194-223      2 bytes
+            // 224-239      3 bytes
+            // 240-244      4 bytes
+            //
+            // Subsequent chars are in the range 128-191
+            var onlySawAsciiRange = true;
+            var pos = 0;
+
+            while (pos < size)
+            {
+                byte ch = buffer[pos++];
+
+                if (ch == 0 && _nullSuggestsBinary)
+                {
+                    return CharacterEncoding.None;
+                }
+
+                int moreChars;
+                if (ch <= 127)
+                {
+                    // 1 byte
+                    moreChars = 0;
+                }
+                else if (ch >= 194 && ch <= 223)
+                {
+                    // 2 Byte
+                    moreChars = 1;
+                }
+                else if (ch >= 224 && ch <= 239)
+                {
+                    // 3 Byte
+                    moreChars = 2;
+                }
+                else if (ch >= 240 && ch <= 244)
+                {
+                    // 4 Byte
+                    moreChars = 3;
+                }
+                else
+                {
+                    return CharacterEncoding.None; // Not utf8
+                }
+
+                if (moreChars > 0)
+                {
+                    // A lead byte is itself outside the ASCII range, even when the buffer ends
+                    // before its continuation bytes (e.g. a sample cut off in mid-sequence)
+                    onlySawAsciiRange = false;
+                }
+
+                // Check secondary chars are in range if we are expecting any
+                while (moreChars > 0 && pos < size)
+                {
+                    ch = buffer[pos++];
+                    if (ch < 128 || ch > 191)
+                    {
+                        return CharacterEncoding.None; // Not utf8
+                    }
+
+                    --moreChars;
+                }
+            }
+
+            // If we get to here then only valid UTF-8 sequences have been processed
+
+            // If we only saw chars in the range 0-127 then we can't assume UTF8 (the caller will need to decide)
+            return onlySawAsciiRange ? CharacterEncoding.Ascii : CharacterEncoding.Utf8Nobom;
+        }
+    }
+}
author	Luke Pulverenti <luke.pulverenti@gmail.com>	2017-08-16 02:43:41 -0400
committer	Luke Pulverenti <luke.pulverenti@gmail.com>	2017-08-16 02:43:41 -0400
commit	bfcd1b520fd79b893e721ba916ae5e1656407d2f (patch)
tree	6a05119800484435fb384da25c6390054a27c3c3 /Emby.Server.Implementations/TextEncoding/TextEncodingDetect.cs
parent	e3531534b85aeaaa3e4aaf462d5e77ea142dc762 (diff)