1 files changed, 257 insertions, 0 deletions
diff --git a/Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/UniversalDetector.cs b/Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/UniversalDetector.cs
new file mode 100644
index 000000000..0c9a4ee60
--- /dev/null
+++ b/Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/UniversalDetector.cs
@@ -0,0 +1,257 @@
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is Mozilla Universal charset detector code.
+ *
+ * The Initial Developer of the Original Code is
+ * Netscape Communications Corporation.
+ * Portions created by the Initial Developer are Copyright (C) 2001
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *          Shy Shalom <shooshX@gmail.com>
+ *          Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
+ * 
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+namespace UniversalDetector.Core
+{
+
+    /// <summary>
+    /// Coarse classification of the bytes the detector has seen so far.
+    /// </summary>
+    enum InputState
+    {
+        /// <summary>Initial state: nothing but plain ASCII has been observed.</summary>
+        PureASCII = 0,
+
+        /// <summary>An escape character or the HZ "~{" sequence was observed in otherwise ASCII input.</summary>
+        EscASCII = 1,
+
+        /// <summary>A byte with the high bit set (other than 0xA0) was observed.</summary>
+        Highbyte = 2
+    }
+
+    /// <summary>
+    /// Incremental charset detector. Callers push raw bytes through
+    /// <see cref="Feed"/>, signal the end of input with <see cref="DataEnd"/>,
+    /// and receive the outcome through the abstract <see cref="Report"/> callback.
+    /// </summary>
+    public abstract class UniversalDetector
+    {
+        // Language filter flags. Each value is a distinct bit; FILTER_ALL is the
+        // union of the five single-bit flags (1 | 2 | 4 | 8 | 16 == 31).
+        protected const int FILTER_CHINESE_SIMPLIFIED = 1;
+        protected const int FILTER_CHINESE_TRADITIONAL = 2;
+        protected const int FILTER_JAPANESE = 4;
+        protected const int FILTER_KOREAN = 8;
+        protected const int FILTER_NON_CJK = 16;
+        protected const int FILTER_ALL = 31;
+        protected static int FILTER_CHINESE =
+            FILTER_CHINESE_SIMPLIFIED | FILTER_CHINESE_TRADITIONAL;
+        protected static int FILTER_CJK =
+                FILTER_JAPANESE | FILTER_KOREAN | FILTER_CHINESE_SIMPLIFIED
+                | FILTER_CHINESE_TRADITIONAL;
+
+        // Not referenced inside this class; kept for derived classes.
+        protected const float SHORTCUT_THRESHOLD = 0.95f;
+
+        // DataEnd only reports a prober's guess whose confidence exceeds this value.
+        protected const float MINIMUM_THRESHOLD = 0.20f;
+
+        // Current classification of the input seen so far.
+        internal InputState inputState;
+
+        // True until the first Feed call after construction or Reset.
+        protected bool start;
+
+        // True once Feed has been called with len > 0.
+        protected bool gotData;
+
+        // True once a charset has been positively identified; Feed then becomes a no-op.
+        protected bool done;
+
+        // Last byte seen that was either ASCII or 0xA0; used to spot the HZ "~{" pair.
+        protected byte lastChar;
+
+        // Initialised to -1 and not otherwise used inside this class.
+        protected int bestGuess;
+
+        protected const int PROBERS_NUM = 3;
+
+        // Stored by the constructor; not otherwise read inside this class.
+        protected int languageFilter;
+
+        // Probers created lazily on the first high byte: multi-byte group,
+        // single-byte group and Latin-1, in that order.
+        protected CharsetProber[] charsetProbers = new CharsetProber[PROBERS_NUM];
+
+        // Prober used while the input is in the EscASCII state; created lazily.
+        protected CharsetProber escCharsetProber;
+
+        // Name of the charset once identified, otherwise null.
+        protected string detectedCharset;
+
+        /// <summary>
+        /// Initialises the detector in its pristine (pure ASCII, no data) state.
+        /// </summary>
+        /// <param name="languageFilter">Value stored in <see cref="languageFilter"/>.</param>
+        public UniversalDetector(int languageFilter) {
+            this.start = true;
+            this.inputState = InputState.PureASCII;
+            this.lastChar = 0x00;
+            this.bestGuess = -1;
+            this.languageFilter = languageFilter;
+        }
+
+        /// <summary>
+        /// Feeds a chunk of bytes to the detector. May be called repeatedly; once a
+        /// charset has been identified further calls return immediately.
+        /// </summary>
+        /// <param name="buf">Buffer holding the data.</param>
+        /// <param name="offset">Index in <paramref name="buf"/> of the first byte to examine.</param>
+        /// <param name="len">Number of bytes to examine starting at <paramref name="offset"/>.</param>
+        public virtual void Feed(byte[] buf, int offset, int len)
+        {
+            if (done) {
+                return;
+            }
+
+            if (len > 0) {
+                gotData = true;
+            }
+
+            // If the data starts with BOM, we know it is UTF.
+            // Only the very first chunk is inspected for a BOM.
+            if (start) {
+                start = false;
+                string bomCharset = DetectBom(buf, offset, len);
+                if (bomCharset != null) {
+                    detectedCharset = bomCharset;
+                }
+                if (detectedCharset != null) {
+                    done = true;
+                    return;
+                }
+            }
+
+            UpdateInputState(buf, offset, len);
+
+            ProbingState st = ProbingState.NotMe;
+
+            switch (inputState) {
+                case InputState.EscASCII:
+                    if (escCharsetProber == null) {
+                        escCharsetProber = new EscCharsetProber();
+                    }
+                    st = escCharsetProber.HandleData(buf, offset, len);
+                    if (st == ProbingState.FoundIt) {
+                        done = true;
+                        detectedCharset = escCharsetProber.GetCharsetName();
+                    }
+                    break;
+                case InputState.Highbyte:
+                    for (int i = 0; i < PROBERS_NUM; i++) {
+                        if (charsetProbers[i] != null) {
+                            st = charsetProbers[i].HandleData(buf, offset, len);
+                            #if DEBUG
+                            charsetProbers[i].DumpStatus();
+                            #endif
+                            if (st == ProbingState.FoundIt) {
+                                // First prober that is certain wins; skip the rest.
+                                done = true;
+                                detectedCharset = charsetProbers[i].GetCharsetName();
+                                return;
+                            }
+                        }
+                    }
+                    break;
+                default:
+                    // pure ascii
+                    break;
+            }
+        }
+
+        /// <summary>
+        /// Checks whether the chunk begins with a byte order mark.
+        /// </summary>
+        /// <returns>The charset name implied by the BOM, or null when none is recognised.</returns>
+        private static string DetectBom(byte[] buf, int offset, int len)
+        {
+            // Four bytes are inspected below, so shorter chunks are not examined.
+            if (len <= 3) {
+                return null;
+            }
+
+            byte b0 = buf[offset];
+            byte b1 = buf[offset + 1];
+            byte b2 = buf[offset + 2];
+            byte b3 = buf[offset + 3];
+
+            switch (b0) {
+            case 0xEF:
+                if (b1 == 0xBB && b2 == 0xBF) {
+                    return "UTF-8";
+                }
+                break;
+            case 0xFE:
+                if (b1 == 0xFF) {
+                    if (b2 == 0x00 && b3 == 0x00) {
+                        // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
+                        return "X-ISO-10646-UCS-4-3412";
+                    }
+                    return "UTF-16BE";
+                }
+                break;
+            case 0x00:
+                if (b1 == 0x00) {
+                    if (b2 == 0xFE && b3 == 0xFF) {
+                        return "UTF-32BE";
+                    }
+                    if (b2 == 0xFF && b3 == 0xFE) {
+                        // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
+                        return "X-ISO-10646-UCS-4-2143";
+                    }
+                }
+                break;
+            case 0xFF:
+                if (b1 == 0xFE) {
+                    if (b2 == 0x00 && b3 == 0x00) {
+                        return "UTF-32LE";
+                    }
+                    return "UTF-16LE";
+                }
+                break;
+            }
+
+            return null;
+        }
+
+        /// <summary>
+        /// Scans the chunk and advances <see cref="inputState"/>: any high byte
+        /// (other than 0xA0) moves to Highbyte and creates the group probers; an
+        /// ESC byte or the HZ "~{" pair moves PureASCII to EscASCII.
+        /// </summary>
+        private void UpdateInputState(byte[] buf, int offset, int len)
+        {
+            int end = offset + len;
+            for (int i = offset; i < end; i++) {
+                byte current = buf[i];
+
+                // other than 0xa0, if every other character is ascii, the page is ascii
+                if ((current & 0x80) != 0 && current != 0xA0) {
+                    // we got a non-ascii byte (high-byte)
+                    if (inputState != InputState.Highbyte) {
+                        inputState = InputState.Highbyte;
+
+                        // kill EscCharsetProber if it is active
+                        escCharsetProber = null;
+
+                        // start multibyte and singlebyte charset prober
+                        if (charsetProbers[0] == null) {
+                            charsetProbers[0] = new MBCSGroupProber();
+                        }
+                        if (charsetProbers[1] == null) {
+                            charsetProbers[1] = new SBCSGroupProber();
+                        }
+                        if (charsetProbers[2] == null) {
+                            charsetProbers[2] = new Latin1Prober();
+                        }
+                    }
+                } else {
+                    // 0x1B is ESC (octal 033); 0x7E 0x7B is the HZ "~{" sequence.
+                    if (inputState == InputState.PureASCII &&
+                        (current == 0x1B || (current == 0x7B && lastChar == 0x7E))) {
+                        // found escape character or HZ "~{"
+                        inputState = InputState.EscASCII;
+                    }
+                    lastChar = current;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Notify detector that no further data is available.
+        /// Reports the identified charset, the most confident prober guess
+        /// (when above <see cref="MINIMUM_THRESHOLD"/>), or "ASCII" for pure
+        /// ASCII input. Nothing is reported when no data was ever fed.
+        /// </summary>
+        public virtual void DataEnd()
+        {
+            if (!gotData) {
+                // we haven't got any data yet, return immediately
+                // caller program sometimes call DataEnd before anything has
+                // been sent to detector
+                return;
+            }
+
+            if (detectedCharset != null) {
+                done = true;
+                Report(detectedCharset, 1.0f);
+                return;
+            }
+
+            if (inputState == InputState.Highbyte) {
+                float maxProberConfidence = 0.0f;
+                int maxProber = 0;
+                for (int i = 0; i < PROBERS_NUM; i++) {
+                    if (charsetProbers[i] != null) {
+                        float proberConfidence = charsetProbers[i].GetConfidence();
+                        if (proberConfidence > maxProberConfidence) {
+                            maxProberConfidence = proberConfidence;
+                            maxProber = i;
+                        }
+                    }
+                }
+
+                // A confidence above the threshold can only come from a non-null prober.
+                if (maxProberConfidence > MINIMUM_THRESHOLD) {
+                    Report(charsetProbers[maxProber].GetCharsetName(), maxProberConfidence);
+                }
+            } else if (inputState == InputState.PureASCII) {
+                Report("ASCII", 1.0f);
+            }
+        }
+
+        /// <summary>
+        /// Clear internal state of charset detector.
+        /// In the original interface this method is protected.
+        /// </summary>
+        public virtual void Reset()
+        {
+            done = false;
+            start = true;
+            detectedCharset = null;
+            gotData = false;
+            bestGuess = -1;
+            inputState = InputState.PureASCII;
+            lastChar = 0x00;
+            if (escCharsetProber != null) {
+                escCharsetProber.Reset();
+            }
+            for (int i = 0; i < PROBERS_NUM; i++) {
+                if (charsetProbers[i] != null) {
+                    charsetProbers[i].Reset();
+                }
+            }
+        }
+
+        /// <summary>
+        /// Callback invoked by <see cref="DataEnd"/> with the detection result.
+        /// </summary>
+        /// <param name="charset">Name of the detected charset.</param>
+        /// <param name="confidence">Confidence passed by <see cref="DataEnd"/>; 1.0 for BOM, prober-certain and pure ASCII results.</param>
+        protected abstract void Report(string charset, float confidence);
+
+    }
+}