diff options
Diffstat (limited to 'Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/UniversalDetector.cs')
| -rw-r--r-- | Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/UniversalDetector.cs | 257 |
1 files changed, 257 insertions, 0 deletions
diff --git a/Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/UniversalDetector.cs b/Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/UniversalDetector.cs new file mode 100644 index 000000000..0c9a4ee60 --- /dev/null +++ b/Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/UniversalDetector.cs @@ -0,0 +1,257 @@ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Universal charset detector code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 2001 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Shy Shalom <shooshX@gmail.com> + * Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port) + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. 
// If you wish to allow use of your version of this file only
// * under the terms of either the GPL or the LGPL, and not to allow others to
// * use your version of this file under the terms of the MPL, indicate your
// * decision by deleting the provisions above and replace them with the notice
// * and other provisions required by the GPL or the LGPL. If you do not delete
// * the provisions above, a recipient may use your version of this file under
// * the terms of any one of the MPL, the GPL or the LGPL.
// *
// * ***** END LICENSE BLOCK ***** */

namespace UniversalDetector.Core
{
    /// <summary>
    /// Coarse classification of the bytes fed to the detector so far.
    /// </summary>
    enum InputState { PureASCII = 0, EscASCII = 1, Highbyte = 2 };

    /// <summary>
    /// Base class of the charset detector. Callers push raw bytes through
    /// <see cref="Feed"/>, signal the end of input with <see cref="DataEnd"/>,
    /// and receive the outcome through the abstract <see cref="Report"/> callback.
    /// </summary>
    public abstract class UniversalDetector
    {
        // Language filter bit flags. They are only declared here; nothing in
        // this class reads them.
        protected const int FILTER_CHINESE_SIMPLIFIED = 1;
        protected const int FILTER_CHINESE_TRADITIONAL = 2;
        protected const int FILTER_JAPANESE = 4;
        protected const int FILTER_KOREAN = 8;
        protected const int FILTER_NON_CJK = 16;
        protected const int FILTER_ALL = 31;
        protected static int FILTER_CHINESE =
            FILTER_CHINESE_SIMPLIFIED | FILTER_CHINESE_TRADITIONAL;
        protected static int FILTER_CJK =
            FILTER_JAPANESE | FILTER_KOREAN | FILTER_CHINESE_SIMPLIFIED
            | FILTER_CHINESE_TRADITIONAL;

        // Not referenced inside this class.
        protected const float SHORTCUT_THRESHOLD = 0.95f;

        // DataEnd() only reports a prober's guess when its confidence is
        // strictly greater than this value.
        protected const float MINIMUM_THRESHOLD = 0.20f;

        internal InputState inputState;

        // True until the first call to Feed(); gates the BOM check.
        protected bool start;

        // Set once Feed() has been called with len > 0.
        protected bool gotData;

        // Set once a charset has been determined; later Feed() calls are ignored.
        protected bool done;

        // Last byte that took the non-high-byte branch in Feed()
        // (used to recognise the two-byte HZ introducer "~{").
        protected byte lastChar;

        // Initialised to -1 by the constructor and Reset(); not read in this class.
        protected int bestGuess;

        protected const int PROBERS_NUM = 3;

        // Stored by the constructor; not read in this class.
        protected int languageFilter;

        // Slots: [0] MBCSGroupProber, [1] SBCSGroupProber, [2] Latin1Prober.
        // They are created lazily the first time a high byte is seen.
        protected CharsetProber[] charsetProbers = new CharsetProber[PROBERS_NUM];

        // Created lazily when the input state is EscASCII.
        protected CharsetProber escCharsetProber;

        // Name of the detected charset, or null while undecided.
        protected string detectedCharset;

        /// <summary>
        /// Creates a detector in its initial (pure ASCII, nothing seen) state.
        /// </summary>
        /// <param name="languageFilter">
        /// Value stored in <see cref="languageFilter"/>. Presumably a combination
        /// of the FILTER_* flags; this class itself does not interpret it.
        /// </param>
        public UniversalDetector(int languageFilter)
        {
            this.start = true;
            this.inputState = InputState.PureASCII;
            this.lastChar = 0x00;
            this.bestGuess = -1;
            this.languageFilter = languageFilter;
        }

        /// <summary>
        /// Feeds a chunk of raw bytes to the detector.
        /// </summary>
        /// <param name="buf">Buffer holding the data.</param>
        /// <param name="offset">Index in <paramref name="buf"/> of the first byte to examine.</param>
        /// <param name="len">Number of bytes to examine, starting at <paramref name="offset"/>.</param>
        /// <remarks>
        /// Does nothing once a charset has been determined. On the first call
        /// the chunk is checked for a byte order mark; if one is found the
        /// detection is finished immediately.
        /// </remarks>
        public virtual void Feed(byte[] buf, int offset, int len)
        {
            if (done) {
                return;
            }

            if (len > 0) {
                gotData = true;
            }

            // If the data starts with BOM, we know it is UTF
            if (start) {
                start = false;
                detectedCharset = DetectBom(buf, offset, len);
                if (detectedCharset != null) {
                    done = true;
                    return;
                }
            }

            // Scan exactly the window [offset, offset + len) - the same bytes
            // that are handed to the probers below.
            int end = offset + len;
            for (int i = offset; i < end; i++) {
                byte current = buf[i];

                // other than 0xa0, if every other character is ascii, the page is ascii
                if ((current & 0x80) != 0 && current != 0xA0) {
                    // we got a non-ascii byte (high-byte)
                    if (inputState != InputState.Highbyte) {
                        inputState = InputState.Highbyte;

                        // kill EscCharsetProber if it is active
                        escCharsetProber = null;

                        // start multibyte and singlebyte charset prober
                        if (charsetProbers[0] == null) {
                            charsetProbers[0] = new MBCSGroupProber();
                        }
                        if (charsetProbers[1] == null) {
                            charsetProbers[1] = new SBCSGroupProber();
                        }
                        if (charsetProbers[2] == null) {
                            charsetProbers[2] = new Latin1Prober();
                        }
                    }
                } else {
                    // 0x1B is ESC (octal 033); 0x7E 0x7B is the HZ introducer "~{".
                    if (inputState == InputState.PureASCII &&
                        (current == 0x1B || (current == 0x7B && lastChar == 0x7E))) {
                        // found escape character or HZ "~{"
                        inputState = InputState.EscASCII;
                    }
                    lastChar = current;
                }
            }

            ProbingState st = ProbingState.NotMe;

            switch (inputState) {
                case InputState.EscASCII:
                    if (escCharsetProber == null) {
                        escCharsetProber = new EscCharsetProber();
                    }
                    st = escCharsetProber.HandleData(buf, offset, len);
                    if (st == ProbingState.FoundIt) {
                        done = true;
                        detectedCharset = escCharsetProber.GetCharsetName();
                    }
                    break;
                case InputState.Highbyte:
                    for (int i = 0; i < PROBERS_NUM; i++) {
                        if (charsetProbers[i] != null) {
                            st = charsetProbers[i].HandleData(buf, offset, len);
                            #if DEBUG
                            charsetProbers[i].DumpStatus();
                            #endif
                            if (st == ProbingState.FoundIt) {
                                done = true;
                                detectedCharset = charsetProbers[i].GetCharsetName();
                                return;
                            }
                        }
                    }
                    break;
                default:
                    // pure ascii
                    break;
            }
        }

        /// <summary>
        /// Checks whether the window starts with a byte order mark.
        /// </summary>
        /// <param name="buf">Buffer holding the data.</param>
        /// <param name="offset">Index of the first byte of the window.</param>
        /// <param name="len">Number of bytes in the window.</param>
        /// <returns>The charset name implied by the BOM, or null if there is none.</returns>
        /// <remarks>
        /// The four-byte marks are tested before the two-byte ones because the
        /// UTF-16 marks are prefixes of two of the UCS-4 marks. When fewer than
        /// four bytes are available only the shorter marks can be recognised.
        /// </remarks>
        private static string DetectBom(byte[] buf, int offset, int len)
        {
            if (len < 2) {
                return null;
            }

            byte b0 = buf[offset];
            byte b1 = buf[offset + 1];
            bool hasFour = len >= 4;

            switch (b0) {
                case 0xEF:
                    if (len >= 3 && b1 == 0xBB && buf[offset + 2] == 0xBF) {
                        return "UTF-8";
                    }
                    break;
                case 0xFE:
                    if (b1 == 0xFF) {
                        if (hasFour && buf[offset + 2] == 0x00 && buf[offset + 3] == 0x00) {
                            // FE FF 00 00 UCS-4, unusual octet order BOM (3412)
                            return "X-ISO-10646-UCS-4-3412";
                        }
                        return "UTF-16BE";
                    }
                    break;
                case 0x00:
                    if (hasFour && b1 == 0x00) {
                        if (buf[offset + 2] == 0xFE && buf[offset + 3] == 0xFF) {
                            return "UTF-32BE";
                        }
                        if (buf[offset + 2] == 0xFF && buf[offset + 3] == 0xFE) {
                            // 00 00 FF FE UCS-4, unusual octet order BOM (2143)
                            return "X-ISO-10646-UCS-4-2143";
                        }
                    }
                    break;
                case 0xFF:
                    if (b1 == 0xFE) {
                        if (hasFour && buf[offset + 2] == 0x00 && buf[offset + 3] == 0x00) {
                            return "UTF-32LE";
                        }
                        return "UTF-16LE";
                    }
                    break;
            }

            return null;
        }

        /// <summary>
        /// Notify detector that no further data is available.
        /// </summary>
        /// <remarks>
        /// Calls <see cref="Report"/> with the detected charset (confidence 1.0),
        /// with the most confident prober's guess when that confidence exceeds
        /// <see cref="MINIMUM_THRESHOLD"/>, or with "ASCII" when only ASCII was
        /// seen. Nothing is reported if no data was fed, or if the input state
        /// is EscASCII and no charset was found.
        /// </remarks>
        public virtual void DataEnd()
        {
            if (!gotData) {
                // we haven't got any data yet, return immediately
                // caller program sometimes call DataEnd before anything has
                // been sent to detector
                return;
            }

            if (detectedCharset != null) {
                done = true;
                Report(detectedCharset, 1.0f);
                return;
            }

            if (inputState == InputState.Highbyte) {
                float maxProberConfidence = 0.0f;
                int maxProber = 0;
                for (int i = 0; i < PROBERS_NUM; i++) {
                    if (charsetProbers[i] != null) {
                        float proberConfidence = charsetProbers[i].GetConfidence();
                        if (proberConfidence > maxProberConfidence) {
                            maxProberConfidence = proberConfidence;
                            maxProber = i;
                        }
                    }
                }

                // A confidence above the (positive) threshold can only have come
                // from a non-null prober, so the index is safe to dereference.
                if (maxProberConfidence > MINIMUM_THRESHOLD) {
                    Report(charsetProbers[maxProber].GetCharsetName(), maxProberConfidence);
                }
            } else if (inputState == InputState.PureASCII) {
                Report("ASCII", 1.0f);
            }
        }

        /// <summary>
        /// Clear internal state of charset detector.
        /// In the original interface this method is protected.
        /// </summary>
        public virtual void Reset()
        {
            done = false;
            start = true;
            detectedCharset = null;
            gotData = false;
            bestGuess = -1;
            inputState = InputState.PureASCII;
            lastChar = 0x00;

            if (escCharsetProber != null) {
                escCharsetProber.Reset();
            }

            for (int i = 0; i < PROBERS_NUM; i++) {
                if (charsetProbers[i] != null) {
                    charsetProbers[i].Reset();
                }
            }
        }

        /// <summary>
        /// Receives the detection result.
        /// </summary>
        /// <param name="charset">Name of the detected charset.</param>
        /// <param name="confidence">Confidence of the detection; this class passes 1.0 or a prober's confidence.</param>
        protected abstract void Report(string charset, float confidence);
    }
}
