/* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License * for the specific language governing rights and limitations under the * License. * * The Original Code is Mozilla Universal charset detector code. * * The Initial Developer of the Original Code is * Netscape Communications Corporation. * Portions created by the Initial Developer are Copyright (C) 2001 * the Initial Developer. All Rights Reserved. * * Contributor(s): * Shy Shalom * Rudi Pettazzi (C# port) * * Alternatively, the contents of this file may be used under the terms of * either the GNU General Public License Version 2 or later (the "GPL"), or * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), * in which case the provisions of the GPL or the LGPL are applicable instead * of those above. If you wish to allow use of your version of this file only * under the terms of either the GPL or the LGPL, and not to allow others to * use your version of this file under the terms of the MPL, indicate your * decision by deleting the provisions above and replace them with the notice * and other provisions required by the GPL or the LGPL. If you do not delete * the provisions above, a recipient may use your version of this file under * the terms of any one of the MPL, the GPL or the LGPL. * * ***** END LICENSE BLOCK ***** */ namespace UniversalDetector.Core { enum InputState { PureASCII=0, EscASCII=1, Highbyte=2 }; public abstract class UniversalDetector { protected const int FILTER_CHINESE_SIMPLIFIED = 1; protected const int FILTER_CHINESE_TRADITIONAL = 2; protected const int FILTER_JAPANESE = 4; protected const int FILTER_KOREAN = 8; protected const int FILTER_NON_CJK = 16; protected const int FILTER_ALL = 31; protected static int FILTER_CHINESE = FILTER_CHINESE_SIMPLIFIED | FILTER_CHINESE_TRADITIONAL; protected static int FILTER_CJK = FILTER_JAPANESE | FILTER_KOREAN | FILTER_CHINESE_SIMPLIFIED | FILTER_CHINESE_TRADITIONAL; protected const float SHORTCUT_THRESHOLD = 0.95f; protected const float MINIMUM_THRESHOLD = 0.20f; internal InputState inputState; protected bool start; protected bool gotData; protected bool done; protected byte lastChar; protected int bestGuess; protected const int PROBERS_NUM = 3; protected int languageFilter; protected CharsetProber[] charsetProbers = new CharsetProber[PROBERS_NUM]; protected CharsetProber escCharsetProber; protected string detectedCharset; public UniversalDetector(int languageFilter) { this.start = true; this.inputState = InputState.PureASCII; this.lastChar = 0x00; this.bestGuess = -1; this.languageFilter = languageFilter; } public virtual void Feed(byte[] buf, int offset, int len) { if (done) { return; } if (len > 0) gotData = true; // If the data starts with BOM, we know it is UTF if (start) { start = false; if (len > 3) { switch (buf[0]) { case 0xEF: if (0xBB == buf[1] && 0xBF == buf[2]) detectedCharset = "UTF-8"; break; case 0xFE: if (0xFF == buf[1] && 0x00 == buf[2] && 0x00 == buf[3]) // FE FF 00 00 UCS-4, unusual octet order BOM (3412) detectedCharset = "X-ISO-10646-UCS-4-3412"; else if (0xFF == buf[1]) detectedCharset = "UTF-16BE"; break; case 0x00: if (0x00 == buf[1] && 0xFE == buf[2] && 0xFF == buf[3]) detectedCharset = "UTF-32BE"; else if (0x00 == buf[1] && 0xFF == buf[2] && 0xFE == buf[3]) // 00 00 FF FE UCS-4, unusual octet order BOM (2143) detectedCharset = "X-ISO-10646-UCS-4-2143"; break; case 0xFF: if (0xFE == buf[1] && 0x00 == buf[2] && 0x00 == buf[3]) detectedCharset = "UTF-32LE"; else if (0xFE == buf[1]) detectedCharset = "UTF-16LE"; break; } // switch } if (detectedCharset != null) { done = true; return; } } for (int i = 0; i < len; i++) { // other than 0xa0, if every other character is ascii, the page is ascii if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0) { // we got a non-ascii byte (high-byte) if (inputState != InputState.Highbyte) { inputState = InputState.Highbyte; // kill EscCharsetProber if it is active if (escCharsetProber != null) { escCharsetProber = null; } // start multibyte and singlebyte charset prober if (charsetProbers[0] == null) charsetProbers[0] = new MBCSGroupProber(); if (charsetProbers[1] == null) charsetProbers[1] = new SBCSGroupProber(); if (charsetProbers[2] == null) charsetProbers[2] = new Latin1Prober(); } } else { if (inputState == InputState.PureASCII && (buf[i] == 0x33 || (buf[i] == 0x7B && lastChar == 0x7E))) { // found escape character or HZ "~{" inputState = InputState.EscASCII; } lastChar = buf[i]; } } ProbingState st = ProbingState.NotMe; switch (inputState) { case InputState.EscASCII: if (escCharsetProber == null) { escCharsetProber = new EscCharsetProber(); } st = escCharsetProber.HandleData(buf, offset, len); if (st == ProbingState.FoundIt) { done = true; detectedCharset = escCharsetProber.GetCharsetName(); } break; case InputState.Highbyte: for (int i = 0; i < PROBERS_NUM; i++) { if (charsetProbers[i] != null) { st = charsetProbers[i].HandleData(buf, offset, len); #if DEBUG charsetProbers[i].DumpStatus(); #endif if (st == ProbingState.FoundIt) { done = true; detectedCharset = charsetProbers[i].GetCharsetName(); return; } } } break; default: // pure ascii break; } return; } /// /// Notify detector that no further data is available. /// public virtual void DataEnd() { if (!gotData) { // we haven't got any data yet, return immediately // caller program sometimes call DataEnd before anything has // been sent to detector return; } if (detectedCharset != null) { done = true; Report(detectedCharset, 1.0f); return; } if (inputState == InputState.Highbyte) { float proberConfidence = 0.0f; float maxProberConfidence = 0.0f; int maxProber = 0; for (int i = 0; i < PROBERS_NUM; i++) { if (charsetProbers[i] != null) { proberConfidence = charsetProbers[i].GetConfidence(); if (proberConfidence > maxProberConfidence) { maxProberConfidence = proberConfidence; maxProber = i; } } } if (maxProberConfidence > MINIMUM_THRESHOLD) { Report(charsetProbers[maxProber].GetCharsetName(), maxProberConfidence); } } else if (inputState == InputState.PureASCII) { Report("ASCII", 1.0f); } } /// /// Clear internal state of charset detector. /// In the original interface this method is protected. /// public virtual void Reset() { done = false; start = true; detectedCharset = null; gotData = false; bestGuess = -1; inputState = InputState.PureASCII; lastChar = 0x00; if (escCharsetProber != null) escCharsetProber.Reset(); for (int i = 0; i < PROBERS_NUM; i++) if (charsetProbers[i] != null) charsetProbers[i].Reset(); } protected abstract void Report(string charset, float confidence); } }