| field | value | date |
|---|---|---|
| author | Bond-009 <bond.009@outlook.com> | 2019-01-16 19:10:42 +0100 |
| committer | GitHub <noreply@github.com> | 2019-01-16 19:10:42 +0100 |
| commit | 900dc851e6c810f9e1772a6fb901a5a7e2801baf (patch) | |
| tree | 205bac3cd6df971ee18739e59bd4da0ffe91718b /Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/JapaneseContextAnalyser.cs | |
| parent | 07a8e49c4b1e4a2dddbaa49ab6f1ff4f271fbf20 (diff) | |
| parent | 933ef438894ed233fec46badf58dd4f26492e832 (diff) | |
Merge branch 'dev' into cleanup
Diffstat (limited to 'Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/JapaneseContextAnalyser.cs')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/JapaneseContextAnalyser.cs | 68 |
1 file changed, 34 insertions, 34 deletions
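The changes below are whitespace-only cleanups (trailing spaces removed), which is why each removed line and its replacement look identical in this rendering. For orientation, the confidence rule that appears unchanged in the `GetConfidence` hunk is sketched here; the threshold and `DONT_KNOW` values are assumptions for illustration, since their definitions fall outside this diff.

```csharp
using System;

// Sketch only: mirrors the GetConfidence logic visible in the diff below,
// i.e. the fraction of observed hiragana 2-char sequences whose frequency
// category is not 0. The constant values here are assumed, not taken from
// the file.
class ConfidenceSketch
{
    const int MinimumDataThreshold = 4;   // assumed value
    const float DontKnow = -1.0f;         // assumed value

    // totalRel = total sequences counted, category0Count = relSample[0]
    static float Confidence(int totalRel, int category0Count) =>
        totalRel > MinimumDataThreshold
            ? (float)(totalRel - category0Count) / totalRel
            : DontKnow;

    static void Main()
    {
        // 1000 sequences observed, 120 of them in category 0 -> 0.88
        Console.WriteLine(Confidence(1000, 120));
    }
}
```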
```diff
diff --git a/Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/JapaneseContextAnalyser.cs b/Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/JapaneseContextAnalyser.cs
index 93b9d7580..7d28224c5 100644
--- a/Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/JapaneseContextAnalyser.cs
+++ b/Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/JapaneseContextAnalyser.cs
@@ -21,7 +21,7 @@
 * Contributor(s):
 * Shy Shalom <shooshX@gmail.com>
 * Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
- *
+ *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
@@ -48,7 +48,7 @@ namespace UniversalDetector.Core
 // hiragana frequency category table
 // This is hiragana 2-char sequence table, the number in each cell represents its frequency category
- protected static byte[,] jp2CharContext = {
+ protected static byte[,] jp2CharContext = {
 { 0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,},
 { 2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4,},
 { 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,},
@@ -133,35 +133,35 @@ namespace UniversalDetector.Core
 { 0,4,0,4,0,4,0,3,0,4,4,3,4,2,4,3,2,0,4,4,4,3,5,3,5,3,3,2,4,2,4,3,4,3,1,4,0,2,3,4,4,4,3,3,3,4,4,4,3,4,1,3,4,3,2,1,2,1,3,3,3,4,4,3,3,5,0,4,0,3,0,4,3,3,3,2,1,0,3,0,0,3,3,},
 { 0,4,0,3,0,3,0,3,0,3,5,5,3,3,3,3,4,3,4,3,3,3,4,4,4,3,3,3,3,4,3,5,3,3,1,3,2,4,5,5,5,5,4,3,4,5,5,3,2,2,3,3,3,3,2,3,3,1,2,3,2,4,3,3,3,4,0,4,0,2,0,4,3,2,2,1,2,0,3,0,0,4,1,},
 };
-
+
 // category counters, each integer counts sequence in its category
 int[] relSample = new int[CATEGORIES_NUM];
 // total sequence received
 int totalRel;
-
+
 // The order of previous char
 int lastCharOrder;
- // if last byte in current buffer is not the last byte of a character,
+ // if last byte in current buffer is not the last byte of a character,
 // we need to know how many byte to skip in next buffer.
 int needToSkipCharNum;
- // If this flag is set to true, detection is done and conclusion has
+ // If this flag is set to true, detection is done and conclusion has
 // been made
 bool done;
-
+
 public JapaneseContextAnalyser()
 {
- Reset();
+ Reset();
 }
-
+
 public float GetConfidence()
 {
 // This is just one way to calculate confidence. It works well for me.
 if (totalRel > MINIMUM_DATA_THRESHOLD)
 return ((float)(totalRel - relSample[0]))/totalRel;
- else
+ else
 return DONT_KNOW;
 }
@@ -170,15 +170,15 @@ namespace UniversalDetector.Core
 int charLen = 0;
 int max = offset + len;
-
+
 if (done)
 return;
- // The buffer we got is byte oriented, and a character may span
+ // The buffer we got is byte oriented, and a character may span
 // more than one buffer. In case the last one or two byte in last
- // buffer is not complete, we record how many byte needed to
+ // buffer is not complete, we record how many byte needed to
 // complete that character and skip these bytes here. We can choose
- // to record those bytes as well and analyse the character once it
+ // to record those bytes as well and analyse the character once it
 // is complete, but since a character will not make much difference,
 // skipping it will simplify our logic and improve performance.
 for (int i = needToSkipCharNum+offset; i < max; ) {
@@ -200,14 +200,14 @@ namespace UniversalDetector.Core
 }
 }
 }
-
+
 public void HandleOneChar(byte[] buf, int offset, int charLen)
 {
- if (totalRel > MAX_REL_THRESHOLD)
+ if (totalRel > MAX_REL_THRESHOLD)
 done = true;
- if (done)
+ if (done)
 return;
-
+
 // Only 2-bytes characters are of our interest
 int order = (charLen == 2) ? GetOrder(buf, offset) : -1;
 if (order != -1 && lastCharOrder != -1) {
@@ -217,7 +217,7 @@ namespace UniversalDetector.Core
 }
 lastCharOrder = order;
 }
-
+
 public void Reset()
 {
 totalRel = 0;
@@ -228,18 +228,18 @@ namespace UniversalDetector.Core
 done = false;
 }
 }
-
+
 protected abstract int GetOrder(byte[] buf, int offset, out int charLen);
-
+
 protected abstract int GetOrder(byte[] buf, int offset);
-
- public bool GotEnoughData()
+
+ public bool GotEnoughData()
 {
 return totalRel > ENOUGH_REL_THRESHOLD;
 }
-
+
 }
-
+
 public class SJISContextAnalyser : JapaneseContextAnalyser
 {
 private const byte HIRAGANA_FIRST_BYTE = 0x82;
@@ -247,10 +247,10 @@ namespace UniversalDetector.Core
 protected override int GetOrder(byte[] buf, int offset, out int charLen)
 {
 //find out current char's byte length
- if (buf[offset] >= 0x81 && buf[offset] <= 0x9F
+ if (buf[offset] >= 0x81 && buf[offset] <= 0x9F
 || buf[offset] >= 0xe0 && buf[offset] <= 0xFC)
 charLen = 2;
- else
+ else
 charLen = 1;
 // return its order if it is hiragana
@@ -259,7 +259,7 @@ namespace UniversalDetector.Core
 if (low >= 0x9F && low <= 0xF1)
 return low - 0x9F;
 }
- return -1;
+ return -1;
 }
 protected override int GetOrder(byte[] buf, int offset)
@@ -274,15 +274,15 @@ namespace UniversalDetector.Core
 }
 }
-
+
 public class EUCJPContextAnalyser : JapaneseContextAnalyser
 {
 private const byte HIRAGANA_FIRST_BYTE = 0xA4;
-
+
 protected override int GetOrder(byte[] buf, int offset, out int charLen)
 {
 byte high = buf[offset];
-
+
 //find out current char's byte length
 if (high == 0x8E || high >= 0xA1 && high <= 0xFE)
 charLen = 2;
@@ -297,9 +297,9 @@ namespace UniversalDetector.Core
 if (low >= 0xA1 && low <= 0xF3)
 return low - 0xA1;
 }
- return -1;
+ return -1;
 }
-
+
 protected override int GetOrder(byte[] buf, int offset)
 {
 // We are only interested in Hiragana
@@ -309,7 +309,7 @@ namespace UniversalDetector.Core
 return low - 0xA1;
 }
 return -1;
- }
+ }
 }
 }
```
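For reference, here is a minimal standalone sketch of the hiragana "order" lookup that `SJISContextAnalyser` and `EUCJPContextAnalyser` perform in the hunks above. The byte ranges are taken from the code in the diff; the demo `Main` and the sample byte pairs are illustrative only.

```csharp
using System;

// Sketch only: the hiragana "order" computation from the two GetOrder
// overrides shown in the diff above. Sample inputs are illustrative.
class HiraganaOrderSketch
{
    // EUC-JP: lead byte 0xA4, trail byte 0xA1..0xF3 -> order 0..82
    static int EucJpOrder(byte high, byte low) =>
        (high == 0xA4 && low >= 0xA1 && low <= 0xF3) ? low - 0xA1 : -1;

    // Shift_JIS: lead byte 0x82, trail byte 0x9F..0xF1 -> order 0..82
    static int SjisOrder(byte high, byte low) =>
        (high == 0x82 && low >= 0x9F && low <= 0xF1) ? low - 0x9F : -1;

    static void Main()
    {
        // HIRAGANA LETTER A ("あ"): 0xA4 0xA2 in EUC-JP, 0x82 0xA0 in Shift_JIS.
        // Both encodings map the same kana to the same order, so the shared
        // jp2CharContext table in the base class works for either analyser.
        Console.WriteLine(EucJpOrder(0xA4, 0xA2)); // 1
        Console.WriteLine(SjisOrder(0x82, 0xA0));  // 1

        // Anything that is not a two-byte hiragana yields -1 and is skipped.
        Console.WriteLine(EucJpOrder(0x8E, 0xA1)); // -1
    }
}
```

Per the field comments in the base class, consecutive orders are used to index `jp2CharContext` and bump the per-category counters in `relSample`; the lines performing that increment fall outside the hunks shown here.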
