aboutsummaryrefslogtreecommitdiff
path: root/Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/JapaneseContextAnalyser.cs
diff options
context:
space:
mode:
authorBond-009 <bond.009@outlook.com>2019-01-16 19:10:42 +0100
committerGitHub <noreply@github.com>2019-01-16 19:10:42 +0100
commit900dc851e6c810f9e1772a6fb901a5a7e2801baf (patch)
tree205bac3cd6df971ee18739e59bd4da0ffe91718b /Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/JapaneseContextAnalyser.cs
parent07a8e49c4b1e4a2dddbaa49ab6f1ff4f271fbf20 (diff)
parent933ef438894ed233fec46badf58dd4f26492e832 (diff)
Merge branch 'dev' into cleanup
Diffstat (limited to 'Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/JapaneseContextAnalyser.cs')
-rw-r--r--Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/JapaneseContextAnalyser.cs68
1 files changed, 34 insertions, 34 deletions
diff --git a/Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/JapaneseContextAnalyser.cs b/Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/JapaneseContextAnalyser.cs
index 93b9d7580..7d28224c5 100644
--- a/Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/JapaneseContextAnalyser.cs
+++ b/Emby.Server.Implementations/TextEncoding/UniversalDetector/Core/JapaneseContextAnalyser.cs
@@ -21,7 +21,7 @@
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
- *
+ *
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
@@ -48,7 +48,7 @@ namespace UniversalDetector.Core
// hiragana frequency category table
// This is hiragana 2-char sequence table, the number in each cell represents its frequency category
- protected static byte[,] jp2CharContext = {
+ protected static byte[,] jp2CharContext = {
{ 0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,},
{ 2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4,},
{ 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,},
@@ -133,35 +133,35 @@ namespace UniversalDetector.Core
{ 0,4,0,4,0,4,0,3,0,4,4,3,4,2,4,3,2,0,4,4,4,3,5,3,5,3,3,2,4,2,4,3,4,3,1,4,0,2,3,4,4,4,3,3,3,4,4,4,3,4,1,3,4,3,2,1,2,1,3,3,3,4,4,3,3,5,0,4,0,3,0,4,3,3,3,2,1,0,3,0,0,3,3,},
{ 0,4,0,3,0,3,0,3,0,3,5,5,3,3,3,3,4,3,4,3,3,3,4,4,4,3,3,3,3,4,3,5,3,3,1,3,2,4,5,5,5,5,4,3,4,5,5,3,2,2,3,3,3,3,2,3,3,1,2,3,2,4,3,3,3,4,0,4,0,2,0,4,3,2,2,1,2,0,3,0,0,4,1,},
};
-
+
// category counters, each integer counts sequence in its category
int[] relSample = new int[CATEGORIES_NUM];
// total sequence received
int totalRel;
-
+
// The order of previous char
int lastCharOrder;
- // if last byte in current buffer is not the last byte of a character,
+ // if last byte in current buffer is not the last byte of a character,
// we need to know how many byte to skip in next buffer.
int needToSkipCharNum;
- // If this flag is set to true, detection is done and conclusion has
+ // If this flag is set to true, detection is done and conclusion has
// been made
bool done;
-
+
public JapaneseContextAnalyser()
{
- Reset();
+ Reset();
}
-
+
public float GetConfidence()
{
// This is just one way to calculate confidence. It works well for me.
if (totalRel > MINIMUM_DATA_THRESHOLD)
return ((float)(totalRel - relSample[0]))/totalRel;
- else
+ else
return DONT_KNOW;
}
@@ -170,15 +170,15 @@ namespace UniversalDetector.Core
int charLen = 0;
int max = offset + len;
-
+
if (done)
return;
- // The buffer we got is byte oriented, and a character may span
+ // The buffer we got is byte oriented, and a character may span
// more than one buffer. In case the last one or two byte in last
- // buffer is not complete, we record how many byte needed to
+ // buffer is not complete, we record how many byte needed to
// complete that character and skip these bytes here. We can choose
- // to record those bytes as well and analyse the character once it
+ // to record those bytes as well and analyse the character once it
// is complete, but since a character will not make much difference,
// skipping it will simplify our logic and improve performance.
for (int i = needToSkipCharNum+offset; i < max; ) {
@@ -200,14 +200,14 @@ namespace UniversalDetector.Core
}
}
}
-
+
public void HandleOneChar(byte[] buf, int offset, int charLen)
{
- if (totalRel > MAX_REL_THRESHOLD)
+ if (totalRel > MAX_REL_THRESHOLD)
done = true;
- if (done)
+ if (done)
return;
-
+
// Only 2-bytes characters are of our interest
int order = (charLen == 2) ? GetOrder(buf, offset) : -1;
if (order != -1 && lastCharOrder != -1) {
@@ -217,7 +217,7 @@ namespace UniversalDetector.Core
}
lastCharOrder = order;
}
-
+
public void Reset()
{
totalRel = 0;
@@ -228,18 +228,18 @@ namespace UniversalDetector.Core
done = false;
}
}
-
+
protected abstract int GetOrder(byte[] buf, int offset, out int charLen);
-
+
protected abstract int GetOrder(byte[] buf, int offset);
-
- public bool GotEnoughData()
+
+ public bool GotEnoughData()
{
return totalRel > ENOUGH_REL_THRESHOLD;
}
-
+
}
-
+
public class SJISContextAnalyser : JapaneseContextAnalyser
{
private const byte HIRAGANA_FIRST_BYTE = 0x82;
@@ -247,10 +247,10 @@ namespace UniversalDetector.Core
protected override int GetOrder(byte[] buf, int offset, out int charLen)
{
//find out current char's byte length
- if (buf[offset] >= 0x81 && buf[offset] <= 0x9F
+ if (buf[offset] >= 0x81 && buf[offset] <= 0x9F
|| buf[offset] >= 0xe0 && buf[offset] <= 0xFC)
charLen = 2;
- else
+ else
charLen = 1;
// return its order if it is hiragana
@@ -259,7 +259,7 @@ namespace UniversalDetector.Core
if (low >= 0x9F && low <= 0xF1)
return low - 0x9F;
}
- return -1;
+ return -1;
}
protected override int GetOrder(byte[] buf, int offset)
@@ -274,15 +274,15 @@ namespace UniversalDetector.Core
}
}
-
+
public class EUCJPContextAnalyser : JapaneseContextAnalyser
{
private const byte HIRAGANA_FIRST_BYTE = 0xA4;
-
+
protected override int GetOrder(byte[] buf, int offset, out int charLen)
{
byte high = buf[offset];
-
+
//find out current char's byte length
if (high == 0x8E || high >= 0xA1 && high <= 0xFE)
charLen = 2;
@@ -297,9 +297,9 @@ namespace UniversalDetector.Core
if (low >= 0xA1 && low <= 0xF3)
return low - 0xA1;
}
- return -1;
+ return -1;
}
-
+
protected override int GetOrder(byte[] buf, int offset)
{
// We are only interested in Hiragana
@@ -309,7 +309,7 @@ namespace UniversalDetector.Core
return low - 0xA1;
}
return -1;
- }
+ }
}
}