// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 2005-2016, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_CONVERSION
#include "cmemory.h"
#include "csmatch.h"
#include "csrmbcs.h"
#include <math.h>
U_NAMESPACE_BEGIN
// Function-like macro yielding the smaller of x and y.
// Each argument may be evaluated twice, so pass only side-effect-free expressions.
#define min(x,y) (((x)<(y))?(x):(y))
// Frequently occurring 16-bit character values used when scoring Shift_JIS input.
// The values are listed in ascending order; the table is searched with binarySearch().
//
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_sjis[] = {
    0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
    0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
    0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
    0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
    0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
    0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
// Frequently occurring 16-bit character values used when scoring EUC-JP input.
// The values are listed in ascending order; the table is searched with binarySearch().
//
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_euc_jp[] = {
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
    0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
    0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
    0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
    0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
    0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
    0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
    0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
    0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
    0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
// Frequently occurring 16-bit character values used when scoring EUC-KR input.
// The values are listed in ascending order; the table is searched with binarySearch().
//
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_euc_kr[] = {
    0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
    0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
    0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
    0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
    0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
    0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
    0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
    0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
    0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
    0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
// Frequently occurring 16-bit character values used when scoring Big5 input.
// The values are listed in ascending order; the table is searched with binarySearch().
//
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_big5[] = {
    0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
    0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
    0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
    0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
    0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
    0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
    0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
    0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
    0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
    0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
// Frequently occurring 16-bit character values used when scoring GB18030 input.
// The values are listed in ascending order; the table is searched with binarySearch().
//
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
static const uint16_t commonChars_gb_18030[] = {
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
    0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
    0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
    0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
    0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
    0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
    0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
    0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
    0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
    0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
/**
 * Binary search for a 16-bit value in an array sorted in ascending order.
 *
 * @param array  the values to search; must be sorted ascending.
 * @param len    number of elements in array.
 * @param value  the value to look for.
 * @return the index of value in array, or -1 if it is not present
 *         (which includes the case len <= 0).
 */
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t start = 0;
    int32_t end = len - 1;

    while (start <= end) {
        // Computed from the width of the range so the sum start+end is never
        // formed; that sum could overflow int32_t for very large arrays.
        const int32_t mid = start + (end - start) / 2;

        if (array[mid] == value) {
            return mid;
        }

        if (array[mid] < value) {
            start = mid + 1;
        } else {
            end = mid - 1;
        }
    }

    return -1;
}
/**
 * Initial iterator state: charValue 0, index -1, nextIndex 0,
 * with both the error and done flags cleared.
 */
IteratedChar::IteratedChar()
    : charValue(0),
      index(-1),
      nextIndex(0),
      error(false),
      done(false)
{
    // All state is set up by the initializer list.
}
/*void IteratedChar::reset()
{
charValue = 0;
index = -1;
nextIndex = 0;
error = false;
done = false;
}*/
/**
 * Fetches the next raw input byte and advances nextIndex past it.
 *
 * @param det  the input text whose fRawInput / fRawLength are read.
 * @return the byte at the old nextIndex, or -1 (with done set to true)
 *         once nextIndex has reached fRawLength.
 */
int32_t IteratedChar::nextByte(InputText *det)
{
    if (nextIndex < det->fRawLength) {
        return det->fRawInput[nextIndex++];
    }

    // No more input: remember that, and signal it to the caller.
    done = true;
    return -1;
}
// Destructor with an empty body; no cleanup is performed here.
CharsetRecog_mbcs::~CharsetRecog_mbcs() {
}
int32_t CharsetRecog_mbcs::match_mbcs(InputText *det,
const uint16_t commonChars[], in
t32_t commonCharsLen) const {
int32_t doubleByteCharCount = 0 ;
int32_t commonCharCount = 0 ;
int32_t badCharCount = 0 ;
int32_t totalCharCount = 0 ;
int32_t confidence = 0 ;
IteratedChar iter;
while (nextChar(&iter, det)) {
totalCharCount++;
if (iter.error) {
badCharCount++;
} else {
if (iter.charValue > 0 xFF) {
doubleByteCharCount++;
if (commonChars != nullptr) {
if (binarySearch(commonChars, commonCharsLen, static_cast <uint16_t>(iter.charValue)) >= 0 ){
commonCharCount += 1 ;
}
}
}
}
if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
// Bail out early if the byte data is not matching the encoding scheme.
// break detectBlock;
return confidence;
}
}
if (doubleByteCharCount <= 10 && badCharCount == 0 ) {
// Not many multi-byte chars.
if (doubleByteCharCount == 0 && totalCharCount < 10 ) {
// There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
// We don't have enough data to have any confidence.
// Statistical analysis of single byte non-ASCII characters would probably help here.
confidence = 0 ;
}
else {
// ASCII or ISO file? It's probably not our encoding,
// but is not incompatible with our encoding, so don't give it a zero.
confidence = 10 ;
}
return confidence;
}
//
// No match if there are too many characters that don't fit the encoding scheme.
// (should we have zero tolerance for these?)
//
if (doubleByteCharCount < 20 *badCharCount) {
confidence = 0 ;
return confidence;
}
if (commonChars == nullptr) {
// We have no statistics on frequently occurring characters.
// Assess confidence purely on having a reasonable number of
// multi-byte characters (the more the better)
confidence = 30 + doubleByteCharCount - 20 *badCharCount;
if (confidence > 100 ) {
confidence = 100 ;
}
} else {
//
// Frequency of occurrence statistics exist.
//
double maxVal = log(static_cast <double >(doubleByteCharCount) / 4 ); /*(float)?*/
double scaleFactor = 90 .0 / maxVal;
confidence = static_cast <int32_t>(log(static_cast <double >(commonCharCount) + 1 ) * scaleFactor + 10 .0 );
confidence = min(confidence, 100 );
}
if (confidence < 0 ) {
confidence = 0 ;
}
return confidence;
}
// Destructor with an empty body; no cleanup is performed here.
CharsetRecog_sjis::~CharsetRecog_sjis() {
}
/**
 * Reads the next character from det into it.
 *
 * Bytes 0x00..0x7F and 0xA1..0xDF are taken as single-byte characters.
 * Any other lead byte consumes one more byte; the pair is flagged as an
 * error unless that second byte lies in 0x40..0x7F or 0x80..0xFE.
 *
 * @param it   iterator state; index, charValue and error are updated.
 * @param det  the input text being scanned.
 * @return false when the input is exhausted, true otherwise
 *         (including when it->error has been set).
 */
UBool CharsetRecog_sjis::nextChar(IteratedChar *it, InputText *det) const {
    it->index = it->nextIndex;
    it->error = false;

    int32_t firstByte = it->charValue = it->nextByte(det);

    if (firstByte < 0) {
        // Ran off the end of the input data.
        return false;
    }

    if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
        // Single byte character.
        return true;
    }

    int32_t secondByte = it->nextByte(det);
    if (secondByte >= 0) {
        it->charValue = (firstByte << 8) | secondByte;
    }
    // else we'll handle the error later.

    if (!((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
        // Illegal second byte value (this also covers -1, i.e. end of input).
        it->error = true;
    }

    return true;
}
/**
 * Scores det with match_mbcs() using the commonChars_sjis table, records the
 * score in results, and reports whether the score is greater than zero.
 */
UBool CharsetRecog_sjis::match(InputText *det, CharsetMatch *results) const
{
    const int32_t score = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));

    results->set(det, this, score);
    return score > 0;
}
// Charset name reported by this recognizer.
const char *CharsetRecog_sjis::getName() const {
    return "Shift_JIS";
}
// Language string reported by this recognizer.
const char *CharsetRecog_sjis::getLanguage() const {
    return "ja";
}
// Destructor with an empty body; no cleanup is performed here.
CharsetRecog_euc::~CharsetRecog_euc() {
}
/**
 * Reads the next character from det into it.
 *
 * Lead bytes up to 0x8D are single-byte characters. Every other lead byte
 * consumes a second byte; lead byte 0x8F additionally consumes a third.
 * it->error is set when the checked trail byte is below 0xA1, which also
 * covers running off the end of the input (-1).
 *
 * @param it   iterator state; index, charValue and error are updated.
 * @param det  the input text being scanned.
 * @return false when the input is exhausted, true otherwise
 *         (including when it->error has been set).
 */
UBool CharsetRecog_euc::nextChar(IteratedChar *it, InputText *det) const {
    int32_t firstByte  = 0;
    int32_t secondByte = 0;
    int32_t thirdByte  = 0;

    it->index = it->nextIndex;
    it->error = false;
    firstByte = it->charValue = it->nextByte(det);

    if (firstByte < 0) {
        // Ran off the end of the input data
        return false;
    }

    if (firstByte <= 0x8D) {
        // single byte char
        return true;
    }

    secondByte = it->nextByte(det);
    if (secondByte >= 0) {
        it->charValue = (it->charValue << 8) | secondByte;
    }
    // else we'll handle the error later.

    if (firstByte >= 0xA1 && firstByte <= 0xFE) {
        // Two byte Char
        if (secondByte < 0xA1) {
            it->error = true;
        }

        return true;
    }

    if (firstByte == 0x8E) {
        // Code Set 2.
        //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
        //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
        // We don't know which we've got.
        // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
        //   bytes will look like a well formed 2 byte char.
        if (secondByte < 0xA1) {
            it->error = true;
        }

        return true;
    }

    if (firstByte == 0x8F) {
        // Code set 3.
        // Three byte total char size, two bytes of actual char value.
        thirdByte = it->nextByte(det);

        // Only fold in a real byte: OR-ing the -1 end-of-input marker would
        // set every bit of charValue. Mirrors the second-byte handling above.
        if (thirdByte >= 0) {
            it->charValue = (it->charValue << 8) | thirdByte;
        }

        if (thirdByte < 0xa1) {
            // Bad third byte, or ran off the end of the input data.
            it->error = true;
        }
    }

    return true;
}
// Destructor with an empty body; no cleanup is performed here.
CharsetRecog_euc_jp::~CharsetRecog_euc_jp() {
}
// Charset name reported by this recognizer.
const char *CharsetRecog_euc_jp::getName() const {
    return "EUC-JP";
}
// Language string reported by this recognizer.
const char *CharsetRecog_euc_jp::getLanguage() const {
    return "ja";
}
/**
 * Scores det with match_mbcs() using the commonChars_euc_jp table, records the
 * score in results, and reports whether the score is greater than zero.
 */
UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const {
    const int32_t score = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));

    results->set(det, this, score);
    return score > 0;
}
// Destructor with an empty body; no cleanup is performed here.
CharsetRecog_euc_kr::~CharsetRecog_euc_kr() {
}
// Charset name reported by this recognizer.
const char *CharsetRecog_euc_kr::getName() const {
    return "EUC-KR";
}
// Language string reported by this recognizer.
const char *CharsetRecog_euc_kr::getLanguage() const {
    return "ko";
}
/**
 * Scores det with match_mbcs() using the commonChars_euc_kr table, records the
 * score in results, and reports whether the score is greater than zero.
 */
UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const {
    const int32_t score = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));

    results->set(det, this, score);
    return score > 0;
}
// Destructor with an empty body; no cleanup is performed here.
CharsetRecog_big5::~CharsetRecog_big5() {
}
/**
 * Reads the next character from det into it.
 *
 * Bytes 0x00..0x7F and 0xFF are taken as single-byte characters. Any other
 * lead byte consumes one more byte; the pair is flagged as an error when that
 * second byte is below 0x40 (including -1 for end of input), 0x7F, or 0xFF.
 *
 * @param it   iterator state; index, charValue and error are updated.
 * @param det  the input text being scanned.
 * @return false when the input is exhausted, true otherwise
 *         (including when it->error has been set).
 */
UBool CharsetRecog_big5::nextChar(IteratedChar *it, InputText *det) const
{
    int32_t firstByte;

    it->index = it->nextIndex;
    it->error = false;
    firstByte = it->charValue = it->nextByte(det);

    if (firstByte < 0) {
        // Ran off the end of the input data.
        return false;
    }

    if (firstByte <= 0x7F || firstByte == 0xFF) {
        // single byte character.
        return true;
    }

    int32_t secondByte = it->nextByte(det);
    if (secondByte >= 0) {
        it->charValue = (it->charValue << 8) | secondByte;
    }
    // else we'll handle the error later.

    if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
        it->error = true;
    }

    return true;
}
// Charset name reported by this recognizer.
const char *CharsetRecog_big5::getName() const {
    return "Big5";
}
// Language string reported by this recognizer.
const char *CharsetRecog_big5::getLanguage() const {
    return "zh";
}
/**
 * Scores det with match_mbcs() using the commonChars_big5 table, records the
 * score in results, and reports whether the score is greater than zero.
 */
UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const {
    const int32_t score = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));

    results->set(det, this, score);
    return score > 0;
}
// Destructor with an empty body; no cleanup is performed here.
CharsetRecog_gb_18030::~CharsetRecog_gb_18030() {
}
/**
 * Reads the next character from det into it.
 *
 * Bytes up to 0x80 are single-byte characters. For lead bytes 0x81..0xFE:
 *   - a second byte in 0x40..0x7E or 0x80..0xFE completes a two-byte character;
 *   - a second byte in 0x30..0x39, followed by a third byte in 0x81..0xFE and a
 *     fourth byte in 0x30..0x39, completes a four-byte character;
 *   - anything else (including running out of input) sets it->error.
 *
 * @param it   iterator state; index, charValue and error are updated.
 * @param det  the input text being scanned.
 * @return false when the input is exhausted, true otherwise
 *         (including when it->error has been set).
 */
UBool CharsetRecog_gb_18030::nextChar(IteratedChar *it, InputText *det) const {
    int32_t firstByte  = 0;
    int32_t secondByte = 0;
    int32_t thirdByte  = 0;
    int32_t fourthByte = 0;

    it->index = it->nextIndex;
    it->error = false;
    firstByte = it->charValue = it->nextByte(det);

    if (firstByte < 0) {
        // Ran off the end of the input data
        return false;
    }

    if (firstByte <= 0x80) {
        // single byte char
        return true;
    }

    secondByte = it->nextByte(det);
    if (secondByte >= 0) {
        it->charValue = (it->charValue << 8) | secondByte;
    }
    // else we'll handle the error later.

    if (firstByte >= 0x81 && firstByte <= 0xFE) {
        // Two byte Char.
        // The lower bound of the second range is hexadecimal 0x80; a decimal 80
        // here would make 0x7F pass as a valid trail byte.
        if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >= 0x80 && secondByte <= 0xFE)) {
            return true;
        }

        // Four byte char
        if (secondByte >= 0x30 && secondByte <= 0x39) {
            thirdByte = it->nextByte(det);

            if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
                fourthByte = it->nextByte(det);

                if (fourthByte >= 0x30 && fourthByte <= 0x39) {
                    it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;

                    return true;
                }
            }
        }

        // Something wasn't valid, or we ran out of data (-1).
        it->error = true;
    }

    return true;
}
// Charset name reported by this recognizer.
const char *CharsetRecog_gb_18030::getName() const {
    return "GB18030";
}
// Language string reported by this recognizer.
const char *CharsetRecog_gb_18030::getLanguage() const {
    return "zh";
}
/**
 * Scores det with match_mbcs() using the commonChars_gb_18030 table, records the
 * score in results, and reports whether the score is greater than zero.
 */
UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const {
    const int32_t score = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));

    results->set(det, this, score);
    return score > 0;
}
U_NAMESPACE_END
#endif
/*
 * Measurement V0.5 in percent: C=87 H=98 G=92
 * Processing time: 0.13 seconds
 * (preprocessed on 2026-06-04)
 * © Formatika GbR, Germany
 */