/**
 * Matching function shared among the 2022 detectors JP, CN and KR.
 * Counts up the number of legal and unrecognized escape sequences in
 * the sample of text, and computes a score based on the total number &
 * the proportion that fit the encoding.
 *
 * @param text                   the byte buffer containing text to analyse
 * @param textLen                the size of the text in bytes.
 * @param escapeSequences        the byte escape sequences to test for. Each row is
 *                               5 bytes wide and zero-padded; rows are expected to
 *                               begin with ESC (0x1b), so byte 0 is not re-checked.
 * @param escapeSequences_length the number of rows in escapeSequences.
 * @return match quality, in the range of 0-100.
 */
int32_t CharsetRecog_2022::match_2022(const uint8_t *text, int32_t textLen, const uint8_t escapeSequences[][5], int32_t escapeSequences_length) const
{
    int32_t hits   = 0;   // ESC bytes that start a sequence found in the table
    int32_t misses = 0;   // ESC bytes that match no table entry
    int32_t shifts = 0;   // SO (0x0e) / SI (0x0f) bytes seen outside matched sequences

    for (int32_t i = 0; i < textLen; i += 1) {
        const uint8_t current = text[i];

        if (current == 0x1b) {
            int32_t matchedLength = 0;

            for (int32_t escN = 0; escN < escapeSequences_length && matchedLength == 0; escN += 1) {
                const uint8_t *seq = escapeSequences[escN];

                // Sequence length is the number of bytes before the first 0x00,
                // capped at the row width so a full row is never over-read.
                int32_t seqLength = 0;
                while (seqLength < 5 && seq[seqLength] != 0x00) {
                    seqLength += 1;
                }

                // Skip empty rows, and rows longer than the text that remains.
                if (seqLength == 0 || textLen - i < seqLength) {
                    continue;
                }

                // text[i] is already known to be ESC; compare the remaining bytes.
                int32_t j = 1;
                while (j < seqLength && seq[j] == text[i + j]) {
                    j += 1;
                }

                if (j == seqLength) {
                    matchedLength = seqLength;
                }
            }

            if (matchedLength > 0) {
                hits += 1;
                // Skip the rest of the recognized sequence; the loop increment
                // then moves to the first byte after it.
                i += matchedLength - 1;
            } else {
                misses += 1;
            }
        } else if (current == 0x0e || current == 0x0f) {
            shifts += 1;
        }
    }

    // No recognized escape sequence at all: not a match. This also keeps the
    // division below from ever seeing a zero denominator.
    if (hits == 0) {
        return 0;
    }

    //
    // Initial quality is based on relative proportion of recognized vs.
    //   unrecognized escape sequences.
    //   All good:  quality = 100;
    //   half or less good: quality = 0;
    //   linear in between.
    int32_t quality = (100*hits - 100*misses) / (hits + misses);

    // Back off quality if there were too few escape sequences seen.
    //   Include shifts in this computation, so that KR does not get penalized
    //   for having only a single Escape sequence, but many shifts.
    if (hits+shifts < 5) {
        quality -= (5-(hits+shifts))*10;
    }

    if (quality < 0) {
        quality = 0;
    }

    return quality;
}
// Escape sequences tested for by the 2022-JP detector. Every row is 5 bytes
// wide, starts with ESC (0x1b) and is zero-padded; the trailing comment on
// each row names the character set the sequence designates.
static const uint8_t escapeSequences_2022JP[][5] = {
    {0x1b, 0x24, 0x28, 0x43, 0x00},   // KS X 1001:1992
    {0x1b, 0x24, 0x28, 0x44, 0x00},   // JIS X 212-1990
    {0x1b, 0x24, 0x40, 0x00, 0x00},   // JIS C 6226-1978
    {0x1b, 0x24, 0x41, 0x00, 0x00},   // GB 2312-80
    {0x1b, 0x24, 0x42, 0x00, 0x00},   // JIS X 208-1983
    {0x1b, 0x26, 0x40, 0x00, 0x00},   // JIS X 208 1990, 1997
    {0x1b, 0x28, 0x42, 0x00, 0x00},   // ASCII
    {0x1b, 0x28, 0x48, 0x00, 0x00},   // JIS-Roman
    {0x1b, 0x28, 0x49, 0x00, 0x00},   // Half-width katakana
    {0x1b, 0x28, 0x4a, 0x00, 0x00},   // JIS-Roman
    {0x1b, 0x2e, 0x41, 0x00, 0x00},   // ISO 8859-1
    {0x1b, 0x2e, 0x46, 0x00, 0x00}    // ISO 8859-7
};
// Die Informationen auf dieser Webseite wurden
// nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
// noch Qualität der bereitgestellten Informationen zugesichert.
// Bemerkung:
// Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.