#ifdef U_ENABLE_GENERIC_ISO_2022 /* * I am disabling the generic ISO-2022 converter after proposing to do so on * the icu mailing list two days ago. * * Reasons: * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of * its designation sequences, single shifts with return to the previous state, * switch-with-no-return to UTF-16BE or similar, etc. * This is unlike the language-specific variants like ISO-2022-JP which * require a much smaller repertoire of ISO-2022 features. * These variants continue to be supported. * 2. I believe that no one is really using the generic ISO-2022 converter * but rather always one of the language-specific variants. * Note that ICU's generic ISO-2022 converter has always output one escape * sequence followed by UTF-8 for the whole stream. * 3. Switching between subcharsets is extremely slow, because each time * the previous converter is closed and a new one opened, * without any kind of caching, least-recently-used list, etc. * 4. The code is currently buggy, and given the above it does not seem * reasonable to spend the time on maintenance. * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings. * This means, for example, that when ISO-8859-7 is designated, the following * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff. * The ICU ISO-2022 converter does not handle this - and has no information * about which subconverter would have to be shifted vs. which is designed * for 7-bit ISO-2022. * * Markus Scherer 2003-dec-03
*/ #endif
/*
 * ISO 2022 transports the GR half of a graphic character set in the GL range
 * by subtracting 0x80 from every byte:
 *   - 94-character sets: native bytes A1..FE are encoded as 21..7E.
 *   - 96-character sets: native bytes A0..FF are encoded as 20..7F.
 * C1 control codes (native bytes 80..9F) must not be encoded as 00..1F,
 * because those are the C0 control codes.
 */
enum {
    /* 96-character sets span the whole GR area */
    GR96_START = 0xa0,
    GR96_END   = 0xff,
    /* 94-character sets leave out the first and the last GR position */
    GR94_START = GR96_START + 1,
    GR94_END   = GR96_END - 1
};
/*
 * True for the three control codes that steer an ISO 2022 byte stream:
 * SO (0x0e), SI (0x0f) and ESC (0x1b). These must not be converted from
 * Unicode because they would mess up the byte stream.
 * The constant 0x0800c000 has exactly the bits 0xe, 0xf and 0x1b set; the
 * macro shifts the bit for c down to position 0 and tests it.
 * Note: c is evaluated twice.
 */
#define IS_2022_CONTROL(c) ((c)<0x20 && (((uint32_t)0x0800c000>>(c))&1)!=0)
/*
 * Charset/state numbers for the ISO-2022-JP and -CN implementations.
 * The JP and CN groups reuse the same small integers (1, 2, 3); a value is
 * only meaningful together with the variant it is used for.
 */
typedef enum {
    /* shared values */
    INVALID_STATE=-1,
    ASCII = 0,

    /*
     * Single-shift pseudo states, used by the escape sequence state machine.
     * NOTE(review): 0x10/0x11 keep these clear of every charset number
     * (all below 0x10) and of the CNS 11643 planes (0x20 and up) - confirm
     * against the nextStateToUnicodeJP/CN tables.
     */
    SS2_STATE=0x10,
    SS3_STATE,

    /*
     * JP
     * The values index escSeqChars[]/escSeqCharsLen[] and are used as bit
     * numbers by CSM(), so they must stay below 16 and in this order.
     */
    ISO8859_1 = 1,
    ISO8859_7 = 2,
    JISX201 = 3,
    JISX208 = 4,
    JISX212 = 5,
    GB2312 = 6,
    KSC5601 = 7,
    HWKANA_7BIT = 8,

    /* CN */
    /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
    GB2312_1=1,
    ISO_IR_165=2,
    CNS_11643=3,

    /*
     * these are used in StateEnum and ISO2022State variables,
     * but CNS_11643 must be used to index into myConverterArray[]
     */
    CNS_11643_0=0x20,
    CNS_11643_1,
    CNS_11643_2,
    CNS_11643_3,
    CNS_11643_4,
    CNS_11643_5,
    CNS_11643_6,
    CNS_11643_7
} StateEnum;
/*
 * Is the StateEnum charset value for a DBCS charset?
 * In an HTML-only build JIS X 0208 is the only double-byte charset tested for;
 * otherwise every charset number from JISX208 through KSC5601 counts.
 * Note: cs is evaluated twice in the non-HTML form.
 */
#if UCONFIG_ONLY_HTML_CONVERSION
#define IS_JP_DBCS(cs) (JISX208==(cs))
#else
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
#endif
/* Charset mask: a value with only bit number cs set. */
#define CSM(cs) (((uint16_t)1) << (cs))
/* * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x * * Note: The converter uses some leniency: * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in * all versions, not just JIS7 and JIS8. * - ICU does not distinguish between different versions of JIS X 0208.
*/ #if UCONFIG_ONLY_HTML_CONVERSION enum { MAX_JA_VERSION=0 }; #else enum { MAX_JA_VERSION=4 }; #endif staticconst uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT), #if !UCONFIG_ONLY_HTML_CONVERSION
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7) #endif
};
/*
 * Designation and invocation state of one conversion direction:
 * which charset is designated to each of G0..G3 and which Gn is invoked.
 */
typedef struct ISO2022State {
    int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t prevG;       /* g before single shift (SS2 or SS3) */
} ISO2022State;
/* Result of matching the bytes seen so far against the escape sequence tables. */
typedef enum
{
    INVALID_2022 = -1,              /* Doesn't correspond to a valid iso 2022 escape sequence */
    VALID_NON_TERMINAL_2022 = 0,    /* so far corresponds to a valid iso 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,        /* corresponds to a valid iso 2022 escape sequence */
    VALID_MAYBE_TERMINAL_2022 = 2   /* so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence */
} UCNV_TableStates_2022;
/*
 * The way these state transition arrays work is:
 * ex : ESC$B is the sequence for JISX208
 *      a) First Iteration: char is ESC
 *          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
 *             int x = normalize_esq_chars_2022[27] which is equal to 1
 *         ii) Search for this value in escSeqStateTable_Key_2022[]
 *             value of x is stored at escSeqStateTable_Key_2022[0]
 *        iii) Save this index as offset
 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 *      b) Switch on this state and continue to next char
 *          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
 *             which is normalize_esq_chars_2022[36] == 4
 *         ii) x is currently 1(from above)
 *               x<<=5 -- x is now 32
 *               x+=normalize_esq_chars_2022[36]
 *               now x is 36
 *        iii) Search for this value in escSeqStateTable_Key_2022[]
 *             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
 *      c) Switch on this state and continue to next char
 *          i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
 *         ii) x is currently 36 (from above)
 *               x<<=5 -- x is now 1152
 *               x+=normalize_esq_chars_2022[66]
 *               now x is 1161
 *        iii) Search for this value in escSeqStateTable_Key_2022[]
 *             value of x is stored at escSeqStateTable_Key_2022[21], so offset is 21
 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[21]
 *             escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
 *          v) Get the converter name from escSeqStateTable_Result_2022[21] which is JISX208
 */
/*Below are the 3 arrays depicting a state transition table*/ staticconst int8_t normalize_esq_chars_2022[256] = { /* 0 1 2 3 4 5 6 7 8 9 */
#ifdef U_ENABLE_GENERIC_ISO_2022 /* * When the generic ISO-2022 converter is completely removed, not just disabled * per #ifdef, then the following state table and the associated tables that are * dimensioned with MAX_STATES_2022 should be trimmed. * * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of * the associated escape sequences starting with ESC ( B should be removed. * This includes the ones with key values 1097 and all of the ones above 1000000. * * For the latter, the tables can simply be truncated. * For the former, since the tables must be kept parallel, it is probably best * to simply duplicate an adjacent table cell, parallel in all tables. * * It may make sense to restructure the tables, especially by using small search * tables for the variants instead of indexing them parallel to the table here.
*/ #endif
staticinlinevoid
setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){ /* in ISO-2022-KR the designator sequence appears only once * in a file so we append it only once
*/ if( converter->charErrorBufferLength==0){
uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
myConverterData->currentType = ASCII1;
cnv->fromUnicodeStatus =false; if(pArgs->locale){
uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)-1);
}
version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
myConverterData->version = version; if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
(myLocale[2]=='_' || myLocale[2]=='\0'))
{ /* open the required converters and cache them */ if(version>MAX_JA_VERSION) { // ICU 55 fails to open a converter for an unsupported version. // Previously, it fell back to version 0, but that would yield // unexpected behavior.
*errorCode = U_MISSING_RESOURCE_ERROR; return;
} if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
myConverterData->myConverterArray[ISO8859_7] =
ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
}
myConverterData->myConverterArray[JISX208] =
ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode); if(jpCharsetMasks[version]&CSM(JISX212)) {
myConverterData->myConverterArray[JISX212] =
ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
} if(jpCharsetMasks[version]&CSM(GB2312)) {
myConverterData->myConverterArray[GB2312] =
ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
} if(jpCharsetMasks[version]&CSM(KSC5601)) {
myConverterData->myConverterArray[KSC5601] =
ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
}
/* set the function pointers to appropriate functions */
cnv->sharedData = const_cast<UConverterSharedData*>(&_ISO2022JPData);
uprv_strcpy(myConverterData->locale,"ja");
(void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
size_t len = uprv_strlen(myConverterData->name);
myConverterData->name[len] = static_cast<char>(myConverterData->version + static_cast<int>('0'));
myConverterData->name[len+1]='\0';
} #if !UCONFIG_ONLY_HTML_CONVERSION elseif(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
(myLocale[2]=='_' || myLocale[2]=='\0'))
{ if(version>1) { // ICU 55 fails to open a converter for an unsupported version. // Previously, it fell back to version 0, but that would yield // unexpected behavior.
*errorCode = U_MISSING_RESOURCE_ERROR; return;
} constchar *cnvName; if(version==1) {
cnvName="icu-internal-25546";
} else {
cnvName="ibm-949";
myConverterData->version=version=0;
} if(pArgs->onlyTestIsLoadable) {
ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */
uprv_free(cnv->extraInfo);
cnv->extraInfo=nullptr; return;
} else {
myConverterData->currentConverter=ucnv_open(cnvName, errorCode); if (U_FAILURE(*errorCode)) {
_ISO2022Close(cnv); return;
}
/* initialize the state variables */
setInitialStateToUnicodeKR(cnv, myConverterData);
setInitialStateFromUnicodeKR(cnv, myConverterData);
/* set the function pointers to appropriate functions */
cnv->sharedData = const_cast<UConverterSharedData*>(&_ISO2022KRData);
uprv_strcpy(myConverterData->locale,"ko");
}
} elseif(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
(myLocale[2]=='_' || myLocale[2]=='\0'))
{ if(version>2) { // ICU 55 fails to open a converter for an unsupported version. // Previously, it fell back to version 0, but that would yield // unexpected behavior.
*errorCode = U_MISSING_RESOURCE_ERROR; return;
}
/* open the required converters and cache them */
myConverterData->myConverterArray[GB2312_1] =
ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); if(version==1) {
myConverterData->myConverterArray[ISO_IR_165] =
ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
}
myConverterData->myConverterArray[CNS_11643] =
ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
/* set the function pointers to appropriate functions */
cnv->sharedData = const_cast<UConverterSharedData*>(&_ISO2022CNData);
uprv_strcpy(myConverterData->locale,"cn");
cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; /* initialize the state variables */
uprv_strcpy(myConverterData->name,"ISO_2022"); #else
*errorCode = U_MISSING_RESOURCE_ERROR; // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard // data loading error code. return; #endif
}
if (converter->extraInfo != nullptr) { /*close the array of converter pointers and free the memory*/ for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { if(array[i]!=nullptr) {
ucnv_unloadSharedDataIfReady(array[i]);
}
}
togo = normalize_esq_chars_2022[static_cast<uint8_t>(c)]; if(togo == 0) { /* not a valid character anywhere in an escape sequence */
*key = 0;
*offset = 0; return INVALID_2022;
}
togo = (*key << 5) + togo;
while (hi != low) /*binary search*/{
int32_t mid = (hi+low) >> 1; /*Finds median*/
if (mid == oldmid) break;
if (escSeqStateTable_Key_2022[mid] > togo){
hi = mid;
} elseif (escSeqStateTable_Key_2022[mid] < togo){
low = mid;
} else/*we found it*/{
*key = togo;
*offset = mid; returnstatic_cast<UCNV_TableStates_2022>(escSeqStateTable_Value_2022[mid]);
}
oldmid = mid;
}
*key = 0;
*offset = 0; return INVALID_2022;
}
/*runs through a state machine to determine the escape sequence - codepage correspondence
*/ staticvoid
changeState_2022(UConverter* _this, constchar** source, constchar* sourceLimit,
Variant2022 var,
UErrorCode* err){
UCNV_TableStates_2022 value;
UConverterDataISO2022* myData2022 = static_cast<UConverterDataISO2022*>(_this->extraInfo);
uint32_t key = myData2022->key;
int32_t offset = 0;
int8_t initialToULength = _this->toULength; char c;
value = VALID_NON_TERMINAL_2022; while (*source < sourceLimit) {
c = *(*source)++;
_this->toUBytes[_this->toULength++] = static_cast<uint8_t>(c);
value = getKey_2022(c, reinterpret_cast<int32_t*>(&key), &offset);
switch (value){
case VALID_NON_TERMINAL_2022 : /* continue with the loop */ break;
case VALID_TERMINAL_2022:
key = 0; goto DONE;
case INVALID_2022: goto DONE;
case VALID_MAYBE_TERMINAL_2022: #ifdef U_ENABLE_GENERIC_ISO_2022 /* ESC ( B is ambiguous only for ISO_2022 itself */ if(var == ISO_2022) { /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
_this->toULength = 0;
/* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
/* continue with the loop */
value = VALID_NON_TERMINAL_2022; break;
} else #endif
{ /* not ISO_2022 itself, finish here */
value = VALID_TERMINAL_2022;
key = 0; goto DONE;
}
}
}
DONE:
myData2022->key = key;
if (value == VALID_NON_TERMINAL_2022) { /* indicate that the escape sequence is incomplete: key!=0 */ return;
} elseif (value == INVALID_2022 ) {
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
} else/* value == VALID_TERMINAL_2022 */ { switch(var){ #ifdef U_ENABLE_GENERIC_ISO_2022 case ISO_2022:
{ constchar *chosenConverterName = escSeqStateTable_Result_2022[offset]; if(chosenConverterName == nullptr) { /* SS2 or SS3 */
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
_this->toUCallbackReason = UCNV_UNASSIGNED; return;
}
_this->mode = UCNV_SI;
ucnv_close(myData2022->currentConverter);
myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err); if(U_SUCCESS(*err)) {
myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
_this->mode = UCNV_SO;
} break;
} #endif case ISO_2022_JP:
{
StateEnum tempState = static_cast<StateEnum>(nextStateToUnicodeJP[offset]); switch(tempState) { case INVALID_STATE:
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE; break; case SS2_STATE: if(myData2022->toU2022State.cs[2]!=0) { if(myData2022->toU2022State.g<2) {
myData2022->toU2022State.prevG=myData2022->toU2022State.g;
}
myData2022->toU2022State.g=2;
} else { /* illegal to have SS2 before a matching designator */
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
} break; /* case SS3_STATE: not used in ISO-2022-JP-x */ case ISO8859_1: case ISO8859_7: if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
} else { /* G2 charset for SS2 */
myData2022->toU2022State.cs[2] = static_cast<int8_t>(tempState);
} break; default: if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
} else { /* G0 charset */
myData2022->toU2022State.cs[0] = static_cast<int8_t>(tempState);
} break;
}
} break; #if !UCONFIG_ONLY_HTML_CONVERSION case ISO_2022_CN:
{
StateEnum tempState = static_cast<StateEnum>(nextStateToUnicodeCN[offset]); switch(tempState) { case INVALID_STATE:
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE; break; case SS2_STATE: if(myData2022->toU2022State.cs[2]!=0) { if(myData2022->toU2022State.g<2) {
myData2022->toU2022State.prevG=myData2022->toU2022State.g;
}
myData2022->toU2022State.g=2;
} else { /* illegal to have SS2 before a matching designator */
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
} break; case SS3_STATE: if(myData2022->toU2022State.cs[3]!=0) { if(myData2022->toU2022State.g<2) {
myData2022->toU2022State.prevG=myData2022->toU2022State.g;
}
myData2022->toU2022State.g=3;
} else { /* illegal to have SS3 before a matching designator */
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
} break; case ISO_IR_165: if(myData2022->version==0) {
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE; break;
}
U_FALLTHROUGH; case GB2312_1:
U_FALLTHROUGH; case CNS_11643_1:
myData2022->toU2022State.cs[1] = static_cast<int8_t>(tempState); break; case CNS_11643_2:
myData2022->toU2022State.cs[2] = static_cast<int8_t>(tempState); break; default: /* other CNS 11643 planes */ if(myData2022->version==0) {
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
} else {
myData2022->toU2022State.cs[3] = static_cast<int8_t>(tempState);
} break;
}
} break; case ISO_2022_KR: if(offset==0x30){ /* nothing to be done, just accept this one escape sequence */
} else {
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
} break; #endif// !UCONFIG_ONLY_HTML_CONVERSION
default:
*err = U_ILLEGAL_ESCAPE_SEQUENCE; break;
}
} if(U_SUCCESS(*err)) {
_this->toULength = 0;
} elseif(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { if(_this->toULength>1) { /* * Ticket 5691: consistent illegal sequences: * - We include at least the first byte (ESC) in the illegal sequence. * - If any of the non-initial bytes could be the start of a character, * we stop the illegal sequence before the first one of those. * In escape sequences, all following bytes are "printable", that is, * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS), * they are valid single/lead bytes. * For simplicity, we always only report the initial ESC byte as the * illegal sequence and back out all other bytes we looked at.
*/ /* Back out some bytes. */
int8_t backOutDistance=_this->toULength-1;
int8_t bytesFromThisBuffer=_this->toULength-initialToULength; if(backOutDistance<=bytesFromThisBuffer) { /* same as initialToULength<=1 */
*source-=backOutDistance;
} else { /* Back out bytes from the previous buffer: Need to replay them. */
_this->preToULength = static_cast<int8_t>(bytesFromThisBuffer - backOutDistance); /* same as -(initialToULength-1) */ /* preToULength is negative! */
uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
*source-=bytesFromThisBuffer;
}
_this->toULength=1;
}
} elseif(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
_this->toUCallbackReason = UCNV_UNASSIGNED;
}
}
#if !UCONFIG_ONLY_HTML_CONVERSION /*Checks the characters of the buffer against valid 2022 escape sequences *if the match we return a pointer to the initial start of the sequence otherwise *we return sourceLimit
*/ /*for 2022 looks ahead in the stream *to determine the longest possible convertible *data stream
*/ staticinlineconstchar*
getEndOfBuffer_2022(constchar** source, constchar* sourceLimit,
UBool /*flush*/){
constchar* mySource = *source;
#ifdef U_ENABLE_GENERIC_ISO_2022 if (*source >= sourceLimit) return sourceLimit;
do{
if (*mySource == ESC_2022){
int8_t i;
int32_t key = 0;
int32_t offset;
UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
/* Kludge: I could not * figure out the reason for validating an escape sequence * twice - once here and once in changeState_2022(). * is it possible to have an ESC character in a ISO2022 * byte stream which is valid in a code page? Is it legal?
*/ for (i=0;
(mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
i++) {
value = getKey_2022(*(mySource+i), &key, &offset);
} if (value > 0 || *mySource==ESC_2022) return mySource;
/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c * any future change in _MBCSFromUChar32() function should be reflected here. * @return number of bytes in *value; negative number if fallback; 0 if no mapping
*/ staticinline int32_t
MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
UChar32 c,
uint32_t* value,
UBool useFallback, int outputType)
{ const int32_t *cx; const uint16_t *table;
uint32_t stage2Entry;
uint32_t myValue;
int32_t length; const uint8_t *p; /* * TODO(markus): Use and require new, faster MBCS conversion table structures. * Use internal version of ucnv_open() that verifies that the new structures are available, * else U_INTERNAL_PROGRAM_ERROR.
*/ /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
table=sharedData->mbcs.fromUnicodeTable;
stage2Entry=MBCS_STAGE_2_FROM_U(table, c); /* get the bytes and the length for the output */ if(outputType==MBCS_OUTPUT_2){
myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); if(myValue<=0xff) {
length=1;
} else {
length=2;
}
} else/* outputType==MBCS_OUTPUT_3 */ {
p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
myValue = (static_cast<uint32_t>(*p) << 16) | (static_cast<uint32_t>(p[1]) << 8) | p[2]; if(myValue<=0xff) {
length=1;
} elseif(myValue<=0xffff) {
length=2;
} else {
length=3;
}
} /* is this code point assigned, or do we use fallbacks? */ if((stage2Entry&(1<<(16+(c&0xf))))!=0) { /* assigned */
*value=myValue; return length;
} elseif(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { /* * We allow a 0 byte output if the "assigned" bit is set for this entry. * There is no way with this data structure for fallback output * to be a zero byte.
*/
*value=myValue; return -length;
}
}
/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c * any future change in _MBCSSingleFromUChar32() function should be reflected here. * @param retval pointer to output byte * @return 1 roundtrip byte 0 no mapping -1 fallback byte
*/ staticinline int32_t
MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
UChar32 c,
uint32_t* retval,
UBool useFallback)
{ const uint16_t *table;
int32_t value; /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { return 0;
} /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
table=sharedData->mbcs.fromUnicodeTable; /* get the byte for the output */
value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); /* is this code point assigned, or do we use fallbacks? */
*retval = static_cast<uint32_t>(value & 0xff); if(value>=0xf00) { return 1; /* roundtrip */
} elseif(useFallback ? value>=0x800 : value>=0xc00) { return -1; /* fallback taken */
} else { return 0; /* no mapping */
}
}
/*
 * Check that the result is a 2-byte value with each byte in the range A1..FE
 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
 * to move it to the ISO 2022 range 21..7E.
 * Return 0 if out of range.
 *
 * The range test is done on the full 32-bit value, so an input with bits
 * above the low 16 is rejected rather than being truncated to 16 bits and
 * accepted.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    /* unsigned wrap-around turns "value < 0xa1a1" into a huge number that fails the test */
    if (value - 0xa1a1 <= 0xfefeu - 0xa1a1u &&
        static_cast<uint8_t>(value - 0xa1) <= (0xfe - 0xa1)
    ) {
        return value - 0x8080;  /* shift down to 21..7e byte range */
    } else {
        return 0;  /* not valid for ISO 2022 */
    }
}
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
 * unchanged.
 *
 * Compiled out; kept only for reference.
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    uint32_t returnValue = value + 0x8080;
    if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
        return returnValue;
    } else {
        return value;
    }
}
#endif
realSourceLimit = args->sourceLimit; while (args->source < realSourceLimit) { if(myData->key == 0) { /* are we in the middle of an escape sequence? */ /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
/* convert to before the ESC or until the end of the buffer */
myData->isFirstBuffer=false;
sourceStart = args->source;
myTargetStart = args->target;
args->converter = myData->currentConverter;
ucnv_toUnicode(args->converter,
&args->target,
args->targetLimit,
&args->source,
mySourceLimit,
args->offsets,
(UBool)(args->flush && mySourceLimit == realSourceLimit),
err);
args->converter = saveThis;
/************************************** IMPORTANT ************************************************** * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32(). * The converter iterates over each Unicode codepoint * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is * processed one char at a time it would make sense to reduce the extra processing a canned converter * would do as far as possible. * * If the implementation of these macros or structure of sharedData struct change in the future, make * sure that ISO-2022 is also changed. ***************************************************************************************************
*/
/***************************************************************************************************
 * Rules for ISO-2022-jp encoding
 * (i)   Escape sequences must be fully contained within a line they should not
 *       span new lines or CRs
 * (ii)  If the last character on a line is represented by two bytes then an ASCII or
 *       JIS-Roman character escape sequence should follow before the line terminates
 * (iii) If the first character on the line is represented by two bytes then a two
 *       byte character escape sequence should precede it
 * (iv)  If no escape sequence is encountered then the characters are ASCII
 * (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
 *       and invoked with SS2 (ESC N).
 * (vi)  If there is any G0 designation in text, there must be a switch to
 *       ASCII or to JIS X 0201-Roman before a space character (but not
 *       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
 *       characters such as tab or CRLF.
 * (vii) Supported encodings:
 *          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
 *
 *  source : RFC-1554
 *
 *          JISX201, JISX208,JISX212 : new .cnv data files created
 *          KSC5601 : alias to ibm-949 mapping table
 *          GB2312 : alias to ibm-1386 mapping table
 *          ISO-8859-1 : Algorithmic implemented as LATIN1 case
 *          ISO-8859-7 : alias to ibm-9409 mapping table
 */
/* preference order of JP charsets */ staticconst StateEnum jpCharsetPref[]={
ASCII,
JISX201,
ISO8859_1,
JISX208,
ISO8859_7,
JISX212,
GB2312,
KSC5601,
HWKANA_7BIT
};
/*
 * Designator escape sequences for the JP charsets, indexed by charset number.
 * The escape sequences must be in order of the enum constants like JISX201 = 3,
 * not in order of jpCharsetPref[]!
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */
};

/* Byte length of each entry of escSeqChars[], in the same order. */
static const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};
/*
 * The iteration over various code pages works this way:
 * i)   Get the currentState from myConverterData->currentState
 * ii)  Check if the character is mapped to a valid character in the currentState
 *      Yes ->  a) set the initIterState to currentState
 *              b) remain in this state until an invalid character is found
 *      No  ->  a) go to the next code page and find the character
 * iii) Before changing the state increment the current state check if the current state
 *      is equal to the initIterState
 *      Yes ->  A character that cannot be represented in any of the supported encodings
 *              break and return a U_INVALID_CHARACTER error
 *      No  ->  Continue and find the character in next code page
 *
 *
 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
 */
/*
 * Map 00..7F to Unicode according to JIS X 0201.
 * Identical to ASCII except that 5C maps to U+00A5 and 7E maps to U+203E.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    if(value < 0x5c) {
        return value;
    } else if(value == 0x5c) {
        return 0xa5;
    } else if(value == 0x7e) {
        return 0x203e;
    } else /* value <= 0x7f */ {
        return value;
    }
}
/*
 * Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable.
 * U+005C and U+007E are unmappable because their byte values are taken by
 * U+00A5 (5C) and U+203E (7E).
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    if(value<=0x7f) {
        if(value!=0x5c && value!=0x7e) {
            return value;
        }
    } else if(value==0xa5) {
        return 0x5c;
    } else if(value==0x203e) {
        return 0x7e;
    }
    return 0xfffe;
}
/* * Take a valid Shift-JIS byte pair, check that it is in the range corresponding * to JIS X 0208, and convert it to a pair of 21..7E bytes. * Return 0 if the byte pair is out of range.
*/ staticinline uint32_t
_2022FromSJIS(uint32_t value) {
uint8_t trail;
if(value > 0xEFFC) { return 0; /* beyond JIS X 0208 */
}
trail = static_cast<uint8_t>(value);
value &= 0xff00; /* lead byte */ if(value <= 0x9f00) {
value -= 0x7000;
} else/* 0xe000 <= value <= 0xef00 */ {
value -= 0xb000;
}
value <<= 1;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.