typedefstruct {
char16_t contextCharToUnicode; /* previous Unicode codepoint for contextual analysis */
char16_t contextCharFromUnicode; /* previous Unicode codepoint for contextual analysis */
uint16_t defDeltaToUnicode; /* delta for switching to default state when DEF is encountered */
uint16_t currentDeltaFromUnicode; /* current delta in Indic block */
uint16_t currentDeltaToUnicode; /* current delta in Indic block */
MaskEnum currentMaskFromUnicode; /* mask for current state in toUnicode */
MaskEnum currentMaskToUnicode; /* mask for current state in toUnicode */
MaskEnum defMaskToUnicode; /* mask for default state in toUnicode */
UBool isFirstBuffer; /* boolean for fromUnicode to see if we need to announce the first script */
UBool resetToDefaultToUnicode; /* boolean for resetting to default delta and mask when a newline is encountered*/ char name[sizeof(ISCII_CNV_PREFIX) + 1];
UChar32 prevToUnicodeStatus; /* Hold the previous toUnicodeStatus. This is necessary because we may need to know the last two code points. */
} UConverterDataISCII;
/**
 * The values in the validity table are indexed by the lower bits of the Unicode
 * range 0x0900 - 0x097f. The values have a structure like:
 *       ---------------------------------------------------------------
 *      |  DEV   |  PNJ  |  GJR  |  ORI  |  BNG  |  TLG  |  MLM  |  TML  |
 *      |        |       |       |       |  ASM  |  KND  |       |       |
 *       ---------------------------------------------------------------
 * If a code point is valid in a particular script
 * then that bit is turned on.
 *
 * Unicode does not distinguish between Bengali and Assamese, so we use 1 bit
 * to represent these languages.
 *
 * Telugu and Kannada have the same code points except for Vocalic RR
 * (VOCALLIC_RR), which we special-case; we combine them and use 1 bit to
 * represent these languages.
 *
 * TODO: It is probably easier to understand and maintain to change this
 * to use uint16_t and give each of the 9 Unicode/script blocks its own bit.
 */
staticconst uint8_t validityTable[128] = { /* This state table is tool generated please do not edit unless you know exactly what you are doing */ /* Note: This table was edited to mirror the Windows XP implementation */ /*ISCII:Valid:Unicode */ /*0xa0 : 0x00: 0x900 */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xa1 : 0xb8: 0x901 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO , /*0xa2 : 0xfe: 0x902 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xa3 : 0xbf: 0x903 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0x00 : 0x00: 0x904 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xa4 : 0xff: 0x905 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xa5 : 0xff: 0x906 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xa6 : 0xff: 0x907 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xa7 : 0xff: 0x908 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xa8 : 0xff: 0x909 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xa9 : 0xff: 0x90a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xaa : 0xfe: 0x90b */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0x00 : 0x00: 0x90c */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xae : 0x80: 0x90d */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xab : 0x87: 0x90e */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK , /*0xac : 0xff: 0x90f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xad : 0xff: 0x910 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + 
MLM_MASK + TML_MASK , /*0xb2 : 0x80: 0x911 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xaf : 0x87: 0x912 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK , /*0xb0 : 0xff: 0x913 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xb1 : 0xff: 0x914 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xb3 : 0xff: 0x915 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xb4 : 0xfe: 0x916 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xb5 : 0xfe: 0x917 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xb6 : 0xfe: 0x918 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xb7 : 0xff: 0x919 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xb8 : 0xff: 0x91a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xb9 : 0xfe: 0x91b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xba : 0xff: 0x91c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xbb : 0xfe: 0x91d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xbc : 0xff: 0x91e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xbd : 0xff: 0x91f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xbe : 0xfe: 0x920 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xbf : 0xfe: 0x921 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xc0 : 0xfe: 0x922 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xc1 : 0xff: 0x923 */ DEV_MASK 
+ PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xc2 : 0xff: 0x924 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xc3 : 0xfe: 0x925 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xc4 : 0xfe: 0x926 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xc5 : 0xfe: 0x927 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xc6 : 0xff: 0x928 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xc7 : 0x81: 0x929 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + TML_MASK , /*0xc8 : 0xff: 0x92a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xc9 : 0xfe: 0x92b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xca : 0xfe: 0x92c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xcb : 0xfe: 0x92d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xcc : 0xfe: 0x92e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xcd : 0xff: 0x92f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xcf : 0xff: 0x930 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xd0 : 0x87: 0x931 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + MLM_MASK + TML_MASK , /*0xd1 : 0xff: 0x932 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xd2 : 0xb7: 0x933 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + ZERO + KND_MASK + MLM_MASK + TML_MASK , /*0xd3 : 0x83: 0x934 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + MLM_MASK + TML_MASK , /*0xd4 : 0xff: 0x935 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + ZERO + KND_MASK + MLM_MASK + TML_MASK , 
/*0xd5 : 0xfe: 0x936 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xd6 : 0xbf: 0x937 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xd7 : 0xff: 0x938 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xd8 : 0xff: 0x939 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0x00 : 0x00: 0x93A */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x93B */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xe9 : 0xda: 0x93c */ DEV_MASK + PNJ_MASK + ZERO + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x93d */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xda : 0xff: 0x93e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xdb : 0xff: 0x93f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xdc : 0xff: 0x940 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xdd : 0xff: 0x941 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xde : 0xff: 0x942 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xdf : 0xbe: 0x943 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0x00 : 0x00: 0x944 */ DEV_MASK + ZERO + GJR_MASK + ZERO + BNG_MASK + KND_MASK + ZERO + ZERO , /*0xe3 : 0x80: 0x945 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xe0 : 0x87: 0x946 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK , /*0xe1 : 0xff: 0x947 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xe2 : 0xff: 0x948 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xe7 : 0x80: 0x949 */ DEV_MASK + ZERO + GJR_MASK 
+ ZERO + ZERO + ZERO + ZERO + ZERO , /*0xe4 : 0x87: 0x94a */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK , /*0xe5 : 0xff: 0x94b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xe6 : 0xff: 0x94c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xe8 : 0xff: 0x94d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xec : 0x00: 0x94e */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xed : 0x00: 0x94f */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x950 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x951 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x952 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x953 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x954 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x955 */ ZERO + ZERO + ZERO + ZERO + ZERO + KND_MASK + ZERO + ZERO , /*0x00 : 0x00: 0x956 */ ZERO + ZERO + ZERO + ORI_MASK + ZERO + KND_MASK + ZERO + ZERO , /*0x00 : 0x00: 0x957 */ ZERO + ZERO + ZERO + ORI_MASK + BNG_MASK + ZERO + MLM_MASK + ZERO , /*0x00 : 0x00: 0x958 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x959 */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x95a */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x95b */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x95c */ DEV_MASK + PNJ_MASK + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x95d */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x95e */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xce : 0x98: 0x95f */ DEV_MASK + ZERO + ZERO + ORI_MASK + 
BNG_MASK + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x960 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0x00 : 0x00: 0x961 */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0x00 : 0x00: 0x962 */ DEV_MASK + ZERO + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x963 */ DEV_MASK + ZERO + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO , /*0xea : 0xf8: 0x964 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xeaea : 0x00: 0x965*/ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xf1 : 0xff: 0x966 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xf2 : 0xff: 0x967 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xf3 : 0xff: 0x968 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xf4 : 0xff: 0x969 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xf5 : 0xff: 0x96a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xf6 : 0xff: 0x96b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xf7 : 0xff: 0x96c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xf8 : 0xff: 0x96d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xf9 : 0xff: 0x96e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xfa : 0xff: 0x96f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0x00 : 0x80: 0x970 */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /* * The length of the array is 128 to provide values for 0x900..0x97f. * The last 15 entries for 0x971..0x97f of the validity table are all zero * because no Indic script uses such Unicode code points.
*/ /*0x00 : 0x00: 0x9yz */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO
};
if ((args->converter == nullptr) || (args->targetLimit < args->target) || (args->sourceLimit < args->source)) {
*err = U_ILLEGAL_ARGUMENT_ERROR; return;
} /* initialize data */
converterData=(UConverterDataISCII*)args->converter->extraInfo;
newDelta=converterData->currentDeltaFromUnicode;
range = (uint16_t)(newDelta/DELTA);
if ((sourceChar = args->converter->fromUChar32)!=0) { goto getTrail;
}
/*writing the char to the output stream */ while (source < sourceLimit) { /* Write the language code following LF only if LF is not the last character. */ if (args->converter->fromUnicodeStatus == LF) {
targetByteUnit = ATR<<8;
targetByteUnit += (uint8_t) lookupInitialData[range].isciiLang;
args->converter->fromUnicodeStatus = 0x0000; /* now append ATR and language code */
WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err); if (U_FAILURE(*err)) { break;
}
}
/*check if input is in ASCII and C0 control codes range*/ if (sourceChar <= ASCII_END) {
args->converter->fromUnicodeStatus = sourceChar;
WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,sourceChar,err); if (U_FAILURE(*err)) { break;
} continue;
} switch (sourceChar) { case ZWNJ: /* contextChar has HALANT */ if (converterData->contextCharFromUnicode) {
converterData->contextCharFromUnicode = 0x00;
targetByteUnit = ISCII_HALANT;
} else { /* consume ZWNJ and continue */
converterData->contextCharFromUnicode = 0x00; continue;
} break; case ZWJ: /* contextChar has HALANT */ if (converterData->contextCharFromUnicode) {
targetByteUnit = ISCII_NUKTA;
} else {
targetByteUnit =ISCII_INV;
}
converterData->contextCharFromUnicode = 0x00; break; default: /* is the sourceChar in the INDIC_RANGE? */ if ((uint16_t)(INDIC_BLOCK_END-sourceChar) <= INDIC_RANGE) { /* Danda and Double Danda are valid in Northern scripts.. since Unicode * does not include these codepoints in all Northern scrips we need to * filter them out
*/ if (sourceChar!= DANDA && sourceChar != DOUBLE_DANDA) { /* find out to which block the souceChar belongs*/
range =(uint16_t)((sourceChar-INDIC_BLOCK_BEGIN)/DELTA);
newDelta =(uint16_t)(range*DELTA);
/* Now are we in the same block as the previous? */ if (newDelta!= converterData->currentDeltaFromUnicode || converterData->isFirstBuffer) {
converterData->currentDeltaFromUnicode = newDelta;
converterData->currentMaskFromUnicode = lookupInitialData[range].maskEnum;
deltaChanged =true;
converterData->isFirstBuffer=false;
}
if (converterData->currentDeltaFromUnicode == PNJ_DELTA) { if (sourceChar == PNJ_TIPPI) { /* Make sure Tippi is converted to Bindi. */
sourceChar = PNJ_BINDI;
} elseif (sourceChar == PNJ_ADHAK) { /* This is for consonant cluster handling. */
converterData->contextCharFromUnicode = PNJ_ADHAK;
}
} /* Normalize all Indic codepoints to Devanagari and map them to ISCII */ /* now subtract the new delta from sourceChar*/
sourceChar -= converterData->currentDeltaFromUnicode;
}
/* get the target byte unit */
targetByteUnit=fromUnicodeTable[(uint8_t)sourceChar];
/* is the code point valid in current script? */ if ((validityTable[(uint8_t)sourceChar] & converterData->currentMaskFromUnicode)==0) { /* Vocallic RR is assigned in ISCII Telugu and Unicode */ if (converterData->currentDeltaFromUnicode!=(TELUGU_DELTA) || sourceChar!=VOCALLIC_RR) {
targetByteUnit=missingCharMarker;
}
}
if (deltaChanged) { /* we are in a script block which is different than * previous sourceChar's script block write ATR and language codes
*/
uint32_t temp=0;
temp =(uint16_t)(ATR<<8);
temp += (uint16_t)((uint8_t) lookupInitialData[range].isciiLang); /* reset */
deltaChanged=false; /* now append ATR and language code */
WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,temp,err); if (U_FAILURE(*err)) { break;
}
}
if (converterData->currentDeltaFromUnicode == PNJ_DELTA && (sourceChar + PNJ_DELTA) == PNJ_ADHAK) { continue;
}
} /* reset context char */
converterData->contextCharFromUnicode = 0x00; break;
} if (converterData->currentDeltaFromUnicode == PNJ_DELTA && tempContextFromUnicode == PNJ_ADHAK && isPNJConsonant((sourceChar + PNJ_DELTA))) { /* If the previous codepoint is Adhak and the current codepoint is a consonant, the targetByteUnit should be C + Halant + C. */ /* reset context char */
converterData->contextCharFromUnicode = 0x0000;
targetByteUnit = targetByteUnit << 16 | ISCII_HALANT << 8 | targetByteUnit; /* write targetByteUnit to target */
WRITE_TO_TARGET_FROM_U(args, offsets, source, target, targetLimit, targetByteUnit,err); if (U_FAILURE(*err)) { break;
}
} elseif (targetByteUnit != missingCharMarker) { if (targetByteUnit==ISCII_HALANT) {
converterData->contextCharFromUnicode = (char16_t)targetByteUnit;
} /* write targetByteUnit to target*/
WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err); if (U_FAILURE(*err)) { break;
}
} else { /* oops.. the code point is unassigned */ /*check if the char is a First surrogate*/ if (U16_IS_SURROGATE(sourceChar)) { if (U16_IS_SURROGATE_LEAD(sourceChar)) {
getTrail: /*look ahead to find the trail surrogate*/ if (source < sourceLimit) { /* test the following code unit */
char16_t trail= (*source); if (U16_IS_TRAIL(trail)) {
source++;
sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
*err =U_INVALID_CHAR_FOUND; /* convert this surrogate code point */ /* exit this condition tree */
} else { /* this is an unmatched lead code unit (1st surrogate) */ /* callback(illegal) */
*err=U_ILLEGAL_CHAR_FOUND;
}
} else { /* no more input */
*err = U_ZERO_ERROR;
}
} else { /* this is an unmatched trail code unit (2nd surrogate) */ /* callback(illegal) */
*err=U_ILLEGAL_CHAR_FOUND;
}
} else { /* callback(unassigned) for a BMP code point */
*err = U_INVALID_CHAR_FOUND;
}
args->converter->fromUChar32=sourceChar; break;
}
}/* end while(mySourceIndex<mySourceLength) */
/*save the state and return */
args->source = source;
args->target = (char*)target;
}
/*
 * GET_MAPPING(sourceChar, targetUniChar, data)
 *
 * Looks up the Unicode mapping of the ISCII byte `sourceChar` in
 * toUnicodeTable and stores it in `targetUniChar`. For bytes above ASCII_END
 * the result is then checked against validityTable using the current
 * toUnicode script mask of `data`; if the code point is not valid in that
 * script, `targetUniChar` is set to missingCharMarker. The one exception is
 * VOCALLIC_RR while the current delta is TELUGU_DELTA, which is kept.
 *
 * `targetUniChar` must be an assignable lvalue; `sourceChar` and
 * `targetUniChar` are evaluated more than once, so pass plain variables.
 * Every line of the body must end in a backslash: a comment placed after a
 * backslash on the same line breaks the line continuation.
 */
#define GET_MAPPING(sourceChar,targetUniChar,data) UPRV_BLOCK_MACRO_BEGIN { \
    (targetUniChar) = toUnicodeTable[(sourceChar)]; \
    /* is the code point valid in current script? */ \
    if ((sourceChar) > ASCII_END && \
            (validityTable[((targetUniChar) & 0x7F)] & (data)->currentMaskToUnicode) == 0) { \
        /* Vocallic RR is assigned in ISCII Telugu and Unicode */ \
        if ((data)->currentDeltaToUnicode != (TELUGU_DELTA) || \
                (targetUniChar) != VOCALLIC_RR) { \
            (targetUniChar) = missingCharMarker; \
        } \
    } \
} UPRV_BLOCK_MACRO_END
/***********
 *  Rules for ISCII to Unicode converter
 *  ISCII is a stateful encoding. To convert ISCII bytes to Unicode,
 *  which has both precomposed and decomposed forms of characters,
 *  pre-context and post-context need to be considered.
 *
 *  Post context
 *  i)  ATR : Attribute code is used to declare the font and script switching.
 *      Currently we only switch scripts; font codes are consumed without generating an error.
 *  ii) EXT : Extension code is used to declare switching to Sanskrit and for obscure,
 *      obsolete characters.
 *  Pre context
 *  i)  Halant: if preceded by a halant then it is an explicit halant
 *  ii) Nukta :
 *       a) if preceded by a halant then it is a soft halant
 *       b) if preceded by specific consonants and the ligatures have pre-composed
 *          characters in Unicode then convert to pre-composed characters
 *  iii) Danda: If Danda is preceded by a Danda then convert to Double Danda
 *
 */
data = (UConverterDataISCII*)(args->converter->extraInfo);
contextCharToUnicode = &data->contextCharToUnicode; /* contains previous ISCII codepoint visited */
toUnicodeStatus = (UChar32*)&args->converter->toUnicodeStatus;/* contains the mapping to Unicode of the above codepoint*/
while (U_SUCCESS(*err) && source<sourceLimit) {
targetUniChar = missingCharMarker;
if (target < targetLimit) {
sourceChar = (unsignedchar)*(source)++;
/* look at the post-context perform special processing */ if (*contextCharToUnicode==ATR) {
/* If we have ATR in *contextCharToUnicode then we need to change our * state to the Indic Script specified by sourceChar
*/
/* check if the sourceChar is supported script range*/ if ((uint8_t)(PNJ-sourceChar)<=PNJ-DEV) {
data->currentDeltaToUnicode = (uint16_t)(lookupTable[sourceChar & 0x0F][0] * DELTA);
data->currentMaskToUnicode = (MaskEnum)lookupTable[sourceChar & 0x0F][1];
} elseif (sourceChar==DEF) { /* switch back to default */
data->currentDeltaToUnicode = data->defDeltaToUnicode;
data->currentMaskToUnicode = data->defMaskToUnicode;
} else { if ((sourceChar >= 0x21 && sourceChar <= 0x3F)) { /* these are display codes consume and continue */
} else {
*err =U_ILLEGAL_CHAR_FOUND; /* reset */
*contextCharToUnicode=NO_CHAR_MARKER; goto CALLBACK;
}
}
/* reset */
*contextCharToUnicode=NO_CHAR_MARKER;
continue;
} elseif (*contextCharToUnicode==EXT) { /* check if sourceChar is in 0xA1-0xEE range */ if ((uint8_t) (EXT_RANGE_END - sourceChar) <= (EXT_RANGE_END - EXT_RANGE_BEGIN)) { /* We currently support only Anudatta and Devanagari abbreviation sign */ if (sourceChar==0xBF || sourceChar == 0xB8) {
targetUniChar = (sourceChar==0xBF) ? DEV_ABBR_SIGN : DEV_ANUDATTA;
/* find out if the mapping is valid in this state */ if (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode) {
*contextCharToUnicode= NO_CHAR_MARKER;
/* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */ if (data->prevToUnicodeStatus) {
WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
data->prevToUnicodeStatus = 0x0000;
} /* write to target */
WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),targetUniChar,data->currentDeltaToUnicode,err);
continue;
}
} /* byte unit is unassigned */
targetUniChar = missingCharMarker;
*err= U_INVALID_CHAR_FOUND;
} else { /* only 0xA1 - 0xEE are legal after EXT char */
*contextCharToUnicode= NO_CHAR_MARKER;
*err = U_ILLEGAL_CHAR_FOUND;
} goto CALLBACK;
} elseif (*contextCharToUnicode==ISCII_INV) { if (sourceChar==ISCII_HALANT) {
targetUniChar = 0x0020; /* replace with space according to Indic FAQ */
} else {
targetUniChar = ZWJ;
}
/* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */ if (data->prevToUnicodeStatus) {
WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
data->prevToUnicodeStatus = 0x0000;
} /* write to target */
WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),targetUniChar,data->currentDeltaToUnicode,err); /* reset */
*contextCharToUnicode=NO_CHAR_MARKER;
}
/* look at the pre-context and perform special processing */ switch (sourceChar) { case ISCII_INV: case EXT: case ATR:
*contextCharToUnicode = (char16_t)sourceChar;
if (*toUnicodeStatus != missingCharMarker) { /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */ if (data->prevToUnicodeStatus) {
WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
data->prevToUnicodeStatus = 0x0000;
}
WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),*toUnicodeStatus,data->currentDeltaToUnicode,err);
*toUnicodeStatus = missingCharMarker;
} continue; case ISCII_DANDA: /* handle double danda*/ if (*contextCharToUnicode== ISCII_DANDA) {
targetUniChar = DOUBLE_DANDA; /* clear the context */
*contextCharToUnicode = NO_CHAR_MARKER;
*toUnicodeStatus = missingCharMarker;
} else {
GET_MAPPING(sourceChar,targetUniChar,data);
*contextCharToUnicode = sourceChar;
} break; case ISCII_HALANT: /* handle explicit halant */ if (*contextCharToUnicode == ISCII_HALANT) {
targetUniChar = ZWNJ; /* clear the context */
*contextCharToUnicode = NO_CHAR_MARKER;
} else {
GET_MAPPING(sourceChar,targetUniChar,data);
*contextCharToUnicode = sourceChar;
} break; case 0x0A: case 0x0D:
data->resetToDefaultToUnicode = true;
GET_MAPPING(sourceChar,targetUniChar,data)
;
*contextCharToUnicode = sourceChar; break;
case ISCII_VOWEL_SIGN_E:
i=1;
found=false; for (; i<vowelSignESpecialCases[0][0]; i++) {
U_ASSERT(i<UPRV_LENGTHOF(vowelSignESpecialCases)); if (vowelSignESpecialCases[i][0]==(uint8_t)*contextCharToUnicode) {
targetUniChar=vowelSignESpecialCases[i][1];
found=true; break;
}
} if (found) { /* find out if the mapping is valid in this state */ if (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode) { /*targetUniChar += data->currentDeltaToUnicode ;*/
*contextCharToUnicode= NO_CHAR_MARKER;
*toUnicodeStatus = missingCharMarker; break;
}
}
GET_MAPPING(sourceChar,targetUniChar,data);
*contextCharToUnicode = sourceChar; break;
case ISCII_NUKTA: /* handle soft halant */ if (*contextCharToUnicode == ISCII_HALANT) {
targetUniChar = ZWJ; /* clear the context */
*contextCharToUnicode = NO_CHAR_MARKER; break;
} elseif (data->currentDeltaToUnicode == PNJ_DELTA && data->contextCharToUnicode == 0xc0) { /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */ if (data->prevToUnicodeStatus) {
WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
data->prevToUnicodeStatus = 0x0000;
} /* We got here because ISCII_NUKTA was preceded by 0xc0 and we are converting Gurmukhi. * In that case we must convert (0xc0 0xe9) to (\u0a5c\u0a4d\u0a39).
*/
targetUniChar = PNJ_RRA;
WRITE_TO_TARGET_TO_U(args, source, target, args->offsets, (source-args->source)-2, targetUniChar, 0, err); if (U_SUCCESS(*err)) {
targetUniChar = PNJ_SIGN_VIRAMA;
WRITE_TO_TARGET_TO_U(args, source, target, args->offsets, (source-args->source)-2, targetUniChar, 0, err); if (U_SUCCESS(*err)) {
targetUniChar = PNJ_HA;
WRITE_TO_TARGET_TO_U(args, source, target, args->offsets, (source-args->source)-2, targetUniChar, 0, err);
} else {
args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= PNJ_HA;
}
} else {
args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= PNJ_SIGN_VIRAMA;
args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= PNJ_HA;
}
*toUnicodeStatus = missingCharMarker;
data->contextCharToUnicode = NO_CHAR_MARKER; continue;
} else { /* try to handle <CHAR> + ISCII_NUKTA special mappings */
i=1;
found =false; for (; i<nuktaSpecialCases[0][0]; i++) { if (nuktaSpecialCases[i][0]==(uint8_t)
*contextCharToUnicode) {
targetUniChar=nuktaSpecialCases[i][1];
found =true; break;
}
} if (found) { /* find out if the mapping is valid in this state */ if (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode) { /*targetUniChar += data->currentDeltaToUnicode ;*/
*contextCharToUnicode= NO_CHAR_MARKER;
*toUnicodeStatus = missingCharMarker; if (data->currentDeltaToUnicode == PNJ_DELTA) { /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */ if (data->prevToUnicodeStatus) {
WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
data->prevToUnicodeStatus = 0x0000;
}
WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),targetUniChar,data->currentDeltaToUnicode,err); continue;
} break;
} /* else fall through to default */
} /* else fall through to default */
U_FALLTHROUGH;
} default:GET_MAPPING(sourceChar,targetUniChar,data)
;
*contextCharToUnicode = sourceChar; break;
}
if (*toUnicodeStatus != missingCharMarker) { /* Check to make sure that consonant clusters are handled correct for Gurmukhi script. */ if (data->currentDeltaToUnicode == PNJ_DELTA && data->prevToUnicodeStatus != 0 && isPNJConsonant(data->prevToUnicodeStatus) &&
(*toUnicodeStatus + PNJ_DELTA) == PNJ_SIGN_VIRAMA && ((UChar32)(targetUniChar + PNJ_DELTA) == data->prevToUnicodeStatus)) {
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.24 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.