/*
 * This implementation is designed for 16-bit Unicode strings.
 * The main assumption is that the Arabic characters and their
 * presentation forms each fit into a single char16_t.
 * With UTF-8, they occupy 2 or 3 bytes each, which is more than
 * the single byte used by ASCII characters.
 */
/* * ### TODO in general for letter shaping: * - the letter shaping code is UTF-16-unaware; needs update * + especially invertBuffer()?! * - needs to handle the "Arabic Tail" that is used in some legacy codepages * as a glyph fragment of wide-glyph letters * + IBM Unicode conversion tables map it to U+200B (ZWSP) * + IBM Egypt has proposed to encode the tail in Unicode among Arabic Presentation Forms * + Unicode 3.2 added U+FE73 ARABIC TAIL FRAGMENT
*/
/* definitions for Arabic letter shaping ------------------------------------ */
/* * This function shapes European digits to Arabic-Indic digits * in-place, writing over the input characters. * Since we know that we are only looking for BMP code points, * we can safely just work with code units (again, at least UTF-16).
*/ staticvoid
_shapeToArabicDigitsWithContext(char16_t *s, int32_t length,
char16_t digitBase,
UBool isLogical, UBool lastStrongWasAL) {
int32_t i;
char16_t c;
digitBase-=0x30;
/* the iteration direction depends on the type of input */ if(isLogical) { for(i=0; i<length; ++i) {
c=s[i]; switch(ubidi_getClass(c)) { case U_LEFT_TO_RIGHT: /* L */ case U_RIGHT_TO_LEFT: /* R */
lastStrongWasAL=false; break; case U_RIGHT_TO_LEFT_ARABIC: /* AL */
lastStrongWasAL=true; break; case U_EUROPEAN_NUMBER: /* EN */ if (lastStrongWasAL && static_cast<uint32_t>(c - 0x30) < 10) {
s[i] = static_cast<char16_t>(digitBase + c); /* digitBase+(c-0x30) - digitBase was modified above */
} break; default : break;
}
}
} else { for(i=length; i>0; /* pre-decrement in the body */) {
c=s[--i]; switch(ubidi_getClass(c)) { case U_LEFT_TO_RIGHT: /* L */ case U_RIGHT_TO_LEFT: /* R */
lastStrongWasAL=false; break; case U_RIGHT_TO_LEFT_ARABIC: /* AL */
lastStrongWasAL=true; break; case U_EUROPEAN_NUMBER: /* EN */ if (lastStrongWasAL && static_cast<uint32_t>(c - 0x30) < 10) {
s[i] = static_cast<char16_t>(digitBase + c); /* digitBase+(c-0x30) - digitBase was modified above */
} break; default : break;
}
}
}
}
/*
 * Name     : invertBuffer
 * Function : Inverts (reverses) the buffer; it is used in case the user
 *            specifies the buffer to be U_SHAPE_TEXT_DIRECTION_LOGICAL.
 *
 * The code units at indexes lowlimit .. size-highlimit-1 (inclusive) are
 * reversed in place; code units outside that window are left untouched.
 * Code units are swapped one by one, with no special handling of
 * surrogate pairs.
 *
 * @param buffer    buffer to reverse in place
 * @param size      number of code units in buffer
 * @param lowlimit  number of leading code units to leave in place
 * @param highlimit number of trailing code units to leave in place
 * (the unnamed options parameter is unused)
 */
static void
invertBuffer(char16_t *buffer, int32_t size, uint32_t /*options*/, int32_t lowlimit, int32_t highlimit) {
    int32_t left = lowlimit;
    int32_t right = size - highlimit - 1;

    /* Walk both ends towards the middle; an empty or 1-unit window is a no-op. */
    while (left < right) {
        const char16_t saved = buffer[left];
        buffer[left] = buffer[right];
        buffer[right] = saved;
        ++left;
        --right;
    }
}
/*
 * Name     : changeLamAlef
 * Function : Converts the Alef characters into an equivalent
 *            LamAlef location in the 0x06xx range. This is an
 *            intermediate stage in the operation of the program;
 *            later it will be converted into the 0xFExx LamAlefs
 *            in the shaping function.
 *
 * @param ch candidate Alef code unit
 * @return 0x065C..0x065F for 0x0622, 0x0623, 0x0625 and 0x0627 respectively;
 *         0 for every other input
 */
static inline char16_t
changeLamAlef(char16_t ch) {
    switch (ch) {
    case 0x0622:
        return 0x065C;
    case 0x0623:
        return 0x065D;
    case 0x0625:
        return 0x065E;
    case 0x0627:
        return 0x065F;
    default:
        /* not one of the four handled Alef forms */
        return 0;
    }
}
/* *Name : getLink *Function : Resolves the link between the characters as * Arabic characters have four forms : * Isolated, Initial, Middle and Final Form
*/ static char16_t
getLink(char16_t ch) { if(ch >= 0x0622 && ch <= 0x06D3) { return(araLink[ch-0x0622]);
} elseif(ch == 0x200D) { return(3);
} elseif(ch >= 0x206D && ch <= 0x206F) { return(4);
}elseif(ch >= 0xFB50 && ch <= 0xFC62) { return(presALink[ch-0xFB50]);
} elseif(ch >= 0xFE70 && ch <= 0xFEFC) { return(presBLink[ch-0xFE70]);
}else { return(0);
}
}
/* *Name : countSpaces *Function : Counts the number of spaces * at each end of the logical buffer
*/ staticvoid
countSpaces(char16_t *dest, int32_t size, uint32_t /*options*/, int32_t *spacesCountl, int32_t *spacesCountr) {
int32_t i = 0;
int32_t countl = 0,countr = 0; while((dest[i] == SPACE_CHAR) && (countl < size)) {
countl++;
i++;
} if (countl < size) { /* the entire buffer is not all space */ while(dest[size-1] == SPACE_CHAR) {
countr++;
size--;
}
}
*spacesCountl = countl;
*spacesCountr = countr;
}
/*BIDI *Name : isSeenTailFamilyChar *Function : returns 1 if the character is a seen family isolated character * in the FE range otherwise returns 0
*/
/*Start of BIDI*/
/*
 * Name     : isAlefMaksouraChar
 * Function : returns 1 if the character is an Alef Maksoura final or isolated
 *            form (0xFEF0 / 0xFEEF) or the base letter 0x0649,
 *            otherwise returns 0
 *
 * @param ch code unit to test
 * @return 1 on match, 0 otherwise
 */
static inline int32_t
isAlefMaksouraChar(char16_t ch) {
    return (ch == 0xFEEF || ch == 0xFEF0 || ch == 0x0649) ? 1 : 0;
}
/*
 * Name     : isYehHamzaChar
 * Function : returns 1 if the character is a YehHamza isolated (0xFE89) or
 *            YehHamza final (0xFE8A) form, otherwise returns 0
 *
 * @param ch code unit to test
 * @return 1 on match, 0 otherwise
 */
static inline int32_t
isYehHamzaChar(char16_t ch) {
    return (ch == 0xFE89 || ch == 0xFE8A) ? 1 : 0;
}
/* * Name: isTashkeelOnTatweelChar * Function: Checks if the Tashkeel Character is on Tatweel or not,if the * Tashkeel on tatweel (FE range), it returns 1 else if the * Tashkeel with shadda on tatweel (FC range)return 2 otherwise * returns 0
*/ staticinline int32_t
isTashkeelOnTatweelChar(char16_t ch){ if(ch >= 0xfe70 && ch <= 0xfe7f && ch != NEW_TAIL_CHAR && ch != 0xFE75 && ch != SHADDA_TATWEEL_CHAR)
{ return tashkeelMedial [ch - 0xFE70];
}elseif( (ch >= 0xfcf2 && ch <= 0xfcf4) || (ch == SHADDA_TATWEEL_CHAR)) { return 2;
}else{ return 0;
}
}
/* * Name: isIsolatedTashkeelChar * Function: Checks if the Tashkeel Character is in the isolated form * (i.e. Unicode FE range) returns 1 else if the Tashkeel * with shadda is in the isolated form (i.e. Unicode FC range) * returns 2 otherwise returns 0
*/ staticinline int32_t
isIsolatedTashkeelChar(char16_t ch){ if(ch >= 0xfe70 && ch <= 0xfe7f && ch != NEW_TAIL_CHAR && ch != 0xFE75){ return (1 - tashkeelMedial [ch - 0xFE70]);
}elseif(ch >= 0xfc5e && ch <= 0xfc63){ return 1;
}else{ return 0;
}
}
/* *Name : calculateSize *Function : This function calculates the destSize to be used in preflighting * when the destSize is equal to 0 * It is used also to calculate the new destsize in case the * destination buffer will be resized.
*/
if ((options&U_SHAPE_LETTERS_MASK) == U_SHAPE_LETTERS_UNSHAPE){ if ( (options&U_SHAPE_LAMALEF_MASK) == U_SHAPE_LAMALEF_RESIZE){ for(i=0;i<sourceLength;i++) { if(isLamAlefChar(source[i]))
destSize++;
}
}
}
return destSize;
}
/* *Name : handleTashkeelWithTatweel *Function : Replaces Tashkeel as following: * Case 1 :if the Tashkeel on tatweel, replace it with Tatweel. * Case 2 :if the Tashkeel aggregated with Shadda on Tatweel, replace * it with Shadda on Tatweel. * Case 3: if the Tashkeel is isolated replace it with Space. *
*/ static int32_t
handleTashkeelWithTatweel(char16_t *dest, int32_t sourceLength,
int32_t /*destSize*/, uint32_t /*options*/,
UErrorCode * /*pErrorCode*/) { int i; for(i = 0; i < sourceLength; i++){ if((isTashkeelOnTatweelChar(dest[i]) == 1)){
dest[i] = TATWEEL_CHAR;
}elseif((isTashkeelOnTatweelChar(dest[i]) == 2)){
dest[i] = SHADDA_TATWEEL_CHAR;
}elseif(isIsolatedTashkeelChar(dest[i]) && dest[i] != SHADDA_CHAR){
dest[i] = SPACE_CHAR;
}
} return sourceLength;
}
/* *Name : handleGeneratedSpaces *Function : The shapeUnicode function converts Lam + Alef into LamAlef + space, * and Tashkeel to space. * handleGeneratedSpaces function puts these generated spaces * according to the options the user specifies. LamAlef and Tashkeel * spaces can be replaced at begin, at end, at near or decrease the * buffer size. * * There is also Auto option for LamAlef and tashkeel, which will put * the spaces at end of the buffer (or end of text if the user used * the option U_SHAPE_SPACES_RELATIVE_TO_TEXT_BEGIN_END). * * If the text type was visual_LTR and the option * U_SHAPE_SPACES_RELATIVE_TO_TEXT_BEGIN_END was selected the END * option will place the space at the beginning of the buffer and * BEGIN will place the space at the end of the buffer.
*/
/* *Name :expandCompositCharAtBegin *Function :Expands the LamAlef character to Lam and Alef consuming the required * space from beginning of the buffer. If the text type was visual_LTR * and the option U_SHAPE_SPACES_RELATIVE_TO_TEXT_BEGIN_END was selected * the spaces will be located at end of buffer. * If there are no spaces to expand the LamAlef, an error * will be set to U_NO_SPACE_AVAILABLE as defined in utypes.h
*/
i = 0; while(dest[i] == SPACE_CHAR) {
countl++;
i++;
}
i = j = sourceLength-1;
while(i >= 0 && j >= 0) { if( countl>0 && isLamAlefChar(dest[i])) {
tempbuffer[j] = LAM_CHAR; /* to ensure the array index is within the range */
U_ASSERT(dest[i] >= 0xFEF5u
&& dest[i]-0xFEF5u < UPRV_LENGTHOF(convertLamAlef));
tempbuffer[j-1] = convertLamAlef[ dest[i] - 0xFEF5 ];
j--;
countl--;
}else { if( countl == 0 && isLamAlefChar(dest[i]) ) {
*pErrorCode=U_NO_SPACE_AVAILABLE;
}
tempbuffer[j] = dest[i];
}
i--;
j--;
}
u_memcpy(dest, tempbuffer, sourceLength);
uprv_free(tempbuffer);
destSize = sourceLength; return destSize;
}
/* *Name : expandCompositCharAtEnd *Function : Expands the LamAlef character to Lam and Alef consuming the * required space from end of the buffer. If the text type was * Visual LTR and the option U_SHAPE_SPACES_RELATIVE_TO_TEXT_BEGIN_END * was used, the spaces will be consumed from begin of buffer. If * there are no spaces to expand the LamAlef, an error * will be set to U_NO_SPACE_AVAILABLE as defined in utypes.h
*/
/* *Name : expandCompositCharAtNear *Function : Expands the LamAlef character into Lam + Alef, YehHamza character * into Yeh + Hamza, SeenFamily character into SeenFamily character * + Tail, while consuming the space next to the character. * If there are no spaces next to the character, an error * will be set to U_NO_SPACE_AVAILABLE as defined in utypes.h
*/
static int32_t
expandCompositCharAtNear(char16_t *dest, int32_t sourceLength, int32_t destSize,UErrorCode *pErrorCode, int yehHamzaOption, int seenTailOption, int lamAlefOption, struct uShapeVariables shapeVars) {
int32_t i = 0;
*pErrorCode=U_NO_SPACE_AVAILABLE;
}
}elseif(lamAlefOption && isLamAlefChar(dest[i+1])) { if(dest[i] == SPACE_CHAR){
lamalefChar = dest[i+1];
dest[i+1] = LAM_CHAR;
dest[i] = convertLamAlef[ lamalefChar - 0xFEF5 ];
}else {
*pErrorCode=U_NO_SPACE_AVAILABLE;
}
}
}
destSize = sourceLength; return destSize;
} /* * Name : expandCompositChar * Function : LamAlef, need special handling, since it expands from one * character into two characters while shaping or deshaping. * In order to expand it, near or far spaces according to the * options user specifies. Also buffer size can be increased. * * For SeenFamily characters and YehHamza only the near option is * supported, while for LamAlef we can take spaces from begin, end, * near or even increase the buffer size. * There is also the Auto option for LamAlef only, which will first * search for a space at end, begin then near, respectively. * If there are no spaces to expand these characters, an error will be set to * U_NO_SPACE_AVAILABLE as defined in utypes.h
*/
/* * Converts the input buffer from FExx Range into 06xx Range * to make sure that all characters are in the 06xx range * even the lamalef is converted to the special region in * the 06xx range
*/ if ((options & U_SHAPE_PRESERVE_PRESENTATION_MASK) == U_SHAPE_PRESERVE_PRESENTATION_NOOP) { for (i = 0; i < sourceLength; i++) {
char16_t inputChar = dest[i]; if ( (inputChar >= 0xFB50) && (inputChar <= 0xFBFF)) {
char16_t c = convertFBto06 [ (inputChar - 0xFB50) ]; if (c != 0)
dest[i] = c;
} elseif ( (inputChar >= 0xFE70) && (inputChar <= 0xFEFC)) {
dest[i] = convertFEto06 [ (inputChar - 0xFE70) ] ;
} else {
dest[i] = inputChar ;
}
}
}
/* sets the index to the end of the buffer, together with the step point to -1 */
i = sourceLength - 1;
iend = -1;
step = -1;
/* * This function resolves the link between the characters . * Arabic characters have four forms : * Isolated Form, Initial Form, Middle Form and Final Form
*/
currLink = getLink(dest[i]);
lastPos = i;
Nx = -2, Nw = 0;
while (i != iend) { /* If high byte of currLink > 0 then more than one shape */ if ((currLink & 0xFF00) > 0 || (getLink(dest[i]) & IRRELEVANT) != 0) {
Nw = i + step; while (Nx < 0) { /* we need to know about next char */ if(Nw == iend) {
nextLink = 0;
Nx = 3000;
} else {
nextLink = getLink(dest[Nw]); if((nextLink & IRRELEVANT) == 0) {
Nx = Nw;
} else {
Nw = Nw + step;
}
}
}
if ( ((currLink & ALEFTYPE) > 0) && ((lastLink & LAMTYPE) > 0) ) {
lamalef_found = 1;
wLamalef = changeLamAlef(dest[i]); /*get from 0x065C-0x065f */ if ( wLamalef != 0) {
dest[i] = LAMALEF_SPACE_SUB; /* The default case is to drop the Alef and replace */
dest[lastPos] =wLamalef; /* it by LAMALEF_SPACE_SUB which is the last character in the */
i=lastPos; /* unicode private use area, this is done to make */
} /* sure that removeLamAlefSpaces() handles only the */
lastLink = prevLink; /* spaces generated during lamalef generation. */
currLink = getLink(wLamalef); /* LAMALEF_SPACE_SUB is added here and is replaced by spaces */
} /* in removeLamAlefSpaces() */
/* * get the proper shape according to link ability of neighbors * and of character; depends on the order of the shapes * (isolated, initial, middle, final) in the compatibility area
*/
Shape = shapeTable[nextLink & (LINKR + LINKL)]
[lastLink & (LINKR + LINKL)]
[currLink & (LINKR + LINKL)];
/* check that source and destination do not overlap */ if( dest!=nullptr &&
((source<=dest && dest<source+sourceLength) ||
(dest<=source && source<dest+destCapacity))) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0;
}
/* Does Options contain the new Seen Tail Unicode code point option */ if ( (options&U_SHAPE_TAIL_TYPE_MASK) == U_SHAPE_TAIL_NEW_UNICODE){
shapeVars.tailChar = NEW_TAIL_CHAR;
}else {
shapeVars.tailChar = OLD_TAIL_CHAR;
}
/* calculate destination size */ /* TODO: do we ever need to do this pure preflighting? */ if(((options&U_SHAPE_LAMALEF_MASK)==U_SHAPE_LAMALEF_RESIZE) ||
((options&U_SHAPE_TASHKEEL_MASK)==U_SHAPE_TASHKEEL_RESIZE)) {
outputSize=calculateSize(source,sourceLength,destCapacity,options);
} else {
outputSize=sourceLength;
}
/* * need a temporary buffer of size max(outputSize, sourceLength) * because at first we copy source->temp
*/ if(sourceLength>outputSize) {
outputSize=sourceLength;
}
/* Start of Arabic letter shaping part */ if(outputSize<=UPRV_LENGTHOF(buffer)) {
outputSize=UPRV_LENGTHOF(buffer);
tempbuffer=buffer;
} else {
tempbuffer = (char16_t *)uprv_malloc(outputSize*U_SIZEOF_UCHAR);
/*Test for nullptr*/ if(tempbuffer == nullptr) {
*pErrorCode = U_MEMORY_ALLOCATION_ERROR; if (tempsource != nullptr) uprv_free(tempsource); return 0;
}
}
u_memcpy(tempbuffer, source, sourceLength); if (tempsource != nullptr){
uprv_free(tempsource);
}
switch(options&U_SHAPE_LETTERS_MASK) { case U_SHAPE_LETTERS_SHAPE : if( (options&U_SHAPE_TASHKEEL_MASK)> 0
&& ((options&U_SHAPE_TASHKEEL_MASK) !=U_SHAPE_TASHKEEL_REPLACE_BY_TATWEEL)) { /* Call the shaping function with tashkeel flag == 2 for removal of tashkeel */
destLength = shapeUnicode(tempbuffer,sourceLength,destCapacity,options,pErrorCode,2,shapeVars);
}else { /* default Call the shaping function with tashkeel flag == 1 */
destLength = shapeUnicode(tempbuffer,sourceLength,destCapacity,options,pErrorCode,1,shapeVars);
/*After shaping text check if user wants to remove tashkeel and replace it with tatweel*/ if( (options&U_SHAPE_TASHKEEL_MASK) == U_SHAPE_TASHKEEL_REPLACE_BY_TATWEEL){
destLength = handleTashkeelWithTatweel(tempbuffer,destLength,destCapacity,options,pErrorCode);
}
} break; case U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED : /* Call the shaping function with tashkeel flag == 0 */
destLength = shapeUnicode(tempbuffer,sourceLength,destCapacity,options,pErrorCode,0,shapeVars); break;
case U_SHAPE_LETTERS_UNSHAPE : /* Call the deshaping function */
destLength = deShapeUnicode(tempbuffer,sourceLength,destCapacity,options,pErrorCode,shapeVars); break; default : /* will never occur because of validity checks above */
destLength = 0; break;
}
/* * TODO: (markus 2002aug01) * For as long as we always preflight the outputSize above * we should U_ASSERT(outputSize==destLength) * except for the adjustment above before the tempbuffer allocation
*/
/* End of Arabic letter shaping part */
} else { /* * No letter shaping: * just make sure the destination is large enough and copy the string.
*/ if(destCapacity<sourceLength) { /* this catches preflighting, too */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR; return sourceLength;
}
u_memcpy(dest, source, sourceLength);
destLength=sourceLength;
}
/* * Perform number shaping. * With UTF-16 or UTF-32, the length of the string is constant. * The easiest way to do this is to operate on the destination and * "shape" the digits in-place.
*/ if((options&U_SHAPE_DIGITS_MASK)!=U_SHAPE_DIGITS_NOOP) {
char16_t digitBase;
int32_t i;
/* select the requested digit group */ switch(options&U_SHAPE_DIGIT_TYPE_MASK) { case U_SHAPE_DIGIT_TYPE_AN:
digitBase=0x660; /* Unicode: "Arabic-Indic digits" */ break; case U_SHAPE_DIGIT_TYPE_AN_EXTENDED:
digitBase=0x6f0; /* Unicode: "Eastern Arabic-Indic digits (Persian and Urdu)" */ break; default: /* will never occur because of validity checks above */
digitBase=0; break;
}
/* perform the requested operation */ switch(options&U_SHAPE_DIGITS_MASK) { case U_SHAPE_DIGITS_EN2AN: /* add (digitBase-'0') to each European (ASCII) digit code point */
digitBase-=0x30; for(i=0; i<destLength; ++i) { if(((uint32_t)dest[i]-0x30)<10) {
dest[i]+=digitBase;
}
} break; case U_SHAPE_DIGITS_AN2EN: /* subtract (digitBase-'0') from each Arabic digit code point */ for(i=0; i<destLength; ++i) { if(((uint32_t)dest[i]-(uint32_t)digitBase)<10) {
dest[i]-=digitBase-0x30;
}
} break; case U_SHAPE_DIGITS_ALEN2AN_INIT_LR:
_shapeToArabicDigitsWithContext(dest, destLength,
digitBase,
(options & U_SHAPE_TEXT_DIRECTION_MASK) == U_SHAPE_TEXT_DIRECTION_LOGICAL, false); break; case U_SHAPE_DIGITS_ALEN2AN_INIT_AL:
_shapeToArabicDigitsWithContext(dest, destLength,
digitBase,
(options & U_SHAPE_TEXT_DIRECTION_MASK) == U_SHAPE_TEXT_DIRECTION_LOGICAL, true); break; default: /* will never occur because of validity checks above */ break;
}
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.