// Note about tests for UIDNA_ERROR_DOMAIN_NAME_TOO_LONG:
//
// The domain name length limit is 255 octets in an internal DNS representation
// where the last ("root") label is the empty label
// represented by length byte 0 alone.
// In a conventional string, this translates to 253 characters, or 254
// if there is a trailing dot for the root label.
U_NAMESPACE_BEGIN
// Severe errors which usually result in a U+FFFD replacement character in the result string. const uint32_t severeErrors=
UIDNA_ERROR_LEADING_COMBINING_MARK|
UIDNA_ERROR_DISALLOWED|
UIDNA_ERROR_PUNYCODE|
UIDNA_ERROR_LABEL_HAS_DOT|
UIDNA_ERROR_INVALID_ACE_LABEL;
// UTS #46 classification of the ASCII range, one entry per code point.
//
// The normalizer (using uts46.nrm) lowercases ASCII uppercase letters and
// passes every other ASCII character through unchanged. When
// UIDNA_USE_STD3_RULES is set, this table is what disallows non-LDH
// characters. The ASCII fastpath consults the same table.
//
// Entry values:
//   -1  disallowed
//    0  valid
//    1  mapped (to lowercase)
static const int8_t asciiData[128] = {
    // 0000..000F
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0010..001F
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0020..002F -- 002D..002E; valid  # HYPHEN-MINUS..FULL STOP
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0, -1,
    // 0030..003F -- 0030..0039; valid  # DIGIT ZERO..DIGIT NINE
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
    // 0040..004F -- 0041..005A; mapped  # LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
    -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    // 0050..005F
     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1, -1,
    // 0060..006F -- 0061..007A; valid  # LATIN SMALL LETTER A..LATIN SMALL LETTER Z
    -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    // 0070..007F
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1
};
// UTF-16 entry point shared by the label/name ToASCII/ToUnicode operations.
//
// src      input label or domain name; must not be bogus and must not be dest
// isLabel  true when src is a single label (a dot is then not a separator)
// toASCII  true for ToASCII processing (enables the length checks)
// dest     receives the result; set to bogus on argument/incoming errors
// info     reset here, then receives the error bits
//
// Pure-ASCII input is handled entirely by the fastpath loop below. As soon as
// something needs real mapping/validation, the remainder is handed to
// processUnicode().
UnicodeString &
UTS46::process(const UnicodeString &src,
               UBool isLabel, UBool toASCII,
               UnicodeString &dest,
               IDNAInfo &info, UErrorCode &errorCode) const {
    // uts46Norm2.normalize() would perform all of this checking and setup,
    // but because of the ASCII fastpath it is not always called,
    // and it is never called first.
    if(U_FAILURE(errorCode)) {
        dest.setToBogus();
        return dest;
    }
    const char16_t *const input=src.getBuffer();
    if(input==nullptr || &dest==&src) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        dest.setToBogus();
        return dest;
    }

    // The arguments are acceptable: start from clean output values.
    dest.remove();
    info.reset();
    const int32_t inputLength=src.length();
    if(inputLength==0) {
        info.errors|=UIDNA_ERROR_EMPTY_LABEL;
        return dest;
    }
    char16_t *const output=dest.getBuffer(inputLength);
    if(output==nullptr) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return dest;
    }

    // ASCII fastpath
    const UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
    int32_t labelStart=0;
    int32_t pos=0;
    bool needsUnicodePath=false;
    for(; pos<inputLength; ++pos) {
        const char16_t unit=input[pos];
        if(unit>0x7f) {
            needsUnicodePath=true;
            break;
        }
        const int kind=asciiData[unit];
        if(kind>0) {
            // Lowercase an uppercase ASCII letter.
            output[pos]=static_cast<char16_t>(unit+0x20);
            continue;
        }
        if(kind<0 && disallowNonLDHDot) {
            // Replacing with U+FFFD can be complicated for toASCII.
            needsUnicodePath=true;
            break;
        }
        output[pos]=unit;
        if(unit==0x2d) {  // hyphen
            if(pos==(labelStart+3) && input[pos-1]==0x2d) {
                // "??--..." is Punycode or forbidden.
                ++pos;  // the '-' has already been copied to dest
                needsUnicodePath=true;
                break;
            }
            if(pos==labelStart) {
                // The label starts with "-".
                info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
            }
            if((pos+1)==inputLength || input[pos+1]==0x2e) {
                // The label ends with "-".
                info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
            }
        } else if(unit==0x2e) {  // dot
            if(isLabel) {
                // Replacing with U+FFFD can be complicated for toASCII.
                ++pos;  // the '.' has already been copied to dest
                needsUnicodePath=true;
                break;
            }
            if(pos==labelStart) {
                info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
            }
            if(toASCII && (pos-labelStart)>63) {
                info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
            }
            // Fold this label's errors into the overall result and start the next label.
            info.errors|=info.labelErrors;
            info.labelErrors=0;
            labelStart=pos+1;
        }
    }

    if(!needsUnicodePath && toASCII) {
        // The whole input went through the fastpath: check the final label
        // and the total length.
        if((pos-labelStart)>63) {
            info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
        }
        // There is a trailing dot if labelStart==pos.
        if(!isLabel && pos>=254 && (pos>254 || labelStart<pos)) {
            info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
        }
    }
    info.errors|=info.labelErrors;
    dest.releaseBuffer(pos);
    if(!needsUnicodePath) {
        return dest;
    }

    // Continue with full processing from the current label;
    // dest[0..pos[ already holds the mapped ASCII prefix.
    processUnicode(src, labelStart, pos, isLabel, toASCII, dest, info, errorCode);
    if(info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0) {
        // The ASCII labels handled by the fastpath have not been BiDi-checked yet.
        if(!info.isOkBiDi ||
                (labelStart>0 && !isASCIIOkBiDi(dest.getBuffer(), labelStart))) {
            info.errors|=UIDNA_ERROR_BIDI;
        }
    }
    return dest;
}
// UTF-8 entry point shared by the label/name ToASCII/ToUnicode operations.
//
// src      input label or domain name (data may be nullptr only if length is 0)
// isLabel  true when src is a single label (a dot is then not a separator)
// toASCII  true for ToASCII processing (enables the length checks)
// dest     sink that receives the UTF-8 result
// info     reset here, then receives the error bits
//
// Inputs of at most 256 bytes first go through an ASCII fastpath. Whatever
// cannot be handled there is converted to UTF-16 and given to processUnicode().
void
UTS46::processUTF8(StringPiece src,
                   UBool isLabel, UBool toASCII,
                   ByteSink &dest,
                   IDNAInfo &info, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) {
        return;
    }
    const char *const srcArray=src.data();
    const int32_t srcLength=src.length();
    if(srcArray==nullptr && srcLength!=0) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // The arguments are acceptable: start from clean output values.
    info.reset();
    if(srcLength==0) {
        info.errors|=UIDNA_ERROR_EMPTY_LABEL;
        dest.Flush();
        return;
    }

    UnicodeString destString;
    int32_t labelStart=0;
    if(srcLength>256) {
        // src is too long for the ASCII fastpath implementation.
        processUnicode(UnicodeString::fromUTF8(src), 0, 0,
                       isLabel, toASCII,
                       destString, info, errorCode);
    } else {  // 256 == length of stackArray[]
        // ASCII fastpath
        char stackArray[256];
        int32_t destCapacity;
        char *const out=dest.GetAppendBuffer(srcLength, srcLength+20,
                                             stackArray, UPRV_LENGTHOF(stackArray), &destCapacity);
        const UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
        int32_t pos=0;
        bool needsUnicodePath=false;
        for(; pos<srcLength; ++pos) {
            const char byte=srcArray[pos];
            if(static_cast<int8_t>(byte)<0) {  // (uint8_t)byte>0x7f
                needsUnicodePath=true;
                break;
            }
            // Cast: gcc warns about indexing with a char.
            const int kind=asciiData[static_cast<int>(byte)];
            if(kind>0) {
                // Lowercase an uppercase ASCII letter.
                out[pos]=static_cast<char>(byte+0x20);
                continue;
            }
            if(kind<0 && disallowNonLDHDot) {
                // Replacing with U+FFFD can be complicated for toASCII.
                needsUnicodePath=true;
                break;
            }
            out[pos]=byte;
            if(byte==0x2d) {  // hyphen
                if(pos==(labelStart+3) && srcArray[pos-1]==0x2d) {
                    // "??--..." is Punycode or forbidden.
                    needsUnicodePath=true;
                    break;
                }
                if(pos==labelStart) {
                    // The label starts with "-".
                    info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
                }
                if((pos+1)==srcLength || srcArray[pos+1]==0x2e) {
                    // The label ends with "-".
                    info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
                }
            } else if(byte==0x2e) {  // dot
                if(isLabel) {
                    // Replacing with U+FFFD can be complicated for toASCII.
                    needsUnicodePath=true;
                    break;
                }
                if(pos==labelStart) {
                    info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
                }
                if(toASCII && (pos-labelStart)>63) {
                    info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                }
                // Fold this label's errors into the overall result and start the next label.
                info.errors|=info.labelErrors;
                info.labelErrors=0;
                labelStart=pos+1;
            }
        }

        if(!needsUnicodePath) {
            // The whole input went through the fastpath.
            if(toASCII) {
                if((pos-labelStart)>63) {
                    info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                }
                // There is a trailing dot if labelStart==pos.
                if(!isLabel && pos>=254 && (pos>254 || labelStart<pos)) {
                    info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
                }
            }
            info.errors|=info.labelErrors;
            dest.Append(out, pos);
            dest.Flush();
            return;
        }

        info.errors|=info.labelErrors;
        // Convert the processed ASCII prefix of the current label to UTF-16.
        const int32_t mappingStart=pos-labelStart;
        destString=UnicodeString::fromUTF8(StringPiece(out+labelStart, mappingStart));
        // Output the previous ASCII labels and process the rest of src in UTF-16.
        dest.Append(out, labelStart);
        processUnicode(UnicodeString::fromUTF8(StringPiece(src, labelStart)), 0, mappingStart,
                       isLabel, toASCII,
                       destString, info, errorCode);
    }

    destString.toUTF8(dest);  // calls dest.Flush()
    if(toASCII && !isLabel) {
        // length==labelStart==254 means that there is a trailing dot (ok) and
        // destString is empty (do not index at 253-labelStart).
        const int32_t length=labelStart+destString.length();
        if(length>=254 && isASCIIString(destString)) {
            if(length>254 ||
                    (labelStart<254 && destString[253-labelStart]!=0x2e)) {
                info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
            }
        }
    }
    if(info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0) {
        // The ASCII labels handled by the fastpath have not been BiDi-checked yet.
        if(!info.isOkBiDi ||
                (labelStart>0 && !isASCIIOkBiDi(srcArray, labelStart))) {
            info.errors|=UIDNA_ERROR_BIDI;
        }
    }
}
// Maps/normalizes src into dest and then processes dest label by label.
//
// labelStart    index in dest of the first label that still needs processing
// mappingStart  index in src from which text still needs to be normalized;
//               0 means all of src is normalized into dest, otherwise the
//               normalized tail of src is appended to what dest already holds
//
// Returns dest.
UnicodeString &
UTS46::processUnicode(const UnicodeString &src,
                      int32_t labelStart, int32_t mappingStart,
                      UBool isLabel, UBool toASCII,
                      UnicodeString &dest,
                      IDNAInfo &info, UErrorCode &errorCode) const {
    if(mappingStart!=0) {
        uts46Norm2.normalizeSecondAndAppend(dest, src.tempSubString(mappingStart), errorCode);
    } else {
        uts46Norm2.normalize(src, dest, errorCode);
    }
    if(U_FAILURE(errorCode)) {
        return dest;
    }

    // Deviation characters are mapped unless the relevant nontransitional option is set.
    UBool doMapDevChars;
    if(toASCII) {
        doMapDevChars=(options&UIDNA_NONTRANSITIONAL_TO_ASCII)==0;
    } else {
        doMapDevChars=(options&UIDNA_NONTRANSITIONAL_TO_UNICODE)==0;
    }

    const char16_t *destArray=dest.getBuffer();
    int32_t destLength=dest.length();
    int32_t labelLimit=labelStart;
    while(labelLimit<destLength) {
        const char16_t c=destArray[labelLimit];
        if(c==0x2e && !isLabel) {
            // End of a label: process it, then continue after the dot.
            const int32_t labelLength=labelLimit-labelStart;
            const int32_t newLength=processLabel(dest, labelStart, labelLength,
                                                 toASCII, info, errorCode);
            info.errors|=info.labelErrors;
            info.labelErrors=0;
            if(U_FAILURE(errorCode)) {
                return dest;
            }
            // processLabel() may have changed the label's length and dest's buffer.
            destArray=dest.getBuffer();
            destLength+=newLength-labelLength;
            labelStart+=newLength+1;
            labelLimit=labelStart;
            continue;
        }
        if(c>=0xdf) {  // nothing to do for anything below U+00DF
            if(c<=0x200d && (c==0xdf || c==0x3c2 || c>=0x200c)) {
                // Deviation character: sharp s, final sigma, ZWNJ or ZWJ.
                info.isTransDiff=true;
                if(doMapDevChars) {
                    destLength=mapDevChars(dest, labelStart, labelLimit, errorCode);
                    if(U_FAILURE(errorCode)) {
                        return dest;
                    }
                    destArray=dest.getBuffer();
                    // All deviation characters have been mapped, no need to check for them again.
                    doMapDevChars=false;
                    // Do not increment labelLimit in case c was removed.
                    continue;
                }
            } else if(U16_IS_SURROGATE(c)) {
                bool unpaired;
                if(U16_IS_SURROGATE_LEAD(c)) {
                    unpaired=(labelLimit+1)==destLength || !U16_IS_TRAIL(destArray[labelLimit+1]);
                } else {
                    unpaired=labelLimit==labelStart || !U16_IS_LEAD(destArray[labelLimit-1]);
                }
                if(unpaired) {
                    // Map an unpaired surrogate to U+FFFD before normalization so that when
                    // that removes characters we do not turn two unpaired ones into a pair.
                    info.labelErrors|=UIDNA_ERROR_DISALLOWED;
                    dest.setCharAt(labelLimit, 0xfffd);
                    destArray=dest.getBuffer();
                }
            }
        }
        ++labelLimit;
    }

    // Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok)
    // but not an empty label elsewhere nor a completely empty domain name.
    // processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0.
    if(labelStart==0 || labelStart<labelLimit) {
        processLabel(dest, labelStart, labelLimit-labelStart,
                     toASCII, info, errorCode);
        info.errors|=info.labelErrors;
    }
    return dest;
}
// Maps the deviation characters in dest, starting at mappingStart:
// sharp s becomes "ss", final sigma becomes nonfinal sigma,
// and ZWNJ/ZWJ are removed.
// If anything was mapped, the text from labelStart on is normalized again.
//
// Returns the new length of dest. Returns 0 if errorCode already indicates
// a failure on entry.
int32_t
UTS46::mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,
                   UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    int32_t length=dest.length();
    // Ask for one extra unit up front if the first character to map is a sharp s.
    const int32_t minCapacity= dest[mappingStart]==0xdf ? length+1 : length;
    char16_t *buf=dest.getBuffer(minCapacity);
    if(buf==nullptr) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return length;
    }
    int32_t capacity=dest.getCapacity();
    UBool didMapDevChars=false;
    int32_t rd=mappingStart;
    int32_t wr=mappingStart;
    do {
        const char16_t c=buf[rd++];
        if(c==0xdf) {
            // Map sharp s to ss.
            didMapDevChars=true;
            buf[wr++]=0x73;  // Replace sharp s with first s.
            // Insert second s and account for possible buffer reallocation.
            if(wr==rd) {
                if(length==capacity) {
                    dest.releaseBuffer(length);
                    buf=dest.getBuffer(length+1);
                    if(buf==nullptr) {
                        errorCode=U_MEMORY_ALLOCATION_ERROR;
                        return length;
                    }
                    capacity=dest.getCapacity();
                }
                u_memmove(buf+wr+1, buf+wr, length-wr);
                ++rd;
            }
            buf[wr++]=0x73;
            ++length;
        } else if(c==0x3c2) {
            // Map final sigma to nonfinal sigma.
            didMapDevChars=true;
            buf[wr++]=0x3c3;
        } else if(c==0x200c || c==0x200d) {
            // Ignore/remove ZWNJ and ZWJ.
            didMapDevChars=true;
            --length;
        } else {
            // Only really necessary if wr was different from rd.
            buf[wr++]=c;
        }
    } while(wr<length);
    dest.releaseBuffer(length);

    if(!didMapDevChars) {
        return length;
    }
    // Mapping deviation characters might have resulted in an un-NFC string.
    // We could use either the NFC or the UTS #46 normalizer.
    // By using the UTS #46 normalizer again, we avoid having to load a second .nrm data file.
    UnicodeString normalized;
    uts46Norm2.normalize(dest.tempSubString(labelStart), normalized, errorCode);
    if(U_FAILURE(errorCode)) {
        return length;
    }
    dest.replace(labelStart, 0x7fffffff, normalized);
    if(dest.isBogus()) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
    }
    return dest.length();
}
// Replace the label in dest with the label string, if the label was modified. // If &label==&dest then the label was modified in-place and labelLength // is the new label length, different from label.length(). // If &label!=&dest then labelLength==label.length(). // Returns labelLength (= the new label length). static int32_t
replaceLabel(UnicodeString &dest, int32_t destLabelStart, int32_t destLabelLength, const UnicodeString &label, int32_t labelLength, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return 0;
} if(&label!=&dest) {
dest.replace(destLabelStart, destLabelLength, label); if(dest.isBogus()) {
errorCode=U_MEMORY_ALLOCATION_ERROR; return 0;
}
} return labelLength;
}
// Validates one label of dest (dest[labelStart..labelStart+labelLength[) and,
// for toASCII, converts it to Punycode if it contains non-ASCII characters.
// An "xn--" label is decoded first and then validated in its decoded form.
// Error bits are accumulated in info.labelErrors.
//
// Returns the new length of the label in dest (0 if errorCode already
// indicates a failure on entry).
int32_t
UTS46::processLabel(UnicodeString &dest,
                    int32_t labelStart, int32_t labelLength,
                    UBool toASCII,
                    IDNAInfo &info, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    // Position of the label inside dest. labelStart/labelLength/label are
    // redirected to the decoded string below when the label is Punycode.
    const int32_t destLabelStart=labelStart;
    int32_t destLabelLength=labelLength;
    UnicodeString decoded;
    UnicodeString *labelString=&dest;
    const char16_t *label=dest.getBuffer()+labelStart;
    UBool wasPunycode=false;

    const bool hasACEPrefix=
        labelLength>=4 &&
        label[0]==0x78 && label[1]==0x6e && label[2]==0x2d && label[3]==0x2d;
    if(hasACEPrefix) {
        // Label starts with "xn--", try to un-Punycode it.
        // In IDNA2008, labels like "xn--" (decodes to an empty string) and
        // "xn--ASCII-" (decodes to just "ASCII") fail the round-trip validation from
        // comparing the ToUnicode input with the back-to-ToASCII output.
        // They are alternate encodings of the respective ASCII labels.
        // Ignore "xn---" here: It will fail Punycode.decode() which logically comes before
        // the round-trip verification.
        if(labelLength==4 || (labelLength>5 && label[labelLength-1]==u'-')) {
            info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
            return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
        }
        wasPunycode=true;
        char16_t *unicodeBuffer=decoded.getBuffer(-1);  // capacity==-1: most labels should fit
        if(unicodeBuffer==nullptr) {
            // Should never occur if we used capacity==-1 which uses the internal buffer.
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            return labelLength;
        }
        UErrorCode punycodeErrorCode=U_ZERO_ERROR;
        int32_t unicodeLength=u_strFromPunycode(label+4, labelLength-4,
                                                unicodeBuffer, decoded.getCapacity(),
                                                nullptr, &punycodeErrorCode);
        if(punycodeErrorCode==U_BUFFER_OVERFLOW_ERROR) {
            // Retry with a buffer of the reported size.
            decoded.releaseBuffer(0);
            unicodeBuffer=decoded.getBuffer(unicodeLength);
            if(unicodeBuffer==nullptr) {
                errorCode=U_MEMORY_ALLOCATION_ERROR;
                return labelLength;
            }
            punycodeErrorCode=U_ZERO_ERROR;
            unicodeLength=u_strFromPunycode(label+4, labelLength-4,
                                            unicodeBuffer, decoded.getCapacity(),
                                            nullptr, &punycodeErrorCode);
        }
        decoded.releaseBuffer(unicodeLength);
        if(U_FAILURE(punycodeErrorCode)) {
            info.labelErrors|=UIDNA_ERROR_PUNYCODE;
            return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
        }
        // Check for NFC, and for characters that are not
        // valid or deviation characters according to the normalizer.
        // If there is something wrong, then the string will change.
        // Note that the normalizer passes through non-LDH ASCII and deviation characters.
        // Deviation characters are ok in Punycode even in transitional processing.
        // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES
        // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too.
        const UBool isValid=uts46Norm2.isNormalized(decoded, errorCode);
        if(U_FAILURE(errorCode)) {
            return labelLength;
        }
        // Unicode 15.1 UTS #46:
        // Added an additional condition in 4.1 Validity Criteria to
        // disallow labels such as xn--xn---epa., which do not round-trip.
        // --> Validity Criteria new criterion 4:
        // If not CheckHyphens, the label must not begin with "xn--".
        if(!isValid || decoded.startsWith(UnicodeString::readOnlyAlias(u"xn--"))) {
            info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
            return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
        }
        // From here on, validate the decoded form.
        labelString=&decoded;
        label=decoded.getBuffer();
        labelStart=0;
        labelLength=decoded.length();
    }

    // Validity check
    if(labelLength==0) {
        info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
        return replaceLabel(dest, destLabelStart, destLabelLength,
                            *labelString, labelLength, errorCode);
    }
    // labelLength>0
    if(labelLength>=4 && label[2]==0x2d && label[3]==0x2d) {
        // The label starts with "??--".
        info.labelErrors|=UIDNA_ERROR_HYPHEN_3_4;
    }
    if(label[0]==0x2d) {
        // The label starts with "-".
        info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
    }
    if(label[labelLength-1]==0x2d) {
        // The label ends with "-".
        info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
    }

    // If the label was not a Punycode label, then it was the result of
    // mapping, normalization and label segmentation.
    // If the label was in Punycode, then we mapped it again above
    // and checked its validity.
    // Now we handle the STD3 restriction to LDH characters (if set)
    // and we look for U+FFFD which indicates disallowed characters
    // in a non-Punycode label or U+FFFD itself in a Punycode label.
    // We also check for dots which can come from the input to a single-label function.
    // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed.
    const UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
    char16_t oredChars=0;
    const char16_t *const limit=label+labelLength;
    // Ok to cast away const because we own the UnicodeString.
    for(char16_t *p=const_cast<char16_t *>(label); p<limit; ++p) {
        const char16_t unit=*p;
        if(unit>0x7f) {
            oredChars|=unit;
            if(unit==0xfffd) {
                info.labelErrors|=UIDNA_ERROR_DISALLOWED;
            }
        } else if(unit==0x2e) {
            info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT;
            *p=0xfffd;
        } else if(disallowNonLDHDot && asciiData[unit]<0) {
            info.labelErrors|=UIDNA_ERROR_DISALLOWED;
            *p=0xfffd;
        }
    }

    // Check for a leading combining mark after other validity checks
    // so that we don't report UIDNA_ERROR_DISALLOWED for the U+FFFD from here.
    UChar32 first;
    int32_t cpLength=0;
    // "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD.
    U16_NEXT_UNSAFE(label, cpLength, first);
    if((U_GET_GC_MASK(first)&U_GC_M_MASK)!=0) {
        info.labelErrors|=UIDNA_ERROR_LEADING_COMBINING_MARK;
        labelString->replace(labelStart, cpLength, static_cast<char16_t>(0xfffd));
        label=labelString->getBuffer()+labelStart;
        labelLength+=1-cpLength;
        if(labelString==&dest) {
            destLabelLength=labelLength;
        }
    }

    if((info.labelErrors&severeErrors)!=0) {
        // If a Punycode label has severe errors,
        // then leave it but make sure it does not look valid.
        if(wasPunycode) {
            info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
            return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info, errorCode);
        }
        return replaceLabel(dest, destLabelStart, destLabelLength,
                            *labelString, labelLength, errorCode);
    }

    // Do contextual checks only if we do not have U+FFFD from a severe error
    // because U+FFFD can make these checks fail.
    if((options&UIDNA_CHECK_BIDI)!=0 && (!info.isBiDi || info.isOkBiDi)) {
        checkLabelBiDi(label, labelLength, info);
    }
    if((options&UIDNA_CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c &&
            !isLabelOkContextJ(label, labelLength)) {
        info.labelErrors|=UIDNA_ERROR_CONTEXTJ;
    }
    if((options&UIDNA_CHECK_CONTEXTO)!=0 && oredChars>=0xb7) {
        checkLabelContextO(label, labelLength, info);
    }

    if(toASCII) {
        if(wasPunycode) {
            // Leave a Punycode label unchanged if it has no severe errors.
            if(destLabelLength>63) {
                info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
            }
            return destLabelLength;
        }
        if(oredChars>=0x80) {
            // Contains non-ASCII characters: encode as "xn--" + Punycode.
            UnicodeString punycode;
            char16_t *buffer=punycode.getBuffer(63);  // 63==maximum DNS label length
            if(buffer==nullptr) {
                errorCode=U_MEMORY_ALLOCATION_ERROR;
                return destLabelLength;
            }
            buffer[0]=0x78;  // Write "xn--".
            buffer[1]=0x6e;
            buffer[2]=0x2d;
            buffer[3]=0x2d;
            int32_t punycodeLength=u_strToPunycode(label, labelLength,
                                                   buffer+4, punycode.getCapacity()-4,
                                                   nullptr, &errorCode);
            if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
                // Retry with a buffer of the reported size; keep the "xn--" prefix.
                errorCode=U_ZERO_ERROR;
                punycode.releaseBuffer(4);
                buffer=punycode.getBuffer(4+punycodeLength);
                if(buffer==nullptr) {
                    errorCode=U_MEMORY_ALLOCATION_ERROR;
                    return destLabelLength;
                }
                punycodeLength=u_strToPunycode(label, labelLength,
                                               buffer+4, punycode.getCapacity()-4,
                                               nullptr, &errorCode);
            }
            punycodeLength+=4;
            punycode.releaseBuffer(punycodeLength);
            if(U_FAILURE(errorCode)) {
                return destLabelLength;
            }
            if(punycodeLength>63) {
                info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
            }
            return replaceLabel(dest, destLabelStart, destLabelLength,
                                punycode, punycodeLength, errorCode);
        }
        // all-ASCII label
        if(labelLength>63) {
            info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
        }
    }
    return replaceLabel(dest, destLabelStart, destLabelLength,
                        *labelString, labelLength, errorCode);
}
// Makes sure that a bad ACE ("xn--") label in dest does not look valid.
// Dots in the label are replaced with U+FFFD; with UIDNA_USE_STD3_RULES,
// disallowed ASCII characters are replaced with U+FFFD as well.
// If the label consists only of LDH characters, U+FFFD is appended to it.
//
// Returns the resulting label length (0 if errorCode indicates a failure on
// entry or the insertion left dest bogus).
int32_t
UTS46::markBadACELabel(UnicodeString &dest,
                       int32_t labelStart, int32_t labelLength,
                       UBool toASCII, IDNAInfo &info, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    const UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
    UBool isASCII=true;
    UBool onlyLDH=true;
    // Ok to cast away const because we own the UnicodeString.
    char16_t *const units=const_cast<char16_t *>(dest.getBuffer()+labelStart);
    // Start after the initial "xn--".
    for(int32_t k=4; k<labelLength; ++k) {
        const char16_t unit=units[k];
        if(unit>0x7f) {
            isASCII=false;
            onlyLDH=false;
        } else if(unit==0x2e) {
            info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT;
            units[k]=0xfffd;
            isASCII=false;
            onlyLDH=false;
        } else if(asciiData[unit]<0) {
            onlyLDH=false;
            if(disallowNonLDHDot) {
                units[k]=0xfffd;
                isASCII=false;
            }
        }
    }

    if(!onlyLDH) {
        if(toASCII && isASCII && labelLength>63) {
            info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
        }
        return labelLength;
    }
    dest.insert(labelStart + labelLength, static_cast<char16_t>(0xfffd));
    if(dest.isBogus()) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }
    return labelLength+1;
}
// We scan the whole label and check both for whether it contains RTL characters // and whether it passes the BiDi Rule. // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find // that a domain name is a BiDi domain name (has an RTL label) only after // processing several earlier labels. void
UTS46::checkLabelBiDi(const char16_t *label, int32_t labelLength, IDNAInfo &info) const { // IDNA2008 BiDi rule // Get the directionality of the first character.
UChar32 c;
int32_t i=0;
U16_NEXT_UNSAFE(label, i, c);
uint32_t firstMask=U_MASK(u_charDirection(c)); // 1. The first character must be a character with BIDI property L, R // or AL. If it has the R or AL property, it is an RTL label; if it // has the L property, it is an LTR label. if((firstMask&~L_R_AL_MASK)!=0) {
info.isOkBiDi=false;
} // Get the directionality of the last non-NSM character.
uint32_t lastMask; for(;;) { if(i>=labelLength) {
lastMask=firstMask; break;
}
U16_PREV_UNSAFE(label, labelLength, c);
UCharDirection dir=u_charDirection(c); if(dir!=U_DIR_NON_SPACING_MARK) {
lastMask=U_MASK(dir); break;
}
} // 3. In an RTL label, the end of the label must be a character with // BIDI property R, AL, EN or AN, followed by zero or more // characters with BIDI property NSM. // 6. In an LTR label, the end of the label must be a character with // BIDI property L or EN, followed by zero or more characters with // BIDI property NSM. if( (firstMask&L_MASK)!=0 ?
(lastMask&~L_EN_MASK)!=0 :
(lastMask&~R_AL_EN_AN_MASK)!=0
) {
info.isOkBiDi=false;
} // Add the directionalities of the intervening characters.
uint32_t mask=firstMask|lastMask; while(i<labelLength) {
U16_NEXT_UNSAFE(label, i, c);
mask|=U_MASK(u_charDirection(c));
} if(firstMask&L_MASK) { // 5. In an LTR label, only characters with the BIDI properties L, EN, // ES, CS, ET, ON, BN and NSM are allowed. if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
info.isOkBiDi=false;
}
} else { // 2. In an RTL label, only characters with the BIDI properties R, AL, // AN, EN, ES, CS, ET, ON, BN and NSM are allowed. if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
info.isOkBiDi=false;
} // 4. In an RTL label, if an EN is present, no AN may be present, and // vice versa. if((mask&EN_AN_MASK)==EN_AN_MASK) {
info.isOkBiDi=false;
}
} // An RTL label is a label that contains at least one character of type // R, AL or AN. [...] // A "BIDI domain name" is a domain name that contains at least one RTL // label. [...] // The following rule, consisting of six conditions, applies to labels // in BIDI domain names. if((mask&R_AL_AN_MASK)!=0) {
info.isBiDi=true;
}
}
// Special code for the ASCII prefix of a BiDi domain name.
// The ASCII prefix is all-LTR.

// IDNA2008 BiDi rule, parts relevant to ASCII labels:
// 1. The first character must be a character with BIDI property L [...]
// 5. In an LTR label, only characters with the BIDI properties L, EN,
//    ES, CS, ET, ON, BN and NSM are allowed.
// 6. In an LTR label, the end of the label must be a character with
//    BIDI property L or EN [...]
// UTF-16 version, called for mapped ASCII prefix. // Cannot contain uppercase A-Z. // s[length-1] must be the trailing dot. static UBool
isASCIIOkBiDi(const char16_t *s, int32_t length) {
int32_t labelStart=0; for(int32_t i=0; i<length; ++i) {
char16_t c=s[i]; if(c==0x2e) { // dot if(i>labelStart) {
c=s[i-1]; if(!(0x61<=c && c<=0x7a) && !(0x30<=c && c<=0x39)) { // Last character in the label is not an L or EN. returnfalse;
}
}
labelStart=i+1;
} elseif(i==labelStart) { if(!(0x61<=c && c<=0x7a)) { // First character in the label is not an L. returnfalse;
}
} else { if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) { // Intermediate character in the label is a B, S or WS. returnfalse;
}
}
} returntrue;
}
// UTF-8 version, called for source ASCII prefix. // Can contain uppercase A-Z. // s[length-1] must be the trailing dot. static UBool
isASCIIOkBiDi(constchar *s, int32_t length) {
int32_t labelStart=0; for(int32_t i=0; i<length; ++i) { char c=s[i]; if(c==0x2e) { // dot if(i>labelStart) {
c=s[i-1]; if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a) && !(0x30<=c && c<=0x39)) { // Last character in the label is not an L or EN. returnfalse;
}
}
labelStart=i+1;
} elseif(i==labelStart) { if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a)) { // First character in the label is not an L. returnfalse;
}
} else { if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) { // Intermediate character in the label is a B, S or WS. returnfalse;
}
}
} returntrue;
}
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert. 0.42 Bemerkung:
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.