UVector32 boundary(inString.countChar32() + 1, status); if (U_FAILURE(status)) { return 0;
}
int32_t numBreaks = 0;
int32_t codePointLength = inString.countChar32(); // The ML algorithm groups six char and evaluates whether the 4th char is a breakpoint. // In each iteration, it evaluates the 4th char and then moves forward one char like a sliding // window. Initially, the first six values in the indexList are [-1, -1, 0, 1, 2, 3]. After // moving forward, finally the last six values in the indexList are // [length-4, length-3, length-2, length-1, -1, -1]. The "+4" here means four extra "-1".
int32_t indexSize = codePointLength + 4;
LocalMemory<int32_t> indexList(static_cast<int32_t*>(uprv_malloc(indexSize * sizeof(int32_t)))); if (indexList.isNull()) {
status = U_MEMORY_ALLOCATION_ERROR; return 0;
}
int32_t numCodeUnits = initIndexList(inString, indexList.getAlias(), status);
// Add a break for the start.
boundary.addElement(0, status);
numBreaks++; if (U_FAILURE(status)) return 0;
// Add a break for the end if there is not one there already. if (boundary.lastElementi() != inString.countChar32()) {
boundary.addElement(inString.countChar32(), status);
numBreaks++;
}
if (utextPos > prevUTextPos) { if (utextPos != rangeStart ||
(utextPos > 0 &&
fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) {
foundBreaks.push(utextPos, status);
correctedNumBreaks++;
}
} else { // Normalization expanded the input text, the dictionary found a boundary // within the expansion, giving two boundaries with the same index in the // original text. Ignore the second. See ticket #12918.
--numBreaks;
}
prevCPPos = cpPos;
prevUTextPos = utextPos;
}
(void)prevCPPos; // suppress compiler warnings about unused variable
UChar32 nextChar = utext_char32At(inText, rangeEnd); if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) { // In phrase breaking, there has to be a breakpoint between Cj character and // the number/open punctuation. // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「 // E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 9 // E.g. しかもロゴがUnicode! -> しかも▁ロゴが▁Unicode!-> breakpoint between が and U if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) {
foundBreaks.popi();
correctedNumBreaks--;
}
}
// Prepares the sliding-window index list for inString.
//
// Every one of the (countChar32() + 4) slots is first filled with -1, then
// slots [2..5] receive the UTF-16 code unit offsets of the first (up to four)
// code points of inString. Slots 0 and 1 are never written here, so they keep
// the -1 padding, as do any slots past the seeded ones.
//
// @param inString  The text whose code point offsets are recorded.
// @param indexList Caller-provided buffer; must hold at least
//                  inString.countChar32() + 4 int32_t entries.
// @param status    Only read: if it already signals failure, nothing is
//                  written to indexList and 0 is returned.
// @return The code unit offset just past the last seeded code point
//         (0 when inString is empty or status is a failure).
int32_t MlBreakEngine::initIndexList(const UnicodeString &inString, int32_t *indexList,
                                     UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return 0;
    }

    const int32_t codePointCount = inString.countChar32();

    // Filling each byte with 0xff yields -1 in every int32_t slot.
    uprv_memset(indexList, 0xff, (codePointCount + 4) * sizeof(int32_t));

    // At most the first four code points are seeded, landing in slots 2..5.
    const int32_t seedCount = (codePointCount < 4) ? codePointCount : 4;

    int32_t unitOffset = 0;
    for (int32_t slot = 0; slot < seedCount; ++slot) {
        indexList[slot + 2] = unitOffset;
        // Advance by one code point: 1 code unit for BMP, 2 for supplementary.
        unitOffset += U16_LENGTH(inString.char32At(unitOffset));
    }
    return unitOffset;
}
void MlBreakEngine::loadMLModel(UErrorCode &error) { // BudouX's model consists of thirteen categories, each of which is make up of pairs of the // feature and its score. As integrating it into jaml.txt, we define thirteen kinds of key and // value to represent the feature and the corresponding score respectively.
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.