/**
 * Check if a token looks like a meaningful keyword.
 *
 * A token is rejected when it is:
 * - empty (or otherwise falsy),
 * - made up solely of ASCII letters and shorter than 3 characters,
 * - made up solely of digits, or
 * - made up solely of Unicode punctuation / symbol characters.
 *
 * @param token - Candidate token to inspect.
 * @returns `true` when none of the rejection rules above apply, otherwise `false`.
 */
function isValidKeyword(token: string): boolean {
  if (!token || token.length === 0) {
    return false;
  }
  // Skip very short English words (likely stop words or fragments)
  if (/^[a-zA-Z]+$/.test(token) && token.length < 3) {
    return false;
  }
  // Skip pure numbers (not useful for semantic search)
  if (/^\d+$/.test(token)) {
    return false;
  }
  // Skip tokens that are all punctuation
  // (the `u` flag is required for the \p{P} / \p{S} property escapes)
  if (/^[\p{P}\p{S}]+$/u.test(token)) {
    return false;
  }
  return true;
}
// NOTE(review): this is an excerpt of a larger function body. `normalized`,
// `tokens` and `useTrigram` are declared outside the visible code, as are
// `STOP_WORDS_KO`, `stripKoreanTrailingParticle` and `isUsefulKoreanStem`.

// Split into segments (English words, Chinese character sequences, etc.)
const segments = normalized.split(/[\s\p{P}]+/u).filter(Boolean);

for (const segment of segments) {
  // Japanese text often mixes scripts (kanji/kana/ASCII) without spaces.
  // Extract script-specific chunks so technical terms like "API" / "バグ" are retained.
  if (/[\u3040-\u30ff]/.test(segment)) {
    // Alternatives, in order: ASCII word, katakana run, kanji run,
    // hiragana run of at least two characters.
    const jpParts =
      segment.match(/[a-z0-9_]+|[\u30a0-\u30ffー]+|[\u4e00-\u9fff]+|[\u3040-\u309f]{2,}/g) ?? [];
    for (const part of jpParts) {
      if (/^[\u4e00-\u9fff]+$/.test(part)) {
        // Pure-kanji chunk: keep the whole run, plus bigrams outside trigram mode.
        tokens.push(part);
        if (!useTrigram) {
          for (let i = 0; i < part.length - 1; i++) {
            tokens.push(part[i] + part[i + 1]);
          }
        }
      } else {
        tokens.push(part);
      }
    }
  } else if (/[\u4e00-\u9fff]/.test(segment)) {
    // Check if segment contains CJK characters (Chinese)
    const chars = Array.from(segment).filter((c) => /[\u4e00-\u9fff]/.test(c));
    if (useTrigram) {
      // In trigram mode, push the whole contiguous CJK block (mirroring the
      // Japanese kanji path). SQLite's trigram FTS requires at least 3 characters
      // per query term — individual characters silently return no results.
      // Note: non-CJK characters inside the segment were filtered out above, so
      // CJK runs separated by other characters end up joined in one block.
      const block = chars.join("");
      if (block.length > 0) {
        tokens.push(block);
      }
    } else {
      // Default mode: unigrams + bigrams for phrase matching
      tokens.push(...chars);
      for (let i = 0; i < chars.length - 1; i++) {
        tokens.push(chars[i] + chars[i + 1]);
      }
    }
  } else if (/[\uac00-\ud7af\u3131-\u3163]/.test(segment)) {
    // For Korean (Hangul syllables and jamo), keep the word as-is unless it is
    // effectively a stop word once trailing particles are removed.
    const stem = stripKoreanTrailingParticle(segment);
    const stemIsStopWord = stem !== null && STOP_WORDS_KO.has(stem);
    if (!STOP_WORDS_KO.has(segment) && !stemIsStopWord) {
      tokens.push(segment);
    }
    // Also emit particle-stripped stems when they are useful keywords.
    if (stem && !STOP_WORDS_KO.has(stem) && isUsefulKoreanStem(stem)) {
      tokens.push(stem);
    }
  } else {
    // For non-CJK, keep as single token
    tokens.push(segment);
  }
}
// Build expanded query: original terms OR extracted keywords.
// This ensures both exact matches and keyword matches are found.
// When no keywords were extracted, the original query is used unchanged.
// NOTE(review): `keywords` and `original` are declared outside this excerpt.
const expanded = keywords.length > 0 ? `${original} OR ${keywords.join(" OR ")}` : original;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.