import DOMPurify from "dompurify"; import MarkdownIt from "markdown-it"; import markdownItTaskLists from "markdown-it-task-lists"; import { truncateText } from "./format.ts"; import { normalizeLowercaseStringOrEmpty } from "./string-coerce.ts";
/**
 * Shared markdown-it instance used by this module. Every tweak below
 * (linkify rules, core rules, renderer overrides) is applied to this object.
 */
export const md = new MarkdownIt({
html: true, // Enable HTML recognition so html_block/html_inline overrides can escape it
// Option names are markdown-it's own; see its documentation for exact semantics.
breaks: true,
linkify: true,
});
// Enable GFM strikethrough (~~text~~) to match original marked.js behavior.
// markdown-it uses <s> tags; we added "s" to allowedTags for DOMPurify.
md.enable("strikethrough");
// Disable fuzzy link detection to prevent bare filenames like "README.md"
// from being auto-linked as "http://README.md". URLs with explicit protocol
// (https://...) and emails are still linkified.
//
// Alternative considered: extensions/matrix/src/matrix/format.ts uses fuzzyLink
// with a file-extension blocklist to filter false positives at render time.
// We chose the www-only approach instead because:
//   1. Matches original marked.js GFM behavior exactly (bare domains were never linked)
//   2. No blocklist to maintain — new TLDs like .ai, .io, .dev would need constant updates
//   3. Predictable behavior — users can always use explicit https:// for any URL
md.linkify.set({ fuzzyLink: false });
// Re-enable www. prefix detection per GFM spec: bare URLs without protocol
// must start with "www." to be auto-linked. This avoids false positives on
// filenames while preserving expected behavior for "www.example.com".
// GFM spec: valid domain = alphanumeric/underscore/hyphen segments separated
// by periods, at least one period, no underscores in last two segments.
md.linkify.add("www", {
  /**
   * Decide how much of the text following the "www" prefix belongs to the link.
   *
   * @param text Full text being scanned.
   * @param pos  Index just past the matched "www" prefix; the remainder must
   *             start with "." for the candidate to be accepted.
   * @returns Number of characters after `pos` to include in the link, or 0
   *          when the remainder does not start with a "." plus a domain segment.
   */
  validate(text, pos) {
    const tail = text.slice(pos);
    // Match: . followed by domain and optional path, matching marked.js behavior.
    // Stops at whitespace, < (HTML tag boundary), or CJK characters (RFC 3986:
    // raw CJK is not valid in URLs; percent-encoded CJK like %E4%BD%A0 is fine).
    const match = tail.match(
      /^\.(?:[a-zA-Z0-9-]+\.?)+[^\s<\u2E80-\u2FFF\u3000-\u303F\u3040-\u309F\u30A0-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF\uFF01-\uFF60]*/,
    );
    if (!match) {
      return 0;
    }
    let len = match[0].length;

    // Strip trailing punctuation per GFM extended autolink spec.
    // GFM says: ?, !, ., ,, :, *, _, ~ are not part of the autolink if trailing.

    // Balance checking config: closeChar -> openChar mapping.
    // Strip trailing close chars only when unbalanced (more closes than opens).
    // For self-matching pairs like "", open === close (strip if odd count).
    const balancePairs: Record<string, string> = {
      ")": "(",
      "]": "[",
      "}": "{",
      '"': '"',
      "'": "'",
    };

    // Pre-count pairs once over the initial match so the stripping loop below
    // does not have to rescan the string for every trailing character.
    // balance[closeChar] = count(open) - count(close); negative means unbalanced.
    const balance: Record<string, number> = {};
    for (const [close, open] of Object.entries(balancePairs)) {
      balance[close] = 0;
      for (let i = 0; i < len; i++) {
        const c = tail[i];
        if (open === close) {
          // Self-matching pair (e.g., "") — toggle between 0 and 1.
          if (c === open) {
            balance[close] = balance[close] === 0 ? 1 : 0;
          }
        } else {
          // Distinct open/close (e.g., ()).
          if (c === open) {
            balance[close]++;
          } else if (c === close) {
            balance[close]--;
          }
        }
      }
    }

    // Hoisted so the loop below does not rebuild these on every iteration.
    const trailingPunctuation = /[?!.,:*_~]/;
    const alphanumeric = /[a-zA-Z0-9]/;

    while (len > 0) {
      const ch = tail[len - 1];

      // GFM trailing punctuation: ?, !, ., ,, :, *, _, ~ stripped unconditionally.
      // Semicolon is handled specially below (entity reference rule).
      if (trailingPunctuation.test(ch)) {
        len--;
        continue;
      }

      // GFM entity reference rule: strip trailing &entity; sequences.
      // Only strip ; when preceded by &<alphanumeric>+ (e.g., &amp; &lt; &hl;).
      if (ch === ";") {
        // Backward scan to find & without allocating a substring.
        let j = len - 2;
        while (j >= 0 && alphanumeric.test(tail[j])) {
          j--;
        }
        // j < len - 2 ensures at least one alphanumeric between & and ;
        if (j >= 0 && tail[j] === "&" && j < len - 2) {
          len = j;
          continue;
        }
        // Not an entity reference, stop stripping.
        break;
      }

      // Handle balanced pairs — only strip close char if unbalanced.
      const open = balancePairs[ch];
      if (open !== undefined) {
        if (open === ch) {
          // Self-matching: strip if odd count (unbalanced).
          if (balance[ch] !== 0) {
            balance[ch] = 0;
            len--;
            continue;
          }
        } else {
          // Distinct pair: strip if more closes than opens.
          if (balance[ch] < 0) {
            balance[ch]++;
            len--;
            continue;
          }
        }
      }
      break;
    }
    return len;
  },
  // Prefix the matched URL with "http://" so the generated href has a scheme.
  normalize(match) {
    match.url = "http://" + match.url;
  },
});
// Override default link validator to allow all URLs through to renderers.
// marked.js does not validate URLs at all — it generates <a>/<img> tags for
// everything and relies on DOMPurify to strip dangerous schemes.
//
// We match this behavior exactly:
//   - All URLs pass validation, including javascript:, vbscript:, file:, data:
//   - Images: renderer.rules.image shows alt text for non-data-image URLs
//   - Links: DOMPurify strips dangerous href schemes, leaving safe anchor text
//   - Blocking at validateLink would skip token generation entirely, causing raw
//     markdown source to appear instead of graceful fallbacks.
md.validateLink = () => true;
// Trim trailing CJK characters from auto-linked URLs (RFC 3986: raw CJK is
// not valid in URLs). markdown-it's built-in linkify for https:// URLs may
// swallow adjacent CJK text into the URL. This core rule runs after linkify
// and splits the CJK suffix back into a plain text token.
md.core.ruler.after("linkify", "linkify-cjk-trim", (state) => {
  for (const blockToken of state.tokens) {
    if (blockToken.type !== "inline" || !blockToken.children) {
      continue;
    }
    const children = blockToken.children;
    // Walk backward so tokens spliced in after a link never shift indices
    // that are still to be visited.
    for (let i = children.length - 1; i >= 0; i--) {
      const token = children[i];
      if (token.type !== "link_open") {
        continue;
      }
      // Only trim linkify-generated autolinks, not explicit markdown links
      // like [OpenClaw中文](https://docs.openclaw.ai) where CJK in display
      // text is intentional and href must not be rewritten.
      if (token.markup !== "linkify") {
        continue;
      }
      // Use the display text to find CJK boundary (href may be percent-encoded).
      const textToken = children[i + 1];
      if (!textToken || textToken.type !== "text") {
        continue;
      }
      const displayText = textToken.content;
      // Scan backward to find trailing CJK suffix only.
      // Middle CJK must be preserved (e.g. https://example.com/你/test stays intact);
      // only strip a contiguous CJK tail adjacent to non-URL text.
      // NOTE(review): assumes CJK_RE (defined elsewhere) has no g/y flag, so
      // repeated .test() calls are stateless — confirm.
      let cjkIdx = displayText.length;
      while (cjkIdx > 0 && CJK_RE.test(displayText[cjkIdx - 1])) {
        cjkIdx--;
      }
      // Nothing to do when there is no CJK tail, or when the whole text is CJK.
      if (cjkIdx <= 0 || cjkIdx === displayText.length) {
        continue;
      }
      // Split: URL part and CJK tail from display text.
      const trimmedDisplay = displayText.slice(0, cjkIdx);
      const cjkTail = displayText.slice(cjkIdx);
      // Rebuild href by preserving the scheme prefix that linkify added but
      // display text omits (e.g. "mailto:" for emails, "http://" for www links).
      // When displayText is not found in href (indexOf === -1) or starts at
      // index 0, no prefix is kept and href becomes the trimmed display text.
      const href = token.attrGet("href") ?? "";
      const prefixLen = href.indexOf(displayText);
      const hrefPrefix = prefixLen > 0 ? href.slice(0, prefixLen) : "";
      token.attrSet("href", hrefPrefix + trimmedDisplay);
      textToken.content = trimmedDisplay;
      // Find link_close and insert CJK text after it.
      for (let j = i + 1; j < children.length; j++) {
        if (children[j].type === "link_close") {
          const tailToken = new state.Token("text", "", 0);
          tailToken.content = cjkTail;
          children.splice(j + 1, 0, tailToken);
          break;
        }
      }
    }
  }
});
// Enable GFM task list checkboxes (- [x] / - [ ]).
// enabled: false keeps checkboxes read-only (disabled="") — task lists in
// chat messages are display-only, not interactive forms.
// label: false avoids wrapping item text in <label>, which would break
// accessibility when the item contains links (MDN warns against anchors inside labels).
md.use(markdownItTaskLists, { enabled: false, label: false });
// Mark the <input> html_inline token inside task-list items as trusted so the
// html_inline override lets it through. With label: false, the plugin generates
// only a single <input ...> token per item.
// We identify task-list items by the class="task-list-item" the plugin sets.
md.core.ruler.after("github-task-lists", "task-list-allowlist", (state) => {
  const tokens = state.tokens;
  // Start at 2: each candidate inline token needs two predecessors
  // (list_item_open, paragraph_open).
  for (let i = 2; i < tokens.length; i++) {
    const inlineChildren = tokens[i].children;
    if (tokens[i].type !== "inline" || !inlineChildren) {
      continue;
    }
    if (tokens[i - 1].type !== "paragraph_open") {
      continue;
    }
    if (tokens[i - 2].type !== "list_item_open") {
      continue;
    }
    const listItem = tokens[i - 2];
    const cls = listItem.attrGet("class") ?? "";
    if (!cls.includes("task-list-item")) {
      continue;
    }
    // Only trust the checkbox <input> token from the plugin, not other user-supplied HTML.
    // The plugin inserts an <input> at the start; user HTML elsewhere must stay escaped.
    // Only the first html_inline child beginning with "<input" + whitespace is marked.
    for (const child of inlineChildren) {
      if (child.type === "html_inline" && /^<input\s/i.test(child.content)) {
        child.meta = { taskListPlugin: true };
        break; // Only one checkbox per item
      }
    }
  }
});
// Raw HTML is never emitted verbatim (#13937): html_block and html_inline are
// both overridden so their content is escaped.
// Exception: html_inline tokens flagged by a trusted plugin (meta.taskListPlugin)
// pass through unchanged — they come from our own plugin pipeline rather than
// user input, and DOMPurify remains the final safety net.
md.renderer.rules.html_block = (tokens, idx) => {
  // Escape the block's raw content and terminate it with a newline.
  const escapedBlock = escapeHtml(tokens[idx].content);
  return escapedBlock + "\n";
};
// Inline HTML renderer: emit the token verbatim only when it carries the
// meta.taskListPlugin === true marker; everything else is escaped.
md.renderer.rules.html_inline = (tokens, idx) => {
  const inlineToken = tokens[idx];
  const isTrusted = inlineToken.meta?.taskListPlugin === true;
  return isTrusted ? inlineToken.content : escapeHtml(inlineToken.content);
};
// Override image to only allow base64 data URIs (#15437).
// An <img> is emitted only when the trimmed src matches INLINE_DATA_IMAGE_RE;
// any other src renders as the escaped alt text instead.
md.renderer.rules.image = (tokens, idx) => {
  const token = tokens[idx];
  const src = token.attrGet("src")?.trim() ?? "";
  // Use token.content which preserves raw markdown formatting (e.g. **bold**)
  // to match original marked.js behavior.
  const alt = normalizeMarkdownImageLabel(token.content);
  if (!INLINE_DATA_IMAGE_RE.test(src)) {
    return escapeHtml(alt);
  }
  // Both attribute values are escaped before interpolation.
  return `<img class="markdown-inline-image" src="${escapeHtml(src)}" alt="${escapeHtml(alt)}">`;
};
// Override fenced code blocks with copy button + JSON collapse
md.renderer.rules.fence = (tokens, idx) => { const token = tokens[idx]; // token.info contains the full fence info string (e.g., "json title=foo"); // extract only the first whitespace-separated token as the language. const lang = token.info.trim().split(/\s+/)[0] || ""; const text = token.content; const langClass = lang ? ` class="language-${escapeHtml(lang)}"` : ""; const safeText = escapeHtml(text); const codeBlock = `<pre><code${langClass}>${safeText}</code></pre>`; const langLabel = lang ? `<span class="code-block-lang">${escapeHtml(lang)}</span>` : ""; const attrSafe = escapeHtml(text); const copyBtn = `<button type="button"class="code-block-copy" data-code="${attrSafe}" aria-label="Copy code"><span class="code-block-copy__idle">Copy</span><span class="code-block-copy__done">Copied!</span></button>`; const header = `<div class="code-block-header">${langLabel}${copyBtn}</div>`;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.