import { fetchFirecrawlContent } from
"../extensions/firecrawl/api.ts";
import { extractReadableContent } from
"../src/agents/tools/web-tools.js";
import { formatErrorMessage } from
"../src/infra/errors.ts";
const DEFAULT_URLS = [
"https://en.wikipedia.org/wiki/Web_scraping",
"https://news.ycombinator.com/",
"https://www.apple.com/iphone/",
"https://www.nytimes.com/",
"https://www.reddit.com/r/javascript/",
];
const urls = process.argv.slice(
2);
const targets = urls.length >
0 ? urls : DEFAULT_URLS;
const apiKey = process.env.FIRECRAWL_API_KEY;
const baseUrl = process.env.FIRECRAWL_BASE_URL ??
"https://api.firecrawl.dev";
const userAgent =
"Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36";
const timeoutMs =
30_
000;
function truncate(value: string, max =
180): string {
if (!value) {
return "";
}
return value.length > max ? `${value.slice(
0, max)}…` : value;
}
async
function fetchHtml(url: string): Promise<{
ok:
boolean;
status: number;
contentType: string;
finalUrl: string;
body: string;
}> {
const controller =
new AbortController();
const timer = setTimeout(() => controller.abort(), timeoutMs);
try {
const res = await fetch(url, {
method:
"GET",
headers: { Accept:
"*/*", "User-Agent": userAgent },
signal: controller.signal,
});
const contentType = res.headers.get(
"content-type") ??
"application/octet-stream";
const body = await res.text();
return {
ok: res.ok,
status: res.status,
contentType,
finalUrl: res.url || url,
body,
};
}
finally {
clearTimeout(timer);
}
}
async
function run() {
if (!apiKey) {
console.log(
"FIRECRAWL_API_KEY not set. Firecrawl comparisons will be skipped.");
}
for (
const url of targets) {
console.log(`\n=== ${url}`);
let localStatus =
"skipped";
let localTitle =
"";
let localText =
"";
let localError: string | undefined;
try {
const res = await fetchHtml(url);
if (!res.ok) {
localStatus = `http ${res.status}`;
}
else if (!res.contentType.includes(
"text/html")) {
localStatus = `non-html (${res.contentType})`;
}
else {
const readable = await extractReadableContent({
html: res.body,
url: res.finalUrl,
extractMode:
"markdown",
});
if (readable?.text) {
localStatus =
"readability";
localTitle = readable.title ??
"";
localText = readable.text;
}
else {
localStatus =
"readability-empty";
}
}
}
catch (error) {
localStatus =
"error";
localError = formatErrorMessage(error);
}
console.log(`local: ${localStatus} len=${localText.length} title=${truncate(localTi
tle, 80)}`);
if (localError) {
console.log(`local error: ${localError}`);
}
if (localText) {
console.log(`local sample: ${truncate(localText)}`);
}
if (apiKey) {
try {
const firecrawl = await fetchFirecrawlContent({
url,
extractMode: "markdown",
apiKey,
baseUrl,
onlyMainContent: true,
maxAgeMs: 172_800_000,
proxy: "auto",
storeInCache: true,
timeoutSeconds: 60,
});
console.log(
`firecrawl: ok len=${firecrawl.text.length} title=${truncate(
firecrawl.title ?? "",
80,
)} status=${firecrawl.status ?? "n/a"}`,
);
if (firecrawl.warning) {
console.log(`firecrawl warning: ${firecrawl.warning}`);
}
if (firecrawl.text) {
console.log(`firecrawl sample: ${truncate(firecrawl.text)}`);
}
} catch (error) {
const message = formatErrorMessage(error);
console.log(`firecrawl: error ${message}`);
}
}
}
process.exit(0);
}
run().catch((error) => {
console.error(error);
process.exit(1);
});