// Group by model const resultsByModel: Record<string, EvaluatedResult[]> = {}; for (const result of results) { if (!resultsByModel[result.modelName]) {
resultsByModel[result.modelName] = [];
}
resultsByModel[result.modelName].push(result);
}
let minorCount = 0;
let significantCount = 0;
let criticalCount = 0;
for (const r of runs) { if (r.evaluationResult?.issues) { for (const issue of r.evaluationResult.issues) { if (issue.severity === "minor") minorCount++; elseif (issue.severity === "significant") significantCount++; elseif (issue.severity === "critical") criticalCount++;
}
}
}
if (modelsWithFailures) {
summary += `\n- **Models with at least one failure:** ${modelsWithFailures}`;
} return summary;
}
/**
 * CLI entry point.
 *
 * Parses the command-line options, narrows the model and prompt lists to the
 * requested subset, resolves (and optionally wipes) the base results
 * directory, configures logging, loads the schemas and the optional catalog
 * rules file, and finally writes `summary.md` into every per-model output
 * directory that exists.
 *
 * Terminates the process with exit code 1 when a `--model` / `--prompt`
 * filter matches nothing, or when `--clean-results` targets a filesystem root.
 */
async function main(): Promise<void> {
  const argv = await yargs(hideBin(process.argv))
    .option("log-level", {
      type: "string",
      description: "Set the logging level",
      default: "info",
      choices: ["debug", "info", "warn", "error"],
    })
    .option("results", {
      type: "string",
      description: "Directory to keep output files. If not specified, uses results/output-<model>. If specified, uses the provided directory (appending output-<model>).",
      coerce: (arg) => (arg === undefined ? true : arg),
      default: true,
    })
    .option("runs-per-prompt", {
      type: "number",
      description: "Number of times to run each prompt",
      default: 1,
    })
    .option("model", {
      type: "string",
      array: true,
      description: "Filter models by exact name",
      default: [],
      choices: modelsToTest.map((m) => m.name),
    })
    .option("prompt", {
      type: "string",
      array: true,
      description: "Filter prompts by name prefix",
    })
    .option("eval-model", {
      type: "string",
      description: "Model to use for evaluation",
      default: "gemini-2.5-flash",
      choices: modelsToTest.map((m) => m.name),
    })
    .option("clean-results", {
      type: "boolean",
      description: "Clear the output directory before starting",
      default: false,
    })
    .help()
    .alias("h", "help")
    .strict().argv;

  // Per-model output directory name: "/" and ":" in the model name are
  // replaced with "_" so the name is a single path segment.
  const modelOutputDirName = (modelName: string): string =>
    `output-${modelName.replace(/[\/:]/g, "_")}`;

  // Filter models by exact name.
  let filteredModels = modelsToTest;
  if (argv.model && argv.model.length > 0) {
    const modelNames = argv.model as string[];
    filteredModels = modelsToTest.filter((m) => modelNames.includes(m.name));
    if (filteredModels.length === 0) {
      logger.error(`No models found matching: ${modelNames.join(", ")}.`);
      process.exit(1);
    }
  }

  // Filter prompts by name prefix.
  let filteredPrompts = prompts;
  if (argv.prompt && argv.prompt.length > 0) {
    const promptPrefixes = argv.prompt as string[];
    filteredPrompts = prompts.filter((p) =>
      promptPrefixes.some((prefix) => p.name.startsWith(prefix))
    );
    if (filteredPrompts.length === 0) {
      logger.error(
        `No prompt found with prefix "${promptPrefixes.join(", ")}".`
      );
      process.exit(1);
    }
  }

  // Determine the base output directory. Per-model subdirectories are
  // derived from it (see modelOutputDirName).
  let resultsBaseDir: string | undefined;
  const resultsArg = argv.results;
  if (typeof resultsArg === "string") {
    resultsBaseDir = resultsArg;
  } else if (resultsArg === true) {
    // `--results` was not given a path: fall back to the default directory.
    resultsBaseDir = "results";
  }

  // Clean results: remove the whole base directory, whether it is the
  // default "results" directory or a user-supplied path.
  if (
    argv["clean-results"] &&
    resultsBaseDir &&
    fs.existsSync(resultsBaseDir)
  ) {
    // Safety net: never recursively delete a filesystem root (e.g. "/").
    const resolvedBaseDir = path.resolve(resultsBaseDir);
    if (resolvedBaseDir === path.parse(resolvedBaseDir).root) {
      logger.error(
        `Refusing to clean results directory "${resolvedBaseDir}": it is a filesystem root.`
      );
      process.exit(1);
    }
    fs.rmSync(resultsBaseDir, { recursive: true, force: true });
  }

  // Set up the logger. A log directory is only passed when exactly one model
  // is selected (that model's output directory); with several models, or with
  // no results directory, setupLogger receives `undefined` as the directory.
  const logDir =
    resultsBaseDir && filteredModels.length === 1
      ? path.join(resultsBaseDir, modelOutputDirName(filteredModels[0].name))
      : undefined;
  setupLogger(logDir, argv["log-level"]);

  const schemas = loadSchemas();
  const catalogRulesPath = path.join(
    __dirname,
    "../../json/standard_catalog_rules.txt"
  );
  // Catalog rules are optional: a missing file only produces a warning.
  let catalogRules: string | undefined;
  if (fs.existsSync(catalogRulesPath)) {
    catalogRules = fs.readFileSync(catalogRulesPath, "utf-8");
  } else {
    logger.warn(
      `Catalog rules file not found at ${catalogRulesPath}. Proceeding without specific catalog rules.`
    );
  }

  // NOTE(review): `summary` is not declared in the visible part of this
  // function; presumably it is produced by evaluation steps that are not
  // shown here — confirm it is in scope before relying on this block.
  if (resultsBaseDir) {
    // Write the same summary into each model's output directory, skipping
    // models whose directory does not exist.
    for (const model of filteredModels) {
      const modelDir = path.join(resultsBaseDir, modelOutputDirName(model.name));
      if (fs.existsSync(modelDir)) {
        fs.writeFileSync(path.join(modelDir, "summary.md"), summary);
      }
    }
  }
}
// Run the CLI only when this file is executed directly, not when it is
// loaded as a module by other code.
if (require.main === module) {
  main().catch((error: unknown) => {
    console.error(error);
    // Report the failure to the calling shell/CI instead of exiting with 0.
    process.exitCode = 1;
  });
}
/*
 * Measurement V0.5 in percent
 * Processing time: 0.12 seconds
 * (preprocessed on 2026-06-10)
 *
 * The information on this website has been carefully compiled to the best of
 * our knowledge. However, no guarantee is given as to the completeness,
 * correctness, or quality of the information provided.
 *
 * Note:
 * The syntax highlighting and the measurement are still experimental.
 */