Update test framework: fix run_tests.py to support all test files, add auto-import-check for test files

This commit is contained in:
qiaoxinjiu
2026-05-09 15:11:30 +08:00
parent eb053a347f
commit eaba8328da
21739 changed files with 2236758 additions and 719 deletions

112
node_modules/mcp-evals/dist/index.js generated vendored Normal file
View File

@@ -0,0 +1,112 @@
import { Experimental_StdioMCPTransport } from "ai/mcp-stdio";
import { experimental_createMCPClient, streamText, } from "ai";
import { openai } from "@ai-sdk/openai";
// Model used by runEvals() and grade() when the caller does not pass one.
const defaultModel = openai("gpt-4o");
/**
 * Sends `prompt` to `model` with the tools exposed by an MCP server and
 * returns the concatenated text of the streamed answer.
 *
 * The MCP client is created over a stdio transport configured to run
 * `tsx <serverPath>` with the current process environment (entries whose
 * value is `undefined` are dropped).
 *
 * @param {*} [model=defaultModel] - Model handed to `streamText`.
 * @param {string} prompt - User prompt to answer.
 * @param {string} serverPath - Path passed as the sole argument to `tsx`.
 * @returns {Promise<string>} All `text-delta` chunks joined in arrival order.
 * @throws Rethrows any error raised while listing tools or reading the stream.
 */
export async function runEvals(model = defaultModel, prompt, serverPath) {
    const transport = new Experimental_StdioMCPTransport({
        command: "tsx",
        args: [serverPath],
        env: Object.fromEntries(Object.entries(process.env).filter(([_, v]) => v !== undefined))
    });
    const client = await experimental_createMCPClient({
        transport,
    });
    try {
        // Listing tools happens inside the try so that a failure here is
        // logged like any other error and the client is still closed.
        const tools = await client.tools();
        const result = streamText({
            model,
            tools,
            system: "You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
            prompt,
            maxRetries: 1,
            maxSteps: 10,
            onError: ({ error }) => {
                console.error(error);
            },
        });
        let fullText = '';
        for await (const chunk of result.fullStream) {
            if (chunk.type === 'text-delta') {
                fullText += chunk.textDelta;
            }
        }
        return fullText;
    }
    catch (error) {
        console.error('Error in runEvals:', error);
        throw error;
    }
    finally {
        // The stream has been fully consumed (or has failed) by this point,
        // so the client can be released. A failing close is only logged so it
        // cannot replace the answer or the original error.
        try {
            await client.close();
        }
        catch (closeError) {
            console.error('Error closing MCP client:', closeError);
        }
    }
}
/**
 * Answers `prompt` through runEvals() and then asks `model` to score that
 * answer against a fixed five-category rubric.
 *
 * @param {*} [model=defaultModel] - Model used both for answering and grading.
 * @param {string} prompt - The user question being evaluated.
 * @param {string} [serverPath] - Server path forwarded to runEvals(); when
 *   falsy, `process.argv[3]` is used instead.
 * @returns {Promise<string>} The grader's full text output (the rubric asks
 *   for a JSON object, but the text is returned unparsed).
 * @throws {Error} 'Server path not provided' when neither source yields a path.
 */
export async function grade(model = defaultModel, prompt, serverPath) {
    // Prefer the explicit argument; otherwise fall back to the CLI argument.
    const resolvedServerPath = serverPath ? serverPath : process.argv[3];
    if (!resolvedServerPath) {
        throw new Error('Server path not provided');
    }
    const answer = await runEvals(model, prompt, resolvedServerPath);
    // The rubric text is sent verbatim as the system prompt.
    const rubric = `You are an expert evaluator assessing how well an LLM answers a given question. Review the provided answer and score it from 1 to 5 in each of the following categories:
Accuracy Does the answer contain factual errors or hallucinations?
Completeness Does the answer fully address all parts of the question?
Relevance Is the information directly related to the question?
Clarity Is the explanation easy to understand and well-structured?
Reasoning Does the answer show logical thinking or provide evidence or rationale?
Return your evaluation as a JSON object in the format:
{
"accuracy": 1-5,
"completeness": 1-5,
"relevance": 1-5,
"clarity": 1-5,
"reasoning": 1-5,
"overall_comments": "A short paragraph summarizing the strengths and weaknesses of the answer."
}`;
    const gradingInput = `Here is the user input: ${prompt}
Here is the LLM's answer: ${answer}`;
    const logStreamError = ({ error }) => {
        console.error(error);
    };
    const grading = streamText({
        model,
        maxRetries: 1,
        maxSteps: 10,
        system: rubric,
        prompt: gradingInput,
        onError: logStreamError,
    });
    // Read the stream to its end, discarding the parts, before reading `text`.
    for await (const ignoredPart of grading.fullStream) {
    }
    const verdict = await grading.text;
    return verdict;
}
/**
 * Runs every entry of `config.evals` sequentially and collects the outcomes.
 *
 * Before the evaluations run, a stdio MCP transport configured for
 * `tsx <serverPath>` is constructed and an MCP client is created over it; the
 * transport is closed again once all evaluations have finished or an error
 * escapes.
 *
 * @param {{ model: *, evals: Iterable<{ name: string, run: Function }> }} config
 *   Each evaluation's `run` is called with `config.model`.
 * @param {string} serverPath - Path passed as the sole argument to `tsx`.
 * @returns {Promise<Map<string, *>>} Map from evaluation name to its result,
 *   or to `{ error: message }` when that evaluation threw.
 */
export async function runAllEvals(config, serverPath) {
    const outcomes = new Map();
    let stdioTransport;
    try {
        // Forward only environment entries that actually carry a value.
        const definedEnvEntries = Object.entries(process.env).filter((entry) => entry[1] !== undefined);
        stdioTransport = new Experimental_StdioMCPTransport({
            command: "tsx",
            args: [serverPath],
            env: Object.fromEntries(definedEnvEntries),
        });
        // The client handle is not used by the evaluations below; it is only created.
        await experimental_createMCPClient({ transport: stdioTransport });
        for (const entry of config.evals) {
            const label = entry.name;
            console.log(`Running ${label}...`);
            let outcome;
            try {
                outcome = await entry.run(config.model);
            }
            catch (failure) {
                // One failing evaluation is recorded and does not stop the rest.
                console.error(`Error running ${label}:`, failure);
                let message;
                if (failure instanceof Error) {
                    message = failure.message;
                }
                else {
                    message = String(failure);
                }
                outcome = { error: message };
            }
            outcomes.set(label, outcome);
        }
    }
    finally {
        // Release the transport if it was constructed, whatever happened above.
        if (stdioTransport) {
            await stdioTransport.close?.();
        }
    }
    return outcomes;
}
// Export everything needed by consumers
export * from './types.js';
export { metrics } from './metrics.js';
//# sourceMappingURL=index.js.map