Files
smart-management-auto-test/node_modules/mcp-evals/dist/index.js

112 lines
4.2 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { Experimental_StdioMCPTransport } from "ai/mcp-stdio";
import { experimental_createMCPClient, streamText, } from "ai";
import { openai } from "@ai-sdk/openai";
// Fallback model used by runEvals and grade when the caller does not pass one.
const defaultModel = openai("gpt-4o");
/**
 * Answers a prompt with the help of the tools exposed by an MCP server.
 *
 * Starts the server script through `tsx` over a stdio transport, hands the
 * server's tools to the model, streams the response and concatenates every
 * `text-delta` chunk into the returned string.
 *
 * @param {object} [model=defaultModel] - Model passed to `streamText`.
 * @param {string} prompt - User prompt forwarded to the model.
 * @param {string} serverPath - Path of the MCP server script executed with `tsx`.
 * @returns {Promise<string>} The concatenated text produced by the model.
 * @throws Rethrows (after logging) any error raised while listing tools or
 *   consuming the stream.
 */
export async function runEvals(model = defaultModel, prompt, serverPath) {
    const transport = new Experimental_StdioMCPTransport({
        command: "tsx",
        args: [serverPath],
        // Forward the current environment, skipping entries whose value is undefined.
        env: Object.fromEntries(Object.entries(process.env).filter(([, value]) => value !== undefined)),
    });
    const client = await experimental_createMCPClient({
        transport,
    });
    try {
        // Tool discovery lives inside the try block so that a failure here
        // still reaches the cleanup in `finally`.
        const tools = await client.tools();
        const result = streamText({
            model,
            tools,
            system: "You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
            prompt,
            maxRetries: 1,
            maxSteps: 10,
            onError: ({ error }) => {
                console.error(error);
            },
        });
        let fullText = '';
        for await (const chunk of result.fullStream) {
            // Only text deltas contribute to the answer; other chunk types are skipped.
            if (chunk.type === 'text-delta') {
                fullText += chunk.textDelta;
            }
        }
        return fullText;
    }
    catch (error) {
        console.error('Error in runEvals:', error);
        throw error;
    }
    finally {
        // Always release the client once the stream has been fully consumed
        // (or has failed); otherwise every call leaves a server process behind.
        // Cleanup is best-effort so it never masks the real result or error.
        try {
            await client.close();
        }
        catch (closeError) {
            console.error('Error closing MCP client in runEvals:', closeError);
        }
    }
}
/**
 * Produces an answer for `prompt` via `runEvals`, then asks the model to score it.
 *
 * @param {object} [model=defaultModel] - Model used both to answer and to grade.
 * @param {string} prompt - The user question under evaluation.
 * @param {string} [serverPath] - Path of the MCP server script; when falsy,
 *   `process.argv[3]` is used instead.
 * @returns {Promise<string>} The grader's raw text reply (the system prompt
 *   asks for a JSON object, but the text is returned unparsed).
 * @throws {Error} If neither `serverPath` nor `process.argv[3]` yields a path.
 */
export async function grade(model = defaultModel, prompt, serverPath) {
    // An explicit argument wins; otherwise fall back to the CLI argument.
    const resolvedServerPath = serverPath ? serverPath : process.argv[3];
    if (!resolvedServerPath) {
        throw new Error('Server path not provided');
    }

    const answer = await runEvals(model, prompt, resolvedServerPath);

    // Scoring rubric handed to the grader as its system prompt.
    const graderInstructions = `You are an expert evaluator assessing how well an LLM answers a given question. Review the provided answer and score it from 1 to 5 in each of the following categories:
Accuracy Does the answer contain factual errors or hallucinations?
Completeness Does the answer fully address all parts of the question?
Relevance Is the information directly related to the question?
Clarity Is the explanation easy to understand and well-structured?
Reasoning Does the answer show logical thinking or provide evidence or rationale?
Return your evaluation as a JSON object in the format:
{
"accuracy": 1-5,
"completeness": 1-5,
"relevance": 1-5,
"clarity": 1-5,
"reasoning": 1-5,
"overall_comments": "A short paragraph summarizing the strengths and weaknesses of the answer."
}`;
    // The grader sees the original question together with the answer it must judge.
    const graderInput = `Here is the user input: ${prompt}
Here is the LLM's answer: ${answer}`;

    const grading = streamText({
        model,
        system: graderInstructions,
        prompt: graderInput,
        maxRetries: 1,
        maxSteps: 10,
        onError: ({ error }) => {
            console.error(error);
        },
    });

    // Walk the stream to its end before reading the aggregated text.
    for await (const _part of grading.fullStream) {
        // Intentionally empty: individual parts are discarded.
    }
    return await grading.text;
}
/**
 * Runs every evaluation in `config.evals` sequentially and collects the outcomes.
 *
 * A failing evaluation does not stop the run: its error is logged and stored
 * as `{ error: message }` under the evaluation's name.
 *
 * @param {object} config - Object providing `evals` (iterable of entries with
 *   `name` and `run(model)`) and `model`, which is passed to each `run`.
 * @param {string} serverPath - Path of the MCP server script started with `tsx`.
 * @returns {Promise<Map>} Map from evaluation name to its result or error record.
 */
export async function runAllEvals(config, serverPath) {
    const outcomes = new Map();
    let transport;
    try {
        // Forward the current environment, skipping entries whose value is undefined.
        const definedEnv = Object.fromEntries(Object.entries(process.env).filter(([, value]) => value !== undefined));
        transport = new Experimental_StdioMCPTransport({
            command: "tsx",
            args: [serverPath],
            env: definedEnv,
        });
        // The client is created here but not referenced again in this function.
        await experimental_createMCPClient({ transport });

        for (const evaluation of config.evals) {
            console.log(`Running ${evaluation.name}...`);
            try {
                const outcome = await evaluation.run(config.model);
                outcomes.set(evaluation.name, outcome);
            }
            catch (failure) {
                console.error(`Error running ${evaluation.name}:`, failure);
                // Non-Error throwables are stringified so the record always holds text.
                let message;
                if (failure instanceof Error) {
                    message = failure.message;
                }
                else {
                    message = String(failure);
                }
                outcomes.set(evaluation.name, { error: message });
            }
        }
        return outcomes;
    }
    finally {
        // Shut the transport down whether the run completed or threw;
        // `close` is invoked only if the transport provides it.
        if (transport) {
            await transport.close?.();
        }
    }
}
// Export everything needed by consumers
export * from './types.js';
export { metrics } from './metrics.js';
//# sourceMappingURL=index.js.map