112 lines
4.2 KiB
JavaScript
112 lines
4.2 KiB
JavaScript
import { Experimental_StdioMCPTransport } from "ai/mcp-stdio";
|
||
import { experimental_createMCPClient, streamText, } from "ai";
|
||
import { openai } from "@ai-sdk/openai";
|
||
const defaultModel = openai("gpt-4o");
|
||
/**
 * Runs one prompt against the tools exposed by an MCP server and returns the
 * text the model produced.
 *
 * The server is launched as a child process (`tsx <serverPath>`) over a stdio
 * MCP transport, its tools are handed to `streamText`, and every `text-delta`
 * chunk of the resulting stream is concatenated into the returned string.
 *
 * @param {object} [model=defaultModel] - Language model passed to `streamText`.
 * @param {string} prompt - User query the model should answer with the tools.
 * @param {string} serverPath - Path of the MCP server script run with `tsx`.
 * @returns {Promise<string>} The concatenated text output of the model.
 * @throws Rethrows any error raised while listing tools or streaming, after
 *   logging it.
 */
export async function runEvals(model = defaultModel, prompt, serverPath) {
    const transport = new Experimental_StdioMCPTransport({
        command: "tsx",
        args: [serverPath],
        // Forward the current environment, dropping entries with no value.
        env: Object.fromEntries(Object.entries(process.env).filter(([, value]) => value !== undefined)),
    });
    const client = await experimental_createMCPClient({
        transport,
    });
    try {
        const tools = await client.tools();
        const result = streamText({
            model,
            tools,
            system: "You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
            prompt,
            maxRetries: 1,
            maxSteps: 10,
            onError: ({ error }) => {
                console.error(error);
            },
        });
        let fullText = '';
        for await (const chunk of result.fullStream) {
            if (chunk.type === 'text-delta') {
                fullText += chunk.textDelta;
            }
        }
        return fullText;
    }
    catch (error) {
        console.error('Error in runEvals:', error);
        throw error;
    }
    finally {
        // Always shut the client down so the spawned server process does not
        // outlive this call. A cleanup failure is logged rather than thrown so
        // it cannot replace the answer or mask the original error.
        try {
            await client.close();
        }
        catch (closeError) {
            console.error('Error closing MCP client in runEvals:', closeError);
        }
    }
}
|
||
/**
 * Answers `prompt` with the MCP server's tools (via `runEvals`) and then asks
 * the model to grade that answer.
 *
 * @param {object} [model=defaultModel] - Language model used both to produce
 *   the answer and to grade it.
 * @param {string} prompt - The user question being evaluated.
 * @param {string} [serverPath] - Path of the MCP server script; when falsy,
 *   `process.argv[3]` is used instead.
 * @returns {Promise<string>} The grader's full text response. The system
 *   prompt asks for a JSON object of 1-5 scores plus `overall_comments`, but
 *   the text is returned as-is without parsing.
 * @throws {Error} 'Server path not provided' when neither `serverPath` nor
 *   `process.argv[3]` is set.
 */
export async function grade(model = defaultModel, prompt, serverPath) {
    const finalServerPath = serverPath || process.argv[3]; // Use provided serverPath or CLI args
    if (!finalServerPath) {
        throw new Error('Server path not provided');
    }
    const answer = await runEvals(model, prompt, finalServerPath);
    const evalSystemPrompt = `You are an expert evaluator assessing how well an LLM answers a given question. Review the provided answer and score it from 1 to 5 in each of the following categories:
Accuracy – Does the answer contain factual errors or hallucinations?
Completeness – Does the answer fully address all parts of the question?
Relevance – Is the information directly related to the question?
Clarity – Is the explanation easy to understand and well-structured?
Reasoning – Does the answer show logical thinking or provide evidence or rationale?
Return your evaluation as a JSON object in the format:
{
"accuracy": 1-5,
"completeness": 1-5,
"relevance": 1-5,
"clarity": 1-5,
"reasoning": 1-5,
"overall_comments": "A short paragraph summarizing the strengths and weaknesses of the answer."
}`;
    const evalPrompt = `Here is the user input: ${prompt}
Here is the LLM's answer: ${answer}`;
    const evalResult = streamText({
        model,
        maxRetries: 1,
        maxSteps: 10,
        system: evalSystemPrompt,
        prompt: evalPrompt,
        onError: ({ error }) => {
            console.error(error);
        },
    });
    // Drain the stream so the run finishes before `text` is read; the
    // individual chunks are not needed here.
    for await (const _chunk of evalResult.fullStream) {
        // intentionally empty
    }
    return await evalResult.text;
}
|
||
/**
 * Runs every evaluation in `config.evals` sequentially and collects the
 * outcomes by evaluation name.
 *
 * An MCP client is started for `serverPath` (`tsx <serverPath>` over stdio)
 * before the evaluations run and is shut down afterwards. Each evaluation is
 * invoked as `evaluation.run(config.model)`; the client created here is not
 * passed to it.
 * NOTE(review): the client is otherwise unused — presumably it only verifies
 * that the server starts; confirm.
 *
 * @param {{ model: object, evals: Iterable<{ name: string, run: Function }> }} config
 *   Evaluation configuration.
 * @param {string} serverPath - Path of the MCP server script run with `tsx`.
 * @returns {Promise<Map<string, unknown>>} Map from evaluation name to its
 *   result, or to `{ error: message }` when that evaluation threw.
 * @throws {TypeError} When `config.evals` is missing or not iterable.
 */
export async function runAllEvals(config, serverPath) {
    if (config?.evals == null || typeof config.evals[Symbol.iterator] !== 'function') {
        throw new TypeError('config.evals must be an iterable of evaluations');
    }
    const results = new Map();
    let transport;
    let client;
    try {
        transport = new Experimental_StdioMCPTransport({
            command: "tsx",
            args: [serverPath],
            // Forward the current environment, dropping entries with no value.
            env: Object.fromEntries(Object.entries(process.env).filter(([, value]) => value !== undefined)),
        });
        client = await experimental_createMCPClient({
            transport,
        });
        for (const evaluation of config.evals) {
            console.log(`Running ${evaluation.name}...`);
            try {
                const result = await evaluation.run(config.model);
                results.set(evaluation.name, result);
            }
            catch (error) {
                // One failing evaluation must not abort the remaining ones.
                console.error(`Error running ${evaluation.name}:`, error);
                results.set(evaluation.name, { error: error instanceof Error ? error.message : String(error) });
            }
        }
        return results;
    }
    finally {
        // Clean up: prefer closing the client; fall back to the raw transport
        // when the client was never created.
        if (client) {
            await client.close();
        }
        else if (transport) {
            await transport.close?.();
        }
    }
}
|
||
// Export everything needed by consumers
|
||
export * from './types.js';
|
||
export { metrics } from './metrics.js';
|
||
//# sourceMappingURL=index.js.map
|