feat: 新增文档源和技能管理相关功能

1. 新增文档源管理模块(documentSource)
   - 控制器:documentSourceController.py
   - DAO层:documentSourceDao.py
   - 模型:documentSourceModel.py
   - 服务层:documentSourceService.py

2. 新增技能管理模块(skill)
   - 控制器:skillController.py
   - DAO层:skillDao.py
   - 模型:skillModel.py
   - 服务层:skillService.py

3. 新增AI服务(aiService.py)

4. 新增配置文件
   - AI配置:config/ai_config.py
   - 技能配置:config/skills/test-case-generator/

5. 新增SQL脚本
   - 文档权限:add_document_permissions.sql
   - 模块状态字段:add_module_status_field.sql
   - 文档源表:create_document_source_table.sql
   - 技能规则:skills_rules_pgsql.sql
This commit is contained in:
qiaoxinjiu
2026-05-18 10:23:07 +08:00
parent 65524de6fc
commit 420b9e37fa
38 changed files with 9613 additions and 0 deletions

View File

@@ -0,0 +1,266 @@
# encoding: UTF-8
import os
import re
import uuid
from datetime import datetime
from flask import current_app, g
from .baseCrudController import BaseCrudController
from ..model.documentSourceModel import DocumentSource
from ..model.productModel import Product
from ..model.projectModel import Project
from ..service.documentSourceService import DocumentSourceService
class DocumentSourceController(BaseCrudController):
    """Request handlers for document sources.

    Covers CRUD, PDF upload, AI test-case generation/import and module
    matching. Most handlers return a ``(payload, message)`` tuple in which an
    empty message signals success; ``document_list`` returns a plain dict.
    """

    UPLOAD_FOLDER = 'uploads'
    ALLOWED_EXTENSIONS = {'pdf'}

    @staticmethod
    def _safe_path_segment(name):
        """Turn a product/project name into a single safe directory name.

        Path separators, control characters and characters that are invalid
        on Windows are replaced by ``_``; leading/trailing dots and spaces are
        stripped so that names such as ``..`` cannot escape the upload root.
        Ordinary names are returned unchanged.
        """
        segment = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', str(name or '')).strip(' .')
        return segment or '_'

    @staticmethod
    def _remove_file_quietly(path):
        """Best-effort removal of a file that was saved before a failure."""
        if not path:
            return
        try:
            os.remove(path)
        except OSError as err:
            # Cleanup must never mask the original error reported to the caller.
            current_app.logger.warning(f'failed to remove orphaned upload {path}: {err}')

    def allowed_file(self, filename):
        """Return True when ``filename`` has an extension in ALLOWED_EXTENSIONS."""
        return '.' in filename and \
            filename.rsplit('.', 1)[1].lower() in self.ALLOWED_EXTENSIONS

    def document_list(self):
        """Return one page of documents as ``{'list': [...], 'total': n}``."""
        items, total = DocumentSourceService.list(self.session, self.req_data)
        return {'list': self.serialize_list(items, ['is_delete']), 'total': total}

    def document_detail(self):
        """Return the serialized document identified by ``documentId``/``id``."""
        document_id = self._get(self.req_data, 'documentId', 'id')
        if not document_id:
            return {}, 'documentId 为必传参数'
        item = DocumentSourceService.get_by_id(self.session, document_id)
        if not item:
            return {}, '未查询到对应文档!'
        return self.serialize(item, ['is_delete']), ''

    def document_create(self):
        """Create a document source from the request payload."""
        product_id = self._get(self.req_data, 'productId', 'product_id')
        project_id = self._get(self.req_data, 'projectId', 'project_id')
        source = self._get(self.req_data, 'source')
        if not product_id or not project_id or not source:
            return 0, 'productId、projectId、source 为必传参数'
        data = {
            'product_id': product_id,
            'project_id': project_id,
            'source': source,
            'type': self._get(self.req_data, 'type', default=1),
            'content': self._get(self.req_data, 'content', default=''),
            'created_by': self._get(self.req_data, 'createdBy', 'created_by'),
        }
        return DocumentSourceService.create(self.session, data)

    def document_update(self):
        """Update the editable fields that are present in the request."""
        document_id = self._get(self.req_data, 'documentId', 'id')
        if not document_id:
            return 0, 'documentId 为必传参数'
        # Column name -> accepted request keys. ``aiModel`` is accepted next to
        # ``ai_model`` so this handler follows the camelCase/snake_case
        # convention used by every other multi-word parameter in this class.
        field_keys = {
            'type': ('type',),
            'source': ('source',),
            'content': ('content',),
            'ai_model': ('aiModel', 'ai_model'),
        }
        data = {}
        for column, keys in field_keys.items():
            value = self._get(self.req_data, *keys)
            if value is not None:
                data[column] = value
        return DocumentSourceService.update(self.session, document_id, data)

    def document_delete(self):
        """Delete a document through the service and finish the session."""
        document_id = self._get(self.req_data, 'documentId', 'id')
        if not document_id:
            return 0, 'documentId 为必传参数'
        result, msg = DocumentSourceService.delete(self.session, document_id)
        if msg:
            return 0, msg
        err = self.session.done(close=False)
        if err:
            return 0, f'删除失败!{err}'
        return result, ''

    def document_refresh(self):
        """Ask the service to refresh the stored content of a document."""
        document_id = self._get(self.req_data, 'documentId', 'id')
        if not document_id:
            return False, 'documentId 为必传参数'
        return DocumentSourceService.refresh_content(self.session, document_id)

    def document_generate_cases(self):
        """Generate test cases from one or more documents and import them.

        Accepts either a single ``documentId`` or a ``documentIds`` list; a
        single id takes precedence. Returns a dict with ``cases``, ``total``
        and ``failed`` (plus ``importedCount`` on success).
        """
        document_id = self._get(self.req_data, 'documentId', 'id')
        document_ids = self._get(self.req_data, 'documentIds', 'document_ids', default=[])
        # A single id is normalised to a one-element list.
        if document_id:
            document_ids = [document_id]
        if not document_ids or not isinstance(document_ids, list):
            return [], 'documentId 或 documentIds 为必传参数'
        project_id = self._get(self.req_data, 'projectId', 'project_id')
        user_id = getattr(g, 'current_user_id', None) or self._get(self.req_data, 'userId', 'user_id')
        if not project_id:
            return [], 'projectId 为必传参数'
        if not user_id:
            return [], '未获取到当前登录用户'
        try:
            template = {
                'project_id': int(project_id),
                'priority': int(self._get(self.req_data, 'priority', default=2)),
                'case_type': int(self._get(self.req_data, 'caseType', 'case_type', default=1)),
                'tags': self._get(self.req_data, 'tags', default=['AI生成']),
                'skill_ids': self._get(self.req_data, 'skillIds', 'skill_ids', default=[]),
                'rule_ids': self._get(self.req_data, 'ruleIds', 'rule_ids', default=[]),
            }
        except (TypeError, ValueError):
            # Malformed numbers are a client error, not an unhandled exception.
            return [], 'projectId、priority、caseType 必须为整数'
        if isinstance(template['tags'], str):
            template['tags'] = template['tags'].split(',')
        # Generate in one batch; the service merges the content of all documents.
        all_cases, failed_docs = DocumentSourceService.generate_cases_batch(
            self.session, document_ids, template
        )
        if failed_docs:
            return {'cases': [], 'total': 0, 'failed': failed_docs}, ''
        # Import straight into the case table, creating missing modules.
        success_count, msg = DocumentSourceService.import_cases(
            self.session,
            document_ids[0],  # the first document is used as the association
            all_cases,
            user_id,
            auto_create_module=True
        )
        if msg:
            # Drop anything the failed import left pending in the session.
            self.session.rollback()
            return {'cases': all_cases, 'total': len(all_cases), 'failed': [{'error': msg}]}, ''
        self.session.commit()
        return {
            'cases': all_cases,
            'total': len(all_cases),
            'importedCount': success_count,
            'failed': []
        }, ''

    def document_match_modules(self):
        """Match the given cases against the modules of the document's project."""
        document_id = self._get(self.req_data, 'documentId', 'id')
        cases = self._get(self.req_data, 'cases', default=[])
        if not document_id:
            return [], 'documentId 为必传参数'
        document = DocumentSourceService.get_by_id(self.session, document_id)
        if not document:
            return [], '文档不存在'
        return DocumentSourceService.match_modules(self.session, document.project_id, cases), ''

    def document_import_cases(self):
        """Import client-supplied cases for a document."""
        document_id = self._get(self.req_data, 'documentId', 'id')
        cases = self._get(self.req_data, 'cases', default=[])
        # Prefer the authenticated user, as document_generate_cases does, and
        # only fall back to the id carried in the request body.
        user_id = getattr(g, 'current_user_id', None) or self._get(self.req_data, 'userId', 'user_id')
        if not document_id:
            return 0, 'documentId 为必传参数'
        if not isinstance(cases, list):
            return 0, 'cases 必须为数组'
        return DocumentSourceService.import_cases(self.session, document_id, cases, user_id)

    def document_batch_create_modules(self):
        """Create several modules by name under one project."""
        project_id = self._get(self.req_data, 'projectId', 'project_id')
        module_names = self._get(self.req_data, 'moduleNames', 'module_names', default=[])
        if not project_id:
            return [], 'projectId 为必传参数'
        if not isinstance(module_names, list):
            return [], 'moduleNames 必须为数组'
        modules = DocumentSourceService.batch_create_modules(self.session, project_id, module_names)
        return self.serialize_list(modules, ['is_delete']), ''

    def document_upload(self):
        """Store an uploaded PDF under ``uploads/<product>/<project>/`` and register it.

        Returns ``({'documentId': ..., 'filePath': ...}, '')`` on success or
        ``(None, message)`` on failure. A file already written to disk is
        removed again when the database record cannot be created.
        """
        if 'file' not in self.req_data.files:
            return None, '未找到上传文件'
        file = self.req_data.files['file']
        if file.filename == '':
            return None, '文件名不能为空'
        if not self.allowed_file(file.filename):
            return None, '不支持的文件格式仅支持pdf'
        # Upload parameters arrive as multipart form fields.
        product_id = self.req_data.form.get('productId')
        project_id = self.req_data.form.get('projectId')
        created_by = self.req_data.form.get('createdBy')
        if not product_id or not project_id:
            return None, 'productId、projectId 为必传参数'
        try:
            product_id = int(product_id)
            project_id = int(project_id)
        except (TypeError, ValueError):
            return None, 'productId、projectId 必须为整数'
        product = self.session.query(Product).filter(Product.id == product_id, Product.is_delete == 0).first()
        if not product:
            return None, '产品不存在'
        project = self.session.query(Project).filter(Project.id == project_id, Project.is_delete == 0).first()
        if not project:
            return None, '项目不存在'
        file_path = None
        try:
            # Names come from the database but are still free text, so they are
            # reduced to single directory segments before touching the disk.
            product_dir = self._safe_path_segment(product.name)
            project_dir = self._safe_path_segment(project.name)
            project_folder = os.path.join(os.getcwd(), self.UPLOAD_FOLDER, product_dir, project_dir)
            os.makedirs(project_folder, exist_ok=True)
            original_name, ext = file.filename.rsplit('.', 1)
            ext = ext.lower()
            timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
            # Keep word characters (including CJK) and '-', replace the rest,
            # and cap the length of the readable part.
            safe_name = re.sub(r'[^\w\u4e00-\u9fa5-]', '_', original_name)[:50]
            new_filename = f'{timestamp}-{safe_name}-{uuid.uuid4().hex[:8]}.{ext}'
            file_path = os.path.join(project_folder, new_filename)
            file.save(file_path)
            # Path stored in the database: relative and always '/'-separated.
            relative_path = os.path.join(self.UPLOAD_FOLDER, product_dir, project_dir, new_filename)
            relative_path = relative_path.replace('\\', '/')
            data = {
                'product_id': product_id,
                'project_id': project_id,
                'source': relative_path,
                'type': 1,
                'content': '',
                'created_by': created_by
            }
            document_id, msg = DocumentSourceService.create(self.session, data)
            if msg:
                self._remove_file_quietly(file_path)
                return None, msg
            self.session.commit()
            return {'documentId': document_id, 'filePath': relative_path}, ''
        except Exception as e:
            self.session.rollback()
            self._remove_file_quietly(file_path)
            return None, f'文件上传失败：{str(e)}'

View File

@@ -0,0 +1,46 @@
# encoding: UTF-8
from flask import g
from .baseCrudController import BaseCrudController
from ..service.skillService import SkillService
class SkillController(BaseCrudController):
    """Thin request layer that forwards skill and business-rule calls to SkillService."""

    @staticmethod
    def _current_user_id():
        """Return ``g.current_user_id`` or None when it is not set."""
        return getattr(g, 'current_user_id', None)

    # ----- skills -----

    def skill_create(self):
        """Create a skill on behalf of the current user."""
        operator_id = self._current_user_id()
        return SkillService.create_skill(self.session, self.req_data, operator_id)

    def skill_update(self):
        """Update a skill from the request payload."""
        return SkillService.update_skill(self.session, self.req_data)

    def skill_delete(self):
        """Delete a skill described by the request payload."""
        return SkillService.delete_skill(self.session, self.req_data)

    def skill_detail(self):
        """Return one skill looked up by ``skillId``/``id``."""
        skill_id = self._get(self.req_data, 'skillId', 'id')
        if skill_id:
            return SkillService.skill_detail(self.session, skill_id)
        return {}, 'skillId 为必传参数'

    def skill_list(self):
        """List skills using the filters carried by the request."""
        return SkillService.skill_list(self.session, self.req_data)

    def skill_rule_list(self):
        """Forward the request to ``SkillService.skill_rule_list``."""
        return SkillService.skill_rule_list(self.session, self.req_data)

    # ----- business rules -----

    def business_rule_create(self):
        """Create a business rule on behalf of the current user."""
        operator_id = self._current_user_id()
        return SkillService.create_business_rule(self.session, self.req_data, operator_id)

    def business_rule_update(self):
        """Update a business rule from the request payload."""
        return SkillService.update_business_rule(self.session, self.req_data)

    def business_rule_delete(self):
        """Delete a business rule described by the request payload."""
        return SkillService.delete_business_rule(self.session, self.req_data)

    def business_rule_detail(self):
        """Return one business rule looked up by ``ruleId``/``id``."""
        rule_id = self._get(self.req_data, 'ruleId', 'id')
        if rule_id:
            return SkillService.business_rule_detail(self.session, rule_id)
        return {}, 'ruleId 为必传参数'

    def business_rule_list(self):
        """List business rules using the filters carried by the request."""
        return SkillService.business_rule_list(self.session, self.req_data)

View File

@@ -0,0 +1,75 @@
# encoding: UTF-8
from sqlalchemy import func
from ..model.documentSourceModel import DocumentSource
class DocumentSourceDao:
    """Query helpers for ``DocumentSource`` rows.

    Every read and write filters on ``is_delete == 0``; deletion is a soft
    delete that sets ``is_delete`` to 1.
    """

    @staticmethod
    def create(session, document_source):
        """Add ``document_source`` to the session, flush, and return its id."""
        session.add(document_source)
        session.flush()
        return document_source.id

    @staticmethod
    def get_by_id(session, document_id):
        """Return the non-deleted document with this id, or None."""
        return session.query(DocumentSource).filter(
            DocumentSource.id == document_id,
            DocumentSource.is_delete == 0
        ).first()

    @staticmethod
    def get_by_source(session, source):
        """Return the first non-deleted document whose ``source`` matches, or None."""
        return session.query(DocumentSource).filter(
            DocumentSource.source == source,
            DocumentSource.is_delete == 0
        ).first()

    @staticmethod
    def list_by_filters(session, filters, page_no=1, page_size=20, order_by=None):
        """Return ``(items, total)`` for one page of documents.

        Args:
            session: database session.
            filters: iterable of filter expressions applied as given.
            page_no: 1-based page number; values below 1 are treated as 1 so
                the generated OFFSET can never be negative.
            page_size: rows per page.
            order_by: optional ordering expression.
        """
        query = session.query(DocumentSource).filter(*filters)
        # Count before ordering: the total does not depend on the sort order.
        total = query.count()
        if order_by is not None:
            query = query.order_by(order_by)
        # Paging values may arrive as strings from a request payload.
        page_size = int(page_size)
        offset = max(int(page_no) - 1, 0) * page_size
        items = query.offset(offset).limit(page_size).all()
        return items, total

    @staticmethod
    def update_by_id(session, document_id, update_info):
        """Apply ``update_info`` to the non-deleted row, flush, and return the update result."""
        result = session.query(DocumentSource).filter(
            DocumentSource.id == document_id,
            DocumentSource.is_delete == 0
        ).update(update_info)
        session.flush()
        return result

    @staticmethod
    def delete_by_id(session, document_id):
        """Soft-delete the row by setting ``is_delete`` to 1; returns the update result."""
        return session.query(DocumentSource).filter(
            DocumentSource.id == document_id,
            DocumentSource.is_delete == 0
        ).update({'is_delete': 1})

    @staticmethod
    def get_latest_version(session, product_id, project_id, source):
        """Return the row with the highest ``version`` for this product/project/source, or None."""
        return session.query(DocumentSource).filter(
            DocumentSource.product_id == product_id,
            DocumentSource.project_id == project_id,
            DocumentSource.source == source,
            DocumentSource.is_delete == 0
        ).order_by(DocumentSource.version.desc()).first()

    @staticmethod
    def get_max_version(session, product_id, project_id, source):
        """Return the highest ``version`` for this product/project/source, or 0 when none exists."""
        result = session.query(func.max(DocumentSource.version)).filter(
            DocumentSource.product_id == product_id,
            DocumentSource.project_id == project_id,
            DocumentSource.source == source,
            DocumentSource.is_delete == 0
        ).scalar()
        return result if result else 0

163
app/api/dao/skillDao.py Normal file
View File

@@ -0,0 +1,163 @@
# encoding: UTF-8
from sqlalchemy import or_
from logger import logger
from ..model.caseModel import Module
from ..model.productModel import Product
from ..model.projectModel import Project
from ..model.skillModel import TestSkill, TestBusinessRule, TestAiGenerationContext
class SkillDao(object):
    """Query helpers for skills, business rules and AI generation context rows.

    Write helpers finish with ``session.done(close=False)`` and return an
    ``(value, message)`` tuple in which an empty message means success.
    """

    @staticmethod
    def create(session, model_cls, add_info):
        """Insert ``model_cls(**add_info)``; return ``(new_id, '')`` or ``(0, message)``."""
        obj = model_cls(**add_info)
        session.add(obj)
        err = session.done(close=False)
        if err:
            logger.warning(f'{model_cls.__name__}新增失败!{err}')
            return 0, f'新增失败!{err}'
        return obj.id, ''

    @staticmethod
    def update_by_id(session, model_cls, obj_id, update_info):
        """Update the non-deleted row ``obj_id``; return ``(obj_id, '')`` or ``(0, message)``."""
        update_res = session.query(model_cls).filter(
            model_cls.id == int(obj_id), model_cls.is_delete == 0
        ).update(update_info)
        err = session.done(close=False)
        if err:
            logger.error(f'{model_cls.__name__}更新失败id: {obj_id}, err: {err}')
            return 0, f'更新失败!{err}'
        if not update_res:
            return 0, '未查询到对应记录!'
        return int(obj_id), ''

    @staticmethod
    def get_by_id(session, model_cls, obj_id):
        """Return the non-deleted ``model_cls`` row with this id, or None."""
        return session.query(model_cls).filter(model_cls.id == int(obj_id), model_cls.is_delete == 0).first()

    @staticmethod
    def get_skill_by_project_code(session, project_id, code):
        """Return the non-deleted skill with this ``code`` inside the project, or None."""
        return session.query(TestSkill).filter(
            TestSkill.project_id == int(project_id),
            TestSkill.code == code,
            TestSkill.is_delete == 0
        ).first()

    @staticmethod
    def get_business_rule_by_project_code(session, project_id, rule_code):
        """Return the non-deleted business rule with this ``rule_code`` inside the project, or None."""
        return session.query(TestBusinessRule).filter(
            TestBusinessRule.project_id == int(project_id),
            TestBusinessRule.rule_code == rule_code,
            TestBusinessRule.is_delete == 0
        ).first()

    @staticmethod
    def _search_page(session, model_cls, keyword_columns, filters, page, limit, keyword, tag):
        """Shared implementation of the paged keyword/tag searches.

        ``keyword`` is matched with LIKE against every column in
        ``keyword_columns``; ``tag`` must be contained in ``model_cls.tags``.
        ``page`` is 1-based and values below 1 are treated as 1 so the
        generated OFFSET can never be negative. Results are ordered by
        ``created_time`` descending. Returns ``(items, total)``.
        """
        query = session.query(model_cls).filter(model_cls.is_delete == 0, *filters)
        if keyword:
            like_keyword = f'%{keyword}%'
            query = query.filter(or_(*[column.like(like_keyword) for column in keyword_columns]))
        if tag:
            query = query.filter(model_cls.tags.contains([tag]))
        total = query.count()
        page_size = int(limit)
        offset = max(int(page) - 1, 0) * page_size
        items = query.order_by(model_cls.created_time.desc()).offset(offset).limit(page_size).all()
        return items, total

    @staticmethod
    def list_skill(session, filters, page=1, limit=20, keyword=None, tag=None):
        """Return ``(items, total)`` for one page of skills.

        ``keyword`` searches name, code, description and trigger condition.
        """
        keyword_columns = [
            TestSkill.name,
            TestSkill.code,
            TestSkill.description,
            TestSkill.trigger_condition,
        ]
        return SkillDao._search_page(session, TestSkill, keyword_columns, filters, page, limit, keyword, tag)

    @staticmethod
    def list_business_rule(session, filters, page=1, limit=20, keyword=None, tag=None):
        """Return ``(items, total)`` for one page of business rules.

        ``keyword`` searches name, rule code, rule content and applicable scene.
        """
        keyword_columns = [
            TestBusinessRule.name,
            TestBusinessRule.rule_code,
            TestBusinessRule.rule_content,
            TestBusinessRule.applicable_scene,
        ]
        return SkillDao._search_page(session, TestBusinessRule, keyword_columns, filters, page, limit, keyword, tag)

    @staticmethod
    def delete_by_id(session, model_cls, obj_id):
        """Soft-delete the row by setting ``is_delete`` to 1 via ``update_by_id``."""
        return SkillDao.update_by_id(session, model_cls, obj_id, {'is_delete': 1})

    @staticmethod
    def get_project_by_product(session, product_id, project_id):
        """Return the non-deleted project with this id that belongs to the product, or None."""
        return session.query(Project).filter(
            Project.id == int(project_id),
            Project.product_id == int(product_id),
            Project.is_delete == 0
        ).first()

    @staticmethod
    def list_skills_by_project(session, project_id, status=None):
        """Return the project's non-deleted skills, newest first; ``status`` filters when given."""
        query = session.query(TestSkill).filter(
            TestSkill.project_id == int(project_id),
            TestSkill.is_delete == 0
        )
        if status not in (None, ''):
            query = query.filter(TestSkill.status == int(status))
        return query.order_by(TestSkill.created_time.desc()).all()

    @staticmethod
    def list_business_rules_by_project(session, project_id, status=None):
        """Return the project's non-deleted business rules, newest first; ``status`` filters when given."""
        query = session.query(TestBusinessRule).filter(
            TestBusinessRule.project_id == int(project_id),
            TestBusinessRule.is_delete == 0
        )
        if status not in (None, ''):
            query = query.filter(TestBusinessRule.status == int(status))
        return query.order_by(TestBusinessRule.created_time.desc()).all()

    @staticmethod
    def list_skills_by_ids(session, project_id, skill_ids):
        """Return the project's non-deleted skills whose id is in ``skill_ids`` ([] when empty)."""
        if not skill_ids:
            return []
        return session.query(TestSkill).filter(
            TestSkill.project_id == int(project_id),
            TestSkill.id.in_([int(skill_id) for skill_id in skill_ids]),
            TestSkill.is_delete == 0
        ).all()

    @staticmethod
    def list_business_rules_by_ids(session, project_id, rule_ids):
        """Return the project's non-deleted business rules whose id is in ``rule_ids`` ([] when empty)."""
        if not rule_ids:
            return []
        return session.query(TestBusinessRule).filter(
            TestBusinessRule.project_id == int(project_id),
            TestBusinessRule.id.in_([int(rule_id) for rule_id in rule_ids]),
            TestBusinessRule.is_delete == 0
        ).all()

    @staticmethod
    def get_skill_path_context(session, project_id, module_id=None):
        """Return display names for the product, project and module of a skill.

        Missing records are replaced by placeholder texts, so the returned
        dict always has ``product_name``, ``project_name`` and ``module_name``.
        """
        project = session.query(Project).filter(Project.id == int(project_id), Project.is_delete == 0).first()
        product = None
        module = None
        if project and project.product_id:
            product = session.query(Product).filter(Product.id == int(project.product_id), Product.is_delete == 0).first()
        if module_id:
            module = session.query(Module).filter(Module.id == int(module_id), Module.is_delete == 0).first()
        return {
            'product_name': product.name if product else '未关联产品',
            'project_name': project.name if project else f'项目{project_id}',
            'module_name': module.name if module else '项目通用'
        }

    @staticmethod
    def batch_create_generation_context(session, rows):
        """Insert one ``TestAiGenerationContext`` per dict in ``rows``.

        Returns ``(len(rows), '')`` on success, ``(0, '')`` for empty input and
        ``(0, message)`` when the session reports an error.
        """
        if not rows:
            return 0, ''
        session.add_all([TestAiGenerationContext(**row) for row in rows])
        err = session.done(close=False)
        if err:
            logger.warning(f'TestAiGenerationContext批量新增失败{err}')
            return 0, f'批量新增失败!{err}'
        return len(rows), ''

View File

@@ -0,0 +1,26 @@
# encoding: UTF-8
from sqlalchemy import BigInteger, Column, Integer, SmallInteger, String, TIMESTAMP, Text, text
from sqlalchemy.ext.declarative import declarative_base
from common.sqlSession import to_dict
Base = declarative_base()
Base.to_dict = to_dict
class DocumentSource(Base):
    """ORM mapping for the ``document_source`` table.

    One row describes a requirement document (its source path/link), the
    product and project it belongs to, and the cached text stored for it.
    """

    __tablename__ = 'document_source'
    # Primary key.
    id = Column(BigInteger, primary_key=True, autoincrement=True, comment='主键ID')
    # Owning product and project (plain ids, no ForeignKey is declared here).
    product_id = Column(BigInteger, nullable=False, comment='产品ID')
    project_id = Column(BigInteger, nullable=False, comment='项目ID')
    # Source type; per the column comment 1 = PDF file, 2 = Feishu link.
    type = Column(SmallInteger, default=1, comment='类型1-PDF文件2-飞书链接')
    # File path or link of the document.
    source = Column(String(512), nullable=False, comment='文件路径或飞书链接')
    # Cached text content of the document.
    content = Column(Text, comment='解析后的文本内容(缓存)')
    # Version number, starting at 1.
    version = Column(Integer, default=1, comment='版本号')
    # Status; per the column comment 0 = pending parse, 1 = parsed, 2 = cases generated.
    status = Column(SmallInteger, default=0, comment='状态0-待解析1-已解析2-已生成用例')
    # Name of the AI model used for this document.
    ai_model = Column(String(64), comment='使用的AI模型')
    # Id of the creating user.
    created_by = Column(BigInteger, comment='创建人ID')
    # Soft-delete flag: 0 = active, 1 = deleted.
    is_delete = Column(Integer, default=0, comment='0未删除1已删除')
    created_time = Column(TIMESTAMP, server_default=text('CURRENT_TIMESTAMP'), comment='创建时间')
    # NOTE(review): SQLAlchemy documents server_onupdate as a marker only (it
    # emits no DDL) - confirm the database refreshes this column itself.
    updated_time = Column(TIMESTAMP, server_default=text('CURRENT_TIMESTAMP'), server_onupdate=text('CURRENT_TIMESTAMP'), comment='更新时间')

View File

@@ -0,0 +1,67 @@
from sqlalchemy import BigInteger, Column, Integer, SmallInteger, String, TIMESTAMP, Text, text
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.ext.declarative import declarative_base
from common.sqlSession import to_dict
Base = declarative_base()
Base.to_dict = to_dict
class TestSkill(Base):
    """ORM mapping for the ``test_skill`` table (a testing Skill owned by a project)."""

    __tablename__ = 'test_skill'
    id = Column(BigInteger, primary_key=True, autoincrement=True, comment='id')
    # Owning project; module_id is optional and, per its comment, empty means project-wide.
    project_id = Column(BigInteger, nullable=False, comment='项目id')
    module_id = Column(BigInteger, comment='模块id空表示项目级通用')
    # Display name and code; the comment states the code is unique inside a
    # project, but no unique constraint is declared on this model.
    name = Column(String(128), nullable=False, comment='Skill名称')
    code = Column(String(64), nullable=False, comment='Skill编码项目内唯一')
    description = Column(Text, comment='Skill描述')
    # Free-text parts of the skill: trigger condition, reasoning path, output spec.
    trigger_condition = Column(Text, nullable=False, comment='触发条件')
    reasoning_path = Column(Text, comment='推理路径')
    output_spec = Column(Text, comment='输出规范')
    # Path of the SKILL.md file (the comment places it under config/skills).
    skill_file_path = Column(String(512), comment='Skill文件路径指向config/skills下生成的SKILL.md')
    # Enumerations; the value meanings are listed in the column comments.
    skill_type = Column(SmallInteger, nullable=False, default=1, comment='类型1通用测试策略 2历史缺陷模式 3边界场景 4接口测试 5UI测试 6性能测试 7安全测试 8数据一致性 9并发幂等 99其他')
    risk_level = Column(SmallInteger, nullable=False, default=2, comment='风险等级0高 1中高 2中 3低')
    # JSONB array of tags, defaulting to an empty array on the server side.
    tags = Column(JSONB, nullable=False, server_default=text("'[]'::jsonb"), comment='标签数组')
    status = Column(SmallInteger, nullable=False, default=1, comment='状态1启用 2停用 3草稿')
    owner_id = Column(BigInteger, comment='负责人用户id')
    created_by = Column(BigInteger, comment='创建人用户id')
    usage_count = Column(Integer, nullable=False, default=0, comment='使用次数')
    # Soft-delete flag: 0 = active, 1 = deleted.
    is_delete = Column(Integer, nullable=False, default=0, comment='0未删除 1已删除')
    created_time = Column(TIMESTAMP, server_default=text('CURRENT_TIMESTAMP'), nullable=True, comment='创建时间')
    # NOTE(review): SQLAlchemy documents server_onupdate as a marker only (it
    # emits no DDL) - confirm the database refreshes this column itself.
    updated_time = Column(TIMESTAMP, server_default=text('CURRENT_TIMESTAMP'), server_onupdate=text('CURRENT_TIMESTAMP'), nullable=True, comment='修改时间')
class TestBusinessRule(Base):
    """ORM mapping for the ``test_business_rule`` table (a business rule owned by a project)."""

    __tablename__ = 'test_business_rule'
    id = Column(BigInteger, primary_key=True, autoincrement=True, comment='id')
    # Owning project; module_id is optional and, per its comment, empty means project-wide.
    project_id = Column(BigInteger, nullable=False, comment='项目id')
    module_id = Column(BigInteger, comment='模块id空表示项目级通用')
    name = Column(String(128), nullable=False, comment='业务规则名称')
    # Nullable rule code; the comment states it is unique inside a project, but
    # no unique constraint is declared on this model.
    rule_code = Column(String(64), comment='业务规则编码,项目内唯一')
    # Rule text plus optional applicable scene and example.
    rule_content = Column(Text, nullable=False, comment='业务规则内容')
    applicable_scene = Column(Text, comment='适用场景')
    example = Column(Text, comment='示例')
    # Path of the RULE.md file (the comment places it under config/rules).
    rule_file_path = Column(String(512), comment='业务规则文件路径指向config/rules下生成的RULE.md')
    # Priority enumeration; the value meanings are listed in the column comment.
    priority = Column(SmallInteger, nullable=False, default=2, comment='优先级0高 1中高 2中 3低')
    # JSONB array of tags, defaulting to an empty array on the server side.
    tags = Column(JSONB, nullable=False, server_default=text("'[]'::jsonb"), comment='标签数组')
    status = Column(SmallInteger, nullable=False, default=1, comment='状态1启用 2停用 3草稿')
    owner_id = Column(BigInteger, comment='负责人用户id')
    created_by = Column(BigInteger, comment='创建人用户id')
    usage_count = Column(Integer, nullable=False, default=0, comment='使用次数')
    # Soft-delete flag: 0 = active, 1 = deleted.
    is_delete = Column(Integer, nullable=False, default=0, comment='0未删除 1已删除')
    created_time = Column(TIMESTAMP, server_default=text('CURRENT_TIMESTAMP'), nullable=True, comment='创建时间')
    # NOTE(review): SQLAlchemy documents server_onupdate as a marker only (it
    # emits no DDL) - confirm the database refreshes this column itself.
    updated_time = Column(TIMESTAMP, server_default=text('CURRENT_TIMESTAMP'), server_onupdate=text('CURRENT_TIMESTAMP'), nullable=True, comment='修改时间')
class TestAiGenerationContext(Base):
    """ORM mapping for the ``test_ai_generation_context`` table.

    Each row links a Skill or business rule to an AI generation task.
    """

    __tablename__ = 'test_ai_generation_context'
    id = Column(BigInteger, primary_key=True, autoincrement=True, comment='id')
    # Optional id of the AI generation task.
    generation_id = Column(BigInteger, comment='AI生成任务id兼容现有生成任务')
    project_id = Column(BigInteger, nullable=False, comment='项目id')
    module_id = Column(BigInteger, comment='模块id')
    # Source type; per the column comment 1 = Skill, 2 = business rule.
    source_type = Column(SmallInteger, nullable=False, comment='来源类型1 Skill 2业务规则')
    source_id = Column(BigInteger, nullable=False, comment='来源id')
    # Snapshot of the source's name.
    source_name = Column(String(128), comment='来源名称快照')
    match_score = Column(Integer, nullable=False, default=0, comment='匹配分数')
    created_time = Column(TIMESTAMP, server_default=text('CURRENT_TIMESTAMP'), nullable=True, comment='创建时间')

View File

@@ -0,0 +1,536 @@
# encoding: UTF-8
"""
AI服务类 - 用于调用大模型生成测试用例、测试 Skill 和业务规则
"""
import json
import re
import time
import traceback
from pathlib import Path
from flask import current_app
class AIService:
"""AI服务类"""
@staticmethod
def generate_test_cases(document_content, template=None):
    """Generate test cases from document text with the configured model.

    The document is split into chunks; one model request is made per chunk,
    each reply is parsed as JSON and passed through ``_normalize_cases``, and
    the merged list goes through ``_deduplicate_cases``.

    Returns:
        ``(cases, '')`` on success, or ``([], message)`` when the API key is
        not configured, a chunk reply is not valid JSON, or any exception is
        raised along the way.
    """
    try:
        from openai import OpenAI
        from config.ai_config import AIConfig
        import httpx

        api_key = AIConfig.get_api_key()
        api_base = AIConfig.get_api_base()
        model = AIConfig.get_model()
        provider = AIConfig.MODEL_PROVIDER
        key_source = AIConfig.get_api_key_source()
        if not api_key or api_key == '请替换为你的Meteor API Key':
            return [], '未配置API密钥请在.env中配置METEOR_API_KEY'

        # Keys prefixed with "plan-" on the custom provider use the plan endpoint.
        is_plan_key = provider == 'custom' and api_key.startswith('plan-')
        if is_plan_key:
            request_base = AIService._normalize_plan_api_base(api_base)
        else:
            request_base = AIService._normalize_api_base(api_base)
        current_app.logger.info(f'AI配置: provider={provider}, base={request_base}, model={model}, key_source={key_source}, key_prefix={api_key[:8]}, plan_key={is_plan_key}')

        timeout = httpx.Timeout(
            connect=AIConfig.CONNECT_TIMEOUT,
            read=AIConfig.READ_TIMEOUT,
            write=AIConfig.READ_TIMEOUT,
            pool=AIConfig.CONNECT_TIMEOUT,
        )
        skill_content = AIService._load_skill_content()
        chunks = AIService._split_document_content(document_content)

        collected = []
        for position, chunk in enumerate(chunks, 1):
            prompt = AIService._build_prompt(
                chunk['content'], template, skill_content, position, len(chunks), chunk['title']
            )
            raw_reply = AIService._request_model(
                OpenAI, AIConfig, api_key, request_base, model, is_plan_key, prompt, timeout, httpx
            )
            try:
                payload = json.loads(AIService._extract_json_text(raw_reply))
                collected.extend(AIService._normalize_cases(payload, template, chunk['title']))
            except json.JSONDecodeError:
                # One unparsable chunk aborts the whole generation.
                return [], f'{position}段解析结果失败: {raw_reply[:200]}'
        return AIService._deduplicate_cases(collected), ''
    except Exception as e:
        current_app.logger.error(f'AI生成测试用例失败: {str(e)}')
        current_app.logger.error(traceback.format_exc())
        return [], f'AI生成失败: {str(e)}'
@staticmethod
def _request_model(OpenAI, AIConfig, api_key, request_base, model, is_plan_key, prompt, timeout, httpx):
max_retries = AIConfig.MAX_RETRIES
retry_delay = AIConfig.RETRY_DELAY
for attempt in range(max_retries):
try:
if is_plan_key:
return AIService._create_plan_message(api_key, request_base, model, prompt, timeout)
client = OpenAI(api_key=api_key, base_url=request_base, http_client=httpx.Client(timeout=timeout, trust_env=False))
response = client.chat.completions.create(
model=model,
messages=[
{"role": "system", "content": "你是一个专业的测试知识资产生成助手。必须最终只输出可解析JSON。"},
{"role": "user", "content": prompt}
],
max_tokens=AIConfig.OPENAI_MAX_TOKENS,
temperature=AIConfig.OPENAI_TEMPERATURE
)
return response.choices[0].message.content
except Exception as e:
if attempt < max_retries - 1:
current_app.logger.warning(f'AI请求第{attempt + 1}次失败,{retry_delay}秒后重试: {str(e)}')
time.sleep(retry_delay * (2 ** attempt))
else:
raise
@staticmethod
def _normalize_api_base(api_base):
if not api_base:
return 'https://api.routin.ai/v1'
return api_base.rstrip('/').replace('/chat/completions', '')
@staticmethod
def _normalize_plan_api_base(api_base):
if not api_base:
return 'https://api.routin.ai/plan/v1'
normalized = api_base.rstrip('/').replace('/chat/completions', '')
if '/plan/v1' in normalized:
return normalized
return normalized.replace('/v1', '/plan/v1')
@staticmethod
def _create_plan_message(api_key, api_base, model, prompt, timeout):
    """POST ``prompt`` to ``{api_base}/messages`` and return the reply text.

    The request carries a single user message with ``max_tokens`` 4096 and
    ``temperature`` 0.7. HTTP error statuses raise via ``raise_for_status``;
    the JSON body is reduced to text by ``_extract_message_text``.
    """
    import httpx

    endpoint = f'{api_base}/messages'
    request_headers = {
        'Authorization': f'Bearer {api_key}',
        'Content-Type': 'application/json',
    }
    request_body = {
        'model': model,
        'messages': [{'role': 'user', 'content': prompt}],
        'max_tokens': 4096,
        'temperature': 0.7,
    }
    response = httpx.post(
        endpoint,
        headers=request_headers,
        json=request_body,
        timeout=timeout,
        trust_env=False,
    )
    response.raise_for_status()
    return AIService._extract_message_text(response.json())
@staticmethod
def _extract_message_text(data):
if isinstance(data, dict):
content = data.get('content')
if isinstance(content, list):
texts = [part['text'] for part in content if isinstance(part, dict) and part.get('text')]
if texts:
return ''.join(texts)
if isinstance(content, str):
return content
return json.dumps(data, ensure_ascii=False)
@staticmethod
def _extract_json_text(result):
text = result.strip()
fence_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', text)
if fence_match:
text = fence_match.group(1).strip()
if text.startswith('{') or text.startswith('['):
return text
json_match = re.search(r'(\{[\s\S]*\}|\[[\s\S]*\])', text)
if json_match:
return json_match.group(1).strip()
return text
@staticmethod
def generate_skill_content(req_data):
    """Generate Skill content for ``req_data``.

    Builds the skill-creation prompt and delegates to
    ``_generate_asset_content`` with the ``skill_md`` markdown key.
    """
    skill_prompt = AIService._build_skill_create_prompt(req_data)
    return AIService._generate_asset_content(
        req_data=req_data,
        prompt=skill_prompt,
        markdown_key='skill_md',
        normalizer=AIService._normalize_skill_markdown,
        error_prefix='AI生成 Skill 内容',
    )
@staticmethod
def generate_business_rule_content(req_data):
    """Generate business-rule content for ``req_data``.

    Builds the business-rule prompt and delegates to
    ``_generate_asset_content`` with the ``rule_md`` markdown key.
    """
    rule_prompt = AIService._build_business_rule_create_prompt(req_data)
    return AIService._generate_asset_content(
        req_data=req_data,
        prompt=rule_prompt,
        markdown_key='rule_md',
        normalizer=AIService._normalize_rule_markdown,
        error_prefix='AI生成业务规则内容',
    )
@staticmethod
def _generate_asset_content(req_data, prompt, markdown_key, normalizer, error_prefix):
    """Ask the model for a JSON asset and normalise its markdown field.

    Args:
        req_data: original request data, forwarded to ``normalizer``.
        prompt: prompt text sent to the model.
        markdown_key: key of the markdown field in the reply; the same key
            with underscores removed is accepted as a fallback.
        normalizer: callable ``(markdown, req_data) -> markdown``.
        error_prefix: text that starts every error message.

    Returns:
        ``(parsed_dict, '')`` on success, otherwise ``({}, message)``.
    """
    try:
        from openai import OpenAI
        from config.ai_config import AIConfig
        import httpx

        api_key = AIConfig.get_api_key()
        api_base = AIConfig.get_api_base()
        model = AIConfig.get_model()
        provider = AIConfig.MODEL_PROVIDER
        if not api_key or api_key == '请替换为你的Meteor API Key':
            return {}, '未配置API密钥请在.env中配置METEOR_API_KEY'

        is_plan_key = provider == 'custom' and api_key.startswith('plan-')
        if is_plan_key:
            request_base = AIService._normalize_plan_api_base(api_base)
        else:
            request_base = AIService._normalize_api_base(api_base)
        timeout = httpx.Timeout(
            connect=AIConfig.CONNECT_TIMEOUT,
            read=AIConfig.READ_TIMEOUT,
            write=AIConfig.READ_TIMEOUT,
            pool=AIConfig.CONNECT_TIMEOUT,
        )
        raw_reply = AIService._request_model(
            OpenAI, AIConfig, api_key, request_base, model, is_plan_key, prompt, timeout, httpx
        )
        payload = json.loads(AIService._extract_json_text(raw_reply))
        if not isinstance(payload, dict):
            return {}, f'{error_prefix}格式错误'

        compact_key = markdown_key.replace('_', '')
        markdown_text = payload.get(markdown_key) or payload.get(compact_key)
        if not (markdown_text and isinstance(markdown_text, str)):
            return {}, f'{error_prefix}缺少 {markdown_key}'
        payload[markdown_key] = normalizer(markdown_text, req_data)
        return payload, ''
    except json.JSONDecodeError:
        return {}, f'{error_prefix}不是合法 JSON'
    except Exception as e:
        current_app.logger.error(f'{error_prefix}失败: {str(e)}')
        current_app.logger.error(traceback.format_exc())
        return {}, f'{error_prefix}失败: {str(e)}'
@staticmethod
def _normalize_skill_markdown(skill_md, req_data):
return AIService._normalize_markdown(skill_md, req_data, 'generated-skill')
@staticmethod
def _normalize_rule_markdown(rule_md, req_data):
return AIService._normalize_markdown(rule_md, req_data, 'generated-rule')
@staticmethod
def _normalize_markdown(markdown, req_data, fallback_name):
content = markdown.strip()
content = re.sub(r'^```(?:markdown|md)?\s*', '', content)
content = re.sub(r'\s*```$', '', content).strip()
if content.startswith('---'):
return content
raw_name = str(req_data.get('name') or fallback_name).strip()
frontmatter_name = re.sub(r'[^a-zA-Z0-9_-]+', '-', raw_name.lower()).strip('-') or fallback_name
description = str(req_data.get('description') or raw_name).strip()
return f'---\nname: {frontmatter_name}\ndescription: {description}\n---\n\n{content}'
@staticmethod
def get_default_case_generation_trigger_condition():
return '当用户基于 PRD、需求文档、用户故事、功能说明、接口说明、UI 交互说明或业务规则生成、补充、优化、评审测试用例时触发。'
@staticmethod
def get_default_case_generation_output_spec():
return '''输出必须兼容当前 AI 生成用例入库结构:最终只输出 JSON 对象,不输出 Markdown、解释文本或代码块。JSON 对象结构为 {"cases": [{"title": "用例名称/测试点名称", "module_name": "父模块/子模块/叶子模块", "precondition": "前置条件", "steps": "步骤1\\n步骤2", "expected_result": "预期结果1\\n预期结果2", "priority": 2, "case_type": 1, "tags": ["AI生成"]}]}。每条用例 title 需要细化到具体场景steps 和 expected_result 每一行带数字编号,信息不足时标记“待确认”,不能编造需求。'''
@staticmethod
def _load_skill_creator_content():
    """Read ``config/skills/skill-creator/SKILL.md`` as UTF-8 text.

    The path is resolved relative to the directory three levels above this
    file's own directory.

    Raises:
        FileNotFoundError: when the file does not exist.
    """
    base_dir = Path(__file__).resolve().parents[3]
    skill_path = base_dir / 'config' / 'skills' / 'skill-creator' / 'SKILL.md'
    if skill_path.exists():
        return skill_path.read_text(encoding='utf-8')
    raise FileNotFoundError(f'Skill创建规则不存在: {skill_path}')
@staticmethod
def _load_skill_content():
    """Read ``config/skills/test-case-generator/SKILL.md`` as UTF-8 text.

    The path is resolved relative to the directory three levels above this
    file's own directory.

    Raises:
        FileNotFoundError: when the file does not exist.
    """
    base_dir = Path(__file__).resolve().parents[3]
    skill_path = base_dir / 'config' / 'skills' / 'test-case-generator' / 'SKILL.md'
    if skill_path.exists():
        return skill_path.read_text(encoding='utf-8')
    raise FileNotFoundError(f'测试用例生成技能不存在: {skill_path}')
@staticmethod
def _build_skill_create_prompt(req_data):
    """Build the prompt that asks the model to write a new SKILL.md.

    The prompt embeds the skill-creator SKILL.md text, the user's input from
    ``req_data`` (name, description, tags, skill type, risk level - both
    camelCase and snake_case keys are read for the last two) and the default
    trigger condition and output specification. It instructs the model to
    answer with one JSON object whose ``skill_md`` field holds the file.

    Raises:
        FileNotFoundError: propagated from ``_load_skill_creator_content``.
    """
    skill_creator_content = AIService._load_skill_creator_content()
    default_trigger_condition = AIService.get_default_case_generation_trigger_condition()
    default_output_spec = AIService.get_default_case_generation_output_spec()
    # The prompt text is runtime data sent to the model; doubled braces are
    # literal braces inside the f-string.
    return f'''
你现在要严格按照下面 skill-creator 的 SKILL.md 规范,为测试平台创建一个新的 Skill 文件。
<skill-creator-skill-md>
{skill_creator_content}
</skill-creator-skill-md>
<new-skill-input>
Skill 名称:{req_data.get('name') or ''}
用户补充描述:{req_data.get('description') or ''}
标签:{req_data.get('tags') or []}
Skill 类型枚举值:{req_data.get('skillType') or req_data.get('skill_type') or 1}
风险等级枚举值:{req_data.get('riskLevel') or req_data.get('risk_level') or 2}
</new-skill-input>
<platform-contract>
这个 Skill 的目标是增强当前平台“AI 根据 PRD/需求生成测试用例”的能力。
触发条件固定理解为:{default_trigger_condition}
输出规范固定理解为:{default_output_spec}
</platform-contract>
请只输出 JSON 对象:
{{
"description": "适合列表展示的 Skill 简介80字以内",
"reasoning_path": "面向测试用例生成的推理路径摘要,简洁步骤描述",
"tags": ["标签1", "标签2"],
"skill_type": 1,
"risk_level": 2,
"skill_md": "完整的 SKILL.md 文件内容,包含 YAML frontmatter 和 Markdown body"
}}
约束skill_md 必须包含 YAML frontmatter至少包含 name 和 descriptionbody 必须是面向测试用例生成的 Markdown 指令;不要复制 skill-creator 原文;不要输出代码块或额外说明。
'''.strip()
@staticmethod
def _build_business_rule_create_prompt(req_data):
input_rule_content = req_data.get('ruleContent') or req_data.get('rule_content') or req_data.get('description') or ''
return f'''
请为测试平台创建一条“业务规则”知识资产,用于增强 AI 根据 PRD/需求生成测试用例时对确定性业务约束、校验条件、状态流转、边界条件和异常处理的理解。
<business-rule-input>
规则名称:{req_data.get('name') or ''}
用户输入的规则原文:{input_rule_content}
用户补充描述:{req_data.get('description') or ''}
标签:{req_data.get('tags') or []}
优先级枚举值:{req_data.get('priority') or 2}
</business-rule-input>
硬性约束:
1. 不要随机生成、替换或改变“用户输入的规则原文”的业务含义。
2. 返回 JSON 中的 rule_content 必须逐字等于“用户输入的规则原文”。
3. 你只能基于用户输入补充 applicable_scene、example、tags、priority并生成用于测试用例生成的 RULE.md。
4. RULE.md 的“## Rule”章节必须逐字包含“用户输入的规则原文”不能改写成另一条规则。
请只输出 JSON 对象:
{{
"rule_content": "逐字返回用户输入的规则原文",
"applicable_scene": "该规则适用的业务场景",
"example": "输入/场景/预期的示例",
"tags": ["标签1", "标签2"],
"priority": 2,
"rule_md": "完整的 RULE.md 文件内容,包含 YAML frontmatter 和 Markdown body"
}}
RULE.md 要求:必须包含 YAML frontmatter至少包含 name 和 descriptionbody 建议包含规则说明、适用场景、测试关注点、正反例、生成用例时的约束内容必须面向测试用例生成priority 只能是 0、1、2、3tags 最多 8 个;不要输出代码块或额外说明。
'''.strip()
@staticmethod
def _split_document_content(document_content, max_chars=8000):
    """Split a document into ``{'title', 'content'}`` chunks of bounded size.

    The text is first cut into sections by ``_split_by_headings``. Consecutive
    sections are packed into one chunk while their summed length stays within
    ``max_chars`` (the ``'\\n\\n'`` joiners are not counted); a single section
    longer than ``max_chars`` is delegated to ``_split_large_section``.

    Returns:
        A list of chunk dicts; ``[]`` for empty/blank input.
    """
    text = (document_content or '').strip()
    if not text:
        return []

    result = []
    pending = []            # section bodies collected for the chunk being built
    pending_size = 0
    pending_title = '文档内容'

    def flush():
        """Emit the pending sections as one chunk and start a new one."""
        nonlocal pending, pending_size
        if pending:
            result.append({'title': pending_title, 'content': '\n\n'.join(pending)})
        pending = []
        pending_size = 0

    for section in AIService._split_by_headings(text):
        body = section['content'].strip()
        if not body:
            continue
        if len(body) > max_chars:
            # Oversized section: close the current chunk, then split it alone.
            flush()
            result.extend(AIService._split_large_section(section['title'], body, max_chars))
            continue
        if pending and pending_size + len(body) > max_chars:
            flush()
        if not pending:
            # A chunk is titled after the first section it contains.
            pending_title = section['title']
        pending.append(body)
        pending_size += len(body)
    flush()
    return result or [{'title': '文档内容', 'content': text}]
@staticmethod
def _split_by_headings(content):
heading_pattern = re.compile(r'(?m)^(#{1,6}\s+.+|第[一二三四五六七八九十百千万\d]+[章节部分篇].*|\d+(?:\.\d+)*[、.]\s*.+)$')
matches = list(heading_pattern.finditer(content))
if not matches:
return [{'title': '文档内容', 'content': content}]
sections = []
if matches[0].start() > 0:
sections.append({'title': '文档开头', 'content': content[:matches[0].start()].strip()})
for index, match in enumerate(matches):
start = match.start()
end = matches[index + 1].start() if index + 1 < len(matches) else len(content)
title = match.group(0).strip().lstrip('#').strip()
sections.append({'title': title[:80] or '文档内容', 'content': content[start:end].strip()})
return sections
@staticmethod
def _split_large_section(title, section_text, max_chars):
paragraphs = re.split(r'\n\s*\n', section_text)
chunks = []
current_parts = []
current_len = 0
part_index = 1
for paragraph in paragraphs:
paragraph = paragraph.strip()
if not paragraph:
continue
while len(paragraph) > max_chars:
if current_parts:
chunks.append({'title': f'{title}(第{part_index}部分)', 'content': '\n\n'.join(current_parts)})
part_index += 1
current_parts = []
current_len = 0
chunks.append({'title': f'{title}(第{part_index}部分)', 'content': paragraph[:max_chars]})
part_index += 1
paragraph = paragraph[max_chars:]
if current_parts and current_len + len(paragraph) > max_chars:
chunks.append({'title': f'{title}(第{part_index}部分)', 'content': '\n\n'.join(current_parts)})
part_index += 1
current_parts = []
current_len = 0
current_parts.append(paragraph)
current_len += len(paragraph)
if current_parts:
chunks.append({'title': f'{title}(第{part_index}部分)', 'content': '\n\n'.join(current_parts)})
return chunks
@staticmethod
def _deduplicate_cases(cases):
seen = {}
deduplicated = []
for case in cases:
key = f"{case.get('module_name', '')}::{case.get('title', '')}".strip().lower()
if not key or key in seen:
continue
seen[key] = True
deduplicated.append(case)
return deduplicated
@staticmethod
def _normalize_cases(parsed_result, template=None, chunk_title=''):
    """Convert a parsed AI response into the platform's case dict shape.

    Case-like dicts are located with ``_collect_case_items``; English and
    Chinese key aliases are accepted for each field, and missing fields fall
    back to ``template`` values or built-in defaults.

    Args:
        parsed_result: decoded JSON (dict / list) returned by the model.
        template: optional defaults for ``priority``, ``case_type`` and ``tags``.
        chunk_title: accepted for interface compatibility; not used here.

    Returns:
        A list of dicts with keys ``selected``, ``module_name``, ``title``,
        ``precondition``, ``steps``, ``expected_result``, ``priority``,
        ``case_type`` and ``tags``.
    """
    template = template or {}

    def first_present(item, *keys):
        # Unlike an ``or`` chain this keeps falsy-but-meaningful values: an
        # integer priority of 0 (P0) must not be replaced by the default.
        for key in keys:
            value = item.get(key)
            if value is not None and value != '':
                return value
        return None

    normalized = []
    for index, item in enumerate(AIService._collect_case_items(parsed_result), 1):
        if not isinstance(item, dict):
            continue
        tags = item.get('tags') or item.get('标签') or template.get('tags', ['AI生成'])
        if isinstance(tags, str):
            # Models often answer with full-width separators; accept them too.
            tags = [tag.strip() for tag in re.split(r'[,,、]', tags) if tag.strip()]
        normalized.append({
            'selected': item.get('selected', True),
            'module_name': AIService._normalize_module_name(item.get('module_name') or item.get('所属模块') or item.get('module') or '未分类'),
            'title': item.get('title') or item.get('用例名称') or item.get('case_name') or item.get('name') or f'AI生成用例{index}',
            'precondition': item.get('precondition') or item.get('前置条件') or '',
            'steps': AIService._number_lines(item.get('steps') or item.get('步骤描述') or item.get('操作步骤') or ''),
            'expected_result': AIService._number_lines(item.get('expected_result') or item.get('expected_results') or item.get('预期结果') or item.get('期望结果') or ''),
            'priority': AIService._normalize_priority(first_present(item, 'priority', '用例等级'), template.get('priority', 2)),
            'case_type': AIService._normalize_case_type(item.get('case_type') or item.get('类型') or item.get('标签'), template.get('case_type', 1)),
            'tags': tags or ['AI生成']
        })
    return normalized
@staticmethod
def _collect_case_items(value):
    """Recursively gather every dict that looks like a test case.

    A dict counts as a case when it has at least one of the known title /
    steps / expected-result keys; such a dict is returned as-is and not
    searched further. Other dicts and lists are walked depth-first in order.
    Anything else contributes nothing.
    """
    if isinstance(value, list):
        children = value
    elif isinstance(value, dict):
        case_keys = {'title', '用例名称', 'case_name', 'name', 'steps', '步骤描述', '操作步骤', 'expected_result', '预期结果', '期望结果'}
        if not case_keys.isdisjoint(value):
            return [value]
        children = value.values()
    else:
        return []
    return [
        found
        for child in children
        for found in AIService._collect_case_items(child)
    ]
@staticmethod
def _normalize_module_name(module_name):
parts = [part.strip() for part in re.split(r'[/\\>|]', str(module_name or '')) if part.strip()]
return '/'.join(parts[:3]) if parts else '未分类'
@staticmethod
def _number_lines(value):
if isinstance(value, list):
lines = [str(item).strip() for item in value if str(item).strip()]
else:
lines = [line.strip() for line in re.split(r'\n+', str(value or '')) if line.strip()]
normalized = []
for index, line in enumerate(lines, 1):
cleaned_line = re.sub(r'^(?:步骤|预期结果)?\s*\d+\s*[.、.]\s*', '', line).strip()
normalized.append(f'{index}. {cleaned_line}')
return '\n'.join(normalized)
@staticmethod
def _normalize_priority(value, default=2):
if isinstance(value, int):
return value
return {'P0': 0, 'P1': 1, 'P2': 2, 'P3': 3, 'P4': 3, 'P5': 3}.get(str(value).upper(), default)
@staticmethod
def _normalize_case_type(value, default=1):
if isinstance(value, int):
return value
text = str(value or '')
if '性能' in text:
return 2
if '安全' in text:
return 3
if '接口' in text or 'API' in text.upper():
return 4
return default
@staticmethod
def _build_generation_context(template):
template = template or {}
skill_contexts = template.get('skill_contexts') or []
rule_contexts = template.get('rule_contexts') or []
if not skill_contexts and not rule_contexts:
return ''
parts = ['<selected-generation-context>']
if skill_contexts:
parts.append('请在生成测试用例时结合以下用户指定 Skill')
for item in skill_contexts:
parts.append(f'''<selected-skill id="{item.get('id')}" name="{item.get('name')}">
{item.get('content') or ''}
</selected-skill>''')
if rule_contexts:
parts.append('请在生成测试用例时严格覆盖以下用户指定业务规则:')
for item in rule_contexts:
parts.append(f'''<selected-rule id="{item.get('id')}" name="{item.get('name')}">
{item.get('content') or ''}
</selected-rule>''')
parts.append('</selected-generation-context>')
return '\n\n'.join(parts)
@staticmethod
def _build_prompt(document_content, template=None, skill_content='', chunk_index=1, total_chunks=1, chunk_title='文档内容'):
    """Build the case-generation prompt for one document chunk.

    Args:
        document_content: text of the current chunk.
        template: optional dict with default ``priority``, ``case_type`` and
            ``tags``, plus optional ``skill_contexts`` / ``rule_contexts``.
        skill_content: text of the test-case-generator skill to embed.
        chunk_index: 1-based position of this chunk.
        total_chunks: total number of chunks.
        chunk_title: title shown for this chunk.

    Returns:
        The prompt text with surrounding whitespace stripped.
    """
    template = template or {}
    # A template may carry only skill/rule selections; read the defaults with
    # ``.get`` so a missing key cannot raise KeyError while formatting.
    default_priority = template.get('priority', 2)
    default_case_type = template.get('case_type', 1)
    default_tags = template.get('tags', ['AI生成'])
    generation_context = AIService._build_generation_context(template)
    return f'''
请使用下面的 test-case-generator skill 对需求文档分段进行深度测试用例设计。最终只输出 JSON。
<test-case-generator-skill>
{skill_content}
</test-case-generator-skill>
{generation_context}
<document-chunk-info>
当前分段:{chunk_index}/{total_chunks}
分段标题:{chunk_title}
</document-chunk-info>
<requirement-document-chunk>
{document_content}
</requirement-document-chunk>
平台入库配置:
- 默认优先级(priority): {default_priority}
- 默认用例类型(case_type): {default_case_type}
- 默认标签(tags): {default_tags}
输出 JSON 结构:
{{"cases":[{{"title":"用例名称/测试点名称","module_name":"父模块/子模块/叶子模块","precondition":"前置条件","steps":"步骤1\\n步骤2","expected_result":"预期结果1\\n预期结果2","priority":2,"case_type":1,"tags":["AI生成"]}}]}}
'''.strip()
@staticmethod
def parse_pdf_and_generate_cases(pdf_path, template=None):
    """Extract the text of a PDF and generate test cases from it.

    Page texts are concatenated with a trailing newline each; pages with no
    extractable text are skipped.

    Returns:
        Whatever ``AIService.generate_test_cases`` returns on success, or
        ``([], message)`` when the PDF has no text or any step raises (the
        error is also written to the Flask application log).
    """
    try:
        from PyPDF2 import PdfReader
        page_texts = [page.extract_text() for page in PdfReader(pdf_path).pages]
        content = ''.join(text + '\n' for text in page_texts if text)
        if content.strip():
            return AIService.generate_test_cases(content, template)
        return [], 'PDF文件内容为空'
    except Exception as e:
        current_app.logger.error(f'解析PDF并生成用例失败: {str(e)}')
        return [], f'解析PDF失败: {str(e)}'

View File

@@ -0,0 +1,507 @@
# encoding: UTF-8
import os
import re
from ..model.documentSourceModel import DocumentSource
from ..model.caseModel import TestCase, Module
from ..dao.documentSourceDao import DocumentSourceDao
from ..dao.caseDao import CaseDao
from ..dao.skillDao import SkillDao
from .aiService import AIService
class DocumentSourceService:
    """Service layer for document sources and AI-generated test cases."""
    # Document type codes compared against ``DocumentSource.type``.
    DOCUMENT_TYPE_PDF = 1
    DOCUMENT_TYPE_FEISHU = 2
    # Processing status codes written to ``DocumentSource.status``:
    # newly created -> content available -> test cases generated.
    DOCUMENT_STATUS_PENDING = 0
    DOCUMENT_STATUS_PARSED = 1
    DOCUMENT_STATUS_GENERATED = 2
@staticmethod
def create(session, data):
    """Create a new document source record with the next version number.

    ``productId``, ``projectId`` (camelCase or snake_case) and ``source`` are
    required. For Feishu documents the page text is fetched immediately; when
    that succeeds the record is stored with the fetched content and status
    PARSED, otherwise it stays PENDING with the submitted content.

    Returns:
        ``(doc_id, '')`` on success, ``(0, message)`` when a required
        parameter is missing.
    """
    product_id = data.get('productId') or data.get('product_id')
    project_id = data.get('projectId') or data.get('project_id')
    source = data.get('source')
    if not (product_id and project_id and source):
        return 0, 'productId、projectId、source 为必传参数'

    document_type = data.get('type', 1)
    next_version = DocumentSourceDao.get_max_version(session, product_id, project_id, source) + 1
    document_source = DocumentSource(
        product_id=product_id,
        project_id=project_id,
        type=document_type,
        source=source,
        content=data.get('content', ''),
        version=next_version,
        status=DocumentSourceService.DOCUMENT_STATUS_PENDING,
        created_by=data.get('createdBy') or data.get('created_by'),
        is_delete=0
    )
    if document_type == DocumentSourceService.DOCUMENT_TYPE_FEISHU:
        fetched = DocumentSourceService._fetch_feishu_content(source)
        if fetched:
            document_source.content = fetched
            document_source.status = DocumentSourceService.DOCUMENT_STATUS_PARSED
    return DocumentSourceDao.create(session, document_source), ''
@staticmethod
def _fetch_feishu_content(url):
try:
import requests
from bs4 import BeautifulSoup
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
response = requests.get(url, headers=headers, timeout=30)
if response.status_code == 200:
soup = BeautifulSoup(response.content, 'html.parser')
return soup.get_text(strip=True)[:10000]
return None
except Exception:
return None
@staticmethod
def parse_pdf_content(pdf_path):
    """Return the text of all PDF pages concatenated with no separator.

    Pages without extractable text are skipped. Returns ``None`` when the PDF
    cannot be read for any reason.
    """
    try:
        from PyPDF2 import PdfReader
        page_texts = [page.extract_text() for page in PdfReader(pdf_path).pages]
        return ''.join(text for text in page_texts if text)
    except Exception:
        return None
@staticmethod
def get_by_id(session, document_id):
    """Return the document source with ``document_id`` as provided by the DAO."""
    document = DocumentSourceDao.get_by_id(session, document_id)
    return document
@staticmethod
def list(session, req_data):
    """Return one page of non-deleted document sources, newest first.

    Optional filters read from ``req_data``: ``productId`` / ``projectId``
    (camelCase or snake_case), ``type``, ``status`` and ``keyword`` (substring
    match on ``source``). Paging uses ``pageNo``/``page`` (default 1) and
    ``pageSize``/``size`` (default 20).

    Returns:
        ``(items, total)`` as produced by the DAO.
    """
    conditions = [DocumentSource.is_delete == 0]

    product_id = req_data.get('productId') or req_data.get('product_id')
    project_id = req_data.get('projectId') or req_data.get('project_id')
    if product_id:
        conditions.append(DocumentSource.product_id == product_id)
    if project_id:
        conditions.append(DocumentSource.project_id == project_id)

    # ``type`` and ``status`` may legitimately be 0, so only None means "no filter".
    doc_type = req_data.get('type')
    status = req_data.get('status')
    if doc_type is not None:
        conditions.append(DocumentSource.type == doc_type)
    if status is not None:
        conditions.append(DocumentSource.status == status)

    keyword = req_data.get('keyword')
    if keyword:
        conditions.append(DocumentSource.source.like(f'%{keyword}%'))

    page_no = int(req_data.get('pageNo', req_data.get('page', 1)))
    page_size = int(req_data.get('pageSize', req_data.get('size', 20)))
    newest_first = DocumentSource.created_time.desc()
    items, total = DocumentSourceDao.list_by_filters(session, conditions, page_no, page_size, newest_first)
    return items, total
@staticmethod
def update(session, document_id, data):
    """Update the editable fields of a document source.

    Only ``type``, ``source``, ``content`` and ``ai_model`` are copied from
    ``data``. Returns the DAO result, or ``1`` when there is nothing to update.
    """
    editable = ('type', 'source', 'content', 'ai_model')
    changes = {field: data[field] for field in editable if field in data}
    if not changes:
        return 1
    return DocumentSourceDao.update_by_id(session, document_id, changes)
@staticmethod
def delete(session, document_id):
    """Soft-delete a document source and remove its uploaded PDF, if any.

    For PDF documents ``source`` holds a path relative to the working
    directory (e.g. ``uploads/zhyy/v2.0/xxx.pdf``). The file is removed only
    when the resolved path stays inside the working directory; a failure to
    remove it is logged and does not block the database delete.

    Returns:
        ``(result, '')`` with the DAO result, or ``(0, message)`` when the
        document does not exist.
    """
    from flask import current_app

    # Look the document up first so the file path is known.
    document = DocumentSourceDao.get_by_id(session, document_id)
    if not document:
        return 0, '文档不存在'
    if document.type == DocumentSourceService.DOCUMENT_TYPE_PDF and document.source:
        workspace_root = os.path.abspath(os.getcwd())
        file_path = os.path.abspath(os.path.join(workspace_root, document.source))
        # ``source`` is stored from request data: an absolute path or ``..``
        # segments must never let this delete a file outside the workspace.
        try:
            inside_workspace = os.path.commonpath([workspace_root, file_path]) == workspace_root
        except ValueError:
            # Raised e.g. for paths on different drives.
            inside_workspace = False
        if not inside_workspace or file_path == workspace_root:
            current_app.logger.error(f'删除文件失败: {file_path}, 错误: 路径不在工作目录内')
        else:
            try:
                if os.path.isfile(file_path):
                    os.remove(file_path)
                    current_app.logger.info(f'已删除文件: {file_path}')
            except OSError as e:
                current_app.logger.error(f'删除文件失败: {file_path}, 错误: {str(e)}')
    # Soft-delete the database record.
    result = DocumentSourceDao.delete_by_id(session, document_id)
    return result, ''
@staticmethod
def refresh_content(session, document_id):
    """Re-fetch the content of a Feishu document and mark it as parsed.

    Returns:
        ``(True, '')`` when new content was stored; ``(False, message)`` when
        the document is missing, is not a Feishu document, or the fetch
        returned nothing.
    """
    document = DocumentSourceDao.get_by_id(session, document_id)
    if not document:
        return False, '文档不存在'
    if document.type != DocumentSourceService.DOCUMENT_TYPE_FEISHU:
        return False, '仅支持刷新飞书链接内容'
    fetched = DocumentSourceService._fetch_feishu_content(document.source)
    if not fetched:
        return False, '获取飞书内容失败'
    DocumentSourceDao.update_by_id(session, document_id, {
        'content': fetched,
        'status': DocumentSourceService.DOCUMENT_STATUS_PARSED
    })
    return True, ''
@staticmethod
def generate_cases(session, document_id, template=None):
    """Generate test cases for one document and mark it as GENERATED.

    A PDF document without stored content is parsed from its file on disk
    first, and the extracted text is saved along with the new status.

    Returns:
        ``(cases, '')`` on success, ``([], message)`` on any failure.
    """
    document = DocumentSourceDao.get_by_id(session, document_id)
    if not document:
        return [], '文档不存在'

    generated = DocumentSourceService.DOCUMENT_STATUS_GENERATED
    is_unparsed_pdf = (
        document.type == DocumentSourceService.DOCUMENT_TYPE_PDF
        and not document.content
    )
    if is_unparsed_pdf:
        pdf_path = os.path.join(os.getcwd(), document.source)
        if not os.path.exists(pdf_path):
            return [], 'PDF文件不存在'
        # Let the AI service read the PDF and generate the cases.
        cases, msg = AIService.parse_pdf_and_generate_cases(pdf_path, template)
        if msg:
            return [], msg
        # Persist the extracted text together with the new status.
        extracted = DocumentSourceService._extract_content_from_pdf(pdf_path)
        DocumentSourceDao.update_by_id(session, document_id, {
            'content': extracted,
            'status': generated
        })
        return cases, ''

    if not document.content:
        return [], '文档内容为空'
    cases, msg = AIService.generate_test_cases(document.content, template)
    if msg:
        return [], msg
    DocumentSourceDao.update_by_id(session, document_id, {'status': generated})
    return cases, ''
@staticmethod
def _extract_content_from_pdf(pdf_path):
"""提取PDF内容"""
try:
from PyPDF2 import PdfReader
reader = PdfReader(pdf_path)
content = ''
for page in reader.pages:
page_content = page.extract_text()
if page_content:
content += page_content + '\n'
return content
except Exception:
return ''
@staticmethod
def generate_cases_batch(session, document_ids, template=None):
    """Generate test cases from several documents merged into one input.

    Each readable document contributes its content (PDFs without stored
    content are parsed and saved first). The merged text is sent to the AI
    service once, and every document that contributed is marked GENERATED.

    Args:
        session: database session.
        document_ids: list of document ids; ``None`` is treated as empty.
        template: case template configuration; may carry ``skill_ids`` /
            ``rule_ids`` that are resolved into prompt context.

    Returns:
        ``(cases, failed_docs)`` where ``failed_docs`` is a list of
        ``{'documentId', 'error'}`` dicts. When context loading or generation
        fails as a whole, ``cases`` is empty and a single entry with
        ``documentId == 'all'`` is returned.
    """
    failed_docs = []
    combined_content = []
    template = template or {}
    document_ids = document_ids or []
    for doc_id in document_ids:
        document = DocumentSourceDao.get_by_id(session, doc_id)
        if not document:
            failed_docs.append({'documentId': doc_id, 'error': '文档不存在'})
            continue
        content = document.content
        # A PDF without stored content is parsed from disk first.
        if document.type == DocumentSourceService.DOCUMENT_TYPE_PDF and not content:
            pdf_path = os.path.join(os.getcwd(), document.source)
            if not os.path.exists(pdf_path):
                failed_docs.append({'documentId': doc_id, 'error': 'PDF文件不存在'})
                continue
            content = DocumentSourceService._extract_content_from_pdf(pdf_path)
            if not content:
                failed_docs.append({'documentId': doc_id, 'error': 'PDF内容为空'})
                continue
            # Save the extracted text so the PDF is not parsed again.
            DocumentSourceDao.update_by_id(session, doc_id, {
                'content': content,
                'status': DocumentSourceService.DOCUMENT_STATUS_PARSED
            })
        if not content:
            failed_docs.append({'documentId': doc_id, 'error': '文档内容为空'})
            continue
        # Prefix each document with a marker carrying its id.
        combined_content.append(f"【文档ID: {doc_id}\n{content}\n")
    if not combined_content:
        return [], failed_docs
    merged_content = "\n---\n".join(combined_content)
    context_template, context_err = DocumentSourceService._attach_generation_context(session, template)
    if context_err:
        return [], [{'documentId': 'all', 'error': context_err}]
    # One generation call over the merged content.
    cases, msg = AIService.generate_test_cases(merged_content, context_template)
    if msg:
        return [], [{'documentId': 'all', 'error': msg}]
    # Build the failed-id lookup once instead of rebuilding a list per document.
    failed_ids = {item['documentId'] for item in failed_docs}
    for doc_id in document_ids:
        if doc_id not in failed_ids:
            DocumentSourceDao.update_by_id(session, doc_id, {
                'status': DocumentSourceService.DOCUMENT_STATUS_GENERATED
            })
    return cases, failed_docs
@staticmethod
def _attach_generation_context(session, template):
template = dict(template or {})
skill_ids = template.get('skill_ids') or []
rule_ids = template.get('rule_ids') or []
if not skill_ids and not rule_ids:
return template, ''
project_id = template.get('project_id')
if not project_id:
return template, 'projectId 为必传参数'
try:
skill_ids = [int(item) for item in skill_ids]
rule_ids = [int(item) for item in rule_ids]
except (TypeError, ValueError):
return template, 'skillIds、ruleIds 必须是数字数组'
skills = SkillDao.list_skills_by_ids(session, project_id, skill_ids)
rules = SkillDao.list_business_rules_by_ids(session, project_id, rule_ids)
if len(skills) != len(set(skill_ids)):
return template, '存在未查询到的 Skill 或 Skill 不属于当前项目'
if len(rules) != len(set(rule_ids)):
return template, '存在未查询到的业务规则或业务规则不属于当前项目'
skill_contexts, err_msg = DocumentSourceService._load_asset_contexts(skills, 'skill_file_path', 'Skill')
if err_msg:
return template, err_msg
rule_contexts, err_msg = DocumentSourceService._load_asset_contexts(rules, 'rule_file_path', '业务规则')
if err_msg:
return template, err_msg
template['skill_contexts'] = skill_contexts
template['rule_contexts'] = rule_contexts
return template, ''
@staticmethod
def _load_asset_contexts(items, path_field, source_label):
contexts = []
workspace_root = os.getcwd()
for item in items:
file_path = getattr(item, path_field, None)
if not file_path:
return [], f'{source_label}{getattr(item, "name", "")}」未配置文件路径'
if not os.path.isabs(file_path):
file_path = os.path.join(workspace_root, file_path)
normalized_path = os.path.abspath(file_path)
if not os.path.exists(normalized_path):
return [], f'{source_label}{getattr(item, "name", "")}」文件不存在'
try:
with open(normalized_path, 'r', encoding='utf-8') as file_obj:
content = file_obj.read()
except Exception as e:
return [], f'{source_label}{getattr(item, "name", "")}」文件读取失败:{str(e)}'
contexts.append({
'id': item.id,
'name': item.name,
'path': normalized_path,
'content': content
})
return contexts, ''
@staticmethod
def match_modules(session, project_id, cases):
    """Set ``module_id`` on every case by looking up its ``module_name`` path.

    Cases without a module name get ``module_id = None``. The list is updated
    in place and also returned.
    """
    for case in cases:
        path = case.get('module_name')
        if path:
            case['module_id'] = DocumentSourceService._find_module_by_path(session, project_id, path)
        else:
            case['module_id'] = None
    return cases
@staticmethod
def import_cases(session, document_id, cases, user_id, auto_create_module=False):
    """Persist the selected generated cases as test cases of the document's project.

    A case is skipped when it is not selected or when no module can be
    resolved for it. With ``auto_create_module`` the module path is created on
    demand; otherwise it must already exist. After the loop the document is
    marked GENERATED.

    Returns:
        ``(count, '')`` with the number of cases created; ``(0, message)``
        when the document does not exist; ``(count_so_far, message)`` as soon
        as creating a case fails.
    """
    document = DocumentSourceDao.get_by_id(session, document_id)
    if not document:
        return 0, '文档不存在'

    if auto_create_module:
        resolve_module = DocumentSourceService._get_or_create_module_path
    else:
        resolve_module = DocumentSourceService._find_module_by_path

    imported = 0
    for case_data in cases:
        if not case_data.get('selected', True):
            continue
        module_name = case_data.get('module_name', '未分类')
        module_id = case_data.get('module_id') or resolve_module(session, document.project_id, module_name)
        if not module_id:
            continue
        case_key = CaseDao.next_case_key(session, document.project_id, module_id, document.product_id)
        case_info = {
            'project_id': document.project_id,
            'module_id': module_id,
            'case_key': case_key,
            'title': case_data.get('title', ''),
            'preconditions': case_data.get('precondition', ''),
            'steps': case_data.get('steps', ''),
            'expected_results': case_data.get('expected_result', ''),
            'priority': case_data.get('priority', 2),
            'case_type': case_data.get('case_type', 1),
            'tags': case_data.get('tags', []),
            'is_ai_generated': 1,
            'status': 0,
            'is_delete': 0,
            'created_by': user_id
        }
        _case_id, err_msg = CaseDao.create(session, TestCase, case_info)
        if err_msg:
            return imported, err_msg
        imported += 1

    DocumentSourceDao.update_by_id(session, document_id, {
        'status': DocumentSourceService.DOCUMENT_STATUS_GENERATED
    })
    return imported, ''
@staticmethod
def batch_create_modules(session, project_id, module_names):
    """Ensure every module path exists and return the leaf module objects.

    Each name is resolved with ``_get_or_create_module_path``; the session is
    flushed once at the end.
    """
    resolved = [
        DocumentSourceService._get_or_create_module_path(session, project_id, name, return_model=True)
        for name in module_names
    ]
    created_modules = [module for module in resolved if module]
    session.flush()
    return created_modules
@staticmethod
def _find_module_by_path(session, project_id, module_name):
    """Return the id of the existing module at ``module_name``, else ``None``.

    The path is split with ``_parse_module_path`` and walked from the root
    (``parent_id == 0``), matching one non-deleted module per level.
    """
    parent_id = 0
    found_id = None
    for segment in DocumentSourceService._parse_module_path(module_name):
        match = session.query(Module).filter(
            Module.project_id == project_id,
            Module.parent_id == parent_id,
            Module.name == segment,
            Module.is_delete == 0
        ).first()
        if not match:
            return None
        # Descend: this level's module is the parent of the next segment.
        found_id = parent_id = match.id
    return found_id
@staticmethod
def _get_or_create_module_path(session, project_id, module_name, return_model=False):
    """Walk a module path from the root, creating any level that is missing.

    Returns:
        The leaf ``Module`` when ``return_model`` is true, otherwise its id.
    """
    parent_id = 0
    leaf = None
    for segment in DocumentSourceService._parse_module_path(module_name):
        leaf = session.query(Module).filter(
            Module.project_id == project_id,
            Module.parent_id == parent_id,
            Module.name == segment,
            Module.is_delete == 0
        ).first()
        if not leaf:
            leaf = Module(
                project_id=project_id,
                parent_id=parent_id,
                name=segment,
                sort_order=DocumentSourceService._next_module_sort_order(session, project_id, parent_id),
                path=DocumentSourceService._build_module_path(session, parent_id, segment),
                is_delete=0,
                status=0
            )
            session.add(leaf)
            # Flush before reading ``leaf.id`` for the next level.
            session.flush()
        parent_id = leaf.id
    if return_model:
        return leaf
    return leaf.id
@staticmethod
def _parse_module_path(module_name):
module_name = str(module_name or '').strip() or '未分类'
parts = [part.strip() for part in re.split(r'[/\\>|]', module_name) if part.strip()]
return (parts or ['未分类'])[:3]
@staticmethod
def _next_module_sort_order(session, project_id, parent_id):
    """Return the sort order for a new module under ``parent_id``.

    This is the highest existing ``sort_order`` among non-deleted siblings
    plus one, or ``1`` when there is no sibling with a sort order.
    """
    highest = session.query(Module).filter(
        Module.project_id == project_id,
        Module.parent_id == parent_id,
        Module.is_delete == 0
    ).order_by(Module.sort_order.desc()).first()
    if highest and highest.sort_order is not None:
        current_max = highest.sort_order
    else:
        current_max = 0
    return current_max + 1
@staticmethod
def _build_module_path(session, parent_id, name):
if not parent_id:
return name
parent = session.query(Module).filter(Module.id == parent_id, Module.is_delete == 0).first()
if parent and parent.path:
return f'{parent.path}/{name}'
if parent:
return f'{parent.name}/{name}'
return name

View File

@@ -0,0 +1,571 @@
# encoding: UTF-8
import re
import shutil
from datetime import datetime
from pathlib import Path
from ..dao.skillDao import SkillDao
from ..model.skillModel import TestSkill, TestBusinessRule
from .aiService import AIService
class SkillService(object):
    """Service layer for Skill and business-rule assets."""
    # Accepted values for a Skill's ``skill_type``.
    VALID_SKILL_TYPES = {1, 2, 3, 4, 5, 6, 7, 8, 9, 99}
    # Accepted values for ``status``.
    VALID_STATUS = {1, 2, 3}
    # Accepted level values; used to validate ``risk_level``.
    VALID_LEVELS = {0, 1, 2, 3}
@staticmethod
def _get(req_data, *keys, default=None):
for key in keys:
value = req_data.get(key)
if value not in (None, ''):
return value
return default
@staticmethod
def _ensure_list(value, field_name):
if value in (None, ''):
return [], ''
if not isinstance(value, list):
return [], f'{field_name} 必须是数组'
return value, ''
@staticmethod
def _normalize_generated_tags(value, fallback):
if isinstance(value, list):
tags = [str(item).strip() for item in value if str(item).strip()]
elif isinstance(value, str):
tags = [item.strip() for item in re.split(r'[,,、\s]+', value) if item.strip()]
else:
tags = []
return tags[:8] or fallback
@staticmethod
def _generate_unique_code(session, project_id, name, prefix, exists_checker):
name_text = str(name or '').strip().upper()
letters = re.sub(r'[^A-Z0-9]+', '_', name_text).strip('_')
code_prefix = (letters[:24] if letters else prefix) or prefix
time_part = datetime.now().strftime('%Y%m%d%H%M%S%f')[:20]
code = f'{code_prefix}_{time_part}'[:64]
if not exists_checker(session, project_id, code):
return code
for index in range(1, 100):
candidate = f'{code_prefix}_{time_part}_{index}'[:64]
if not exists_checker(session, project_id, candidate):
return candidate
return f'{prefix}_{time_part}'[:64]
@staticmethod
def _generate_skill_code(session, project_id, name):
    """Generate a unique Skill code, using ``SKILL`` as the fallback prefix."""
    exists_checker = SkillDao.get_skill_by_project_code
    return SkillService._generate_unique_code(session, project_id, name, 'SKILL', exists_checker)
@staticmethod
def _generate_rule_code(session, project_id, name):
    """Generate a unique business-rule code, using ``RULE`` as the fallback prefix."""
    exists_checker = SkillDao.get_business_rule_by_project_code
    return SkillService._generate_unique_code(session, project_id, name, 'RULE', exists_checker)
@staticmethod
def _safe_path_name(value, fallback):
value = str(value or '').strip() or fallback
value = re.sub(r'[\\/:*?"<>|\r\n\t]+', '_', value)
value = re.sub(r'\s+', ' ', value).strip(' .')
return (value or fallback)[:80]
@staticmethod
def _build_rule_file_content(rule_info):
tags = rule_info.get('tags') or []
tags_text = ', '.join([str(tag) for tag in tags])
frontmatter_name = re.sub(r'[^a-zA-Z0-9_-]+', '-', str(rule_info.get('name') or 'generated-rule').lower()).strip('-') or 'generated-rule'
description = rule_info.get('rule_content') or rule_info.get('description') or rule_info.get('name') or ''
return f'''---
name: {frontmatter_name}
description: {description}
---
# {rule_info.get('name')}
## Rule
{rule_info.get('rule_content') or ''}
## Applicable scene
{rule_info.get('applicable_scene') or ''}
## Example
{rule_info.get('example') or ''}
## Test design constraints
- Generate cases that verify this rule is satisfied in normal flows.
- Generate negative and boundary cases when the rule describes validation, limits, state changes, permissions, or data constraints.
- Mark missing prerequisites as “待确认” instead of inventing behavior.
## Metadata
- Code: {rule_info.get('rule_code') or ''}
- Product: {rule_info.get('product_name') or ''}
- Project: {rule_info.get('project_name') or ''}
- Module: {rule_info.get('module_name') or ''}
- Priority: {rule_info.get('priority')}
- Tags: {tags_text}
'''
@staticmethod
def _build_skill_file_content(skill_info):
skill_md = skill_info.get('skill_md') or skill_info.get('skillMd')
if isinstance(skill_md, str) and skill_md.strip():
return skill_md.strip() + '\n'
tags = skill_info.get('tags') or []
tags_text = ', '.join([str(tag) for tag in tags])
frontmatter_name = re.sub(r'[^a-zA-Z0-9_-]+', '-', str(skill_info.get('name') or 'generated-skill').lower()).strip('-') or 'generated-skill'
description = skill_info.get('description') or skill_info.get('name') or ''
return f'''---
name: {frontmatter_name}
description: {description}
---
# {skill_info.get('name')}
Use this skill when PRD, requirement, user story, interface specification, UI interaction, or business rule content needs to be transformed into high-quality test cases. This skill helps the model apply project-specific testing experience when designing functional, interface, boundary, exception, and regression cases.
## When to use this skill
{skill_info.get('trigger_condition') or ''}
## Analysis workflow
{skill_info.get('reasoning_path') or ''}
## Test design guidance
- Identify the core business flow, state changes, inputs, outputs, permissions, and data dependencies.
- Cover normal paths, boundary values, invalid inputs, exception handling, idempotency, concurrency, and regression risks when applicable.
- Mark missing or ambiguous requirements as “待确认” rather than inventing behavior.
## Output format
{skill_info.get('output_spec') or ''}
## Metadata
- Code: {skill_info.get('code') or ''}
- Product: {skill_info.get('product_name') or ''}
- Project: {skill_info.get('project_name') or ''}
- Module: {skill_info.get('module_name') or ''}
- Skill Type: {skill_info.get('skill_type')}
- Risk Level: {skill_info.get('risk_level')}
- Tags: {tags_text}
'''
@staticmethod
def _create_asset_file(session, project_id, module_id, asset_info, root_folder, folder_name, file_name, content_builder):
    """Write an asset file under ``config/<root_folder>/<product>/<project>/<module>/<asset>/``.

    Directory names come from ``SkillDao.get_skill_path_context`` and
    ``folder_name``, each sanitised with ``_safe_path_name``. If the asset
    directory already exists a timestamp suffix is appended to its name.
    ``content_builder`` receives a copy of ``asset_info`` extended with the
    product / project / module names and must return the file text.

    Returns:
        ``(file_path, directory_path)`` as strings.
    """
    context = SkillDao.get_skill_path_context(session, project_id, module_id)
    safe = SkillService._safe_path_name
    path_parts = (
        safe(context.get('product_name'), '未关联产品'),
        safe(context.get('project_name'), f'项目{project_id}'),
        safe(context.get('module_name'), '项目通用'),
        safe(folder_name, '未命名'),
    )
    base_dir = Path(__file__).resolve().parents[3] / 'config' / root_folder
    asset_dir = base_dir.joinpath(*path_parts)
    if asset_dir.exists():
        # Never reuse an existing directory: make the name unique instead.
        suffix = datetime.now().strftime('%Y%m%d%H%M%S%f')[:20]
        asset_dir = asset_dir.with_name(f'{asset_dir.name}_{suffix}')
    asset_dir.mkdir(parents=True, exist_ok=False)

    file_info = {
        **asset_info,
        'product_name': context.get('product_name'),
        'project_name': context.get('project_name'),
        'module_name': context.get('module_name'),
    }
    asset_path = asset_dir / file_name
    asset_path.write_text(content_builder(file_info), encoding='utf-8')
    return str(asset_path), str(asset_dir)
@staticmethod
def _create_skill_file(session, project_id, module_id, skill_info):
    """Write ``SKILL.md`` for a skill under ``config/skills`` and return its paths."""
    folder_name = skill_info.get('name')
    builder = SkillService._build_skill_file_content
    return SkillService._create_asset_file(
        session, project_id, module_id, skill_info, 'skills', folder_name, 'SKILL.md', builder
    )
@staticmethod
def _create_rule_file(session, project_id, module_id, rule_info):
    """Write ``RULE.md`` for a business rule under ``config/rules`` and return its paths."""
    folder_name = rule_info.get('name')
    builder = SkillService._build_rule_file_content
    return SkillService._create_asset_file(
        session, project_id, module_id, rule_info, 'rules', folder_name, 'RULE.md', builder
    )
@staticmethod
def _remove_asset_file_path(asset_file_path, root_folder):
if not asset_file_path:
return
asset_path = Path(asset_file_path)
base_dir = Path(__file__).resolve().parents[3] / 'config' / root_folder
try:
resolved_asset_path = asset_path.resolve()
resolved_base_dir = base_dir.resolve()
if resolved_base_dir not in resolved_asset_path.parents:
return
asset_dir = resolved_asset_path.parent
if asset_dir.exists() and asset_dir.name not in {root_folder, 'config'}:
shutil.rmtree(asset_dir)
except FileNotFoundError:
return
@staticmethod
def _remove_skill_file_path(skill_file_path):
    """Remove the directory of a skill file located under ``config/skills``."""
    root_folder = 'skills'
    SkillService._remove_asset_file_path(skill_file_path, root_folder)
@staticmethod
def _remove_rule_file_path(rule_file_path):
    """Remove the directory of a rule file located under ``config/rules``."""
    root_folder = 'rules'
    SkillService._remove_asset_file_path(rule_file_path, root_folder)
@staticmethod
def create_skill(session, req_data, user_id=None):
    """Create a skill: generate content via AI, write SKILL.md, insert the row.

    Args:
        session: DB session passed through to the DAO and file helpers.
        req_data: Request payload. ``projectId``/``project_id`` and ``name``
            are required; ``tags``, ``status``, ``skillType``, ``riskLevel``,
            ``moduleId`` and ``description`` are optional.
        user_id: Optional creator id, stored as ``owner_id``/``created_by``.

    Returns:
        ``(obj_id, '')`` on success, ``(0, error_message)`` otherwise.
    """
    def _to_int(value, fallback):
        # AI output and loosely typed request fields are not guaranteed to
        # be numeric; fall back instead of raising so the defaulting logic
        # below can actually run.
        try:
            return int(value)
        except (TypeError, ValueError):
            return fallback

    project_id = SkillService._get(req_data, 'projectId', 'project_id')
    name = SkillService._get(req_data, 'name')
    if not project_id or not name:
        return 0, 'projectId、name 为必传参数'
    input_tags, err_msg = SkillService._ensure_list(SkillService._get(req_data, 'tags', default=[]), 'tags')
    if err_msg:
        return 0, err_msg
    # Validate the caller-supplied status before paying for the AI call.
    status = _to_int(SkillService._get(req_data, 'status', default=1), None)
    if status not in SkillService.VALID_STATUS:
        return 0, 'status 不合法'
    generated_info, err_msg = AIService.generate_skill_content(req_data)
    if err_msg:
        return 0, err_msg
    generated_skill_type = generated_info.get('skill_type') or generated_info.get('skillType')
    generated_risk_level = generated_info.get('risk_level') or generated_info.get('riskLevel')
    # AI-generated values win over request values; anything unparsable or
    # outside the valid sets silently falls back to the defaults (1 / 2).
    skill_type = _to_int(
        generated_skill_type if generated_skill_type is not None
        else SkillService._get(req_data, 'skillType', 'skill_type', default=1),
        1
    )
    risk_level = _to_int(
        generated_risk_level if generated_risk_level is not None
        else SkillService._get(req_data, 'riskLevel', 'risk_level', default=2),
        2
    )
    if skill_type not in SkillService.VALID_SKILL_TYPES:
        skill_type = 1
    if risk_level not in SkillService.VALID_LEVELS:
        risk_level = 2
    generated_tags = SkillService._normalize_generated_tags(generated_info.get('tags'), input_tags)
    module_id_value = SkillService._get(req_data, 'moduleId', 'module_id')
    module_id = int(module_id_value) if module_id_value else None
    add_info = {
        'project_id': int(project_id),
        'module_id': module_id,
        'name': name,
        'code': SkillService._generate_skill_code(session, project_id, name),
        'description': generated_info.get('description') or SkillService._get(req_data, 'description') or name,
        'trigger_condition': AIService.get_default_case_generation_trigger_condition(),
        'reasoning_path': generated_info.get('reasoning_path') or generated_info.get('reasoningPath'),
        'output_spec': AIService.get_default_case_generation_output_spec(),
        'skill_type': skill_type,
        'risk_level': risk_level,
        'tags': generated_tags,
        'status': status,
        'owner_id': int(user_id) if user_id else None,
        'created_by': user_id,
        'usage_count': 0,
        'is_delete': 0
    }
    # The file payload carries the generated markdown in addition to the
    # DB columns; ``skill_md`` is not part of the row being inserted.
    skill_file_info = dict(add_info)
    skill_file_info['skill_md'] = generated_info.get('skill_md')
    try:
        skill_file_path, skill_dir = SkillService._create_skill_file(
            session, int(project_id), module_id, skill_file_info
        )
    except Exception as e:
        # Same error contract as update_skill: report, do not raise.
        return 0, f'Skill 文件创建失败:{str(e)}'
    add_info['skill_file_path'] = skill_file_path
    obj_id, create_err = SkillDao.create(session, TestSkill, add_info)
    if create_err:
        # Roll back the file written above so no orphan directory is left.
        shutil.rmtree(skill_dir, ignore_errors=True)
        return 0, create_err
    return obj_id, ''
@staticmethod
def update_skill(session, req_data):
    """Update a skill row and regenerate its SKILL.md file.

    Args:
        session: DB session passed through to the DAO and file helpers.
        req_data: Request payload; ``skillId``/``id`` is required, every
            other recognised field is optional and only applied when present.

    Returns:
        ``(obj_id, '')`` on success, ``(0 or dao_id, error_message)`` on
        failure. When no updatable field is supplied the skill id is
        returned unchanged.
    """
    skill_id = SkillService._get(req_data, 'skillId', 'id')
    if not skill_id:
        return 0, 'skillId 为必传参数'
    item = SkillDao.get_by_id(session, TestSkill, skill_id)
    if not item:
        return 0, '未查询到对应 Skill'
    update_info = {}
    mapping = [
        ('name', 'name'), ('description', 'description'),
        ('triggerCondition', 'trigger_condition'), ('trigger_condition', 'trigger_condition'),
        ('reasoningPath', 'reasoning_path'), ('reasoning_path', 'reasoning_path'),
        ('outputSpec', 'output_spec'), ('output_spec', 'output_spec')
    ]
    for req_key, column_key in mapping:
        value = SkillService._get(req_data, req_key)
        if value is not None:
            update_info[column_key] = value
    module_id = SkillService._get(req_data, 'moduleId', 'module_id')
    if module_id is not None:
        # An empty string clears the association.
        update_info['module_id'] = int(module_id) if module_id != '' else None
    owner_id = SkillService._get(req_data, 'ownerId', 'owner_id')
    if owner_id is not None:
        update_info['owner_id'] = int(owner_id) if owner_id != '' else None
    tags = SkillService._get(req_data, 'tags')
    if tags is not None:
        tags, err_msg = SkillService._ensure_list(tags, 'tags')
        if err_msg:
            return 0, err_msg
        update_info['tags'] = tags
    for req_key, column_key, valid_set in [
        ('skillType', 'skill_type', SkillService.VALID_SKILL_TYPES),
        ('skill_type', 'skill_type', SkillService.VALID_SKILL_TYPES),
        ('riskLevel', 'risk_level', SkillService.VALID_LEVELS),
        ('risk_level', 'risk_level', SkillService.VALID_LEVELS),
        ('status', 'status', SkillService.VALID_STATUS)
    ]:
        value = SkillService._get(req_data, req_key)
        if value is not None:
            try:
                value = int(value)
            except (TypeError, ValueError):
                # Non-numeric input is invalid input, not a server error.
                return 0, f'{req_key} 不合法'
            if value not in valid_set:
                return 0, f'{req_key} 不合法'
            update_info[column_key] = value
    if not update_info:
        return int(skill_id), ''
    # Snapshot the current path BEFORE the DAO update.
    # NOTE(review): if the DAO refreshes `item` in place, reading the
    # attribute afterwards would yield the new path and the cleanup below
    # would delete the file that was just written.
    old_skill_file_path = item.skill_file_path
    merged_info = item.to_dict()
    merged_info.update(update_info)
    try:
        new_skill_file_path, new_skill_dir = SkillService._create_skill_file(
            session,
            int(merged_info.get('project_id')),
            merged_info.get('module_id'),
            merged_info
        )
    except Exception as e:
        return 0, f'Skill 文件创建失败:{str(e)}'
    update_info['skill_file_path'] = new_skill_file_path
    obj_id, err_msg = SkillDao.update_by_id(session, TestSkill, skill_id, update_info)
    if err_msg:
        # DB update failed: discard the freshly written file, keep the old one.
        if new_skill_dir:
            shutil.rmtree(new_skill_dir, ignore_errors=True)
        return obj_id, err_msg
    if old_skill_file_path != new_skill_file_path:
        SkillService._remove_skill_file_path(old_skill_file_path)
    return obj_id, ''
@staticmethod
def delete_skill(session, req_data):
    """Delete a skill through the DAO and then remove its file directory.

    Args:
        session: DB session passed through to the DAO.
        req_data: Request payload; ``skillId``/``id`` is required.

    Returns:
        ``(obj_id, '')`` on success, ``(0 or dao_id, error_message)`` on failure.
    """
    skill_id = SkillService._get(req_data, 'skillId', 'id')
    if not skill_id:
        return 0, 'skillId 为必传参数'
    item = SkillDao.get_by_id(session, TestSkill, skill_id)
    if not item:
        return 0, '未查询到对应 Skill'
    # Read the path before the delete.
    # NOTE(review): attribute access on `item` after the DAO call may not be
    # safe depending on how the DAO deletes/commits — confirm against SkillDao.
    skill_file_path = item.skill_file_path
    obj_id, err_msg = SkillDao.delete_by_id(session, TestSkill, skill_id)
    if err_msg:
        return obj_id, err_msg
    SkillService._remove_skill_file_path(skill_file_path)
    return obj_id, ''
@staticmethod
def skill_detail(session, skill_id):
    """Return ``(skill_dict, '')`` for the given id, or ``({}, message)`` when not found."""
    skill = SkillDao.get_by_id(session, TestSkill, skill_id)
    if skill:
        return skill.to_dict(), ''
    return {}, '未查询到对应 Skill'
@staticmethod
def skill_list(session, req_data):
    """List skills with optional filters and paging.

    Builds equality filters from ``projectId``, ``moduleId``, ``status``,
    ``skillType`` and ``riskLevel`` (camelCase or snake_case) and forwards
    them with page number, page size, keyword and tag to
    ``SkillDao.list_skill``.

    Returns:
        ``{'list': [...], 'total': total}`` with each item as a dict.
    """
    project_id = SkillService._get(req_data, 'projectId', 'project_id')
    module_id = SkillService._get(req_data, 'moduleId', 'module_id')
    status = SkillService._get(req_data, 'status')
    skill_type = SkillService._get(req_data, 'skillType', 'skill_type')
    risk_level = SkillService._get(req_data, 'riskLevel', 'risk_level')

    conditions = []
    if project_id:
        conditions.append(TestSkill.project_id == int(project_id))
    # These filters accept 0 as a real value, so only None/'' mean "not set".
    optional_filters = (
        (TestSkill.module_id, module_id),
        (TestSkill.status, status),
        (TestSkill.skill_type, skill_type),
        (TestSkill.risk_level, risk_level),
    )
    for column, raw_value in optional_filters:
        if raw_value not in (None, ''):
            conditions.append(column == int(raw_value))

    page_no = SkillService._get(req_data, 'pageNo', 'page', default=1)
    page_size = SkillService._get(req_data, 'pageSize', 'size', default=20)
    keyword = SkillService._get(req_data, 'keyword')
    tag = SkillService._get(req_data, 'tag')
    rows, total = SkillDao.list_skill(session, conditions, page_no, page_size, keyword, tag)
    return {'list': [row.to_dict() for row in rows], 'total': total}
@staticmethod
def create_business_rule(session, req_data, user_id=None):
    """Create a business rule: enrich it via AI, write RULE.md, insert the row.

    Args:
        session: DB session passed through to the DAO and file helpers.
        req_data: Request payload. ``projectId``/``project_id`` and ``name``
            are required; ``tags``, ``priority``, ``status``, ``moduleId``,
            ``ruleContent``, ``description``, ``applicableScene`` and
            ``example`` are optional.
        user_id: Optional creator id, stored as ``owner_id``/``created_by``.

    Returns:
        ``(obj_id, '')`` on success, ``(0, error_message)`` otherwise.
    """
    def _to_int(value, fallback):
        # AI output and loosely typed request fields are not guaranteed to
        # be numeric; fall back instead of raising so the defaulting logic
        # below can actually run.
        try:
            return int(value)
        except (TypeError, ValueError):
            return fallback

    project_id = SkillService._get(req_data, 'projectId', 'project_id')
    name = SkillService._get(req_data, 'name')
    if not project_id or not name:
        return 0, 'projectId、name 为必传参数'
    input_tags, err_msg = SkillService._ensure_list(SkillService._get(req_data, 'tags', default=[]), 'tags')
    if err_msg:
        return 0, err_msg
    # Validate the caller-supplied status before paying for the AI call.
    status = _to_int(SkillService._get(req_data, 'status', default=1), None)
    if status not in SkillService.VALID_STATUS:
        return 0, 'status 不合法'
    generated_info, err_msg = AIService.generate_business_rule_content(req_data)
    if err_msg:
        return 0, err_msg
    # Request priority wins over the AI suggestion; anything unparsable or
    # outside the valid set falls back to 2.
    input_priority = SkillService._get(req_data, 'priority')
    priority_value = input_priority if input_priority is not None else generated_info.get('priority')
    priority = _to_int(priority_value if priority_value is not None else 2, 2)
    if priority not in SkillService.VALID_LEVELS:
        priority = 2
    generated_tags = SkillService._normalize_generated_tags(generated_info.get('tags'), input_tags)
    module_id_value = SkillService._get(req_data, 'moduleId', 'module_id')
    module_id = int(module_id_value) if module_id_value else None
    input_rule_content = SkillService._get(req_data, 'ruleContent', 'rule_content') or SkillService._get(req_data, 'description') or name
    add_info = {
        'project_id': int(project_id),
        'module_id': module_id,
        'name': name,
        'rule_code': SkillService._generate_rule_code(session, project_id, name),
        'rule_content': input_rule_content,
        'applicable_scene': SkillService._get(req_data, 'applicableScene', 'applicable_scene') or generated_info.get('applicable_scene') or generated_info.get('applicableScene'),
        'example': SkillService._get(req_data, 'example') or generated_info.get('example'),
        'priority': priority,
        'tags': input_tags or generated_tags,
        'status': status,
        'owner_id': int(user_id) if user_id else None,
        'created_by': user_id,
        'usage_count': 0,
        'is_delete': 0
    }
    rule_file_info = dict(add_info)
    try:
        rule_file_path, rule_dir = SkillService._create_rule_file(
            session, int(project_id), module_id, rule_file_info
        )
    except Exception as e:
        # Same error contract as update_business_rule: report, do not raise.
        return 0, f'业务规则文件创建失败:{str(e)}'
    add_info['rule_file_path'] = rule_file_path
    obj_id, create_err = SkillDao.create(session, TestBusinessRule, add_info)
    if create_err:
        # Roll back the file written above so no orphan directory is left.
        shutil.rmtree(rule_dir, ignore_errors=True)
        return 0, create_err
    return obj_id, ''
@staticmethod
def update_business_rule(session, req_data):
    """Update a business rule row and regenerate its RULE.md file.

    Args:
        session: DB session passed through to the DAO and file helpers.
        req_data: Request payload; ``ruleId``/``id`` is required, every
            other recognised field is optional and only applied when present.

    Returns:
        ``(obj_id, '')`` on success, ``(0 or dao_id, error_message)`` on
        failure. When no updatable field is supplied the rule id is
        returned unchanged.
    """
    rule_id = SkillService._get(req_data, 'ruleId', 'id')
    if not rule_id:
        return 0, 'ruleId 为必传参数'
    item = SkillDao.get_by_id(session, TestBusinessRule, rule_id)
    if not item:
        return 0, '未查询到对应业务规则'
    update_info = {}
    mapping = [
        ('name', 'name'), ('ruleContent', 'rule_content'), ('rule_content', 'rule_content'),
        ('applicableScene', 'applicable_scene'), ('applicable_scene', 'applicable_scene'),
        ('example', 'example')
    ]
    for req_key, column_key in mapping:
        value = SkillService._get(req_data, req_key)
        if value is not None:
            update_info[column_key] = value
    module_id = SkillService._get(req_data, 'moduleId', 'module_id')
    if module_id is not None:
        # An empty string clears the association.
        update_info['module_id'] = int(module_id) if module_id != '' else None
    owner_id = SkillService._get(req_data, 'ownerId', 'owner_id')
    if owner_id is not None:
        update_info['owner_id'] = int(owner_id) if owner_id != '' else None
    tags = SkillService._get(req_data, 'tags')
    if tags is not None:
        tags, err_msg = SkillService._ensure_list(tags, 'tags')
        if err_msg:
            return 0, err_msg
        update_info['tags'] = tags
    priority = SkillService._get(req_data, 'priority')
    if priority is not None:
        try:
            priority = int(priority)
        except (TypeError, ValueError):
            # Non-numeric input is invalid input, not a server error.
            return 0, 'priority 不合法'
        if priority not in SkillService.VALID_LEVELS:
            return 0, 'priority 不合法'
        update_info['priority'] = priority
    status = SkillService._get(req_data, 'status')
    if status is not None:
        try:
            status = int(status)
        except (TypeError, ValueError):
            return 0, 'status 不合法'
        if status not in SkillService.VALID_STATUS:
            return 0, 'status 不合法'
        update_info['status'] = status
    if not update_info:
        return int(rule_id), ''
    # Snapshot the current path BEFORE the DAO update.
    # NOTE(review): if the DAO refreshes `item` in place, reading the
    # attribute afterwards would yield the new path and the cleanup below
    # would delete the file that was just written.
    old_rule_file_path = item.rule_file_path
    merged_info = item.to_dict()
    merged_info.update(update_info)
    try:
        new_rule_file_path, new_rule_dir = SkillService._create_rule_file(
            session,
            int(merged_info.get('project_id')),
            merged_info.get('module_id'),
            merged_info
        )
    except Exception as e:
        return 0, f'业务规则文件创建失败:{str(e)}'
    update_info['rule_file_path'] = new_rule_file_path
    obj_id, err_msg = SkillDao.update_by_id(session, TestBusinessRule, rule_id, update_info)
    if err_msg:
        # DB update failed: discard the freshly written file, keep the old one.
        if new_rule_dir:
            shutil.rmtree(new_rule_dir, ignore_errors=True)
        return obj_id, err_msg
    if old_rule_file_path != new_rule_file_path:
        SkillService._remove_rule_file_path(old_rule_file_path)
    return obj_id, ''
@staticmethod
def delete_business_rule(session, req_data):
    """Delete a business rule through the DAO and then remove its file directory.

    Args:
        session: DB session passed through to the DAO.
        req_data: Request payload; ``ruleId``/``id`` is required.

    Returns:
        ``(obj_id, '')`` on success, ``(0 or dao_id, error_message)`` on failure.
    """
    rule_id = SkillService._get(req_data, 'ruleId', 'id')
    if not rule_id:
        return 0, 'ruleId 为必传参数'
    item = SkillDao.get_by_id(session, TestBusinessRule, rule_id)
    if not item:
        return 0, '未查询到对应业务规则'
    # Read the path before the delete.
    # NOTE(review): attribute access on `item` after the DAO call may not be
    # safe depending on how the DAO deletes/commits — confirm against SkillDao.
    rule_file_path = item.rule_file_path
    obj_id, err_msg = SkillDao.delete_by_id(session, TestBusinessRule, rule_id)
    if err_msg:
        return obj_id, err_msg
    SkillService._remove_rule_file_path(rule_file_path)
    return obj_id, ''
@staticmethod
def business_rule_detail(session, rule_id):
    """Return ``(rule_dict, '')`` for the given id, or ``({}, message)`` when not found."""
    rule = SkillDao.get_by_id(session, TestBusinessRule, rule_id)
    if rule:
        return rule.to_dict(), ''
    return {}, '未查询到对应业务规则'
@staticmethod
def skill_rule_list(session, req_data):
    """Return all skills and business rules of one project in a single payload.

    Requires ``productId`` and ``projectId``; the project is looked up via
    ``SkillDao.get_project_by_product`` before anything is listed. The
    optional ``status`` is forwarded to both DAO list calls.

    Returns:
        ``(payload, '')`` on success, ``({}, error_message)`` otherwise.
    """
    product_id = SkillService._get(req_data, 'productId', 'product_id')
    project_id = SkillService._get(req_data, 'projectId', 'project_id')
    status = SkillService._get(req_data, 'status')
    if not (product_id and project_id):
        return {}, 'productId、projectId 为必传参数'
    if not SkillDao.get_project_by_product(session, product_id, project_id):
        return {}, '未查询到对应产品下的项目'
    skill_rows = SkillDao.list_skills_by_project(session, project_id, status)
    rule_rows = SkillDao.list_business_rules_by_project(session, project_id, status)
    payload = {
        'productId': int(product_id),
        'projectId': int(project_id),
        'skills': [row.to_dict() for row in skill_rows],
        'rules': [row.to_dict() for row in rule_rows],
        'skillTotal': len(skill_rows),
        'ruleTotal': len(rule_rows)
    }
    return payload, ''
@staticmethod
def business_rule_list(session, req_data):
    """List business rules with optional filters and paging.

    Builds equality filters from ``projectId``, ``moduleId``, ``status``
    and ``priority`` and forwards them with page number, page size, keyword
    and tag to ``SkillDao.list_business_rule``.

    Returns:
        ``{'list': [...], 'total': total}`` with each item as a dict.
    """
    project_id = SkillService._get(req_data, 'projectId', 'project_id')
    module_id = SkillService._get(req_data, 'moduleId', 'module_id')
    status = SkillService._get(req_data, 'status')
    priority = SkillService._get(req_data, 'priority')

    conditions = []
    if project_id:
        conditions.append(TestBusinessRule.project_id == int(project_id))
    # These filters accept 0 as a real value, so only None/'' mean "not set".
    optional_filters = (
        (TestBusinessRule.module_id, module_id),
        (TestBusinessRule.status, status),
        (TestBusinessRule.priority, priority),
    )
    for column, raw_value in optional_filters:
        if raw_value not in (None, ''):
            conditions.append(column == int(raw_value))

    page_no = SkillService._get(req_data, 'pageNo', 'page', default=1)
    page_size = SkillService._get(req_data, 'pageSize', 'size', default=20)
    keyword = SkillService._get(req_data, 'keyword')
    tag = SkillService._get(req_data, 'tag')
    rows, total = SkillDao.list_business_rule(session, conditions, page_no, page_size, keyword, tag)
    return {'list': [row.to_dict() for row in rows], 'total': total}

Binary file not shown.

74
config/ai_config.py Normal file
View File

@@ -0,0 +1,74 @@
# encoding: UTF-8
"""
AI大模型配置文件
支持多种模型提供商OpenAI、自定义API等
"""
import os
from pathlib import Path
from dotenv import load_dotenv
# Load the .env file from the project root (one directory above this file's
# folder) by absolute path, so that a different process working directory
# cannot cause some other configuration to be picked up.
# NOTE(review): override=True presumably lets .env values replace variables
# already set in the process environment — confirm this is intended.
ENV_PATH = Path(__file__).resolve().parents[1] / '.env'
load_dotenv(ENV_PATH, override=True)
class AIConfig:
    """AI model configuration, read once from environment variables at import time.

    ``MODEL_PROVIDER`` selects which group of settings the accessor methods
    return: ``'openai'`` uses the ``OPENAI_*`` values, any other value uses
    the ``CUSTOM_*`` values with the Meteor/Routin/custom key chain.
    """
    # OpenAI settings
    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', '')
    OPENAI_API_BASE = os.getenv('OPENAI_API_BASE', 'https://api.openai.com/v1')
    OPENAI_MODEL = os.getenv('OPENAI_MODEL', 'gpt-4o')
    OPENAI_MAX_TOKENS = int(os.getenv('OPENAI_MAX_TOKENS', '4096'))
    OPENAI_TEMPERATURE = float(os.getenv('OPENAI_TEMPERATURE', '0.7'))
    # Routin/Meteor API settings (an OpenAI-compatible API)
    METEOR_API_KEY = os.getenv('METEOR_API_KEY', '')
    ROUTIN_API_KEY = os.getenv('ROUTIN_API_KEY', '')
    CUSTOM_API_KEY = os.getenv('CUSTOM_API_KEY', '')
    CUSTOM_API_BASE = os.getenv('CUSTOM_API_BASE', 'https://api.routin.ai/v1')
    CUSTOM_MODEL = os.getenv('CUSTOM_MODEL', 'gpt-4o')
    # Model provider: openai, custom
    MODEL_PROVIDER = os.getenv('MODEL_PROVIDER', 'openai')
    # Timeouts (seconds)
    CONNECT_TIMEOUT = int(os.getenv('AI_CONNECT_TIMEOUT', '60'))
    READ_TIMEOUT = int(os.getenv('AI_READ_TIMEOUT', '120'))
    # Retry settings
    MAX_RETRIES = int(os.getenv('AI_MAX_RETRIES', '3'))
    RETRY_DELAY = float(os.getenv('AI_RETRY_DELAY', '2.0'))

    @staticmethod
    def get_api_key():
        """Return the API key for the active provider (may be an empty string)."""
        if AIConfig.MODEL_PROVIDER == 'openai':
            return AIConfig.OPENAI_API_KEY
        # First non-empty key wins: Meteor, then Routin, then custom.
        return AIConfig.METEOR_API_KEY or AIConfig.ROUTIN_API_KEY or AIConfig.CUSTOM_API_KEY

    @staticmethod
    def get_api_key_source():
        """Return the name of the variable the active key comes from.

        Intended for diagnosing configuration-loading problems. Returns
        ``'EMPTY'`` when no key is configured for the active provider, for
        the OpenAI provider as well as for the custom one.
        """
        if AIConfig.MODEL_PROVIDER == 'openai':
            return 'OPENAI_API_KEY' if AIConfig.OPENAI_API_KEY else 'EMPTY'
        if AIConfig.METEOR_API_KEY:
            return 'METEOR_API_KEY'
        if AIConfig.ROUTIN_API_KEY:
            return 'ROUTIN_API_KEY'
        if AIConfig.CUSTOM_API_KEY:
            return 'CUSTOM_API_KEY'
        return 'EMPTY'

    @staticmethod
    def get_api_base():
        """Return the API base URL for the active provider."""
        if AIConfig.MODEL_PROVIDER == 'openai':
            return AIConfig.OPENAI_API_BASE
        return AIConfig.CUSTOM_API_BASE

    @staticmethod
    def get_model():
        """Return the model name for the active provider."""
        if AIConfig.MODEL_PROVIDER == 'openai':
            return AIConfig.OPENAI_MODEL
        return AIConfig.CUSTOM_MODEL

View File

@@ -0,0 +1,31 @@
---
name: generated-rule
description: 1.需要进行倒序排序
2.需要在卡片上展示状态
---
# 待办规则
## Rule
1.需要进行倒序排序
2.需要在卡片上展示状态
## Applicable scene
待办列表、任务卡片、工单列表等需要按倒序展示并在卡片上呈现状态的场景;适用于 AI 根据 PRD/需求生成测试用例时识别排序规则与卡片展示约束。
## Example
输入场景:待办列表中存在多条记录,按创建时间/时间序列需要倒序展示,且每条卡片需要显示当前状态。
预期:列表默认按倒序排列;每张卡片可见状态信息,状态展示与数据源一致,刷新后仍保持正确展示。
## Test design constraints
- Generate cases that verify this rule is satisfied in normal flows.
- Generate negative and boundary cases when the rule describes validation, limits, state changes, permissions, or data constraints.
- Mark missing prerequisites as “待确认” instead of inventing behavior.
## Metadata
- Code: RULE_20260515175047622133
- Product: 智慧运营
- Project: 智慧运营V2.0
- Module: 采购工作台
- Priority: 2
- Tags: 排序, 状态展示, 待办, 列表, 卡片

View File

@@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2026 Anthropic, PBC.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@@ -0,0 +1,485 @@
---
name: skill-creator
description: Create new skills, modify and improve existing skills, and measure skill performance. Use when users want to create a skill from scratch, edit, or optimize an existing skill, run evals to test a skill, benchmark skill performance with variance analysis, or optimize a skill's description for better triggering accuracy.
---
# Skill Creator
A skill for creating new skills and iteratively improving them.
At a high level, the process of creating a skill goes like this:
- Decide what you want the skill to do and roughly how it should do it
- Write a draft of the skill
- Create a few test prompts and run claude-with-access-to-the-skill on them
- Help the user evaluate the results both qualitatively and quantitatively
- While the runs happen in the background, draft some quantitative evals if there aren't any (if there are some, you can either use them as is or modify them if you feel something needs to change). Then explain them to the user (or if they already existed, explain the ones that already exist)
- Use the `eval-viewer/generate_review.py` script to show the user the results for them to look at, and also let them look at the quantitative metrics
- Rewrite the skill based on feedback from the user's evaluation of the results (and also if there are any glaring flaws that become apparent from the quantitative benchmarks)
- Repeat until you're satisfied
- Expand the test set and try again at larger scale
Your job when using this skill is to figure out where the user is in this process and then jump in and help them progress through these stages. So for instance, maybe they're like "I want to make a skill for X". You can help narrow down what they mean, write a draft, write the test cases, figure out how they want to evaluate, run all the prompts, and repeat.
On the other hand, maybe they already have a draft of the skill. In this case you can go straight to the eval/iterate part of the loop.
Of course, you should always be flexible and if the user is like "I don't need to run a bunch of evaluations, just vibe with me", you can do that instead.
Then after the skill is done (but again, the order is flexible), you can also run the skill description improver, which we have a whole separate script for, to optimize the triggering of the skill.
Cool? Cool.
## Communicating with the user
The skill creator is liable to be used by people across a wide range of familiarity with coding jargon. If you haven't heard (and how could you, it's only very recently that it started), there's a trend now where the power of Claude is inspiring plumbers to open up their terminals, parents and grandparents to google "how to install npm". On the other hand, the bulk of users are probably fairly computer-literate.
So please pay attention to context cues to understand how to phrase your communication! In the default case, just to give you some idea:
- "evaluation" and "benchmark" are borderline, but OK
- for "JSON" and "assertion" you want to see serious cues from the user that they know what those things are before using them without explaining them
It's OK to briefly explain terms if you're in doubt, and feel free to clarify terms with a short definition if you're unsure if the user will get it.
---
## Creating a skill
### Capture Intent
Start by understanding the user's intent. The current conversation might already contain a workflow the user wants to capture (e.g., they say "turn this into a skill"). If so, extract answers from the conversation history first — the tools used, the sequence of steps, corrections the user made, input/output formats observed. The user may need to fill the gaps, and should confirm before proceeding to the next step.
1. What should this skill enable Claude to do?
2. When should this skill trigger? (what user phrases/contexts)
3. What's the expected output format?
4. Should we set up test cases to verify the skill works? Skills with objectively verifiable outputs (file transforms, data extraction, code generation, fixed workflow steps) benefit from test cases. Skills with subjective outputs (writing style, art) often don't need them. Suggest the appropriate default based on the skill type, but let the user decide.
### Interview and Research
Proactively ask questions about edge cases, input/output formats, example files, success criteria, and dependencies. Wait to write test prompts until you've got this part ironed out.
Check available MCPs - if useful for research (searching docs, finding similar skills, looking up best practices), research in parallel via subagents if available, otherwise inline. Come prepared with context to reduce burden on the user.
### Write the SKILL.md
Based on the user interview, fill in these components:
- **name**: Skill identifier
- **description**: When to trigger, what it does. This is the primary triggering mechanism - include both what the skill does AND specific contexts for when to use it. All "when to use" info goes here, not in the body. Note: currently Claude has a tendency to "undertrigger" skills -- to not use them when they'd be useful. To combat this, please make the skill descriptions a little bit "pushy". So for instance, instead of "How to build a simple fast dashboard to display internal Anthropic data.", you might write "How to build a simple fast dashboard to display internal Anthropic data. Make sure to use this skill whenever the user mentions dashboards, data visualization, internal metrics, or wants to display any kind of company data, even if they don't explicitly ask for a 'dashboard.'"
- **compatibility**: Required tools, dependencies (optional, rarely needed)
- **the rest of the skill :)**
### Skill Writing Guide
#### Anatomy of a Skill
```
skill-name/
├── SKILL.md (required)
│ ├── YAML frontmatter (name, description required)
│ └── Markdown instructions
└── Bundled Resources (optional)
├── scripts/ - Executable code for deterministic/repetitive tasks
├── references/ - Docs loaded into context as needed
└── assets/ - Files used in output (templates, icons, fonts)
```
#### Progressive Disclosure
Skills use a three-level loading system:
1. **Metadata** (name + description) - Always in context (~100 words)
2. **SKILL.md body** - In context whenever skill triggers (<500 lines ideal)
3. **Bundled resources** - As needed (unlimited, scripts can execute without loading)
These word counts are approximate and you can feel free to go longer if needed.
**Key patterns:**
- Keep SKILL.md under 500 lines; if you're approaching this limit, add an additional layer of hierarchy along with clear pointers about where the model using the skill should go next to follow up.
- Reference files clearly from SKILL.md with guidance on when to read them
- For large reference files (>300 lines), include a table of contents
**Domain organization**: When a skill supports multiple domains/frameworks, organize by variant:
```
cloud-deploy/
├── SKILL.md (workflow + selection)
└── references/
├── aws.md
├── gcp.md
└── azure.md
```
Claude reads only the relevant reference file.
#### Principle of Lack of Surprise
This goes without saying, but skills must not contain malware, exploit code, or any content that could compromise system security. A skill's contents should not surprise the user in their intent if described. Don't go along with requests to create misleading skills or skills designed to facilitate unauthorized access, data exfiltration, or other malicious activities. Things like a "roleplay as an XYZ" are OK though.
#### Writing Patterns
Prefer using the imperative form in instructions.
**Defining output formats** - You can do it like this:
```markdown
## Report structure
ALWAYS use this exact template:
# [Title]
## Executive summary
## Key findings
## Recommendations
```
**Examples pattern** - It's useful to include examples. You can format them like this (but if "Input" and "Output" are in the examples you might want to deviate a little):
```markdown
## Commit message format
**Example 1:**
Input: Added user authentication with JWT tokens
Output: feat(auth): implement JWT-based authentication
```
### Writing Style
Try to explain to the model why things are important in lieu of heavy-handed musty MUSTs. Use theory of mind and try to make the skill general and not super-narrow to specific examples. Start by writing a draft and then look at it with fresh eyes and improve it.
### Test Cases
After writing the skill draft, come up with 2-3 realistic test prompts — the kind of thing a real user would actually say. Share them with the user: [you don't have to use this exact language] "Here are a few test cases I'd like to try. Do these look right, or do you want to add more?" Then run them.
Save test cases to `evals/evals.json`. Don't write assertions yet — just the prompts. You'll draft assertions in the next step while the runs are in progress.
```json
{
"skill_name": "example-skill",
"evals": [
{
"id": 1,
"prompt": "User's task prompt",
"expected_output": "Description of expected result",
"files": []
}
]
}
```
See `references/schemas.md` for the full schema (including the `assertions` field, which you'll add later).
## Running and evaluating test cases
This section is one continuous sequence — don't stop partway through. Do NOT use `/skill-test` or any other testing skill.
Put results in `<skill-name>-workspace/` as a sibling to the skill directory. Within the workspace, organize results by iteration (`iteration-1/`, `iteration-2/`, etc.) and within that, each test case gets a directory (`eval-0/`, `eval-1/`, etc.). Don't create all of this upfront — just create directories as you go.
### Step 1: Spawn all runs (with-skill AND baseline) in the same turn
For each test case, spawn two subagents in the same turn — one with the skill, one without. This is important: don't spawn the with-skill runs first and then come back for baselines later. Launch everything at once so it all finishes around the same time.
**With-skill run:**
```
Execute this task:
- Skill path: <path-to-skill>
- Task: <eval prompt>
- Input files: <eval files if any, or "none">
- Save outputs to: <workspace>/iteration-<N>/eval-<ID>/with_skill/outputs/
- Outputs to save: <what the user cares about — e.g., "the .docx file", "the final CSV">
```
**Baseline run** (same prompt, but the baseline depends on context):
- **Creating a new skill**: no skill at all. Same prompt, no skill path, save to `without_skill/outputs/`.
- **Improving an existing skill**: the old version. Before editing, snapshot the skill (`cp -r <skill-path> <workspace>/skill-snapshot/`), then point the baseline subagent at the snapshot. Save to `old_skill/outputs/`.
Write an `eval_metadata.json` for each test case (assertions can be empty for now). Give each eval a descriptive name based on what it's testing — not just "eval-0". Use this name for the directory too. If this iteration uses new or modified eval prompts, create these files for each new eval directory — don't assume they carry over from previous iterations.
```json
{
"eval_id": 0,
"eval_name": "descriptive-name-here",
"prompt": "The user's task prompt",
"assertions": []
}
```
### Step 2: While runs are in progress, draft assertions
Don't just wait for the runs to finish — you can use this time productively. Draft quantitative assertions for each test case and explain them to the user. If assertions already exist in `evals/evals.json`, review them and explain what they check.
Good assertions are objectively verifiable and have descriptive names — they should read clearly in the benchmark viewer so someone glancing at the results immediately understands what each one checks. Subjective skills (writing style, design quality) are better evaluated qualitatively — don't force assertions onto things that need human judgment.
Update the `eval_metadata.json` files and `evals/evals.json` with the assertions once drafted. Also explain to the user what they'll see in the viewer — both the qualitative outputs and the quantitative benchmark.
### Step 3: As runs complete, capture timing data
When each subagent task completes, you receive a notification containing `total_tokens` and `duration_ms`. Save this data immediately to `timing.json` in the run directory:
```json
{
"total_tokens": 84852,
"duration_ms": 23332,
"total_duration_seconds": 23.3
}
```
This is the only opportunity to capture this data — it comes through the task notification and isn't persisted elsewhere. Process each notification as it arrives rather than trying to batch them.
### Step 4: Grade, aggregate, and launch the viewer
Once all runs are done:
1. **Grade each run** — spawn a grader subagent (or grade inline) that reads `agents/grader.md` and evaluates each assertion against the outputs. Save results to `grading.json` in each run directory. The grading.json expectations array must use the fields `text`, `passed`, and `evidence` (not `name`/`met`/`details` or other variants) — the viewer depends on these exact field names. For assertions that can be checked programmatically, write and run a script rather than eyeballing it — scripts are faster, more reliable, and can be reused across iterations.
2. **Aggregate into benchmark** — run the aggregation script from the skill-creator directory:
```bash
python -m scripts.aggregate_benchmark <workspace>/iteration-N --skill-name <name>
```
This produces `benchmark.json` and `benchmark.md` with pass_rate, time, and tokens for each configuration, with mean ± stddev and the delta. If generating benchmark.json manually, see `references/schemas.md` for the exact schema the viewer expects.
Put each with_skill version before its baseline counterpart.
3. **Do an analyst pass** — read the benchmark data and surface patterns the aggregate stats might hide. See `agents/analyzer.md` (the "Analyzing Benchmark Results" section) for what to look for — things like assertions that always pass regardless of skill (non-discriminating), high-variance evals (possibly flaky), and time/token tradeoffs.
4. **Launch the viewer** with both qualitative outputs and quantitative data:
```bash
nohup python <skill-creator-path>/eval-viewer/generate_review.py \
<workspace>/iteration-N \
--skill-name "my-skill" \
--benchmark <workspace>/iteration-N/benchmark.json \
> /dev/null 2>&1 &
VIEWER_PID=$!
```
For iteration 2+, also pass `--previous-workspace <workspace>/iteration-<N-1>`.
**Cowork / headless environments:** If `webbrowser.open()` is not available or the environment has no display, use `--static <output_path>` to write a standalone HTML file instead of starting a server. Feedback will be downloaded as a `feedback.json` file when the user clicks "Submit All Reviews". After download, copy `feedback.json` into the workspace directory for the next iteration to pick up.
Note: please use generate_review.py to create the viewer; there's no need to write custom HTML.
5. **Tell the user** something like: "I've opened the results in your browser. There are two tabs — 'Outputs' lets you click through each test case and leave feedback, 'Benchmark' shows the quantitative comparison. When you're done, come back here and let me know."
### What the user sees in the viewer
The "Outputs" tab shows one test case at a time:
- **Prompt**: the task that was given
- **Output**: the files the skill produced, rendered inline where possible
- **Previous Output** (iteration 2+): collapsed section showing last iteration's output
- **Formal Grades** (if grading was run): collapsed section showing assertion pass/fail
- **Feedback**: a textbox that auto-saves as they type
- **Previous Feedback** (iteration 2+): their comments from last time, shown below the textbox
The "Benchmark" tab shows the stats summary: pass rates, timing, and token usage for each configuration, with per-eval breakdowns and analyst observations.
Navigation is via prev/next buttons or arrow keys. When done, they click "Submit All Reviews" which saves all feedback to `feedback.json`.
### Step 5: Read the feedback
When the user tells you they're done, read `feedback.json`:
```json
{
"reviews": [
{"run_id": "eval-0-with_skill", "feedback": "the chart is missing axis labels", "timestamp": "..."},
{"run_id": "eval-1-with_skill", "feedback": "", "timestamp": "..."},
{"run_id": "eval-2-with_skill", "feedback": "perfect, love this", "timestamp": "..."}
],
"status": "complete"
}
```
Empty feedback means the user thought it was fine. Focus your improvements on the test cases where the user had specific complaints.
Kill the viewer server when you're done with it:
```bash
kill $VIEWER_PID 2>/dev/null
```
---
## Improving the skill
This is the heart of the loop. You've run the test cases, the user has reviewed the results, and now you need to make the skill better based on their feedback.
### How to think about improvements
1. **Generalize from the feedback.** The big picture thing that's happening here is that we're trying to create skills that can be used a million times (maybe literally, maybe even more who knows) across many different prompts. Here you and the user are iterating on only a few examples over and over again because it helps move faster. The user knows these examples in and out and it's quick for them to assess new outputs. But if the skill you and the user are codeveloping works only for those examples, it's useless. Rather than put in fiddly overfitty changes, or oppressively constrictive MUSTs, if there's some stubborn issue, you might try branching out and using different metaphors, or recommending different patterns of working. It's relatively cheap to try and maybe you'll land on something great.
2. **Keep the prompt lean.** Remove things that aren't pulling their weight. Make sure to read the transcripts, not just the final outputs — if it looks like the skill is making the model waste a bunch of time doing things that are unproductive, you can try getting rid of the parts of the skill that are making it do that and seeing what happens.
3. **Explain the why.** Try hard to explain the **why** behind everything you're asking the model to do. Today's LLMs are *smart*. They have good theory of mind and when given a good harness can go beyond rote instructions and really make things happen. Even if the feedback from the user is terse or frustrated, try to actually understand the task, what the user actually wrote, and why they wrote it, and then transmit this understanding into the instructions. If you find yourself writing ALWAYS or NEVER in all caps, or using super rigid structures, that's a yellow flag — if possible, reframe and explain the reasoning so that the model understands why the thing you're asking for is important. That's a more humane, powerful, and effective approach.
4. **Look for repeated work across test cases.** Read the transcripts from the test runs and notice if the subagents all independently wrote similar helper scripts or took the same multi-step approach to something. If all 3 test cases resulted in the subagent writing a `create_docx.py` or a `build_chart.py`, that's a strong signal the skill should bundle that script. Write it once, put it in `scripts/`, and tell the skill to use it. This saves every future invocation from reinventing the wheel.
This task is pretty important (we are trying to create billions a year in economic value here!) and your thinking time is not the blocker; take your time and really mull things over. I'd suggest writing a draft revision and then looking at it anew and making improvements. Really do your best to get into the head of the user and understand what they want and need.
### The iteration loop
After improving the skill:
1. Apply your improvements to the skill
2. Rerun all test cases into a new `iteration-<N+1>/` directory, including baseline runs. If you're creating a new skill, the baseline is always `without_skill` (no skill) — that stays the same across iterations. If you're improving an existing skill, use your judgment on what makes sense as the baseline: the original version the user came in with, or the previous iteration.
3. Launch the reviewer with `--previous-workspace` pointing at the previous iteration
4. Wait for the user to review and tell you they're done
5. Read the new feedback, improve again, repeat
Keep going until:
- The user says they're happy
- The feedback is all empty (everything looks good)
- You're not making meaningful progress
---
## Advanced: Blind comparison
For situations where you want a more rigorous comparison between two versions of a skill (e.g., the user asks "is the new version actually better?"), there's a blind comparison system. Read `agents/comparator.md` and `agents/analyzer.md` for the details. The basic idea is: give two outputs to an independent agent without telling it which is which, and let it judge quality. Then analyze why the winner won.
This is optional, requires subagents, and most users won't need it. The human review loop is usually sufficient.
---
## Description Optimization
The description field in SKILL.md frontmatter is the primary mechanism that determines whether Claude invokes a skill. After creating or improving a skill, offer to optimize the description for better triggering accuracy.
### Step 1: Generate trigger eval queries
Create 20 eval queries — a mix of should-trigger and should-not-trigger. Save as JSON:
```json
[
{"query": "the user prompt", "should_trigger": true},
{"query": "another prompt", "should_trigger": false}
]
```
The queries must be realistic and something a Claude Code or Claude.ai user would actually type. Not abstract requests, but requests that are concrete and specific and have a good amount of detail. For instance, file paths, personal context about the user's job or situation, column names and values, company names, URLs. A little bit of backstory. Some might be in lowercase or contain abbreviations or typos or casual speech. Use a mix of different lengths, and focus on edge cases rather than making them clear-cut (the user will get a chance to sign off on them).
Bad: `"Format this data"`, `"Extract text from PDF"`, `"Create a chart"`
Good: `"ok so my boss just sent me this xlsx file (its in my downloads, called something like 'Q4 sales final FINAL v2.xlsx') and she wants me to add a column that shows the profit margin as a percentage. The revenue is in column C and costs are in column D i think"`
For the **should-trigger** queries (8-10), think about coverage. You want different phrasings of the same intent — some formal, some casual. Include cases where the user doesn't explicitly name the skill or file type but clearly needs it. Throw in some uncommon use cases and cases where this skill competes with another but should win.
For the **should-not-trigger** queries (8-10), the most valuable ones are the near-misses — queries that share keywords or concepts with the skill but actually need something different. Think adjacent domains, ambiguous phrasing where a naive keyword match would trigger but shouldn't, and cases where the query touches on something the skill does but in a context where another tool is more appropriate.
The key thing to avoid: don't make should-not-trigger queries obviously irrelevant. "Write a fibonacci function" as a negative test for a PDF skill is too easy — it doesn't test anything. The negative cases should be genuinely tricky.
### Step 2: Review with user
Present the eval set to the user for review using the HTML template:
1. Read the template from `assets/eval_review.html`
2. Replace the placeholders:
- `__EVAL_DATA_PLACEHOLDER__` → the JSON array of eval items (no quotes around it — it's a JS variable assignment)
- `__SKILL_NAME_PLACEHOLDER__` → the skill's name
- `__SKILL_DESCRIPTION_PLACEHOLDER__` → the skill's current description
3. Write to a temp file (e.g., `/tmp/eval_review_<skill-name>.html`) and open it: `open /tmp/eval_review_<skill-name>.html`
4. The user can edit queries, toggle should-trigger, add/remove entries, then click "Export Eval Set"
5. The file downloads to `~/Downloads/eval_set.json` — check the Downloads folder for the most recent version in case there are multiple (e.g., `eval_set (1).json`)
This step matters — bad eval queries lead to bad descriptions.
### Step 3: Run the optimization loop
Tell the user: "This will take some time — I'll run the optimization loop in the background and check on it periodically."
Save the eval set to the workspace, then run in the background:
```bash
python -m scripts.run_loop \
--eval-set <path-to-trigger-eval.json> \
--skill-path <path-to-skill> \
--model <model-id-powering-this-session> \
--max-iterations 5 \
--verbose
```
Use the model ID from your system prompt (the one powering the current session) so the triggering test matches what the user actually experiences.
While it runs, periodically tail the output to give the user updates on which iteration it's on and what the scores look like.
This handles the full optimization loop automatically. It splits the eval set into 60% train and 40% held-out test, evaluates the current description (running each query 3 times to get a reliable trigger rate), then calls Claude to propose improvements based on what failed. It re-evaluates each new description on both train and test, iterating up to 5 times. When it's done, it opens an HTML report in the browser showing the results per iteration and returns JSON with `best_description` — selected by test score rather than train score to avoid overfitting.
### How skill triggering works
Understanding the triggering mechanism helps design better eval queries. Skills appear in Claude's `available_skills` list with their name + description, and Claude decides whether to consult a skill based on that description. The important thing to know is that Claude only consults skills for tasks it can't easily handle on its own — simple, one-step queries like "read this PDF" may not trigger a skill even if the description matches perfectly, because Claude can handle them directly with basic tools. Complex, multi-step, or specialized queries reliably trigger skills when the description matches.
This means your eval queries should be substantive enough that Claude would actually benefit from consulting a skill. Simple queries like "read file X" are poor test cases — they won't trigger skills regardless of description quality.
### Step 4: Apply the result
Take `best_description` from the JSON output and update the skill's SKILL.md frontmatter. Show the user before/after and report the scores.
---
### Package and Present (only if `present_files` tool is available)
Check whether you have access to the `present_files` tool. If you don't, skip this step. If you do, package the skill and present the .skill file to the user:
```bash
python -m scripts.package_skill <path/to/skill-folder>
```
After packaging, direct the user to the resulting `.skill` file path so they can install it.
---
## Claude.ai-specific instructions
In Claude.ai, the core workflow is the same (draft → test → review → improve → repeat), but because Claude.ai doesn't have subagents, some mechanics change. Here's what to adapt:
**Running test cases**: No subagents means no parallel execution. For each test case, read the skill's SKILL.md, then follow its instructions to accomplish the test prompt yourself. Do them one at a time. This is less rigorous than independent subagents (you wrote the skill and you're also running it, so you have full context), but it's a useful sanity check — and the human review step compensates. Skip the baseline runs — just use the skill to complete the task as requested.
**Reviewing results**: If you can't open a browser (e.g., Claude.ai's VM has no display, or you're on a remote server), skip the browser reviewer entirely. Instead, present results directly in the conversation. For each test case, show the prompt and the output. If the output is a file the user needs to see (like a .docx or .xlsx), save it to the filesystem and tell them where it is so they can download and inspect it. Ask for feedback inline: "How does this look? Anything you'd change?"
**Benchmarking**: Skip the quantitative benchmarking — it relies on baseline comparisons which aren't meaningful without subagents. Focus on qualitative feedback from the user.
**The iteration loop**: Same as before — improve the skill, rerun the test cases, ask for feedback — just without the browser reviewer in the middle. You can still organize results into iteration directories on the filesystem if you have one.
**Description optimization**: This section requires the `claude` CLI tool (specifically `claude -p`) which is only available in Claude Code. Skip it if you're on Claude.ai.
**Blind comparison**: Requires subagents. Skip it.
**Packaging**: The `package_skill.py` script works anywhere with Python and a filesystem. On Claude.ai, you can run it and the user can download the resulting `.skill` file.
**Updating an existing skill**: The user might be asking you to update an existing skill, not create a new one. In this case:
- **Preserve the original name.** Note the skill's directory name and `name` frontmatter field -- use them unchanged. E.g., if the installed skill is `research-helper`, output `research-helper.skill` (not `research-helper-v2`).
- **Copy to a writeable location before editing.** The installed skill path may be read-only. Copy to `/tmp/skill-name/`, edit there, and package from the copy.
- **If packaging manually, stage in `/tmp/` first**, then copy to the output directory -- direct writes may fail due to permissions.
---
## Cowork-Specific Instructions
If you're in Cowork, the main things to know are:
- You have subagents, so the main workflow (spawn test cases in parallel, run baselines, grade, etc.) all works. (However, if you run into severe problems with timeouts, it's OK to run the test prompts in series rather than parallel.)
- You don't have a browser or display, so when generating the eval viewer, use `--static <output_path>` to write a standalone HTML file instead of starting a server. Then proffer a link that the user can click to open the HTML in their browser.
- For whatever reason, the Cowork setup seems to disincline Claude from generating the eval viewer after running the tests, so just to reiterate: whether you're in Cowork or in Claude Code, after running tests, you should always generate the eval viewer for the human to look at examples before revising the skill yourself and trying to make corrections, using `generate_review.py` (not writing your own boutique html code). Sorry in advance but I'm gonna go all caps here: GENERATE THE EVAL VIEWER *BEFORE* evaluating outputs yourself. You want to get them in front of the human ASAP!
- Feedback works differently: since there's no running server, the viewer's "Submit All Reviews" button will download `feedback.json` as a file. You can then read it from there (you may have to request access first).
- Packaging works — `package_skill.py` just needs Python and a filesystem.
- Description optimization (`run_loop.py` / `run_eval.py`) should work in Cowork just fine since it uses `claude -p` via subprocess, not a browser, but please save it until you've fully finished making the skill and the user agrees it's in good shape.
- **Updating an existing skill**: The user might be asking you to update an existing skill, not create a new one. Follow the update guidance in the "Claude.ai-specific instructions" section above.
---
## Reference files
The agents/ directory contains instructions for specialized subagents. Read them when you need to spawn the relevant subagent.
- `agents/grader.md` — How to evaluate assertions against outputs
- `agents/comparator.md` — How to do blind A/B comparison between two outputs
- `agents/analyzer.md` — How to analyze why one version beat another
The references/ directory has additional documentation:
- `references/schemas.md` — JSON structures for evals.json, grading.json, etc.
---
Repeating one more time the core loop here for emphasis:
- Figure out what the skill is about
- Draft or edit the skill
- Run claude-with-access-to-the-skill on test prompts
- With the user, evaluate the outputs:
- Create benchmark.json and run `eval-viewer/generate_review.py` to help the user review them
- Run quantitative evals
- Repeat until you and the user are satisfied
- Package the final skill and return it to the user.
Please add steps to your TodoList, if you have such a thing, to make sure you don't forget. If you're in Cowork, please specifically put "Create evals JSON and run `eval-viewer/generate_review.py` so human can review test cases" in your TodoList to make sure it happens.
Good luck!

View File

@@ -0,0 +1,274 @@
# Post-hoc Analyzer Agent
Analyze blind comparison results to understand WHY the winner won and generate improvement suggestions.
## Role
After the blind comparator determines a winner, the Post-hoc Analyzer "unblinds" the results by examining the skills and transcripts. The goal is to extract actionable insights: what made the winner better, and how can the loser be improved?
## Inputs
You receive these parameters in your prompt:
- **winner**: "A" or "B" (from blind comparison)
- **winner_skill_path**: Path to the skill that produced the winning output
- **winner_transcript_path**: Path to the execution transcript for the winner
- **loser_skill_path**: Path to the skill that produced the losing output
- **loser_transcript_path**: Path to the execution transcript for the loser
- **comparison_result_path**: Path to the blind comparator's output JSON
- **output_path**: Where to save the analysis results
## Process
### Step 1: Read Comparison Result
1. Read the blind comparator's output at comparison_result_path
2. Note the winning side (A or B), the reasoning, and any scores
3. Understand what the comparator valued in the winning output
### Step 2: Read Both Skills
1. Read the winner skill's SKILL.md and key referenced files
2. Read the loser skill's SKILL.md and key referenced files
3. Identify structural differences:
- Instructions clarity and specificity
- Script/tool usage patterns
- Example coverage
- Edge case handling
### Step 3: Read Both Transcripts
1. Read the winner's transcript
2. Read the loser's transcript
3. Compare execution patterns:
- How closely did each follow their skill's instructions?
- What tools were used differently?
- Where did the loser diverge from optimal behavior?
- Did either encounter errors or make recovery attempts?
### Step 4: Analyze Instruction Following
For each transcript, evaluate:
- Did the agent follow the skill's explicit instructions?
- Did the agent use the skill's provided tools/scripts?
- Were there missed opportunities to leverage skill content?
- Did the agent add unnecessary steps not in the skill?
Score instruction following 1-10 and note specific issues.
### Step 5: Identify Winner Strengths
Determine what made the winner better:
- Clearer instructions that led to better behavior?
- Better scripts/tools that produced better output?
- More comprehensive examples that guided edge cases?
- Better error handling guidance?
Be specific. Quote from skills/transcripts where relevant.
### Step 6: Identify Loser Weaknesses
Determine what held the loser back:
- Ambiguous instructions that led to suboptimal choices?
- Missing tools/scripts that forced workarounds?
- Gaps in edge case coverage?
- Poor error handling that caused failures?
### Step 7: Generate Improvement Suggestions
Based on the analysis, produce actionable suggestions for improving the loser skill:
- Specific instruction changes to make
- Tools/scripts to add or modify
- Examples to include
- Edge cases to address
Prioritize by impact. Focus on changes that would have changed the outcome.
### Step 8: Write Analysis Results
Save structured analysis to `{output_path}`.
## Output Format
Write a JSON file with this structure:
```json
{
"comparison_summary": {
"winner": "A",
"winner_skill": "path/to/winner/skill",
"loser_skill": "path/to/loser/skill",
"comparator_reasoning": "Brief summary of why comparator chose winner"
},
"winner_strengths": [
"Clear step-by-step instructions for handling multi-page documents",
"Included validation script that caught formatting errors",
"Explicit guidance on fallback behavior when OCR fails"
],
"loser_weaknesses": [
"Vague instruction 'process the document appropriately' led to inconsistent behavior",
"No script for validation, agent had to improvise and made errors",
"No guidance on OCR failure, agent gave up instead of trying alternatives"
],
"instruction_following": {
"winner": {
"score": 9,
"issues": [
"Minor: skipped optional logging step"
]
},
"loser": {
"score": 6,
"issues": [
"Did not use the skill's formatting template",
"Invented own approach instead of following step 3",
"Missed the 'always validate output' instruction"
]
}
},
"improvement_suggestions": [
{
"priority": "high",
"category": "instructions",
"suggestion": "Replace 'process the document appropriately' with explicit steps: 1) Extract text, 2) Identify sections, 3) Format per template",
"expected_impact": "Would eliminate ambiguity that caused inconsistent behavior"
},
{
"priority": "high",
"category": "tools",
"suggestion": "Add validate_output.py script similar to winner skill's validation approach",
"expected_impact": "Would catch formatting errors before final output"
},
{
"priority": "medium",
"category": "error_handling",
"suggestion": "Add fallback instructions: 'If OCR fails, try: 1) different resolution, 2) image preprocessing, 3) manual extraction'",
"expected_impact": "Would prevent early failure on difficult documents"
}
],
"transcript_insights": {
"winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script -> Fixed 2 issues -> Produced output",
"loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods -> No validation -> Output had errors"
}
}
```
## Guidelines
- **Be specific**: Quote from skills and transcripts, don't just say "instructions were unclear"
- **Be actionable**: Suggestions should be concrete changes, not vague advice
- **Focus on skill improvements**: The goal is to improve the losing skill, not critique the agent
- **Prioritize by impact**: Which changes would most likely have changed the outcome?
- **Consider causation**: Did the skill weakness actually cause the worse output, or is it incidental?
- **Stay objective**: Analyze what happened, don't editorialize
- **Think about generalization**: Would this improvement help on other evals too?
## Categories for Suggestions
Use these categories to organize improvement suggestions:
| Category | Description |
|----------|-------------|
| `instructions` | Changes to the skill's prose instructions |
| `tools` | Scripts, templates, or utilities to add/modify |
| `examples` | Example inputs/outputs to include |
| `error_handling` | Guidance for handling failures |
| `structure` | Reorganization of skill content |
| `references` | External docs or resources to add |
## Priority Levels
- **high**: Would likely change the outcome of this comparison
- **medium**: Would improve quality but may not change win/loss
- **low**: Nice to have, marginal improvement
---
# Analyzing Benchmark Results
When analyzing benchmark results, the analyzer's purpose is to **surface patterns and anomalies** across multiple runs, not suggest skill improvements.
## Role
Review all benchmark run results and generate freeform notes that help the user understand skill performance. Focus on patterns that wouldn't be visible from aggregate metrics alone.
## Inputs
You receive these parameters in your prompt:
- **benchmark_data_path**: Path to the in-progress benchmark.json with all run results
- **skill_path**: Path to the skill being benchmarked
- **output_path**: Where to save the notes (as JSON array of strings)
## Process
### Step 1: Read Benchmark Data
1. Read the benchmark.json containing all run results
2. Note the configurations tested (with_skill, without_skill)
3. Understand the run_summary aggregates already calculated
### Step 2: Analyze Per-Assertion Patterns
For each expectation across all runs:
- Does it **always pass** in both configurations? (may not differentiate skill value)
- Does it **always fail** in both configurations? (may be broken or beyond capability)
- Does it **always pass with skill but fail without**? (skill clearly adds value here)
- Does it **always fail with skill but pass without**? (skill may be hurting)
- Is it **highly variable**? (flaky expectation or non-deterministic behavior)
### Step 3: Analyze Cross-Eval Patterns
Look for patterns across evals:
- Are certain eval types consistently harder/easier?
- Do some evals show high variance while others are stable?
- Are there surprising results that contradict expectations?
### Step 4: Analyze Metrics Patterns
Look at time_seconds, tokens, tool_calls:
- Does the skill significantly increase execution time?
- Is there high variance in resource usage?
- Are there outlier runs that skew the aggregates?
### Step 5: Generate Notes
Write freeform observations as a list of strings. Each note should:
- State a specific observation
- Be grounded in the data (not speculation)
- Help the user understand something the aggregate metrics don't show
Examples:
- "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value"
- "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure that may be flaky"
- "Without-skill runs consistently fail on table extraction expectations (0% pass rate)"
- "Skill adds 13s average execution time but improves pass rate by 50%"
- "Token usage is 80% higher with skill, primarily due to script output parsing"
- "All 3 without-skill runs for eval 1 produced empty output"
### Step 6: Write Notes
Save notes to `{output_path}` as a JSON array of strings:
```json
[
"Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value",
"Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure",
"Without-skill runs consistently fail on table extraction expectations",
"Skill adds 13s average execution time but improves pass rate by 50%"
]
```
## Guidelines
**DO:**
- Report what you observe in the data
- Be specific about which evals, expectations, or runs you're referring to
- Note patterns that aggregate metrics would hide
- Provide context that helps interpret the numbers
**DO NOT:**
- Suggest improvements to the skill (that's for the improvement step, not benchmarking)
- Make subjective quality judgments ("the output was good/bad")
- Speculate about causes without evidence
- Repeat information already in the run_summary aggregates

View File

@@ -0,0 +1,202 @@
# Blind Comparator Agent
Compare two outputs WITHOUT knowing which skill produced them.
## Role
The Blind Comparator judges which output better accomplishes the eval task. You receive two outputs labeled A and B, but you do NOT know which skill produced which. This prevents bias toward a particular skill or approach.
Your judgment is based purely on output quality and task completion.
## Inputs
You receive these parameters in your prompt:
- **output_a_path**: Path to the first output file or directory
- **output_b_path**: Path to the second output file or directory
- **eval_prompt**: The original task/prompt that was executed
- **expectations**: List of expectations to check (optional - may be empty)
## Process
### Step 1: Read Both Outputs
1. Examine output A (file or directory)
2. Examine output B (file or directory)
3. Note the type, structure, and content of each
4. If outputs are directories, examine all relevant files inside
### Step 2: Understand the Task
1. Read the eval_prompt carefully
2. Identify what the task requires:
- What should be produced?
- What qualities matter (accuracy, completeness, format)?
- What would distinguish a good output from a poor one?
### Step 3: Generate Evaluation Rubric
Based on the task, generate a rubric with two dimensions:
**Content Rubric** (what the output contains):
| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) |
|-----------|----------|----------------|---------------|
| Correctness | Major errors | Minor errors | Fully correct |
| Completeness | Missing key elements | Mostly complete | All elements present |
| Accuracy | Significant inaccuracies | Minor inaccuracies | Accurate throughout |
**Structure Rubric** (how the output is organized):
| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) |
|-----------|----------|----------------|---------------|
| Organization | Disorganized | Reasonably organized | Clear, logical structure |
| Formatting | Inconsistent/broken | Mostly consistent | Professional, polished |
| Usability | Difficult to use | Usable with effort | Easy to use |
Adapt criteria to the specific task. For example:
- PDF form → "Field alignment", "Text readability", "Data placement"
- Document → "Section structure", "Heading hierarchy", "Paragraph flow"
- Data output → "Schema correctness", "Data types", "Completeness"
### Step 4: Evaluate Each Output Against the Rubric
For each output (A and B):
1. **Score each criterion** on the rubric (1-5 scale)
2. **Calculate dimension totals**: Content score, Structure score
3. **Calculate overall score**: Average of dimension scores, scaled to 1-10
### Step 5: Check Assertions (if provided)
If expectations are provided:
1. Check each expectation against output A
2. Check each expectation against output B
3. Count pass rates for each output
4. Use expectation scores as secondary evidence (not the primary decision factor)
### Step 6: Determine the Winner
Compare A and B based on (in priority order):
1. **Primary**: Overall rubric score (content + structure)
2. **Secondary**: Assertion pass rates (if applicable)
3. **Tiebreaker**: If truly equal, declare a TIE
Be decisive - ties should be rare. One output is usually better, even if marginally.
### Step 7: Write Comparison Results
Save results to a JSON file at the path specified (or `comparison.json` if not specified).
## Output Format
Write a JSON file with this structure:
```json
{
"winner": "A",
"reasoning": "Output A provides a complete solution with proper formatting and all required fields. Output B is missing the date field and has formatting inconsistencies.",
"rubric": {
"A": {
"content": {
"correctness": 5,
"completeness": 5,
"accuracy": 4
},
"structure": {
"organization": 4,
"formatting": 5,
"usability": 4
},
"content_score": 4.7,
"structure_score": 4.3,
"overall_score": 9.0
},
"B": {
"content": {
"correctness": 3,
"completeness": 2,
"accuracy": 3
},
"structure": {
"organization": 3,
"formatting": 2,
"usability": 3
},
"content_score": 2.7,
"structure_score": 2.7,
"overall_score": 5.4
}
},
"output_quality": {
"A": {
"score": 9,
"strengths": ["Complete solution", "Well-formatted", "All fields present"],
"weaknesses": ["Minor style inconsistency in header"]
},
"B": {
"score": 5,
"strengths": ["Readable output", "Correct basic structure"],
"weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"]
}
},
"expectation_results": {
"A": {
"passed": 4,
"total": 5,
"pass_rate": 0.80,
"details": [
{"text": "Output includes name", "passed": true},
{"text": "Output includes date", "passed": true},
{"text": "Format is PDF", "passed": true},
{"text": "Contains signature", "passed": false},
{"text": "Readable text", "passed": true}
]
},
"B": {
"passed": 3,
"total": 5,
"pass_rate": 0.60,
"details": [
{"text": "Output includes name", "passed": true},
{"text": "Output includes date", "passed": false},
{"text": "Format is PDF", "passed": true},
{"text": "Contains signature", "passed": false},
{"text": "Readable text", "passed": true}
]
}
}
}
```
If no expectations were provided, omit the `expectation_results` field entirely.
## Field Descriptions
- **winner**: "A", "B", or "TIE"
- **reasoning**: Clear explanation of why the winner was chosen (or why it's a tie)
- **rubric**: Structured rubric evaluation for each output
- **content**: Scores for content criteria (correctness, completeness, accuracy)
- **structure**: Scores for structure criteria (organization, formatting, usability)
- **content_score**: Average of content criteria (1-5)
- **structure_score**: Average of structure criteria (1-5)
- **overall_score**: Combined score scaled to 1-10
- **output_quality**: Summary quality assessment
- **score**: 1-10 rating (should match rubric overall_score)
- **strengths**: List of positive aspects
- **weaknesses**: List of issues or shortcomings
- **expectation_results**: (Only if expectations provided)
- **passed**: Number of expectations that passed
- **total**: Total number of expectations
- **pass_rate**: Fraction passed (0.0 to 1.0)
- **details**: Individual expectation results
## Guidelines
- **Stay blind**: DO NOT try to infer which skill produced which output. Judge purely on output quality.
- **Be specific**: Cite specific examples when explaining strengths and weaknesses.
- **Be decisive**: Choose a winner unless outputs are genuinely equivalent.
- **Output quality first**: Assertion scores are secondary to overall task completion.
- **Be objective**: Don't favor outputs based on style preferences; focus on correctness and completeness.
- **Explain your reasoning**: The reasoning field should make it clear why you chose the winner.
- **Handle edge cases**: If both outputs fail, pick the one that fails less badly. If both are excellent, pick the one that's marginally better.

View File

@@ -0,0 +1,223 @@
# Grader Agent
Evaluate expectations against an execution transcript and outputs.
## Role
The Grader reviews a transcript and output files, then determines whether each expectation passes or fails. Provide clear evidence for each judgment.
You have two jobs: grade the outputs, and critique the evals themselves. A passing grade on a weak assertion is worse than useless — it creates false confidence. When you notice an assertion that's trivially satisfied, or an important outcome that no assertion checks, say so.
## Inputs
You receive these parameters in your prompt:
- **expectations**: List of expectations to evaluate (strings)
- **transcript_path**: Path to the execution transcript (markdown file)
- **outputs_dir**: Directory containing output files from execution
## Process
### Step 1: Read the Transcript
1. Read the transcript file completely
2. Note the eval prompt, execution steps, and final result
3. Identify any issues or errors documented
### Step 2: Examine Output Files
1. List files in outputs_dir
2. Read/examine each file relevant to the expectations. If outputs aren't plain text, use the inspection tools provided in your prompt — don't rely solely on what the transcript says the executor produced.
3. Note contents, structure, and quality
### Step 3: Evaluate Each Assertion
For each expectation:
1. **Search for evidence** in the transcript and outputs
2. **Determine verdict**:
- **PASS**: Clear evidence the expectation is true AND the evidence reflects genuine task completion, not just surface-level compliance
- **FAIL**: No evidence, or evidence contradicts the expectation, or the evidence is superficial (e.g., correct filename but empty/wrong content)
3. **Cite the evidence**: Quote the specific text or describe what you found
### Step 4: Extract and Verify Claims
Beyond the predefined expectations, extract implicit claims from the outputs and verify them:
1. **Extract claims** from the transcript and outputs:
- Factual statements ("The form has 12 fields")
- Process claims ("Used pypdf to fill the form")
- Quality claims ("All fields were filled correctly")
2. **Verify each claim**:
- **Factual claims**: Can be checked against the outputs or external sources
- **Process claims**: Can be verified from the transcript
- **Quality claims**: Evaluate whether the claim is justified
3. **Flag unverifiable claims**: Note claims that cannot be verified with available information
This catches issues that predefined expectations might miss.
### Step 5: Read User Notes
If `{outputs_dir}/user_notes.md` exists:
1. Read it and note any uncertainties or issues flagged by the executor
2. Include relevant concerns in the grading output
3. These may reveal problems even when expectations pass
### Step 6: Critique the Evals
After grading, consider whether the evals themselves could be improved. Only surface suggestions when there's a clear gap.
Good suggestions test meaningful outcomes — assertions that are hard to satisfy without actually doing the work correctly. Think about what makes an assertion *discriminating*: it passes when the skill genuinely succeeds and fails when it doesn't.
Suggestions worth raising:
- An assertion that passed but would also pass for a clearly wrong output (e.g., checking filename existence but not file content)
- An important outcome you observed — good or bad — that no assertion covers at all
- An assertion that can't actually be verified from the available outputs
Keep the bar high. The goal is to flag things the eval author would say "good catch" about, not to nitpick every assertion.
### Step 7: Write Grading Results
Save results to `{outputs_dir}/../grading.json` (sibling to outputs_dir).
## Grading Criteria
**PASS when**:
- The transcript or outputs clearly demonstrate the expectation is true
- Specific evidence can be cited
- The evidence reflects genuine substance, not just surface compliance (e.g., a file exists AND contains correct content, not just the right filename)
**FAIL when**:
- No evidence found for the expectation
- Evidence contradicts the expectation
- The expectation cannot be verified from available information
- The evidence is superficial — the assertion is technically satisfied but the underlying task outcome is wrong or incomplete
- The output appears to meet the assertion by coincidence rather than by actually doing the work
**When uncertain**: The burden of proof to pass is on the expectation.
### Step 8: Read Executor Metrics and Timing
1. If `{outputs_dir}/metrics.json` exists, read it and include in grading output
2. If `{outputs_dir}/../timing.json` exists, read it and include timing data
## Output Format
Write a JSON file with this structure:
```json
{
"expectations": [
{
"text": "The output includes the name 'John Smith'",
"passed": true,
"evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'"
},
{
"text": "The spreadsheet has a SUM formula in cell B10",
"passed": false,
"evidence": "No spreadsheet was created. The output was a text file."
},
{
"text": "The assistant used the skill's OCR script",
"passed": true,
"evidence": "Transcript Step 2 shows: 'Tool: Bash - python ocr_script.py image.png'"
}
],
"summary": {
"passed": 2,
"failed": 1,
"total": 3,
"pass_rate": 0.67
},
"execution_metrics": {
"tool_calls": {
"Read": 5,
"Write": 2,
"Bash": 8
},
"total_tool_calls": 15,
"total_steps": 6,
"errors_encountered": 0,
"output_chars": 12450,
"transcript_chars": 3200
},
"timing": {
"executor_duration_seconds": 165.0,
"grader_duration_seconds": 26.0,
"total_duration_seconds": 191.0
},
"claims": [
{
"claim": "The form has 12 fillable fields",
"type": "factual",
"verified": true,
"evidence": "Counted 12 fields in field_info.json"
},
{
"claim": "All required fields were populated",
"type": "quality",
"verified": false,
"evidence": "Reference section was left blank despite data being available"
}
],
"user_notes_summary": {
"uncertainties": ["Used 2023 data, may be stale"],
"needs_review": [],
"workarounds": ["Fell back to text overlay for non-fillable fields"]
},
"eval_feedback": {
"suggestions": [
{
"assertion": "The output includes the name 'John Smith'",
"reason": "A hallucinated document that mentions the name would also pass — consider checking it appears as the primary contact with matching phone and email from the input"
},
{
"reason": "No assertion checks whether the extracted phone numbers match the input — I observed incorrect numbers in the output that went uncaught"
}
],
"overall": "Assertions check presence but not correctness. Consider adding content verification."
}
}
```
## Field Descriptions
- **expectations**: Array of graded expectations
- **text**: The original expectation text
- **passed**: Boolean - true if expectation passes
- **evidence**: Specific quote or description supporting the verdict
- **summary**: Aggregate statistics
- **passed**: Count of passed expectations
- **failed**: Count of failed expectations
- **total**: Total expectations evaluated
- **pass_rate**: Fraction passed (0.0 to 1.0)
- **execution_metrics**: Copied from executor's metrics.json (if available)
- **output_chars**: Total character count of output files (proxy for tokens)
- **transcript_chars**: Character count of transcript
- **timing**: Wall clock timing from timing.json (if available)
- **executor_duration_seconds**: Time spent in executor subagent
- **total_duration_seconds**: Total elapsed time for the run
- **claims**: Extracted and verified claims from the output
- **claim**: The statement being verified
- **type**: "factual", "process", or "quality"
- **verified**: Boolean - whether the claim holds
- **evidence**: Supporting or contradicting evidence
- **user_notes_summary**: Issues flagged by the executor
- **uncertainties**: Things the executor wasn't sure about
- **needs_review**: Items requiring human attention
- **workarounds**: Places where the skill didn't work as expected
- **eval_feedback**: Improvement suggestions for the evals (only when warranted)
- **suggestions**: List of concrete suggestions, each with a `reason` and optionally an `assertion` it relates to
- **overall**: Brief assessment — can be "No suggestions, evals look solid" if nothing to flag
## Guidelines
- **Be objective**: Base verdicts on evidence, not assumptions
- **Be specific**: Quote the exact text that supports your verdict
- **Be thorough**: Check both transcript and output files
- **Be consistent**: Apply the same standard to each expectation
- **Explain failures**: Make it clear why evidence was insufficient
- **No partial credit**: Each expectation is pass or fail, not partial

View File

@@ -0,0 +1,146 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Eval Set Review - __SKILL_NAME_PLACEHOLDER__</title>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Poppins:wght@500;600&family=Lora:wght@400;500&display=swap" rel="stylesheet">
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body { font-family: 'Lora', Georgia, serif; background: #faf9f5; padding: 2rem; color: #141413; }
h1 { font-family: 'Poppins', sans-serif; margin-bottom: 0.5rem; font-size: 1.5rem; }
.description { color: #b0aea5; margin-bottom: 1.5rem; font-style: italic; max-width: 900px; }
.controls { margin-bottom: 1rem; display: flex; gap: 0.5rem; }
.btn { font-family: 'Poppins', sans-serif; padding: 0.5rem 1rem; border: none; border-radius: 6px; cursor: pointer; font-size: 0.875rem; font-weight: 500; }
.btn-add { background: #6a9bcc; color: white; }
.btn-add:hover { background: #5889b8; }
.btn-export { background: #d97757; color: white; }
.btn-export:hover { background: #c4613f; }
table { width: 100%; max-width: 1100px; border-collapse: collapse; background: white; border-radius: 6px; overflow: hidden; box-shadow: 0 1px 3px rgba(0,0,0,0.08); }
th { font-family: 'Poppins', sans-serif; background: #141413; color: #faf9f5; padding: 0.75rem 1rem; text-align: left; font-size: 0.875rem; }
td { padding: 0.75rem 1rem; border-bottom: 1px solid #e8e6dc; vertical-align: top; }
tr:nth-child(even) td { background: #faf9f5; }
tr:hover td { background: #f3f1ea; }
.section-header td { background: #e8e6dc; font-family: 'Poppins', sans-serif; font-weight: 500; font-size: 0.8rem; color: #141413; text-transform: uppercase; letter-spacing: 0.05em; }
.query-input { width: 100%; padding: 0.4rem; border: 1px solid #e8e6dc; border-radius: 4px; font-size: 0.875rem; font-family: 'Lora', Georgia, serif; resize: vertical; min-height: 60px; }
.query-input:focus { outline: none; border-color: #d97757; box-shadow: 0 0 0 2px rgba(217,119,87,0.15); }
.toggle { position: relative; display: inline-block; width: 44px; height: 24px; }
.toggle input { opacity: 0; width: 0; height: 0; }
.toggle .slider { position: absolute; inset: 0; background: #b0aea5; border-radius: 24px; cursor: pointer; transition: 0.2s; }
.toggle .slider::before { content: ""; position: absolute; width: 18px; height: 18px; left: 3px; bottom: 3px; background: white; border-radius: 50%; transition: 0.2s; }
.toggle input:checked + .slider { background: #d97757; }
.toggle input:checked + .slider::before { transform: translateX(20px); }
.btn-delete { background: #c44; color: white; padding: 0.3rem 0.6rem; border: none; border-radius: 4px; cursor: pointer; font-size: 0.75rem; font-family: 'Poppins', sans-serif; }
.btn-delete:hover { background: #a33; }
.summary { margin-top: 1rem; color: #b0aea5; font-size: 0.875rem; }
</style>
</head>
<body>
<h1>Eval Set Review: <span id="skill-name">__SKILL_NAME_PLACEHOLDER__</span></h1>
<p class="description">Current description: <span id="skill-desc">__SKILL_DESCRIPTION_PLACEHOLDER__</span></p>
<div class="controls">
<button class="btn btn-add" onclick="addRow()">+ Add Query</button>
<button class="btn btn-export" onclick="exportEvalSet()">Export Eval Set</button>
</div>
<table>
<thead>
<tr>
<th style="width:65%">Query</th>
<th style="width:18%">Should Trigger</th>
<th style="width:10%">Actions</th>
</tr>
</thead>
<tbody id="eval-body"></tbody>
</table>
<p class="summary" id="summary"></p>
<script>
// Initial eval set baked into the page.
// NOTE(review): __EVAL_DATA_PLACEHOLDER__ is presumably substituted with a JSON
// array of { query, should_trigger } items when the page is generated — confirm
// against the script that fills in this template.
const EVAL_DATA = __EVAL_DATA_PLACEHOLDER__;
// Working list that the handlers below read and mutate. The spread makes a
// shallow copy: the array is new, but the item objects are shared with EVAL_DATA.
let evalItems = [...EVAL_DATA];
// Rebuild the table body from evalItems, then refresh the summary line.
// Rows are displayed grouped under a "Should Trigger" / "Should NOT Trigger"
// header, but every row's handlers carry the item's index in evalItems (not its
// display position), so edits land on the right item despite the sorting.
function render() {
  const tbody = document.getElementById('eval-body');
  tbody.innerHTML = '';

  // Display order: should-trigger items first, then should-not-trigger.
  const triggerRank = entry => (entry.should_trigger ? 1 : 0);
  const ordered = evalItems.map((item, origIdx) => ({ ...item, origIdx }));
  ordered.sort((a, b) => triggerRank(b) - triggerRank(a));

  let previousGroup = null;
  for (const item of ordered) {
    const group = item.should_trigger ? 'trigger' : 'no-trigger';
    if (group !== previousGroup) {
      // First row of a new group: emit its section header.
      const headerRow = document.createElement('tr');
      headerRow.className = 'section-header';
      headerRow.innerHTML = `<td colspan="3">${item.should_trigger ? 'Should Trigger' : 'Should NOT Trigger'}</td>`;
      tbody.appendChild(headerRow);
      previousGroup = group;
    }

    // Index into evalItems, used by the inline handlers in the row markup.
    const idx = item.origIdx;
    const tr = document.createElement('tr');
    tr.innerHTML = `
<td><textarea class="query-input" onchange="updateQuery(${idx}, this.value)">${escapeHtml(item.query)}</textarea></td>
<td>
<label class="toggle">
<input type="checkbox" ${item.should_trigger ? 'checked' : ''} onchange="updateTrigger(${idx}, this.checked)">
<span class="slider"></span>
</label>
<span style="margin-left:8px;font-size:0.8rem;color:#b0aea5">${item.should_trigger ? 'Yes' : 'No'}</span>
</td>
<td><button class="btn-delete" onclick="deleteRow(${idx})">Delete</button></td>
`;
    tbody.appendChild(tr);
  }

  updateSummary();
}
// Return `text` escaped for embedding inside HTML markup.
// The browser does the escaping: the value is assigned as plain text to a
// detached element and read back as serialized HTML.
function escapeHtml(text) {
  const scratch = document.createElement('div');
  scratch.textContent = text;
  const escaped = scratch.innerHTML;
  return escaped;
}
// Store the edited query text on the item at `idx` (an index into evalItems)
// and refresh the summary line. The table itself is not re-rendered.
function updateQuery(idx, value) {
  const target = evalItems[idx];
  target.query = value;
  updateSummary();
}
// Set the should_trigger flag on the item at `idx` (an index into evalItems),
// then re-render so the row is redrawn under its new group.
function updateTrigger(idx, value) {
  const target = evalItems[idx];
  target.should_trigger = value;
  render();
}
// Remove the item at `idx` from evalItems and re-render; the re-render also
// regenerates the indices baked into the remaining rows' handlers.
function deleteRow(idx) {
  const removeCount = 1;
  evalItems.splice(idx, removeCount);
  render();
}
// Append a blank "should trigger" query, re-render, and focus its textarea so
// the user can start typing immediately.
function addRow() {
  evalItems.push({ query: '', should_trigger: true });
  render();
  // render() lists should-trigger rows before should-not-trigger rows using a
  // stable sort, so the row just added is the LAST row of the should-trigger
  // group, which is not the last textarea on the page whenever any
  // should-not-trigger rows exist.
  const triggerCount = evalItems.filter(i => i.should_trigger).length;
  const inputs = document.querySelectorAll('.query-input');
  const newInput = inputs[triggerCount - 1];
  if (newInput) {
    newInput.focus();
  }
}
// Write the "N queries total: X should trigger, Y should not trigger" line
// into the #summary element, counted from the current evalItems.
function updateSummary() {
  const tally = evalItems.reduce(
    (counts, item) => {
      if (item.should_trigger) {
        counts.trigger += 1;
      } else {
        counts.noTrigger += 1;
      }
      return counts;
    },
    { trigger: 0, noTrigger: 0 }
  );
  const summaryEl = document.getElementById('summary');
  summaryEl.textContent =
    `${evalItems.length} queries total: ${tally.trigger} should trigger, ${tally.noTrigger} should not trigger`;
}
// Download the current eval set as eval_set.json.
// Items whose query is blank after trimming are left out; each exported record
// keeps only the trimmed query and its should_trigger flag.
function exportEvalSet() {
  const records = [];
  evalItems.forEach(item => {
    const query = item.query.trim();
    if (query !== '') {
      records.push({ query: query, should_trigger: item.should_trigger });
    }
  });

  const json = JSON.stringify(records, null, 2);
  const objectUrl = URL.createObjectURL(new Blob([json], { type: 'application/json' }));

  // Trigger the download through a temporary link that is attached to the
  // document only for the duration of the click.
  const link = document.createElement('a');
  link.href = objectUrl;
  link.download = 'eval_set.json';
  document.body.appendChild(link);
  link.click();
  document.body.removeChild(link);
  URL.revokeObjectURL(objectUrl);
}
// Initial paint: draw the table and summary line from the embedded eval data.
render();
</script>
</body>
</html>

View File

@@ -0,0 +1,471 @@
#!/usr/bin/env python3
"""Generate and serve a review page for eval results.
Reads the workspace directory, discovers runs (directories with outputs/),
embeds all output data into a self-contained HTML page, and serves it via
a tiny HTTP server. Feedback auto-saves to feedback.json in the workspace.
Usage:
python generate_review.py <workspace-path> [--port PORT] [--skill-name NAME]
python generate_review.py <workspace-path> --previous-workspace /path/to/previous/workspace
No dependencies beyond the Python stdlib are required.
"""
import argparse
import base64
import json
import mimetypes
import os
import re
import signal
import subprocess
import sys
import time
import webbrowser
from functools import partial
from http.server import HTTPServer, BaseHTTPRequestHandler
from pathlib import Path
# Files to exclude from output listings
METADATA_FILES = {"transcript.md", "user_notes.md", "metrics.json"}
# Extensions we render as inline text
TEXT_EXTENSIONS = {
".txt", ".md", ".json", ".csv", ".py", ".js", ".ts", ".tsx", ".jsx",
".yaml", ".yml", ".xml", ".html", ".css", ".sh", ".rb", ".go", ".rs",
".java", ".c", ".cpp", ".h", ".hpp", ".sql", ".r", ".toml",
}
# Extensions we render as inline images
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp"}
# MIME type overrides for common types
MIME_OVERRIDES = {
".svg": "image/svg+xml",
".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
}
def get_mime_type(path: Path) -> str:
    """Return the MIME type for ``path`` based on its file extension.

    Extensions listed in ``MIME_OVERRIDES`` win; otherwise the type is guessed
    with ``mimetypes.guess_type`` and falls back to
    ``application/octet-stream`` when no guess is available.
    """
    override = MIME_OVERRIDES.get(path.suffix.lower())
    if override is not None:
        return override
    guessed = mimetypes.guess_type(str(path))[0]
    return guessed if guessed else "application/octet-stream"
def find_runs(workspace: Path) -> list[dict]:
    """Recursively find directories that contain an outputs/ subdirectory.

    Args:
        workspace: Root directory to scan.

    Returns:
        The discovered run dicts, ordered by ``eval_id`` and then by run id.
        Runs without an ``eval_id`` are placed after all runs that have one.
    """
    runs: list[dict] = []
    _find_runs_recursive(workspace, workspace, runs)

    def sort_key(run: dict) -> tuple:
        # The "eval_id" key may be present with a value of None; None cannot
        # be ordered against integers, so map it to +inf explicitly instead
        # of relying on a dict.get() default that only covers a missing key.
        eval_id = run.get("eval_id")
        return (float("inf") if eval_id is None else eval_id, run["id"])

    runs.sort(key=sort_key)
    return runs
def _find_runs_recursive(root: Path, current: Path, runs: list[dict]) -> None:
    """Walk ``current`` depth-first and append every run found to ``runs``.

    A directory that has an ``outputs/`` subdirectory is treated as a run and
    is not descended into any further. Other directories are scanned in sorted
    order, skipping a fixed set of directory names.
    """
    if not current.is_dir():
        return

    if (current / "outputs").is_dir():
        run = build_run(root, current)
        if run:
            runs.append(run)
        return

    ignored = {"node_modules", ".git", "__pycache__", "skill", "inputs"}
    for entry in sorted(current.iterdir()):
        if entry.name in ignored or not entry.is_dir():
            continue
        _find_runs_recursive(root, entry, runs)
def build_run(root: Path, run_dir: Path) -> dict | None:
"""Build a run dict with prompt, outputs, and grading data."""
prompt = ""
eval_id = None
# Try eval_metadata.json
for candidate in [run_dir / "eval_metadata.json", run_dir.parent / "eval_metadata.json"]:
if candidate.exists():
try:
metadata = json.loads(candidate.read_text())
prompt = metadata.get("prompt", "")
eval_id = metadata.get("eval_id")
except (json.JSONDecodeError, OSError):
pass
if prompt:
break
# Fall back to transcript.md
if not prompt:
for candidate in [run_dir / "transcript.md", run_dir / "outputs" / "transcript.md"]:
if candidate.exists():
try:
text = candidate.read_text()
match = re.search(r"## Eval Prompt\n\n([\s\S]*?)(?=\n##|$)", text)
if match:
prompt = match.group(1).strip()
except OSError:
pass
if prompt:
break
if not prompt:
prompt = "(No prompt found)"
run_id = str(run_dir.relative_to(root)).replace("/", "-").replace("\\", "-")
# Collect output files
outputs_dir = run_dir / "outputs"
output_files: list[dict] = []
if outputs_dir.is_dir():
for f in sorted(outputs_dir.iterdir()):
if f.is_file() and f.name not in METADATA_FILES:
output_files.append(embed_file(f))
# Load grading if present
grading = None
for candidate in [run_dir / "grading.json", run_dir.parent / "grading.json"]:
if candidate.exists():
try:
grading = json.loads(candidate.read_text())
except (json.JSONDecodeError, OSError):
pass
if grading:
break
return {
"id": run_id,
"prompt": prompt,
"eval_id": eval_id,
"outputs": output_files,
"grading": grading,
}
def embed_file(path: Path) -> dict:
    """Read a file and return an embedded representation.

    Text files are returned with their decoded content. Every other kind is
    read as bytes and base64-encoded; the extension decides which ``type``
    and which payload keys the returned dict has.
    """
    ext = path.suffix.lower()
    mime = get_mime_type(path)
    name = path.name

    if ext in TEXT_EXTENSIONS:
        try:
            text = path.read_text(errors="replace")
        except OSError:
            # A failed text read still yields a "text" entry, carrying the
            # error message as its content.
            text = "(Error reading file)"
        return {"name": name, "type": "text", "content": text}

    # All remaining kinds share the same read + base64 step.
    try:
        encoded = base64.b64encode(path.read_bytes()).decode("ascii")
    except OSError:
        return {"name": name, "type": "error", "content": "(Error reading file)"}

    data_uri = f"data:{mime};base64,{encoded}"
    if ext in IMAGE_EXTENSIONS:
        return {"name": name, "type": "image", "mime": mime, "data_uri": data_uri}
    if ext == ".pdf":
        return {"name": name, "type": "pdf", "data_uri": data_uri}
    if ext == ".xlsx":
        return {"name": name, "type": "xlsx", "data_b64": encoded}
    # Binary / unknown — base64 download link
    return {"name": name, "type": "binary", "mime": mime, "data_uri": data_uri}
def load_previous_iteration(workspace: Path) -> dict[str, dict]:
    """Load previous iteration's feedback and outputs.

    Feedback is read from ``feedback.json`` in ``workspace``. Malformed review
    entries (not a dict, missing or non-string ``run_id`` or ``feedback``,
    blank feedback) are skipped individually, so one bad entry does not
    discard the rest. An unreadable or malformed file yields no feedback.

    Args:
        workspace: The previous iteration's workspace directory.

    Returns:
        A map of run_id -> {"feedback": str, "outputs": list[dict]}.
    """
    result: dict[str, dict] = {}

    # Load feedback
    feedback_map: dict[str, str] = {}
    feedback_path = workspace / "feedback.json"
    if feedback_path.exists():
        try:
            data = json.loads(feedback_path.read_text(encoding="utf-8"))
        except (ValueError, OSError):
            # ValueError covers JSONDecodeError and UnicodeDecodeError.
            data = None
        reviews = data.get("reviews", []) if isinstance(data, dict) else []
        if not isinstance(reviews, list):
            reviews = []
        for review in reviews:
            if not isinstance(review, dict):
                continue
            run_id = review.get("run_id")
            feedback = review.get("feedback")
            if isinstance(run_id, str) and isinstance(feedback, str) and feedback.strip():
                feedback_map[run_id] = feedback

    # Load runs (to get outputs)
    for run in find_runs(workspace):
        result[run["id"]] = {
            "feedback": feedback_map.get(run["id"], ""),
            "outputs": run.get("outputs", []),
        }

    # Also add feedback for run_ids that had feedback but no matching run
    for run_id, fb in feedback_map.items():
        if run_id not in result:
            result[run_id] = {"feedback": fb, "outputs": []}

    return result
def generate_html(
runs: list[dict],
skill_name: str,
previous: dict[str, dict] | None = None,
benchmark: dict | None = None,
) -> str:
"""Generate the complete standalone HTML page with embedded data."""
template_path = Path(__file__).parent / "viewer.html"
template = template_path.read_text()
# Build previous_feedback and previous_outputs maps for the template
previous_feedback: dict[str, str] = {}
previous_outputs: dict[str, list[dict]] = {}
if previous:
for run_id, data in previous.items():
if data.get("feedback"):
previous_feedback[run_id] = data["feedback"]
if data.get("outputs"):
previous_outputs[run_id] = data["outputs"]
embedded = {
"skill_name": skill_name,
"runs": runs,
"previous_feedback": previous_feedback,
"previous_outputs": previous_outputs,
}
if benchmark:
embedded["benchmark"] = benchmark
data_json = json.dumps(embedded)
return template.replace("/*__EMBEDDED_DATA__*/", f"const EMBEDDED_DATA = {data_json};")
# ---------------------------------------------------------------------------
# HTTP server (stdlib only, zero dependencies)
# ---------------------------------------------------------------------------
def _kill_port(port: int) -> None:
    """Kill any process listening on the given port.

    Uses ``lsof`` to find processes with a TCP socket in the LISTEN state on
    ``port`` and sends each SIGTERM. Best-effort: a missing ``lsof``, a
    timeout, vanished processes, and processes that may not be signalled are
    all tolerated.
    """
    try:
        result = subprocess.run(
            # -sTCP:LISTEN restricts the match to listeners. Without it,
            # processes merely connected to the port are reported too.
            ["lsof", "-t", f"-iTCP:{port}", "-sTCP:LISTEN"],
            capture_output=True, text=True, timeout=5,
        )
    except subprocess.TimeoutExpired:
        return
    except FileNotFoundError:
        print("Note: lsof not found, cannot check if port is in use", file=sys.stderr)
        return

    signalled = False
    for pid_str in result.stdout.split():
        try:
            pid = int(pid_str)
        except ValueError:
            continue
        if pid == os.getpid():
            # Never terminate ourselves.
            continue
        try:
            os.kill(pid, signal.SIGTERM)
            signalled = True
        except (ProcessLookupError, PermissionError):
            # Already gone, or owned by another user — nothing more to do.
            pass
    if signalled:
        # Give the terminated process a moment to release the port.
        time.sleep(0.5)
class ReviewHandler(BaseHTTPRequestHandler):
"""Serves the review HTML and handles feedback saves.
Regenerates the HTML on each page load so that refreshing the browser
picks up new eval outputs without restarting the server.
"""
def __init__(
self,
workspace: Path,
skill_name: str,
feedback_path: Path,
previous: dict[str, dict],
benchmark_path: Path | None,
*args,
**kwargs,
):
self.workspace = workspace
self.skill_name = skill_name
self.feedback_path = feedback_path
self.previous = previous
self.benchmark_path = benchmark_path
super().__init__(*args, **kwargs)
def do_GET(self) -> None:
if self.path == "/" or self.path == "/index.html":
# Regenerate HTML on each request (re-scans workspace for new outputs)
runs = find_runs(self.workspace)
benchmark = None
if self.benchmark_path and self.benchmark_path.exists():
try:
benchmark = json.loads(self.benchmark_path.read_text())
except (json.JSONDecodeError, OSError):
pass
html = generate_html(runs, self.skill_name, self.previous, benchmark)
content = html.encode("utf-8")
self.send_response(200)
self.send_header("Content-Type", "text/html; charset=utf-8")
self.send_header("Content-Length", str(len(content)))
self.end_headers()
self.wfile.write(content)
elif self.path == "/api/feedback":
data = b"{}"
if self.feedback_path.exists():
data = self.feedback_path.read_bytes()
self.send_response(200)
self.send_header("Content-Type", "application/json")
self.send_header("Content-Length", str(len(data)))
self.end_headers()
self.wfile.write(data)
else:
self.send_error(404)
def do_POST(self) -> None:
if self.path == "/api/feedback":
length = int(self.headers.get("Content-Length", 0))
body = self.rfile.read(length)
try:
data = json.loads(body)
if not isinstance(data, dict) or "reviews" not in data:
raise ValueError("Expected JSON object with 'reviews' key")
self.feedback_path.write_text(json.dumps(data, indent=2) + "\n")
resp = b'{"ok":true}'
self.send_response(200)
except (json.JSONDecodeError, OSError, ValueError) as e:
resp = json.dumps({"error": str(e)}).encode()
self.send_response(500)
self.send_header("Content-Type", "application/json")
self.send_header("Content-Length", str(len(resp)))
self.end_headers()
self.wfile.write(resp)
else:
self.send_error(404)
def log_message(self, format: str, *args: object) -> None:
# Suppress request logging to keep terminal clean
pass
def main() -> None:
    """Parse the command line, then write a static viewer or start the server.

    Exits with status 1 when the workspace is not a directory or contains no
    runs. With ``--static`` the page is written to the given path and the
    process exits with status 0. Otherwise an HTTP server is started on
    127.0.0.1 (on the requested port, or a free one if that port cannot be
    bound), the page is opened in the browser, and the server runs until
    interrupted.
    """
    parser = argparse.ArgumentParser(description="Generate and serve eval review")
    parser.add_argument("workspace", type=Path, help="Path to workspace directory")
    parser.add_argument("--port", "-p", type=int, default=3117, help="Server port (default: 3117)")
    parser.add_argument("--skill-name", "-n", type=str, default=None, help="Skill name for header")
    parser.add_argument(
        "--previous-workspace", type=Path, default=None,
        help="Path to previous iteration's workspace (shows old outputs and feedback as context)",
    )
    parser.add_argument(
        "--benchmark", type=Path, default=None,
        help="Path to benchmark.json to show in the Benchmark tab",
    )
    parser.add_argument(
        "--static", "-s", type=Path, default=None,
        help="Write standalone HTML to this path instead of starting a server",
    )
    args = parser.parse_args()

    workspace = args.workspace.resolve()
    if not workspace.is_dir():
        print(f"Error: {workspace} is not a directory", file=sys.stderr)
        sys.exit(1)

    runs = find_runs(workspace)
    if not runs:
        print(f"No runs found in {workspace}", file=sys.stderr)
        sys.exit(1)

    skill_name = args.skill_name or workspace.name.replace("-workspace", "")
    feedback_path = workspace / "feedback.json"

    previous: dict[str, dict] = {}
    if args.previous_workspace:
        previous = load_previous_iteration(args.previous_workspace.resolve())

    benchmark_path = args.benchmark.resolve() if args.benchmark else None
    benchmark = None
    if benchmark_path and benchmark_path.exists():
        try:
            benchmark = json.loads(benchmark_path.read_text())
        except (json.JSONDecodeError, OSError):
            pass

    if args.static:
        html = generate_html(runs, skill_name, previous, benchmark)
        args.static.parent.mkdir(parents=True, exist_ok=True)
        # The server path sends this page as UTF-8; write the standalone file
        # the same way instead of using the platform default encoding.
        args.static.write_text(html, encoding="utf-8")
        print(f"\n Static viewer written to: {args.static}\n")
        sys.exit(0)

    # Kill any existing process on the target port
    port = args.port
    _kill_port(port)

    handler = partial(ReviewHandler, workspace, skill_name, feedback_path, previous, benchmark_path)
    try:
        server = HTTPServer(("127.0.0.1", port), handler)
    except OSError:
        # Port still in use after kill attempt — find a free one
        server = HTTPServer(("127.0.0.1", 0), handler)
        port = server.server_address[1]

    url = f"http://localhost:{port}"
    print(f"\n Eval Viewer")
    print(f" ─────────────────────────────────")
    print(f" URL: {url}")
    print(f" Workspace: {workspace}")
    print(f" Feedback: {feedback_path}")
    if previous:
        print(f" Previous: {args.previous_workspace} ({len(previous)} runs)")
    if benchmark_path:
        print(f" Benchmark: {benchmark_path}")
    print(f"\n Press Ctrl+C to stop.\n")

    webbrowser.open(url)

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        print("\nStopped.")
        server.server_close()
if __name__ == "__main__":
main()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,430 @@
# JSON Schemas
This document defines the JSON schemas used by skill-creator.
---
## evals.json
Defines the evals for a skill. Located at `evals/evals.json` within the skill directory.
```json
{
"skill_name": "example-skill",
"evals": [
{
"id": 1,
"prompt": "User's example prompt",
"expected_output": "Description of expected result",
"files": ["evals/files/sample1.pdf"],
"expectations": [
"The output includes X",
"The skill used script Y"
]
}
]
}
```
**Fields:**
- `skill_name`: Name matching the skill's frontmatter
- `evals[].id`: Unique integer identifier
- `evals[].prompt`: The task to execute
- `evals[].expected_output`: Human-readable description of success
- `evals[].files`: Optional list of input file paths (relative to skill root)
- `evals[].expectations`: List of verifiable statements
---
## history.json
Tracks version progression in Improve mode. Located at workspace root.
```json
{
"started_at": "2026-01-15T10:30:00Z",
"skill_name": "pdf",
"current_best": "v2",
"iterations": [
{
"version": "v0",
"parent": null,
"expectation_pass_rate": 0.65,
"grading_result": "baseline",
"is_current_best": false
},
{
"version": "v1",
"parent": "v0",
"expectation_pass_rate": 0.75,
"grading_result": "won",
"is_current_best": false
},
{
"version": "v2",
"parent": "v1",
"expectation_pass_rate": 0.85,
"grading_result": "won",
"is_current_best": true
}
]
}
```
**Fields:**
- `started_at`: ISO timestamp of when improvement started
- `skill_name`: Name of the skill being improved
- `current_best`: Version identifier of the best performer
- `iterations[].version`: Version identifier (v0, v1, ...)
- `iterations[].parent`: Parent version this was derived from
- `iterations[].expectation_pass_rate`: Pass rate from grading
- `iterations[].grading_result`: "baseline", "won", "lost", or "tie"
- `iterations[].is_current_best`: Whether this is the current best version
---
## grading.json
Output from the grader agent. Located at `<run-dir>/grading.json`.
```json
{
"expectations": [
{
"text": "The output includes the name 'John Smith'",
"passed": true,
"evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'"
},
{
"text": "The spreadsheet has a SUM formula in cell B10",
"passed": false,
"evidence": "No spreadsheet was created. The output was a text file."
}
],
"summary": {
"passed": 2,
"failed": 1,
"total": 3,
"pass_rate": 0.67
},
"execution_metrics": {
"tool_calls": {
"Read": 5,
"Write": 2,
"Bash": 8
},
"total_tool_calls": 15,
"total_steps": 6,
"errors_encountered": 0,
"output_chars": 12450,
"transcript_chars": 3200
},
"timing": {
"executor_duration_seconds": 165.0,
"grader_duration_seconds": 26.0,
"total_duration_seconds": 191.0
},
"claims": [
{
"claim": "The form has 12 fillable fields",
"type": "factual",
"verified": true,
"evidence": "Counted 12 fields in field_info.json"
}
],
"user_notes_summary": {
"uncertainties": ["Used 2023 data, may be stale"],
"needs_review": [],
"workarounds": ["Fell back to text overlay for non-fillable fields"]
},
"eval_feedback": {
"suggestions": [
{
"assertion": "The output includes the name 'John Smith'",
"reason": "A hallucinated document that mentions the name would also pass"
}
],
"overall": "Assertions check presence but not correctness."
}
}
```
**Fields:**
- `expectations[]`: Graded expectations with evidence
- `summary`: Aggregate pass/fail counts
- `execution_metrics`: Tool usage and output size (from executor's metrics.json)
- `timing`: Wall clock timing (from timing.json)
- `claims`: Extracted and verified claims from the output
- `user_notes_summary`: Issues flagged by the executor
- `eval_feedback`: (optional) Improvement suggestions for the evals, only present when the grader identifies issues worth raising
---
## metrics.json
Output from the executor agent. Located at `<run-dir>/outputs/metrics.json`.
```json
{
"tool_calls": {
"Read": 5,
"Write": 2,
"Bash": 8,
"Edit": 1,
"Glob": 2,
"Grep": 0
},
"total_tool_calls": 18,
"total_steps": 6,
"files_created": ["filled_form.pdf", "field_values.json"],
"errors_encountered": 0,
"output_chars": 12450,
"transcript_chars": 3200
}
```
**Fields:**
- `tool_calls`: Count per tool type
- `total_tool_calls`: Sum of all tool calls
- `total_steps`: Number of major execution steps
- `files_created`: List of output files created
- `errors_encountered`: Number of errors during execution
- `output_chars`: Total character count of output files
- `transcript_chars`: Character count of transcript
---
## timing.json
Wall clock timing for a run. Located at `<run-dir>/timing.json`.
**How to capture:** When a subagent task completes, the task notification includes `total_tokens` and `duration_ms`. Save these immediately — they are not persisted anywhere else and cannot be recovered after the fact.
```json
{
"total_tokens": 84852,
"duration_ms": 23332,
"total_duration_seconds": 23.3,
"executor_start": "2026-01-15T10:30:00Z",
"executor_end": "2026-01-15T10:32:45Z",
"executor_duration_seconds": 165.0,
"grader_start": "2026-01-15T10:32:46Z",
"grader_end": "2026-01-15T10:33:12Z",
"grader_duration_seconds": 26.0
}
```
---
## benchmark.json
Output from Benchmark mode. Located at `benchmarks/<timestamp>/benchmark.json`.
```json
{
"metadata": {
"skill_name": "pdf",
"skill_path": "/path/to/pdf",
"executor_model": "claude-sonnet-4-20250514",
"analyzer_model": "most-capable-model",
"timestamp": "2026-01-15T10:30:00Z",
"evals_run": [1, 2, 3],
"runs_per_configuration": 3
},
"runs": [
{
"eval_id": 1,
"eval_name": "Ocean",
"configuration": "with_skill",
"run_number": 1,
"result": {
"pass_rate": 0.85,
"passed": 6,
"failed": 1,
"total": 7,
"time_seconds": 42.5,
"tokens": 3800,
"tool_calls": 18,
"errors": 0
},
"expectations": [
{"text": "...", "passed": true, "evidence": "..."}
],
"notes": [
"Used 2023 data, may be stale",
"Fell back to text overlay for non-fillable fields"
]
}
],
"run_summary": {
"with_skill": {
"pass_rate": {"mean": 0.85, "stddev": 0.05, "min": 0.80, "max": 0.90},
"time_seconds": {"mean": 45.0, "stddev": 12.0, "min": 32.0, "max": 58.0},
"tokens": {"mean": 3800, "stddev": 400, "min": 3200, "max": 4100}
},
"without_skill": {
"pass_rate": {"mean": 0.35, "stddev": 0.08, "min": 0.28, "max": 0.45},
"time_seconds": {"mean": 32.0, "stddev": 8.0, "min": 24.0, "max": 42.0},
"tokens": {"mean": 2100, "stddev": 300, "min": 1800, "max": 2500}
},
"delta": {
"pass_rate": "+0.50",
"time_seconds": "+13.0",
"tokens": "+1700"
}
},
"notes": [
"Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value",
"Eval 3 shows high variance (50% ± 40%) - may be flaky or model-dependent",
"Without-skill runs consistently fail on table extraction expectations",
"Skill adds 13s average execution time but improves pass rate by 50%"
]
}
```
**Fields:**
- `metadata`: Information about the benchmark run
- `skill_name`: Name of the skill
- `timestamp`: When the benchmark was run
- `evals_run`: List of eval names or IDs
- `runs_per_configuration`: Number of runs per config (e.g. 3)
- `runs[]`: Individual run results
- `eval_id`: Numeric eval identifier
- `eval_name`: Human-readable eval name (used as section header in the viewer)
- `configuration`: Must be `"with_skill"` or `"without_skill"` (the viewer uses this exact string for grouping and color coding)
- `run_number`: Integer run number (1, 2, 3...)
- `result`: Nested object with `pass_rate`, `passed`, `failed`, `total`, `time_seconds`, `tokens`, `tool_calls`, `errors`
- `run_summary`: Statistical aggregates per configuration
- `with_skill` / `without_skill`: Each contains `pass_rate`, `time_seconds`, `tokens` objects with `mean`, `stddev`, `min`, and `max` fields
- `delta`: Difference strings like `"+0.50"`, `"+13.0"`, `"+1700"`
- `notes`: Freeform observations from the analyzer
**Important:** The viewer reads these field names exactly. Using `config` instead of `configuration`, or putting `pass_rate` at the top level of a run instead of nested under `result`, will cause the viewer to show empty/zero values. Always reference this schema when generating benchmark.json manually.
---
## comparison.json
Output from blind comparator. Located at `<grading-dir>/comparison-N.json`.
```json
{
"winner": "A",
"reasoning": "Output A provides a complete solution with proper formatting and all required fields. Output B is missing the date field and has formatting inconsistencies.",
"rubric": {
"A": {
"content": {
"correctness": 5,
"completeness": 5,
"accuracy": 4
},
"structure": {
"organization": 4,
"formatting": 5,
"usability": 4
},
"content_score": 4.7,
"structure_score": 4.3,
"overall_score": 9.0
},
"B": {
"content": {
"correctness": 3,
"completeness": 2,
"accuracy": 3
},
"structure": {
"organization": 3,
"formatting": 2,
"usability": 3
},
"content_score": 2.7,
"structure_score": 2.7,
"overall_score": 5.4
}
},
"output_quality": {
"A": {
"score": 9,
"strengths": ["Complete solution", "Well-formatted", "All fields present"],
"weaknesses": ["Minor style inconsistency in header"]
},
"B": {
"score": 5,
"strengths": ["Readable output", "Correct basic structure"],
"weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"]
}
},
"expectation_results": {
"A": {
"passed": 4,
"total": 5,
"pass_rate": 0.80,
"details": [
{"text": "Output includes name", "passed": true}
]
},
"B": {
"passed": 3,
"total": 5,
"pass_rate": 0.60,
"details": [
{"text": "Output includes name", "passed": true}
]
}
}
}
```
---
## analysis.json
Output from post-hoc analyzer. Located at `<grading-dir>/analysis.json`.
```json
{
"comparison_summary": {
"winner": "A",
"winner_skill": "path/to/winner/skill",
"loser_skill": "path/to/loser/skill",
"comparator_reasoning": "Brief summary of why comparator chose winner"
},
"winner_strengths": [
"Clear step-by-step instructions for handling multi-page documents",
"Included validation script that caught formatting errors"
],
"loser_weaknesses": [
"Vague instruction 'process the document appropriately' led to inconsistent behavior",
"No script for validation, agent had to improvise"
],
"instruction_following": {
"winner": {
"score": 9,
"issues": ["Minor: skipped optional logging step"]
},
"loser": {
"score": 6,
"issues": [
"Did not use the skill's formatting template",
"Invented own approach instead of following step 3"
]
}
},
"improvement_suggestions": [
{
"priority": "high",
"category": "instructions",
"suggestion": "Replace 'process the document appropriately' with explicit steps",
"expected_impact": "Would eliminate ambiguity that caused inconsistent behavior"
}
],
"transcript_insights": {
"winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script",
"loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods"
}
}
```

View File

@@ -0,0 +1,401 @@
#!/usr/bin/env python3
"""
Aggregate individual run results into benchmark summary statistics.
Reads grading.json files from run directories and produces:
- run_summary with mean, stddev, min, max for each metric
- delta between with_skill and without_skill configurations
Usage:
python aggregate_benchmark.py <benchmark_dir>
Example:
python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/
The script supports two directory layouts:
Workspace layout (from skill-creator iterations):
<benchmark_dir>/
└── eval-N/
├── with_skill/
│ ├── run-1/grading.json
│ └── run-2/grading.json
└── without_skill/
├── run-1/grading.json
└── run-2/grading.json
Legacy layout (with runs/ subdirectory):
<benchmark_dir>/
└── runs/
└── eval-N/
├── with_skill/
│ └── run-1/grading.json
└── without_skill/
└── run-1/grading.json
"""
import argparse
import json
import math
import sys
from datetime import datetime, timezone
from pathlib import Path
def calculate_stats(values: list[float]) -> dict:
    """Return mean, sample standard deviation, min and max of ``values``.

    All four figures are rounded to 4 decimal places. An empty input yields
    zeros, and a single value yields a standard deviation of 0.0.
    """
    if not values:
        return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}

    count = len(values)
    average = sum(values) / count

    spread = 0.0
    if count > 1:
        # Sample variance: divide by (n - 1), not n.
        squared_diffs = sum((v - average) ** 2 for v in values)
        spread = math.sqrt(squared_diffs / (count - 1))

    return {
        "mean": round(average, 4),
        "stddev": round(spread, 4),
        "min": round(min(values), 4),
        "max": round(max(values), 4),
    }
def load_run_results(benchmark_dir: Path) -> dict:
"""
Load all run results from a benchmark directory.
Returns dict keyed by config name (e.g. "with_skill"/"without_skill",
or "new_skill"/"old_skill"), each containing a list of run results.
"""
# Support both layouts: eval dirs directly under benchmark_dir, or under runs/
runs_dir = benchmark_dir / "runs"
if runs_dir.exists():
search_dir = runs_dir
elif list(benchmark_dir.glob("eval-*")):
search_dir = benchmark_dir
else:
print(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}")
return {}
results: dict[str, list] = {}
for eval_idx, eval_dir in enumerate(sorted(search_dir.glob("eval-*"))):
metadata_path = eval_dir / "eval_metadata.json"
if metadata_path.exists():
try:
with open(metadata_path) as mf:
eval_id = json.load(mf).get("eval_id", eval_idx)
except (json.JSONDecodeError, OSError):
eval_id = eval_idx
else:
try:
eval_id = int(eval_dir.name.split("-")[1])
except ValueError:
eval_id = eval_idx
# Discover config directories dynamically rather than hardcoding names
for config_dir in sorted(eval_dir.iterdir()):
if not config_dir.is_dir():
continue
# Skip non-config directories (inputs, outputs, etc.)
if not list(config_dir.glob("run-*")):
continue
config = config_dir.name
if config not in results:
results[config] = []
for run_dir in sorted(config_dir.glob("run-*")):
run_number = int(run_dir.name.split("-")[1])
grading_file = run_dir / "grading.json"
if not grading_file.exists():
print(f"Warning: grading.json not found in {run_dir}")
continue
try:
with open(grading_file) as f:
grading = json.load(f)
except json.JSONDecodeError as e:
print(f"Warning: Invalid JSON in {grading_file}: {e}")
continue
# Extract metrics
result = {
"eval_id": eval_id,
"run_number": run_number,
"pass_rate": grading.get("summary", {}).get("pass_rate", 0.0),
"passed": grading.get("summary", {}).get("passed", 0),
"failed": grading.get("summary", {}).get("failed", 0),
"total": grading.get("summary", {}).get("total", 0),
}
# Extract timing — check grading.json first, then sibling timing.json
timing = grading.get("timing", {})
result["time_seconds"] = timing.get("total_duration_seconds", 0.0)
timing_file = run_dir / "timing.json"
if result["time_seconds"] == 0.0 and timing_file.exists():
try:
with open(timing_file) as tf:
timing_data = json.load(tf)
result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0)
result["tokens"] = timing_data.get("total_tokens", 0)
except json.JSONDecodeError:
pass
# Extract metrics if available
metrics = grading.get("execution_metrics", {})
result["tool_calls"] = metrics.get("total_tool_calls", 0)
if not result.get("tokens"):
result["tokens"] = metrics.get("output_chars", 0)
result["errors"] = metrics.get("errors_encountered", 0)
# Extract expectations — viewer requires fields: text, passed, evidence
raw_expectations = grading.get("expectations", [])
for exp in raw_expectations:
if "text" not in exp or "passed" not in exp:
print(f"Warning: expectation in {grading_file} missing required fields (text, passed, evidence): {exp}")
result["expectations"] = raw_expectations
# Extract notes from user_notes_summary
notes_summary = grading.get("user_notes_summary", {})
notes = []
notes.extend(notes_summary.get("uncertainties", []))
notes.extend(notes_summary.get("needs_review", []))
notes.extend(notes_summary.get("workarounds", []))
result["notes"] = notes
results[config].append(result)
return results
def aggregate_results(results: dict) -> dict:
    """
    Summarize run results per configuration.

    For every configuration the pass rate, time and token values of its runs
    are reduced with ``calculate_stats``. A ``delta`` entry holds the signed
    difference of the means between the first and the second configuration
    (a missing configuration counts as zero).

    Returns run_summary with stats for each configuration and delta.
    """
    config_names = list(results.keys())
    run_summary = {}

    for name in config_names:
        config_runs = results.get(name, [])
        if config_runs:
            run_summary[name] = {
                "pass_rate": calculate_stats([r["pass_rate"] for r in config_runs]),
                "time_seconds": calculate_stats([r["time_seconds"] for r in config_runs]),
                "tokens": calculate_stats([r.get("tokens", 0) for r in config_runs]),
            }
        else:
            run_summary[name] = {
                "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0},
            }

    # Delta compares the first configuration against the second, if present.
    primary = run_summary.get(config_names[0], {}) if config_names else {}
    baseline = run_summary.get(config_names[1], {}) if len(config_names) >= 2 else {}

    def mean_gap(metric: str):
        return primary.get(metric, {}).get("mean", 0) - baseline.get(metric, {}).get("mean", 0)

    pass_rate_gap = mean_gap("pass_rate")
    time_gap = mean_gap("time_seconds")
    token_gap = mean_gap("tokens")

    run_summary["delta"] = {
        "pass_rate": f"{pass_rate_gap:+.2f}",
        "time_seconds": f"{time_gap:+.1f}",
        "tokens": f"{token_gap:+.0f}",
    }
    return run_summary
def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict:
    """
    Generate complete benchmark.json from run results.

    Args:
        benchmark_dir: Directory holding the eval/run results to load.
        skill_name: Skill name for the metadata; a placeholder is used if empty.
        skill_path: Skill path for the metadata; a placeholder is used if empty.

    Returns:
        A dict with ``metadata``, ``runs``, ``run_summary`` and an empty
        ``notes`` list. The model names in the metadata are placeholders.
    """
    results = load_run_results(benchmark_dir)
    run_summary = aggregate_results(results)

    # Build runs array for benchmark.json
    runs = []
    # Count the runs loaded for each (configuration, eval) pair so the
    # metadata reports what was actually loaded rather than a fixed number.
    run_counts: dict[tuple, int] = {}
    for config, config_results in results.items():
        for result in config_results:
            pair = (config, result["eval_id"])
            run_counts[pair] = run_counts.get(pair, 0) + 1
            runs.append({
                "eval_id": result["eval_id"],
                "configuration": config,
                "run_number": result["run_number"],
                "result": {
                    "pass_rate": result["pass_rate"],
                    "passed": result["passed"],
                    "failed": result["failed"],
                    "total": result["total"],
                    "time_seconds": result["time_seconds"],
                    "tokens": result.get("tokens", 0),
                    "tool_calls": result.get("tool_calls", 0),
                    "errors": result.get("errors", 0)
                },
                "expectations": result["expectations"],
                "notes": result["notes"]
            })

    # Determine eval IDs from results
    eval_ids = sorted(set(
        r["eval_id"]
        for config in results.values()
        for r in config
    ))

    benchmark = {
        "metadata": {
            "skill_name": skill_name or "<skill-name>",
            "skill_path": skill_path or "<path/to/skill>",
            "executor_model": "<model-name>",
            "analyzer_model": "<model-name>",
            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "evals_run": eval_ids,
            # Largest number of runs loaded for any (configuration, eval)
            # pair; 0 when nothing was loaded.
            "runs_per_configuration": max(run_counts.values(), default=0)
        },
        "runs": runs,
        "run_summary": run_summary,
        "notes": []  # To be filled by analyzer
    }
    return benchmark
def generate_markdown(benchmark: dict) -> str:
    """Render the benchmark dict as a human-readable Markdown summary.

    The first two keys of ``run_summary`` other than ``"delta"`` become the
    two compared configurations; missing ones fall back to the placeholder
    names ``config_a`` / ``config_b`` and render as zeros.

    Args:
        benchmark: Dict with ``metadata`` and ``run_summary`` keys and an
            optional ``notes`` list.

    Returns:
        The Markdown document as one newline-joined string.
    """
    meta = benchmark["metadata"]
    summary = benchmark["run_summary"]

    # "delta" is a derived entry, not a configuration.
    names = [key for key in summary if key != "delta"]
    first = names[0] if names else "config_a"
    second = names[1] if len(names) > 1 else "config_b"

    out = [
        f"# Skill Benchmark: {meta['skill_name']}",
        "",
        f"**Model**: {meta['executor_model']}",
        f"**Date**: {meta['timestamp']}",
        f"**Evals**: {', '.join(map(str, meta['evals_run']))} ({meta['runs_per_configuration']} runs each per configuration)",
        "",
        "## Summary",
        "",
        f"| Metric | {first.replace('_', ' ').title()} | {second.replace('_', ' ').title()} | Delta |",
        "|--------|------------|---------------|-------|",
    ]

    def stats(config: str, metric: str) -> dict:
        # Missing configurations or metrics degrade to an empty dict.
        return summary.get(config, {}).get(metric, {})

    delta = summary.get("delta", {})

    pr_a, pr_b = stats(first, "pass_rate"), stats(second, "pass_rate")
    out.append(f"| Pass Rate | {pr_a.get('mean', 0)*100:.0f}% ± {pr_a.get('stddev', 0)*100:.0f}% | {pr_b.get('mean', 0)*100:.0f}% ± {pr_b.get('stddev', 0)*100:.0f}% | {delta.get('pass_rate', '')} |")

    t_a, t_b = stats(first, "time_seconds"), stats(second, "time_seconds")
    out.append(f"| Time | {t_a.get('mean', 0):.1f}s ± {t_a.get('stddev', 0):.1f}s | {t_b.get('mean', 0):.1f}s ± {t_b.get('stddev', 0):.1f}s | {delta.get('time_seconds', '')}s |")

    tok_a, tok_b = stats(first, "tokens"), stats(second, "tokens")
    out.append(f"| Tokens | {tok_a.get('mean', 0):.0f} ± {tok_a.get('stddev', 0):.0f} | {tok_b.get('mean', 0):.0f} ± {tok_b.get('stddev', 0):.0f} | {delta.get('tokens', '')} |")

    # The notes section is emitted only when there is at least one note.
    if benchmark.get("notes"):
        out += ["", "## Notes", ""]
        out += [f"- {note}" for note in benchmark["notes"]]

    return "\n".join(out)
def main():
    """CLI entry point: aggregate a benchmark directory into JSON and Markdown.

    Writes ``benchmark.json`` (or the ``--output`` path) plus a sibling file
    with the ``.md`` suffix, then prints a short pass-rate summary. Exits
    with status 1 when the benchmark directory does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Aggregate benchmark run results into summary statistics"
    )
    parser.add_argument(
        "benchmark_dir",
        type=Path,
        help="Path to the benchmark directory"
    )
    parser.add_argument(
        "--skill-name",
        default="",
        help="Name of the skill being benchmarked"
    )
    parser.add_argument(
        "--skill-path",
        default="",
        help="Path to the skill being benchmarked"
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output path for benchmark.json (default: <benchmark_dir>/benchmark.json)"
    )
    args = parser.parse_args()

    if not args.benchmark_dir.exists():
        print(f"Directory not found: {args.benchmark_dir}")
        sys.exit(1)

    # Generate benchmark
    benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path)

    # The Markdown report sits next to the JSON file, same stem.
    output_json = args.output or (args.benchmark_dir / "benchmark.json")
    output_md = output_json.with_suffix(".md")

    # Explicit UTF-8: the Markdown report contains non-ASCII characters and
    # the locale's default encoding would make the bytes platform-dependent.
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(benchmark, f, indent=2)
    print(f"Generated: {output_json}")

    markdown = generate_markdown(benchmark)
    with open(output_md, "w", encoding="utf-8") as f:
        f.write(markdown)
    print(f"Generated: {output_md}")

    # Print summary
    run_summary = benchmark["run_summary"]
    configs = [k for k in run_summary if k != "delta"]
    delta = run_summary.get("delta", {})
    print("\nSummary:")
    for config in configs:
        pr = run_summary[config]["pass_rate"]["mean"]
        label = config.replace("_", " ").title()
        print(f" {label}: {pr*100:.1f}% pass rate")
    print(f" Delta: {delta.get('pass_rate', '')}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,326 @@
#!/usr/bin/env python3
"""Generate an HTML report from run_loop.py output.
Takes the JSON output from run_loop.py and generates a visual HTML report
showing each description attempt with check/x for each test case.
Distinguishes between train and test queries.
"""
import argparse
import html
import json
import sys
from pathlib import Path
def _aggregate_runs(results: list[dict]) -> tuple[int, int]:
    """Return ``(correct_runs, total_runs)`` summed over all query results."""
    correct = 0
    total = 0
    for r in results:
        runs = r.get("runs", 0)
        triggers = r.get("triggers", 0)
        total += runs
        # A triggering run counts as correct for should-trigger queries; a
        # non-triggering run counts as correct for the others.
        correct += triggers if r.get("should_trigger", True) else runs - triggers
    return correct, total


def _score_class(correct: int, total: int) -> str:
    """Map a correct/total ratio to the CSS class used for the score badge."""
    if total > 0:
        ratio = correct / total
        if ratio >= 0.8:
            return "score-good"
        if ratio >= 0.5:
            return "score-ok"
    return "score-bad"


def generate_html(data: dict, auto_refresh: bool = False, skill_name: str = "") -> str:
    """Generate the HTML report from loop output data.

    Args:
        data: Parsed JSON loop output. Reads ``history`` plus the summary
            keys ``original_description``, ``best_description``,
            ``best_score``, ``best_test_score``, ``iterations_run``,
            ``train_size`` and ``test_size``.
        auto_refresh: When True, adds a meta refresh tag (5 seconds).
        skill_name: Optional name prefixed to the page title and heading.

    Returns:
        The complete HTML document as a string. An empty ``history`` yields
        a page with an empty table instead of raising.
    """
    def esc(value) -> str:
        # Values come from a JSON file; escape everything placed in markup.
        return html.escape(str(value))

    history = data.get("history", [])
    title_prefix = html.escape(skill_name + " \u2014 ") if skill_name else ""

    # Column set is taken from the first iteration's train and test results.
    train_queries: list[dict] = []
    test_queries: list[dict] = []
    if history:
        for r in history[0].get("train_results", history[0].get("results", [])):
            train_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)})
        if history[0].get("test_results"):
            for r in history[0].get("test_results", []):
                test_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)})

    refresh_tag = ' <meta http-equiv="refresh" content="5">\n' if auto_refresh else ""
    html_parts = ["""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
""" + refresh_tag + """ <title>""" + title_prefix + """Skill Description Optimization</title>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Poppins:wght@500;600&family=Lora:wght@400;500&display=swap" rel="stylesheet">
<style>
body {
font-family: 'Lora', Georgia, serif;
max-width: 100%;
margin: 0 auto;
padding: 20px;
background: #faf9f5;
color: #141413;
}
h1 { font-family: 'Poppins', sans-serif; color: #141413; }
.explainer {
background: white;
padding: 15px;
border-radius: 6px;
margin-bottom: 20px;
border: 1px solid #e8e6dc;
color: #b0aea5;
font-size: 0.875rem;
line-height: 1.6;
}
.summary {
background: white;
padding: 15px;
border-radius: 6px;
margin-bottom: 20px;
border: 1px solid #e8e6dc;
}
.summary p { margin: 5px 0; }
.best { color: #788c5d; font-weight: bold; }
.table-container {
overflow-x: auto;
width: 100%;
}
table {
border-collapse: collapse;
background: white;
border: 1px solid #e8e6dc;
border-radius: 6px;
font-size: 12px;
min-width: 100%;
}
th, td {
padding: 8px;
text-align: left;
border: 1px solid #e8e6dc;
white-space: normal;
word-wrap: break-word;
}
th {
font-family: 'Poppins', sans-serif;
background: #141413;
color: #faf9f5;
font-weight: 500;
}
th.test-col {
background: #6a9bcc;
}
th.query-col { min-width: 200px; }
td.description {
font-family: monospace;
font-size: 11px;
word-wrap: break-word;
max-width: 400px;
}
td.result {
text-align: center;
font-size: 16px;
min-width: 40px;
}
td.test-result {
background: #f0f6fc;
}
.pass { color: #788c5d; }
.fail { color: #c44; }
.rate {
font-size: 9px;
color: #b0aea5;
display: block;
}
tr:hover { background: #faf9f5; }
.score {
display: inline-block;
padding: 2px 6px;
border-radius: 4px;
font-weight: bold;
font-size: 11px;
}
.score-good { background: #eef2e8; color: #788c5d; }
.score-ok { background: #fef3c7; color: #d97706; }
.score-bad { background: #fceaea; color: #c44; }
.train-label { color: #b0aea5; font-size: 10px; }
.test-label { color: #6a9bcc; font-size: 10px; font-weight: bold; }
.best-row { background: #f5f8f2; }
th.positive-col { border-bottom: 3px solid #788c5d; }
th.negative-col { border-bottom: 3px solid #c44; }
th.test-col.positive-col { border-bottom: 3px solid #788c5d; }
th.test-col.negative-col { border-bottom: 3px solid #c44; }
.legend { font-family: 'Poppins', sans-serif; display: flex; gap: 20px; margin-bottom: 10px; font-size: 13px; align-items: center; }
.legend-item { display: flex; align-items: center; gap: 6px; }
.legend-swatch { width: 16px; height: 16px; border-radius: 3px; display: inline-block; }
.swatch-positive { background: #141413; border-bottom: 3px solid #788c5d; }
.swatch-negative { background: #141413; border-bottom: 3px solid #c44; }
.swatch-test { background: #6a9bcc; }
.swatch-train { background: #141413; }
</style>
</head>
<body>
<h1>""" + title_prefix + """Skill Description Optimization</h1>
<div class="explainer">
<strong>Optimizing your skill's description.</strong> This page updates automatically as Claude tests different versions of your skill's description. Each row is an iteration — a new description attempt. The columns show test queries: green checkmarks mean the skill triggered correctly (or correctly didn't trigger), red crosses mean it got it wrong. The "Train" score shows performance on queries used to improve the description; the "Test" score shows performance on held-out queries the optimizer hasn't seen. When it's done, Claude will apply the best-performing description to your skill.
</div>
"""]

    # Summary section
    best_test_score = data.get('best_test_score')
    html_parts.append(f"""
<div class="summary">
<p><strong>Original:</strong> {esc(data.get('original_description', 'N/A'))}</p>
<p class="best"><strong>Best:</strong> {esc(data.get('best_description', 'N/A'))}</p>
<p><strong>Best Score:</strong> {esc(data.get('best_score', 'N/A'))} {'(test)' if best_test_score else '(train)'}</p>
<p><strong>Iterations:</strong> {esc(data.get('iterations_run', 0))} | <strong>Train:</strong> {esc(data.get('train_size', '?'))} | <strong>Test:</strong> {esc(data.get('test_size', '?'))}</p>
</div>
""")

    # Legend
    html_parts.append("""
<div class="legend">
<span style="font-weight:600">Query columns:</span>
<span class="legend-item"><span class="legend-swatch swatch-positive"></span> Should trigger</span>
<span class="legend-item"><span class="legend-swatch swatch-negative"></span> Should NOT trigger</span>
<span class="legend-item"><span class="legend-swatch swatch-train"></span> Train</span>
<span class="legend-item"><span class="legend-swatch swatch-test"></span> Test</span>
</div>
""")

    # Table header
    html_parts.append("""
<div class="table-container">
<table>
<thead>
<tr>
<th>Iter</th>
<th>Train</th>
<th>Test</th>
<th class="query-col">Description</th>
""")

    # Column headers: train queries first, then test queries (different color).
    for qinfo in train_queries:
        polarity = "positive-col" if qinfo["should_trigger"] else "negative-col"
        html_parts.append(f' <th class="{polarity}">{html.escape(qinfo["query"])}</th>\n')
    for qinfo in test_queries:
        polarity = "positive-col" if qinfo["should_trigger"] else "negative-col"
        html_parts.append(f' <th class="test-col {polarity}">{html.escape(qinfo["query"])}</th>\n')
    html_parts.append(""" </tr>
</thead>
<tbody>
""")

    # Pick the row to highlight. With no iterations yet there is nothing to
    # highlight, and max() on an empty sequence would raise ValueError.
    if not history:
        best_iter = None
    elif test_queries:
        best_iter = max(history, key=lambda h: h.get("test_passed") or 0).get("iteration")
    else:
        best_iter = max(history, key=lambda h: h.get("train_passed", h.get("passed", 0))).get("iteration")

    def result_cell(result: dict, is_test: bool) -> str:
        # One table cell: pass/fail mark plus the triggers/runs rate.
        did_pass = result.get("pass", False)
        triggers = result.get("triggers", 0)
        runs = result.get("runs", 0)
        icon = "\u2713" if did_pass else "\u2717"
        css_class = "pass" if did_pass else "fail"
        cell_class = "result test-result" if is_test else "result"
        return f' <td class="{cell_class} {css_class}">{icon}<span class="rate">{esc(triggers)}/{esc(runs)}</span></td>\n'

    # One row per iteration.
    for h in history:
        iteration = h.get("iteration", "?")
        description = h.get("description", "")
        train_results = h.get("train_results", h.get("results", []))
        test_results = h.get("test_results", [])

        # Lookups so each column can find this iteration's result by query text.
        train_by_query = {r["query"]: r for r in train_results}
        test_by_query = {r["query"]: r for r in test_results} if test_results else {}

        train_correct, train_runs = _aggregate_runs(train_results)
        test_correct, test_runs = _aggregate_runs(test_results)
        train_class = _score_class(train_correct, train_runs)
        test_class = _score_class(test_correct, test_runs)
        row_class = "best-row" if iteration == best_iter else ""

        html_parts.append(f""" <tr class="{row_class}">
<td>{esc(iteration)}</td>
<td><span class="score {train_class}">{train_correct}/{train_runs}</span></td>
<td><span class="score {test_class}">{test_correct}/{test_runs}</span></td>
<td class="description">{esc(description)}</td>
""")
        for qinfo in train_queries:
            html_parts.append(result_cell(train_by_query.get(qinfo["query"], {}), False))
        for qinfo in test_queries:
            html_parts.append(result_cell(test_by_query.get(qinfo["query"], {}), True))
        html_parts.append(" </tr>\n")

    html_parts.append(""" </tbody>
</table>
</div>
""")
    html_parts.append("""
</body>
</html>
""")
    return "".join(html_parts)
def main():
    """CLI entry point: read loop output JSON and emit the HTML report.

    The input comes from a file path or from stdin when the path is ``-``.
    The report goes to ``--output`` when given, otherwise to stdout.
    """
    parser = argparse.ArgumentParser(description="Generate HTML report from run_loop output")
    parser.add_argument("input", help="Path to JSON output from run_loop.py (or - for stdin)")
    parser.add_argument("-o", "--output", default=None, help="Output HTML file (default: stdout)")
    parser.add_argument("--skill-name", default="", help="Skill name to include in the report title")
    args = parser.parse_args()

    if args.input == "-":
        data = json.load(sys.stdin)
    else:
        data = json.loads(Path(args.input).read_text(encoding="utf-8"))

    html_output = generate_html(data, skill_name=args.skill_name)

    if args.output:
        # The page declares charset utf-8, so the file must be written as
        # UTF-8 rather than in the locale's default encoding.
        Path(args.output).write_text(html_output, encoding="utf-8")
        print(f"Report written to {args.output}", file=sys.stderr)
    else:
        print(html_output)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,247 @@
#!/usr/bin/env python3
"""Improve a skill description based on eval results.
Takes eval results (from run_eval.py) and generates an improved description
by calling `claude -p` as a subprocess (same auth pattern as run_eval.py —
uses the session's Claude Code auth, no separate ANTHROPIC_API_KEY needed).
"""
import argparse
import json
import os
import re
import subprocess
import sys
from pathlib import Path
from scripts.utils import parse_skill_md
def _call_claude(prompt: str, model: str | None, timeout: int = 300) -> str:
    """Invoke ``claude -p`` once and return whatever it wrote to stdout.

    The prompt is fed on stdin instead of argv because it embeds the full
    SKILL.md body and can easily exceed comfortable argv length.

    Args:
        prompt: Text passed to the subprocess on stdin.
        model: Optional model name, forwarded as ``--model`` when truthy.
        timeout: Seconds passed to ``subprocess.run`` as its timeout.

    Returns:
        The subprocess's captured stdout.

    Raises:
        RuntimeError: If the subprocess exits with a non-zero status.
    """
    model_args = ["--model", model] if model else []
    command = ["claude", "-p", "--output-format", "text", *model_args]

    # Drop CLAUDECODE so `claude -p` can be nested inside a Claude Code
    # session. The guard is for interactive terminal conflicts; programmatic
    # subprocess usage is safe. Same pattern as run_eval.py.
    child_env = dict(os.environ)
    child_env.pop("CLAUDECODE", None)

    completed = subprocess.run(
        command,
        input=prompt,
        capture_output=True,
        text=True,
        env=child_env,
        timeout=timeout,
    )
    if completed.returncode == 0:
        return completed.stdout
    raise RuntimeError(
        f"claude -p exited {completed.returncode}\nstderr: {completed.stderr}"
    )
def _extract_description(response: str) -> str:
    """Return the text inside <new_description> tags, or the whole response if untagged."""
    match = re.search(r"<new_description>(.*?)</new_description>", response, re.DOTALL)
    raw = match.group(1) if match else response
    return raw.strip().strip('"')


def improve_description(
    skill_name: str,
    skill_content: str,
    current_description: str,
    eval_results: dict,
    history: list[dict],
    model: str,
    test_results: dict | None = None,
    log_dir: Path | None = None,
    iteration: int | None = None,
) -> str:
    """Call Claude to improve the description based on eval results.

    Args:
        skill_name: Name of the skill, quoted in the prompt.
        skill_content: Skill body embedded in the prompt for context.
        current_description: The description that produced ``eval_results``.
        eval_results: Dict with ``results`` (per-query records carrying
            ``query``, ``should_trigger``, ``pass``, ``triggers``, ``runs``)
            and ``summary`` (``passed`` / ``total``).
        history: Previous attempts, listed in the prompt so they are not
            repeated. Per-query results are read from ``results`` or, when
            that key is absent, from ``train_results``.
        model: Model name forwarded to ``_call_claude``.
        test_results: Optional held-out results; only the summary score is
            included in the prompt.
        log_dir: When given, a JSON transcript of the exchange is written
            there.
        iteration: Iteration number used in the transcript and log name.

    Returns:
        The new description. If the first answer exceeds 1024 characters,
        one follow-up call asks for a shorter rewrite and that result is
        returned as-is.
    """
    failed_triggers = [
        r for r in eval_results["results"]
        if r["should_trigger"] and not r["pass"]
    ]
    false_triggers = [
        r for r in eval_results["results"]
        if not r["should_trigger"] and not r["pass"]
    ]

    # Build scores summary
    train_score = f"{eval_results['summary']['passed']}/{eval_results['summary']['total']}"
    if test_results:
        test_score = f"{test_results['summary']['passed']}/{test_results['summary']['total']}"
        scores_summary = f"Train: {train_score}, Test: {test_score}"
    else:
        scores_summary = f"Train: {train_score}"

    prompt = f"""You are optimizing a skill description for a Claude Code skill called "{skill_name}". A "skill" is sort of like a prompt, but with progressive disclosure -- there's a title and description that Claude sees when deciding whether to use the skill, and then if it does use the skill, it reads the .md file which has lots more details and potentially links to other resources in the skill folder like helper files and scripts and additional documentation or examples.
The description appears in Claude's "available_skills" list. When a user sends a query, Claude decides whether to invoke the skill based solely on the title and on this description. Your goal is to write a description that triggers for relevant queries, and doesn't trigger for irrelevant ones.
Here's the current description:
<current_description>
"{current_description}"
</current_description>
Current scores ({scores_summary}):
<scores_summary>
"""
    if failed_triggers:
        prompt += "FAILED TO TRIGGER (should have triggered but didn't):\n"
        for r in failed_triggers:
            prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n'
        prompt += "\n"
    if false_triggers:
        prompt += "FALSE TRIGGERS (triggered but shouldn't have):\n"
        for r in false_triggers:
            prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n'
        prompt += "\n"
    if history:
        prompt += "PREVIOUS ATTEMPTS (do NOT repeat these — try something structurally different):\n\n"
        for h in history:
            train_s = f"{h.get('train_passed', h.get('passed', 0))}/{h.get('train_total', h.get('total', 0))}"
            test_s = f"{h.get('test_passed', '?')}/{h.get('test_total', '?')}" if h.get('test_passed') is not None else None
            score_str = f"train={train_s}" + (f", test={test_s}" if test_s else "")
            prompt += f'<attempt {score_str}>\n'
            prompt += f'Description: "{h["description"]}"\n'
            # Accept either key for per-query results, as the report
            # generator does; "results" wins when both are present.
            if "results" in h:
                attempt_results = h["results"]
            elif "train_results" in h:
                attempt_results = h["train_results"]
            else:
                attempt_results = None
            if attempt_results is not None:
                prompt += "Train results:\n"
                for r in attempt_results:
                    status = "PASS" if r["pass"] else "FAIL"
                    prompt += f' [{status}] "{r["query"][:80]}" (triggered {r["triggers"]}/{r["runs"]})\n'
            if h.get("note"):
                prompt += f'Note: {h["note"]}\n'
            prompt += "</attempt>\n\n"
    prompt += f"""</scores_summary>
Skill content (for context on what the skill does):
<skill_content>
{skill_content}
</skill_content>
Based on the failures, write a new and improved description that is more likely to trigger correctly. When I say "based on the failures", it's a bit of a tricky line to walk because we don't want to overfit to the specific cases you're seeing. So what I DON'T want you to do is produce an ever-expanding list of specific queries that this skill should or shouldn't trigger for. Instead, try to generalize from the failures to broader categories of user intent and situations where this skill would be useful or not useful. The reason for this is twofold:
1. Avoid overfitting
2. The list might get loooong and it's injected into ALL queries and there might be a lot of skills, so we don't want to blow too much space on any given description.
Concretely, your description should not be more than about 100-200 words, even if that comes at the cost of accuracy. There is a hard limit of 1024 characters — descriptions over that will be truncated, so stay comfortably under it.
Here are some tips that we've found to work well in writing these descriptions:
- The skill should be phrased in the imperative -- "Use this skill for" rather than "this skill does"
- The skill description should focus on the user's intent, what they are trying to achieve, vs. the implementation details of how the skill works.
- The description competes with other skills for Claude's attention — make it distinctive and immediately recognizable.
- If you're getting lots of failures after repeated attempts, change things up. Try different sentence structures or wordings.
I'd encourage you to be creative and mix up the style in different iterations since you'll have multiple opportunities to try different approaches and we'll just grab the highest-scoring one at the end.
Please respond with only the new description text in <new_description> tags, nothing else."""

    text = _call_claude(prompt, model)
    description = _extract_description(text)
    transcript: dict = {
        "iteration": iteration,
        "prompt": prompt,
        "response": text,
        "parsed_description": description,
        "char_count": len(description),
        "over_limit": len(description) > 1024,
    }

    # Safety net: the prompt already states the 1024-char hard limit, but if
    # the model blew past it anyway, make one fresh single-turn call that
    # quotes the too-long version and asks for a shorter rewrite. `claude -p`
    # is one-shot, so the prior output is inlined into the new prompt.
    if len(description) > 1024:
        shorten_prompt = (
            f"{prompt}\n\n"
            f"---\n\n"
            f"A previous attempt produced this description, which at "
            f"{len(description)} characters is over the 1024-character hard limit:\n\n"
            f'"{description}"\n\n'
            f"Rewrite it to be under 1024 characters while keeping the most "
            f"important trigger words and intent coverage. Respond with only "
            f"the new description in <new_description> tags."
        )
        shorten_text = _call_claude(shorten_prompt, model)
        shortened = _extract_description(shorten_text)
        transcript["rewrite_prompt"] = shorten_prompt
        transcript["rewrite_response"] = shorten_text
        transcript["rewrite_description"] = shortened
        transcript["rewrite_char_count"] = len(shortened)
        description = shortened

    transcript["final_description"] = description

    if log_dir:
        log_dir.mkdir(parents=True, exist_ok=True)
        # Compare against None so iteration 0 is not labelled "unknown".
        iteration_label = iteration if iteration is not None else "unknown"
        log_file = log_dir / f"improve_iter_{iteration_label}.json"
        log_file.write_text(json.dumps(transcript, indent=2))

    return description
def main():
    """CLI entry point: improve one description and print the result as JSON.

    Reads the eval results (and optionally a history file), asks
    ``improve_description`` for a new description, and prints a JSON object
    holding that description plus the history extended with the attempt
    that was just evaluated. Exits with status 1 if the skill directory has
    no SKILL.md.
    """
    parser = argparse.ArgumentParser(description="Improve a skill description based on eval results")
    parser.add_argument("--eval-results", required=True, help="Path to eval results JSON (from run_eval.py)")
    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
    parser.add_argument("--history", default=None, help="Path to history JSON (previous attempts)")
    parser.add_argument("--model", required=True, help="Model for improvement")
    parser.add_argument("--verbose", action="store_true", help="Print thinking to stderr")
    args = parser.parse_args()

    skill_path = Path(args.skill_path)
    if not (skill_path / "SKILL.md").exists():
        print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr)
        sys.exit(1)

    eval_results = json.loads(Path(args.eval_results).read_text())
    history = json.loads(Path(args.history).read_text()) if args.history else []

    name, _, content = parse_skill_md(skill_path)
    current_description = eval_results["description"]
    summary = eval_results["summary"]

    if args.verbose:
        print(f"Current: {current_description}", file=sys.stderr)
        print(f"Score: {summary['passed']}/{summary['total']}", file=sys.stderr)

    new_description = improve_description(
        skill_name=name,
        skill_content=content,
        current_description=current_description,
        eval_results=eval_results,
        history=history,
        model=args.model,
    )

    if args.verbose:
        print(f"Improved: {new_description}", file=sys.stderr)

    # The attempt just evaluated becomes the newest history entry.
    evaluated_attempt = {
        "description": current_description,
        "passed": summary["passed"],
        "failed": summary["failed"],
        "total": summary["total"],
        "results": eval_results["results"],
    }
    output = {
        "description": new_description,
        "history": history + [evaluated_attempt],
    }
    print(json.dumps(output, indent=2))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python3
"""
Skill Packager - Creates a distributable .skill file of a skill folder
Usage:
python utils/package_skill.py <path/to/skill-folder> [output-directory]
Example:
python utils/package_skill.py skills/public/my-skill
python utils/package_skill.py skills/public/my-skill ./dist
"""
import fnmatch
import sys
import zipfile
from pathlib import Path
from scripts.quick_validate import validate_skill
# Names that are never packaged, wherever they occur in the path.
EXCLUDE_DIRS = {"__pycache__", "node_modules"}
# Filename glob patterns that are never packaged.
EXCLUDE_GLOBS = {"*.pyc"}
# Exact filenames that are never packaged.
EXCLUDE_FILES = {".DS_Store"}
# Directories skipped only directly under the skill root, not when nested.
ROOT_EXCLUDE_DIRS = {"evals"}


def should_exclude(rel_path: Path) -> bool:
    """Decide whether a file should be left out of the packaged skill.

    Args:
        rel_path: Path relative to the skill folder's parent, so the first
            component is the skill folder name and the second (if present)
            is the first entry inside it.

    Returns:
        True when any path component is in ``EXCLUDE_DIRS``, when the
        second component is in ``ROOT_EXCLUDE_DIRS``, or when the final
        name is in ``EXCLUDE_FILES`` or matches a pattern in
        ``EXCLUDE_GLOBS``; False otherwise.
    """
    segments = rel_path.parts
    if not EXCLUDE_DIRS.isdisjoint(segments):
        return True

    # segments[1:2] is empty for one-component paths, so this is safe.
    if any(segment in ROOT_EXCLUDE_DIRS for segment in segments[1:2]):
        return True

    filename = rel_path.name
    if filename in EXCLUDE_FILES:
        return True
    for pattern in EXCLUDE_GLOBS:
        if fnmatch.fnmatch(filename, pattern):
            return True
    return False
def package_skill(skill_path, output_dir=None):
    """Package a skill folder into a ``.skill`` file (zip format).

    The folder is validated with ``validate_skill`` first. Files matched by
    ``should_exclude`` are skipped, as is the archive being written when
    the output location lies inside the skill folder.

    Args:
        skill_path: Path to the skill folder.
        output_dir: Optional output directory for the .skill file (defaults
            to the current directory). Created if it does not exist.

    Returns:
        Path to the created .skill file, or None if the folder is missing,
        is not a directory, lacks SKILL.md, fails validation, or the
        archive could not be written.
    """
    skill_path = Path(skill_path).resolve()

    # Validate skill folder exists
    if not skill_path.exists():
        print(f"❌ Error: Skill folder not found: {skill_path}")
        return None
    if not skill_path.is_dir():
        print(f"❌ Error: Path is not a directory: {skill_path}")
        return None

    # Validate SKILL.md exists
    skill_md = skill_path / "SKILL.md"
    if not skill_md.exists():
        print(f"❌ Error: SKILL.md not found in {skill_path}")
        return None

    # Run validation before packaging
    print("🔍 Validating skill...")
    valid, message = validate_skill(skill_path)
    if not valid:
        print(f"❌ Validation failed: {message}")
        print(" Please fix the validation errors before packaging.")
        return None
    print(f"{message}\n")

    # Determine output location
    skill_name = skill_path.name
    if output_dir:
        output_path = Path(output_dir).resolve()
        output_path.mkdir(parents=True, exist_ok=True)
    else:
        output_path = Path.cwd()
    skill_filename = output_path / f"{skill_name}.skill"
    archive_real = skill_filename.resolve()

    # Create the .skill file (zip format)
    try:
        with zipfile.ZipFile(skill_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
            # Walk through the skill directory, excluding build artifacts
            for file_path in skill_path.rglob('*'):
                if not file_path.is_file():
                    continue
                # If the output lives inside the skill folder, the walk also
                # finds the half-written archive; never zip it into itself.
                if file_path.resolve() == archive_real:
                    continue
                arcname = file_path.relative_to(skill_path.parent)
                if should_exclude(arcname):
                    print(f" Skipped: {arcname}")
                    continue
                zipf.write(file_path, arcname)
                print(f" Added: {arcname}")
        print(f"\n✅ Successfully packaged skill to: {skill_filename}")
        return skill_filename
    except Exception as e:
        # Top-level boundary for the CLI: report the failure and signal it
        # through the None return value instead of a traceback.
        print(f"❌ Error creating .skill file: {e}")
        return None
def main():
    """CLI entry point: package the skill folder named on the command line.

    Prints usage and exits with status 1 when no skill folder is given.
    Otherwise exits 0 if ``package_skill`` returns a path and 1 if not.
    """
    argv = sys.argv
    if len(argv) < 2:
        usage_lines = (
            "Usage: python utils/package_skill.py <path/to/skill-folder> [output-directory]",
            "\nExample:",
            " python utils/package_skill.py skills/public/my-skill",
            " python utils/package_skill.py skills/public/my-skill ./dist",
        )
        for line in usage_lines:
            print(line)
        sys.exit(1)

    skill_path = argv[1]
    # The output directory is an optional second positional argument.
    output_dir = argv[2] if len(argv) > 2 else None

    print(f"📦 Packaging skill: {skill_path}")
    if output_dir:
        print(f" Output directory: {output_dir}")
    print()

    packaged = package_skill(skill_path, output_dir)
    sys.exit(0 if packaged else 1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,103 @@
#!/usr/bin/env python3
"""
Quick validation script for skills - minimal version
"""
import sys
import os
import re
import yaml
from pathlib import Path
def validate_skill(skill_path):
    """Run basic validation on a skill directory.

    Checks that ``SKILL.md`` exists and starts with a YAML frontmatter block
    that parses to a dictionary, that only allowed keys are present, that
    ``name`` and ``description`` exist and are strings, and that non-empty
    values respect the naming and length rules below.

    Args:
        skill_path: Path to the skill directory (str or Path).

    Returns:
        A ``(valid, message)`` tuple: ``(True, "Skill is valid!")`` on
        success, otherwise ``(False, <reason>)`` for the first failed check.
    """
    skill_path = Path(skill_path)

    # Check SKILL.md exists
    skill_md = skill_path / 'SKILL.md'
    if not skill_md.exists():
        return False, "SKILL.md not found"

    # Read as UTF-8 explicitly; the locale default is platform-dependent
    # and can fail or mis-decode non-ASCII content.
    content = skill_md.read_text(encoding="utf-8")
    if not content.startswith('---'):
        return False, "No YAML frontmatter found"

    # Extract frontmatter
    match = re.match(r'^---\n(.*?)\n---', content, re.DOTALL)
    if not match:
        return False, "Invalid frontmatter format"
    frontmatter_text = match.group(1)

    # Parse YAML frontmatter
    try:
        frontmatter = yaml.safe_load(frontmatter_text)
        if not isinstance(frontmatter, dict):
            return False, "Frontmatter must be a YAML dictionary"
    except yaml.YAMLError as e:
        return False, f"Invalid YAML in frontmatter: {e}"

    # Define allowed properties
    ALLOWED_PROPERTIES = {'name', 'description', 'license', 'allowed-tools', 'metadata', 'compatibility'}

    # Only top-level keys are checked; nested keys under metadata are not.
    unexpected_keys = set(frontmatter.keys()) - ALLOWED_PROPERTIES
    if unexpected_keys:
        return False, (
            f"Unexpected key(s) in SKILL.md frontmatter: {', '.join(sorted(unexpected_keys))}. "
            f"Allowed properties are: {', '.join(sorted(ALLOWED_PROPERTIES))}"
        )

    # Check required fields
    if 'name' not in frontmatter:
        return False, "Missing 'name' in frontmatter"
    if 'description' not in frontmatter:
        return False, "Missing 'description' in frontmatter"

    # Extract name for validation
    name = frontmatter.get('name', '')
    if not isinstance(name, str):
        return False, f"Name must be a string, got {type(name).__name__}"
    name = name.strip()
    if name:
        # Check naming convention (kebab-case: lowercase with hyphens)
        if not re.match(r'^[a-z0-9-]+$', name):
            return False, f"Name '{name}' should be kebab-case (lowercase letters, digits, and hyphens only)"
        if name.startswith('-') or name.endswith('-') or '--' in name:
            return False, f"Name '{name}' cannot start/end with hyphen or contain consecutive hyphens"
        # Check name length (max 64 characters per spec)
        if len(name) > 64:
            return False, f"Name is too long ({len(name)} characters). Maximum is 64 characters."

    # Extract and validate description
    description = frontmatter.get('description', '')
    if not isinstance(description, str):
        return False, f"Description must be a string, got {type(description).__name__}"
    description = description.strip()
    if description:
        # Check for angle brackets
        if '<' in description or '>' in description:
            return False, "Description cannot contain angle brackets (< or >)"
        # Check description length (max 1024 characters per spec)
        if len(description) > 1024:
            return False, f"Description is too long ({len(description)} characters). Maximum is 1024 characters."

    # Validate compatibility field if present (optional)
    compatibility = frontmatter.get('compatibility', '')
    if compatibility:
        if not isinstance(compatibility, str):
            return False, f"Compatibility must be a string, got {type(compatibility).__name__}"
        if len(compatibility) > 500:
            return False, f"Compatibility is too long ({len(compatibility)} characters). Maximum is 500 characters."

    return True, "Skill is valid!"


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python quick_validate.py <skill_directory>")
        sys.exit(1)
    valid, message = validate_skill(sys.argv[1])
    print(message)
    sys.exit(0 if valid else 1)

View File

@@ -0,0 +1,310 @@
#!/usr/bin/env python3
"""Run trigger evaluation for a skill description.
Tests whether a skill's description causes Claude to trigger (read the skill)
for a set of queries. Outputs results as JSON.
"""
import argparse
import json
import os
import select
import subprocess
import sys
import time
import uuid
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path
from scripts.utils import parse_skill_md
def find_project_root() -> Path:
    """Locate the project root by searching upward from cwd for ``.claude/``.

    Mimics how Claude Code discovers its project root, so the command file
    we create ends up where claude -p will look for it.

    Returns:
        The nearest directory (the cwd itself or one of its ancestors) that
        contains a ``.claude`` directory, or the cwd when none does.
    """
    start = Path.cwd()
    # Nearest-first: the cwd, then each ancestor up to the filesystem root.
    candidates = (start, *start.parents)
    return next(
        (directory for directory in candidates if (directory / ".claude").is_dir()),
        start,
    )
def run_single_query(
query: str,
skill_name: str,
skill_description: str,
timeout: int,
project_root: str,
model: str | None = None,
) -> bool:
"""Run a single query and return whether the skill was triggered.
Creates a command file in .claude/commands/ so it appears in Claude's
available_skills list, then runs `claude -p` with the raw query.
Uses --include-partial-messages to detect triggering early from
stream events (content_block_start) rather than waiting for the
full assistant message, which only arrives after tool execution.
"""
# Parameters:
#   query             - prompt text passed to `claude -p`.
#   skill_name        - base name used for the temporary command file and its heading.
#   skill_description - description written into the command file's frontmatter.
#   timeout           - wall-clock budget in seconds for reading the stream.
#   project_root      - directory holding `.claude/`; also the subprocess cwd.
#   model             - optional value forwarded as `--model`.
# Returns True when a Skill/Read tool call mentioning the temporary command
# name is observed, False otherwise (including on timeout).
#
# A random 8-hex-digit suffix makes the command name unique per call, so
# finding `clean_name` in a tool input identifies this exact invocation.
unique_id = uuid.uuid4().hex[:8]
clean_name = f"{skill_name}-skill-{unique_id}"
project_commands_dir = Path(project_root) / ".claude" / "commands"
command_file = project_commands_dir / f"{clean_name}.md"
try:
project_commands_dir.mkdir(parents=True, exist_ok=True)
# Use YAML block scalar to avoid breaking on quotes in description
indented_desc = "\n ".join(skill_description.split("\n"))
command_content = (
f"---\n"
f"description: |\n"
f" {indented_desc}\n"
f"---\n\n"
f"# {skill_name}\n\n"
f"This skill handles: {skill_description}\n"
)
# NOTE(review): written with the platform default encoding; a non-ASCII
# description may need an explicit encoding="utf-8" here - confirm.
command_file.write_text(command_content)
cmd = [
"claude",
"-p", query,
"--output-format", "stream-json",
"--verbose",
"--include-partial-messages",
]
if model:
cmd.extend(["--model", model])
# Remove CLAUDECODE env var to allow nesting claude -p inside a
# Claude Code session. The guard is for interactive terminal conflicts;
# programmatic subprocess usage is safe.
env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}
# stderr is discarded; only the stream-json events on stdout are parsed.
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.DEVNULL,
cwd=project_root,
env=env,
)
triggered = False
start_time = time.time()
# Decoded stdout that has not yet been split into complete lines.
buffer = ""
# Track state for stream event detection
pending_tool_name = None
accumulated_json = ""
try:
while time.time() - start_time < timeout:
# Child already exited: collect whatever is left on the pipe and stop polling.
if process.poll() is not None:
remaining = process.stdout.read()
if remaining:
buffer += remaining.decode("utf-8", errors="replace")
break
# Wait at most 1 s for output so the timeout in the loop condition is
# re-checked regularly.
# NOTE(review): select() on a pipe is POSIX-only; on Windows select
# accepts sockets only - confirm the supported platforms.
ready, _, _ = select.select([process.stdout], [], [], 1.0)
if not ready:
continue
chunk = os.read(process.stdout.fileno(), 8192)
# An empty read means end-of-file on the pipe.
if not chunk:
break
buffer += chunk.decode("utf-8", errors="replace")
# Consume every complete line; a trailing partial line stays in the buffer.
while "\n" in buffer:
line, buffer = buffer.split("\n", 1)
line = line.strip()
if not line:
continue
# Lines that are not valid JSON are ignored.
try:
event = json.loads(line)
except json.JSONDecodeError:
continue
# Early detection via stream events
if event.get("type") == "stream_event":
se = event.get("event", {})
se_type = se.get("type", "")
if se_type == "content_block_start":
cb = se.get("content_block", {})
if cb.get("type") == "tool_use":
tool_name = cb.get("name", "")
# Only Skill/Read calls can reference the command file; start
# accumulating their streamed JSON input.
if tool_name in ("Skill", "Read"):
pending_tool_name = tool_name
accumulated_json = ""
else:
# A different tool was started: reported as "not triggered".
return False
elif se_type == "content_block_delta" and pending_tool_name:
delta = se.get("delta", {})
if delta.get("type") == "input_json_delta":
accumulated_json += delta.get("partial_json", "")
# Succeed as soon as the unique command name shows up in the partial input.
if clean_name in accumulated_json:
return True
elif se_type in ("content_block_stop", "message_stop"):
if pending_tool_name:
return clean_name in accumulated_json
if se_type == "message_stop":
return False
# Fallback: full assistant message
elif event.get("type") == "assistant":
message = event.get("message", {})
for content_item in message.get("content", []):
if content_item.get("type") != "tool_use":
continue
tool_name = content_item.get("name", "")
tool_input = content_item.get("input", {})
if tool_name == "Skill" and clean_name in tool_input.get("skill", ""):
triggered = True
elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""):
triggered = True
return triggered
elif event.get("type") == "result":
return triggered
finally:
# Clean up process on any exit path (return, exception, timeout)
if process.poll() is None:
process.kill()
process.wait()
return triggered
finally:
# Always remove the temporary command file created for this run.
if command_file.exists():
command_file.unlink()
def run_eval(
eval_set: list[dict],
skill_name: str,
description: str,
num_workers: int,
timeout: int,
project_root: Path,
runs_per_query: int = 1,
trigger_threshold: float = 0.5,
model: str | None = None,
) -> dict:
"""Run the full eval set and return results."""
results = []
with ProcessPoolExecutor(max_workers=num_workers) as executor:
future_to_info = {}
for item in eval_set:
for run_idx in range(runs_per_query):
future = executor.submit(
run_single_query,
item["query"],
skill_name,
description,
timeout,
str(project_root),
model,
)
future_to_info[future] = (item, run_idx)
query_triggers: dict[str, list[bool]] = {}
query_items: dict[str, dict] = {}
for future in as_completed(future_to_info):
item, _ = future_to_info[future]
query = item["query"]
query_items[query] = item
if query not in query_triggers:
query_triggers[query] = []
try:
query_triggers[query].append(future.result())
except Exception as e:
print(f"Warning: query failed: {e}", file=sys.stderr)
query_triggers[query].append(False)
for query, triggers in query_triggers.items():
item = query_items[query]
trigger_rate = sum(triggers) / len(triggers)
should_trigger = item["should_trigger"]
if should_trigger:
did_pass = trigger_rate >= trigger_threshold
else:
did_pass = trigger_rate < trigger_threshold
results.append({
"query": query,
"should_trigger": should_trigger,
"trigger_rate": trigger_rate,
"triggers": sum(triggers),
"runs": len(triggers),
"pass": did_pass,
})
passed = sum(1 for r in results if r["pass"])
total = len(results)
return {
"skill_name": skill_name,
"description": description,
"results": results,
"summary": {
"total": total,
"passed": passed,
"failed": total - passed,
},
}
def main():
    """Command-line entry point: evaluate one description and print JSON.

    Parses the CLI options, loads the eval set and the skill's SKILL.md,
    runs ``run_eval`` and prints its result as indented JSON on stdout.
    With ``--verbose``, progress and a per-query PASS/FAIL summary are
    written to stderr. Exits with status 1 when the skill directory has no
    SKILL.md.
    """
    parser = argparse.ArgumentParser(description="Run trigger evaluation for a skill description")
    parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file")
    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
    parser.add_argument("--description", default=None, help="Override description to test")
    parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers")
    parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds")
    parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query")
    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
    parser.add_argument("--model", default=None, help="Model to use for claude -p (default: user's configured model)")
    parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
    args = parser.parse_args()

    # JSON text is UTF-8 by specification. Decoding it explicitly keeps
    # non-ASCII queries intact on platforms whose default locale encoding is
    # not UTF-8 (e.g. GBK or cp1252 on Windows).
    eval_set = json.loads(Path(args.eval_set).read_text(encoding="utf-8"))

    skill_path = Path(args.skill_path)
    if not (skill_path / "SKILL.md").exists():
        print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr)
        sys.exit(1)

    # The full SKILL.md content is not needed here, only name + description.
    name, original_description, _ = parse_skill_md(skill_path)
    # An empty --description falls back to the one stored in SKILL.md.
    description = args.description or original_description
    project_root = find_project_root()

    if args.verbose:
        print(f"Evaluating: {description}", file=sys.stderr)

    output = run_eval(
        eval_set=eval_set,
        skill_name=name,
        description=description,
        num_workers=args.num_workers,
        timeout=args.timeout,
        project_root=project_root,
        runs_per_query=args.runs_per_query,
        trigger_threshold=args.trigger_threshold,
        model=args.model,
    )

    if args.verbose:
        summary = output["summary"]
        print(f"Results: {summary['passed']}/{summary['total']} passed", file=sys.stderr)
        for r in output["results"]:
            status = "PASS" if r["pass"] else "FAIL"
            rate_str = f"{r['triggers']}/{r['runs']}"
            print(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:70]}", file=sys.stderr)

    print(json.dumps(output, indent=2))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,328 @@
#!/usr/bin/env python3
"""Run the eval + improve loop until all pass or max iterations reached.
Combines run_eval.py and improve_description.py in a loop, tracking history
and returning the best description found. Supports train/test split to prevent
overfitting.
"""
import argparse
import json
import random
import sys
import tempfile
import time
import webbrowser
from pathlib import Path
from scripts.generate_report import generate_html
from scripts.improve_description import improve_description
from scripts.run_eval import find_project_root, run_eval
from scripts.utils import parse_skill_md
def split_eval_set(eval_set: list[dict], holdout: float, seed: int = 42) -> tuple[list[dict], list[dict]]:
    """Split eval set into train and test sets, stratified by should_trigger.

    Positive (``should_trigger`` truthy) and negative items are shuffled
    separately and a ``holdout`` fraction of each group goes to the test
    set, so both sets keep a mix of the two classes.

    Args:
        eval_set: Items carrying a "should_trigger" flag.
        holdout: Fraction of each group to reserve for the test set.
        seed: Seed for the shuffle; the same seed yields the same split.

    Returns:
        ``(train_set, test_set)``. At least one item of every non-empty
        group is placed in the test set, so a group with a single item
        contributes nothing to the train set.
    """
    # A private generator produces the same shuffle as seeding the module
    # level RNG, but does not reset the random state of the calling program.
    rng = random.Random(seed)

    # Separate by should_trigger
    trigger = [e for e in eval_set if e["should_trigger"]]
    no_trigger = [e for e in eval_set if not e["should_trigger"]]

    # Shuffle each group
    rng.shuffle(trigger)
    rng.shuffle(no_trigger)

    # Calculate split points
    n_trigger_test = max(1, int(len(trigger) * holdout))
    n_no_trigger_test = max(1, int(len(no_trigger) * holdout))

    # Split
    test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test]
    train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:]
    return train_set, test_set
def run_loop(
    eval_set: list[dict],
    skill_path: Path,
    description_override: str | None,
    num_workers: int,
    timeout: int,
    max_iterations: int,
    runs_per_query: int,
    trigger_threshold: float,
    holdout: float,
    model: str,
    verbose: bool,
    live_report_path: Path | None = None,
    log_dir: Path | None = None,
) -> dict:
    """Alternate between evaluating and rewriting a skill description.

    Every iteration scores the current description on the train and test
    queries in one ``run_eval`` batch and appends the outcome to the
    history. The loop ends when all train queries pass or when
    ``max_iterations`` is reached; otherwise ``improve_description`` is asked
    for a new candidate using train results only (test scores are removed
    from the history it sees).

    Args:
        eval_set: Items with "query" and "should_trigger" keys.
        skill_path: Directory containing SKILL.md.
        description_override: Starting description; when falsy the one parsed
            from SKILL.md is used.
        num_workers: Forwarded to ``run_eval``.
        timeout: Forwarded to ``run_eval``.
        max_iterations: Upper bound on evaluation rounds.
        runs_per_query: Forwarded to ``run_eval``.
        trigger_threshold: Forwarded to ``run_eval``.
        holdout: Test fraction given to ``split_eval_set``; a value that is
            not positive disables the split.
        model: Forwarded to ``run_eval`` and ``improve_description``.
        verbose: Print progress to stderr.
        live_report_path: When set, an auto-refreshing HTML report is
            rewritten there after every evaluation.
        log_dir: Forwarded to ``improve_description``.

    Returns:
        A dict with the exit reason, the original / best / final
        descriptions, the best scores, set sizes and the full history. The
        best iteration is chosen by test score, or by train score when
        there is no test set.
    """

    def tally(rows):
        # Pass/fail counts in the same shape as run_eval's "summary".
        passed_count = sum(1 for row in rows if row["pass"])
        row_count = len(rows)
        return {"passed": passed_count, "failed": row_count - passed_count, "total": row_count}

    def print_eval_stats(label, results, elapsed):
        # Run-level confusion matrix: every individual run counts once.
        pos = [r for r in results if r["should_trigger"]]
        neg = [r for r in results if not r["should_trigger"]]
        tp = sum(r["triggers"] for r in pos)
        fn = sum(r["runs"] for r in pos) - tp
        fp = sum(r["triggers"] for r in neg)
        tn = sum(r["runs"] for r in neg) - fp
        total = tp + tn + fp + fn
        precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0
        accuracy = (tp + tn) / total if total > 0 else 0.0
        print(f"{label}: {tp+tn}/{total} correct, precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)", file=sys.stderr)
        for r in results:
            status = "PASS" if r["pass"] else "FAIL"
            rate_str = f"{r['triggers']}/{r['runs']}"
            print(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}", file=sys.stderr)

    project_root = find_project_root()
    name, original_description, content = parse_skill_md(skill_path)
    current_description = description_override or original_description

    # Hold out part of the eval set unless the split is disabled.
    if holdout > 0:
        train_set, test_set = split_eval_set(eval_set, holdout)
        if verbose:
            print(f"Split: {len(train_set)} train, {len(test_set)} test (holdout={holdout})", file=sys.stderr)
    else:
        train_set, test_set = eval_set, []

    history = []
    exit_reason = "unknown"

    for iteration in range(1, max_iterations + 1):
        if verbose:
            print(f"\n{'='*60}", file=sys.stderr)
            print(f"Iteration {iteration}/{max_iterations}", file=sys.stderr)
            print(f"Description: {current_description}", file=sys.stderr)
            print(f"{'='*60}", file=sys.stderr)

        # Train and test queries share one batch so they run in parallel.
        combined = train_set + test_set
        eval_started = time.time()
        all_results = run_eval(
            eval_set=combined,
            skill_name=name,
            description=current_description,
            num_workers=num_workers,
            timeout=timeout,
            project_root=project_root,
            runs_per_query=runs_per_query,
            trigger_threshold=trigger_threshold,
            model=model,
        )
        eval_elapsed = time.time() - eval_started

        # Route each result back to its set by query text.
        train_queries_set = {q["query"] for q in train_set}
        train_rows, test_rows = [], []
        for row in all_results["results"]:
            bucket = train_rows if row["query"] in train_queries_set else test_rows
            bucket.append(row)

        train_summary = tally(train_rows)
        train_results = {"results": train_rows, "summary": train_summary}
        if test_set:
            test_summary = tally(test_rows)
            test_results = {"results": test_rows, "summary": test_summary}
        else:
            test_results = None
            test_summary = None

        record = {
            "iteration": iteration,
            "description": current_description,
            "train_passed": train_summary["passed"],
            "train_failed": train_summary["failed"],
            "train_total": train_summary["total"],
            "train_results": train_results["results"],
            "test_passed": test_summary["passed"] if test_summary else None,
            "test_failed": test_summary["failed"] if test_summary else None,
            "test_total": test_summary["total"] if test_summary else None,
            "test_results": test_results["results"] if test_results else None,
            # For backward compat with report generator
            "passed": train_summary["passed"],
            "failed": train_summary["failed"],
            "total": train_summary["total"],
            "results": train_results["results"],
        }
        history.append(record)

        # Refresh the live report, if one was requested.
        if live_report_path:
            partial_output = {
                "original_description": original_description,
                "best_description": current_description,
                "best_score": "in progress",
                "iterations_run": len(history),
                "holdout": holdout,
                "train_size": len(train_set),
                "test_size": len(test_set),
                "history": history,
            }
            live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name))

        if verbose:
            print_eval_stats("Train", train_results["results"], eval_elapsed)
            if test_summary:
                print_eval_stats("Test ", test_results["results"], 0)

        # Stop conditions: perfect train score first, then iteration budget.
        if train_summary["failed"] == 0:
            exit_reason = f"all_passed (iteration {iteration})"
            if verbose:
                print(f"\nAll train queries passed on iteration {iteration}!", file=sys.stderr)
            break
        if iteration == max_iterations:
            exit_reason = f"max_iterations ({max_iterations})"
            if verbose:
                print(f"\nMax iterations reached ({max_iterations}).", file=sys.stderr)
            break

        # Improve the description based on train results
        if verbose:
            print(f"\nImproving description...", file=sys.stderr)
        improve_started = time.time()
        # Hide every test_* field so the improver cannot fit the held-out set.
        blinded_history = []
        for past in history:
            blinded_history.append({k: v for k, v in past.items() if not k.startswith("test_")})
        new_description = improve_description(
            skill_name=name,
            skill_content=content,
            current_description=current_description,
            eval_results=train_results,
            history=blinded_history,
            model=model,
            log_dir=log_dir,
            iteration=iteration,
        )
        improve_elapsed = time.time() - improve_started
        if verbose:
            print(f"Proposed ({improve_elapsed:.1f}s): {new_description}", file=sys.stderr)
        current_description = new_description

    # Pick the best iteration by test score, or by train score when there is
    # no test set; ties resolve to the earliest iteration.
    if test_set:
        def held_out_score(h):
            return h["test_passed"] or 0

        best = max(history, key=held_out_score)
        best_score = f"{best['test_passed']}/{best['test_total']}"
    else:
        def train_score(h):
            return h["train_passed"]

        best = max(history, key=train_score)
        best_score = f"{best['train_passed']}/{best['train_total']}"

    if verbose:
        print(f"\nExit reason: {exit_reason}", file=sys.stderr)
        print(f"Best score: {best_score} (iteration {best['iteration']})", file=sys.stderr)

    return {
        "exit_reason": exit_reason,
        "original_description": original_description,
        "best_description": best["description"],
        "best_score": best_score,
        "best_train_score": f"{best['train_passed']}/{best['train_total']}",
        "best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None,
        "final_description": current_description,
        "iterations_run": len(history),
        "holdout": holdout,
        "train_size": len(train_set),
        "test_size": len(test_set),
        "history": history,
    }
def main():
    """Command-line entry point for the eval + improve optimization loop.

    Parses the CLI options, optionally opens a live HTML report in the
    browser, runs ``run_loop`` and prints its result as JSON on stdout.
    With ``--results-dir`` the JSON, the final report and the improvement
    logs are also stored in a timestamped subdirectory. Exits with status 1
    when the skill directory has no SKILL.md.
    """
    parser = argparse.ArgumentParser(description="Run eval + improve loop")
    parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file")
    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
    parser.add_argument("--description", default=None, help="Override starting description")
    parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers")
    parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds")
    parser.add_argument("--max-iterations", type=int, default=5, help="Max improvement iterations")
    parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query")
    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
    parser.add_argument("--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)")
    parser.add_argument("--model", required=True, help="Model for improvement")
    parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
    parser.add_argument("--report", default="auto", help="Generate HTML report at this path (default: 'auto' for temp file, 'none' to disable)")
    parser.add_argument("--results-dir", default=None, help="Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here")
    args = parser.parse_args()

    # JSON text is UTF-8 by specification. Decoding it explicitly keeps
    # non-ASCII queries intact on platforms whose default locale encoding is
    # not UTF-8 (e.g. GBK or cp1252 on Windows).
    eval_set = json.loads(Path(args.eval_set).read_text(encoding="utf-8"))

    skill_path = Path(args.skill_path)
    if not (skill_path / "SKILL.md").exists():
        print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr)
        sys.exit(1)

    # Only the skill name is needed here (for the report title).
    name, _, _ = parse_skill_md(skill_path)

    # Set up live report path
    if args.report != "none":
        if args.report == "auto":
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html"
        else:
            live_report_path = Path(args.report)
        # Open the report immediately so the user can watch
        live_report_path.write_text("<html><body><h1>Starting optimization loop...</h1><meta http-equiv='refresh' content='5'></body></html>")
        webbrowser.open(str(live_report_path))
    else:
        live_report_path = None

    # Determine output directory (create before run_loop so logs can be written)
    if args.results_dir:
        timestamp = time.strftime("%Y-%m-%d_%H%M%S")
        results_dir = Path(args.results_dir) / timestamp
        results_dir.mkdir(parents=True, exist_ok=True)
    else:
        results_dir = None
    log_dir = results_dir / "logs" if results_dir else None

    output = run_loop(
        eval_set=eval_set,
        skill_path=skill_path,
        description_override=args.description,
        num_workers=args.num_workers,
        timeout=args.timeout,
        max_iterations=args.max_iterations,
        runs_per_query=args.runs_per_query,
        trigger_threshold=args.trigger_threshold,
        holdout=args.holdout,
        model=args.model,
        verbose=args.verbose,
        live_report_path=live_report_path,
        log_dir=log_dir,
    )

    # Save JSON output
    json_output = json.dumps(output, indent=2)
    print(json_output)
    if results_dir:
        (results_dir / "results.json").write_text(json_output)

    # Write final HTML report (without auto-refresh)
    # NOTE(review): the HTML files are written with the platform default
    # encoding; if generate_html declares a UTF-8 charset these writes should
    # pass encoding="utf-8" as well - confirm against generate_report.py.
    if live_report_path:
        live_report_path.write_text(generate_html(output, auto_refresh=False, skill_name=name))
        print(f"\nReport: {live_report_path}", file=sys.stderr)
    if results_dir and live_report_path:
        (results_dir / "report.html").write_text(generate_html(output, auto_refresh=False, skill_name=name))
    if results_dir:
        print(f"Results saved to: {results_dir}", file=sys.stderr)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,47 @@
"""Shared utilities for skill-creator scripts."""
from pathlib import Path
def parse_skill_md(skill_path: Path) -> tuple[str, str, str]:
    """Parse a SKILL.md file, returning (name, description, full_content).

    Only the ``name:`` and ``description:`` keys of the leading ``---``
    delimited frontmatter are interpreted; this is a minimal line-based
    reader, not a YAML parser. A description introduced by a block scalar
    header (``>``, ``|`` with an optional ``-`` or ``+`` chomping indicator)
    is assembled from the indented lines that follow, joined with single
    spaces.

    Args:
        skill_path: Directory that contains the SKILL.md file.

    Returns:
        ``(name, description, content)`` where ``content`` is the whole file
        text. ``name`` / ``description`` are empty strings when the key is
        absent.

    Raises:
        ValueError: If the file does not start with ``---`` or the
            frontmatter is never closed by a second ``---`` line.
    """
    # Decode explicitly as UTF-8: skill files routinely contain non-ASCII
    # text, and the locale default (e.g. GBK on Windows) would garble it or
    # fail outright.
    content = (skill_path / "SKILL.md").read_text(encoding="utf-8")
    lines = content.split("\n")
    if lines[0].strip() != "---":
        raise ValueError("SKILL.md missing frontmatter (no opening ---)")

    end_idx = next(
        (i for i, line in enumerate(lines[1:], start=1) if line.strip() == "---"),
        None,
    )
    if end_idx is None:
        raise ValueError("SKILL.md missing frontmatter (no closing ---)")

    # YAML block scalar headers, including the keep-chomping ("+") forms.
    block_scalar_headers = (">", "|", ">-", "|-", ">+", "|+")

    name = ""
    description = ""
    frontmatter_lines = lines[1:end_idx]
    i = 0
    while i < len(frontmatter_lines):
        line = frontmatter_lines[i]
        if line.startswith("name:"):
            name = line[len("name:"):].strip().strip('"').strip("'")
        elif line.startswith("description:"):
            value = line[len("description:"):].strip()
            if value in block_scalar_headers:
                # Collect the indented continuation lines of the block scalar.
                continuation_lines: list[str] = []
                i += 1
                while i < len(frontmatter_lines) and frontmatter_lines[i].startswith((" ", "\t")):
                    continuation_lines.append(frontmatter_lines[i].strip())
                    i += 1
                description = " ".join(continuation_lines)
                # `i` already points at the first non-continuation line.
                continue
            description = value.strip('"').strip("'")
        i += 1
    return name, description, content

View File

@@ -0,0 +1,425 @@
---
name: test-case-generator
description: "Generate structured software test cases from requirements using a strict four-stage workflow: requirement analysis and test point identification, test case design, quality review and optimization, and final deduplication/output. Use this skill whenever the user asks to analyze requirements, identify test points, write test cases, review test cases, optimize QA cases, export JSON test cases, or ensure test case coverage is aligned with requirement documents."
---
# Test Case Generator
Use this skill to generate high-quality software test cases from requirement documents, PRDs, user stories, feature descriptions, API specifications, UI interaction descriptions, bug reports, or change notes.
The workflow has four required stages:
1. 需求分析与测试点识别
2. 测试用例设计
3. 质量审核与优化
4. 最终整理与输出
Core rule: requirement documents are the source of truth. Test points guide titles and coverage scope, but concrete test steps and expected results must come from the actual requirement content.
## When to use this skill
Use this skill when the user asks for any of the following:
- 生成测试用例、编写测试用例、设计测试用例
- 根据需求文档提取测试点
- 根据测试点生成 JSON 测试用例
- 审核、优化、补充测试用例
- 去重整合测试用例
- 输出可导入测试管理平台的测试用例集合
- 生成正常流程、异常流程、边界值、专项验证、回归测试用例
If the user only provides rough requirements, continue with explicit assumptions and mark missing information as open questions. Do not invent unsupported business behavior as if it were stated in the requirement.
## Global principles
- Requirement document = source for test steps, expected results, preconditions, data, and business behavior.
- Test point list = title and coverage guidance.
- One atomic scenario maps to one test case; do not compress multiple independent rules, validations, states, or exception branches into one large case.
- Test case count must equal atomic scenario count in the final output.
- Test case order must follow the module order and business flow order.
- Test case names must directly describe the atomic scenario, including normal, exception, boundary, permission, state, data, or integration context.
- Include exception, boundary, UI, permission, performance, compatibility, or security tests when the requirement explicitly supports them or when business risk clearly justifies them; uncertain details must be marked as open questions or “待确认”.
- Every step must be executable and every expected result must be verifiable.
- Prefer detailed coverage over overly terse output. For each explicit requirement point, identify all applicable normal, exception, boundary, data validation, state transition, permission, UI feedback, and persistence scenarios.
## Stage 1: 需求分析与测试点识别
Act as a requirement analyst. Deeply analyze the requirement document and identify all test points.
Important notes:
- The test points you provide will become the titles and coverage guidance for the test cases.
- The test case designer will use the test points to determine the testing scope, but the concrete test steps must be extracted directly from the requirement document.
- Therefore, the test point list must be complete, accurate, and structured to avoid missed coverage.
### Responsibilities
1. Deeply read the requirement document and understand business functions, operation flows, UI interactions, data processing, and constraints.
2. Identify functional modules according to business logic.
3. Extract all key requirement points for each functional module, including pages, fields, buttons, APIs, status transitions, messages, persistence, permissions, background jobs, and integrations.
4. Split each requirement point into atomic scenarios: normal flow, exception flow, boundary value, data validation, permission, state transition, UI feedback, persistence, and regression impact when applicable.
5. Output a structured atomic scenario checklist for test case design. Do not output only high-level module summaries.
### Test point identification strategy
#### Normal flow test points: must cover
Include:
- Main-flow test points for core business functions.
- Standard user operation paths.
- Basic UI interaction functions.
- Successful create, query, update, delete, submit, approve, export, import, sync, or callback flows when mentioned.
- Data persistence and state transition verification for each operation that changes data.
#### Exception and boundary value test points: include only when applicable
Only include exception and boundary test points when at least one of these conditions is true:
- The requirement explicitly mentions input restrictions, such as length, numeric range, format, type, uniqueness, required fields, allowed values, file size, file type, time range, count limit, or status limit.
- The requirement describes exception handling, such as error messages, alternative flows, failure states, retries, rollback, duplicate operations, expired data, invalid tokens, or missing permissions.
- The feature involves critical data processing, audit records, financial amounts, workflow state changes, or security-sensitive operations.
- The requirement is complex and contains multiple business rules or constraints.
Do not fabricate boundary or exception rules that are not supported by the requirement. If they are likely important but missing, list them as open questions instead of test points.
#### Special verification test points: include only when explicitly required
Only include these sections when the requirement explicitly mentions the relevant requirement:
- UI verification: display, interaction experience, responsive layout, visual state, or UI copy.
- Permission verification: user permissions, operation permissions, data permissions, roles, tenants, or scopes.
- Performance verification: response time, concurrency, throughput, resource usage, or capacity.
- Compatibility verification: browser, device, operating system, app version, or platform support.
- Security verification: data security, access security, transmission security, sensitive information, or attack prevention.
### Stage 1 output format
Use this exact structure:
```markdown
## 功能模块1:[模块名称]
### 正常流程测试点:
- [编号]、[测试点名称]:[简要说明]
- [编号]、[测试点名称]:[简要说明]
### 异常&边界值测试点:(仅在需求文档明确涉及时才包含此部分)
- [编号]、[测试点名称]:[简要说明]
- [编号]、[测试点名称]:[简要说明]
### 专项验证测试点:(仅在需求文档明确提及相关要求时才包含此部分)
- [编号]、[测试点名称]:[简要说明]
- [编号]、[测试点名称]:[简要说明]
```
If a section is not applicable, omit the section unless the user asks to show empty sections.
## Stage 2: 测试用例设计
Act as a test case designer. Write detailed test cases based on both the test point checklist and the original requirement document.
### Mandatory workflow
Strictly follow these steps:
1. Carefully read the original requirement document and understand business functions, operation flows, UI interactions, data processing, and details.
2. List all test points from the requirement analyst one by one.
3. Write test cases one by one in the exact order of the test points.
4. Write concrete test steps based on the requirement document's actual functional description.
5. Final-check count and order: test case count must equal test point count, and order must be exactly the same.
### Important principles
- Requirement analyst test points = case titles and testing scope guidance.
- Requirement document functional descriptions = source for concrete test steps.
- Never write test steps only from the test point name. Always return to the requirement document to find the corresponding functional details.
### Design principles
- Strictly follow test point order.
- One test point maps to one test case.
- Test point as title, requirement document as content.
- Steps must be concrete and executable, including what to click, what to input, which page to operate on, which API to call, or which data to prepare.
- Expected results must be verifiable, including visible results, state changes, data changes, messages, status codes, or persisted records.
### Before writing cases: count test points first
Before generating JSON, output a test point count section:
```markdown
测试点统计:
功能模块X:
- 正常流程测试点:[测试点1]、[测试点2]...共X个
- 异常&边界值测试点:[测试点1]、[测试点2]...共X个【如果需求分析师未提供此类测试点则显示"无"】
- 专项验证测试点:[测试点1]、[测试点2]...共X个【如果需求分析师未提供此类测试点则显示"无"】
总计:X个测试点,需要编写X个测试用例
```
### Stage 2 JSON output format
After the count section, generate test cases using this JSON shape:
```json
{
"业务模块名称": [
{
"ID": "用例编号",
"用例名称": "[测试点名称](直接使用需求分析师的测试点名称,不要修改)",
"所属模块": "业务模块名称",
"前置条件": "前置条件描述(基于需求文档的具体要求)",
"备注": "测试用例相关备注说明",
"步骤描述": "具体操作步骤1基于需求文档的功能描述\n具体操作步骤2基于需求文档的功能描述\n具体操作步骤3基于需求文档的功能描述",
"预期结果": "具体预期结果1基于需求文档的功能要求\n具体预期结果2基于需求文档的功能要求\n具体预期结果3基于需求文档的功能要求",
"编辑模式": "创建",
"标签": "功能测试",
"用例等级": "P1/P2/P3/P4/P5",
"用例状态": "待执行"
}
]
}
```
### Field rules
- ID should be stable and ordered, such as `TC-001`, `TC-002`, or the user's requested format.
- 用例名称 must exactly match the test point name.
- 所属模块 must match the functional module.
- 前置条件 must come from requirements or clearly stated assumptions.
- 步骤描述 must use newline-separated concrete operations.
- 预期结果 must use newline-separated concrete assertions.
- 编辑模式 defaults to `创建`.
- 标签 defaults to `功能测试` unless another type is clearly more appropriate.
- 用例等级 should reflect business priority and risk:
- P1: core/blocking flow or high-risk business function.
- P2: important normal/exception flow.
- P3: general validation or lower-risk branch.
- P4/P5: low-risk, compatibility, or optional verification when applicable.
- 用例状态 defaults to `待执行`.
## Stage 3: 质量审核与优化
Act as a test case reviewer. Perform a comprehensive review of the generated test cases.
### Core review principles
- Requirement document is the fundamental basis of the test cases.
- Each test step should be traceable to a functional description in the requirement document.
- Test points guide coverage scope and must all have corresponding test cases.
- Test steps must be concrete and executable, not abstract concepts.
### Review checklist
#### 1. Requirement basis check
Check:
- Can every test step be traced to a corresponding functional description in the requirement?
- Does the operation path match the business flow in the requirement?
- Are data input and output consistent with requirement specifications?
- Does UI interaction reflect the UI design or interaction description in the requirement?
- Are there any imagined steps detached from the requirement and based only on the test point name?
#### 2. Test point coverage check
Check:
- Quantity: does test case count equal test point count?
- Order: does test case order exactly match test point order?
- Normal flow: does every normal flow test point have a corresponding case?
- Exception and boundary: does every exception/boundary test point have a dedicated case?
- Special verification: are data, UI, permission, performance, compatibility, or security points covered when provided?
- Name consistency: does each test case name directly use the test point name?
#### 3. Test quality standard check
Check:
- Step specificity: do steps include concrete operations such as what to click, what to input, and where to operate?
- Result verifiability: are expected results clear enough to determine pass/fail?
- Preconditions completeness: are all required conditions before execution stated?
- Test data sufficiency: are required data and parameters clear?
### Stage 3 output format
Output a concise review report:
```markdown
## 质量审核结果
### 1. 需求文档依据性检查
- 结论:通过/需修改
- 问题:...
- 优化建议:...
### 2. 测试点覆盖度检查
- 测试点数量:X
- 测试用例数量:X
- 顺序一致性:一致/不一致
- 名称一致性:一致/不一致
- 缺失项:无/...
### 3. 测试质量标准检查
- 步骤具体性:通过/需修改
- 结果可验证性:通过/需修改
- 前置条件完整性:通过/需修改
- 数据准备充分性:通过/需修改
### 4. 需要修正的用例
- [ID] [用例名称]:问题与修正建议
```
If issues are found, revise the affected test cases before final output.
## Stage 4: 最终整理与输出
Act as a test case organizer. Perform final deduplication, integration, sorting, and complete output.
### Core responsibilities
1. Collect and extract all JSON-format test cases from previous design and review outputs.
2. Deduplicate and merge repeated or similar test cases so each test point has exactly one best case.
3. Sort the integrated test cases according to the functional module order and test point order from Stage 1.
4. Output a complete deduplicated test case collection.
### Deduplication and integration strategy
#### Same functional point
If multiple cases test the same functional point:
- Choose the version with the most detailed steps and clearest expected results.
- Merge useful steps and assertions from other versions if coverage is improved.
- Do not reduce coverage during merging.
#### Similar cases
If cases have similar objectives but different wording:
- Merge them into a more comprehensive case only if they correspond to the same test point.
- Preserve the core testing value of each case.
- Avoid creating functional testing blind spots.
### Final quality checklist
Before final output, verify:
- [ ] Test case count equals test point count.
- [ ] Test case order exactly matches test point order.
- [ ] Test case names directly use test point names.
- [ ] Every test case step is concrete and executable.
- [ ] Every test case can be traced to the requirement document.
- [ ] All requirement analyst test points are covered.
- [ ] Duplicate and redundant test cases are removed.
- [ ] Expected results are clear and verifiable.
### Stage 4 final output format
Output:
1. Final test point count summary.
2. Final deduplicated JSON test case collection.
3. Final quality checklist result.
4. Assumptions and open questions, if any.
Use this structure:
````markdown
## 最终测试点统计
- 功能模块A:X个
- 功能模块B:X个
- 总计:X个测试点,X个测试用例
## 最终测试用例JSON
```json
{
"业务模块名称": [
{
"ID": "TC-001",
"用例名称": "测试点名称",
"所属模块": "业务模块名称",
"前置条件": "...",
"备注": "...",
"步骤描述": "...",
"预期结果": "...",
"编辑模式": "创建",
"标签": "功能测试",
"用例等级": "P1",
"用例状态": "待执行"
}
]
}
```
## 最终质量检查
- 测试用例数量是否等于测试点数量:是/否
- 测试用例顺序是否与测试点顺序完全一致:是/否
- 测试用例名称是否直接使用测试点名称:是/否
- 步骤是否具体可操作:是/否
- 是否能追溯到需求文档:是/否
- 是否覆盖所有测试点:是/否
- 是否去除重复冗余:是/否
- 预期结果是否明确可验证:是/否
## 假设与待确认问题
- ...
````
## Default response behavior
When the user asks to generate test cases from a requirement, perform all four stages unless they explicitly request only one stage.
Recommended output order:
1. Stage 1 test point checklist.
2. Stage 2 test point count and JSON test cases.
3. Stage 3 quality review report and any corrections.
4. Stage 4 final deduplicated JSON output.
5. Assumptions and open questions.
If the requirement is very long, you may keep Stage 3 concise but must still perform count, order, name, traceability, and deduplication checks.
## Handling incomplete requirements
When requirements are incomplete:
- Continue generating useful cases based on available information.
- Clearly separate requirement-supported cases from assumptions.
- Do not add unsupported exception/boundary/special verification test points as confirmed facts.
- Put missing business rules, validation limits, permission rules, UI details, or error messages into open questions.
## API-specific adaptation
For API requirements:
- Steps may include method, path, headers, auth state, request body, and execution method.
- Expected results should include status code, response body, database side effects, idempotency, and error format when required.
- Boundary and exception tests should only use limits/rules specified by the API requirement or clearly marked assumptions.
## Frontend-specific adaptation
For frontend/UI requirements:
- Steps should identify page, control, action, input, and navigation path.
- Expected results should include visible UI text, component state, validation message, navigation result, and persisted data when required.
- UI-specific verification should only be included when UI display or interaction requirements are explicit.
## Backend/business-logic adaptation
For backend or business rules:
- Steps should identify data setup, operation trigger, business condition, and verification method.
- Expected results should include state transition, persisted records, downstream effects, event/message production, or rollback behavior when required.
## Example
Input requirement:
```markdown
登录页支持手机号+短信验证码登录。验证码 60 秒内不能重复发送,验证码 5 分钟有效,输错提示“验证码错误”。登录成功后跳转首页。
```
Expected behavior:
- Stage 1 identifies test points such as successful verification-code login, resend restriction within 60 seconds, expired code after 5 minutes, wrong code error, and successful redirect after login.
- Stage 2 outputs one JSON test case for each test point in the same order.
- Stage 3 checks that every step and expected result comes from the requirement.
- Stage 4 outputs deduplicated final JSON and confirms count/order/name consistency.

View File

@@ -0,0 +1,29 @@
{
"skill_name": "test-case-generator",
"evals": [
{
"id": 1,
"prompt": "根据以下需求生成测试用例:登录页支持手机号+短信验证码登录。验证码60秒内不能重复发送,验证码5分钟有效,输错提示“验证码错误”。登录成功后跳转首页。",
"expected_output": "应按四阶段输出:先识别测试点,再统计测试点数量并生成JSON测试用例,再进行质量审核,最后输出去重后的最终JSON。测试用例数量应等于测试点数量,名称应直接使用测试点名称,步骤和预期结果应来自需求。",
"files": []
},
{
"id": 2,
"prompt": "我有一份需求文档和需求分析师给出的测试点清单,请严格按测试点顺序生成JSON测试用例,并确保一个测试点对应一个用例。",
"expected_output": "应先列出测试点统计,明确各模块正常流程、异常&边界值、专项验证测试点数量;然后按测试点顺序输出JSON,每个用例名称直接使用测试点名称;最后检查数量、顺序、名称一致性。",
"files": []
},
{
"id": 3,
"prompt": "帮我审核这批测试用例:确认是否覆盖所有测试点,步骤是否来自需求文档,是否有重复用例,并输出最终整理后的版本。",
"expected_output": "应执行质量审核与最终整理:检查需求文档依据性、测试点覆盖度、步骤具体性、预期结果可验证性;识别重复或相似用例并合并;最终输出去重后的JSON和质量检查清单。",
"files": []
},
{
"id": 4,
"prompt": "根据这个订单接口需求生成测试用例:POST /api/orders 创建订单,请求包含商品ID、数量、收货地址ID;库存不足返回失败;重复提交不能创建两单;创建成功返回订单ID。",
"expected_output": "应适配API场景:测试步骤包含接口方法、路径、请求体、执行方式;预期结果包含状态码或响应内容、订单ID、库存不足失败、重复提交幂等或不重复创建。异常和边界测试点只基于需求明确内容或标记为待确认。",
"files": []
}
]
}

View File

@@ -0,0 +1,101 @@
---
name: boundary-case-testcase-generator
description: 从 PRD、需求文档、用户故事、功能说明、接口说明、UI 交互说明或业务规则中,生成、补充、优化和评审测试用例,尤其擅长边界值、极值、空值、越界、长度限制、状态切换、权限、时间、金额、枚举、组合条件等场景。只要用户要根据需求生成测试用例、完善测试点、补充边界场景、检查是否遗漏边界条件,就应优先使用此 Skill。
---
# 边界值测试用例生成
请将输入的 PRD、需求说明、业务规则、接口文档或 UI 说明,转换为适合入库的测试用例 JSON。
## 适用场景
当用户希望你:
- 根据 PRD 或需求文档生成测试用例
- 补充边界值、异常值、极限值测试点
- 评审某个功能是否遗漏关键测试场景
- 将模糊需求拆成更细的测试点
- 为接口、页面、流程、规则、权限生成详细用例
如果信息不足,不要自行编造需求;应使用“待确认”标记不明确部分,并继续围绕已知信息生成可验证的测试点。
## 任务目标
你的目标不是写总结,而是输出可直接入库的测试用例数据。重点是:
- 覆盖边界值、异常输入、临界状态和业务限制
- 用更细的粒度拆解测试点,避免一个用例覆盖太多内容
- 保持用例名称具体、步骤清晰、预期可检查
- 严格遵守固定 JSON 输出结构
## 分析步骤
### 1. 先识别需求对象
从输入中提取:
- 业务目标
- 功能模块与子模块
- 输入字段、参数、条件、状态
- 约束规则:长度、范围、格式、时间、金额、频率、并发、权限、依赖关系
- 成功路径与失败路径
### 2. 再提取边界点
优先寻找以下边界:
- 数值边界:最小值、最大值、临界值、0、负数、极大值、小数精度
- 字符边界:空字符串、仅空格、最短、最长、特殊字符、中文/英文/表情
- 集合边界:无数据、1条、最大条数、重复项、顺序变化
- 时间边界:起止时间、跨天、跨月、当前时间、过期、时区
- 状态边界:未开始/进行中/已完成/已失效/已取消
- 权限边界:未登录、无权限、角色切换、越权访问
- 组合边界:多个限制同时触发
### 3. 设计用例
每个用例都要尽量单一、具体、可执行:
- title 写成“具体场景 + 测试点”
- module_name 写明父模块/子模块/叶子模块
- precondition 写清楚前置条件;没有就写“待确认”
- steps 用数字编号逐行写
- expected_result 也必须逐行编号
- priority 默认按风险和影响判断;若无明确规则,优先级可保守取 2
- case_type 使用平台要求的枚举值;若无更多信息,按功能/边界验证类用例理解为 1
- tags 至少保留“AI生成”
### 4. 输出前自检
检查是否满足:
- 是否覆盖边界值与异常值
- 是否存在编造需求的内容
- 是否每条用例都足够具体
- 是否 steps 和 expected_result 都是编号格式
- 是否完全符合固定 JSON 结构
## 输出约束
必须只输出一个 JSON 对象,不要输出 Markdown、解释、代码块或多余文本。
固定输出结构如下:
{"cases":[{"title":"用例名称/测试点名称","module_name":"父模块/子模块/叶子模块","precondition":"前置条件","steps":"1. 步骤1\n2. 步骤2","expected_result":"1. 预期结果1\n2. 预期结果2","priority":2,"case_type":1,"tags":["AI生成"]}]}
## 编写原则
- 以需求为准,不臆造不存在的规则
- 细化到可执行层级,不要把多个独立场景揉成一个大用例
- 边界值优先于笼统的“正常/异常”描述
- 预期结果要可观察、可判断、可入库
- 当需求模糊时,保留“不确定项”,并围绕已知规则生成用例
## 示例
输入示例:
“用户在注册页输入手机号,手机号为 11 位,验证码 6 位,密码长度 8-20 位。”
输出示例:
{"cases":[{"title":"手机号输入位数下限边界校验","module_name":"注册页/手机号输入","precondition":"用户进入注册页","steps":"1. 在手机号输入框输入 10 位手机号\n2. 点击获取验证码或提交","expected_result":"1. 系统提示手机号格式不正确\n2. 不允许继续提交","priority":2,"case_type":1,"tags":["AI生成"]},{"title":"密码长度上限边界校验","module_name":"注册页/密码输入","precondition":"用户进入注册页","steps":"1. 在密码输入框输入 20 位密码\n2. 点击提交","expected_result":"1. 密码可正常通过长度校验\n2. 若其他字段满足要求,提交成功","priority":2,"case_type":1,"tags":["AI生成"]}]}
## 处理信息不足的方式
如果需求中没有明确说明:
- 是否允许空值
- 最大长度是多少
- 时间范围如何计算
- 是否支持特殊字符
- 权限边界如何定义
则在用例里明确写“待确认”,不要推测具体规则;同时继续生成基于已知条件的边界值测试点。

View File

@@ -0,0 +1,608 @@
# 测试 Skills 与业务规则接口文档
## 1. 基础说明
接口前缀:`/it/api`
统一响应:
```json
{
"code": 20000,
"msg": "success",
"data": {}
}
```
鉴权:需要登录态 token。
建议请求头:
```http
Authorization: Bearer ${token}
```
也兼容:
```http
accessToken: ${token}
```
***
## 2. 枚举
### 2.1 Skill 类型 skill\_type
| 值 | 含义 |
| -- | ------ |
| 1 | 通用测试策略 |
| 2 | 历史缺陷模式 |
| 3 | 边界场景 |
| 4 | 接口测试 |
| 5 | UI 测试 |
| 6 | 性能测试 |
| 7 | 安全测试 |
| 8 | 数据一致性 |
| 9 | 并发/幂等 |
| 99 | 其他 |
### 2.2 风险等级 risk\_level
| 值 | 含义 |
| - | ---- |
| 0 | 高风险 |
| 1 | 中高风险 |
| 2 | 中风险 |
| 3 | 低风险 |
### 2.3 业务规则优先级 priority
| 值 | 含义 |
| - | ----- |
| 0 | 高优先级 |
| 1 | 中高优先级 |
| 2 | 中优先级 |
| 3 | 低优先级 |
### 2.4 状态 status
| 值 | 含义 |
| - | -- |
| 1 | 启用 |
| 2 | 停用 |
| 3 | 草稿 |
***
## 3. Skill 接口
### 3.1 创建 Skill
```http
POST /it/api/skill/create
```
权限:`skill:create`
请求体:
```json
{
"projectId": 1,
"moduleId": 10,
"name": "支付金额边界校验",
"description": "用于支付金额相关需求的边界测试生成",
"skillType": 3,
"riskLevel": 0,
"tags": ["支付", "金额", "边界"],
"status": 1
}
```
参数:
| 字段 | 类型 | 必填 | 说明 |
| ------------- | ----------- | ---- | ---- |
| projectId | number | 是 | 项目 ID |
| moduleId | number | 否 | 模块 ID |
| name | string | 是 | Skill 名称 |
| description | string | 否 | 用户补充描述,会作为大模型生成 Skill 内容的输入 |
| skillType | number | 否 | Skill 类型,未传时默认 1;大模型也可能根据内容修正 |
| riskLevel | number | 否 | 风险等级,未传时默认 2;大模型也可能根据内容修正 |
| tags | string\[] | 否 | 初始标签数组,大模型可能补全 |
| status | number | 否 | 状态,默认 1 |
说明:
- `code` 不需要前端传,后端自动生成项目内唯一编码。
- `triggerCondition` 不需要前端传,后端默认使用当前 AI 生成用例的触发条件。
- `outputSpec` 不需要前端传,后端默认使用当前 AI 生成用例的输出规范。
- `reasoningPath` 不需要前端传,后端会调用大模型并根据 `config/skills/skill-creator/SKILL.md` 规则生成。
- `ownerId` 不需要前端传,后端默认取当前登录人 ID。
- 创建成功后,后端会在 `config/skills/{产品名称}/{项目名称}/{模块名称}/{Skill名称}/SKILL.md` 生成 Skill 文件。
- 数据库会保存生成文件路径到 `skill_file_path`
成功响应:
```json
{
"code": 20000,
"msg": "success",
"data": {
"id": 1
}
}
```
常见失败:
```json
{
"code": 40009,
"msg": "projectId、name 为必传参数"
}
```
```json
{
"code": 40009,
"msg": "AI生成 Skill 内容失败: xxx"
}
```
***
### 3.2 更新 Skill
```http
POST /it/api/skill/update
```
权限:`skill:update`
请求体:
```json
{
"skillId": 1,
"name": "支付金额边界校验",
"description": "更新后的描述",
"triggerCondition": "更新后的触发条件",
"reasoningPath": "更新后的推理路径",
"outputSpec": "更新后的输出规范",
"skillType": 3,
"riskLevel": 0,
"tags": ["支付", "金额", "边界", "参数校验"],
"status": 1,
"ownerId": 8
}
```
说明:
- 当前接口不支持更新 `code`
- 更新成功后,后端会根据更新后的 Skill 内容重新创建 `SKILL.md` 文件。
- 新文件路径会同步更新到数据库 `skill_file_path`
- 数据库更新成功后会删除原 Skill 文件夹。
- 如果数据库更新失败,后端会删除新创建的文件夹并保留旧数据库记录和旧文件,避免数据库与文件不一致。
成功响应:
```json
{
"code": 20000,
"msg": "success",
"data": {
"id": 1
}
}
```
***
### 3.3 删除 Skill
```http
POST /it/api/skill/delete
```
权限:`skill:delete`
请求体:
```json
{
"skillId": 1
}
```
说明:软删除,设置 `is_delete = 1`,并删除该 Skill 对应的 `config/skills/{产品名称}/{项目名称}/{模块名称}/{Skill名称}` 文件夹。
成功响应:
```json
{
"code": 20000,
"msg": "success",
"data": {
"id": 1
}
}
```
***
### 3.4 Skill 详情
```http
GET /it/api/skill/detail?skillId=1
```
权限:`skill:detail`
成功响应:
```json
{
"code": 20000,
"msg": "success",
"data": {
"id": 1,
"project_id": 1,
"module_id": 10,
"name": "支付金额边界校验",
"code": "PAY_AMOUNT_BOUNDARY",
"description": "用于支付金额相关需求的边界测试生成",
"trigger_condition": "需求中出现支付、金额、扣款、退款、余额等关键词时触发",
"reasoning_path": "识别金额字段构造最小值、最大值、0、负数、小数精度、超限金额等场景",
"output_spec": "必须覆盖正常金额、0元、负数、超大金额、小数精度、余额不足",
"skill_file_path": "D:\\zhyy\\effekt-interface\\config\\skills\\产品A\\项目A\\支付模块\\支付金额边界校验\\SKILL.md",
"skill_type": 3,
"risk_level": 0,
"tags": ["支付", "金额", "边界"],
"status": 1,
"owner_id": 8,
"created_by": 6,
"usage_count": 0,
"is_delete": 0,
"created_time": "2025-09-20 12:00:00",
"updated_time": "2025-09-20 12:00:00"
}
}
```
***
### 3.5 Skill 列表
```http
GET /it/api/skill/list
```
权限:`skill:list`
Query 参数:
| 参数 | 类型 | 必填 | 说明 |
| --------- | ------ | -- | ------------------------------------------- |
| pageNo | number | 否 | 页码,默认 1 |
| pageSize | number | 否 | 每页数量,默认 20 |
| projectId | number | 否 | 项目 ID |
| moduleId | number | 否 | 模块 ID |
| status | number | 否 | 状态 |
| skillType | number | 否 | Skill 类型 |
| riskLevel | number | 否 | 风险等级 |
| keyword | string | 否 | 搜索 name/code/description/trigger\_condition |
| tag | string | 否 | 单个标签过滤 |
请求示例:
```http
GET /it/api/skill/list?pageNo=1&pageSize=20&projectId=1&moduleId=10&keyword=&status=1
```
成功响应:
```json
{
"code": 20000,
"msg": "success",
"data": {
"list": [
{
"id": 1,
"project_id": 1,
"module_id": 10,
"name": "支付金额边界校验",
"code": "PAY_AMOUNT_BOUNDARY",
"description": "用于支付金额相关需求的边界测试生成",
"trigger_condition": "需求中出现支付、金额、扣款、退款、余额等关键词时触发",
"reasoning_path": "识别金额字段...",
"output_spec": "必须覆盖正常金额...",
"skill_file_path": "D:\\zhyy\\effekt-interface\\config\\skills\\产品A\\项目A\\支付模块\\支付金额边界校验\\SKILL.md",
"skill_type": 3,
"risk_level": 0,
"tags": ["支付", "金额", "边界"],
"status": 1,
"owner_id": 8,
"usage_count": 0,
"created_time": "2025-09-20 12:00:00",
"updated_time": "2025-09-20 12:00:00"
}
],
"total": 1
}
}
```
***
## 4. Business Rule 接口
### 4.1 创建业务规则
```http
POST /it/api/business-rule/create
```
权限:`business-rule:create`
请求体:
```json
{
"projectId": 1,
"moduleId": 10,
"name": "支付金额必须大于 0",
"description": "用于支付、充值、扣款、退款等金额输入场景的参数校验规则",
"priority": 0,
"tags": ["支付", "金额", "参数校验"],
"status": 1
}
```
参数:
| 字段 | 类型 | 必填 | 说明 |
| ----------- | ----------- | ---- | ---- |
| projectId | number | 是 | 项目 ID |
| moduleId | number | 否 | 模块 ID |
| name | string | 是 | 规则名称 |
| description | string | 否 | 用户补充描述,会作为大模型生成规则内容的输入 |
| priority | number | 否 | 优先级,默认 2;大模型也可能根据内容修正 |
| tags | string\[] | 否 | 初始标签数组,大模型可能补全 |
| status | number | 否 | 状态,默认 1 |
说明:
- `ruleCode` 不需要前端传,后端自动生成项目内唯一编码。
- `ruleContent`、`applicableScene`、`example` 不需要前端传,后端会调用大模型生成。
- `ownerId` 不需要前端传,后端默认取当前登录人 ID。
- 创建成功后,后端会在 `config/rules/{产品名称}/{项目名称}/{模块名称}/{规则名称}/RULE.md` 生成业务规则文件。
- 数据库会保存生成文件路径到 `rule_file_path`
成功响应:
```json
{
"code": 20000,
"msg": "success",
"data": {
"id": 1
}
}
```
***
### 4.2 更新业务规则
```http
POST /it/api/business-rule/update
```
权限:`business-rule:update`
请求体:
```json
{
"ruleId": 1,
"name": "支付金额必须大于 0",
"ruleContent": "支付金额必须大于 0,等于 0 或小于 0 时接口应返回参数错误",
"applicableScene": "支付、充值、扣款、退款金额输入",
"example": "amount=0,预期返回:金额必须大于0",
"priority": 0,
"tags": ["支付", "金额", "参数校验"],
"status": 1,
"ownerId": 8
}
```
说明:
- 当前接口不支持更新 `ruleCode`
- 更新成功后,后端会根据更新后的业务规则内容重新创建 `RULE.md` 文件。
- 新文件路径会同步更新到数据库 `rule_file_path`
- 数据库更新成功后会删除原业务规则文件夹。
- 如果数据库更新失败,后端会删除新创建的文件夹并保留旧数据库记录和旧文件,避免数据库与文件不一致。
成功响应:
```json
{
"code": 20000,
"msg": "success",
"data": {
"id": 1
}
}
```
***
### 4.3 删除业务规则
```http
POST /it/api/business-rule/delete
```
权限:`business-rule:delete`
请求体:
```json
{
"ruleId": 1
}
```
说明:软删除,设置 `is_delete = 1`,并删除该业务规则对应的 `config/rules/{产品名称}/{项目名称}/{模块名称}/{规则名称}` 文件夹。
***
### 4.4 业务规则详情
```http
GET /it/api/business-rule/detail?ruleId=1
```
权限:`business-rule:detail`
成功响应:
```json
{
"code": 20000,
"msg": "success",
"data": {
"id": 1,
"project_id": 1,
"module_id": 10,
"name": "支付金额必须大于 0",
"rule_code": "PAY_AMOUNT_GT_ZERO",
"rule_content": "支付金额必须大于 0,等于 0 或小于 0 时接口应返回参数错误",
"applicable_scene": "支付、充值、扣款、退款金额输入",
"example": "amount=0,预期返回:金额必须大于0",
"rule_file_path": "D:\\zhyy\\effekt-interface\\config\\rules\\产品A\\项目A\\支付模块\\支付金额必须大于 0\\RULE.md",
"priority": 0,
"tags": ["支付", "金额", "参数校验"],
"status": 1,
"owner_id": 8,
"created_by": 6,
"usage_count": 0,
"is_delete": 0,
"created_time": "2025-09-20 12:00:00",
"updated_time": "2025-09-20 12:00:00"
}
}
```
***
### 4.5 业务规则列表
```http
GET /it/api/business-rule/list
```
权限:`business-rule:list`
Query 参数:
| 参数 | 类型 | 必填 | 说明 |
| --------- | ------ | -- | -------------------------------------------------- |
| pageNo | number | 否 | 页码,默认 1 |
| pageSize | number | 否 | 每页数量,默认 20 |
| projectId | number | 否 | 项目 ID |
| moduleId | number | 否 | 模块 ID |
| status | number | 否 | 状态 |
| priority | number | 否 | 优先级 |
| keyword | string | 否 | 搜索 name/rule\_code/rule\_content/applicable\_scene |
| tag | string | 否 | 单个标签过滤 |
请求示例:
```http
GET /it/api/business-rule/list?pageNo=1&pageSize=20&projectId=1&moduleId=10&keyword=&status=1
```
成功响应:
```json
{
"code": 20000,
"msg": "success",
"data": {
"list": [
{
"id": 1,
"project_id": 1,
"module_id": 10,
"name": "支付金额必须大于 0",
"rule_code": "PAY_AMOUNT_GT_ZERO",
"rule_content": "支付金额必须大于 0,等于 0 或小于 0 时接口应返回参数错误",
"applicable_scene": "支付、充值、扣款、退款金额输入",
"example": "amount=0,预期返回:金额必须大于0",
"priority": 0,
"tags": ["支付", "金额", "参数校验"],
"status": 1,
"owner_id": 8,
"usage_count": 0,
"created_time": "2025-09-20 12:00:00",
"updated_time": "2025-09-20 12:00:00"
}
],
"total": 1
}
}
```
***
## 5. 调用示例
### 5.1 curl 创建 Skill
```bash
curl -X POST 'http://localhost:5010/it/api/skill/create' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer your-token' \
-d '{
"projectId": 1,
"name": "支付金额边界校验",
"description": "用于支付金额相关需求的边界测试生成",
"skillType": 3,
"riskLevel": 0,
"tags": ["支付", "金额", "边界"]
}'
```
### 5.2 curl 创建业务规则
```bash
curl -X POST 'http://localhost:5010/it/api/business-rule/create' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer your-token' \
-d '{
"projectId": 1,
"name": "支付金额必须大于 0",
"description": "用于支付、充值、扣款、退款等金额输入场景的参数校验规则",
"priority": 0,
"tags": ["支付", "金额", "参数校验"]
}'
```
***
## 6. 注意事项
1. 返回字段是后端数据库下划线风格,例如 `project_id`、`created_time`。
2. `tags` 是数组,创建和更新时必须传数组。
3. 删除是软删除。
4. Skill 的 `code` 和业务规则的 `ruleCode` 当前不支持更新。
5. 本次接口只做 Skills / Rules 管理,暂未接入 PRD AI 生成用例链路。

View File

@@ -0,0 +1,26 @@
-- Register the permissions and the menu entry for the document-source module.
-- Both inserts upsert on `code`, so the script can be re-run safely.
BEGIN;

-- 1. Document-source permissions.
-- On conflict, every descriptive column carried by this script is refreshed and
-- updated_time is bumped, so a re-run that changes a row is visible afterwards.
-- status / is_delete are deliberately left untouched on existing rows.
INSERT INTO public.permission (code, name, module, action, description, status, is_delete, created_time, updated_time) VALUES
    ('document:list', '文档源列表', 'document', 'list', '查看文档源列表', 1, 0, NOW(), NOW()),
    ('document:detail', '文档源详情', 'document', 'detail', '查看文档源详情', 1, 0, NOW(), NOW()),
    ('document:create', '文档源创建', 'document', 'create', '创建文档源', 1, 0, NOW(), NOW()),
    ('document:update', '文档源更新', 'document', 'update', '更新文档源', 1, 0, NOW(), NOW()),
    ('document:delete', '文档源删除', 'document', 'delete', '删除文档源', 1, 0, NOW(), NOW()),
    ('document:generate', '文档源生成用例', 'document', 'generate', '根据文档生成测试用例', 1, 0, NOW(), NOW()),
    ('document:import', '文档源导入用例', 'document', 'import', '导入生成的测试用例', 1, 0, NOW(), NOW())
ON CONFLICT (code) DO UPDATE SET
    name = EXCLUDED.name,
    module = EXCLUDED.module,
    action = EXCLUDED.action,
    description = EXCLUDED.description,
    updated_time = NOW();

-- 2. Document-source menu, attached as a child of the test-case menu (code = 'case').
-- If no active 'case' menu exists, the CTE is empty and nothing is inserted.
WITH case_menu AS (
    SELECT id FROM public.menu WHERE code = 'case' AND is_delete = 0
)
INSERT INTO public.menu (parent_id, name, code, type, path, component, icon, permission_code, sort, visible, status, is_delete, created_time, updated_time)
SELECT id, '文档源管理', 'document', 2, '/document', 'document/index', 'file-text', 'document:list', 10, 1, 1, 0, NOW(), NOW()
FROM case_menu
ON CONFLICT (code) DO UPDATE SET
    name = EXCLUDED.name,
    path = EXCLUDED.path,
    component = EXCLUDED.component,
    permission_code = EXCLUDED.permission_code,
    updated_time = NOW();

COMMIT;

View File

@@ -0,0 +1,5 @@
-- Add a status column to the module table (0 = pending confirmation, 1 = normal, 2 = deprecated).
-- PostgreSQL has no inline "COMMENT '...'" column clause (that is MySQL/MariaDB syntax),
-- so the column description is attached with a separate COMMENT ON COLUMN statement.
ALTER TABLE module ADD COLUMN IF NOT EXISTS status INTEGER DEFAULT 0;
COMMENT ON COLUMN module.status IS '0待确认1正常2弃用';
-- Backfill: mark all pre-existing rows as normal (1).
-- NOTE(review): this also matches rows that are legitimately 0 (pending confirmation),
-- so re-running the script after go-live would flip them to 1 — run once only.
UPDATE module SET status = 1 WHERE status IS NULL OR status = 0;

View File

@@ -0,0 +1,56 @@
-- 创建文档源表 document_source
-- 用于存储PRD文档PDF和飞书链接
BEGIN;
-- Document-source table: one row per PRD source (per the column comments in this
-- script, `source` holds a file path or a Feishu link and `content` caches parsed text).
-- NOTE(review): UNIQUE (source, is_delete) permits only one soft-deleted row per
-- source, so soft-deleting the same source a second time would violate the
-- constraint — confirm whether a partial unique index (WHERE is_delete = 0) is intended.
CREATE TABLE IF NOT EXISTS public.document_source (
    id            BIGSERIAL     PRIMARY KEY,
    product_id    BIGINT        NOT NULL,
    project_id    BIGINT        NOT NULL,
    type          SMALLINT      NOT NULL DEFAULT 1,
    source        VARCHAR(512)  NOT NULL,
    content       TEXT,
    version       INTEGER       NOT NULL DEFAULT 1,
    status        SMALLINT      NOT NULL DEFAULT 0,
    ai_model      VARCHAR(64),
    created_by    BIGINT,
    is_delete     INTEGER       NOT NULL DEFAULT 0,
    created_time  TIMESTAMP     NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_time  TIMESTAMP     NOT NULL DEFAULT CURRENT_TIMESTAMP,
    CONSTRAINT uk_document_source_source UNIQUE (source, is_delete)
);
COMMENT ON TABLE public.document_source IS '文档源表 - 存储PRD文档和飞书链接';
COMMENT ON COLUMN public.document_source.id IS '主键ID';
COMMENT ON COLUMN public.document_source.product_id IS '产品ID';
COMMENT ON COLUMN public.document_source.project_id IS '项目ID';
COMMENT ON COLUMN public.document_source.type IS '类型1-PDF文件2-飞书链接';
COMMENT ON COLUMN public.document_source.source IS '文件路径或飞书链接';
COMMENT ON COLUMN public.document_source.content IS '解析后的文本内容(缓存)';
COMMENT ON COLUMN public.document_source.version IS '版本号';
COMMENT ON COLUMN public.document_source.status IS '状态0-待解析1-已解析2-已生成用例';
COMMENT ON COLUMN public.document_source.ai_model IS '使用的AI模型';
COMMENT ON COLUMN public.document_source.created_by IS '创建人ID';
COMMENT ON COLUMN public.document_source.is_delete IS '0未删除1已删除';
COMMENT ON COLUMN public.document_source.created_time IS '创建时间';
COMMENT ON COLUMN public.document_source.updated_time IS '更新时间';
-- 为 test_case 表添加字段
ALTER TABLE public.test_case ADD COLUMN IF NOT EXISTS document_id BIGINT;
COMMENT ON COLUMN public.test_case.document_id IS '关联的文档源ID';
ALTER TABLE public.test_case ADD COLUMN IF NOT EXISTS document_version INTEGER;
COMMENT ON COLUMN public.test_case.document_version IS '关联的文档版本';
-- Add the foreign key test_case.document_id -> document_source.id.
-- ALTER TABLE ... ADD CONSTRAINT has no IF NOT EXISTS form, so the constraint is
-- looked up in pg_constraint first; otherwise a re-run of this script would fail
-- with "constraint already exists" and abort the surrounding transaction.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'fk_test_case_document_id'
          AND conrelid = 'public.test_case'::regclass
    ) THEN
        ALTER TABLE public.test_case
            ADD CONSTRAINT fk_test_case_document_id
            FOREIGN KEY (document_id) REFERENCES public.document_source(id);
    END IF;
END
$$;
-- 添加索引
CREATE INDEX IF NOT EXISTS idx_document_source_product_id ON public.document_source(product_id);
CREATE INDEX IF NOT EXISTS idx_document_source_project_id ON public.document_source(project_id);
CREATE INDEX IF NOT EXISTS idx_document_source_type ON public.document_source(type);
CREATE INDEX IF NOT EXISTS idx_document_source_status ON public.document_source(status);
CREATE INDEX IF NOT EXISTS idx_test_case_document_id ON public.test_case(document_id);
COMMIT;

View File

@@ -0,0 +1,345 @@
-- Skills / Business Rules / AI Generation Context 初始化脚本
-- 数据库PostgreSQL
-- Test Skill table. Rows are soft-deleted through is_delete; uniqueness of
-- (project_id, code) among non-deleted rows is enforced by the partial index
-- uk_test_skill_project_code created later in this script, not by the table itself.
CREATE TABLE IF NOT EXISTS test_skill (
    id                 BIGSERIAL     PRIMARY KEY,
    project_id         BIGINT        NOT NULL,
    module_id          BIGINT,
    name               VARCHAR(128)  NOT NULL,
    code               VARCHAR(64)   NOT NULL,
    description        TEXT,
    trigger_condition  TEXT          NOT NULL,
    reasoning_path     TEXT,
    output_spec        TEXT,
    skill_file_path    VARCHAR(512),
    skill_type         SMALLINT      NOT NULL DEFAULT 1,
    risk_level         SMALLINT      NOT NULL DEFAULT 2,
    tags               JSONB         NOT NULL DEFAULT '[]'::jsonb,
    status             SMALLINT      NOT NULL DEFAULT 1,
    owner_id           BIGINT,
    created_by         BIGINT,
    usage_count        INTEGER       NOT NULL DEFAULT 0,
    is_delete          INTEGER       NOT NULL DEFAULT 0,
    created_time       TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_time       TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
);
COMMENT ON TABLE test_skill IS '测试 Skills 表:沉淀面向 AI 用例生成的测试策略、历史缺陷模式、边界场景和测试经验';
COMMENT ON COLUMN test_skill.id IS '主键 ID';
COMMENT ON COLUMN test_skill.project_id IS '所属项目 ID';
COMMENT ON COLUMN test_skill.module_id IS '所属模块 ID空表示项目级通用 Skill';
COMMENT ON COLUMN test_skill.name IS 'Skill 名称';
COMMENT ON COLUMN test_skill.code IS 'Skill 编码,建议项目内唯一';
COMMENT ON COLUMN test_skill.description IS 'Skill 描述';
COMMENT ON COLUMN test_skill.trigger_condition IS '触发条件,描述什么需求或场景下应该使用该 Skill';
COMMENT ON COLUMN test_skill.reasoning_path IS '推理路径,指导 AI 如何分析需求并设计测试点';
COMMENT ON COLUMN test_skill.output_spec IS '输出规范,指导 AI 必须生成哪些类型或结构的用例';
COMMENT ON COLUMN test_skill.skill_file_path IS 'Skill 文件路径,指向 config/skills 下生成的 SKILL.md';
COMMENT ON COLUMN test_skill.skill_type IS 'Skill 类型1通用测试策略 2历史缺陷模式 3边界场景 4接口测试 5UI测试 6性能测试 7安全测试 8数据一致性 9并发幂等 99其他';
COMMENT ON COLUMN test_skill.risk_level IS '风险等级0高风险 1中高风险 2中风险 3低风险';
COMMENT ON COLUMN test_skill.tags IS '标签数组,例如 ["支付","金额","边界"]';
COMMENT ON COLUMN test_skill.status IS '状态1启用 2停用 3草稿';
COMMENT ON COLUMN test_skill.owner_id IS '负责人用户 ID';
COMMENT ON COLUMN test_skill.created_by IS '创建人用户 ID';
COMMENT ON COLUMN test_skill.usage_count IS '使用次数,后续 PRD 生成用例引用该 Skill 时累加';
COMMENT ON COLUMN test_skill.is_delete IS '软删除标记0未删除 1已删除';
COMMENT ON COLUMN test_skill.created_time IS '创建时间';
COMMENT ON COLUMN test_skill.updated_time IS '更新时间';
ALTER TABLE test_skill ADD COLUMN IF NOT EXISTS skill_file_path VARCHAR(512);
COMMENT ON COLUMN test_skill.skill_file_path IS 'Skill 文件路径,指向 config/skills 下生成的 SKILL.md';
CREATE UNIQUE INDEX IF NOT EXISTS uk_test_skill_project_code
ON test_skill(project_id, code)
WHERE is_delete = 0;
COMMENT ON INDEX uk_test_skill_project_code IS '同一项目下未删除 Skill 编码唯一';
CREATE INDEX IF NOT EXISTS idx_test_skill_project_module
ON test_skill(project_id, module_id)
WHERE is_delete = 0;
COMMENT ON INDEX idx_test_skill_project_module IS '按项目和模块查询 Skill';
CREATE INDEX IF NOT EXISTS idx_test_skill_status
ON test_skill(status)
WHERE is_delete = 0;
COMMENT ON INDEX idx_test_skill_status IS '按 Skill 状态过滤';
CREATE INDEX IF NOT EXISTS idx_test_skill_type
ON test_skill(skill_type)
WHERE is_delete = 0;
COMMENT ON INDEX idx_test_skill_type IS '按 Skill 类型过滤';
CREATE INDEX IF NOT EXISTS idx_test_skill_risk_level
ON test_skill(risk_level)
WHERE is_delete = 0;
COMMENT ON INDEX idx_test_skill_risk_level IS '按风险等级过滤';
CREATE INDEX IF NOT EXISTS idx_test_skill_created_time
ON test_skill(created_time DESC);
COMMENT ON INDEX idx_test_skill_created_time IS 'Skill 列表按创建时间排序';
CREATE INDEX IF NOT EXISTS idx_test_skill_file_path
ON test_skill(skill_file_path)
WHERE is_delete = 0;
COMMENT ON INDEX idx_test_skill_file_path IS '按 Skill 文件路径查询';
CREATE INDEX IF NOT EXISTS idx_test_skill_tags_gin
ON test_skill USING GIN(tags);
COMMENT ON INDEX idx_test_skill_tags_gin IS 'Skill 标签 JSONB GIN 索引';
-- Business-rule table. rule_code is nullable; the partial unique index
-- uk_test_business_rule_project_code (created later in this script) only covers
-- non-deleted rows whose rule_code is not NULL.
CREATE TABLE IF NOT EXISTS test_business_rule (
    id                BIGSERIAL     PRIMARY KEY,
    project_id        BIGINT        NOT NULL,
    module_id         BIGINT,
    name              VARCHAR(128)  NOT NULL,
    rule_code         VARCHAR(64),
    rule_content      TEXT          NOT NULL,
    applicable_scene  TEXT,
    example           TEXT,
    rule_file_path    VARCHAR(512),
    priority          SMALLINT      NOT NULL DEFAULT 2,
    tags              JSONB         NOT NULL DEFAULT '[]'::jsonb,
    status            SMALLINT      NOT NULL DEFAULT 1,
    owner_id          BIGINT,
    created_by        BIGINT,
    usage_count       INTEGER       NOT NULL DEFAULT 0,
    is_delete         INTEGER       NOT NULL DEFAULT 0,
    created_time      TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_time      TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
);
COMMENT ON TABLE test_business_rule IS '业务规则表:沉淀确定性的业务约束、参数校验规则和场景规则';
COMMENT ON COLUMN test_business_rule.id IS '主键 ID';
COMMENT ON COLUMN test_business_rule.project_id IS '所属项目 ID';
COMMENT ON COLUMN test_business_rule.module_id IS '所属模块 ID空表示项目级通用规则';
COMMENT ON COLUMN test_business_rule.name IS '业务规则名称';
COMMENT ON COLUMN test_business_rule.rule_code IS '业务规则编码,建议项目内唯一,可为空';
COMMENT ON COLUMN test_business_rule.rule_content IS '业务规则内容';
COMMENT ON COLUMN test_business_rule.applicable_scene IS '适用场景';
COMMENT ON COLUMN test_business_rule.example IS '规则示例';
COMMENT ON COLUMN test_business_rule.rule_file_path IS '业务规则文件路径,指向 config/rules 下生成的 RULE.md';
COMMENT ON COLUMN test_business_rule.priority IS '优先级0高 1中高 2中 3低';
ALTER TABLE test_business_rule ADD COLUMN IF NOT EXISTS rule_file_path VARCHAR(512);
COMMENT ON COLUMN test_business_rule.rule_file_path IS '业务规则文件路径,指向 config/rules 下生成的 RULE.md';
COMMENT ON COLUMN test_business_rule.tags IS '标签数组,例如 ["支付","金额","参数校验"]';
COMMENT ON COLUMN test_business_rule.status IS '状态1启用 2停用 3草稿';
COMMENT ON COLUMN test_business_rule.owner_id IS '负责人用户 ID';
COMMENT ON COLUMN test_business_rule.created_by IS '创建人用户 ID';
COMMENT ON COLUMN test_business_rule.usage_count IS '使用次数,后续 PRD 生成用例引用该规则时累加';
COMMENT ON COLUMN test_business_rule.is_delete IS '软删除标记0未删除 1已删除';
COMMENT ON COLUMN test_business_rule.created_time IS '创建时间';
COMMENT ON COLUMN test_business_rule.updated_time IS '更新时间';
CREATE UNIQUE INDEX IF NOT EXISTS uk_test_business_rule_project_code
ON test_business_rule(project_id, rule_code)
WHERE is_delete = 0 AND rule_code IS NOT NULL;
COMMENT ON INDEX uk_test_business_rule_project_code IS '同一项目下未删除业务规则编码唯一rule_code 为空时不参与唯一约束';
CREATE INDEX IF NOT EXISTS idx_test_business_rule_project_module
ON test_business_rule(project_id, module_id)
WHERE is_delete = 0;
COMMENT ON INDEX idx_test_business_rule_project_module IS '按项目和模块查询业务规则';
CREATE INDEX IF NOT EXISTS idx_test_business_rule_status
ON test_business_rule(status)
WHERE is_delete = 0;
COMMENT ON INDEX idx_test_business_rule_status IS '按业务规则状态过滤';
CREATE INDEX IF NOT EXISTS idx_test_business_rule_priority
ON test_business_rule(priority)
WHERE is_delete = 0;
COMMENT ON INDEX idx_test_business_rule_priority IS '按业务规则优先级过滤';
CREATE INDEX IF NOT EXISTS idx_test_business_rule_created_time
ON test_business_rule(created_time DESC);
COMMENT ON INDEX idx_test_business_rule_created_time IS '业务规则列表按创建时间排序';
CREATE INDEX IF NOT EXISTS idx_test_business_rule_file_path
ON test_business_rule(rule_file_path)
WHERE is_delete = 0;
COMMENT ON INDEX idx_test_business_rule_file_path IS '按业务规则文件路径查询';
CREATE INDEX IF NOT EXISTS idx_test_business_rule_tags_gin
ON test_business_rule USING GIN(tags);
COMMENT ON INDEX idx_test_business_rule_tags_gin IS '业务规则标签 JSONB GIN 索引';
-- Reference log linking an AI generation run to the Skill / business rule it used.
-- source_id is interpreted through source_type (see the column comments in this
-- script); no foreign key is declared on it. The table has no is_delete or
-- updated_time column.
CREATE TABLE IF NOT EXISTS test_ai_generation_context (
    id             BIGSERIAL     PRIMARY KEY,
    generation_id  BIGINT,
    project_id     BIGINT        NOT NULL,
    module_id      BIGINT,
    source_type    SMALLINT      NOT NULL,
    source_id      BIGINT        NOT NULL,
    source_name    VARCHAR(128),
    match_score    INTEGER       NOT NULL DEFAULT 0,
    created_time   TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
);
COMMENT ON TABLE test_ai_generation_context IS 'AI 生成上下文引用记录表:记录某次 PRD/AI 生成用例使用了哪些 Skill 或业务规则';
COMMENT ON COLUMN test_ai_generation_context.id IS '主键 ID';
COMMENT ON COLUMN test_ai_generation_context.generation_id IS 'AI 生成任务 ID兼容现有 PRD 生成功能的任务 ID';
COMMENT ON COLUMN test_ai_generation_context.project_id IS '项目 ID';
COMMENT ON COLUMN test_ai_generation_context.module_id IS '模块 ID';
COMMENT ON COLUMN test_ai_generation_context.source_type IS '来源类型1 Skill 2业务规则';
COMMENT ON COLUMN test_ai_generation_context.source_id IS '来源 IDSkill ID 或 Business Rule ID';
COMMENT ON COLUMN test_ai_generation_context.source_name IS '来源名称快照';
COMMENT ON COLUMN test_ai_generation_context.match_score IS '匹配分数';
COMMENT ON COLUMN test_ai_generation_context.created_time IS '创建时间';
CREATE INDEX IF NOT EXISTS idx_test_ai_generation_context_generation
ON test_ai_generation_context(generation_id);
COMMENT ON INDEX idx_test_ai_generation_context_generation IS '按 AI 生成任务 ID 查询上下文引用';
CREATE INDEX IF NOT EXISTS idx_test_ai_generation_context_project_module
ON test_ai_generation_context(project_id, module_id);
COMMENT ON INDEX idx_test_ai_generation_context_project_module IS '按项目和模块查询上下文引用';
CREATE INDEX IF NOT EXISTS idx_test_ai_generation_context_source
ON test_ai_generation_context(source_type, source_id);
COMMENT ON INDEX idx_test_ai_generation_context_source IS '按来源类型和来源 ID 查询上下文引用';
CREATE INDEX IF NOT EXISTS idx_test_ai_generation_context_created_time
ON test_ai_generation_context(created_time DESC);
COMMENT ON INDEX idx_test_ai_generation_context_created_time IS '上下文引用记录按创建时间排序';
-- Trigger function: stamps NEW.updated_time with the current timestamp and
-- returns the modified row so the write proceeds with the refreshed value.
CREATE OR REPLACE FUNCTION update_updated_time_column()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $$
BEGIN
    NEW.updated_time := CURRENT_TIMESTAMP;
    RETURN NEW;
END;
$$;
-- Row-level BEFORE UPDATE triggers that keep updated_time current on both tables.
-- Each trigger is dropped first so the script can be re-run.

DROP TRIGGER IF EXISTS trg_test_skill_updated_time ON test_skill;
CREATE TRIGGER trg_test_skill_updated_time
    BEFORE UPDATE ON test_skill
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_time_column();
COMMENT ON TRIGGER trg_test_skill_updated_time ON test_skill IS '自动维护 test_skill.updated_time';

DROP TRIGGER IF EXISTS trg_test_business_rule_updated_time ON test_business_rule;
CREATE TRIGGER trg_test_business_rule_updated_time
    BEFORE UPDATE ON test_business_rule
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_time_column();
COMMENT ON TRIGGER trg_test_business_rule_updated_time ON test_business_rule IS '自动维护 test_business_rule.updated_time';
-- Upsert the Skill and business-rule permissions. On a `code` conflict the row is
-- overwritten with the values below and forced back to status = 1 / is_delete = 0,
-- so a previously disabled or soft-deleted permission is re-activated.
INSERT INTO permission (code, name, module, action, description, status, is_delete)
VALUES
    ('skill:create', '创建测试Skill', 'skill', 'create', '创建测试 Skills', 1, 0),
    ('skill:update', '更新测试Skill', 'skill', 'update', '更新测试 Skills', 1, 0),
    ('skill:delete', '删除测试Skill', 'skill', 'delete', '软删除测试 Skills', 1, 0),
    ('skill:list', '查询测试Skill列表', 'skill', 'list', '查询测试 Skills 列表', 1, 0),
    ('skill:detail', '查询测试Skill详情', 'skill', 'detail', '查询测试 Skills 详情', 1, 0),
    ('business-rule:create', '创建业务规则', 'business-rule', 'create', '创建业务规则', 1, 0),
    ('business-rule:update', '更新业务规则', 'business-rule', 'update', '更新业务规则', 1, 0),
    ('business-rule:delete', '删除业务规则', 'business-rule', 'delete', '软删除业务规则', 1, 0),
    ('business-rule:list', '查询业务规则列表', 'business-rule', 'list', '查询业务规则列表', 1, 0),
    ('business-rule:detail', '查询业务规则详情', 'business-rule', 'detail', '查询业务规则详情', 1, 0)
ON CONFLICT (code) DO UPDATE SET
    name         = EXCLUDED.name,
    module       = EXCLUDED.module,
    action       = EXCLUDED.action,
    description  = EXCLUDED.description,
    status       = 1,
    is_delete    = 0,
    updated_time = CURRENT_TIMESTAMP;
-- Skills / Business Rules menu initialisation.
-- The new menus are mounted under the "测试平台" (test_platform) entry; this
-- statement creates that entry when it is missing.
-- NOTE(review): when test_platform already exists, its name, type, path,
-- component, icon and sort are overwritten with the values below and the row
-- is re-enabled; parent_id and permission_code are left untouched. Confirm
-- that refreshing an existing entry is intended.
INSERT INTO menu (
    parent_id, name, code, type, path, component,
    icon, permission_code, sort, visible, status, is_delete
)
VALUES (
    0, '测试平台', 'test_platform', 1, '/test-platform', 'Layout',
    'test', NULL, 2, 1, 1, 0
)
ON CONFLICT (code) DO UPDATE SET
    name         = EXCLUDED.name,
    type         = EXCLUDED.type,
    path         = EXCLUDED.path,
    component    = EXCLUDED.component,
    icon         = EXCLUDED.icon,
    sort         = EXCLUDED.sort,
    visible      = EXCLUDED.visible,    -- 1 in the row above
    status       = EXCLUDED.status,     -- 1 in the row above
    is_delete    = EXCLUDED.is_delete,  -- 0 in the row above
    updated_time = CURRENT_TIMESTAMP;
-- Upsert the "测试 Skills" menu entry (code skill_manage) underneath the
-- test_platform entry. The parent id is looked up at run time; if no
-- test_platform row exists, nothing is inserted.
-- On conflict the row is re-parented, refreshed and re-enabled.
INSERT INTO menu (
    parent_id, name, code, type, path, component,
    icon, permission_code, sort, visible, status, is_delete
)
SELECT
    parent.id, '测试 Skills', 'skill_manage', 2, '/test-platform/skills', 'test-platform/skills/index',
    'skill', 'skill:list', 20, 1, 1, 0
FROM menu AS parent
WHERE parent.code = 'test_platform'
LIMIT 1
ON CONFLICT (code) DO UPDATE SET
    parent_id       = EXCLUDED.parent_id,
    name            = EXCLUDED.name,
    type            = EXCLUDED.type,
    path            = EXCLUDED.path,
    component       = EXCLUDED.component,
    icon            = EXCLUDED.icon,
    permission_code = EXCLUDED.permission_code,
    sort            = EXCLUDED.sort,
    visible         = EXCLUDED.visible,    -- 1 in the row above
    status          = EXCLUDED.status,     -- 1 in the row above
    is_delete       = EXCLUDED.is_delete,  -- 0 in the row above
    updated_time    = CURRENT_TIMESTAMP;
-- Upsert the "业务规则" menu entry (code business_rule_manage) underneath the
-- test_platform entry. The parent id is looked up at run time; if no
-- test_platform row exists, nothing is inserted.
-- On conflict the row is re-parented, refreshed and re-enabled.
INSERT INTO menu (
    parent_id, name, code, type, path, component,
    icon, permission_code, sort, visible, status, is_delete
)
SELECT
    parent.id, '业务规则', 'business_rule_manage', 2, '/test-platform/business-rules', 'test-platform/business-rules/index',
    'rule', 'business-rule:list', 21, 1, 1, 0
FROM menu AS parent
WHERE parent.code = 'test_platform'
LIMIT 1
ON CONFLICT (code) DO UPDATE SET
    parent_id       = EXCLUDED.parent_id,
    name            = EXCLUDED.name,
    type            = EXCLUDED.type,
    path            = EXCLUDED.path,
    component       = EXCLUDED.component,
    icon            = EXCLUDED.icon,
    permission_code = EXCLUDED.permission_code,
    sort            = EXCLUDED.sort,
    visible         = EXCLUDED.visible,    -- 1 in the row above
    status          = EXCLUDED.status,     -- 1 in the row above
    is_delete       = EXCLUDED.is_delete,  -- 0 in the row above
    updated_time    = CURRENT_TIMESTAMP;
-- Upsert the five type-3 menu rows (create / update / delete / list / detail)
-- that hang under the skill_manage entry. Each row uses its permission code as
-- its menu code, has empty path/component and no icon.
-- If skill_manage does not exist, the cross join is empty and nothing is
-- inserted.
-- NOTE(review): unlike the page-menu upserts, icon is not refreshed in the
-- conflict branch -- confirm this is intended.
WITH parent_menu AS (
    SELECT id FROM menu WHERE code = 'skill_manage' LIMIT 1
),
buttons (name, code, sort) AS (
    VALUES
        ('新增', 'skill:create', 1),
        ('编辑', 'skill:update', 2),
        ('删除', 'skill:delete', 3),
        ('列表查询', 'skill:list', 4),
        ('详情', 'skill:detail', 5)
)
INSERT INTO menu (
    parent_id, name, code, type, path, component,
    icon, permission_code, sort, visible, status, is_delete
)
SELECT
    p.id, b.name, b.code, 3, '', '',
    NULL, b.code, b.sort, 1, 1, 0
FROM parent_menu AS p
CROSS JOIN buttons AS b
ON CONFLICT (code) DO UPDATE SET
    parent_id       = EXCLUDED.parent_id,
    name            = EXCLUDED.name,
    type            = EXCLUDED.type,
    path            = EXCLUDED.path,
    component       = EXCLUDED.component,
    permission_code = EXCLUDED.permission_code,
    sort            = EXCLUDED.sort,
    visible         = EXCLUDED.visible,    -- 1 for every row above
    status          = EXCLUDED.status,     -- 1 for every row above
    is_delete       = EXCLUDED.is_delete,  -- 0 for every row above
    updated_time    = CURRENT_TIMESTAMP;
-- Upsert the five type-3 menu rows (create / update / delete / list / detail)
-- that hang under the business_rule_manage entry. Each row uses its permission
-- code as its menu code, has empty path/component and no icon.
-- If business_rule_manage does not exist, the cross join is empty and nothing
-- is inserted.
-- NOTE(review): unlike the page-menu upserts, icon is not refreshed in the
-- conflict branch -- confirm this is intended.
WITH parent_menu AS (
    SELECT id FROM menu WHERE code = 'business_rule_manage' LIMIT 1
),
buttons (name, code, sort) AS (
    VALUES
        ('新增', 'business-rule:create', 1),
        ('编辑', 'business-rule:update', 2),
        ('删除', 'business-rule:delete', 3),
        ('列表查询', 'business-rule:list', 4),
        ('详情', 'business-rule:detail', 5)
)
INSERT INTO menu (
    parent_id, name, code, type, path, component,
    icon, permission_code, sort, visible, status, is_delete
)
SELECT
    p.id, b.name, b.code, 3, '', '',
    NULL, b.code, b.sort, 1, 1, 0
FROM parent_menu AS p
CROSS JOIN buttons AS b
ON CONFLICT (code) DO UPDATE SET
    parent_id       = EXCLUDED.parent_id,
    name            = EXCLUDED.name,
    type            = EXCLUDED.type,
    path            = EXCLUDED.path,
    component       = EXCLUDED.component,
    permission_code = EXCLUDED.permission_code,
    sort            = EXCLUDED.sort,
    visible         = EXCLUDED.visible,    -- 1 for every row above
    status          = EXCLUDED.status,     -- 1 for every row above
    is_delete       = EXCLUDED.is_delete,  -- 0 for every row above
    updated_time    = CURRENT_TIMESTAMP;