[G1-Sync] Manual knowledge update
This commit is contained in:
@@ -0,0 +1,324 @@
|
||||
---
|
||||
id: ai-eval-framework-deep
|
||||
title: LLM Eval Framework — Inspect / Promptfoo / Braintrust
|
||||
category: Coding
|
||||
status: draft
|
||||
source_trust_level: B
|
||||
verification_status: conceptual
|
||||
created_at: 2026-05-09
|
||||
updated_at: 2026-05-09
|
||||
tags: [ai, llm, eval, framework, vibe-coding]
|
||||
tech_stack: { language: "TS / Python", applicable_to: ["Backend"] }
|
||||
applied_in: []
|
||||
aliases: [Inspect AI, Promptfoo, Braintrust, LangSmith, Helicone, eval-driven development]
|
||||
---
|
||||
|
||||
# LLM Eval Framework
|
||||
|
||||
> Eval-driven development. **Inspect AI (UK AISI), Promptfoo (OSS), Braintrust (managed), LangSmith (LangChain)**. Dataset + scorer + 비교.
|
||||
|
||||
## 📖 핵심 개념
|
||||
- Dataset: input + expected.
|
||||
- Scorer: 채점 (exact / similarity / LLM judge).
|
||||
- Run: model × prompt × dataset.
|
||||
- Trace: 각 case 의 실행 추적.
|
||||
|
||||
## 💻 코드 패턴
|
||||
|
||||
### Inspect AI (Python, UK AISI)
|
||||
```python
|
||||
from inspect_ai import Task, task, eval
|
||||
from inspect_ai.dataset import Sample
|
||||
from inspect_ai.scorer import match
|
||||
from inspect_ai.solver import generate
|
||||
|
||||
@task
|
||||
def my_eval():
|
||||
return Task(
|
||||
dataset=[
|
||||
Sample(input='Capital of France?', target='Paris'),
|
||||
Sample(input='Capital of Korea?', target='Seoul'),
|
||||
],
|
||||
solver=[generate()],
|
||||
scorer=match(),
|
||||
)
|
||||
|
||||
# 실행
|
||||
eval(my_eval(), model='anthropic/claude-opus-4-7')
|
||||
```
|
||||
|
||||
→ AI safety 평가에 강점.
|
||||
|
||||
### Promptfoo (TS / OSS)
|
||||
```yaml
|
||||
# promptfooconfig.yaml
|
||||
description: "Customer support eval"
|
||||
|
||||
prompts:
|
||||
- "Answer the customer's question concisely:\n{{question}}"
|
||||
|
||||
providers:
|
||||
- openai:gpt-4o
|
||||
- openai:gpt-4o-mini
|
||||
- anthropic:claude-opus-4-7
|
||||
- anthropic:claude-haiku-4-5
|
||||
|
||||
tests:
|
||||
- vars: { question: "How do I reset my password?" }
|
||||
assert:
|
||||
- type: contains
|
||||
value: "/forgot-password"
|
||||
- type: llm-rubric
|
||||
value: "Provides clear step-by-step instructions"
|
||||
- type: latency
|
||||
threshold: 3000
|
||||
- type: cost
|
||||
threshold: 0.005
|
||||
- vars: { question: "Refund policy?" }
|
||||
assert:
|
||||
- type: contains-any
|
||||
value: ["30 days", "money back", "refund"]
|
||||
|
||||
defaultTest:
|
||||
options:
|
||||
cache: true
|
||||
```
|
||||
|
||||
```bash
|
||||
promptfoo eval
|
||||
promptfoo view # web UI 비교
|
||||
```
|
||||
|
||||
### Promptfoo programmatic
|
||||
```ts
|
||||
import { evaluate } from 'promptfoo';
|
||||
|
||||
const result = await evaluate({
|
||||
prompts: ['Answer: {{q}}'],
|
||||
providers: ['openai:gpt-4o'],
|
||||
tests: [
|
||||
{ vars: { q: 'capital of France' }, assert: [{ type: 'contains', value: 'Paris' }] },
|
||||
],
|
||||
});
|
||||
|
||||
console.log(result.results.filter((r) => r.success).length, '/', result.results.length);
|
||||
```
|
||||
|
||||
### Braintrust (managed, modern)
|
||||
```ts
|
||||
import { Eval } from 'braintrust';
|
||||
|
||||
await Eval('My Project', {
|
||||
data: () => [
|
||||
{ input: 'Capital of France?', expected: 'Paris' },
|
||||
{ input: 'Capital of Korea?', expected: 'Seoul' },
|
||||
],
|
||||
task: async (input) => {
|
||||
const r = await openai.chat.completions.create({...});
|
||||
return r.choices[0].message.content!;
|
||||
},
|
||||
scores: [
|
||||
Levenshtein,
|
||||
LLMClassifier({
|
||||
model: 'gpt-4o',
|
||||
criteria: 'Does the answer contain the correct city?',
|
||||
}),
|
||||
],
|
||||
});
|
||||
```
|
||||
|
||||
→ Web UI 자동 + 비교 + regression detection.
|
||||
|
||||
### LangSmith (LangChain)
|
||||
```ts
|
||||
import { Client } from 'langsmith';
|
||||
const client = new Client();
|
||||
|
||||
// Dataset
|
||||
await client.createExamples({
|
||||
inputs: [{ question: 'Capital?' }],
|
||||
outputs: [{ answer: 'Paris' }],
|
||||
datasetName: 'capitals',
|
||||
});
|
||||
|
||||
// Run + auto trace
|
||||
import { evaluate } from 'langsmith/evaluation';
|
||||
await evaluate(myAgent, {
|
||||
data: 'capitals',
|
||||
evaluators: [exactMatch],
|
||||
});
|
||||
```
|
||||
|
||||
### LLM-as-judge (rubric)
|
||||
```ts
|
||||
async function judge(input: string, output: string, criteria: string) {
|
||||
const r = await llm.complete({
|
||||
system: `You are a strict evaluator. Score 1-5 based on criteria.
|
||||
Output JSON: { "score": N, "reason": "..." }`,
|
||||
user: `Input: ${input}\nOutput: ${output}\nCriteria: ${criteria}`,
|
||||
response_format: { type: 'json_object' },
|
||||
});
|
||||
return JSON.parse(r);
|
||||
}
|
||||
|
||||
await Eval(...).addScore({
|
||||
name: 'helpful',
|
||||
scorer: ({ input, output }) => judge(input, output, 'Is it helpful and concise?'),
|
||||
});
|
||||
```
|
||||
|
||||
### Pairwise (A vs B)
|
||||
```ts
|
||||
async function pairwise(input: string, outA: string, outB: string) {
|
||||
const r = await llm.complete({
|
||||
user: `Compare A and B for query "${input}".\nA: ${outA}\nB: ${outB}\nWhich is better? JSON: { "winner": "A"|"B"|"tie", "reason": "..." }`,
|
||||
response_format: { type: 'json_object' },
|
||||
});
|
||||
return JSON.parse(r);
|
||||
}
|
||||
```
|
||||
|
||||
→ Absolute score 보다 pairwise 비교가 사람 판단과 더 잘 align 됨.
|
||||
|
||||
### Regression detection
|
||||
```ts
|
||||
// CI 안 baseline 비교
|
||||
const current = await runEval();
|
||||
const baseline = await loadBaseline();
|
||||
|
||||
if (current.score < baseline.score - 0.05) {
|
||||
console.error(`Regression: ${baseline.score} → ${current.score}`);
|
||||
process.exit(1);
|
||||
}
|
||||
```
|
||||
|
||||
```yaml
|
||||
# CI
|
||||
- name: LLM eval
|
||||
run: promptfoo eval --output report.json
|
||||
- name: Compare to baseline
|
||||
run: node scripts/regression-check.js report.json
|
||||
```
|
||||
|
||||
### Trace + debug
|
||||
```ts
|
||||
// LangSmith / Braintrust trace
|
||||
// 매 LLM call 의 input / output / token / latency / cost 자동 기록
|
||||
|
||||
// 실패 case → web UI 에서 step 별 inspect
|
||||
```
|
||||
|
||||
### Diverse dataset
|
||||
```
|
||||
- Edge cases (empty, very long, special chars)
|
||||
- Adversarial (prompt injection)
|
||||
- 다국어
|
||||
- Real production logs (sampled)
|
||||
- Synthetic (LLM 이 generate)
|
||||
```
|
||||
|
||||
### Synthetic data
|
||||
```ts
|
||||
async function generateTestCases(n: number) {
|
||||
const r = await llm.complete({
|
||||
user: `Generate ${n} customer support questions and ideal answers.
|
||||
Output JSON: { "cases": [{ "question": "...", "answer": "..." }] }`,
|
||||
response_format: { type: 'json_object' },
|
||||
});
|
||||
return JSON.parse(r).cases;
|
||||
}
|
||||
```
|
||||
|
||||
→ 빠른 dataset 시작.
|
||||
|
||||
### Metrics 종류
|
||||
```
|
||||
- Exact match (binary): yes / no
|
||||
- Levenshtein / similarity: 0-1
|
||||
- BLEU / ROUGE: text similarity
|
||||
- Semantic similarity: embedding cosine
|
||||
- LLM-as-judge: 1-5 또는 binary
|
||||
- Cost / latency: 비용 / 속도
|
||||
- Custom: domain-specific
|
||||
```
|
||||
|
||||
### Per-task vs holistic
|
||||
```
|
||||
Per-task: 각 case 의 score → average.
|
||||
Holistic: Overall quality (LLM judge).
|
||||
|
||||
→ 둘 다.
|
||||
```
|
||||
|
||||
### Live eval (production)
|
||||
```ts
|
||||
// 1% sampling — production traffic
|
||||
if (Math.random() < 0.01) {
|
||||
await sampleForEval(input, output);
|
||||
}
|
||||
|
||||
// Daily batch eval
|
||||
const samples = await db.evalSamples.recent(1000);
|
||||
await runEval(samples);
|
||||
```
|
||||
|
||||
→ Drift detection.
|
||||
|
||||
### Eval-driven workflow
|
||||
```
|
||||
1. Cases 수집 (production logs)
|
||||
2. Score 채점
|
||||
3. Eval 작성
|
||||
4. Baseline 측정
|
||||
5. Prompt / model / fine-tune 변경
|
||||
6. Eval 비교
|
||||
7. Better → ship. Worse → fix.
|
||||
```
|
||||
|
||||
### Cost-aware eval
|
||||
```ts
|
||||
// Model 비교 — 정확도 vs 비용
|
||||
const results = {
|
||||
'gpt-4o': { score: 0.92, cost: 0.005 },
|
||||
'gpt-4o-mini': { score: 0.85, cost: 0.0003 },
|
||||
'claude-haiku': { score: 0.88, cost: 0.0008 },
|
||||
};
|
||||
|
||||
// $/quality 점수
|
||||
```
|
||||
|
||||
### Anthropic Tool — Skills + Eval
|
||||
```
|
||||
.claude/skills/customer-support/eval.yaml
|
||||
→ 매 PR 마다 자동 eval 실행.
|
||||
```
|
||||
|
||||
## 🤔 의사결정 기준
|
||||
| 상황 | 추천 |
|
||||
|---|---|
|
||||
| OSS / 빠른 시작 | Promptfoo |
|
||||
| Agent / 복잡 trace | Braintrust / LangSmith |
|
||||
| Safety eval | Inspect AI |
|
||||
| Self-host | Promptfoo |
|
||||
| Quick A/B | Promptfoo CLI |
|
||||
| Production observability | LangSmith / Helicone |
|
||||
|
||||
## ❌ 안티패턴
|
||||
- **Eval 없는 변경**: 회귀.
|
||||
- **소수 case 만 사용 (5개 정도)**: variance 가 큼. 50개 이상 권장.
|
||||
- **LLM-as-judge 에 평가 대상과 같은 모델 사용**: 자기 편향 (self-preference bias).
|
||||
- **Test set leak (training)**: 거짓 점수.
|
||||
- **Cost / latency 무시**: 정확도만 보면 비싸짐.
|
||||
- **CI 통합 안 함**: regression 을 자동으로 검출할 수 없음.
|
||||
- **Production live data 를 sampling 없이 전수 eval**: 비용 폭증.
|
||||
|
||||
## 🤖 LLM 활용 힌트
|
||||
- Promptfoo = OSS 빠른 시작.
|
||||
- Braintrust / LangSmith = production observability.
|
||||
- Pairwise > absolute.
|
||||
- Regression detection CI.
|
||||
|
||||
## 🔗 관련 문서
|
||||
- [[AI_LLM_Eval_Patterns]]
|
||||
- [[AI_Prompt_Engineering_Patterns]]
|
||||
- [[AI_LLM_Cost_Optimization]]
|
||||
Reference in New Issue
Block a user