[G1-Sync] Manual knowledge update
This commit is contained in:
@@ -0,0 +1,398 @@
|
||||
---
|
||||
id: ai-synthetic-data
|
||||
title: Synthetic Data — LLM 으로 train / test / fixture
|
||||
category: Coding
|
||||
status: draft
|
||||
source_trust_level: B
|
||||
verification_status: conceptual
|
||||
created_at: 2026-05-09
|
||||
updated_at: 2026-05-09
|
||||
tags: [ai, synthetic-data, vibe-coding]
|
||||
tech_stack: { language: "TS / Python", applicable_to: ["Backend"] }
|
||||
applied_in: []
|
||||
aliases: [synthetic data, LLM-generated data, test fixtures, data augmentation, anonymization]
|
||||
---
|
||||
|
||||
# Synthetic Data
|
||||
|
||||
> LLM 가 fake data 생성. **Test fixture, ML training, 사용자 demo, anonymization**. Real data privacy / cost / scale 우회.
|
||||
|
||||
## 📖 핵심 개념
|
||||
- Generation: LLM 이 schema 에 따라 data 를 생성.
|
||||
- Augmentation: 기존 data 의 변형.
|
||||
- Anonymization: PII 제거 + realistic 유지.
|
||||
- Distillation: 큰 model 의 출력으로 작은 model 을 training.
|
||||
|
||||
## 💻 코드 패턴
|
||||
|
||||
### LLM 으로 fixture 생성
|
||||
```ts
|
||||
import { z } from 'zod';
|
||||
import OpenAI from 'openai';
|
||||
import { zodResponseFormat } from 'openai/helpers/zod';
|
||||
|
||||
const User = z.object({
|
||||
email: z.string().email(),
|
||||
name: z.string(),
|
||||
bio: z.string().max(200),
|
||||
interests: z.array(z.string()).max(5),
|
||||
age: z.number().int().min(18).max(80),
|
||||
});
|
||||
|
||||
async function generateUsers(count: number): Promise<z.infer<typeof User>[]> {
|
||||
const r = await openai.beta.chat.completions.parse({
|
||||
model: 'gpt-4o-mini',
|
||||
messages: [
|
||||
{ role: 'system', content: 'Generate diverse, realistic test user profiles. Vary demographics, names, bios.' },
|
||||
{ role: 'user', content: `Generate ${count} users.` },
|
||||
],
|
||||
response_format: zodResponseFormat(z.object({ users: z.array(User) }), 'users'),
|
||||
});
|
||||
return r.choices[0].message.parsed!.users;
|
||||
}
|
||||
|
||||
const users = await generateUsers(50);
|
||||
```
|
||||
|
||||
→ Faker.js 보다 realistic.
|
||||
|
||||
### Diverse generation
|
||||
```ts
|
||||
// 단순 — 비슷한 데이터 자주
|
||||
// Better — diversity prompt
|
||||
|
||||
const prompts = [
|
||||
'Generate users from different countries',
|
||||
'Generate users with different age groups',
|
||||
'Generate users with different income levels',
|
||||
];
|
||||
|
||||
const all: User[] = [];
|
||||
for (const prompt of prompts) {
|
||||
const batch = await generateWithPrompt(prompt, 20);
|
||||
all.push(...batch);
|
||||
}
|
||||
```
|
||||
|
||||
### Schema-driven (any)
|
||||
```ts
|
||||
const Order = z.object({
|
||||
id: z.string().uuid(),
|
||||
userId: z.string().uuid(),
|
||||
items: z.array(z.object({
|
||||
productId: z.string().uuid(),
|
||||
quantity: z.number().int().positive(),
|
||||
price: z.number().positive(),
|
||||
})).min(1).max(10),
|
||||
status: z.enum(['pending', 'paid', 'shipped', 'delivered', 'cancelled']),
|
||||
createdAt: z.string().datetime(),
|
||||
});
|
||||
|
||||
const orders = await generateFromSchema(Order, 100);
|
||||
```
|
||||
|
||||
### Faker.js (deterministic, fast)
|
||||
```ts
|
||||
import { faker } from '@faker-js/faker';
|
||||
|
||||
faker.seed(42); // deterministic
|
||||
|
||||
const user = {
|
||||
id: faker.string.uuid(),
|
||||
name: faker.person.fullName(),
|
||||
email: faker.internet.email(),
|
||||
address: {
|
||||
street: faker.location.streetAddress(),
|
||||
city: faker.location.city(),
|
||||
zip: faker.location.zipCode(),
|
||||
},
|
||||
};
|
||||
```
|
||||
|
||||
→ 빠름, 일관, but 패턴 명확 (LLM 보다 less realistic).
|
||||
|
||||
### Hybrid (Faker + LLM)
|
||||
```ts
|
||||
// Faker = structure (id, email, address)
|
||||
// LLM = creative (bio, review text)
|
||||
|
||||
const user = {
|
||||
id: faker.string.uuid(),
|
||||
email: faker.internet.email(),
|
||||
bio: await llm.generate('Write a 100-character bio for a freelance designer'),
|
||||
reviews: await llm.generate('Write 3 realistic product reviews'),
|
||||
};
|
||||
```
|
||||
|
||||
### Test database seed
|
||||
```ts
|
||||
async function seed() {
|
||||
await db.user.deleteMany();
|
||||
await db.order.deleteMany();
|
||||
|
||||
const users = await generateUsers(100);
|
||||
await db.user.createMany({ data: users });
|
||||
|
||||
const orders = await generateOrders(500, users.map(u => u.id));
|
||||
await db.order.createMany({ data: orders });
|
||||
|
||||
console.log(`Seeded ${users.length} users, ${orders.length} orders`);
|
||||
}
|
||||
```
|
||||
|
||||
```bash
|
||||
yarn seed
|
||||
```
|
||||
|
||||
→ Test environment 가 production-like.
|
||||
|
||||
### Anonymization (real → synthetic)
|
||||
```ts
|
||||
// Real user data → similar but anonymized
|
||||
async function anonymize(user: User): Promise<User> {
|
||||
const r = await llm.complete({
|
||||
system: 'Generate a realistic user profile similar to this one but with all PII changed.',
|
||||
user: `Original: ${JSON.stringify(user)}`,
|
||||
response_format: { type: 'json_object' },
|
||||
});
|
||||
return JSON.parse(r);
|
||||
}
|
||||
|
||||
// Or simpler — Faker
|
||||
function anonymize(user: User): User {
|
||||
return {
|
||||
...user,
|
||||
name: faker.person.fullName(),
|
||||
email: faker.internet.email(),
|
||||
phone: faker.phone.number(),
|
||||
// 비-PII keep (purchase history, preferences)
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
→ Test on prod-like data without exposure.
|
||||
|
||||
### ML training data augmentation
|
||||
```ts
|
||||
// Few-shot examples → 더 많은 generation
|
||||
async function augmentDataset(examples: Example[], targetSize: number) {
|
||||
const augmented: Example[] = [...examples];
|
||||
|
||||
while (augmented.length < targetSize) {
|
||||
const batch = await llm.generate({
|
||||
system: 'Generate similar examples to these, with variations.',
|
||||
user: examples.slice(0, 5).map(e => JSON.stringify(e)).join('\n'),
|
||||
response_format: { type: 'json_object' },
|
||||
});
|
||||
augmented.push(...JSON.parse(batch).examples);
|
||||
}
|
||||
|
||||
return augmented.slice(0, targetSize);
|
||||
}
|
||||
```
|
||||
|
||||
→ 100 examples → 1000.
|
||||
|
||||
### Distillation (big → small model)
|
||||
```ts
|
||||
// 1. Big model (GPT-4o) 가 답 생성
|
||||
// 2. (input, output) 쌍 = training data
|
||||
// 3. Small model (Llama 8B) fine-tune
|
||||
|
||||
async function generateTrainingData(inputs: string[]) {
|
||||
const data = [];
|
||||
for (const input of inputs) {
|
||||
const output = await openai.chat.completions.create({
|
||||
model: 'gpt-4o',
|
||||
messages: [{ role: 'user', content: input }],
|
||||
});
|
||||
data.push({ input, output: output.choices[0].message.content });
|
||||
}
|
||||
return data;
|
||||
}
|
||||
|
||||
// 그 후 fine-tune small model.
|
||||
```
|
||||
|
||||
→ Runtime cost ↓, quality 는 비슷.
|
||||
|
||||
### Edge case generation
|
||||
```ts
|
||||
async function generateEdgeCases(schema: any, count: number) {
|
||||
return await llm.generate({
|
||||
system: `Generate edge case test inputs based on this schema.
|
||||
Include: empty, very long, special chars, boundary values, unicode, malformed.`,
|
||||
user: JSON.stringify(schema),
|
||||
response_format: { type: 'json_object' },
|
||||
});
|
||||
}
|
||||
```
|
||||
|
||||
### Adversarial (security test)
|
||||
```ts
|
||||
async function generateAdversarial(target: string, count: number) {
|
||||
return await llm.generate({
|
||||
system: `Generate adversarial inputs for security testing.
|
||||
Include: SQL injection attempts, XSS, command injection, long strings, unicode tricks.`,
|
||||
user: `Target: ${target}`,
|
||||
});
|
||||
}
|
||||
```
|
||||
|
||||
→ Pen testing.
|
||||
|
||||
### Validation (synthetic 이 real 과 비슷한가?)
|
||||
```ts
|
||||
// Statistical check
|
||||
const realStats = computeStats(realData);
|
||||
const synthStats = computeStats(syntheticData);
|
||||
|
||||
// Distribution similarity (KS test, etc)
|
||||
expect(ksDistance(realStats, synthStats)).toBeLessThan(0.1);
|
||||
```
|
||||
|
||||
### Privacy guarantee
|
||||
```
|
||||
GDPR / HIPAA:
|
||||
- Synthetic data 는 individual 추적 (재식별) 이 불가능해야 함
|
||||
- Differential privacy 가 강한 보장
|
||||
|
||||
Tools:
|
||||
- gretel.ai
|
||||
- Mostly AI
|
||||
- YData
|
||||
```
|
||||
|
||||
### Use cases
|
||||
```
|
||||
✅ Test fixtures (unit / integration / e2e)
|
||||
✅ Demo / sandbox
|
||||
✅ Load test data
|
||||
✅ ML training augmentation
|
||||
✅ Privacy-preserving sharing
|
||||
✅ Edge case generation
|
||||
✅ Adversarial testing
|
||||
|
||||
❌ Production data 대체 (real distribution 다름)
|
||||
❌ Statistical analysis (bias)
|
||||
```
|
||||
|
||||
### LLM-as-judge (synthetic 검증)
|
||||
```ts
|
||||
async function evaluateSynthetic(real: any[], synthetic: any[]) {
|
||||
return await llm.complete({
|
||||
user: `Compare these two datasets:
|
||||
Real: ${JSON.stringify(real.slice(0, 10))}
|
||||
Synthetic: ${JSON.stringify(synthetic.slice(0, 10))}
|
||||
|
||||
Are they similar in style, distribution, realism? Score 1-10. Output JSON.`,
|
||||
response_format: { type: 'json_object' },
|
||||
});
|
||||
}
|
||||
```
|
||||
|
||||
### Cost
|
||||
```
|
||||
1000 records × 100 tokens × $5/1M = $0.50
|
||||
|
||||
→ Cheap.
|
||||
|
||||
ML training data:
|
||||
10K records × 500 tokens × $5/1M = $25
|
||||
|
||||
→ Still cheap vs human labeling.
|
||||
```
|
||||
|
||||
### Reproducibility
|
||||
```ts
|
||||
// Seed
|
||||
const seed = 42;
|
||||
faker.seed(seed);
|
||||
|
||||
// LLM = non-deterministic. Use temperature 0 + cache.
|
||||
const r = await openai.chat.completions.create({
|
||||
model: 'gpt-4o-mini',
|
||||
temperature: 0,
|
||||
seed: 42, // 일부 model
|
||||
messages: [...],
|
||||
});
|
||||
```
|
||||
|
||||
### Volume
|
||||
```ts
|
||||
// 10K records — batch
|
||||
const BATCH = 50;
|
||||
const total = 10000;
|
||||
|
||||
const all: any[] = [];
|
||||
for (let i = 0; i < total; i += BATCH) {
|
||||
const batch = await generate(BATCH);
|
||||
all.push(...batch);
|
||||
console.log(`${all.length}/${total}`);
|
||||
}
|
||||
```
|
||||
|
||||
→ Rate limit / cost 주의.
|
||||
|
||||
### Streaming (large dataset)
|
||||
```ts
|
||||
async function* generateStream(count: number) {
|
||||
for (let i = 0; i < count; i += 50) {
|
||||
const batch = await generate(Math.min(50, count - i));
|
||||
for (const item of batch) yield item;
|
||||
}
|
||||
}
|
||||
|
||||
for await (const item of generateStream(10000)) {
|
||||
await db.insert(item);
|
||||
}
|
||||
```
|
||||
|
||||
### Tools
|
||||
```
|
||||
- Mockaroo (web): schema → CSV/JSON
|
||||
- Faker.js / Faker (Python)
|
||||
- gretel.ai: privacy-preserving synthetic
|
||||
- SDV (Synthetic Data Vault): tabular ML
|
||||
- LLM (GPT-4o, Claude, local)
|
||||
```
|
||||
|
||||
### Best practices
|
||||
```
|
||||
1. Schema first (Zod / Pydantic)
|
||||
2. Diverse prompts (variation)
|
||||
3. Validation: real distribution 과 비슷한지 확인
|
||||
4. Privacy 검증 (no PII leak)
|
||||
5. Versioning (synthetic dataset 도)
|
||||
6. Cost monitoring
|
||||
```
|
||||
|
||||
## 🤔 의사결정 기준
|
||||
| 사용 | 추천 |
|
||||
|---|---|
|
||||
| Unit test | Faker (deterministic) |
|
||||
| E2E test | Faker + LLM 조합 |
|
||||
| Demo / sandbox | LLM (realistic) |
|
||||
| ML training | LLM + augmentation |
|
||||
| Privacy 보존 | gretel / Mostly AI |
|
||||
| 큰 volume | Faker (cost) |
|
||||
|
||||
## ❌ 안티패턴
|
||||
- **Real PII 를 변형하지 않고 synthetic 이라고 가정**: privacy violation.
|
||||
- **모든 것을 LLM 으로 생성 (큰 cost)**: Faker 로 충분한 경우가 많음.
|
||||
- **Distribution 이 real 과 같다고 가정**: 반드시 validate.
|
||||
- **Reproducibility 없음**: test flake.
|
||||
- **Seed 없음 (random)**: 실행할 때마다 다른 결과.
|
||||
- **Edge case 없음**: 일반 case 만 generate.
|
||||
- **Synthetic 만으로 production deploy**: synthetic 은 real 이 아님.
|
||||
|
||||
## 🤖 LLM 활용 힌트
|
||||
- Schema-driven (Zod) + LLM = realistic.
|
||||
- Faker (cheap) + LLM (creative) hybrid.
|
||||
- Diverse prompt (multiple variation).
|
||||
- Privacy-aware (no PII generation).
|
||||
|
||||
## 🔗 관련 문서
|
||||
- [[Testing_Faker_and_Builders]]
|
||||
- [[AI_Fine_Tuning_vs_Prompting]]
|
||||
- [[AI_LLM_Eval_Patterns]]
|
||||
Reference in New Issue
Block a user