[G1-Sync] Manual knowledge update
This commit is contained in:
@@ -0,0 +1,362 @@
|
||||
---
|
||||
id: ai-token-budget-patterns
|
||||
title: Token Budget — context limit / truncation / window
|
||||
category: Coding
|
||||
status: draft
|
||||
source_trust_level: B
|
||||
verification_status: conceptual
|
||||
created_at: 2026-05-09
|
||||
updated_at: 2026-05-09
|
||||
tags: [ai, llm, tokens, vibe-coding]
|
||||
tech_stack: { language: "TS / Python", applicable_to: ["Backend", "AI"] }
|
||||
applied_in: []
|
||||
aliases: [token budget, context window, truncation, token counting, tiktoken, prompt size]
|
||||
---
|
||||
|
||||
# Token Budget Patterns
|
||||
|
||||
> LLM 은 input + output token 을 합한 context limit 이 있음. **Track + truncate + summarize + dynamic budget** 으로 관리. Cost + latency 는 token 수에 비례. 대응: Smart RAG / message pruning / summary cascade.
|
||||
|
||||
## 📖 핵심 개념
|
||||
- Context window: 입력 + 출력 limit (e.g. 200k tokens).
|
||||
- Per-call cost = (input tokens × input 단가 + output tokens × output 단가) / 1M — 단가는 $/MTok (100만 token 당 $) 기준.
|
||||
- Tokenizer 가 model 별 다름.
|
||||
- Output limit 이 input 보다 작음 (e.g. 200k in / 8k out).
|
||||
|
||||
## 💻 코드 패턴
|
||||
|
||||
### Token counting (Anthropic / OpenAI)
|
||||
```ts
|
||||
// Anthropic
|
||||
import Anthropic from '@anthropic-ai/sdk';
|
||||
const client = new Anthropic();
|
||||
|
||||
const { input_tokens } = await client.messages.countTokens({
|
||||
model: 'claude-opus-4-7',
|
||||
messages,
|
||||
});
|
||||
```
|
||||
|
||||
```ts
|
||||
// OpenAI tiktoken
|
||||
import { encoding_for_model } from 'tiktoken';
|
||||
|
||||
const enc = encoding_for_model('gpt-4');
|
||||
const tokens = enc.encode('Hello world');
|
||||
console.log(tokens.length); // 2
|
||||
enc.free();
|
||||
```
|
||||
|
||||
### Approximate (no API)
|
||||
```ts
|
||||
// 근사: 1 token ≈ 4 char (English)
|
||||
/**
 * Roughly estimates how many tokens `text` occupies without running a tokenizer.
 *
 * The default ratio of 4 characters per token is a rule of thumb for English.
 * Scripts that tokenize less efficiently (e.g. Korean) need a smaller ratio,
 * so pass `charsPerToken` explicitly for such text instead of under-counting.
 *
 * @param text - Text to measure; its length is counted in UTF-16 code units.
 * @param charsPerToken - Average characters per token; must be positive and finite.
 * @returns The estimated token count, rounded up (0 for an empty string).
 * @throws RangeError if `charsPerToken` is not a positive finite number.
 */
function estimateTokens(text: string, charsPerToken = 4): number {
  if (!Number.isFinite(charsPerToken) || charsPerToken <= 0) {
    throw new RangeError(`charsPerToken must be a positive finite number, got ${charsPerToken}`);
  }
  // Round up so a budget check based on the estimate errs on the safe side.
  return Math.ceil(text.length / charsPerToken);
}
|
||||
|
||||
// 한글 = 1 token ≈ 1-2 char (worse)
|
||||
```
|
||||
|
||||
→ 정확 = tokenizer. 근사 = quick budget.
|
||||
|
||||
### Budget split
|
||||
```ts
|
||||
// Total context window and the output cap used for the split below.
const MAX_CONTEXT = 200_000;
const MAX_OUTPUT = 8_192;

// Token allowance reserved for each part of a request.
const budget = {
  system: 1_000, // fixed prompt
  rag: 50_000, // retrieval
  conversation: 100_000, // history
  user: 5_000, // current message
  output: MAX_OUTPUT,
};

// Seed the fold with 0 so an empty budget sums to 0 instead of throwing.
const sum = Object.values(budget).reduce((total, part) => total + part, 0);

// console.assert only logs a message; throw so an over-allocated budget
// cannot slip through unnoticed.
if (sum > MAX_CONTEXT) {
  throw new RangeError(`Token budget ${sum} exceeds context window ${MAX_CONTEXT}`);
}
|
||||
```
|
||||
|
||||
→ 각 piece 의 limit 정함. 넘으면 truncate.
|
||||
|
||||
### Conversation pruning
|
||||
```ts
|
||||
/**
 * Sliding-window pruning: keeps the newest messages whose combined token
 * count fits within `maxTokens`.
 *
 * Messages are visited newest-first and the walk stops at the first message
 * that would overflow the budget, so the result is always a contiguous tail
 * of `messages`, returned in the original (chronological) order.
 *
 * @param messages - Conversation history, oldest first.
 * @param maxTokens - Token budget for the retained messages.
 * @returns The retained messages; empty if even the newest one does not fit.
 */
function prune(messages: Message[], maxTokens: number): Message[] {
  const newestFirst: Message[] = [];
  let spent = 0;

  for (const message of [...messages].reverse()) {
    const size = countTokens(message);
    if (spent + size > maxTokens) break;
    newestFirst.push(message);
    spent += size;
  }

  // Collected newest-first; flip back to chronological order.
  return newestFirst.reverse();
}
|
||||
```
|
||||
|
||||
→ Sliding window. 옛 message 잃음.
|
||||
|
||||
### Summarization cascade
|
||||
```ts
|
||||
/**
 * Asks the LLM to condense a list of messages into a short summary.
 *
 * @param messages - The messages to summarize.
 * @returns The `text` field of the completion response.
 */
async function summarize(messages: Message[]): Promise<string> {
  const response = await llm.complete({
    system: 'Summarize this conversation in 200 tokens.',
    messages,
  });

  const summaryText = response.text;
  return summaryText;
}
|
||||
|
||||
/**
 * Summarizes older history once the conversation grows too long.
 *
 * When `count(messages)` exceeds 50,000, everything except the 10 most recent
 * messages is replaced by a single system message carrying an LLM summary.
 * Wrapped in a function because a bare top-level `return` is not valid.
 *
 * @param messages - Conversation history, oldest first.
 * @returns `messages` unchanged when no compaction applies, otherwise the
 *          summary message followed by the recent messages.
 */
async function compactHistory(messages: Message[]): Promise<Message[]> {
  const RECENT_KEPT = 10;

  // Under the threshold: nothing to do.
  if (count(messages) <= 50_000) return messages;

  // Nothing older than the recent window, so there is nothing to summarize.
  // The history is still over the threshold here; the caller has to shrink
  // individual messages some other way.
  if (messages.length <= RECENT_KEPT) return messages;

  const old = messages.slice(0, -RECENT_KEPT);
  const recent = messages.slice(-RECENT_KEPT);

  const summary = await summarize(old);
  return [
    { role: 'system', content: `Previous: ${summary}` },
    ...recent,
  ];
}
|
||||
```
|
||||
|
||||
→ Old context lost detail, recent intact.
|
||||
|
||||
### Hierarchical summary
|
||||
```
|
||||
1주: 매 10 message → 요약
|
||||
1개월: 매 hour → 요약
|
||||
1년: 매 day → 요약
|
||||
|
||||
→ Long-term memory tree.
|
||||
```
|
||||
|
||||
### Truncation strategy
|
||||
```ts
|
||||
type Strategy = 'head' | 'tail' | 'middle' | 'summary';

/**
 * Shrinks `text` so that it fits into `maxTokens` tokens.
 *
 * Strategies:
 * - `head`: keep the first `maxTokens` tokens.
 * - `tail` (default): keep the last `maxTokens` tokens.
 * - `middle`: keep the start and the end, joined by a truncation marker; the
 *   marker's own tokens are taken out of the budget.
 * - `summary`: replace the text with an LLM summary, then cap the summary
 *   with `head` in case it is still too long.
 *
 * The function is async because the `summary` strategy awaits `summarize`.
 * Token slices are decoded to bytes first and then to a string, since the
 * `tiktoken` encoder's `decode` returns UTF-8 bytes rather than a string.
 * A cut in the middle of a multi-byte character decodes to U+FFFD.
 *
 * @param text - Text to shrink.
 * @param maxTokens - Token budget; a budget below 1 yields an empty string.
 * @param strategy - Which part of the text to keep.
 * @returns `text` unchanged when it already fits, otherwise the shortened text.
 */
async function truncate(
  text: string,
  maxTokens: number,
  strategy: Strategy = 'tail',
): Promise<string> {
  const tokens = enc.encode(text);
  if (tokens.length <= maxTokens) return text;

  // Work with whole tokens. A budget below 1 (or NaN) leaves room for nothing;
  // without this guard `slice(-0)` would return the entire token array.
  const limit = Math.floor(maxTokens);
  if (!(limit > 0)) return '';

  const utf8 = new TextDecoder();
  const render = (from: number, to: number): string =>
    utf8.decode(enc.decode(tokens.slice(from, to)));

  switch (strategy) {
    case 'head':
      return render(0, limit);

    case 'tail':
      return render(tokens.length - limit, tokens.length);

    case 'middle': {
      const marker = '\n...[truncated]...\n';
      // Charge the marker against the budget so the result stays within it.
      const room = limit - enc.encode(marker).length;
      // Too small to hold both ends plus the marker: degrade to `head`.
      if (room < 2) return render(0, limit);

      const headCount = Math.ceil(room / 2);
      const tailCount = room - headCount;
      return render(0, headCount) + marker + render(tokens.length - tailCount, tokens.length);
    }

    case 'summary': {
      // `summarize` works on messages, so wrap the raw text in a user message.
      const summary = await summarize([{ role: 'user', content: text }]);
      // The summary length is not guaranteed; enforce the budget on it too.
      return truncate(summary, limit, 'head');
    }

    default: {
      // Exhaustiveness check: fails to compile if a Strategy is left unhandled.
      const unreachable: never = strategy;
      throw new Error(`Unknown truncation strategy: ${String(unreachable)}`);
    }
  }
}
|
||||
```
|
||||
|
||||
### Dynamic context (RAG)
|
||||
```ts
|
||||
/**
 * Picks retrieved documents for a query until the token budget is used up.
 *
 * Candidates are visited in the order `vectorSearch` returns them. A document
 * that does not fit in the remaining budget is skipped rather than ending the
 * selection, so one oversized hit cannot leave the context empty.
 *
 * @param query - Search query passed to `vectorSearch`.
 * @param budget - Maximum estimated tokens for the selected documents.
 * @returns The selected documents, in retrieval order.
 */
async function buildContext(query: string, budget: number) {
  // `k: 50` as a bare argument is not valid TypeScript; pass it as an options
  // object. NOTE(review): assumes vectorSearch accepts `{ k }` — switch to a
  // positional `50` if its signature is `(query, k)`.
  const candidates = await vectorSearch(query, { k: 50 });

  let used = 0;
  const selected: typeof candidates = [];

  for (const doc of candidates) {
    const t = estimateTokens(doc.text);
    // Skip documents that do not fit; later, smaller ones may still fit.
    if (used + t > budget) continue;
    selected.push(doc);
    used += t;
  }

  return selected;
}
|
||||
```
|
||||
|
||||
→ Top-K → token budget 까지.
|
||||
|
||||
### Prompt caching (Anthropic / OpenAI)
|
||||
```ts
|
||||
// Anthropic prompt caching
|
||||
const r = await client.messages.create({
|
||||
model: 'claude-opus-4-7',
|
||||
system: [
|
||||
{ type: 'text', text: BIG_SYSTEM_PROMPT, cache_control: { type: 'ephemeral' } },
|
||||
],
|
||||
messages,
|
||||
});
|
||||
```
|
||||
|
||||
→ 같은 system / RAG prefix 재사용 → cache hit 된 input token 비용 최대 ~90% ↓ (cache write 는 일반 input 보다 약간 비쌈, output 비용은 그대로).
|
||||
|
||||
### Cost calculation
|
||||
```ts
|
||||
/** Price of one model in USD per million tokens ($/MTok). */
type ModelPricing = { input: number; output: number };

// Example price table keyed by model id.
// NOTE(review): confirm the figures against each provider's current price list.
const PRICING: Record<string, ModelPricing> = {
  'claude-opus-4-7': { input: 15, output: 75 }, // $/MTok
  'claude-sonnet-4-6': { input: 3, output: 15 },
  'gpt-4o': { input: 2.5, output: 10 },
};

/**
 * Computes the USD cost of one call from its token usage.
 *
 * @param model - Model id; must be a key of `PRICING`.
 * @param input - Number of input tokens.
 * @param output - Number of output tokens.
 * @returns Cost in USD.
 * @throws Error if `model` has no entry in `PRICING`.
 */
function cost(model: string, input: number, output: number): number {
  // Own-property check so inherited keys such as "constructor" are rejected
  // instead of producing NaN.
  if (!Object.prototype.hasOwnProperty.call(PRICING, model)) {
    throw new Error(`Unknown model for pricing: ${model}`);
  }
  const p = PRICING[model];
  // Prices are per million tokens, hence the division.
  return (input * p.input + output * p.output) / 1_000_000;
}
|
||||
|
||||
console.log(cost('claude-opus-4-7', 50_000, 5_000)); // $1.125
|
||||
```
|
||||
|
||||
### Streaming + early stop
|
||||
```ts
|
||||
const stream = await llm.stream({ messages });
|
||||
let used = 0;
|
||||
for await (const chunk of stream) {
|
||||
process.stdout.write(chunk.text);
|
||||
used += chunk.tokens;
|
||||
if (used > MAX_OUTPUT) break; // safety
|
||||
}
|
||||
```
|
||||
|
||||
### Stop sequences
|
||||
```ts
|
||||
await llm.complete({
|
||||
messages,
|
||||
stop_sequences: ['\n\n###', 'END'],
|
||||
});
|
||||
// → 만나면 stop, output token 안 씀
|
||||
```
|
||||
|
||||
→ Output 의 boilerplate 줄이는 trick.
|
||||
|
||||
### Output JSON 줄이기
|
||||
```
|
||||
❌ "Please reply with detailed JSON including..."
|
||||
"{\n \"answer\": \"...\",\n ...\n}"
|
||||
|
||||
✅ "Reply: {answer, confidence}"
|
||||
{"answer":"...","confidence":0.9}
|
||||
|
||||
→ Compact JSON, no whitespace.
|
||||
```
|
||||
|
||||
### 큰 doc + 1 query (chunk split → map-reduce)
|
||||
```ts
|
||||
// Map-reduce
|
||||
/**
 * Map-reduce over a document that is too large for a single call.
 *
 * Map: the document is split via `split(doc, 50_000)` and every piece is sent
 * to the LLM concurrently, with `query` as the system prompt.
 * Reduce: the partial answers are joined with a `---` separator and sent to
 * the LLM once more to be combined.
 *
 * @param doc - The full document text.
 * @param query - The question or instruction applied to every piece.
 * @returns The completion response of the combining call.
 */
async function bigDoc(doc: string, query: string) {
  // Map step: start all per-piece completions before awaiting any of them.
  const pieces = split(doc, 50_000);
  const pending = pieces.map((piece) =>
    llm.complete({ system: query, messages: [{ role: 'user', content: piece }] }),
  );
  const answers = await Promise.all(pending);

  // Reduce step: merge the partial texts into one prompt.
  const merged = answers.map((answer) => answer.text).join('\n\n---\n\n');
  const reducePrompt = [{ role: 'user', content: merged }];
  return llm.complete({ system: 'Combine partial answers', messages: reducePrompt });
}
|
||||
```
|
||||
|
||||
### Refine (sequential)
|
||||
```ts
|
||||
// Sequential refine: each chunk is shown to the model together with the
// answer built so far, and the model returns an improved answer.
let answer = '';
for (const chunk of chunks) {
  const response = await llm.complete({
    system: `Refine answer. Current: ${answer}`,
    messages: [{ role: 'user', content: chunk }],
  });
  // complete() resolves to a response object; keep only its text so the next
  // prompt interpolates a string instead of "[object Object]".
  answer = response.text;
}
|
||||
```
|
||||
|
||||
### Token-aware chunking (text)
|
||||
```ts
|
||||
/**
 * Splits `text` into chunks of at most `maxTokens` tokens, where consecutive
 * chunks share `overlap` tokens.
 *
 * Cuts are made on token boundaries, not word or sentence boundaries; the
 * overlap is what keeps text around a cut visible in both neighbours.
 * Token slices are decoded to bytes first and then to a string, since the
 * `tiktoken` encoder's `decode` returns UTF-8 bytes rather than a string.
 *
 * @param text - Text to split.
 * @param maxTokens - Maximum tokens per chunk; a positive integer.
 * @param overlap - Tokens shared between neighbouring chunks; an integer in
 *                  `[0, maxTokens)`. Defaults to 0.
 * @returns The chunks in order; empty when `text` encodes to no tokens.
 * @throws RangeError if `maxTokens` or `overlap` is out of range.
 */
function chunkByTokens(text: string, maxTokens: number, overlap = 0): string[] {
  if (!Number.isInteger(maxTokens) || maxTokens <= 0) {
    throw new RangeError(`maxTokens must be a positive integer, got ${maxTokens}`);
  }
  // overlap >= maxTokens would make the stride zero or negative and the loop
  // would never advance.
  if (!Number.isInteger(overlap) || overlap < 0 || overlap >= maxTokens) {
    throw new RangeError(`overlap must be an integer in [0, maxTokens), got ${overlap}`);
  }

  const tokens = enc.encode(text);
  const utf8 = new TextDecoder();
  const stride = maxTokens - overlap;
  const chunks: string[] = [];

  for (let start = 0; start < tokens.length; start += stride) {
    const end = Math.min(start + maxTokens, tokens.length);
    chunks.push(utf8.decode(enc.decode(tokens.slice(start, end))));
    // This chunk reached the end of the text; another iteration would only
    // emit a fragment already contained in it.
    if (end === tokens.length) break;
  }

  return chunks;
}
|
||||
```
|
||||
|
||||
→ Word boundary 안 맞을 수 있음 (overlap = sentence 보호).
|
||||
|
||||
### Visualizer
|
||||
```ts
|
||||
/**
 * Prints a per-message token usage chart to the console.
 *
 * The first line shows total tokens against `max` as a percentage. Each
 * following line shows a message's role, its token count and a bar whose
 * length is the message's share of `max`, scaled to 50 cells.
 *
 * @param messages - Messages to chart.
 * @param max - Token limit the usage is measured against.
 */
function visualize(messages: Message[], max: number) {
  const BAR_WIDTH = 50;
  const counts = messages.map((m) => ({ role: m.role, t: countTokens(m) }));
  const sum = counts.reduce((a, b) => a + b.t, 0);

  // A non-positive limit has no meaningful share; report 0 instead of
  // dividing by zero (which would make String.repeat throw on Infinity).
  const share = (t: number): number => (max > 0 ? t / max : 0);

  console.log(`Total: ${sum} / ${max} (${(share(sum) * 100).toFixed(0)}%)`);
  for (const c of counts) {
    // Clamp so one message larger than `max` cannot produce an oversized bar
    // and a negative count cannot make repeat() throw.
    const cells = Math.min(BAR_WIDTH, Math.max(0, Math.floor(share(c.t) * BAR_WIDTH)));
    const bar = '█'.repeat(cells);
    console.log(`${c.role.padEnd(10)} ${c.t.toString().padStart(6)} ${bar}`);
  }
}
|
||||
```
|
||||
|
||||
### LangChain / LlamaIndex 자동
|
||||
```python
|
||||
from langchain.memory import ConversationSummaryBufferMemory
|
||||
memory = ConversationSummaryBufferMemory(
|
||||
llm=llm,
|
||||
max_token_limit=2000,
|
||||
return_messages=True,
|
||||
)
|
||||
```
|
||||
|
||||
→ 자동 prune + summarize.
|
||||
|
||||
### Context optimization
|
||||
```
|
||||
순서:
|
||||
1. System (always)
|
||||
2. RAG (relevant docs)
|
||||
3. Conversation summary
|
||||
4. Recent messages
|
||||
5. Current user
|
||||
|
||||
각 = budget. 넘으면 truncate.
|
||||
```
|
||||
|
||||
### Long context vs RAG
|
||||
```
|
||||
Long context (200k+):
|
||||
- Simple, 모두 in
|
||||
- Cost 큼, slow
|
||||
|
||||
RAG:
|
||||
- Embed + retrieve top-K
|
||||
- Cost 작음, 빠름
|
||||
- Tuning 필요
|
||||
|
||||
→ <50k = long context. >50k = RAG.
|
||||
```
|
||||
|
||||
## 🤔 의사결정 기준
|
||||
| 상황 | 추천 |
|
||||
|---|---|
|
||||
| Token count | Tokenizer (정확) / 4-char approx |
|
||||
| Context > limit | Prune / summarize |
|
||||
| 같은 system 자주 | Prompt caching |
|
||||
| 큰 doc 1 query | Map-reduce / refine |
|
||||
| Long history | Hierarchical summary |
|
||||
| Cost 줄이기 | Cache + smaller model + stop seq |
|
||||
| Real-time | Stream + early stop |
|
||||
|
||||
## ❌ 안티패턴
|
||||
- **Token count 안 추적**: 한도 넘으면 error.
|
||||
- **모든 history 보냄**: cost 폭발.
|
||||
- **Truncation 없음**: 한 자라도 over → 실패.
|
||||
- **Cache 안 씀**: 매번 system prompt full $.
|
||||
- **Verbose JSON output**: token 낭비.
|
||||
- **모든 doc RAG 보냄**: noise + cost.
|
||||
- **Output limit 무시**: 잘림.
|
||||
|
||||
## 🤖 LLM 활용 힌트
|
||||
- Tokenizer (model 별) 항상 count.
|
||||
- Prompt caching = 큰 cost 절감.
|
||||
- Hierarchical summary = long memory.
|
||||
- RAG vs long context = size dependent.
|
||||
|
||||
## 🔗 관련 문서
|
||||
- [[AI_Prompt_Caching]]
|
||||
- [[AI_LLM_Cost_Optimization]]
|
||||
- [[AI_RAG_Advanced]]
|
||||
Reference in New Issue
Block a user