PDFVisionFallback

2026-05-06 11:55:45 +09:00
parent 53073578e2
commit 8ece47f961
7 changed files with 68 additions and 29 deletions
@@ -338,18 +338,28 @@ export class AgentExecutor {
            const reqMessages = this.buildRequestHistory(this.chatHistory);

            // Handle Vision Content Injection
-            // Merge text prompt with file content instead of replacing, so the user's message is never lost
+            // visionContent 배열에서 이미지 base64 데이터를 추출하여 엔진에 맞는 형식으로 주입
            if (hasVisionContent && reqMessages.length > 0) {
                const lastUserIdx = reqMessages.map(m => m.role).lastIndexOf('user');
                if (lastUserIdx >= 0) {
                    const existingContent = reqMessages[lastUserIdx].content;
-                    const textParts: any[] = (typeof existingContent === 'string' && existingContent.trim())
-                        ? [{ type: 'text', text: existingContent }]
-                        : [];
+                    const textContent = (typeof existingContent === 'string' && existingContent.trim()) ? existingContent : '';
+                    
+                    // base64 이미지 데이터 추출
+                    const imageBase64List: string[] = [];
+                    for (const vc of (visionContent || [])) {
+                        if (vc && vc.data) {
+                            imageBase64List.push(vc.data);
+                        }
+                    }
+
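+                    // Ollama: inject the base64 strings directly through the native `images` array field.
+                    // NOTE(review): no OpenAI-style `image_url` content array (LM Studio) is built here; only the Ollama shape is sent. Confirm this is intended.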
                    reqMessages[lastUserIdx] = {
                        role: 'user',
-                        content: JSON.stringify([...textParts, ...(visionContent || [])])
-                    };
+                        content: textContent,
+                        images: imageBase64List // Ollama native format
+                    } as any;
                }
            }

@@ -1925,10 +1935,15 @@ export class AgentExecutor {
                ? message.content
                : JSON.stringify(message.content);

-            return {
+            const result: any = {
                role: message.role,
                content: normalizedContent
            };
+            // Ollama Vision: images 필드 보존
+            if ((message as any).images) {
+                result.images = (message as any).images;
+            }
+            return result;
        });
    }

@@ -1851,7 +1851,8 @@ export class SidebarChatProvider implements vscode.WebviewViewProvider, BridgeIn
                const type = file.type || '';

                if (name.endsWith('.pdf') || type === 'application/pdf') {
-                    // PDF: 서버사이드 텍스트 추출 (pdf-parse v2 API)
+                    // PDF: 서버사이드 텍스트 추출 (pdf-parse v2 API) + Vision 폴백
+                    let pdfTextOk = false;
                    try {
                        const { PDFParse } = require('pdf-parse');
                        const rawBuffer = Buffer.from(file.data, 'base64');
@@ -1859,20 +1860,43 @@ export class SidebarChatProvider implements vscode.WebviewViewProvider, BridgeIn
                        const parser = new PDFParse(uint8);
                        await parser.load();
                        const textResult = await parser.getText();
-                        // pdf-parse v2: getText() returns {pages: [{text, num}], text: string, total: number}
                        const extracted = (typeof textResult === 'string' ? textResult : (textResult?.text || '')).trim();
-                        // 페이지 구분 마커 제거하여 깔끔한 텍스트 추출
                        const cleanText = extracted.replace(/\n*-- \d+ of \d+ --\n*/g, '\n').trim();
-                        if (cleanText && cleanText.length > 10) {
+                        if (cleanText && cleanText.length > 30) {
                            textContents.push(`\n[PDF: ${file.name}]\n${cleanText}`);
                            logInfo(`PDF text extracted successfully.`, { fileName: file.name, chars: cleanText.length });
-                        } else {
-                            textContents.push(`\n[PDF: ${file.name}]\n(텍스트 추출 결과 없음 - 이미지 기반 PDF일 수 있습니다. 텍스트 레이어가 없는 스캔 문서는 OCR 변환 후 재시도하세요.)`);
-                            logInfo(`PDF text extraction returned empty/minimal result.`, { fileName: file.name, rawLength: extracted.length });
+                            pdfTextOk = true;
+                        }
+
+                        // [Vision Fallback] 텍스트가 비어있으면 페이지 이미지 추출 -> Vision 모델에 전달
+                        if (!pdfTextOk) {
+                            logInfo(`PDF has no text layer. Extracting page screenshots for vision analysis.`, { fileName: file.name });
+                            const screenshots = await parser.getScreenshot({ page: 1 });
+                            if (screenshots?.pages && screenshots.pages.length > 0) {
+                                const maxPages = Math.min(screenshots.pages.length, 8); // 메모리 보호: 최대 8페이지
+                                for (let i = 0; i < maxPages; i++) {
+                                    const page = screenshots.pages[i];
+                                    if (page?.data) {
+                                        const pageBase64 = Buffer.from(page.data).toString('base64');
+                                        images.push({
+                                            name: `${file.name}_page${i + 1}.png`,
+                                            type: 'image/png',
+                                            data: pageBase64
+                                        });
+                                    }
+                                }
+                                textContents.push(`\n[PDF: ${file.name}]\n(이미지 기반 PDF ${screenshots.total}페이지 중 ${maxPages}페이지를 이미지로 추출하여 Vision 분석합니다. 각 페이지 이미지를 참조하여 문서의 내용을 상세히 분석하고 한국어로 정리하세요.)`);
+                                logInfo(`PDF vision fallback: extracted ${maxPages} page screenshots.`, { fileName: file.name, totalPages: screenshots.total });
+                                pdfTextOk = true; // Vision 분석으로 처리 완료
+                            }
                        }
                    } catch (pdfError: any) {
-                        logError(`PDF parsing failed.`, { fileName: file.name, error: pdfError?.message || String(pdfError) });
-                        textContents.push(`\n[PDF: ${file.name}]\n(PDF 파싱 오류: ${pdfError?.message || '알 수 없는 오류'})`);
+                        logError(`PDF processing failed.`, { fileName: file.name, error: pdfError?.message || String(pdfError) });
+                    }
+
+                    // 최종 폴백: 텍스트도 없고 이미지 추출도 실패한 경우
+                    if (!pdfTextOk) {
+                        textContents.push(`\n[PDF: ${file.name}]\n(PDF 분석에 실패했습니다. 이 파일을 텍스트로 변환하여 다시 시도해주세요.)`);
                    }
                } else if (
                    type.startsWith('text/') ||