connectai/assets/scripts/youtube_transcript.py

#!/usr/bin/env python3
"""
YouTube Transcript Extractor — Astra extension companion script.

용도:
    채널 / 플레이리스트 / 단일 영상 URL을 받아 각 영상의 자막을 추출하고
    사용자가 지정한 폴더에 *영상 제목 + 영상 ID* 가 들어간 파일명으로 저장.

호출 방식 (Astra TypeScript 측에서 spawn):
    python youtube_transcript.py \\
        --source <URL> \\
        --output-dir <폴더 경로> \\
        [--lang ko,en] \\
        [--limit 50]

stdout으로 진행 상황을 *한 줄 한 JSON*씩 흘려서 TS가 stream 파싱하기 쉽게.
각 라인은 다음 중 하나의 event:

    {"type":"start","total":N,"source":"..."}
    {"type":"video","index":i,"video_id":"...","title":"...","status":"ok|fail","saved_to":"...","error":"..."}
    {"type":"done","ok":N_ok,"fail":N_fail,"output_dir":"..."}
    {"type":"error","stage":"...","message":"..."}

의존성:
    pip install yt-dlp youtube-transcript-api

사용자 환경에 패키지가 없으면 import 단계에서 {"type":"error"} JSON 한 줄 찍고
exit 2. TS가 그것 보고 친절한 안내 메시지 표시.
"""

import argparse
import json
import os
import re
import sys
from pathlib import Path

# On Windows the default stdout encoding may be cp949, which corrupts Korean
# JSON so that the TS side either cannot read it or shows mojibake. Force
# stdout/stderr to UTF-8 before anything else is written.
try:
    sys.stdout.reconfigure(encoding='utf-8', errors='replace')
    sys.stderr.reconfigure(encoding='utf-8', errors='replace')
except Exception:
    # Best effort: streams without reconfigure() (very old Python) are left
    # as they are. Per the original note, the TS side then falls back to an
    # environment variable.
    pass


def _emit(event: dict) -> None:
    """Write *event* to stdout as one JSON line and flush immediately.

    The TS stream reader consumes stdout line by line, so each event must be
    exactly one line and must not linger in a buffer. Non-ASCII text is
    written as-is (``ensure_ascii=False``).
    """
    out = sys.stdout
    payload = json.dumps(event, ensure_ascii=False)
    out.write(f"{payload}\n")
    out.flush()


def _trace(stage: str, **info) -> None:
    """Write a ``[trace] <stage>: k=v ...`` debug line to stderr and flush.

    Traces go to stderr so they never mix with the JSON events on stdout.
    According to the original note, the TS side accumulates stderr and shows
    it in its details view, which makes silent failures (such as a missing
    video event) traceable to the stage that was reached.
    """
    pairs = [f"{key}={value}" for key, value in info.items()]
    joined = " ".join(pairs)
    err = sys.stderr
    err.write(f"[trace] {stage}: {joined}\n")
    err.flush()


def _check_deps():
    """Verify that the required third-party packages can be imported.

    Returns normally when both packages import. Otherwise emits one
    ``{"type": "error", "stage": "deps", ...}`` event that names the missing
    packages and a matching ``pip install`` command, then exits the process
    with status 2.
    """
    absent = []

    try:
        import yt_dlp  # noqa: F401
    except ImportError:
        absent.append("yt-dlp")

    try:
        from youtube_transcript_api import YouTubeTranscriptApi  # noqa: F401
    except ImportError:
        absent.append("youtube-transcript-api")

    if not absent:
        return

    listing = ", ".join(absent)
    pip_args = " ".join(absent)
    _emit({
        "type": "error",
        "stage": "deps",
        "message": f"필수 패키지가 없습니다: {listing}",
        "install_command": f"pip install {pip_args}",
    })
    sys.exit(2)


def _safe_filename(name: str, max_len: int = 100, max_bytes: int = 200) -> str:
    """Return *name* reduced to a file-name stem usable on Windows, macOS and Linux.

    Characters that Windows forbids in file names (``<>:"/\\|?*``) and ASCII
    control characters are removed, runs of whitespace collapse to a single
    space, and the result is capped in length.

    Args:
        name: Arbitrary text, e.g. a video title.
        max_len: Maximum number of characters to keep.
        max_bytes: Maximum UTF-8 size of the result in bytes; ``0`` disables
            the byte cap. Many filesystems limit a name to 255 *bytes*, so a
            character cap alone is not enough for CJK titles (3 bytes per
            character). The default leaves room for a suffix such as
            ``__<video id>.txt``.

    Returns:
        The sanitized name, or ``"untitled"`` when nothing is left.
    """
    name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "", name)
    name = re.sub(r"\s+", " ", name).strip()
    if len(name) > max_len:
        name = name[:max_len].rstrip()
    if max_bytes and max_bytes > 0:
        # "surrogatepass" keeps the size check from raising on lone surrogates.
        encoded = name.encode("utf-8", errors="surrogatepass")
        if len(encoded) > max_bytes:
            # Cut on the byte budget; "ignore" drops a character that the cut
            # split in half instead of producing invalid text.
            name = encoded[:max_bytes].decode("utf-8", errors="ignore").rstrip()
    return name or "untitled"


def _list_videos(source_url: str, limit: "int | None") -> "list[dict]":
    """Collect the videos behind a channel, playlist or single-video URL via yt-dlp.

    Args:
        source_url: URL handed to yt-dlp.
        limit: Maximum number of playlist entries to request (passed to yt-dlp
            as ``playlistend``). ``None`` or a non-positive value means no limit.

    Returns:
        A list of ``{"id": ..., "title": ..., "url": ...}`` dicts. The list is
        empty when yt-dlp returns no info or no entry carries an id. The title
        falls back to the video id when yt-dlp reports none.
    """
    # The annotations above are quoted on purpose: an unquoted ``int | None``
    # is evaluated at definition time and raises TypeError on Python 3.9,
    # while the rest of this file only needs 3.9 (``list[str]``).
    import yt_dlp

    ydl_opts = {
        "quiet": True,
        "no_warnings": True,
        # Flat playlist: list the entries quickly instead of resolving the
        # full metadata of every video.
        "extract_flat": "in_playlist",
        "skip_download": True,
    }
    if limit and limit > 0:
        ydl_opts["playlistend"] = limit

    videos = []
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(source_url, download=False)
        if not info:
            return []
        # Single video vs. channel/playlist.
        if info.get("_type") in (None, "video"):
            vid = info.get("id") or ""
            if vid:
                videos.append({
                    "id": vid,
                    # Same fallback as the playlist branch below.
                    "title": info.get("title") or vid,
                    "url": info.get("webpage_url") or f"https://www.youtube.com/watch?v={vid}",
                })
        else:
            for entry in info.get("entries") or []:
                if not entry:
                    continue
                vid = entry.get("id") or ""
                if not vid:
                    continue
                url = entry.get("url") or f"https://www.youtube.com/watch?v={vid}"
                # Flat entries may carry a bare id in ``url``; expand it.
                if not url.startswith("http"):
                    url = f"https://www.youtube.com/watch?v={url}"
                videos.append({"id": vid, "title": entry.get("title") or vid, "url": url})
    return videos


def _list_transcripts_compat(video_id: str):
    """Return the transcript list for *video_id* on youtube-transcript-api 0.6.x or 1.x.

    Lookup order, exactly as implemented:
      1. class-level ``YouTubeTranscriptApi.list_transcripts(video_id)`` when
         the attribute exists (0.6.x style). A ``TypeError`` from that call is
         taken to mean the attribute is only a compatibility stub, and the
         instance API is tried next.
      2. ``YouTubeTranscriptApi().list(video_id)`` (1.x style).
      3. ``YouTubeTranscriptApi().list_transcripts(video_id)`` as a last resort.

    Raises:
        RuntimeError: when none of the entry points exists.
    """
    from youtube_transcript_api import YouTubeTranscriptApi

    if hasattr(YouTubeTranscriptApi, "list_transcripts"):
        _trace("transcript_api", api="0.6.x classmethod")
        try:
            return YouTubeTranscriptApi.list_transcripts(video_id)
        except TypeError:
            # Not callable on the class — fall through to the instance API.
            pass

    client = YouTubeTranscriptApi()
    instance_entry_points = (
        ("list", "1.x instance.list"),
        ("list_transcripts", "fallback instance.list_transcripts"),
    )
    for attr_name, label in instance_entry_points:
        if hasattr(client, attr_name):
            _trace("transcript_api", api=label)
            return getattr(client, attr_name)(video_id)
    raise RuntimeError("youtube-transcript-api의 list API를 찾지 못했습니다 — 패키지 손상 가능")


def _fetch_via_transcript_api(video_id: str, languages: list[str]) -> str:
    """First attempt: fetch the transcript text through youtube-transcript-api.

    Track selection: a manually created transcript in the first matching
    language of *languages*; otherwise an auto-generated one, again in
    preference order; otherwise the first track the list yields.

    Raises:
        RuntimeError: when the video exposes no transcript track at all.
        Exception: anything raised while listing or fetching propagates.
    """
    from youtube_transcript_api.formatters import TextFormatter

    _trace("transcript_api.start", video_id=video_id, langs=",".join(languages))
    available = _list_transcripts_compat(video_id)

    picked = None
    lookups = (
        ("manual", "find_manually_created_transcript"),
        ("generated", "find_generated_transcript"),
    )
    for kind, finder_name in lookups:
        for lang in languages:
            try:
                picked = getattr(available, finder_name)([lang])
                _trace("transcript_api.found", kind=kind, lang=lang)
                break
            except Exception:
                # No track of this kind in this language — try the next one.
                continue
        if picked is not None:
            break

    if picked is None:
        picked = next(iter(available), None)
        if picked is None:
            raise RuntimeError("자막 트랙이 없음")
        _trace("transcript_api.found", kind="first-available")

    text = TextFormatter().format_transcript(picked.fetch())
    _trace("transcript_api.ok", chars=len(text))
    return text


def _fetch_via_yt_dlp(video_id: str, languages: list[str]) -> str:
    """Second attempt: let yt-dlp download a subtitle file and convert it to text.

    Subtitles (manual and automatic) are requested in VTT format into a
    temporary directory. The first file matching a preferred language is
    used, otherwise any VTT file for the video.

    Raises:
        RuntimeError: when yt-dlp exits via ``SystemExit`` or leaves no VTT
            file behind.
    """
    import tempfile
    import yt_dlp

    _trace("yt_dlp.start", video_id=video_id, langs=",".join(languages))

    with tempfile.TemporaryDirectory() as tmpdir:
        # Requested languages, their regional variants ("xx.*"), plus English.
        wanted_langs = [*languages, *(f"{code}.*" for code in languages), "en"]
        options = {
            "quiet": True,
            "no_warnings": True,
            "skip_download": True,
            "writesubtitles": True,
            "writeautomaticsub": True,
            "subtitleslangs": wanted_langs,
            "subtitlesformat": "vtt/best",
            "outtmpl": os.path.join(tmpdir, "%(id)s.%(ext)s"),
            # Keep HTTP errors (429 etc.) during the subtitle download from
            # terminating the whole process, so the caller can report them.
            "ignoreerrors": True,
            # Retry transient failures a little; more risks an IP block,
            # less feels sluggish to the user.
            "retries": 2,
            "extractor_retries": 2,
            # Short pauses to stay under the rate limit.
            "sleep_interval": 1,
            "max_sleep_interval": 3,
            "sleep_interval_subtitles": 1,
        }
        watch_url = f"https://www.youtube.com/watch?v={video_id}"
        try:
            with yt_dlp.YoutubeDL(options) as downloader:
                downloader.download([watch_url])
            _trace("yt_dlp.download_returned")
        except SystemExit as exit_err:
            # Some yt-dlp paths raise SystemExit even with ignoreerrors=True.
            _trace("yt_dlp.systemexit", code=getattr(exit_err, "code", "?"))
            raise RuntimeError(f"yt-dlp SystemExit (코드={exit_err.code})")

        # Files land as <video_id>.<lang>.vtt; probe in language preference
        # order, then accept any VTT for this video.
        workdir = Path(tmpdir)
        patterns = [f"{video_id}*{lang}*.vtt" for lang in languages]
        patterns.append(f"{video_id}*.vtt")
        subtitle_file = None
        for pattern in patterns:
            subtitle_file = next(workdir.glob(pattern), None)
            if subtitle_file is not None:
                break
        if subtitle_file is None:
            raise RuntimeError("yt-dlp가 자막 파일을 만들지 못했습니다 — 영상에 자막이 정말 없거나 비공개")
        return _vtt_to_text(subtitle_file.read_text(encoding="utf-8"))


def _vtt_to_text(vtt: str) -> str:
    """Convert WebVTT subtitle text to plain text, one caption line per output line.

    The input is processed block by block (blocks are separated by empty lines):

    * the ``WEBVTT`` header block and ``NOTE`` / ``STYLE`` / ``REGION`` blocks
      are dropped as a whole, including their continuation lines;
    * timing lines (``00:00:00.000 --> 00:00:01.000 ...``) are dropped;
    * a cue identifier — the first line of a block when the next line is a
      timing line — is dropped, whether it is numeric or not;
    * every other line is cue text: inline tags (``<c>``, ``<00:00:01.000>``)
      are removed and character references such as ``&amp;`` are decoded.

    A line equal to the previously emitted line is skipped, because
    auto-generated captions repeat the same line across consecutive cues.

    Args:
        vtt: Content of a ``.vtt`` file; a leading BOM and CRLF/CR line
            endings are accepted.

    Returns:
        The caption text joined with ``"\\n"``; ``""`` when there is none.
    """
    import html  # function-scope import, like the other lazy imports in this file

    def is_timing(line: str) -> bool:
        return "-->" in line and re.search(r"\d\d:\d\d", line) is not None

    normalized = vtt.lstrip("\ufeff").replace("\r\n", "\n").replace("\r", "\n")
    raw_lines = normalized.split("\n")

    lines: list[str] = []
    prev = ""
    skipping = False      # inside the header or a NOTE/STYLE/REGION block
    block_start = True    # the next non-empty line opens a new block
    for index, raw in enumerate(raw_lines):
        if raw == "":
            # An empty line ends the current block.
            skipping = False
            block_start = True
            continue
        s = raw.strip()
        if not s:
            # Whitespace-only cue line (auto captions pad cues with " ");
            # unlike an empty line it does not end the block.
            continue
        if is_timing(s):
            # A timing line always starts cue text, even if a preceding
            # metadata block was not closed by an empty line.
            skipping = False
            block_start = False
            continue
        if skipping:
            continue
        if block_start:
            block_start = False
            keyword = s.split(None, 1)[0]
            if s.startswith("WEBVTT") or keyword in ("NOTE", "STYLE", "REGION"):
                skipping = True
                continue
            following = raw_lines[index + 1].strip() if index + 1 < len(raw_lines) else ""
            if is_timing(following):
                continue  # cue identifier
        # Strip tags before decoding so an escaped "&lt;b&gt;" stays literal text.
        clean = html.unescape(re.sub(r"<[^>]+>", "", s)).strip()
        if not clean or clean == prev:
            continue
        lines.append(clean)
        prev = clean
    return "\n".join(lines)


def _fetch_transcript(video_id: str, languages: list[str]) -> str:
    """Fetch the transcript text, trying youtube-transcript-api first and yt-dlp second.

    The two backends fail for different reasons, so the fallback is useful.
    Each failure is traced to stderr and remembered; if both backends fail,
    the messages are combined into one error.

    ``SystemExit`` and other non-``Exception`` errors raised by a backend are
    treated as an ordinary failure of that backend. ``KeyboardInterrupt`` is
    deliberately NOT swallowed: it propagates so that Ctrl-C aborts the run
    instead of being recorded as a failed video.

    Args:
        video_id: YouTube video id.
        languages: Language codes in order of preference.

    Returns:
        The transcript as plain text.

    Raises:
        RuntimeError: when both backends fail; the message has the form
            ``"transcript-api: <error> / yt-dlp: <error>"``.
        KeyboardInterrupt: re-raised unchanged.
    """
    attempts = (
        ("transcript-api", "transcript_api.fail", _fetch_via_transcript_api),
        ("yt-dlp", "yt_dlp.fail", _fetch_via_yt_dlp),
    )
    errors: list[str] = []
    for label, trace_stage, fetch in attempts:
        try:
            return fetch(video_id, languages)
        except KeyboardInterrupt:
            raise
        except BaseException as e:
            msg = f"{type(e).__name__}: {e}"
            _trace(trace_stage, msg=msg)
            errors.append(f"{label}: {msg}")
    raise RuntimeError(" / ".join(errors))


def main() -> int:
    """Parse the CLI arguments, list the videos and save one transcript file per video.

    Progress is reported as JSON events on stdout (``start``, one ``video``
    per item, ``done``, or ``error``); debug traces go to stderr.

    Returns:
        0 once the per-video loop has finished — individual failures are
        reported through ``"status": "fail"`` events, not the exit code;
        1 when the output folder cannot be created or no video can be listed.
        Missing dependencies end the process with status 2 inside
        ``_check_deps``.
    """
    parser = argparse.ArgumentParser(description="YouTube transcript bulk extractor")
    parser.add_argument("--source", required=True, help="채널 / 플레이리스트 / 단일 영상 URL")
    parser.add_argument("--output-dir", required=True, help="자막 파일 저장 폴더")
    parser.add_argument("--lang", default="ko,en", help="자막 언어 우선순위 (콤마 구분)")
    parser.add_argument("--limit", type=int, default=0, help="최대 영상 수 (0 = 제한 없음)")
    args = parser.parse_args()

    _check_deps()

    languages = [s.strip() for s in args.lang.split(",") if s.strip()]
    if not languages:
        languages = ["ko", "en"]

    # Report an unusable output folder (permission denied, path is a file,
    # unresolvable "~") as a JSON error event instead of a raw traceback,
    # so the caller always has a parseable reason.
    try:
        output_dir = Path(args.output_dir).expanduser()
        output_dir.mkdir(parents=True, exist_ok=True)
    except (OSError, RuntimeError) as e:
        _emit({
            "type": "error",
            "stage": "output_dir",
            "message": f"출력 폴더를 만들지 못했습니다: {type(e).__name__}: {e}",
        })
        return 1

    # 1) Video list.
    try:
        videos = _list_videos(args.source, args.limit if args.limit > 0 else None)
    except Exception as e:
        _emit({"type": "error", "stage": "list", "message": str(e)})
        return 1

    if not videos:
        _emit({"type": "error", "stage": "list", "message": "영상을 한 개도 찾지 못했습니다. URL 확인 필요."})
        return 1

    _emit({"type": "start", "total": len(videos), "source": args.source, "output_dir": str(output_dir)})
    _trace("loop.begin", total=len(videos))

    # 2) One transcript file per video.
    ok = 0
    fail = 0
    for i, v in enumerate(videos):
        vid = v["id"]
        title = v["title"]
        url = v["url"]
        _trace("loop.iter", index=i, video_id=vid)
        try:
            text = _fetch_transcript(vid, languages)
            safe_title = _safe_filename(title, max_len=80)
            filename = f"{safe_title}__{vid}.txt"
            target = output_dir / filename
            header = (
                f"제목: {title}\n"
                f"영상: {url}\n"
                f"비디오 ID: {vid}\n"
                f"언어 우선순위: {', '.join(languages)}\n"
                f"{'-' * 60}\n\n"
            )
            target.write_text(header + text, encoding="utf-8")
            ok += 1
            _emit({
                "type": "video", "index": i, "video_id": vid, "title": title,
                "status": "ok", "saved_to": str(target),
            })
        except KeyboardInterrupt:
            # Let Ctrl-C / abort propagate to the entry point.
            raise
        except BaseException as e:
            # Catch SystemExit (raised by yt-dlp) as well as Exception, so a
            # video event is always emitted and no video silently disappears
            # from the caller's view.
            fail += 1
            error_msg = f"{type(e).__name__}: {e}"
            _emit({
                "type": "video", "index": i, "video_id": vid, "title": title,
                "status": "fail", "error": error_msg,
            })

    _trace("loop.end", ok=ok, fail=fail)
    _emit({"type": "done", "ok": ok, "fail": fail, "output_dir": str(output_dir)})
    return 0


if __name__ == "__main__":
    # Script entry point: exit with main()'s return code, or report a user
    # interrupt as an error event and exit with 130.
    try:
        exit_code = main()
    except KeyboardInterrupt:
        _emit({"type": "error", "stage": "interrupt", "message": "사용자가 중단했습니다."})
        exit_code = 130
    sys.exit(exit_code)