# 1. spaCy — English morphology and dependency parsing in a single pass
import spacy

# Load the pipeline once; the same `nlp` object is called on the text below.
nlp = spacy.load("en_core_web_trf")
doc = nlp("The quick brown foxes were jumping over the lazy dogs.")

# One row per token: surface form, lemma, POS tag, dependency label and the
# text of the token's syntactic head, padded into fixed-width columns.
for tok in doc:
    print(
        f"{tok.text:12} lemma={tok.lemma_:8} pos={tok.pos_:6} "
        f"dep={tok.dep_:10} head={tok.head.text}"
    )
# 3. KoNLPy — Korean morphological analysis
from konlpy.tag import Mecab, Okt, Komoran

mecab = Mecab()

# pos() is printed directly; the sample output recorded with this example is:
# [('아버지','NNG'),('가','JKS'),('방','NNG'),('에','JKB'),
#  ('들어가','VV'),('신다','EP+EF')]
print(mecab.pos("아버지가 방에 들어가신다."))
# 4. Kiwi — balanced speed and accuracy for Korean (as of 2024)
from kiwipiepy import Kiwi

kiwi = Kiwi()
result = kiwi.tokenize("나는 학교에 갑니다.")

# Print each token's form, tag, start offset and length.
for t in result:
    print(t.form, t.tag, t.start, t.len)
# 5. Stanza — neural parser covering 70+ languages
import stanza

# Fetch the Korean models before building the pipeline.
stanza.download("ko")
nlp = stanza.Pipeline("ko", processors="tokenize,pos,lemma,depparse")
doc = nlp("나는 책을 읽었다.")

# Walk every word of every sentence and print its text, UPOS tag,
# morphological features, head index and dependency relation.
for sent in doc.sentences:
    for word in sent.words:
        print(word.text, word.upos, word.feats, word.head, word.deprel)
# 6. Constituency parsing — Berkeley Neural Parser
import benepar
import spacy

# benepar is attached as an extra component on top of a spaCy pipeline.
nlp = spacy.load("en_core_web_md")
nlp.add_pipe("benepar", config={"model": "benepar_en3"})
doc = nlp("The quick brown fox jumps over the lazy dog.")

# Take the first sentence and print its parse string.
sent = list(doc.sents)[0]
print(sent._.parse_string)
# 8. Using UD features — telling active from passive voice
def is_passive(token):
    """Return True if *token* has a passive-auxiliary child.

    A child counts when its ``dep_`` label is ``"auxpass"`` (the label used
    by spaCy's English models) or ``"aux:pass"`` (the Universal Dependencies
    v2 relation), so the check works with either label scheme.

    Args:
        token: An object exposing ``children``, an iterable of tokens that
            each carry a ``dep_`` string (e.g. a spaCy ``Token``).

    Returns:
        bool: True when at least one child is a passive auxiliary.
    """
    passive_aux_labels = ("auxpass", "aux:pass")
    return any(c.dep_ in passive_aux_labels for c in token.children)


# Report every verb in the most recently parsed `doc` that is passive.
for tok in doc:
    if tok.pos_ == "VERB" and is_passive(tok):
        print(f"Passive verb: {tok.text}")
# 9. Elasticsearch nori (Korean indexing)
# Mapping:
#   "analyzer": {"my_nori": {"type":"custom","tokenizer":"nori_tokenizer"}}
# nori_tokenizer uses mecab-ko-dic to index text in morpheme units.
# 10. Parsing with an LLM — structured output
import json

# Prompt asking the model for a UD-style analysis as a JSON array with one
# object per token.
prompt = """Tokenize and tag (Universal Dependencies) the sentence.
Return JSON: [{"text":..., "lemma":..., "upos":..., "head":..., "deprel":...}]
Sentence: She quickly read the book yesterday."""

# Parse the Claude/GPT-4 response with json.loads.