# Source-viewer header (page metadata, not part of the program):
# Files
# youlbot/eval/run_ragas.py
# T
# 2026-06-01 17:43:51 +09:00
#
# 245 lines
# 9.0 KiB
# Python

"""youlbot RAGAS 평가 스크립트 (Phase 20)
실행:
cd /path/to/youlbot
python eval/run_ragas.py [--dataset eval/dataset.jsonl] [--api http://localhost:8000]
결과:
eval/results/report_YYYYMMDD_HHMMSS.csv (per-sample scores)
eval/results/report_YYYYMMDD_HHMMSS.json (summary of mean scores)
사전 조건:
- youlbot API 서버 실행 중 (uvicorn api:app --port 8000)
- Qdrant + MySQL 접근 가능
- .env에 API_TOKEN, RAG_SHOW_SOURCES=true 설정
평가 지표:
- faithfulness : 답변이 검색 컨텍스트에 충실한가 (환각 탐지)
- answer_relevancy : 답변이 질문에 얼마나 관련 있는가
- context_recall : 컨텍스트가 정답에 필요한 정보를 포함하는가
- context_precision : 검색된 컨텍스트 중 실제 유용한 비율
참고:
평가에 로컬 LLM(Qwen3)을 사용하므로 결과 신뢰도는 모델 크기에 의존합니다.
더 정확한 평가를 원하면 OPENAI_API_KEY 또는 ANTHROPIC_API_KEY를 설정하세요.
"""
from __future__ import annotations
import argparse
import asyncio
import json
import os
import sys
from datetime import datetime
from pathlib import Path
# ── Compatibility shim ───────────────────────────────────────────────────────
# ragas 0.2.x imports langchain_community.chat_models.vertexai, which was
# removed in langchain-community 0.4+. When that import fails, register a
# replacement module under the same dotted name so the ragas import succeeds.
try:
    import langchain_community.chat_models.vertexai  # noqa: F401
except ModuleNotFoundError:
    # Pick the class to expose: the real ChatVertexAI when
    # langchain-google-vertexai is installed, otherwise a bare placeholder
    # (the class is unused by this evaluation).
    try:
        from langchain_google_vertexai import ChatVertexAI as _CV
    except ImportError:
        _CV = object
    # type(sys) is the module type; build an empty module and register it.
    _stub = type(sys)("langchain_community.chat_models.vertexai")
    _stub.ChatVertexAI = _CV
    sys.modules["langchain_community.chat_models.vertexai"] = _stub
from ragas import evaluate
from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from datasets import Dataset
# ── Project path ─────────────────────────────────────────────────────────────
# ROOT is the parent of the directory that contains this file.
ROOT = Path(__file__).resolve().parent.parent
# Put ROOT first on sys.path so the `container` import below resolves
# (presumably a project-local module — confirm against the repository layout).
sys.path.insert(0, str(ROOT))
os.chdir(ROOT) # move to the project root so that .env can be read
from container import Container # noqa: E402 (after sys.path setup)
# NOTE: the statements below run at import time — merely importing this module
# builds the container, connects the DB service and initialises its schema.
_container = Container()
_container.db_service().connect()
_container.db_service().init_schema()
# ── Answer collection via API ────────────────────────────────────────────────
async def _collect_answer(api_url: str, token: str, message: str) -> str:
    """Collect only the plain answer text from the youlbot ``/chat`` SSE stream.

    Args:
        api_url: Base URL of the API; ``/chat`` is appended to it.
        token: Bearer token; when empty, no ``Authorization`` header is sent.
        message: The user message to post.

    Returns:
        The concatenation of every string payload received, in arrival order.

    Raises:
        httpx.HTTPStatusError: If the response status is an error
            (via ``raise_for_status``).
        json.JSONDecodeError: If a ``data: `` line does not carry valid JSON.
    """
    import httpx

    sse_prefix = "data: "

    request_headers = {}
    if token:
        request_headers["Authorization"] = f"Bearer {token}"
    request_body = {"message": message, "user_id": "eval", "show_thinking": False}

    fragments: list[str] = []
    async with httpx.AsyncClient(timeout=180) as client:
        async with client.stream(
            "POST",
            f"{api_url}/chat",
            json=request_body,
            headers=request_headers,
        ) as resp:
            resp.raise_for_status()
            async for raw_line in resp.aiter_lines():
                # Only "data: " lines are considered; everything else is skipped.
                if raw_line.startswith(sse_prefix):
                    event = json.loads(raw_line[len(sse_prefix):])
                    if isinstance(event, str):
                        # String payloads are pieces of the answer text.
                        fragments.append(event)
                    elif isinstance(event, dict) and event.get("__done"):
                        # A dict with a truthy "__done" key ends the collection.
                        break
    return "".join(fragments)
def collect_answer(api_url: str, token: str, message: str) -> str:
    """Synchronous wrapper: run ``_collect_answer`` via ``asyncio.run`` and return its text."""
    pending = _collect_answer(api_url, token, message)
    return asyncio.run(pending)
# ── Evaluator LLM 선택 ────────────────────────────────────────────────────────
def _build_evaluator_llm():
    """Build the evaluator LLM, preferring OpenAI, then Anthropic, then the local model.

    The choice is driven by environment variables: ``OPENAI_API_KEY`` is
    checked first, then ``ANTHROPIC_API_KEY``; when neither is set, the chat
    model from the project container is used.

    Returns:
        The selected chat model wrapped in ``LangchainLLMWrapper``.
    """
    if os.getenv("OPENAI_API_KEY"):
        from langchain_openai import ChatOpenAI

        print("[RAGAS] 평가 LLM: OpenAI GPT-4o-mini")
        chat_backend = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    elif os.getenv("ANTHROPIC_API_KEY"):
        from langchain_anthropic import ChatAnthropic

        print("[RAGAS] 평가 LLM: Anthropic Claude Haiku")
        chat_backend = ChatAnthropic(model="claude-haiku-4-5-20251001", temperature=0)
    else:
        print("[RAGAS] 평가 LLM: 로컬 Qwen3 (신뢰도 제한적)")
        chat_backend = _container.chat_model()
    return LangchainLLMWrapper(chat_backend)
def _build_evaluator_embeddings():
    """Return the container's embeddings wrapped in ``LangchainEmbeddingsWrapper``."""
    base_embeddings = _container.embeddings()
    return LangchainEmbeddingsWrapper(base_embeddings)
# ── Main ──────────────────────────────────────────────────────────────────────
def _load_samples(dataset_path: str) -> list:
    """Read a JSONL dataset file and return one parsed object per non-blank line."""
    with open(dataset_path, encoding="utf-8") as f:
        stripped_lines = (raw.strip() for raw in f)
        return [json.loads(text) for text in stripped_lines if text]


def _mean_score(score_frame, metric_name: str):
    """Return the mean of one metric column as a float, or None when unavailable.

    None is returned when the column is absent or when its mean is NaN (for
    example, every sample failed to score). NaN is mapped to None because
    ``json.dumps`` would otherwise emit the non-standard token ``NaN``.
    """
    import math

    if metric_name not in score_frame.columns:
        return None
    # Series.mean() skips NaN entries by default.
    mean_value = float(score_frame[metric_name].mean())
    return None if math.isnan(mean_value) else mean_value


def run(dataset_path: str, api_url: str, api_token: str) -> None:
    """Run the RAGAS evaluation end to end and write CSV/JSON reports.

    For every dataset sample, contexts are fetched with the container's
    retriever and the answer is fetched from the HTTP API; the four RAGAS
    metrics are then computed and saved under ``ROOT/eval/results``.

    NOTE(review): contexts come from a separate ``retriever.search`` call, not
    from the API response, so they may differ from what the API actually used
    to produce the answer — confirm this is intended.

    Args:
        dataset_path: Path to a JSONL file whose objects carry ``question``
            and ``ground_truth`` keys.
        api_url: Base URL of the youlbot API.
        api_token: Bearer token for the API (may be empty).

    Raises:
        SystemExit: With status 1 when the dataset contains no samples.
        KeyError: When a sample lacks ``question`` or ``ground_truth``.
    """
    # 1. Load the dataset.
    samples = _load_samples(dataset_path)
    if not samples:
        print(f"[오류] 데이터셋이 비어 있습니다: {dataset_path}")
        sys.exit(1)
    print(f"[RAGAS] 평가 시작 — {len(samples)}개 질문, API: {api_url}")

    # 2. Initialise the retriever service.
    retriever = _container.retriever_service()

    # 3. Collect contexts and an answer for each question.
    questions: list[str] = []
    answers: list[str] = []
    contexts: list[list[str]] = []
    ground_truths: list[str] = []
    for i, sample in enumerate(samples, 1):
        q = sample["question"]
        gt = sample["ground_truth"]
        print(f"\n[{i}/{len(samples)}] {q[:50]}...")
        docs = retriever.search(q)
        ctxs = [doc.page_content for doc in docs]
        print(f" 컨텍스트: {len(ctxs)}개 청크")
        answer = collect_answer(api_url, api_token, q)
        print(f" 답변: {len(answer)}")
        questions.append(q)
        answers.append(answer)
        contexts.append(ctxs)
        ground_truths.append(gt)

    # 4. Build the RAGAS dataset.
    ds = Dataset.from_dict(
        {
            "question": questions,
            "answer": answers,
            "contexts": contexts,
            "ground_truth": ground_truths,
        }
    )

    # 5. Run the evaluation.
    llm = _build_evaluator_llm()
    emb = _build_evaluator_embeddings()
    print("\n[RAGAS] 지표 계산 중...")
    result = evaluate(
        ds,
        metrics=[faithfulness, answer_relevancy, context_recall, context_precision],
        llm=llm,
        embeddings=emb,
        raise_exceptions=False,
    )

    # 6. Print and save the results.
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_dir = ROOT / "eval" / "results"
    results_dir.mkdir(parents=True, exist_ok=True)
    out_csv = results_dir / f"report_{ts}.csv"
    out_json = results_dir / f"report_{ts}.json"

    df = result.to_pandas()
    df.to_csv(out_csv, index=False, encoding="utf-8-sig")

    # Aggregate from the per-sample frame rather than indexing the result
    # object: result["metric"] is a mean only when the result is dict-like
    # (ragas 0.1.x); on 0.2.x it is a per-sample list and float() fails.
    metric_names = (
        "faithfulness",
        "answer_relevancy",
        "context_recall",
        "context_precision",
    )
    summary = {
        "timestamp": ts,
        "dataset": dataset_path,
        "n_samples": len(samples),
        "scores": {name: _mean_score(df, name) for name in metric_names},
    }
    out_json.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")

    print(f"\n{'='*55}")
    print("RAGAS 평가 결과")
    print("="*55)
    for name, score in summary["scores"].items():
        if score is None:
            bar, score_str = "", "N/A"
        else:
            # 20-cell bar; the original multiplied an empty string, so
            # nothing was ever drawn.
            bar = "█" * int(score * 20)
            score_str = f"{score:.3f}"
        print(f" {name:<22} {score_str} {bar}")
    print("="*55)
    print(f"CSV : {out_csv}")
    print(f"JSON: {out_json}")
if __name__ == "__main__":
    # Load .env BEFORE building the parser: the --api default is computed from
    # os.getenv("YOULBOT_API_URL") when add_argument() runs, so loading .env
    # afterwards (as before) ignored a value defined only in .env.
    # load_dotenv() does not override variables already set in the environment.
    from dotenv import load_dotenv

    load_dotenv(ROOT / ".env")

    parser = argparse.ArgumentParser(description="youlbot RAGAS 평가")
    parser.add_argument(
        "--dataset",
        default=str(ROOT / "eval" / "dataset.jsonl"),
        help="평가 데이터셋 경로 (기본: eval/dataset.jsonl)",
    )
    parser.add_argument(
        "--api",
        default=os.getenv("YOULBOT_API_URL", "http://localhost:8000"),
        help="youlbot API URL (기본: http://localhost:8000)",
    )
    args = parser.parse_args()

    api_token = os.getenv("API_TOKEN", "")
    run(args.dataset, args.api, api_token)