bdb6fd83c4
- RunConfig(timeout=600, max_workers=1): local Qwen3 needs more than 60s/call - Extract scores from df.mean() instead of result[key] to handle NaN safely Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
258 lines
9.5 KiB
Python
258 lines
9.5 KiB
Python
"""youlbot RAGAS 평가 스크립트 (Phase 20)
|
|
|
|
실행:
|
|
cd /path/to/youlbot
|
|
python eval/run_ragas.py [--dataset eval/dataset.jsonl] [--api http://localhost:8000]
|
|
|
|
결과:
|
|
eval/results/report_YYYYMMDD_HHMMSS.csv
|
|
|
|
사전 조건:
|
|
- youlbot API 서버 실행 중 (uvicorn api:app --port 8000)
|
|
- Qdrant + MySQL 접근 가능
|
|
- .env에 API_TOKEN, RAG_SHOW_SOURCES=true 설정
|
|
|
|
평가 지표:
|
|
- faithfulness : 답변이 검색 컨텍스트에 충실한가 (환각 탐지)
|
|
- answer_relevancy : 답변이 질문에 얼마나 관련 있는가
|
|
- context_recall : 컨텍스트가 정답에 필요한 정보를 포함하는가
|
|
- context_precision : 검색된 컨텍스트 중 실제 유용한 비율
|
|
|
|
참고:
|
|
평가에 로컬 LLM(Qwen3)을 사용하므로 결과 신뢰도는 모델 크기에 의존합니다.
|
|
더 정확한 평가를 원하면 OPENAI_API_KEY 또는 ANTHROPIC_API_KEY를 설정하세요.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import os
|
|
import sys
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# ── Compatibility shim ───────────────────────────────────────────────────────
|
|
# ragas 0.2.x imports langchain_community.chat_models.vertexai which was
|
|
# removed in langchain-community 0.4+. Re-export from langchain-google-vertexai.
|
|
try:
    import langchain_community.chat_models.vertexai  # noqa: F401
except ModuleNotFoundError:
    # The legacy module path is gone: register a stand-in module under the old
    # dotted name so that imports of that path resolve through sys.modules.
    _stub = type(sys)("langchain_community.chat_models.vertexai")
    try:
        from langchain_google_vertexai import ChatVertexAI as _CV
    except ImportError:
        # vertexai is not available; expose a placeholder class
        # (the Vertex AI chat model is unused by this evaluation).
        _stub.ChatVertexAI = object
    else:
        # Re-export the real class from its new home.
        _stub.ChatVertexAI = _CV
    sys.modules["langchain_community.chat_models.vertexai"] = _stub
|
|
|
|
from ragas import evaluate
|
|
from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness
|
|
from ragas.embeddings import LangchainEmbeddingsWrapper
|
|
from ragas.llms import LangchainLLMWrapper
|
|
from datasets import Dataset
|
|
|
|
# ── Project path ─────────────────────────────────────────────────────────────
|
|
# Project root: two levels up from this file (the script is run as eval/run_ragas.py).
ROOT = Path(__file__).resolve().parents[1]
# Put the project root first on sys.path so project-local modules can be imported.
sys.path.insert(0, os.fspath(ROOT))
# Change into the project root so that the .env file there is picked up.
os.chdir(ROOT)

from container import Container  # noqa: E402  (must come after the sys.path setup)

# Module-level container shared by the helpers below. The database service is
# connected and its schema initialised once, at import time.
_container = Container()
_container.db_service().connect()
_container.db_service().init_schema()
|
|
|
|
|
|
# ── Answer collection via API ────────────────────────────────────────────────
|
|
|
|
async def _collect_answer(api_url: str, token: str, message: str) -> str:
    """Collect only the plain answer text from the youlbot ``/chat`` SSE stream.

    Posts ``message`` to ``{api_url}/chat`` and concatenates every SSE ``data``
    payload that decodes to a JSON string. Reading stops at the first payload
    that is a JSON object with a truthy ``"__done"`` key, or when the stream
    ends.

    Args:
        api_url: Base URL of the API; a trailing slash is tolerated.
        token: Bearer token. When empty, no ``Authorization`` header is sent.
        message: The user message to send.

    Returns:
        The concatenated answer text (``""`` if no string payload arrived).

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
        json.JSONDecodeError: If a non-empty ``data`` payload is not valid JSON.
    """
    import httpx

    headers = {"Authorization": f"Bearer {token}"} if token else {}
    # Avoid "//chat" when the base URL was given with a trailing slash.
    url = f"{api_url.rstrip('/')}/chat"
    body = {"message": message, "user_id": "eval", "show_thinking": False}

    parts: list[str] = []
    async with httpx.AsyncClient(timeout=180) as client:
        async with client.stream("POST", url, json=body, headers=headers) as resp:
            resp.raise_for_status()
            async for line in resp.aiter_lines():
                # Only "data" fields carry payloads; skip comments, event names, blanks.
                if not line.startswith("data:"):
                    continue
                # Per the SSE format, a single space after the colon is optional.
                data = line[5:]
                if data.startswith(" "):
                    data = data[1:]
                # An empty data frame carries nothing to decode.
                if not data.strip():
                    continue
                payload = json.loads(data)
                if isinstance(payload, str):
                    parts.append(payload)
                elif isinstance(payload, dict) and payload.get("__done"):
                    # Leaving the ``async with`` block closes the response,
                    # so no explicit aclose() is needed here.
                    break
    return "".join(parts)
|
|
|
|
|
|
def collect_answer(api_url: str, token: str, message: str) -> str:
    """Synchronous wrapper around ``_collect_answer``.

    Runs the coroutine to completion with ``asyncio.run`` and returns the
    answer text it produces.
    """
    pending = _collect_answer(api_url, token, message)
    return asyncio.run(pending)
|
|
|
|
|
|
# ── Evaluator LLM 선택 ────────────────────────────────────────────────────────
|
|
|
|
def _build_evaluator_llm():
    """Choose the evaluator LLM: OpenAI, then Anthropic, then the local model.

    The choice depends on which API-key environment variable is set to a
    non-empty value. The selected chat model is returned wrapped in
    ``LangchainLLMWrapper``.
    """
    env = os.environ
    if env.get("OPENAI_API_KEY"):
        from langchain_openai import ChatOpenAI

        print("[RAGAS] 평가 LLM: OpenAI GPT-4o-mini")
        chat = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    elif env.get("ANTHROPIC_API_KEY"):
        from langchain_anthropic import ChatAnthropic

        print("[RAGAS] 평가 LLM: Anthropic Claude Haiku")
        chat = ChatAnthropic(model="claude-haiku-4-5-20251001", temperature=0)
    else:
        # No hosted API key configured: fall back to the container's chat model.
        print("[RAGAS] 평가 LLM: 로컬 Qwen3 (신뢰도 제한적)")
        chat = _container.chat_model()
    return LangchainLLMWrapper(chat)
|
|
|
|
|
|
def _build_evaluator_embeddings():
    """Return the container's embedding model wrapped in ``LangchainEmbeddingsWrapper``."""
    embedding_model = _container.embeddings()
    return LangchainEmbeddingsWrapper(embedding_model)
|
|
|
|
|
|
# ── Main ──────────────────────────────────────────────────────────────────────
|
|
|
|
def _load_samples(dataset_path: str) -> list[dict]:
    """Read a JSONL dataset and validate every sample up front.

    Blank lines are skipped. Each remaining line must be a JSON object that
    contains both ``question`` and ``ground_truth``. Validating here means a
    malformed sample is reported before any API or LLM call is made.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        TypeError: If a line decodes to something other than a JSON object.
        KeyError: If a sample lacks ``question`` or ``ground_truth``.
    """
    samples: list[dict] = []
    with open(dataset_path, encoding="utf-8") as f:
        for lineno, raw in enumerate(f, 1):
            raw = raw.strip()
            if not raw:
                continue
            sample = json.loads(raw)
            if not isinstance(sample, dict):
                raise TypeError(
                    f"{dataset_path}:{lineno}: expected a JSON object, "
                    f"got {type(sample).__name__}"
                )
            missing = [k for k in ("question", "ground_truth") if k not in sample]
            if missing:
                raise KeyError(
                    f"{dataset_path}:{lineno}: sample is missing {', '.join(missing)}"
                )
            samples.append(sample)
    return samples


def run(dataset_path: str, api_url: str, api_token: str) -> None:
    """Run the RAGAS evaluation end to end and write a CSV and JSON report.

    Steps: load the JSONL dataset, fetch contexts from the retriever service
    and answers from the API for each question, score the collected rows with
    four RAGAS metrics, then print a summary and save the per-sample CSV plus a
    JSON summary under ``ROOT/eval/results``.

    Args:
        dataset_path: Path to a JSONL file whose lines carry ``question`` and
            ``ground_truth``.
        api_url: Base URL of the youlbot API.
        api_token: Bearer token forwarded to the API (may be empty).

    Raises:
        SystemExit: With code 1 when the dataset contains no samples.
        KeyError: When a sample lacks ``question`` or ``ground_truth``.
    """
    # 1. Load the dataset (validated before any slow work starts).
    samples = _load_samples(dataset_path)

    if not samples:
        print(f"[오류] 데이터셋이 비어 있습니다: {dataset_path}")
        sys.exit(1)

    print(f"[RAGAS] 평가 시작 — {len(samples)}개 질문, API: {api_url}")

    # 2. Initialise the retriever service.
    retriever = _container.retriever_service()

    # 3. Collect contexts and an answer for every question.
    # NOTE(review): contexts come from a local retriever.search() call, not from
    # the API response — presumably both use the same retrieval; confirm.
    questions: list[str] = []
    answers: list[str] = []
    contexts: list[list[str]] = []
    ground_truths: list[str] = []

    for i, sample in enumerate(samples, 1):
        q = sample["question"]
        gt = sample["ground_truth"]
        print(f"\n[{i}/{len(samples)}] {q[:50]}...")

        docs = retriever.search(q)
        ctxs = [doc.page_content for doc in docs]
        print(f" 컨텍스트: {len(ctxs)}개 청크")

        answer = collect_answer(api_url, api_token, q)
        print(f" 답변: {len(answer)}자")

        questions.append(q)
        answers.append(answer)
        contexts.append(ctxs)
        ground_truths.append(gt)

    # 4. Build the RAGAS dataset.
    ds = Dataset.from_dict(
        {
            "question": questions,
            "answer": answers,
            "contexts": contexts,
            "ground_truth": ground_truths,
        }
    )

    # 5. Run the evaluation.
    llm = _build_evaluator_llm()
    emb = _build_evaluator_embeddings()

    # The local LLM is slow, so use a generous timeout and a single worker.
    from ragas.run_config import RunConfig

    run_cfg = RunConfig(timeout=600, max_retries=1, max_workers=1)

    print("\n[RAGAS] 지표 계산 중... (로컬 LLM 사용 시 수 분 소요)")
    result = evaluate(
        ds,
        metrics=[faithfulness, answer_relevancy, context_recall, context_precision],
        llm=llm,
        embeddings=emb,
        run_config=run_cfg,
        raise_exceptions=False,
    )

    # 6. Print and persist the results.
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_dir = ROOT / "eval" / "results"
    results_dir.mkdir(parents=True, exist_ok=True)
    out_csv = results_dir / f"report_{ts}.csv"
    out_json = results_dir / f"report_{ts}.json"

    df = result.to_pandas()
    df.to_csv(out_csv, index=False, encoding="utf-8-sig")

    # Scores are column means of the per-sample frame with NaN rows dropped
    # (the original note attributes NaN rows to timeouts).
    def _score(col: str) -> float | None:
        if col not in df.columns:
            return None
        val = df[col].dropna().mean()
        # NaN is the only value that differs from itself; it appears here when
        # no valid row is left for the metric.
        return None if val != val else float(val)

    metric_names = (
        "faithfulness",
        "answer_relevancy",
        "context_recall",
        "context_precision",
    )
    summary = {
        "timestamp": ts,
        "dataset": dataset_path,
        "n_samples": len(samples),
        "scores": {name: _score(name) for name in metric_names},
    }
    out_json.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")

    print(f"\n{'='*55}")
    print("RAGAS 평가 결과")
    print("="*55)
    for k, v in summary["scores"].items():
        if v is None:
            score_str, bar = "N/A", ""
        else:
            # 20-cell bar, one cell per 0.05 of score.
            score_str, bar = f"{v:.3f}", "█" * int(v * 20)
        print(f" {k:<22} {score_str} {bar}")
    print("="*55)
    print(f"CSV : {out_csv}")
    print(f"JSON: {out_json}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Load .env BEFORE building the parser: the --api default is read from the
    # environment when add_argument() runs, so loading afterwards would ignore
    # a YOULBOT_API_URL defined only in .env.
    from dotenv import load_dotenv

    load_dotenv(ROOT / ".env")

    parser = argparse.ArgumentParser(description="youlbot RAGAS 평가")
    parser.add_argument(
        "--dataset",
        default=str(ROOT / "eval" / "dataset.jsonl"),
        help="평가 데이터셋 경로 (기본: eval/dataset.jsonl)",
    )
    parser.add_argument(
        "--api",
        default=os.getenv("YOULBOT_API_URL", "http://localhost:8000"),
        help="youlbot API URL (기본: http://localhost:8000)",
    )
    args = parser.parse_args()

    # An empty token means no Authorization header is sent with API requests.
    api_token = os.getenv("API_TOKEN", "")

    run(args.dataset, args.api, api_token)
|