"""youlbot RAGAS 평가 스크립트 (Phase 20) 실행: cd /path/to/youlbot python eval/run_ragas.py [--dataset eval/dataset.jsonl] [--api http://localhost:8000] 결과: eval/results/report_YYYYMMDD_HHMMSS.csv 사전 조건: - youlbot API 서버 실행 중 (uvicorn api:app --port 8000) - Qdrant + MySQL 접근 가능 - .env에 API_TOKEN, RAG_SHOW_SOURCES=true 설정 평가 지표: - faithfulness : 답변이 검색 컨텍스트에 충실한가 (환각 탐지) - answer_relevancy : 답변이 질문에 얼마나 관련 있는가 - context_recall : 컨텍스트가 정답에 필요한 정보를 포함하는가 - context_precision : 검색된 컨텍스트 중 실제 유용한 비율 참고: 평가에 로컬 LLM(Qwen3)을 사용하므로 결과 신뢰도는 모델 크기에 의존합니다. 더 정확한 평가를 원하면 OPENAI_API_KEY 또는 ANTHROPIC_API_KEY를 설정하세요. """ from __future__ import annotations import argparse import asyncio import json import os import sys from datetime import datetime from pathlib import Path # ── Compatibility shim ─────────────────────────────────────────────────────── # ragas 0.2.x imports langchain_community.chat_models.vertexai which was # removed in langchain-community 0.4+. Re-export from langchain-google-vertexai. try: import langchain_community.chat_models.vertexai # noqa: F401 except ModuleNotFoundError: try: from langchain_google_vertexai import ChatVertexAI as _CV _stub = type(sys)("langchain_community.chat_models.vertexai") _stub.ChatVertexAI = _CV sys.modules["langchain_community.chat_models.vertexai"] = _stub except ImportError: # vertexai not available — inject an empty stub (unused by our eval) _stub = type(sys)("langchain_community.chat_models.vertexai") _stub.ChatVertexAI = object sys.modules["langchain_community.chat_models.vertexai"] = _stub from ragas import evaluate from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness from ragas.embeddings import LangchainEmbeddingsWrapper from ragas.llms import LangchainLLMWrapper from datasets import Dataset # ── Project path ───────────────────────────────────────────────────────────── ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(ROOT)) os.chdir(ROOT) # .env 읽기 위해 프로젝트 루트로 이동 from container import container # noqa: E402 (after sys.path setup) # ── Answer collection via API ──────────────────────────────────────────────── async def _collect_answer(api_url: str, token: str, message: str) -> str: """youlbot /chat SSE 스트림에서 순수 답변 텍스트만 수집.""" import httpx headers = {"Authorization": f"Bearer {token}"} if token else {} parts: list[str] = [] async with httpx.AsyncClient(timeout=180) as client: async with client.stream( "POST", f"{api_url}/chat", json={"message": message, "user_id": "eval", "show_thinking": False}, headers=headers, ) as resp: resp.raise_for_status() async for line in resp.aiter_lines(): if not line.startswith("data: "): continue payload = json.loads(line[6:]) if isinstance(payload, str): parts.append(payload) elif isinstance(payload, dict) and payload.get("__done"): break return "".join(parts) def collect_answer(api_url: str, token: str, message: str) -> str: return asyncio.run(_collect_answer(api_url, token, message)) # ── Evaluator LLM 선택 ──────────────────────────────────────────────────────── def _build_evaluator_llm(): """평가용 LLM: OpenAI > Anthropic > 로컬 MLX 순으로 시도.""" if os.getenv("OPENAI_API_KEY"): from langchain_openai import ChatOpenAI print("[RAGAS] 평가 LLM: OpenAI GPT-4o-mini") return LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini", temperature=0)) if os.getenv("ANTHROPIC_API_KEY"): from langchain_anthropic import ChatAnthropic print("[RAGAS] 평가 LLM: Anthropic Claude Haiku") return LangchainLLMWrapper( ChatAnthropic(model="claude-haiku-4-5-20251001", temperature=0) ) print("[RAGAS] 평가 LLM: 로컬 Qwen3 (신뢰도 제한적)") return LangchainLLMWrapper(container.chat_model()) def _build_evaluator_embeddings(): return LangchainEmbeddingsWrapper(container.embeddings()) # ── Main ────────────────────────────────────────────────────────────────────── def run(dataset_path: str, api_url: str, api_token: str) -> None: # 1. 데이터셋 로드 samples = [] with open(dataset_path, encoding="utf-8") as f: for line in f: line = line.strip() if line: samples.append(json.loads(line)) if not samples: print(f"[오류] 데이터셋이 비어 있습니다: {dataset_path}") sys.exit(1) print(f"[RAGAS] 평가 시작 — {len(samples)}개 질문, API: {api_url}") # 2. RetrieverService 초기화 retriever = container.retriever_service() # 3. 질문별 context + answer 수집 questions: list[str] = [] answers: list[str] = [] contexts: list[list[str]] = [] ground_truths: list[str] = [] for i, sample in enumerate(samples, 1): q = sample["question"] gt = sample["ground_truth"] print(f"\n[{i}/{len(samples)}] {q[:50]}...") docs = retriever.search(q) ctxs = [doc.page_content for doc in docs] print(f" 컨텍스트: {len(ctxs)}개 청크") answer = collect_answer(api_url, api_token, q) print(f" 답변: {len(answer)}자") questions.append(q) answers.append(answer) contexts.append(ctxs) ground_truths.append(gt) # 4. RAGAS Dataset ds = Dataset.from_dict( { "question": questions, "answer": answers, "contexts": contexts, "ground_truth": ground_truths, } ) # 5. 평가 실행 llm = _build_evaluator_llm() emb = _build_evaluator_embeddings() print("\n[RAGAS] 지표 계산 중...") result = evaluate( ds, metrics=[faithfulness, answer_relevancy, context_recall, context_precision], llm=llm, embeddings=emb, raise_exceptions=False, ) # 6. 결과 출력 및 저장 ts = datetime.now().strftime("%Y%m%d_%H%M%S") results_dir = ROOT / "eval" / "results" results_dir.mkdir(exist_ok=True) out_csv = results_dir / f"report_{ts}.csv" out_json = results_dir / f"report_{ts}.json" df = result.to_pandas() df.to_csv(out_csv, index=False, encoding="utf-8-sig") summary = { "timestamp": ts, "dataset": dataset_path, "n_samples": len(samples), "scores": { "faithfulness": float(result["faithfulness"]) if "faithfulness" in result else None, "answer_relevancy": float(result["answer_relevancy"]) if "answer_relevancy" in result else None, "context_recall": float(result["context_recall"]) if "context_recall" in result else None, "context_precision": float(result["context_precision"]) if "context_precision" in result else None, }, } out_json.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") print(f"\n{'='*55}") print("RAGAS 평가 결과") print("="*55) for k, v in summary["scores"].items(): bar = "█" * int((v or 0) * 20) if v is not None else "" score_str = f"{v:.3f}" if v is not None else "N/A" print(f" {k:<22} {score_str} {bar}") print("="*55) print(f"CSV : {out_csv}") print(f"JSON: {out_json}") if __name__ == "__main__": parser = argparse.ArgumentParser(description="youlbot RAGAS 평가") parser.add_argument( "--dataset", default=str(ROOT / "eval" / "dataset.jsonl"), help="평가 데이터셋 경로 (기본: eval/dataset.jsonl)", ) parser.add_argument( "--api", default=os.getenv("YOULBOT_API_URL", "http://localhost:8000"), help="youlbot API URL (기본: http://localhost:8000)", ) args = parser.parse_args() from dotenv import load_dotenv load_dotenv(ROOT / ".env") api_token = os.getenv("API_TOKEN", "") run(args.dataset, args.api, api_token)