From bdb6fd83c4662b22d5c9623862b6ac9d3fcc0aae Mon Sep 17 00:00:00 2001
From: sal <shinalok357@gmail.com>
Date: Mon, 1 Jun 2026 19:41:32 +0900
Subject: [PATCH] Fix RAGAS eval: increase timeout for local LLM, safe score
 extraction

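- RunConfig(timeout=600, max_retries=1, max_workers=1): local Qwen3 needs
  more than 60s/call, so raise the timeout and run metrics serially
- Extract scores as per-column means of result.to_pandas() (NaN rows
  dropped) instead of result[key], so timed-out samples are handled safely
- Close the streaming response explicitly once the "__done" payload arrives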

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 eval/run_ragas.py | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/eval/run_ragas.py b/eval/run_ragas.py
index 511ff4d..c3a48e2 100644
--- a/eval/run_ragas.py
+++ b/eval/run_ragas.py
@@ -90,6 +90,7 @@ async def _collect_answer(api_url: str, token: str, message: str) -> str:
                 if isinstance(payload, str):
                     parts.append(payload)
                 elif isinstance(payload, dict) and payload.get("__done"):
+                    await resp.aclose()
                     break
     return "".join(parts)
 
@@ -179,12 +180,17 @@ def run(dataset_path: str, api_url: str, api_token: str) -> None:
     llm = _build_evaluator_llm()
     emb = _build_evaluator_embeddings()
 
-    print("\n[RAGAS] 지표 계산 중...")
+    # 로컬 LLM은 응답이 느리므로 타임아웃을 충분히 크게, 병렬 작업 수를 줄임
+    from ragas.run_config import RunConfig
+    run_cfg = RunConfig(timeout=600, max_retries=1, max_workers=1)
+
+    print("\n[RAGAS] 지표 계산 중... (로컬 LLM 사용 시 수 분 소요)")
     result = evaluate(
         ds,
         metrics=[faithfulness, answer_relevancy, context_recall, context_precision],
         llm=llm,
         embeddings=emb,
+        run_config=run_cfg,
         raise_exceptions=False,
     )
 
@@ -198,15 +204,22 @@ def run(dataset_path: str, api_url: str, api_token: str) -> None:
     df = result.to_pandas()
     df.to_csv(out_csv, index=False, encoding="utf-8-sig")
 
+    # 점수 추출: to_pandas() 컬럼 평균으로 안전하게 계산 (타임아웃 시 NaN 처리)
+    def _score(col: str) -> float | None:
+        if col not in df.columns:
+            return None
+        val = df[col].dropna().mean()
+        return float(val) if not (val != val) else None  # NaN 체크
+
     summary = {
         "timestamp": ts,
         "dataset": dataset_path,
         "n_samples": len(samples),
         "scores": {
-            "faithfulness": float(result["faithfulness"]) if "faithfulness" in result else None,
-            "answer_relevancy": float(result["answer_relevancy"]) if "answer_relevancy" in result else None,
-            "context_recall": float(result["context_recall"]) if "context_recall" in result else None,
-            "context_precision": float(result["context_precision"]) if "context_precision" in result else None,
+            "faithfulness": _score("faithfulness"),
+            "answer_relevancy": _score("answer_relevancy"),
+            "context_recall": _score("context_recall"),
+            "context_precision": _score("context_precision"),
         },
     }
     out_json.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")