Fix RAGAS eval: increase timeout for local LLM, safe score extraction

- RunConfig(timeout=600, max_retries=1, max_workers=1): local Qwen3 needs more than 60s/call
- Extract scores from the per-column mean of result.to_pandas() (NaN rows dropped) instead of result[key] to handle NaN safely

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-01 19:41:32 +09:00
parent a2dff825ad
commit bdb6fd83c4
1 changed file with 18 additions and 5 deletions
@@ -90,6 +90,7 @@ async def _collect_answer(api_url: str, token: str, message: str) -> str:
                if isinstance(payload, str):
                    parts.append(payload)
                elif isinstance(payload, dict) and payload.get("__done"):
+                    await resp.aclose()
                    break
    return "".join(parts)

@@ -179,12 +180,17 @@ def run(dataset_path: str, api_url: str, api_token: str) -> None:
    llm = _build_evaluator_llm()
    emb = _build_evaluator_embeddings()

-    print("\n[RAGAS] 지표 계산 중...")
+    # 로컬 LLM은 응답이 느리므로 타임아웃을 충분히 크게, 병렬 작업 수를 줄임
+    from ragas.run_config import RunConfig
+    run_cfg = RunConfig(timeout=600, max_retries=1, max_workers=1)
+
+    print("\n[RAGAS] 지표 계산 중... (로컬 LLM 사용 시 수 분 소요)")
    result = evaluate(
        ds,
        metrics=[faithfulness, answer_relevancy, context_recall, context_precision],
        llm=llm,
        embeddings=emb,
+        run_config=run_cfg,
        raise_exceptions=False,
    )

@@ -198,15 +204,22 @@ def run(dataset_path: str, api_url: str, api_token: str) -> None:
    df = result.to_pandas()
    df.to_csv(out_csv, index=False, encoding="utf-8-sig")

+    # 점수 추출: to_pandas() 컬럼 평균으로 안전하게 계산 (타임아웃 시 NaN 처리)
+    def _score(col: str) -> float | None:
+        if col not in df.columns:
+            return None
+        val = df[col].dropna().mean()
+        return float(val) if not (val != val) else None  # NaN 체크
+
    summary = {
        "timestamp": ts,
        "dataset": dataset_path,
        "n_samples": len(samples),
        "scores": {
-            "faithfulness": float(result["faithfulness"]) if "faithfulness" in result else None,
-            "answer_relevancy": float(result["answer_relevancy"]) if "answer_relevancy" in result else None,
-            "context_recall": float(result["context_recall"]) if "context_recall" in result else None,
-            "context_precision": float(result["context_precision"]) if "context_precision" in result else None,
+            "faithfulness": _score("faithfulness"),
+            "answer_relevancy": _score("answer_relevancy"),
+            "context_recall": _score("context_recall"),
+            "context_precision": _score("context_precision"),
        },
    }
    out_json.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")