From bdb6fd83c4662b22d5c9623862b6ac9d3fcc0aae Mon Sep 17 00:00:00 2001 From: sal Date: Mon, 1 Jun 2026 19:41:32 +0900 Subject: [PATCH] Fix RAGAS eval: increase timeout for local LLM, safe score extraction - RunConfig(timeout=600, max_workers=1): local Qwen3 needs more than 60s/call - Extract scores from df.mean() instead of result[key] to handle NaN safely Co-Authored-By: Claude Sonnet 4.6 --- eval/run_ragas.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/eval/run_ragas.py b/eval/run_ragas.py index 511ff4d..c3a48e2 100644 --- a/eval/run_ragas.py +++ b/eval/run_ragas.py @@ -90,6 +90,7 @@ async def _collect_answer(api_url: str, token: str, message: str) -> str: if isinstance(payload, str): parts.append(payload) elif isinstance(payload, dict) and payload.get("__done"): + await resp.aclose() break return "".join(parts) @@ -179,12 +180,17 @@ def run(dataset_path: str, api_url: str, api_token: str) -> None: llm = _build_evaluator_llm() emb = _build_evaluator_embeddings() - print("\n[RAGAS] 지표 계산 중...") + # 로컬 LLM은 응답이 느리므로 타임아웃을 충분히 크게, 병렬 작업 수를 줄임 + from ragas.run_config import RunConfig + run_cfg = RunConfig(timeout=600, max_retries=1, max_workers=1) + + print("\n[RAGAS] 지표 계산 중... (로컬 LLM 사용 시 수 분 소요)") result = evaluate( ds, metrics=[faithfulness, answer_relevancy, context_recall, context_precision], llm=llm, embeddings=emb, + run_config=run_cfg, raise_exceptions=False, ) @@ -198,15 +204,22 @@ def run(dataset_path: str, api_url: str, api_token: str) -> None: df = result.to_pandas() df.to_csv(out_csv, index=False, encoding="utf-8-sig") + # 점수 추출: to_pandas() 컬럼 평균으로 안전하게 계산 (타임아웃 시 NaN 처리) + def _score(col: str) -> float | None: + if col not in df.columns: + return None + val = df[col].dropna().mean() + return float(val) if not (val != val) else None # NaN 체크 + summary = { "timestamp": ts, "dataset": dataset_path, "n_samples": len(samples), "scores": { - "faithfulness": float(result["faithfulness"]) if "faithfulness" in result else None, - "answer_relevancy": float(result["answer_relevancy"]) if "answer_relevancy" in result else None, - "context_recall": float(result["context_recall"]) if "context_recall" in result else None, - "context_precision": float(result["context_precision"]) if "context_precision" in result else None, + "faithfulness": _score("faithfulness"), + "answer_relevancy": _score("answer_relevancy"), + "context_recall": _score("context_recall"), + "context_precision": _score("context_precision"), }, } out_json.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")