Fix RAGAS eval: increase timeout for local LLM, safe score extraction

- RunConfig(timeout=600, max_retries=1, max_workers=1): local Qwen3 needs more than 60s/call
- Extract scores as per-column means of result.to_pandas() (NaN rows dropped) instead of result[key], to handle NaN safely

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
sal
2026-06-01 19:41:32 +09:00
parent a2dff825ad
commit bdb6fd83c4
+18 -5
View File
@@ -90,6 +90,7 @@ async def _collect_answer(api_url: str, token: str, message: str) -> str:
if isinstance(payload, str):
parts.append(payload)
elif isinstance(payload, dict) and payload.get("__done"):
await resp.aclose()
break
return "".join(parts)
@@ -179,12 +180,17 @@ def run(dataset_path: str, api_url: str, api_token: str) -> None:
llm = _build_evaluator_llm()
emb = _build_evaluator_embeddings()
print("\n[RAGAS] 지표 계산 중...")
# 로컬 LLM은 응답이 느리므로 타임아웃을 충분히 크게, 병렬 작업 수를 줄임
from ragas.run_config import RunConfig
run_cfg = RunConfig(timeout=600, max_retries=1, max_workers=1)
print("\n[RAGAS] 지표 계산 중... (로컬 LLM 사용 시 수 분 소요)")
result = evaluate(
ds,
metrics=[faithfulness, answer_relevancy, context_recall, context_precision],
llm=llm,
embeddings=emb,
run_config=run_cfg,
raise_exceptions=False,
)
@@ -198,15 +204,22 @@ def run(dataset_path: str, api_url: str, api_token: str) -> None:
df = result.to_pandas()
df.to_csv(out_csv, index=False, encoding="utf-8-sig")
# Score extraction: average each metric column of the to_pandas() frame so
# that entries left as NaN (e.g. after a timeout) are handled safely.
def _score(col: str) -> float | None:
    """Return the mean of metric column ``col`` in ``df``, or None.

    NaN entries are dropped before averaging. None is returned when the
    column is absent from ``df`` or when no non-NaN value remains.
    """
    import math  # local import: the file's top-level imports are not visible here

    if col not in df.columns:
        return None
    val = df[col].dropna().mean()
    # The mean of an empty selection is NaN; report that as "no score".
    return None if math.isnan(val) else float(val)
summary = {
"timestamp": ts,
"dataset": dataset_path,
"n_samples": len(samples),
"scores": {
"faithfulness": float(result["faithfulness"]) if "faithfulness" in result else None,
"answer_relevancy": float(result["answer_relevancy"]) if "answer_relevancy" in result else None,
"context_recall": float(result["context_recall"]) if "context_recall" in result else None,
"context_precision": float(result["context_precision"]) if "context_precision" in result else None,
"faithfulness": _score("faithfulness"),
"answer_relevancy": _score("answer_relevancy"),
"context_recall": _score("context_recall"),
"context_precision": _score("context_precision"),
},
}
out_json.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")