Fix RAGAS eval: increase timeout for local LLM, safe score extraction
- RunConfig(timeout=600, max_workers=1): local Qwen3 needs more than 60s/call
- Extract scores from df.mean() instead of result[key] to handle NaN safely

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+18
-5
@@ -90,6 +90,7 @@ async def _collect_answer(api_url: str, token: str, message: str) -> str:
|
||||
if isinstance(payload, str):
|
||||
parts.append(payload)
|
||||
elif isinstance(payload, dict) and payload.get("__done"):
|
||||
await resp.aclose()
|
||||
break
|
||||
return "".join(parts)
|
||||
|
||||
@@ -179,12 +180,17 @@ def run(dataset_path: str, api_url: str, api_token: str) -> None:
|
||||
llm = _build_evaluator_llm()
|
||||
emb = _build_evaluator_embeddings()
|
||||
|
||||
print("\n[RAGAS] 지표 계산 중...")
|
||||
# 로컬 LLM은 응답이 느리므로 타임아웃을 충분히 크게, 병렬 작업 수를 줄임
|
||||
from ragas.run_config import RunConfig
|
||||
run_cfg = RunConfig(timeout=600, max_retries=1, max_workers=1)
|
||||
|
||||
print("\n[RAGAS] 지표 계산 중... (로컬 LLM 사용 시 수 분 소요)")
|
||||
result = evaluate(
|
||||
ds,
|
||||
metrics=[faithfulness, answer_relevancy, context_recall, context_precision],
|
||||
llm=llm,
|
||||
embeddings=emb,
|
||||
run_config=run_cfg,
|
||||
raise_exceptions=False,
|
||||
)
|
||||
|
||||
@@ -198,15 +204,22 @@ def run(dataset_path: str, api_url: str, api_token: str) -> None:
|
||||
df = result.to_pandas()
|
||||
df.to_csv(out_csv, index=False, encoding="utf-8-sig")
|
||||
|
||||
# 점수 추출: to_pandas() 컬럼 평균으로 안전하게 계산 (타임아웃 시 NaN 처리)
|
||||
def _score(col: str) -> float | None:
|
||||
if col not in df.columns:
|
||||
return None
|
||||
val = df[col].dropna().mean()
|
||||
return float(val) if not (val != val) else None # NaN 체크
|
||||
|
||||
summary = {
|
||||
"timestamp": ts,
|
||||
"dataset": dataset_path,
|
||||
"n_samples": len(samples),
|
||||
"scores": {
|
||||
"faithfulness": float(result["faithfulness"]) if "faithfulness" in result else None,
|
||||
"answer_relevancy": float(result["answer_relevancy"]) if "answer_relevancy" in result else None,
|
||||
"context_recall": float(result["context_recall"]) if "context_recall" in result else None,
|
||||
"context_precision": float(result["context_precision"]) if "context_precision" in result else None,
|
||||
"faithfulness": _score("faithfulness"),
|
||||
"answer_relevancy": _score("answer_relevancy"),
|
||||
"context_recall": _score("context_recall"),
|
||||
"context_precision": _score("context_precision"),
|
||||
},
|
||||
}
|
||||
out_json.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||
|
||||
Reference in New Issue
Block a user