Files
youlbot/services/rag/ingestion_service.py
T
shinalok 0b50444e43 IDEA-2/1/5/7: 스마트 알림, 대화 기반 RAG, CRAG, 파라미터 자동 튜닝
- IDEA-2 스마트 알림: td_reminders 테이블, set_reminder/list_reminders 도구,
  SchedulerService(asyncio 60초 루프, D-7/D-1/D-0 Telegram push),
  FastAPI lifespan 연동, GET /reminders/{user_id} 엔드포인트

- IDEA-1 대화 기반 RAG: IngestionService.store_text() 추가,
  AgentService._maybe_index_conversation() — 응답 후 LLM 판단 → Qdrant 저장
  (CONV_RAG_ENABLED=true 활성화, background task로 응답 속도 무관)

- IDEA-5 CRAG: AgentState에 crag_fallback_used 플래그 추가,
  crag_check LangGraph 노드 — search_documents 결과 없으면 web_search 자동 주입,
  route_after_crag으로 fallback 1회 루프 제어 (CRAG_ENABLED=true 활성화)

- IDEA-7 RAG Auto-Eval: eval/auto_tune.py — API 서버 없이 파라미터 조합별
  context_precision/recall 비교, 최적 설정 추천

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-04 10:04:05 +09:00

97 lines
3.9 KiB
Python

import logging

from langchain_community.document_loaders import PDFPlumberLoader, TextLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_qdrant import QdrantVectorStore, RetrievalMode
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue, FilterSelector
class IngestionService:
    """Ingestion pipeline that splits documents into semantic chunks and stores them in Qdrant.

    Files are loaded with ``PDFPlumberLoader`` (for ``.pdf``) or ``TextLoader``
    (everything else, read as UTF-8), split with ``SemanticChunker`` and written
    to the configured collection through ``QdrantVectorStore.from_documents``.
    When ``sparse_embeddings`` is supplied, documents are stored in
    ``RetrievalMode.HYBRID``.
    """

    _logger = logging.getLogger(__name__)

    def __init__(
        self,
        embeddings,
        qdrant_url: str,
        collection_name: str,
        breakpoint_threshold_type: str = "percentile",
        buffer_size: int = 1,
        sparse_embeddings=None,
    ):
        """Configure the splitter and the Qdrant client.

        Args:
            embeddings: Embedding model handed to both ``SemanticChunker`` and
                ``QdrantVectorStore`` (presumably a LangChain ``Embeddings``
                implementation -- confirm against the caller).
            qdrant_url: URL of the Qdrant server.
            collection_name: Name of the collection to read from and write to.
            breakpoint_threshold_type: Forwarded unchanged to ``SemanticChunker``.
            buffer_size: Forwarded unchanged to ``SemanticChunker``.
            sparse_embeddings: Optional sparse embedding model; a truthy value
                switches storage to hybrid retrieval mode.
        """
        self._embeddings = embeddings
        self._qdrant_url = qdrant_url
        self._collection_name = collection_name
        self._sparse_embeddings = sparse_embeddings
        self._splitter = SemanticChunker(
            embeddings=embeddings,
            breakpoint_threshold_type=breakpoint_threshold_type,
            buffer_size=buffer_size,
        )
        self._client = QdrantClient(url=qdrant_url)

    def _ensure_collection_schema(self) -> None:
        """Drop the collection if hybrid mode is on but it has no sparse vector config.

        Deleting the collection lets the next ``from_documents`` call create it
        again. Does nothing when no sparse embeddings are configured. Any error
        (for example, the collection does not exist yet) is logged and ignored
        so that ingestion stays best-effort.
        """
        if not self._sparse_embeddings:
            return
        try:
            info = self._client.get_collection(self._collection_name)
            if not info.config.params.sparse_vectors:
                self._logger.warning(
                    "[Hybrid] '%s' 컬렉션에 sparse vector 설정이 없어 재생성합니다.",
                    self._collection_name,
                )
                self._client.delete_collection(self._collection_name)
        except Exception:
            # Typically the collection is simply missing; keep the traceback
            # available for the cases where it is something else.
            self._logger.debug(
                "Schema check skipped for collection '%s'",
                self._collection_name,
                exc_info=True,
            )

    def _delete_by_source(self, source_path: str) -> None:
        """Delete every stored point whose ``metadata.source`` equals ``source_path``.

        Errors are logged and ignored; on the very first ingestion the
        collection does not exist yet and the delete is expected to fail.
        """
        selector = FilterSelector(
            filter=Filter(
                must=[
                    FieldCondition(
                        key="metadata.source",
                        match=MatchValue(value=source_path),
                    )
                ]
            )
        )
        try:
            self._client.delete(
                collection_name=self._collection_name,
                points_selector=selector,
            )
        except Exception:
            self._logger.debug(
                "Could not delete existing chunks for source '%s'",
                source_path,
                exc_info=True,
            )

    @staticmethod
    def _load_file(path: str) -> list:
        """Load one file into documents, choosing the loader by file extension."""
        # Compare case-insensitively so "REPORT.PDF" is not parsed as plain text.
        if path.lower().endswith(".pdf"):
            loader = PDFPlumberLoader(path)
        else:
            loader = TextLoader(path, encoding="utf-8")
        return loader.load()

    def _store_documents(self, documents: list) -> None:
        """Write ``documents`` to the collection, in hybrid mode when sparse embeddings are set."""
        kwargs = {
            "documents": documents,
            "embedding": self._embeddings,
            "url": self._qdrant_url,
            "collection_name": self._collection_name,
        }
        if self._sparse_embeddings:
            kwargs["sparse_embedding"] = self._sparse_embeddings
            kwargs["retrieval_mode"] = RetrievalMode.HYBRID
        QdrantVectorStore.from_documents(**kwargs)

    def store_text(self, text: str, metadata: dict) -> None:
        """Store a single text in Qdrant as one document, without semantic chunking.

        Args:
            text: Content to store. Empty or whitespace-only text is skipped.
            metadata: Metadata attached to the stored document.
        """
        if not text or not text.strip():
            self._logger.debug("store_text called with empty text; nothing stored")
            return

        from langchain_core.documents import Document

        self._store_documents([Document(page_content=text, metadata=metadata)])

    def ingest(self, file_paths: list[str]) -> int:
        """Load, chunk and store the given files, replacing their earlier chunks.

        Args:
            file_paths: Paths of the files to ingest. ``.pdf`` files (any letter
                case) are read with ``PDFPlumberLoader``; all others are read as
                UTF-8 text.

        Returns:
            The number of chunks produced by the splitter (0 for an empty list).

        Raises:
            Whatever the loaders, the splitter or ``QdrantVectorStore`` raise;
            those errors are not caught here.
        """
        if not file_paths:
            return 0

        # Load and split everything before modifying Qdrant: if a file is
        # missing or unreadable, the previously stored chunks stay intact.
        docs = []
        for path in file_paths:
            docs.extend(self._load_file(path))
        chunks = self._splitter.split_documents(docs)

        self._ensure_collection_schema()
        for path in file_paths:
            self._delete_by_source(path)
        self._store_documents(chunks)
        return len(chunks)