0b50444e43
- IDEA-2 스마트 알림: td_reminders 테이블, set_reminder/list_reminders 도구,
SchedulerService(asyncio 60초 루프, D-7/D-1/D-0 Telegram push),
FastAPI lifespan 연동, GET /reminders/{user_id} 엔드포인트
- IDEA-1 대화 기반 RAG: IngestionService.store_text() 추가,
AgentService._maybe_index_conversation() — 응답 후 LLM 판단 → Qdrant 저장
(CONV_RAG_ENABLED=true 활성화, background task로 응답 속도 무관)
- IDEA-5 CRAG: AgentState에 crag_fallback_used 플래그 추가,
crag_check LangGraph 노드 — search_documents 결과 없으면 web_search 자동 주입,
route_after_crag으로 fallback 1회 루프 제어 (CRAG_ENABLED=true 활성화)
- IDEA-7 RAG Auto-Eval: eval/auto_tune.py — API 서버 없이 파라미터 조합별
context_precision/recall 비교, 최적 설정 추천
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
97 lines
3.9 KiB
Python
97 lines
3.9 KiB
Python
from langchain_community.document_loaders import PDFPlumberLoader, TextLoader
|
|
from langchain_experimental.text_splitter import SemanticChunker
|
|
from langchain_qdrant import QdrantVectorStore, RetrievalMode
|
|
from qdrant_client import QdrantClient
|
|
from qdrant_client.models import Filter, FieldCondition, MatchValue, FilterSelector
|
|
|
|
|
|
class IngestionService:
    """Ingestion pipeline that splits documents into semantic chunks and stores them in Qdrant."""

    def __init__(
        self,
        embeddings,
        qdrant_url: str,
        collection_name: str,
        breakpoint_threshold_type: str = "percentile",
        buffer_size: int = 1,
        sparse_embeddings=None,
    ):
        """Set up the semantic chunker and the Qdrant client.

        Args:
            embeddings: Dense embedding model; used by the semantic chunker
                and when writing documents to the vector store.
            qdrant_url: URL of the Qdrant server.
            collection_name: Name of the collection that receives the chunks.
            breakpoint_threshold_type: Passed through to ``SemanticChunker``.
            buffer_size: Passed through to ``SemanticChunker``.
            sparse_embeddings: Optional sparse embedding model. When truthy,
                documents are written with ``RetrievalMode.HYBRID``.
        """
        self._embeddings = embeddings
        self._qdrant_url = qdrant_url
        self._collection_name = collection_name
        self._sparse_embeddings = sparse_embeddings
        self._splitter = SemanticChunker(
            embeddings=embeddings,
            breakpoint_threshold_type=breakpoint_threshold_type,
            buffer_size=buffer_size,
        )
        self._client = QdrantClient(url=qdrant_url)

    @staticmethod
    def _is_pdf(path: str) -> bool:
        """Return True when *path* has a ``.pdf`` extension, ignoring case."""
        return path.lower().endswith(".pdf")

    @staticmethod
    def _log_ignored(action: str) -> None:
        """Record the exception currently being handled for a best-effort step.

        Must be called from inside an ``except`` block. Logged at debug level
        because a missing collection is the expected case on first ingestion.
        """
        import logging  # local import: keeps this class self-contained

        logging.getLogger(__name__).debug(
            "%s failed; continuing (best-effort)", action, exc_info=True
        )

    def _ensure_collection_schema(self) -> None:
        """Drop the collection if hybrid mode is on but it has no sparse vector config.

        Deleting the collection is meant to let the next write recreate it
        (presumably ``QdrantVectorStore.from_documents`` creates a missing
        collection with the requested schema -- confirm against the library).
        Does nothing when no sparse embeddings are configured. Any failure
        (e.g. the collection does not exist yet) is logged and ignored.
        """
        if not self._sparse_embeddings:
            return
        try:
            info = self._client.get_collection(self._collection_name)
            if not info.config.params.sparse_vectors:
                print(f"[Hybrid] '{self._collection_name}' 컬렉션에 sparse vector 설정이 없어 재생성합니다.")
                self._client.delete_collection(self._collection_name)
        except Exception:
            # Typically the collection does not exist yet; stay best-effort
            # but leave a trace instead of silently discarding the error.
            self._log_ignored(
                f"schema check for collection '{self._collection_name}'"
            )

    def _delete_by_source(self, source_path: str) -> None:
        """Delete every stored chunk whose ``metadata.source`` equals *source_path*.

        Failures (e.g. the collection does not exist on the very first
        ingestion) are logged and ignored.
        """
        try:
            self._client.delete(
                collection_name=self._collection_name,
                points_selector=FilterSelector(
                    filter=Filter(
                        must=[
                            FieldCondition(
                                key="metadata.source",
                                match=MatchValue(value=source_path),
                            )
                        ]
                    )
                ),
            )
        except Exception:
            # Collection missing on first ingestion is expected; log and go on.
            self._log_ignored(f"deleting existing chunks for '{source_path}'")

    def _store_documents(self, documents: list) -> None:
        """Write *documents* to the configured collection.

        Uses dense embeddings only, or hybrid (dense + sparse) retrieval mode
        when sparse embeddings were supplied to the constructor.
        """
        kwargs = {
            "documents": documents,
            "embedding": self._embeddings,
            "url": self._qdrant_url,
            "collection_name": self._collection_name,
        }
        if self._sparse_embeddings:
            kwargs["sparse_embedding"] = self._sparse_embeddings
            kwargs["retrieval_mode"] = RetrievalMode.HYBRID
        QdrantVectorStore.from_documents(**kwargs)

    def store_text(self, text: str, metadata: dict) -> None:
        """Store a single text in Qdrant directly, without semantic chunking.

        Args:
            text: Content stored as one document.
            metadata: Metadata attached to that document.
        """
        from langchain_core.documents import Document

        self._store_documents([Document(page_content=text, metadata=metadata)])

    def ingest(self, file_paths: list[str]) -> int:
        """Load, chunk and store the given files, replacing their previous chunks.

        Files ending in ``.pdf`` (case-insensitive) are read with
        ``PDFPlumberLoader``; everything else is read as UTF-8 text.

        Args:
            file_paths: Paths of the files to ingest.

        Returns:
            The number of chunks produced by the splitter and handed to the
            vector store.
        """
        loaded = []
        for path in file_paths:
            if self._is_pdf(path):
                loader = PDFPlumberLoader(path)
            else:
                loader = TextLoader(path, encoding="utf-8")
            loaded.extend(loader.load())
        chunks = self._splitter.split_documents(loaded)

        # Destructive steps run only after loading and splitting succeeded,
        # so an unreadable file cannot wipe previously stored chunks.
        self._ensure_collection_schema()
        for path in file_paths:
            self._delete_by_source(path)

        self._store_documents(chunks)
        return len(chunks)
|