Files
youlbot/services/rag/ingestion_service.py
T
shinalok 145b0cc96f Implement Phase 12 feedback, Phase 13 Semantic Chunker, Phase 13-B Reranker, Bug 5 thinking fix
- Phase 12: FeedbackRepository + td_feedback 테이블, Gradio 👍/👎 이벤트, run_id 추적, LangSmith create_feedback() 연동
- Phase 13: 커스텀 _SemanticSplitter 제거 → langchain_experimental.SemanticChunker 교체, buffer_size/threshold_type 환경변수 적용
- Phase 13-B: RerankService (Cross-Encoder), RetrieverService.search()에 reranker 통합, tools.py as_retriever() → search() 전환
- Bug 5: mlx_chat_model enable_thinking 런타임 오버라이드, agent_service stream_mode=["messages","custom"] 이중 스트림, thinking 토큰 custom 이벤트로 emit
- ROADMAP: LLM 모델명 8B 반영, RAG에 Reranker 추가, 추천 진행 순서 갱신

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-29 17:41:36 +09:00

63 lines
2.3 KiB
Python

import logging

from langchain_community.document_loaders import PDFPlumberLoader, TextLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue, FilterSelector
class IngestionService:
    """Ingestion pipeline that splits documents into semantic chunks and stores them in Qdrant.

    Each file is loaded, split with ``SemanticChunker`` and written to the
    configured Qdrant collection. Chunks previously stored for the same file
    path are removed first so that re-ingesting a file does not duplicate it.
    """

    def __init__(
        self,
        embeddings,
        qdrant_url: str,
        collection_name: str,
        breakpoint_threshold_type: str = "percentile",
        buffer_size: int = 1,
    ):
        """Build the splitter and the Qdrant client.

        Args:
            embeddings: Embedding model handed to both ``SemanticChunker`` and
                ``QdrantVectorStore``.
            qdrant_url: URL of the Qdrant server.
            collection_name: Name of the collection that receives the chunks.
            breakpoint_threshold_type: Forwarded unchanged to ``SemanticChunker``.
            buffer_size: Forwarded unchanged to ``SemanticChunker``.
        """
        self._embeddings = embeddings
        self._qdrant_url = qdrant_url
        self._collection_name = collection_name
        self._splitter = SemanticChunker(
            embeddings=embeddings,
            breakpoint_threshold_type=breakpoint_threshold_type,
            buffer_size=buffer_size,
        )
        self._client = QdrantClient(url=qdrant_url)

    @staticmethod
    def _is_pdf(path: str) -> bool:
        """Return True when *path* has a ``.pdf`` extension, ignoring case."""
        return path.lower().endswith(".pdf")

    def _load_documents(self, path: str) -> list:
        """Load one file: PDFs via ``PDFPlumberLoader``, anything else as UTF-8 text."""
        if self._is_pdf(path):
            loader = PDFPlumberLoader(path)
        else:
            loader = TextLoader(path, encoding="utf-8")
        return loader.load()

    def _delete_by_source(self, source_path: str) -> None:
        """Delete every stored point whose ``metadata.source`` equals *source_path*.

        The deletion is best-effort: a failure is logged and then ignored, so
        a missing collection (the first ingestion) does not abort the run.
        """
        # NOTE(review): presumably the loaders record the file path under
        # metadata["source"]; confirm against the loader versions in use.
        selector = FilterSelector(
            filter=Filter(
                must=[
                    FieldCondition(
                        key="metadata.source",
                        match=MatchValue(value=source_path),
                    )
                ]
            )
        )
        try:
            self._client.delete(
                collection_name=self._collection_name,
                points_selector=selector,
            )
        except Exception as exc:
            # Expected when the collection does not exist yet (first
            # ingestion). Other failures are surfaced in the log rather than
            # silently dropped, but still do not interrupt ingestion.
            logging.getLogger(__name__).warning(
                "Could not delete existing chunks for %r from collection %r: %s",
                source_path,
                self._collection_name,
                exc,
            )

    def ingest(self, file_paths: list[str]) -> int:
        """Ingest the given files and return the number of chunks stored.

        Args:
            file_paths: Paths of the files to ingest. A path listed more than
                once is ingested a single time.

        Returns:
            The number of chunks written to Qdrant; 0 when nothing was produced.

        Raises:
            Whatever the loaders, the splitter or the vector store raise. The
            loaders and splitter run before any existing data is deleted.
        """
        # Drop repeated paths (keeping order) so one file is not stored twice.
        unique_paths = list(dict.fromkeys(file_paths))

        docs = []
        for path in unique_paths:
            docs.extend(self._load_documents(path))

        chunks = self._splitter.split_documents(docs) if docs else []

        # Remove stale chunks only after loading and splitting succeeded, so a
        # broken file cannot wipe the data that is already stored for it.
        for path in unique_paths:
            self._delete_by_source(path)

        if not chunks:
            return 0

        QdrantVectorStore.from_documents(
            documents=chunks,
            embedding=self._embeddings,
            url=self._qdrant_url,
            collection_name=self._collection_name,
        )
        return len(chunks)