import logging

from langchain_community.document_loaders import PDFPlumberLoader, TextLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.models import FieldCondition, Filter, FilterSelector, MatchValue

logger = logging.getLogger(__name__)


class IngestionService:
    """Ingestion pipeline that splits documents into semantic chunks and stores them in Qdrant."""

    def __init__(
        self,
        embeddings,
        qdrant_url: str,
        collection_name: str,
        breakpoint_threshold_type: str = "percentile",
        buffer_size: int = 1,
    ):
        """Set up the semantic splitter and the Qdrant client.

        Args:
            embeddings: Embedding model object; passed to ``SemanticChunker``
                and later to ``QdrantVectorStore.from_documents``.
            qdrant_url: URL of the Qdrant server.
            collection_name: Name of the collection chunks are written to.
            breakpoint_threshold_type: Forwarded to ``SemanticChunker``.
            buffer_size: Forwarded to ``SemanticChunker``.
        """
        self._embeddings = embeddings
        self._qdrant_url = qdrant_url
        self._collection_name = collection_name
        self._splitter = SemanticChunker(
            embeddings=embeddings,
            breakpoint_threshold_type=breakpoint_threshold_type,
            buffer_size=buffer_size,
        )
        self._client = QdrantClient(url=qdrant_url)

    def _delete_by_source(self, source_path: str) -> None:
        """Delete every stored point whose ``metadata.source`` equals *source_path*.

        Deletion is best-effort: a missing collection (first ingestion) is
        skipped quietly, and any other failure is logged with its traceback
        instead of being raised, so ingestion can still proceed.

        Args:
            source_path: File path to match against the ``metadata.source``
                payload field.
        """
        try:
            # Expected on the very first ingestion: nothing to delete yet.
            if not self._client.collection_exists(self._collection_name):
                return
            self._client.delete(
                collection_name=self._collection_name,
                points_selector=FilterSelector(
                    filter=Filter(
                        must=[
                            FieldCondition(
                                key="metadata.source",
                                match=MatchValue(value=source_path),
                            )
                        ]
                    )
                ),
            )
        except Exception:
            # Keep the original non-raising contract, but make the failure
            # visible: stale chunks may remain and show up as duplicates.
            logger.warning(
                "Failed to delete existing chunks for %r from collection %r; "
                "continuing ingestion",
                source_path,
                self._collection_name,
                exc_info=True,
            )

    def ingest(self, file_paths: list[str]) -> int:
        """Load, chunk and store the given files, replacing their earlier chunks.

        Files whose name ends in ``.pdf`` (case-insensitive) are read with
        ``PDFPlumberLoader``; everything else is read as UTF-8 text with
        ``TextLoader``. All files are loaded and split *before* any existing
        chunks are deleted, so a loader or splitter error does not wipe
        previously stored data.

        Args:
            file_paths: Paths of the files to ingest. Repeated paths are
                processed once.

        Returns:
            The number of chunks produced (0 when there was nothing to store).
        """
        # Drop repeated paths (order preserved) so a file is not stored twice.
        unique_paths = list(dict.fromkeys(file_paths))

        docs = []
        for path in unique_paths:
            if path.lower().endswith(".pdf"):
                loader = PDFPlumberLoader(path)
            else:
                loader = TextLoader(path, encoding="utf-8")
            docs.extend(loader.load())

        chunks = self._splitter.split_documents(docs)

        # Only now remove stale chunks: loading and splitting have succeeded.
        # NOTE(review): assumes the loaders record the given path under
        # metadata["source"] -- confirm against the loader versions in use.
        for path in unique_paths:
            self._delete_by_source(path)

        if not chunks:
            return 0

        QdrantVectorStore.from_documents(
            documents=chunks,
            embedding=self._embeddings,
            url=self._qdrant_url,
            collection_name=self._collection_name,
        )
        return len(chunks)