# youlbot/services/rag/ingestion_service.py

from langchain_community.document_loaders import PDFPlumberLoader, TextLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue, FilterSelector


class IngestionService:
    """Ingestion pipeline that splits documents into semantic chunks and stores them in Qdrant.

    Re-ingesting a file replaces its previously stored chunks: points whose
    ``metadata.source`` payload equals the file path are deleted before the
    freshly produced chunks are inserted.
    """

    def __init__(
        self,
        embeddings,
        qdrant_url: str,
        collection_name: str,
        breakpoint_threshold_type: str = "percentile",
        buffer_size: int = 1,
    ):
        """Set up the semantic splitter and the Qdrant client.

        Args:
            embeddings: Embedding model handed both to ``SemanticChunker`` and
                to ``QdrantVectorStore.from_documents``.
            qdrant_url: URL of the Qdrant server.
            collection_name: Name of the collection that receives the chunks.
            breakpoint_threshold_type: Forwarded unchanged to
                ``SemanticChunker``.
            buffer_size: Forwarded unchanged to ``SemanticChunker``.
        """
        self._embeddings = embeddings
        self._qdrant_url = qdrant_url
        self._collection_name = collection_name
        self._splitter = SemanticChunker(
            embeddings=embeddings,
            breakpoint_threshold_type=breakpoint_threshold_type,
            buffer_size=buffer_size,
        )
        self._client = QdrantClient(url=qdrant_url)

    def _delete_by_source(self, source_path: str) -> None:
        """Delete every stored chunk whose ``metadata.source`` equals *source_path*.

        A missing collection (the very first ingestion) is treated as
        "nothing to delete". Any other failure is allowed to propagate:
        silently ignoring it would let the subsequent insert add duplicates
        of chunks that were never removed.

        Args:
            source_path: File path to match against the ``metadata.source``
                payload field.

        Raises:
            Exception: Whatever the Qdrant client raises when the existence
                check or the delete request fails.
        """
        if not self._client.collection_exists(self._collection_name):
            return

        same_source = Filter(
            must=[
                FieldCondition(
                    key="metadata.source",
                    match=MatchValue(value=source_path),
                )
            ]
        )
        self._client.delete(
            collection_name=self._collection_name,
            points_selector=FilterSelector(filter=same_source),
        )

    @staticmethod
    def _load_file(path: str) -> list:
        """Load one file with the PDF loader or, for anything else, the UTF-8 text loader."""
        # Case-insensitive so that "REPORT.PDF" is not parsed as plain text.
        if path.lower().endswith(".pdf"):
            loader = PDFPlumberLoader(path)
        else:
            loader = TextLoader(path, encoding="utf-8")
        return loader.load()

    def ingest(self, file_paths: list[str]) -> int:
        """Load, chunk and store the given files, replacing their old chunks.

        All files are loaded and split *before* anything is deleted, so a
        file that cannot be read or chunked does not wipe out data that is
        already stored.

        Args:
            file_paths: Paths of the files to ingest. Paths ending in
                ``.pdf`` (any letter case) go through ``PDFPlumberLoader``;
                everything else is read as UTF-8 text. Duplicate paths are
                ingested once.

        Returns:
            Number of chunks written to the collection (0 when there was
            nothing to ingest).

        Raises:
            Exception: Errors from the loaders, the splitter or Qdrant are
                propagated unchanged.
        """
        # Order-preserving de-duplication: a repeated path would otherwise be
        # loaded and stored twice.
        unique_paths = list(dict.fromkeys(file_paths))
        if not unique_paths:
            return 0

        docs = []
        for path in unique_paths:
            docs.extend(self._load_file(path))
        chunks = self._splitter.split_documents(docs)

        # Only now that every file has been read and chunked successfully is
        # it safe to drop the previous version of each file.
        # NOTE(review): assumes the loaders record the path they were given
        # under metadata["source"]; confirm against the loader versions in use.
        for path in unique_paths:
            self._delete_by_source(path)

        if not chunks:
            return 0

        QdrantVectorStore.from_documents(
            documents=chunks,
            embedding=self._embeddings,
            url=self._qdrant_url,
            collection_name=self._collection_name,
        )
        return len(chunks)