145b0cc96f
- Phase 12: FeedbackRepository + td_feedback 테이블, Gradio 👍/👎 이벤트, run_id 추적, LangSmith create_feedback() 연동 - Phase 13: 커스텀 _SemanticSplitter 제거 → langchain_experimental.SemanticChunker 교체, buffer_size/threshold_type 환경변수 적용 - Phase 13-B: RerankService (Cross-Encoder), RetrieverService.search()에 reranker 통합, tools.py as_retriever() → search() 전환 - Bug 5: mlx_chat_model enable_thinking 런타임 오버라이드, agent_service stream_mode=["messages","custom"] 이중 스트림, thinking 토큰 custom 이벤트로 emit - ROADMAP: LLM 모델명 8B 반영, RAG에 Reranker 추가, 추천 진행 순서 갱신 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
63 lines
2.3 KiB
Python
63 lines
2.3 KiB
Python
from langchain_community.document_loaders import PDFPlumberLoader, TextLoader
|
|
from langchain_experimental.text_splitter import SemanticChunker
|
|
from langchain_qdrant import QdrantVectorStore
|
|
from qdrant_client import QdrantClient
|
|
from qdrant_client.models import Filter, FieldCondition, MatchValue, FilterSelector
|
|
|
|
|
|
class IngestionService:
    """Ingestion pipeline that splits documents into semantic chunks and stores them in Qdrant.

    Re-ingesting a file replaces what was stored for it before: points whose
    ``metadata.source`` payload equals the file path are deleted before the
    freshly produced chunks are written.
    """

    def __init__(
        self,
        embeddings,
        qdrant_url: str,
        collection_name: str,
        breakpoint_threshold_type: str = "percentile",
        buffer_size: int = 1,
    ):
        """Build the semantic splitter and the Qdrant client.

        Args:
            embeddings: Embedding model object. It is handed both to
                ``SemanticChunker`` (to find chunk boundaries) and to
                ``QdrantVectorStore.from_documents`` (to embed the chunks).
            qdrant_url: URL of the Qdrant server.
            collection_name: Name of the collection that receives the chunks.
            breakpoint_threshold_type: Forwarded unchanged to
                ``SemanticChunker``.
            buffer_size: Forwarded unchanged to ``SemanticChunker``.
        """
        self._embeddings = embeddings
        self._qdrant_url = qdrant_url
        self._collection_name = collection_name
        self._splitter = SemanticChunker(
            embeddings=embeddings,
            breakpoint_threshold_type=breakpoint_threshold_type,
            buffer_size=buffer_size,
        )
        self._client = QdrantClient(url=qdrant_url)

    def _delete_by_source(self, source_path: str) -> None:
        """Delete every stored chunk whose ``metadata.source`` equals *source_path*.

        A missing collection (the very first ingestion) is an expected state
        and is skipped explicitly. Any other failure is allowed to propagate:
        silently ignoring it would leave stale chunks behind, and the
        following insert would then store duplicates for the same source.

        Args:
            source_path: File path exactly as it was recorded in the chunk
                metadata at ingestion time.
        """
        if not self._client.collection_exists(self._collection_name):
            # Nothing has been ingested yet, so there is nothing to delete.
            return

        same_source = Filter(
            must=[
                FieldCondition(
                    key="metadata.source",
                    match=MatchValue(value=source_path),
                )
            ]
        )
        self._client.delete(
            collection_name=self._collection_name,
            points_selector=FilterSelector(filter=same_source),
        )

    def _load(self, path: str) -> list:
        """Load one file with the loader that matches its extension.

        Files ending in ``.pdf`` (case-insensitive) go through
        ``PDFPlumberLoader``; everything else is read as UTF-8 text.
        """
        if path.lower().endswith(".pdf"):
            loader = PDFPlumberLoader(path)
        else:
            loader = TextLoader(path, encoding="utf-8")
        return loader.load()

    def ingest(self, file_paths: list[str]) -> int:
        """Load, chunk and store the given files, replacing their old chunks.

        All files are loaded and split *before* anything is deleted, so a
        file that cannot be read or split aborts the call without wiping the
        chunks that are already stored.

        Args:
            file_paths: Paths of the files to ingest. Repeated paths are
                ingested once.

        Returns:
            The number of chunks written to the collection (``0`` when there
            was nothing to ingest).
        """
        # Keep first-seen order but drop repeats; ingesting the same path
        # twice in one call would store every chunk of that file twice.
        paths = list(dict.fromkeys(file_paths))
        if not paths:
            return 0

        docs = []
        for path in paths:
            docs.extend(self._load(path))
        chunks = self._splitter.split_documents(docs)

        # Only remove the previous version once the new one is ready.
        for path in paths:
            self._delete_by_source(path)

        if not chunks:
            # The files produced no content; the old chunks are gone and
            # there is nothing to embed or upload.
            return 0

        QdrantVectorStore.from_documents(
            documents=chunks,
            embedding=self._embeddings,
            url=self._qdrant_url,
            collection_name=self._collection_name,
        )
        return len(chunks)
|