From 86370f6c1e13caafcb45c19fb11b00be4301c76d Mon Sep 17 00:00:00 2001
From: sal
Date: Fri, 29 May 2026 17:47:17 +0900
Subject: [PATCH] Implement Phase 18: Hybrid Search (BM25 + Vector)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- FastEmbedSparse(Qdrant/bm25) 기반 sparse 임베딩 추가 (fastembed 패키지)
- IngestionService: HYBRID_SEARCH_ENABLED 시 dense + sparse 동시 저장 (RetrievalMode.HYBRID)
- _ensure_collection_schema(): sparse vector 미설정 컬렉션 자동 삭제·재생성
- RetrieverService: hybrid 스토어 + dense 폴백 구조, Qdrant 내장 RRF로 결과 통합
- container.py: sparse_embeddings Singleton 프로바이더, ingestion/retriever 양쪽 주입
- .env.example: HYBRID_SEARCH_ENABLED, SPARSE_MODEL_ID 항목 추가

활성화: .env에 HYBRID_SEARCH_ENABLED=true 설정 후 기존 문서 재수집 필요

Co-Authored-By: Claude Sonnet 4.6
---
Review notes (ignored by `git am`, not part of the commit message):

- services/rag/retriever_service.py: the hybrid QdrantVectorStore is now
  built with validate_collection_config=False. With the default (True) the
  constructor itself validates the collection schema and raises when the
  collection has no sparse vector config, so RetrieverService.__init__ failed
  before the dense fallback in search() could ever run. The dense store is
  still constructed with validation enabled on the same collection.
  The hunk header, diffstat and totals below were updated for the 4 added
  lines; the "index" blob hash of that file is therefore stale.
- Line breaks and indentation were restored from a whitespace-collapsed copy
  of this patch. If a context line does not match the working tree, apply
  with `git apply --ignore-whitespace`.

 .env.example                      |  4 ++++
 config.py                         |  4 ++++
 container.py                      | 18 +++++++++++-----
 docs/ROADMAP.md                   | 24 +++++++++++++--------
 requirements.txt                  |  2 ++
 services/rag/ingestion_service.py | 23 ++++++++++++++++++--
 services/rag/retriever_service.py | 39 +++++++++++++++++++++++++++++------
 7 files changed, 92 insertions(+), 22 deletions(-)

diff --git a/.env.example b/.env.example
index cb160f8..ae0a102 100644
--- a/.env.example
+++ b/.env.example
@@ -15,3 +15,7 @@ DB_PASSWORD=
 LANGCHAIN_TRACING_V2=false
 LANGCHAIN_API_KEY=
 LANGCHAIN_PROJECT=youlbot
+
+# Hybrid Search (Phase 18) — BM25 + Vector (활성화 후 기존 문서 재수집 필요)
+HYBRID_SEARCH_ENABLED=false
+SPARSE_MODEL_ID=Qdrant/bm25
diff --git a/config.py b/config.py
index 9590fec..362bd1a 100644
--- a/config.py
+++ b/config.py
@@ -41,6 +41,10 @@ class Config(BaseSettings):
     reranker_enabled: bool = False
     reranker_model_id: str = "cross-encoder/mmarco-mMiniLMv2-L12-H384-v1"  # 한국어 지원 다국어 모델
     reranker_fetch_k: int = 10  # rerank 전 벡터 검색 후보 수 (rag_top_k보다 커야 함)
+
+    # Hybrid Search (Phase 18) — BM25 + Vector
+    hybrid_search_enabled: bool = False
+    sparse_model_id: str = "Qdrant/bm25"  # fastembed sparse 모델 (언어 무관 BM25)
     rag_verbose: bool = False
     rag_show_sources: bool = False
     langgraph_verbose: bool = False
diff --git a/container.py b/container.py
index 2745b4a..d54e7ef 100644
--- a/container.py
+++ b/container.py
@@ -14,6 +14,7 @@ from services.ui.cli_service import CliUiService
 from services.events.event_bus import EventBus
 from services.events.handlers import StreamTokenHandler, StreamEndHandler
 from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_qdrant import FastEmbedSparse
 from services.rag.ingestion_service import IngestionService
 from services.rag.rerank_service import RerankService
 from services.rag.retriever_service import RetrieverService
@@ -96,6 +97,16 @@ class Container(containers.DeclarativeContainer):
         model_kwargs=providers.Callable(lambda c: {"device": c.embedding_device}, config),
     )
 
+    reranker = providers.Callable(
+        lambda c: RerankService(c.reranker_model_id) if c.reranker_enabled else None,
+        config,
+    )
+
+    sparse_embeddings = providers.Singleton(
+        lambda c: FastEmbedSparse(model_name=c.sparse_model_id) if c.hybrid_search_enabled else None,
+        config,
+    )
+
     ingestion_service = providers.Singleton(
         IngestionService,
         embeddings=embeddings,
@@ -105,11 +116,7 @@ class Container(containers.DeclarativeContainer):
             lambda c: c.semantic_breakpoint_threshold_type, config
         ),
         buffer_size=providers.Callable(lambda c: c.semantic_buffer_size, config),
-    )
-
-    reranker = providers.Callable(
-        lambda c: RerankService(c.reranker_model_id) if c.reranker_enabled else None,
-        config,
+        sparse_embeddings=sparse_embeddings,
     )
 
     retriever_service = providers.Singleton(
@@ -120,6 +127,7 @@ class Container(containers.DeclarativeContainer):
         top_k=providers.Callable(lambda c: c.rag_top_k, config),
         reranker=reranker,
         rerank_fetch_k=providers.Callable(lambda c: c.reranker_fetch_k, config),
+        sparse_embeddings=sparse_embeddings,
     )
 
     # Phase 3 — LangGraph Agent
diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md
index 44d139d..62914c2 100644
--- a/docs/ROADMAP.md
+++ b/docs/ROADMAP.md
@@ -184,15 +184,21 @@ turns = conversation_repository.load_turns_after(self._conv_id, None, limit=10)
 
 ---
 
-## Phase 18 — Hybrid Search (BM25 + Vector) ★★☆
+## ✅ Phase 18 — Hybrid Search (BM25 + Vector) ★★☆
 
 **배경**: 한국어 질문에서 고유명사·전문용어가 포함된 경우 의미 검색(Dense)만으로는 recall이 떨어진다. BM25 키워드 검색과 결합(Hybrid)하면 보완이 가능하다.
 
-**구현 방식**:
-- Qdrant의 Sparse Vector 지원 활용 (`FastEmbedSparseEmbeddings` 또는 BM42)
-- 인덱싱 시 dense + sparse 두 벡터 동시 저장
-- 검색 시 `RRF(Reciprocal Rank Fusion)`로 결과 통합
-- `IngestionService`, `RetrieverService` 양쪽 수정 필요
+**구현 내용**:
+- `FastEmbedSparse(model_name="Qdrant/bm25")` — 언어 무관 BM25 sparse 임베딩 (`fastembed` 패키지)
+- `IngestionService`: `HYBRID_SEARCH_ENABLED=true` 시 dense + sparse 동시 저장 (`RetrievalMode.HYBRID`)
+- `RetrieverService`: hybrid 스토어로 검색 → Qdrant 내장 RRF로 결과 통합; sparse vector 미설정 컬렉션은 dense로 자동 폴백
+- `_ensure_collection_schema()`: hybrid 전환 시 스키마 불일치 컬렉션 자동 재생성 (기존 문서 재수집 필요)
+- `.env` `HYBRID_SEARCH_ENABLED=true`로 활성화, 활성화 후 기존 문서 재수집 필요
+
+| 설정 | 기본값 | 설명 |
+|------|--------|------|
+| `HYBRID_SEARCH_ENABLED` | `false` | `true`로 설정 시 활성화 |
+| `SPARSE_MODEL_ID` | `Qdrant/bm25` | fastembed sparse 모델 (첫 실행 시 자동 다운로드) |
 
 **난이도**: 중간 | **임팩트**: 높음 (키워드 포함 질문 recall 대폭 향상)
 
@@ -271,8 +277,8 @@ docker-compose.yml
 ```
 단기 (1~2주)               중기 (1개월)              장기
 ────────────────────────   ──────────────────────   ──────────────────
-Phase 18 Hybrid Search   → Phase 15 (모델선택)    → Phase 16 (Docker)
-Phase 19 Query Rewriting → Phase 20 (RAGAS 평가)  → Phase 17 (멀티모달)
+Phase 19 Query Rewriting → Phase 15 (모델선택)    → Phase 16 (Docker)
+                         → Phase 20 (RAGAS 평가)  → Phase 17 (멀티모달)
 ```
 
 ### 우선순위 매트릭스
@@ -295,7 +301,7 @@ Phase 19 Query Rewriting → Phase 20 (RAGAS 평가) → Phase 17 (멀티모
 | Phase 13 Semantic Chunker | ✅ 완료 | — | — | — |
 | Phase 14 음성 인터페이스 | ✅ 완료 | — | — | — |
 | Phase 13-B Reranker | ✅ 완료 | — | — | — |
-| Phase 18 Hybrid Search | 🔲 신규 | 중간 | 높음 | ⭐ 1순위 |
+| Phase 18 Hybrid Search | ✅ 완료 | — | — | — |
 | Phase 19 Query Rewriting | 🔲 신규 | 하 | 중간 | 3순위 |
 | Phase 15 모델 선택 | 🔲 미완 | 중간 | 중간 | 4순위 |
 | Phase 20 RAGAS 평가 | 🔲 신규 | 중간 | 중간 | 5순위 |
diff --git a/requirements.txt b/requirements.txt
index 44f5403..43ebf0c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,6 +12,8 @@ langchain-qdrant>=0.2.0
 sentence-transformers>=3.0.0
 qdrant-client>=1.9.0
 pdfplumber>=0.11.0
+# Phase 18 — Hybrid Search (BM25 sparse vectors)
+fastembed>=0.3.0
 # Phase 3 — Agent orchestration
 langgraph>=1.0.0
 # Phase 4 — Web UI
diff --git a/services/rag/ingestion_service.py b/services/rag/ingestion_service.py
index 7797da9..fe83f2d 100644
--- a/services/rag/ingestion_service.py
+++ b/services/rag/ingestion_service.py
@@ -1,6 +1,6 @@
 from langchain_community.document_loaders import PDFPlumberLoader, TextLoader
 from langchain_experimental.text_splitter import SemanticChunker
-from langchain_qdrant import QdrantVectorStore
+from langchain_qdrant import QdrantVectorStore, RetrievalMode
 from qdrant_client import QdrantClient
 from qdrant_client.models import Filter, FieldCondition, MatchValue, FilterSelector
 
@@ -15,10 +15,12 @@ class IngestionService:
         collection_name: str,
         breakpoint_threshold_type: str = "percentile",
         buffer_size: int = 1,
+        sparse_embeddings=None,
     ):
         self._embeddings = embeddings
         self._qdrant_url = qdrant_url
         self._collection_name = collection_name
+        self._sparse_embeddings = sparse_embeddings
         self._splitter = SemanticChunker(
             embeddings=embeddings,
             breakpoint_threshold_type=breakpoint_threshold_type,
@@ -26,6 +28,18 @@ class IngestionService:
         )
         self._client = QdrantClient(url=qdrant_url)
 
+    def _ensure_collection_schema(self) -> None:
+        """Hybrid 모드 전환 시 컬렉션에 sparse vector 설정이 없으면 삭제해 재생성을 유도한다."""
+        if not self._sparse_embeddings:
+            return
+        try:
+            info = self._client.get_collection(self._collection_name)
+            if not info.config.params.sparse_vectors:
+                print(f"[Hybrid] '{self._collection_name}' 컬렉션에 sparse vector 설정이 없어 재생성합니다.")
+                self._client.delete_collection(self._collection_name)
+        except Exception:
+            pass  # 컬렉션 미존재 시 무시
+
     def _delete_by_source(self, source_path: str) -> None:
         """같은 파일 경로로 저장된 기존 청크를 모두 삭제한다."""
         try:
@@ -46,6 +60,7 @@ class IngestionService:
             pass  # 컬렉션이 없을 때(최초 수집) 무시
 
     def ingest(self, file_paths: list[str]) -> int:
+        self._ensure_collection_schema()
         docs = []
         for path in file_paths:
             self._delete_by_source(path)
@@ -53,10 +68,14 @@ class IngestionService:
             docs.extend(loader.load())
 
         chunks = self._splitter.split_documents(docs)
-        QdrantVectorStore.from_documents(
+        kwargs = dict(
             documents=chunks,
             embedding=self._embeddings,
             url=self._qdrant_url,
             collection_name=self._collection_name,
         )
+        if self._sparse_embeddings:
+            kwargs["sparse_embedding"] = self._sparse_embeddings
+            kwargs["retrieval_mode"] = RetrievalMode.HYBRID
+        QdrantVectorStore.from_documents(**kwargs)
         return len(chunks)
diff --git a/services/rag/retriever_service.py b/services/rag/retriever_service.py
index 7ce441b..99b20ae 100644
--- a/services/rag/retriever_service.py
+++ b/services/rag/retriever_service.py
@@ -1,5 +1,5 @@
 from langchain_core.documents import Document
-from langchain_qdrant import QdrantVectorStore
+from langchain_qdrant import QdrantVectorStore, RetrievalMode
 from qdrant_client import QdrantClient
 from qdrant_client.models import Filter, FieldCondition, MatchValue, FilterSelector
 
@@ -15,24 +15,51 @@ class RetrieverService:
         top_k: int,
         reranker=None,
         rerank_fetch_k: int = 10,
+        sparse_embeddings=None,
     ):
         self._client = QdrantClient(url=qdrant_url)
         self._collection_name = collection_name
-        self._store = QdrantVectorStore(
+        self._top_k = top_k
+        self._reranker = reranker
+        self._rerank_fetch_k = rerank_fetch_k
+        self._sparse_embeddings = sparse_embeddings
+
+        # Dense-only store — hybrid 실패 시 폴백으로도 사용
+        self._dense_store = QdrantVectorStore(
             client=self._client,
             collection_name=collection_name,
             embedding=embeddings,
         )
-        self._top_k = top_k
-        self._reranker = reranker
-        self._rerank_fetch_k = rerank_fetch_k
+
+        if sparse_embeddings:
+            self._store = QdrantVectorStore(
+                client=self._client,
+                collection_name=collection_name,
+                embedding=embeddings,
+                sparse_embedding=sparse_embeddings,
+                retrieval_mode=RetrievalMode.HYBRID,
+                # Skip constructor-time schema validation so a collection
+                # without sparse vectors fails at query time, where search()
+                # can fall back to the dense store.
+                validate_collection_config=False,
+            )
+        else:
+            self._store = self._dense_store
 
     def as_retriever(self):
         return self._store.as_retriever(search_kwargs={"k": self._top_k})
 
     def search(self, query: str) -> list[Document]:
         fetch_k = self._rerank_fetch_k if self._reranker else self._top_k
-        docs = self._store.similarity_search(query, k=fetch_k)
+        try:
+            docs = self._store.similarity_search(query, k=fetch_k)
+        except Exception as e:
+            if self._sparse_embeddings:
+                # 컬렉션에 sparse vector 없음 → dense 폴백 (재수집 필요)
+                print(f"[Hybrid] 검색 실패, dense 폴백 (문서 재수집 필요): {e}")
+                docs = self._dense_store.similarity_search(query, k=fetch_k)
+            else:
+                raise
         if self._reranker:
             docs = self._reranker.rerank(query, docs, top_k=self._top_k)
         return docs