Implement Phase 12 feedback, Phase 13 Semantic Chunker, Phase 13-B Reranker, Bug 5 thinking fix
- Phase 12: FeedbackRepository + td_feedback 테이블, Gradio 👍/👎 이벤트, run_id 추적, LangSmith create_feedback() 연동
- Phase 13: 커스텀 _SemanticSplitter 제거 → langchain_experimental.SemanticChunker 교체, buffer_size/threshold_type 환경변수 적용
- Phase 13-B: RerankService (Cross-Encoder), RetrieverService.search()에 reranker 통합, tools.py as_retriever() → search() 전환
- Bug 5: mlx_chat_model enable_thinking 런타임 오버라이드, agent_service stream_mode=["messages","custom"] 이중 스트림, thinking 토큰 custom 이벤트로 emit
- ROADMAP: LLM 모델명 8B 반영, RAG에 Reranker 추가, 추천 진행 순서 갱신

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,59 +1,10 @@
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
from langchain_community.document_loaders import PDFPlumberLoader, TextLoader
|
||||
from langchain_core.documents import Document
|
||||
from langchain_experimental.text_splitter import SemanticChunker
|
||||
from langchain_qdrant import QdrantVectorStore
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import Filter, FieldCondition, MatchValue, FilterSelector
|
||||
|
||||
|
||||
def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of vectors *a* and *b* as a Python float.

    A small constant (1e-10) is added to the denominator so that a zero-norm
    input produces 0.0 instead of a division by zero.
    """
    numerator = np.dot(a, b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b) + 1e-10
    return float(numerator / denominator)
|
||||
|
||||
|
||||
class _SemanticSplitter:
    """Chunker driven by sentence-embedding similarity.

    The cosine similarity between each pair of adjacent sentences is
    computed, and the text is split wherever that similarity is low
    (i.e. where the meaning shifts). With ``breakpoint_percentile=95`` the
    cut-off is the 5th percentile of the similarities; a boundary is placed
    before every sentence whose similarity to its predecessor is strictly
    below that cut-off.
    """

    # Split on whitespace that follows sentence-ending punctuation.
    # NOTE(review): "!?" appears twice in the character class; the second
    # pair may originally have been full-width variants -- confirm against
    # the repository before relying on CJK punctuation handling.
    _SENTENCE_RE = re.compile(r"(?<=[.!?。!?])\s+")

    def __init__(self, embeddings, breakpoint_percentile: int = 95):
        """Store the embedding backend and the breakpoint percentile.

        Args:
            embeddings: Object exposing ``embed_documents(list[str])``.
            breakpoint_percentile: Similarities below the
                ``100 - breakpoint_percentile`` percentile become boundaries.
        """
        self._embeddings = embeddings
        self._percentile = breakpoint_percentile

    def split_documents(self, docs: list[Document]) -> list[Document]:
        """Split every document into chunk documents, preserving order.

        Each chunk receives its own copy of the source document's metadata,
        so changing one chunk's metadata cannot affect sibling chunks or
        the source document.
        """
        result = []
        for doc in docs:
            for chunk_text in self._split_text(doc.page_content):
                result.append(
                    Document(page_content=chunk_text, metadata=dict(doc.metadata))
                )
        return result

    def _split_text(self, text: str) -> list[str]:
        """Return the semantic chunks of *text* as a list of strings.

        Blank input yields an empty list; text with at most one sentence is
        returned as a single stripped chunk without calling the embedder.
        """
        stripped = text.strip()
        sentences = [s for s in self._SENTENCE_RE.split(stripped) if s.strip()]
        if len(sentences) <= 1:
            return [stripped] if stripped else []

        vecs = np.array(self._embeddings.embed_documents(sentences))
        # similarities[i] compares sentence i with sentence i + 1.
        similarities = [
            _cosine_similarity(left, right) for left, right in zip(vecs, vecs[1:])
        ]
        threshold = float(np.percentile(similarities, 100 - self._percentile))
        # A breakpoint index marks the first sentence of the next chunk.
        breakpoints = [i + 1 for i, s in enumerate(similarities) if s < threshold]

        chunks, start = [], 0
        for bp in breakpoints:
            chunk = " ".join(sentences[start:bp]).strip()
            if chunk:
                chunks.append(chunk)
            start = bp
        tail = " ".join(sentences[start:]).strip()
        if tail:
            chunks.append(tail)
        return chunks
|
||||
|
||||
|
||||
class IngestionService:
|
||||
"""문서를 의미 단위 청크로 분할해 Qdrant에 저장하는 수집 파이프라인."""
|
||||
|
||||
@@ -63,12 +14,16 @@ class IngestionService:
|
||||
qdrant_url: str,
|
||||
collection_name: str,
|
||||
breakpoint_threshold_type: str = "percentile",
|
||||
buffer_size: int = 1,
|
||||
):
|
||||
self._embeddings = embeddings
|
||||
self._qdrant_url = qdrant_url
|
||||
self._collection_name = collection_name
|
||||
# breakpoint_threshold_type은 향후 확장용으로 수용 (현재는 percentile 방식 고정)
|
||||
self._splitter = _SemanticSplitter(embeddings, breakpoint_percentile=95)
|
||||
self._splitter = SemanticChunker(
|
||||
embeddings=embeddings,
|
||||
breakpoint_threshold_type=breakpoint_threshold_type,
|
||||
buffer_size=buffer_size,
|
||||
)
|
||||
self._client = QdrantClient(url=qdrant_url)
|
||||
|
||||
def _delete_by_source(self, source_path: str) -> None:
|
||||
|
||||
Reference in New Issue
Block a user