06bcdb03ac
- Upgrade LLM to Qwen3-14B-4bit with Thinking mode (MlxChatModel as LangChain BaseChatModel) - Add LangGraph ReAct agent with tool calling loop (search_documents, web_search, get_current_date, remember/recall_user_info) - Add RAG pipeline: BAAI/bge-m3 embeddings + Qdrant vector store + semantic chunking (SemanticSplitter via cosine similarity) - Replace fixed-size RecursiveCharacterTextSplitter with meaning-based SemanticSplitter (numpy only, no extra deps) - Add Gradio Web UI (app.py): chat, document ingestion, document management tabs - Add multi-user support (user_id isolation in DB + per-user agent cache + dropdown selector) - Add conversation history restore from MySQL on agent init (Phase 11) - Add UserProfileRepository for persistent user profile (remember/recall tools) - Add thread-local DB connections to fix pymysql thread-safety with LangGraph ToolNode - Add Phase 14 voice interface: Whisper STT (microphone → text) + macOS TTS (say -v Yuna) - Enforce search_documents-first policy in system prompt and tool descriptions - Update ROADMAP2.md: Phase 14 완료, Phase 13 청킹 부분 완료 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
108 lines
4.0 KiB
Python
108 lines
4.0 KiB
Python
import re
|
|
|
|
import numpy as np
|
|
from langchain_community.document_loaders import PDFPlumberLoader, TextLoader
|
|
from langchain_core.documents import Document
|
|
from langchain_qdrant import QdrantVectorStore
|
|
from qdrant_client import QdrantClient
|
|
from qdrant_client.models import Filter, FieldCondition, MatchValue, FilterSelector
|
|
|
|
|
|
def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
|
|
return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-10))
|
|
|
|
|
|
class _SemanticSplitter:
    """Chunker driven by sentence-embedding similarity.

    Computes the cosine similarity between each pair of adjacent sentences and
    starts a new chunk wherever that similarity is low (i.e. the meaning
    shifts). With ``breakpoint_percentile=95`` the cut-off is the 5th
    percentile of the adjacent-sentence similarities; pairs scoring strictly
    below the cut-off become chunk boundaries.
    """

    # Sentence boundary: whitespace preceded by ASCII or CJK/full-width
    # sentence-ending punctuation.
    _SENTENCE_RE = re.compile(r"(?<=[.!?。!?])\s+")

    def __init__(self, embeddings, breakpoint_percentile: int = 95):
        """Store the embedding backend and the breakpoint percentile.

        Args:
            embeddings: Object exposing ``embed_documents(list[str])`` that
                returns one vector per input string.
            breakpoint_percentile: The similarity threshold is taken at
                percentile ``100 - breakpoint_percentile``.
        """
        self._embeddings = embeddings
        self._percentile = breakpoint_percentile

    def split_documents(self, docs: list[Document]) -> list[Document]:
        """Split every document into semantic chunks.

        Each chunk receives its own copy of the source document's metadata, so
        mutating one chunk's metadata cannot leak into sibling chunks or back
        into the source document.

        Returns:
            A flat list of chunk documents, in source order.
        """
        return [
            Document(page_content=chunk_text, metadata=dict(doc.metadata))
            for doc in docs
            for chunk_text in self._split_text(doc.page_content)
        ]

    def _split_text(self, text: str) -> list[str]:
        """Split ``text`` into chunks at low-similarity sentence boundaries.

        Returns an empty list for blank text and a single chunk when the text
        holds at most one sentence. Sentences inside a chunk are re-joined
        with a single space.
        """
        stripped = text.strip()
        sentences = [s for s in self._SENTENCE_RE.split(stripped) if s.strip()]
        if len(sentences) <= 1:
            return [stripped] if stripped else []

        vecs = np.array(self._embeddings.embed_documents(sentences))
        # similarities[i] compares sentence i with sentence i + 1.
        similarities = [
            _cosine_similarity(vecs[i], vecs[i + 1]) for i in range(len(vecs) - 1)
        ]
        threshold = float(np.percentile(similarities, 100 - self._percentile))
        # A breakpoint is the index of the first sentence of the next chunk.
        # The comparison is strict, so equal similarities never split.
        breakpoints = [i + 1 for i, s in enumerate(similarities) if s < threshold]

        chunks, start = [], 0
        for bp in breakpoints:
            chunk = " ".join(sentences[start:bp]).strip()
            if chunk:
                chunks.append(chunk)
            start = bp
        # Whatever follows the last breakpoint forms the final chunk.
        tail = " ".join(sentences[start:]).strip()
        if tail:
            chunks.append(tail)
        return chunks
|
|
|
|
|
|
class IngestionService:
    """Ingestion pipeline that splits documents into semantic chunks and stores them in Qdrant."""

    def __init__(
        self,
        embeddings,
        qdrant_url: str,
        collection_name: str,
        breakpoint_threshold_type: str = "percentile",
    ):
        """Set up the splitter and the Qdrant client.

        Args:
            embeddings: Embedding backend, used both for semantic splitting
                and for indexing chunks.
            qdrant_url: URL of the Qdrant server.
            collection_name: Collection that receives the chunks.
            breakpoint_threshold_type: Accepted for future extension; only the
                percentile strategy is implemented, so the value is ignored.
        """
        self._embeddings = embeddings
        self._qdrant_url = qdrant_url
        self._collection_name = collection_name
        self._splitter = _SemanticSplitter(embeddings, breakpoint_percentile=95)
        self._client = QdrantClient(url=qdrant_url)

    def _delete_by_source(self, source_path: str) -> None:
        """Delete every stored chunk whose ``metadata.source`` equals ``source_path``.

        Deletion is best-effort: any failure (for example the collection not
        existing yet on the very first ingestion) is logged and ignored so
        that ingestion can continue.
        """
        import logging

        try:
            self._client.delete(
                collection_name=self._collection_name,
                points_selector=FilterSelector(
                    filter=Filter(
                        must=[
                            FieldCondition(
                                key="metadata.source",
                                match=MatchValue(value=source_path),
                            )
                        ]
                    )
                ),
            )
        except Exception:
            # Keep the original best-effort contract, but leave a trace instead
            # of hiding the failure entirely.
            logging.getLogger(__name__).warning(
                "Could not delete existing chunks for %r from collection %r; continuing.",
                source_path,
                self._collection_name,
                exc_info=True,
            )

    def ingest(self, file_paths: list[str]) -> int:
        """Load, chunk and index the given files, replacing earlier versions.

        Files ending in ``.pdf`` (case-insensitive) are read with
        ``PDFPlumberLoader``; everything else is read as UTF-8 text.

        Args:
            file_paths: Paths of the files to ingest.

        Returns:
            The number of chunks written to the collection (0 when the inputs
            produce no chunks).
        """
        docs = []
        for path in file_paths:
            if path.lower().endswith(".pdf"):
                loader = PDFPlumberLoader(path)
            else:
                loader = TextLoader(path, encoding="utf-8")
            docs.extend(loader.load())

        # Load and split everything before deleting anything, so a failure while
        # reading or embedding a file cannot wipe previously indexed chunks.
        chunks = self._splitter.split_documents(docs)

        for path in file_paths:
            self._delete_by_source(path)

        if not chunks:
            # Nothing to index; skip the vector-store call entirely.
            return 0

        QdrantVectorStore.from_documents(
            documents=chunks,
            embedding=self._embeddings,
            url=self._qdrant_url,
            collection_name=self._collection_name,
        )
        return len(chunks)
|