Implement Phase 4~14: LangGraph Agent, RAG pipeline, Gradio Web UI, voice interface

- Upgrade LLM to Qwen3-14B-4bit with Thinking mode (MlxChatModel as LangChain BaseChatModel)
- Add LangGraph ReAct agent with tool calling loop (search_documents, web_search, get_current_date, remember/recall_user_info)
- Add RAG pipeline: BAAI/bge-m3 embeddings + Qdrant vector store + semantic chunking (SemanticSplitter via cosine similarity)
- Replace fixed-size RecursiveCharacterTextSplitter with meaning-based SemanticSplitter (numpy only, no extra deps)
- Add Gradio Web UI (app.py): chat, document ingestion, document management tabs
- Add multi-user support (user_id isolation in DB + per-user agent cache + dropdown selector)
- Add conversation history restore from MySQL on agent init (Phase 11)
- Add UserProfileRepository for persistent user profile (remember/recall tools)
- Add thread-local DB connections to fix pymysql thread-safety with LangGraph ToolNode
- Add Phase 14 voice interface: Whisper STT (microphone → text) + macOS TTS (say -v Yuna)
- Enforce search_documents-first policy in system prompt and tool descriptions
- Update ROADMAP2.md: Phase 14 완료, Phase 13 청킹 부분 완료

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 14:06:22 +09:00
parent cd41e9e33e
commit 06bcdb03ac
20 changed files with 1934 additions and 47 deletions
@@ -0,0 +1,107 @@
+import re
+
+import numpy as np
+from langchain_community.document_loaders import PDFPlumberLoader, TextLoader
+from langchain_core.documents import Document
+from langchain_qdrant import QdrantVectorStore
+from qdrant_client import QdrantClient
+from qdrant_client.models import Filter, FieldCondition, MatchValue, FilterSelector
+
+
+def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
+    """Return the cosine similarity of vectors ``a`` and ``b`` as a Python float."""
+    # The small epsilon keeps the division defined when either norm is zero;
+    # in that case the result is 0.0 instead of NaN.
+    denominator = np.linalg.norm(a) * np.linalg.norm(b) + 1e-10
+    numerator = np.dot(a, b)
+    return float(numerator / denominator)
+
+
+class _SemanticSplitter:
+    """Chunker driven by sentence-embedding similarity.
+
+    Text is split into sentences, every sentence is embedded, and the cosine
+    similarity of each adjacent sentence pair is computed. A chunk boundary is
+    placed wherever that similarity falls below a percentile-based threshold,
+    i.e. where the meaning appears to shift. With ``breakpoint_percentile=95``
+    the threshold is the 5th percentile of the adjacent-pair similarities, so
+    only the least similar pairs become boundaries.
+    """
+
+    # ASCII terminators must be followed by whitespace so that "3.14" or
+    # "a.b" is not split. Full-width CJK terminators are normally written
+    # with no following whitespace, so they split even when none is present.
+    _SENTENCE_RE = re.compile(r"(?<=[.!?])\s+|(?<=[。！？])\s*")
+
+    def __init__(self, embeddings, breakpoint_percentile: int = 95):
+        """Store the embedding backend and the breakpoint percentile.
+
+        Args:
+            embeddings: Object exposing ``embed_documents(list_of_str)`` that
+                returns one vector per input string.
+            breakpoint_percentile: Value in [0, 100]. The split threshold is
+                the ``100 - breakpoint_percentile`` percentile of the
+                similarities, so a higher value yields fewer splits.
+
+        Raises:
+            ValueError: If ``breakpoint_percentile`` is outside [0, 100].
+        """
+        # Fail at construction time instead of deep inside np.percentile
+        # during the first multi-sentence split.
+        if not 0 <= breakpoint_percentile <= 100:
+            raise ValueError(
+                f"breakpoint_percentile must be within [0, 100], got {breakpoint_percentile!r}"
+            )
+        self._embeddings = embeddings
+        self._percentile = breakpoint_percentile
+
+    def split_documents(self, docs: "list[Document]") -> "list[Document]":
+        """Split every document into semantic chunks.
+
+        Args:
+            docs: Documents whose ``page_content`` is to be chunked.
+
+        Returns:
+            One new ``Document`` per chunk, in input order, each carrying its
+            own shallow copy of the source document's metadata.
+        """
+        result = []
+        for doc in docs:
+            for chunk_text in self._split_text(doc.page_content):
+                # Copy the metadata so sibling chunks never alias one dict;
+                # mutating one chunk's metadata must not affect the others.
+                metadata = dict(doc.metadata or {})
+                result.append(Document(page_content=chunk_text, metadata=metadata))
+        return result
+
+    def _split_text(self, text: str) -> "list[str]":
+        """Split ``text`` into chunks at low-similarity sentence boundaries.
+
+        Returns:
+            A list of non-empty chunk strings. Empty or whitespace-only input
+            yields ``[]``; input with at most one sentence is returned as a
+            single stripped chunk without calling the embedding backend.
+        """
+        stripped = text.strip()
+        sentences = [s for s in self._SENTENCE_RE.split(stripped) if s.strip()]
+        if len(sentences) <= 1:
+            return [stripped] if stripped else []
+
+        vecs = np.asarray(self._embeddings.embed_documents(sentences), dtype=float)
+        # Cosine similarity of each adjacent pair, computed in one pass. The
+        # epsilon keeps the division defined for zero-norm embeddings.
+        norms = np.linalg.norm(vecs, axis=1)
+        dots = np.sum(vecs[:-1] * vecs[1:], axis=1)
+        similarities = dots / (norms[:-1] * norms[1:] + 1e-10)
+
+        threshold = float(np.percentile(similarities, 100 - self._percentile))
+        # similarities[i] compares sentence i with i + 1, so a low value at i
+        # means the next chunk starts at sentence i + 1.
+        breakpoints = (np.flatnonzero(similarities < threshold) + 1).tolist()
+
+        boundaries = [0, *breakpoints, len(sentences)]
+        chunks = []
+        for start, end in zip(boundaries, boundaries[1:]):
+            chunk = " ".join(sentences[start:end]).strip()
+            if chunk:
+                chunks.append(chunk)
+        return chunks
+
+
+class IngestionService:
+    """Ingestion pipeline that splits documents into semantic chunks and stores them in Qdrant."""
+
+    def __init__(
+        self,
+        embeddings,
+        qdrant_url: str,
+        collection_name: str,
+        breakpoint_threshold_type: str = "percentile",
+    ):
+        """Create the splitter and the Qdrant client used for ingestion.
+
+        Args:
+            embeddings: Embedding backend, shared by the splitter and the
+                vector store.
+            qdrant_url: URL of the Qdrant server.
+            collection_name: Name of the collection chunks are written to.
+            breakpoint_threshold_type: Accepted for future extension only;
+                it is currently ignored and the percentile strategy is used.
+        """
+        self._embeddings = embeddings
+        self._qdrant_url = qdrant_url
+        self._collection_name = collection_name
+        # The splitter is fixed to the percentile strategy for now.
+        self._splitter = _SemanticSplitter(embeddings, breakpoint_percentile=95)
+        self._client = QdrantClient(url=qdrant_url)
+
+    def _delete_by_source(self, source_path: str) -> None:
+        """Delete every stored chunk whose ``metadata.source`` equals ``source_path``.
+
+        A missing collection (first-ever ingestion) is a no-op. Any other
+        failure propagates: silently continuing after a failed delete would
+        leave the old chunks in place next to the newly inserted ones.
+        """
+        if not self._client.collection_exists(self._collection_name):
+            return
+
+        # NOTE(review): presumably the loaders record the file path under
+        # metadata["source"]; confirm against the loader implementations.
+        source_filter = Filter(
+            must=[
+                FieldCondition(
+                    key="metadata.source",
+                    match=MatchValue(value=source_path),
+                )
+            ]
+        )
+        self._client.delete(
+            collection_name=self._collection_name,
+            points_selector=FilterSelector(filter=source_filter),
+        )
+
+    def ingest(self, file_paths: list[str]) -> int:
+        """Load, chunk and store the given files, replacing earlier versions.
+
+        Files ending in ``.pdf`` (case-insensitive) are read with
+        ``PDFPlumberLoader``; everything else is read as UTF-8 text.
+
+        Args:
+            file_paths: Paths of the files to ingest. Duplicates are ingested
+                once.
+
+        Returns:
+            The number of chunks written to the collection.
+        """
+        # A path listed twice must not be stored twice; keep first-seen order.
+        unique_paths = list(dict.fromkeys(file_paths))
+
+        # Load and split everything before touching the store, so that an
+        # unreadable file cannot cause its previously ingested chunks to be
+        # deleted with nothing to replace them.
+        docs = []
+        for path in unique_paths:
+            if path.lower().endswith(".pdf"):
+                loader = PDFPlumberLoader(path)
+            else:
+                loader = TextLoader(path, encoding="utf-8")
+            docs.extend(loader.load())
+        chunks = self._splitter.split_documents(docs)
+
+        for path in unique_paths:
+            self._delete_by_source(path)
+
+        # Nothing to store: skip the vector-store round trip entirely.
+        if not chunks:
+            return 0
+
+        QdrantVectorStore.from_documents(
+            documents=chunks,
+            embedding=self._embeddings,
+            url=self._qdrant_url,
+            collection_name=self._collection_name,
+        )
+        return len(chunks)