Phase 17: Multimodal image understanding via analyze_image tool

Dual-model approach (C): Qwen3-8B handles conversation, while Qwen2.5-VL-7B analyzes images on demand via the analyze_image LangChain tool.

- services/model/mlx_vision_model.py: MlxVisionModel (mlx-vlm wrapper, lazy load)
- services/agent/tools.py: make_vision_tool(vision_model, image_path)
- agent_service.py: stream_response(image_path=None), dynamic tool binding via config["configurable"]["image_path"] — thread-safe per-request rebinding
- container.py: vision_model Singleton provider
- config.py: vision_enabled, vision_model_id, vision_max_tokens
- api.py: image_base64 in ChatRequest, decode to temp file, cleanup after stream

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-02 13:52:10 +09:00
parent bdb6fd83c4
commit 68f741af72
8 changed files with 355 additions and 18 deletions
@@ -10,7 +10,7 @@ from langgraph.config import get_stream_writer
 from langgraph.graph import END, START, MessagesState, StateGraph
 from langgraph.prebuilt import ToolNode

-from services.agent.tools import get_current_date, make_memory_tools, make_retriever_tool, make_search_tool, web_search
+from services.agent.tools import get_current_date, make_memory_tools, make_retriever_tool, make_search_tool, make_vision_tool, web_search


 class AgentService:
@@ -71,11 +71,13 @@ class AgentService:
            search_tool = make_search_tool(retriever_service, self._source_buffer)
        else:
            search_tool = make_retriever_tool(retriever_service)
-        tools = [search_tool, web_search, get_current_date]
+        self._base_tools = [search_tool, web_search, get_current_date]
        if user_profile_repository is not None:
            remember_tool, recall_tool = make_memory_tools(user_profile_repository, user_id)
-            tools += [remember_tool, recall_tool]
-        llm_with_tools = chat_model.bind_tools(tools)
+            self._base_tools += [remember_tool, recall_tool]
+        self._vision_model = None  # set via set_vision_model()
+        self._llm_with_tools = chat_model.bind_tools(self._base_tools)
+        self._chat_model = chat_model

        async def call_model(state: MessagesState, config: RunnableConfig) -> dict:
            from datetime import date
@@ -123,9 +125,16 @@ class AgentService:
            # LLM 추론 시작 직전에 즉시 신호 emit — UI에 "분석 중" 표시
            if writer:
                writer({"__start": True})
-            # 체크박스 값을 모델의 enable_thinking으로 전달 (런타임 오버라이드)
-            show_thinking = config.get("configurable", {}).get("show_thinking", False)
-            _llm = llm_with_tools.bind(enable_thinking=show_thinking) if show_thinking != chat_model.enable_thinking else llm_with_tools
+            # 이미지 첨부 시 vision tool 동적 추가 (요청별로 독립적으로 바인딩)
+            cfg = config.get("configurable", {})
+            show_thinking = cfg.get("show_thinking", False)
+            image_path = cfg.get("image_path")
+            if image_path and self._vision_model:
+                tools_for_req = self._base_tools + [make_vision_tool(self._vision_model, image_path)]
+                _llm_base = self._chat_model.bind_tools(tools_for_req)
+            else:
+                _llm_base = self._llm_with_tools
+            _llm = _llm_base.bind(enable_thinking=show_thinking) if show_thinking != chat_model.enable_thinking else _llm_base
            async for chunk in _llm.astream(msgs, config):
                t = chunk.additional_kwargs.get("thinking", "")
                if t:
@@ -221,10 +230,21 @@ class AgentService:
    def last_run_id(self) -> str | None:
        return self._last_run_id

-    def _make_config(self, show_thinking: bool = False) -> dict:
-        return {"configurable": {"thread_id": self._thread_id, "show_thinking": show_thinking}}
+    def set_vision_model(self, vision_model) -> None:
+        self._vision_model = vision_model

-    async def stream_response(self, user_input: str, show_thinking: bool | None = None) -> AsyncIterator[str | dict]:
+    def _make_config(self, show_thinking: bool = False, image_path: str | None = None) -> dict:
+        cfg: dict = {"thread_id": self._thread_id, "show_thinking": show_thinking}
+        if image_path:
+            cfg["image_path"] = image_path
+        return {"configurable": cfg}
+
+    async def stream_response(
+        self,
+        user_input: str,
+        show_thinking: bool | None = None,
+        image_path: str | None = None,
+    ) -> AsyncIterator[str | dict]:
        """사용자 입력을 받아 응답 토큰을 순서대로 yield한다.

        실제 답변: plain str
@@ -233,7 +253,7 @@ class AgentService:
        _think_verbose = show_thinking if show_thinking is not None else self._think_verbose
        self._source_buffer.clear()
        run_id = uuid.uuid4()
-        run_config = {**self._make_config(_think_verbose), "run_id": str(run_id)}
+        run_config = {**self._make_config(_think_verbose, image_path=image_path), "run_id": str(run_id)}

        # 재시작 후 첫 호출 시 MySQL 이력을 초기 상태에 주입
        if self._pending_history: