Phase 17: Multimodal image understanding via analyze_image tool

Dual-model approach (C): Qwen3-8B handles conversation, while Qwen2.5-VL-7B analyzes images on demand via the analyze_image LangChain tool.

- services/model/mlx_vision_model.py: MlxVisionModel (mlx-vlm wrapper, lazy load)
- services/agent/tools.py: make_vision_tool(vision_model, image_path)
- agent_service.py: stream_response(image_path=None), dynamic tool binding via config["image_path"] — thread-safe per-request rebinding
- container.py: vision_model Singleton provider
- config.py: vision_enabled, vision_model_id, vision_max_tokens
- api.py: image_base64 in ChatRequest, decode to temp file, cleanup after stream

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-02 13:52:10 +09:00
parent bdb6fd83c4
commit 68f741af72
8 changed files with 355 additions and 18 deletions
@@ -10,7 +10,7 @@ from langgraph.config import get_stream_writer
 from langgraph.graph import END, START, MessagesState, StateGraph
 from langgraph.prebuilt import ToolNode

-from services.agent.tools import get_current_date, make_memory_tools, make_retriever_tool, make_search_tool, web_search
+from services.agent.tools import get_current_date, make_memory_tools, make_retriever_tool, make_search_tool, make_vision_tool, web_search


 class AgentService:
@@ -71,11 +71,13 @@ class AgentService:
            search_tool = make_search_tool(retriever_service, self._source_buffer)
        else:
            search_tool = make_retriever_tool(retriever_service)
-        tools = [search_tool, web_search, get_current_date]
+        self._base_tools = [search_tool, web_search, get_current_date]
        if user_profile_repository is not None:
            remember_tool, recall_tool = make_memory_tools(user_profile_repository, user_id)
-            tools += [remember_tool, recall_tool]
-        llm_with_tools = chat_model.bind_tools(tools)
+            self._base_tools += [remember_tool, recall_tool]
+        self._vision_model = None  # set via set_vision_model()
+        self._llm_with_tools = chat_model.bind_tools(self._base_tools)
+        self._chat_model = chat_model

        async def call_model(state: MessagesState, config: RunnableConfig) -> dict:
            from datetime import date
@@ -123,9 +125,16 @@ class AgentService:
            # LLM 추론 시작 직전에 즉시 신호 emit — UI에 "분석 중" 표시
            if writer:
                writer({"__start": True})
-            # 체크박스 값을 모델의 enable_thinking으로 전달 (런타임 오버라이드)
-            show_thinking = config.get("configurable", {}).get("show_thinking", False)
-            _llm = llm_with_tools.bind(enable_thinking=show_thinking) if show_thinking != chat_model.enable_thinking else llm_with_tools
+            # 이미지 첨부 시 vision tool 동적 추가 (요청별로 독립적으로 바인딩)
+            cfg = config.get("configurable", {})
+            show_thinking = cfg.get("show_thinking", False)
+            image_path = cfg.get("image_path")
+            if image_path and self._vision_model:
+                tools_for_req = self._base_tools + [make_vision_tool(self._vision_model, image_path)]
+                _llm_base = self._chat_model.bind_tools(tools_for_req)
+            else:
+                _llm_base = self._llm_with_tools
+            _llm = _llm_base.bind(enable_thinking=show_thinking) if show_thinking != chat_model.enable_thinking else _llm_base
            async for chunk in _llm.astream(msgs, config):
                t = chunk.additional_kwargs.get("thinking", "")
                if t:
@@ -221,10 +230,21 @@ class AgentService:
    def last_run_id(self) -> str | None:
        return self._last_run_id

-    def _make_config(self, show_thinking: bool = False) -> dict:
-        return {"configurable": {"thread_id": self._thread_id, "show_thinking": show_thinking}}
+    def set_vision_model(self, vision_model) -> None:
+        """Attach the vision model used for on-demand image analysis.
+
+        The model is stored on the instance as ``_vision_model``. Passing a
+        falsy value (e.g. ``None``) leaves vision analysis disabled.
+
+        Args:
+            vision_model: Object handed to ``make_vision_tool`` when a request
+                carries an image; presumably an ``MlxVisionModel`` — verify
+                against the caller.
+        """
+        self._vision_model = vision_model

-    async def stream_response(self, user_input: str, show_thinking: bool | None = None) -> AsyncIterator[str | dict]:
+    def _make_config(self, show_thinking: bool = False, image_path: str | None = None) -> dict:
+        cfg: dict = {"thread_id": self._thread_id, "show_thinking": show_thinking}
+        if image_path:
+            cfg["image_path"] = image_path
+        return {"configurable": cfg}
+
+    async def stream_response(
+        self,
+        user_input: str,
+        show_thinking: bool | None = None,
+        image_path: str | None = None,
+    ) -> AsyncIterator[str | dict]:
        """사용자 입력을 받아 응답 토큰을 순서대로 yield한다.

        실제 답변: plain str
@@ -233,7 +253,7 @@ class AgentService:
        _think_verbose = show_thinking if show_thinking is not None else self._think_verbose
        self._source_buffer.clear()
        run_id = uuid.uuid4()
-        run_config = {**self._make_config(_think_verbose), "run_id": str(run_id)}
+        run_config = {**self._make_config(_think_verbose, image_path=image_path), "run_id": str(run_id)}

        # 재시작 후 첫 호출 시 MySQL 이력을 초기 상태에 주입
        if self._pending_history:
@@ -3,6 +3,17 @@ from datetime import date
 from langchain_core.tools import tool


+def make_vision_tool(vision_model, image_path: str):
+    """현재 요청에 첨부된 이미지를 분석하는 도구."""
+
+    @tool
+    def analyze_image(prompt: str = "이 이미지를 한국어로 자세히 설명해줘.") -> str:
+        """첨부된 이미지를 분석한다. 이미지 속 음식, 문서, 사람, 사물 등을 파악할 때 사용하세요."""
+        return vision_model.analyze(image_path, prompt)
+
+    return analyze_image
+
+
@tool
 def get_current_date() -> str:
    """오늘 날짜를 반환합니다. 나이 계산, 날짜 비교 등 현재 날짜가 필요할 때 반드시 먼저 호출하세요."""
@@ -0,0 +1,48 @@
+"""Qwen2.5-VL (mlx-vlm) 기반 이미지 분석 서비스.
+
+첫 analyze() 호출 시 모델을 lazy load해 메모리를 아낀다.
+"""
+from __future__ import annotations
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+_DEFAULT_PROMPT = "이 이미지를 한국어로 자세히 설명해줘. 사람, 음식, 문서 등 보이는 것을 빠짐없이 설명해."
+
+
+class MlxVisionModel:
+    def __init__(self, model_id: str, max_tokens: int = 512) -> None:
+        self._model_id = model_id
+        self._max_tokens = max_tokens
+        self._model = None
+        self._processor = None
+
+    def _load(self) -> None:
+        if self._model is not None:
+            return
+        logger.info("Vision 모델 로딩 중: %s", self._model_id)
+        from mlx_vlm import load
+        self._model, self._processor = load(self._model_id)
+        logger.info("Vision 모델 로딩 완료")
+
+    def analyze(self, image_path: str, prompt: str = _DEFAULT_PROMPT) -> str:
+        """이미지를 분석해 한국어 설명을 반환한다."""
+        self._load()
+        from mlx_vlm import generate
+        from mlx_vlm.prompt_utils import apply_chat_template
+        from mlx_vlm.utils import load_config
+
+        config = load_config(self._model_id)
+        formatted_prompt = apply_chat_template(
+            self._processor, config, prompt, num_images=1
+        )
+        result = generate(
+            self._model,
+            self._processor,
+            image=image_path,
+            prompt=formatted_prompt,
+            max_tokens=self._max_tokens,
+            verbose=False,
+        )
+        return result if isinstance(result, str) else str(result)