From 68f741af72370f4f635912113de8a0f725508ab1 Mon Sep 17 00:00:00 2001
From: sal <shinalok357@gmail.com>
Date: Tue, 2 Jun 2026 13:52:10 +0900
Subject: [PATCH] Phase 17: Multimodal image understanding via analyze_image
 tool
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Dual-model approach (C): Qwen3-8B handles conversation, Qwen2.5-VL-7B
analyzes images on demand via analyze_image LangChain tool.

- services/model/mlx_vision_model.py: MlxVisionModel (mlx-vlm wrapper, lazy load)
- services/agent/tools.py: make_vision_tool(vision_model, image_path)
- agent_service.py: stream_response(image_path=None), dynamic tool binding
  via config["configurable"]["image_path"] — thread-safe per-request rebinding
- container.py: vision_model Singleton provider
- config.py: vision_enabled, vision_model_id, vision_max_tokens
- api.py: image_base64 in ChatRequest, decode to temp file, cleanup after stream

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 api.py                                        |  35 ++-
 config.py                                     |   5 +
 container.py                                  |   8 +
 .../features/phase17-multimodal.plan.md       | 202 ++++++++++++++++++
 docs/ROADMAP.md                               |  22 +-
 services/agent/agent_service.py               |  42 +++-
 services/agent/tools.py                       |  11 +
 services/model/mlx_vision_model.py            |  48 +++++
 8 files changed, 355 insertions(+), 18 deletions(-)
 create mode 100644 docs/01-plan/features/phase17-multimodal.plan.md
 create mode 100644 services/model/mlx_vision_model.py
diff --git a/api.py b/api.py
index cbc00e2..f308ea3 100644
--- a/api.py
+++ b/api.py
@@ -41,6 +41,9 @@ _container.db_service().init_schema()
 _cfg = _container.config()
 _agent_cache: dict[str, AgentService] = {}
 
+# Vision model wrapper — created at import time when VISION_ENABLED=true; the weights themselves load lazily on the first analyze() call
+_vision_model = _container.vision_model() if _cfg.vision_enabled else None
+
 
 def _get_agent(user_id: str) -> AgentService:
     if user_id not in _agent_cache:
@@ -57,6 +60,8 @@ def _get_agent(user_id: str) -> AgentService:
             conversation_repository=_container.conversation_repository(),
             user_id=user_id,
         )
+        if _vision_model:
+            _agent_cache[user_id].set_vision_model(_vision_model)
     return _agent_cache[user_id]
 
 
@@ -74,6 +79,7 @@ class ChatRequest(BaseModel):
     message: str
     user_id: str = "default"
     show_thinking: bool = False
+    image_base64: str | None = None  # base64 인코딩된 이미지 (선택)
 
 
 class FeedbackRequest(BaseModel):
@@ -97,10 +103,33 @@ async def chat(req: ChatRequest, _=Depends(_auth)):
     """SSE 스트리밍 응답. 각 라인: `data: <JSON 토큰>\n\n`, 종료: `data: [DONE]\n\n`"""
     agent = _get_agent(req.user_id)
 
+    # 이미지 base64 → 임시 파일 저장
+    image_path: str | None = None
+    tmp_path: str | None = None
+    if req.image_base64 and _vision_model:
+        import base64
+        img_bytes = base64.b64decode(req.image_base64)
+        suffix = ".jpg"
+        if img_bytes[:4] == b"\x89PNG":
+            suffix = ".png"
+        elif img_bytes[:4] == b"GIF8":
+            suffix = ".gif"
+        tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False, dir="/tmp", prefix="youlbot_img_")
+        tmp.write(img_bytes)
+        tmp.close()
+        image_path = tmp.name
+        tmp_path = tmp.name
+
     async def generate():
-        async for token in agent.stream_response(req.message, show_thinking=req.show_thinking):
-            yield f"data: {json.dumps(token, ensure_ascii=False)}\n\n"
-        yield f"data: {json.dumps({'__done': True, 'run_id': agent.last_run_id}, ensure_ascii=False)}\n\n"
+        try:
+            async for token in agent.stream_response(
+                req.message, show_thinking=req.show_thinking, image_path=image_path
+            ):
+                yield f"data: {json.dumps(token, ensure_ascii=False)}\n\n"
+            yield f"data: {json.dumps({'__done': True, 'run_id': agent.last_run_id}, ensure_ascii=False)}\n\n"
+        finally:
+            if tmp_path and os.path.exists(tmp_path):
+                os.unlink(tmp_path)
 
     return StreamingResponse(generate(), media_type="text/event-stream")
 
diff --git a/config.py b/config.py
index 2189800..e01c06f 100644
--- a/config.py
+++ b/config.py
@@ -59,6 +59,11 @@ class Config(BaseSettings):
     whisper_model_size: str = "small"
     tts_voice: str = "Yuna"  # macOS say 명령어 한국어 음성
 
+    # Vision (Phase 17)
+    vision_enabled: bool = False
+    vision_model_id: str = "mlx-community/Qwen2.5-VL-7B-Instruct-4bit"
+    vision_max_tokens: int = 512
+
     system_prompt: str = """모든 사고 과정(thinking)과 답변은 반드시 한국어로만 작성하세요. 영어 사용 절대 금지.
 
 당신의 이름은 '율봇'입니다. 친절하고 따뜻한 한국어 상담 도우미입니다.
diff --git a/container.py b/container.py
index d787393..503441e 100644
--- a/container.py
+++ b/container.py
@@ -19,6 +19,7 @@ from services.rag.ingestion_service import IngestionService
 from services.rag.rerank_service import RerankService
 from services.rag.retriever_service import RetrieverService
 from services.agent.agent_service import AgentService
+from services.model.mlx_vision_model import MlxVisionModel
 
 
 class Container(containers.DeclarativeContainer):
@@ -130,6 +131,13 @@ class Container(containers.DeclarativeContainer):
         sparse_embeddings=sparse_embeddings,
     )
 
+    # Phase 17 — Vision Model (lazy load)
+    vision_model = providers.Singleton(
+        MlxVisionModel,
+        model_id=providers.Callable(lambda c: c.vision_model_id, config),
+        max_tokens=providers.Callable(lambda c: c.vision_max_tokens, config),
+    )
+
     # Phase 3 — LangGraph Agent
     agent_service = providers.Singleton(
         AgentService,
diff --git a/docs/01-plan/features/phase17-multimodal.plan.md b/docs/01-plan/features/phase17-multimodal.plan.md
new file mode 100644
index 0000000..c16a020
--- /dev/null
+++ b/docs/01-plan/features/phase17-multimodal.plan.md
@@ -0,0 +1,202 @@
+---
+template: plan
+version: 1.3
+feature: phase17-multimodal
+date: 2026-06-02
+author: sal
+project: youlbot
+status: Draft
+---
+
+# phase17-multimodal Planning Document
+
+> **Summary**: analyze_image 도구 방식으로 이미지 이해 기능을 추가한다.
+> Qwen3-8B가 대화를 유지하고, 이미지 첨부 시 Qwen2.5-VL-7B를 도구로 호출해 설명을 얻은 뒤 답변한다.
+>
+> **Project**: youlbot
+> **Author**: sal
+> **Date**: 2026-06-02
+> **Status**: Draft
+
+---
+
+## Executive Summary
+
+| Perspective | Content |
+|-------------|---------|
+| **Problem** | 이유식 사진·금융 서류 등 이미지를 텍스트로만 처리하는 현재 한계 |
+| **Solution** | Qwen2.5-VL-7B를 `analyze_image` LangChain 도구로 래핑, Qwen3-8B가 필요 시 자동 호출 |
+| **Function/UX Effect** | 채팅창에 이미지 첨부 → 자동 분석 → 육아·금융 상담으로 자연스럽게 연결 |
+| **Core Value** | 텍스트 추론 품질(Qwen3-8B)을 유지하면서 이미지 이해 기능 추가 |
+
+---
+
+## Context Anchor
+
+| Key | Value |
+|-----|-------|
+| **WHY** | 손이 자유롭지 않은 육아 상황에서 사진 한 장으로 재료 분석·서류 해석이 가능해야 함 |
+| **WHO** | 아록(주 사용자) — 이유식 사진, 건강보험 서류, 접종 기록지 등 촬영 후 질문 |
+| **RISK** | 16GB 메모리에서 두 모델 동시 로드 시 OOM 가능 → Vision 모델 lazy load로 완화 |
+| **SUCCESS** | 이미지 첨부 → analyze_image 도구 자동 호출 → 설명이 대화 히스토리에 남아 후속 질문 가능 |
+| **SCOPE** | 이미지 분석 + 채팅 연동. 동영상·실시간 캡처는 제외 |
+
+---
+
+## 1. Overview
+
+### 1.1 Purpose
+사진을 첨부하면 `analyze_image` 도구가 Qwen2.5-VL-7B를 호출해 이미지 설명을 생성하고,
+Qwen3-8B가 그 설명을 컨텍스트로 삼아 육아·금융 상담 답변을 제공한다.
+
+### 1.2 모델 분담
+
+| 모델 | 역할 | 메모리 |
+|------|------|--------|
+| Qwen3-8B-4bit | 대화·추론·도구 결정 (항상 로드) | ~5GB |
+| Qwen2.5-VL-7B-Instruct-4bit | 이미지 분석 (lazy load) | ~5GB |
+| 합계 | — | ~10GB / 16GB 사용 가능 |
+
+---
+
+## 2. Scope
+
+### 2.1 In Scope
+- `mlx-vlm` 패키지로 Vision 모델 로드 및 추론
+- `analyze_image(image_path, prompt)` LangChain 도구 구현
+- AgentService: 요청에 이미지 있을 때 도구 동적 주입
+- API(`/chat`): JSON 본문의 `image_base64` 필드로 이미지 전달 지원 (base64 인코딩)
+- WebUI: 채팅 입력창에 이미지 첨부 버튼 추가
+- Telegram: 사진 메시지 수신 → 이미지 다운로드 → API 전달
+
+### 2.2 Out of Scope
+- 동영상 분석
+- 이미지 생성(text-to-image)
+- 실시간 카메라 입력
+
+---
+
+## 3. Architecture — C방식 (analyze_image 도구)
+
+```
+사용자
+  │  텍스트 + 이미지(선택)
+  ▼
+API /chat  (JSON body, image_base64 필드)
+  │  image_base64 디코딩 → /tmp/youlbot_img_xxx.jpg 저장
+  │  image_path → AgentService.stream_response(message, image_path=...)
+  ▼
+AgentService
+  │  image_path 있을 때: analyze_image 도구를 tools 목록에 동적 추가
+  │  image_path를 도구 클로저로 바인딩
+  ▼
+LangGraph ReAct
+  │  Qwen3-8B가 이미지 관련 질문 감지 → analyze_image() 자동 호출
+  ▼
+analyze_image 도구
+  │  mlx_vision_model.analyze(image_path, prompt)
+  ▼
+MlxVisionModel (Qwen2.5-VL-7B, lazy load)
+  │  이미지 설명 텍스트 반환
+  ▼
+LangGraph
+  │  설명이 ToolMessage로 대화 히스토리에 저장
+  ▼
+Qwen3-8B  →  최종 답변 생성
+```
+
+**핵심 특성:**
+- Vision 모델은 처음 analyze_image 호출 시 로드 (이후 캐시)
+- 이미지 설명이 대화 히스토리에 남아 후속 질문("그 재료로 이유식 만들어줘") 가능
+- 이미지 없는 메시지는 기존과 완전히 동일하게 동작
+
+---
+
+## 4. 변경 파일 목록
+
+### 신규 생성
+| 파일 | 설명 |
+|------|------|
+| `services/model/mlx_vision_model.py` | MlxVisionModel 클래스 (mlx-vlm 래퍼, lazy load) |
+
+### 수정
+| 파일 | 변경 내용 |
+|------|----------|
+| `config.py` | `vision_enabled: bool`, `vision_model_id: str`, `vision_max_tokens: int` 추가 |
+| `container.py` | `vision_model` Singleton 프로바이더 추가 |
+| `services/agent/tools.py` | `make_vision_tool(vision_model, image_path)` 추가 |
+| `services/agent/agent_service.py` | `stream_response(image_path=None)` 파라미터 추가, 도구 동적 주입 |
+| `api.py` | `/chat` 요청에 `image_base64` 필드(선택) 추가, 디코딩 후 이미지 temp 저장 |
+| `youlbot-webui/api_client.py` | `chat(image_path=None)` 파라미터 추가, multipart 전송 |
+| `youlbot-webui/app.py` | 채팅 입력 영역에 이미지 업로드 컴포넌트 추가 |
+
+---
+
+## 5. 주요 구현 세부사항
+
+### 5.1 MlxVisionModel
+```python
+class MlxVisionModel:
+    def __init__(self, model_id: str, max_tokens: int = 512): ...
+    
+    def analyze(self, image_path: str, prompt: str = "이 이미지를 한국어로 자세히 설명해줘.") -> str:
+        # 첫 호출 시 lazy load
+        # mlx_vlm.generate() 호출
+        # 한국어 설명 반환
+```
+
+### 5.2 make_vision_tool
+```python
+def make_vision_tool(vision_model, image_path: str):
+    @tool
+    def analyze_image(prompt: str = "이 이미지를 설명해줘") -> str:
+        """현재 첨부된 이미지를 분석한다."""
+        return vision_model.analyze(image_path, prompt)
+    return analyze_image
+```
+
+### 5.3 API /chat 변경
+- JSON Body 유지 (`multipart/form-data`로 변경하지 않음)
+- 필드: `message`, `user_id`, `show_thinking`, `image_base64` (optional, base64 문자열)
+- 이미지를 `/tmp/youlbot_img_{uuid}.{ext}`에 저장 후 agent에 전달
+- 응답 완료 후 temp 파일 삭제
+
+### 5.4 WebUI 변경
+- `gr.Image(type="filepath", ...)` 컴포넌트 채팅 입력 영역에 추가
+- 이미지 첨부 시 api_client.chat()에 image_path 전달
+- 전송 후 이미지 초기화
+
+---
+
+## 6. 환경 설정
+
+```env
+# .env 추가
+VISION_ENABLED=true
+VISION_MODEL_ID=mlx-community/Qwen2.5-VL-7B-Instruct-4bit
+```
+
+```bash
+# 패키지 설치
+pip install mlx-vlm
+```
+
+---
+
+## 7. 위험 요소 및 대응
+
+| 위험 | 대응 |
+|------|------|
+| 16GB에서 두 모델 동시 OOM | Vision 모델 lazy load + 미사용 시 unload 옵션 제공 |
+| mlx-vlm API 변경 가능성 | MlxVisionModel로 캡슐화해 교체 용이하게 |
+| Telegram 이미지 전달 복잡성 | Phase 17-B로 분리, 우선 WebUI만 구현 |
+| 이미지 temp 파일 누적 | 응답 완료 후 즉시 삭제 |
+
+---
+
+## 8. 성공 기준
+
+- [ ] 이미지 첨부 시 `analyze_image` 도구가 자동 호출되어 설명 생성
+- [ ] "이 사진에서 뭐가 보여?" 후속 질문이 히스토리 기반으로 동작
+- [ ] 이미지 없는 일반 질문은 기존과 동일하게 Qwen3-8B로 처리
+- [ ] 16GB 환경에서 OOM 없이 동작
diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md
index dcf0f5f..eef0d6e 100644
--- a/docs/ROADMAP.md
+++ b/docs/ROADMAP.md
@@ -479,13 +479,27 @@ docker-compose.yml
 
 ---
 
-## Phase 17 — 멀티모달 이미지 이해 ★☆☆
+## ✅ Phase 17 — 멀티모달 이미지 이해 ★☆☆
 
 **배경**: 이유식 사진 → 재료 분석, 금융 서류 사진 → 내용 해석 등.
 
-**제약**: Qwen3-14B는 이미지 미지원 → `mlx-community/Qwen2.5-VL-7B-Instruct-4bit` 교체 필요.
+**구현 방식**: Dual-model C방식 — analyze_image 도구
 
-**난이도**: 높음 | **임팩트**: 높음 (장기 과제)
+| 모델 | 역할 |
+|------|------|
+| Qwen3-8B-4bit | 대화·추론 (항상 로드) |
+| Qwen2.5-VL-7B-Instruct-4bit | 이미지 분석 (lazy load) |
+
+- `services/model/mlx_vision_model.py` — MlxVisionModel (mlx-vlm 래퍼, lazy load)
+- `services/agent/tools.py` — `make_vision_tool(vision_model, image_path)` 추가
+- `agent_service.py` — `stream_response(image_path=None)`, config 경유 vision tool 동적 주입
+- `api.py` — `image_base64` 필드 추가, temp 파일 저장 후 응답 완료 시 삭제
+- `youlbot-webui` — `image_input` 컴포넌트 추가, ChatService.chat(image_path=) 연결
+- `.env` — `VISION_ENABLED=true`, `VISION_MODEL_ID` 설정
+
+**실행 방법**: API 서버 재시작 후 WebUI 이미지 첨부 버튼으로 사진 전송
+
+**난이도**: 높음 | **임팩트**: 높음
 
 ---
 
@@ -529,4 +543,4 @@ Phase 20 RAGAS 평가       →  Phase 15 (모델선택)    →  Phase 16 (Docke
 | Phase 20 RAGAS 평가 | ✅ 완료 | — | — | — |
 | Phase 15 모델 선택 | 🔲 미완 | 중간 | 중간 | 4순위 |
 | Phase 16 Docker | 🔲 미완 | 높음 | 중간 | 5순위 |
-| Phase 17 멀티모달 | 🔲 미완 | 높음 | 높음 | 6순위 |
+| Phase 17 멀티모달 | ✅ 완료 | — | — | — |
diff --git a/services/agent/agent_service.py b/services/agent/agent_service.py
index 146649f..7653273 100644
--- a/services/agent/agent_service.py
+++ b/services/agent/agent_service.py
@@ -10,7 +10,7 @@ from langgraph.config import get_stream_writer
 from langgraph.graph import END, START, MessagesState, StateGraph
 from langgraph.prebuilt import ToolNode
 
-from services.agent.tools import get_current_date, make_memory_tools, make_retriever_tool, make_search_tool, web_search
+from services.agent.tools import get_current_date, make_memory_tools, make_retriever_tool, make_search_tool, make_vision_tool, web_search
 
 
 class AgentService:
@@ -71,11 +71,13 @@ class AgentService:
             search_tool = make_search_tool(retriever_service, self._source_buffer)
         else:
             search_tool = make_retriever_tool(retriever_service)
-        tools = [search_tool, web_search, get_current_date]
+        self._base_tools = [search_tool, web_search, get_current_date]
         if user_profile_repository is not None:
             remember_tool, recall_tool = make_memory_tools(user_profile_repository, user_id)
-            tools += [remember_tool, recall_tool]
-        llm_with_tools = chat_model.bind_tools(tools)
+            self._base_tools += [remember_tool, recall_tool]
+        self._vision_model = None  # set via set_vision_model()
+        self._llm_with_tools = chat_model.bind_tools(self._base_tools)
+        self._chat_model = chat_model
 
         async def call_model(state: MessagesState, config: RunnableConfig) -> dict:
             from datetime import date
@@ -123,9 +125,16 @@ class AgentService:
             # LLM 추론 시작 직전에 즉시 신호 emit — UI에 "분석 중" 표시
             if writer:
                 writer({"__start": True})
-            # 체크박스 값을 모델의 enable_thinking으로 전달 (런타임 오버라이드)
-            show_thinking = config.get("configurable", {}).get("show_thinking", False)
-            _llm = llm_with_tools.bind(enable_thinking=show_thinking) if show_thinking != chat_model.enable_thinking else llm_with_tools
+            # 이미지 첨부 시 vision tool 동적 추가 (요청별로 독립적으로 바인딩)
+            cfg = config.get("configurable", {})
+            show_thinking = cfg.get("show_thinking", False)
+            image_path = cfg.get("image_path")
+            if image_path and self._vision_model:
+                tools_for_req = self._base_tools + [make_vision_tool(self._vision_model, image_path)]
+                _llm_base = self._chat_model.bind_tools(tools_for_req)
+            else:
+                _llm_base = self._llm_with_tools
+            _llm = _llm_base.bind(enable_thinking=show_thinking) if show_thinking != chat_model.enable_thinking else _llm_base
             async for chunk in _llm.astream(msgs, config):
                 t = chunk.additional_kwargs.get("thinking", "")
                 if t:
@@ -221,10 +230,21 @@ class AgentService:
     def last_run_id(self) -> str | None:
         return self._last_run_id
 
-    def _make_config(self, show_thinking: bool = False) -> dict:
-        return {"configurable": {"thread_id": self._thread_id, "show_thinking": show_thinking}}
+    def set_vision_model(self, vision_model) -> None:
+        self._vision_model = vision_model  # call_model checks this per request; while it is None, analyze_image is never added
 
-    async def stream_response(self, user_input: str, show_thinking: bool | None = None) -> AsyncIterator[str | dict]:
+    def _make_config(self, show_thinking: bool = False, image_path: str | None = None) -> dict:
+        """Build the per-run config; `image_path` is included only when it is truthy."""
+        image_entry = {"image_path": image_path} if image_path else {}
+        configurable = {"thread_id": self._thread_id, "show_thinking": show_thinking, **image_entry}
+        return {"configurable": configurable}
+
+    async def stream_response(
+        self,
+        user_input: str,
+        show_thinking: bool | None = None,
+        image_path: str | None = None,
+    ) -> AsyncIterator[str | dict]:
         """사용자 입력을 받아 응답 토큰을 순서대로 yield한다.
 
         실제 답변: plain str
@@ -233,7 +253,7 @@ class AgentService:
         _think_verbose = show_thinking if show_thinking is not None else self._think_verbose
         self._source_buffer.clear()
         run_id = uuid.uuid4()
-        run_config = {**self._make_config(_think_verbose), "run_id": str(run_id)}
+        run_config = {**self._make_config(_think_verbose, image_path=image_path), "run_id": str(run_id)}
 
         # 재시작 후 첫 호출 시 MySQL 이력을 초기 상태에 주입
         if self._pending_history:
diff --git a/services/agent/tools.py b/services/agent/tools.py
index 9aa6e25..24bd765 100644
--- a/services/agent/tools.py
+++ b/services/agent/tools.py
@@ -3,6 +3,17 @@ from datetime import date
 from langchain_core.tools import tool
 
 
+def make_vision_tool(vision_model, image_path: str):
+    """Return an `analyze_image` tool that runs `vision_model.analyze` on this request's `image_path`."""
+    # image_path is closed over, so the tool's only argument is `prompt`; the inner docstring is runtime text for @tool and stays verbatim.
+    @tool
+    def analyze_image(prompt: str = "이 이미지를 한국어로 자세히 설명해줘.") -> str:
+        """첨부된 이미지를 분석한다. 이미지 속 음식, 문서, 사람, 사물 등을 파악할 때 사용하세요."""
+        return vision_model.analyze(image_path, prompt)
+
+    return analyze_image
+
+
 @tool
 def get_current_date() -> str:
     """오늘 날짜를 반환합니다. 나이 계산, 날짜 비교 등 현재 날짜가 필요할 때 반드시 먼저 호출하세요."""
diff --git a/services/model/mlx_vision_model.py b/services/model/mlx_vision_model.py
new file mode 100644
index 0000000..5191582
--- /dev/null
+++ b/services/model/mlx_vision_model.py
@@ -0,0 +1,91 @@
+"""Image analysis service backed by Qwen2.5-VL via mlx-vlm.
+
+The model is lazy-loaded on the first analyze() call to save memory.
+"""
+from __future__ import annotations
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+_DEFAULT_PROMPT = "이 이미지를 한국어로 자세히 설명해줘. 사람, 음식, 문서 등 보이는 것을 빠짐없이 설명해."
+
+
+class MlxVisionModel:
+    """Lazy wrapper around an mlx-vlm vision-language model.
+
+    Nothing from mlx-vlm is imported or loaded until the first ``analyze()``
+    call; after that the model, processor and config are reused.
+    """
+
+    def __init__(self, model_id: str, max_tokens: int = 512) -> None:
+        """Store settings only; no model is loaded here.
+
+        Args:
+            model_id: Identifier passed to ``mlx_vlm.load`` and ``load_config``.
+            max_tokens: Generation limit forwarded to ``mlx_vlm.generate``.
+        """
+        self._model_id = model_id
+        self._max_tokens = max_tokens
+        self._model = None
+        self._processor = None
+        self._config = None
+
+    def _load(self) -> None:
+        """Load model, processor and config once; later calls return immediately."""
+        if self._model is not None:
+            return
+        logger.info("Vision 모델 로딩 중: %s", self._model_id)
+        from mlx_vlm import load
+        from mlx_vlm.utils import load_config
+
+        model, processor = load(self._model_id)
+        # Read the config here so analyze() does not re-read it on every call.
+        self._config = load_config(self._model_id)
+        # Assign _model last: it is the "already loaded" flag tested above, so a
+        # failure in any earlier step leaves the instance unloaded for a retry.
+        self._processor = processor
+        self._model = model
+        logger.info("Vision 모델 로딩 완료")
+
+    @staticmethod
+    def _extract_text(result: object) -> str:
+        """Return the generated text from whatever ``generate`` returned.
+
+        A ``str`` is returned unchanged; an object with a string ``text``
+        attribute yields that attribute; anything else becomes ``str(result)``.
+        NOTE(review): newer mlx-vlm releases appear to return a result object
+        rather than a plain string — confirm against the pinned version.
+        """
+        if isinstance(result, str):
+            return result
+        text = getattr(result, "text", None)
+        return text if isinstance(text, str) else str(result)
+
+    def analyze(self, image_path: str, prompt: str = _DEFAULT_PROMPT) -> str:
+        """Analyze one image and return the generated description.
+
+        Args:
+            image_path: Path of the image, forwarded to ``mlx_vlm.generate``.
+            prompt: Instruction for the model (the default asks for a detailed
+                Korean description); it is wrapped in the model's chat template.
+
+        Returns:
+            The generated text.
+        """
+        self._load()
+        from mlx_vlm import generate
+        from mlx_vlm.prompt_utils import apply_chat_template
+
+        formatted_prompt = apply_chat_template(
+            self._processor, self._config, prompt, num_images=1
+        )
+        result = generate(
+            self._model,
+            self._processor,
+            image=image_path,
+            prompt=formatted_prompt,
+            max_tokens=self._max_tokens,
+            verbose=False,
+        )
+        return self._extract_text(result)