Phase 17: Multimodal image understanding via analyze_image tool

Dual-model approach (C): Qwen3-8B handles conversation, while Qwen2.5-VL-7B analyzes images on demand via the analyze_image LangChain tool.

- services/model/mlx_vision_model.py: MlxVisionModel (mlx-vlm wrapper, lazy load)
- services/agent/tools.py: make_vision_tool(vision_model, image_path)
- agent_service.py: stream_response(image_path=None), dynamic tool binding via config["image_path"] — thread-safe per-request rebinding
- container.py: vision_model Singleton provider
- config.py: vision_enabled, vision_model_id, vision_max_tokens
- api.py: image_base64 in ChatRequest, decode to temp file, cleanup after stream

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-02 13:52:10 +09:00
parent bdb6fd83c4
commit 68f741af72
8 changed files with 355 additions and 18 deletions
@@ -41,6 +41,9 @@ _container.db_service().init_schema()
 _cfg = _container.config()
 _agent_cache: dict[str, AgentService] = {}

+# Vision 모델 — VISION_ENABLED=true 시 lazy 초기화
+_vision_model = _container.vision_model() if _cfg.vision_enabled else None
+

 def _get_agent(user_id: str) -> AgentService:
    if user_id not in _agent_cache:
@@ -57,6 +60,8 @@ def _get_agent(user_id: str) -> AgentService:
            conversation_repository=_container.conversation_repository(),
            user_id=user_id,
        )
+        if _vision_model:
+            _agent_cache[user_id].set_vision_model(_vision_model)
    return _agent_cache[user_id]


@@ -74,6 +79,7 @@ class ChatRequest(BaseModel):
    message: str
    user_id: str = "default"
    show_thinking: bool = False
+    image_base64: str | None = None  # base64 인코딩된 이미지 (선택)


 class FeedbackRequest(BaseModel):
@@ -97,10 +103,33 @@ async def chat(req: ChatRequest, _=Depends(_auth)):
    """SSE 스트리밍 응답. 각 라인: `data: <JSON 토큰>\n\n`, 종료: `data: [DONE]\n\n`"""
    agent = _get_agent(req.user_id)

+    # 이미지 base64 → 임시 파일 저장
+    image_path: str | None = None
+    tmp_path: str | None = None
+    if req.image_base64 and _vision_model:
+        import base64
+        img_bytes = base64.b64decode(req.image_base64)
+        suffix = ".jpg"
+        if img_bytes[:4] == b"\x89PNG":
+            suffix = ".png"
+        elif img_bytes[:4] == b"GIF8":
+            suffix = ".gif"
+        tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False, dir="/tmp", prefix="youlbot_img_")
+        tmp.write(img_bytes)
+        tmp.close()
+        image_path = tmp.name
+        tmp_path = tmp.name
+
    async def generate():
-        async for token in agent.stream_response(req.message, show_thinking=req.show_thinking):
-            yield f"data: {json.dumps(token, ensure_ascii=False)}\n\n"
-        yield f"data: {json.dumps({'__done': True, 'run_id': agent.last_run_id}, ensure_ascii=False)}\n\n"
+        try:
+            async for token in agent.stream_response(
+                req.message, show_thinking=req.show_thinking, image_path=image_path
+            ):
+                yield f"data: {json.dumps(token, ensure_ascii=False)}\n\n"
+            yield f"data: {json.dumps({'__done': True, 'run_id': agent.last_run_id}, ensure_ascii=False)}\n\n"
+        finally:
+            if tmp_path and os.path.exists(tmp_path):
+                os.unlink(tmp_path)

    return StreamingResponse(generate(), media_type="text/event-stream")