Phase 17: Multimodal image understanding via analyze_image tool
Dual-model approach (C): Qwen3-8B handles conversation, while Qwen2.5-VL-7B analyzes images on demand via the analyze_image LangChain tool.

- services/model/mlx_vision_model.py: MlxVisionModel (mlx-vlm wrapper, lazy load)
- services/agent/tools.py: make_vision_tool(vision_model, image_path)
- agent_service.py: stream_response(image_path=None), dynamic tool binding via config["image_path"] — thread-safe per-request rebinding
- container.py: vision_model Singleton provider
- config.py: vision_enabled, vision_model_id, vision_max_tokens
- api.py: image_base64 in ChatRequest, decode to temp file, cleanup after stream

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -41,6 +41,9 @@ _container.db_service().init_schema()
|
||||
_cfg = _container.config()
|
||||
_agent_cache: dict[str, AgentService] = {}
|
||||
|
||||
# Vision 모델 — VISION_ENABLED=true 시 lazy 초기화
|
||||
_vision_model = _container.vision_model() if _cfg.vision_enabled else None
|
||||
|
||||
|
||||
def _get_agent(user_id: str) -> AgentService:
|
||||
if user_id not in _agent_cache:
|
||||
@@ -57,6 +60,8 @@ def _get_agent(user_id: str) -> AgentService:
|
||||
conversation_repository=_container.conversation_repository(),
|
||||
user_id=user_id,
|
||||
)
|
||||
if _vision_model:
|
||||
_agent_cache[user_id].set_vision_model(_vision_model)
|
||||
return _agent_cache[user_id]
|
||||
|
||||
|
||||
@@ -74,6 +79,7 @@ class ChatRequest(BaseModel):
|
||||
message: str
|
||||
user_id: str = "default"
|
||||
show_thinking: bool = False
|
||||
image_base64: str | None = None # base64 인코딩된 이미지 (선택)
|
||||
|
||||
|
||||
class FeedbackRequest(BaseModel):
|
||||
@@ -97,10 +103,33 @@ async def chat(req: ChatRequest, _=Depends(_auth)):
|
||||
"""SSE 스트리밍 응답. 각 라인: `data: <JSON 토큰>\n\n`, 종료: `data: [DONE]\n\n`"""
|
||||
agent = _get_agent(req.user_id)
|
||||
|
||||
# 이미지 base64 → 임시 파일 저장
|
||||
image_path: str | None = None
|
||||
tmp_path: str | None = None
|
||||
if req.image_base64 and _vision_model:
|
||||
import base64
|
||||
img_bytes = base64.b64decode(req.image_base64)
|
||||
suffix = ".jpg"
|
||||
if img_bytes[:4] == b"\x89PNG":
|
||||
suffix = ".png"
|
||||
elif img_bytes[:4] == b"GIF8":
|
||||
suffix = ".gif"
|
||||
tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False, dir="/tmp", prefix="youlbot_img_")
|
||||
tmp.write(img_bytes)
|
||||
tmp.close()
|
||||
image_path = tmp.name
|
||||
tmp_path = tmp.name
|
||||
|
||||
async def generate():
|
||||
async for token in agent.stream_response(req.message, show_thinking=req.show_thinking):
|
||||
yield f"data: {json.dumps(token, ensure_ascii=False)}\n\n"
|
||||
yield f"data: {json.dumps({'__done': True, 'run_id': agent.last_run_id}, ensure_ascii=False)}\n\n"
|
||||
try:
|
||||
async for token in agent.stream_response(
|
||||
req.message, show_thinking=req.show_thinking, image_path=image_path
|
||||
):
|
||||
yield f"data: {json.dumps(token, ensure_ascii=False)}\n\n"
|
||||
yield f"data: {json.dumps({'__done': True, 'run_id': agent.last_run_id}, ensure_ascii=False)}\n\n"
|
||||
finally:
|
||||
if tmp_path and os.path.exists(tmp_path):
|
||||
os.unlink(tmp_path)
|
||||
|
||||
return StreamingResponse(generate(), media_type="text/event-stream")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user