Phase 17: Multimodal image understanding via analyze_image tool
Dual-model approach (C): Qwen3-8B handles conversation, Qwen2.5-VL-7B analyzes images on demand via analyze_image LangChain tool. - services/model/mlx_vision_model.py: MlxVisionModel (mlx-vlm wrapper, lazy load) - services/agent/tools.py: make_vision_tool(vision_model, image_path) - agent_service.py: stream_response(image_path=None), dynamic tool binding via config["image_path"] — thread-safe per-request rebinding - container.py: vision_model Singleton provider - config.py: vision_enabled, vision_model_id, vision_max_tokens - api.py: image_base64 in ChatRequest, decode to temp file, cleanup after stream Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,48 @@
|
||||
"""Qwen2.5-VL (mlx-vlm) 기반 이미지 분석 서비스.
|
||||
|
||||
첫 analyze() 호출 시 모델을 lazy load해 메모리를 아낀다.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)

# Instruction sent to the vision model when the caller supplies no prompt.
_DEFAULT_PROMPT = "이 이미지를 한국어로 자세히 설명해줘. 사람, 음식, 문서 등 보이는 것을 빠짐없이 설명해."


class MlxVisionModel:
    """Lazy-loading wrapper around an mlx-vlm vision-language model.

    Nothing is loaded at construction time. The model and processor are
    loaded on the first ``analyze()`` call and reused by later calls; the
    model config is likewise read once and cached on the instance.
    """

    def __init__(self, model_id: str, max_tokens: int = 512) -> None:
        """Store the settings without loading anything.

        Args:
            model_id: Identifier handed to ``mlx_vlm.load`` and
                ``mlx_vlm.utils.load_config``.
            max_tokens: Forwarded to ``mlx_vlm.generate`` as ``max_tokens``.
        """
        self._model_id = model_id
        self._max_tokens = max_tokens
        self._model = None
        self._processor = None
        # Filled on the first analyze() call so the config is read only once.
        self._config = None

    def _load(self) -> None:
        """Load the model and processor unless a model is already held."""
        if self._model is not None:
            return
        logger.info("Vision 모델 로딩 중: %s", self._model_id)
        # Imported here rather than at module top so that importing this
        # module does not require mlx-vlm until a model is actually needed.
        from mlx_vlm import load

        self._model, self._processor = load(self._model_id)
        logger.info("Vision 모델 로딩 완료")

    @staticmethod
    def _extract_text(result: object) -> str:
        """Return the generated text from whatever ``generate`` returned.

        A plain string is returned unchanged. An object exposing a string
        ``text`` attribute yields that attribute. Anything else falls back
        to ``str(result)``.
        """
        if isinstance(result, str):
            return result
        text = getattr(result, "text", None)
        if isinstance(text, str):
            return text
        # NOTE(review): some mlx-vlm releases may return ``(text, stats)``;
        # handled defensively here — confirm against the pinned version.
        if isinstance(result, tuple) and result and isinstance(result[0], str):
            return result[0]
        return str(result)

    def analyze(self, image_path: str, prompt: str = _DEFAULT_PROMPT) -> str:
        """Run the vision model on one image and return its text output.

        Args:
            image_path: Passed to ``mlx_vlm.generate`` as ``image``.
            prompt: Instruction for the model; it is wrapped with the
                model's chat template (for a single image) before
                generation. Defaults to a Korean "describe this image"
                prompt.

        Returns:
            The generated text as a ``str``.
        """
        self._load()
        from mlx_vlm import generate
        from mlx_vlm.prompt_utils import apply_chat_template

        if self._config is None:
            # The config depends only on the model id, so read it once
            # instead of on every call.
            from mlx_vlm.utils import load_config

            self._config = load_config(self._model_id)

        formatted_prompt = apply_chat_template(
            self._processor, self._config, prompt, num_images=1
        )
        result = generate(
            self._model,
            self._processor,
            image=image_path,
            prompt=formatted_prompt,
            max_tokens=self._max_tokens,
            verbose=False,
        )
        return self._extract_text(result)
|
||||
Reference in New Issue
Block a user