# youlbot/services/model/mlx_vision_model.py

"""Qwen2.5-VL (mlx-vlm) 기반 이미지 분석 서비스.

첫 analyze() 호출 시 모델을 lazy load해 메모리를 아낀다.
"""
from __future__ import annotations

import logging

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Default value of the ``prompt`` parameter of MlxVisionModel.analyze().
# The Korean text asks for a detailed description of the image in Korean,
# covering everything visible (people, food, documents, ...).
_DEFAULT_PROMPT = "이 이미지를 한국어로 자세히 설명해줘. 사람, 음식, 문서 등 보이는 것을 빠짐없이 설명해."


class MlxVisionModel:
    """Image-description service backed by an mlx-vlm vision-language model.

    The model and processor are loaded on the first ``analyze()`` call rather
    than in ``__init__``, so constructing an instance does not load anything.
    """

    def __init__(self, model_id: str, max_tokens: int = 512) -> None:
        """Store the settings; nothing is loaded here.

        Args:
            model_id: Identifier passed to ``mlx_vlm.load`` and
                ``mlx_vlm.utils.load_config``.
            max_tokens: Forwarded to ``mlx_vlm.generate`` as ``max_tokens``.
        """
        self._model_id = model_id
        self._max_tokens = max_tokens
        self._model = None
        self._processor = None
        # Result of load_config(), cached on first use so that analyze()
        # does not re-read the configuration for every image.
        self._config = None

    def _load(self) -> None:
        """Load the model and processor once; later calls return immediately."""
        if self._model is not None:
            return
        logger.info("Vision 모델 로딩 중: %s", self._model_id)
        # Imported lazily so that importing this module does not import
        # mlx_vlm until a model is actually needed.
        from mlx_vlm import load

        self._model, self._processor = load(self._model_id)
        logger.info("Vision 모델 로딩 완료")

    def _get_config(self):
        """Return the model configuration, loading it only on first use."""
        if self._config is None:
            from mlx_vlm.utils import load_config

            self._config = load_config(self._model_id)
        return self._config

    @staticmethod
    def _extract_text(result) -> str:
        """Reduce whatever ``generate`` returned to the generated text.

        NOTE(review): the return type of ``mlx_vlm.generate`` appears to
        differ between releases (plain ``str``, a ``(text, stats)`` tuple, or
        a result object exposing ``.text``) — confirm against the pinned
        mlx-vlm version. Anything unrecognised falls back to ``str(result)``,
        which is what the previous implementation always did.
        """
        if isinstance(result, str):
            return result
        text = getattr(result, "text", None)
        if isinstance(text, str):
            return text
        if isinstance(result, tuple) and result and isinstance(result[0], str):
            return result[0]
        return str(result)

    def analyze(self, image_path: str, prompt: str = _DEFAULT_PROMPT) -> str:
        """Run the vision model on one image and return the generated text.

        Args:
            image_path: Passed to ``mlx_vlm.generate`` as ``image``.
            prompt: Instruction for the model; it is wrapped with the model's
                chat template (for a single image) before generation.

        Returns:
            The generated text as a ``str``.
        """
        self._load()
        from mlx_vlm import generate
        from mlx_vlm.prompt_utils import apply_chat_template

        formatted_prompt = apply_chat_template(
            self._processor, self._get_config(), prompt, num_images=1
        )
        result = generate(
            self._model,
            self._processor,
            image=image_path,
            prompt=formatted_prompt,
            max_tokens=self._max_tokens,
            verbose=False,
        )
        return self._extract_text(result)