Phase 17: Multimodal image understanding via analyze_image tool
Dual-model approach (C): Qwen3-8B handles the conversation, and Qwen2.5-VL-7B analyzes images on demand via the analyze_image LangChain tool.

- services/model/mlx_vision_model.py: MlxVisionModel (mlx-vlm wrapper, lazy load)
- services/agent/tools.py: make_vision_tool(vision_model, image_path)
- agent_service.py: stream_response(image_path=None); dynamic tool binding via config["configurable"]["image_path"] — thread-safe per-request rebinding
- container.py: vision_model Singleton provider
- config.py: vision_enabled, vision_model_id, vision_max_tokens
- api.py: image_base64 in ChatRequest, decoded to a temp file, cleaned up after the stream

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -10,7 +10,7 @@ from langgraph.config import get_stream_writer
|
||||
from langgraph.graph import END, START, MessagesState, StateGraph
|
||||
from langgraph.prebuilt import ToolNode
|
||||
|
||||
from services.agent.tools import get_current_date, make_memory_tools, make_retriever_tool, make_search_tool, web_search
|
||||
from services.agent.tools import get_current_date, make_memory_tools, make_retriever_tool, make_search_tool, make_vision_tool, web_search
|
||||
|
||||
|
||||
class AgentService:
|
||||
@@ -71,11 +71,13 @@ class AgentService:
|
||||
search_tool = make_search_tool(retriever_service, self._source_buffer)
|
||||
else:
|
||||
search_tool = make_retriever_tool(retriever_service)
|
||||
tools = [search_tool, web_search, get_current_date]
|
||||
self._base_tools = [search_tool, web_search, get_current_date]
|
||||
if user_profile_repository is not None:
|
||||
remember_tool, recall_tool = make_memory_tools(user_profile_repository, user_id)
|
||||
tools += [remember_tool, recall_tool]
|
||||
llm_with_tools = chat_model.bind_tools(tools)
|
||||
self._base_tools += [remember_tool, recall_tool]
|
||||
self._vision_model = None # set via set_vision_model()
|
||||
self._llm_with_tools = chat_model.bind_tools(self._base_tools)
|
||||
self._chat_model = chat_model
|
||||
|
||||
async def call_model(state: MessagesState, config: RunnableConfig) -> dict:
|
||||
from datetime import date
|
||||
@@ -123,9 +125,16 @@ class AgentService:
|
||||
# LLM 추론 시작 직전에 즉시 신호 emit — UI에 "분석 중" 표시
|
||||
if writer:
|
||||
writer({"__start": True})
|
||||
# 체크박스 값을 모델의 enable_thinking으로 전달 (런타임 오버라이드)
|
||||
show_thinking = config.get("configurable", {}).get("show_thinking", False)
|
||||
_llm = llm_with_tools.bind(enable_thinking=show_thinking) if show_thinking != chat_model.enable_thinking else llm_with_tools
|
||||
# 이미지 첨부 시 vision tool 동적 추가 (요청별로 독립적으로 바인딩)
|
||||
cfg = config.get("configurable", {})
|
||||
show_thinking = cfg.get("show_thinking", False)
|
||||
image_path = cfg.get("image_path")
|
||||
if image_path and self._vision_model:
|
||||
tools_for_req = self._base_tools + [make_vision_tool(self._vision_model, image_path)]
|
||||
_llm_base = self._chat_model.bind_tools(tools_for_req)
|
||||
else:
|
||||
_llm_base = self._llm_with_tools
|
||||
_llm = _llm_base.bind(enable_thinking=show_thinking) if show_thinking != chat_model.enable_thinking else _llm_base
|
||||
async for chunk in _llm.astream(msgs, config):
|
||||
t = chunk.additional_kwargs.get("thinking", "")
|
||||
if t:
|
||||
@@ -221,10 +230,21 @@ class AgentService:
|
||||
def last_run_id(self) -> str | None:
|
||||
return self._last_run_id
|
||||
|
||||
def _make_config(self, show_thinking: bool = False) -> dict:
|
||||
return {"configurable": {"thread_id": self._thread_id, "show_thinking": show_thinking}}
|
||||
def set_vision_model(self, vision_model) -> None:
|
||||
self._vision_model = vision_model
|
||||
|
||||
async def stream_response(self, user_input: str, show_thinking: bool | None = None) -> AsyncIterator[str | dict]:
|
||||
def _make_config(self, show_thinking: bool = False, image_path: str | None = None) -> dict:
|
||||
cfg: dict = {"thread_id": self._thread_id, "show_thinking": show_thinking}
|
||||
if image_path:
|
||||
cfg["image_path"] = image_path
|
||||
return {"configurable": cfg}
|
||||
|
||||
async def stream_response(
|
||||
self,
|
||||
user_input: str,
|
||||
show_thinking: bool | None = None,
|
||||
image_path: str | None = None,
|
||||
) -> AsyncIterator[str | dict]:
|
||||
"""사용자 입력을 받아 응답 토큰을 순서대로 yield한다.
|
||||
|
||||
실제 답변: plain str
|
||||
@@ -233,7 +253,7 @@ class AgentService:
|
||||
_think_verbose = show_thinking if show_thinking is not None else self._think_verbose
|
||||
self._source_buffer.clear()
|
||||
run_id = uuid.uuid4()
|
||||
run_config = {**self._make_config(_think_verbose), "run_id": str(run_id)}
|
||||
run_config = {**self._make_config(_think_verbose, image_path=image_path), "run_id": str(run_id)}
|
||||
|
||||
# 재시작 후 첫 호출 시 MySQL 이력을 초기 상태에 주입
|
||||
if self._pending_history:
|
||||
|
||||
Reference in New Issue
Block a user