From 7f50333bdb4d25766437d8626c755ba21a4b9eb2 Mon Sep 17 00:00:00 2001 From: sal Date: Tue, 2 Jun 2026 13:52:21 +0900 Subject: [PATCH] Phase 17: Add image upload to chat UI - app.py: image_input gr.Image component, respond() accepts image_path, all yields updated to 7 outputs - api_client.py: chat(image_path=None), base64-encodes image for API - services/chat.py: chat(image_path=None) passes through to api_client Co-Authored-By: Claude Sonnet 4.6 --- api_client.py | 8 ++++- app.py | 79 +++++++++++++++++++++++++++--------------------- services/chat.py | 8 +++-- 3 files changed, 57 insertions(+), 38 deletions(-) diff --git a/api_client.py b/api_client.py index e429be2..b29d909 100644 --- a/api_client.py +++ b/api_client.py @@ -57,11 +57,17 @@ class HTTPAPIClient: message: str, user_id: str = "default", show_thinking: bool = False, + image_path: str | None = None, ) -> AsyncIterator[tuple[str, str | None]]: + payload: dict = {"message": message, "user_id": user_id, "show_thinking": show_thinking} + if image_path: + import base64 + with open(image_path, "rb") as f: + payload["image_base64"] = base64.b64encode(f.read()).decode() async with self._client.stream( "POST", f"{self._url}/chat", - json={"message": message, "user_id": user_id, "show_thinking": show_thinking}, + json=payload, timeout=self._timeout, ) as response: response.raise_for_status() diff --git a/app.py b/app.py index 789eaee..4be3666 100644 --- a/app.py +++ b/app.py @@ -52,25 +52,31 @@ def transcribe_audio(filepath: str) -> str: # ── 채팅 ───────────────────────────────────────────────────────── -async def respond(message, history, show_thinking, user_id, use_tts, run_ids): - if not message.strip(): - yield history, "", None, run_ids, "", "" +async def respond(message, history, show_thinking, user_id, use_tts, run_ids, image_path): + if not message.strip() and not image_path: + yield history, "", None, run_ids, "", "", None return history = list(history) run_ids = list(run_ids) - history.append({"role": "user", "content": message}) + display_msg = message + if image_path: + display_msg = f"🖼️ [이미지 첨부]\n{message}" if message.strip() else "🖼️ [이미지 첨부]" + history.append({"role": "user", "content": display_msg}) history.append({"role": "assistant", "content": ""}) - yield history, "", None, run_ids, "", "" # thinking_box + source_box 초기화 + yield history, "", None, run_ids, "", "", None # boxes 초기화 + 이미지 초기화 collected_run_id: str | None = None - tts_text = "" # 순수 답변만 누적 (TTS용) - thinking_acc = "" # 전체 누적 (완료 후 details용) - thinking_text = "" # __thinking 토큰만 (줄 감지용) + tts_text = "" + thinking_acc = "" + thinking_text = "" thinking_finalized = False + source_box_html = "" try: - async for token, run_id in container.chat_service().chat(message, user_id, show_thinking): + async for token, run_id in container.chat_service().chat( + message or "이 이미지를 분석해줘.", user_id, show_thinking, image_path=image_path + ): if run_id is not None: collected_run_id = run_id break @@ -78,14 +84,14 @@ async def respond(message, history, show_thinking, user_id, use_tts, run_ids): # 즉시 상태 — thinking_acc에 누적 안 함 if isinstance(token, dict) and "__status" in token: if not thinking_acc: - yield history, "", None, run_ids, _status_html(token["__status"]), gr.update() + yield history, "", None, run_ids, _status_html(token["__status"]), gr.update(), gr.update() continue # 사고 과정(LLM thinking) — 현재 줄만 live_html로 표시 if isinstance(token, dict) and "__thinking" in token: thinking_text += token["__thinking"] thinking_acc += token["__thinking"] - yield history, "", None, run_ids, _live_html(_last_line(thinking_text)), gr.update() + yield history, "", None, run_ids, _live_html(_last_line(thinking_text)), gr.update(), gr.update() continue # 진행 로그(LangGraph, 검색 등) — 메시지 전체를 live_html로 표시 @@ -93,36 +99,36 @@ async def respond(message, history, show_thinking, user_id, use_tts, run_ids): thinking_acc += token["__meta"] live = token["__meta"].strip() if live: - yield history, "", None, run_ids, _live_html(live), gr.update() + yield history, "", None, run_ids, _live_html(live), gr.update(), gr.update() continue # RAG 출처 — 별도 source_box로 표시 if isinstance(token, dict) and "__sources" in token: source_box_html = _sources_html(token["__sources"]) - yield history, "", None, run_ids, gr.update(), source_box_html + yield history, "", None, run_ids, gr.update(), source_box_html, gr.update() continue # 첫 답변 토큰 도착 — 전체를 details로 전환 (접힌 상태) if thinking_acc and not thinking_finalized: thinking_finalized = True - yield history, "", None, run_ids, _thinking_html(thinking_acc), gr.update() + yield history, "", None, run_ids, _thinking_html(thinking_acc), gr.update(), gr.update() tts_text += token history[-1]["content"] += token - yield history, "", None, run_ids, gr.update(), gr.update() + yield history, "", None, run_ids, gr.update(), gr.update(), gr.update() except Exception as e: history[-1]["content"] += f"\n\n[오류: {e}]" - yield history, "", None, run_ids, gr.update(), gr.update() + yield history, "", None, run_ids, gr.update(), gr.update(), gr.update() return run_ids.append(collected_run_id) if use_tts: audio_path = await container.tts_service().speak(tts_text) - yield history, "", audio_path, run_ids, gr.update(), gr.update() + yield history, "", audio_path, run_ids, gr.update(), gr.update(), gr.update() else: - yield history, "", None, run_ids, gr.update(), gr.update() + yield history, "", None, run_ids, gr.update(), gr.update(), gr.update() async def handle_feedback(like_data: gr.LikeData, history, run_ids, user_id): @@ -274,14 +280,22 @@ with gr.Blocks(title="율봇") as demo: thinking_box = gr.HTML(value="") chatbot = gr.Chatbot(label="율봇", height=500) source_box = gr.HTML(value="") - with gr.Row(): - msg_box = gr.Textbox( - placeholder="질문을 입력하세요... (Enter로 전송)", - label="", - scale=5, - autofocus=True, + with gr.Row(equal_height=True): + image_input = gr.Image( + type="filepath", + label="이미지 첨부 (선택)", + sources=["upload", "clipboard"], + scale=1, + height=120, ) - send_btn = gr.Button("전송", variant="primary", scale=1) + with gr.Column(scale=5): + msg_box = gr.Textbox( + placeholder="질문을 입력하세요... (Enter로 전송)", + label="", + lines=2, + autofocus=True, + ) + send_btn = gr.Button("전송", variant="primary") with gr.Row(): audio_input = gr.Audio( @@ -310,16 +324,11 @@ with gr.Blocks(title="율봇") as demo: transcribe_btn.click(transcribe_audio, inputs=[audio_input], outputs=[msg_box]) - send_btn.click( - respond, - inputs=[msg_box, chatbot, show_thinking, user_state, use_tts, run_ids_state], - outputs=[chatbot, msg_box, tts_output, run_ids_state, thinking_box, source_box], - ) - msg_box.submit( - respond, - inputs=[msg_box, chatbot, show_thinking, user_state, use_tts, run_ids_state], - outputs=[chatbot, msg_box, tts_output, run_ids_state, thinking_box, source_box], - ) + _respond_inputs = [msg_box, chatbot, show_thinking, user_state, use_tts, run_ids_state, image_input] + _respond_outputs = [chatbot, msg_box, tts_output, run_ids_state, thinking_box, source_box, image_input] + + send_btn.click(respond, inputs=_respond_inputs, outputs=_respond_outputs) + msg_box.submit(respond, inputs=_respond_inputs, outputs=_respond_outputs) reset_btn.click(reset_chat, inputs=[user_state], outputs=[chatbot, run_ids_state]) chatbot.like( diff --git a/services/chat.py b/services/chat.py index 2e58ae8..7c976f7 100644 --- a/services/chat.py +++ b/services/chat.py @@ -8,9 +8,13 @@ class ChatService: self._api = api_client def chat( - self, message: str, user_id: str, show_thinking: bool + self, + message: str, + user_id: str, + show_thinking: bool, + image_path: str | None = None, ) -> AsyncIterator[tuple[str, str | None]]: - return self._api.chat(message, user_id, show_thinking) + return self._api.chat(message, user_id, show_thinking, image_path=image_path) async def reset(self, user_id: str) -> None: await self._api.reset(user_id)