From 7f50333bdb4d25766437d8626c755ba21a4b9eb2 Mon Sep 17 00:00:00 2001
From: sal <shinalok357@gmail.com>
Date: Tue, 2 Jun 2026 13:52:21 +0900
Subject: [PATCH] Phase 17: Add image upload to chat UI

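- app.py: add image_input (gr.Image, type="filepath"); respond() accepts
  image_path, and every yield now emits 7 values (the 7th targets
  image_input, which is reset to None once the user message is queued)
- api_client.py: chat() gains image_path=None; when set, the file is read
  and sent base64-encoded as "image_base64" in the JSON payload
- services/chat.py: chat() gains image_path=None and passes it through to
  api_client.chat()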

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 api_client.py    |  8 ++++-
 app.py           | 79 +++++++++++++++++++++++++++---------------------
 services/chat.py |  8 +++--
 3 files changed, 57 insertions(+), 38 deletions(-)

diff --git a/api_client.py b/api_client.py
index e429be2..b29d909 100644
--- a/api_client.py
+++ b/api_client.py
@@ -57,11 +57,17 @@ class HTTPAPIClient:
         message: str,
         user_id: str = "default",
         show_thinking: bool = False,
+        image_path: str | None = None,
     ) -> AsyncIterator[tuple[str, str | None]]:
+        payload: dict = {"message": message, "user_id": user_id, "show_thinking": show_thinking}
+        if image_path:
+            import base64
+            with open(image_path, "rb") as f:
+                payload["image_base64"] = base64.b64encode(f.read()).decode()
         async with self._client.stream(
             "POST",
             f"{self._url}/chat",
-            json={"message": message, "user_id": user_id, "show_thinking": show_thinking},
+            json=payload,
             timeout=self._timeout,
         ) as response:
             response.raise_for_status()
diff --git a/app.py b/app.py
index 789eaee..4be3666 100644
--- a/app.py
+++ b/app.py
@@ -52,25 +52,31 @@ def transcribe_audio(filepath: str) -> str:
 
 # ── 채팅 ─────────────────────────────────────────────────────────
 
-async def respond(message, history, show_thinking, user_id, use_tts, run_ids):
-    if not message.strip():
-        yield history, "", None, run_ids, "", ""
+async def respond(message, history, show_thinking, user_id, use_tts, run_ids, image_path):
+    if not message.strip() and not image_path:
+        yield history, "", None, run_ids, "", "", None
         return
 
     history = list(history)
     run_ids = list(run_ids)
-    history.append({"role": "user", "content": message})
+    display_msg = message
+    if image_path:
+        display_msg = f"🖼️ [이미지 첨부]\n{message}" if message.strip() else "🖼️ [이미지 첨부]"
+    history.append({"role": "user", "content": display_msg})
     history.append({"role": "assistant", "content": ""})
-    yield history, "", None, run_ids, "", ""  # thinking_box + source_box 초기화
+    yield history, "", None, run_ids, "", "", None  # boxes 초기화 + 이미지 초기화
 
     collected_run_id: str | None = None
-    tts_text = ""            # 순수 답변만 누적 (TTS용)
-    thinking_acc = ""        # 전체 누적 (완료 후 details용)
-    thinking_text = ""       # __thinking 토큰만 (줄 감지용)
+    tts_text = ""
+    thinking_acc = ""
+    thinking_text = ""
     thinking_finalized = False
+    source_box_html = ""
 
     try:
-        async for token, run_id in container.chat_service().chat(message, user_id, show_thinking):
+        async for token, run_id in container.chat_service().chat(
+            message or "이 이미지를 분석해줘.", user_id, show_thinking, image_path=image_path
+        ):
             if run_id is not None:
                 collected_run_id = run_id
                 break
@@ -78,14 +84,14 @@ async def respond(message, history, show_thinking, user_id, use_tts, run_ids):
             # 즉시 상태 — thinking_acc에 누적 안 함
             if isinstance(token, dict) and "__status" in token:
                 if not thinking_acc:
-                    yield history, "", None, run_ids, _status_html(token["__status"]), gr.update()
+                    yield history, "", None, run_ids, _status_html(token["__status"]), gr.update(), gr.update()
                 continue
 
             # 사고 과정(LLM thinking) — 현재 줄만 live_html로 표시
             if isinstance(token, dict) and "__thinking" in token:
                 thinking_text += token["__thinking"]
                 thinking_acc += token["__thinking"]
-                yield history, "", None, run_ids, _live_html(_last_line(thinking_text)), gr.update()
+                yield history, "", None, run_ids, _live_html(_last_line(thinking_text)), gr.update(), gr.update()
                 continue
 
             # 진행 로그(LangGraph, 검색 등) — 메시지 전체를 live_html로 표시
@@ -93,36 +99,36 @@ async def respond(message, history, show_thinking, user_id, use_tts, run_ids):
                 thinking_acc += token["__meta"]
                 live = token["__meta"].strip()
                 if live:
-                    yield history, "", None, run_ids, _live_html(live), gr.update()
+                    yield history, "", None, run_ids, _live_html(live), gr.update(), gr.update()
                 continue
 
             # RAG 출처 — 별도 source_box로 표시
             if isinstance(token, dict) and "__sources" in token:
                 source_box_html = _sources_html(token["__sources"])
-                yield history, "", None, run_ids, gr.update(), source_box_html
+                yield history, "", None, run_ids, gr.update(), source_box_html, gr.update()
                 continue
 
             # 첫 답변 토큰 도착 — 전체를 details로 전환 (접힌 상태)
             if thinking_acc and not thinking_finalized:
                 thinking_finalized = True
-                yield history, "", None, run_ids, _thinking_html(thinking_acc), gr.update()
+                yield history, "", None, run_ids, _thinking_html(thinking_acc), gr.update(), gr.update()
 
             tts_text += token
             history[-1]["content"] += token
-            yield history, "", None, run_ids, gr.update(), gr.update()
+            yield history, "", None, run_ids, gr.update(), gr.update(), gr.update()
 
     except Exception as e:
         history[-1]["content"] += f"\n\n[오류: {e}]"
-        yield history, "", None, run_ids, gr.update(), gr.update()
+        yield history, "", None, run_ids, gr.update(), gr.update(), gr.update()
         return
 
     run_ids.append(collected_run_id)
 
     if use_tts:
         audio_path = await container.tts_service().speak(tts_text)
-        yield history, "", audio_path, run_ids, gr.update(), gr.update()
+        yield history, "", audio_path, run_ids, gr.update(), gr.update(), gr.update()
     else:
-        yield history, "", None, run_ids, gr.update(), gr.update()
+        yield history, "", None, run_ids, gr.update(), gr.update(), gr.update()
 
 
 async def handle_feedback(like_data: gr.LikeData, history, run_ids, user_id):
@@ -274,14 +280,22 @@ with gr.Blocks(title="율봇") as demo:
         thinking_box = gr.HTML(value="")
         chatbot = gr.Chatbot(label="율봇", height=500)
         source_box = gr.HTML(value="")
-        with gr.Row():
-            msg_box = gr.Textbox(
-                placeholder="질문을 입력하세요... (Enter로 전송)",
-                label="",
-                scale=5,
-                autofocus=True,
+        with gr.Row(equal_height=True):
+            image_input = gr.Image(
+                type="filepath",
+                label="이미지 첨부 (선택)",
+                sources=["upload", "clipboard"],
+                scale=1,
+                height=120,
             )
-            send_btn = gr.Button("전송", variant="primary", scale=1)
+            with gr.Column(scale=5):
+                msg_box = gr.Textbox(
+                    placeholder="질문을 입력하세요... (Enter로 전송)",
+                    label="",
+                    lines=2,
+                    autofocus=True,
+                )
+                send_btn = gr.Button("전송", variant="primary")
 
         with gr.Row():
             audio_input = gr.Audio(
@@ -310,16 +324,11 @@ with gr.Blocks(title="율봇") as demo:
 
         transcribe_btn.click(transcribe_audio, inputs=[audio_input], outputs=[msg_box])
 
-        send_btn.click(
-            respond,
-            inputs=[msg_box, chatbot, show_thinking, user_state, use_tts, run_ids_state],
-            outputs=[chatbot, msg_box, tts_output, run_ids_state, thinking_box, source_box],
-        )
-        msg_box.submit(
-            respond,
-            inputs=[msg_box, chatbot, show_thinking, user_state, use_tts, run_ids_state],
-            outputs=[chatbot, msg_box, tts_output, run_ids_state, thinking_box, source_box],
-        )
+        _respond_inputs = [msg_box, chatbot, show_thinking, user_state, use_tts, run_ids_state, image_input]
+        _respond_outputs = [chatbot, msg_box, tts_output, run_ids_state, thinking_box, source_box, image_input]
+
+        send_btn.click(respond, inputs=_respond_inputs, outputs=_respond_outputs)
+        msg_box.submit(respond, inputs=_respond_inputs, outputs=_respond_outputs)
         reset_btn.click(reset_chat, inputs=[user_state], outputs=[chatbot, run_ids_state])
 
         chatbot.like(
diff --git a/services/chat.py b/services/chat.py
index 2e58ae8..7c976f7 100644
--- a/services/chat.py
+++ b/services/chat.py
@@ -8,9 +8,13 @@ class ChatService:
         self._api = api_client
 
     def chat(
-        self, message: str, user_id: str, show_thinking: bool
+        self,
+        message: str,
+        user_id: str,
+        show_thinking: bool,
+        image_path: str | None = None,
     ) -> AsyncIterator[tuple[str, str | None]]:
-        return self._api.chat(message, user_id, show_thinking)
+        return self._api.chat(message, user_id, show_thinking, image_path=image_path)
 
     async def reset(self, user_id: str) -> None:
         await self._api.reset(user_id)