From cf68e19f3866cea12762e733bc76f01ef61d651a Mon Sep 17 00:00:00 2001 From: shinalok Date: Sat, 30 May 2026 23:54:41 +0900 Subject: [PATCH] Add cross-platform TTS support and update dependencies --- api_client.py | 2 +- app.py | 55 +++++++++++++++++++++++++++++++++++++----------- requirements.txt | 4 +++- 3 files changed, 47 insertions(+), 14 deletions(-) diff --git a/api_client.py b/api_client.py index 9e4f74e..37c61ed 100644 --- a/api_client.py +++ b/api_client.py @@ -43,7 +43,7 @@ async def chat( try: payload = json.loads(raw) except json.JSONDecodeError: - yield raw, None + yield str(raw), None continue if isinstance(payload, dict) and payload.get("__done"): yield "", payload.get("run_id") diff --git a/app.py b/app.py index 4a3a3fd..a2fd99b 100644 --- a/app.py +++ b/app.py @@ -9,6 +9,7 @@ """ import asyncio import os +import platform import subprocess import tempfile @@ -25,7 +26,8 @@ DEFAULT_USER = "아록" # ── STT (Whisper) — 로컬 실행 유지 ────────────────────────────── _whisper_model = None _WHISPER_SIZE = os.getenv("WHISPER_MODEL_SIZE", "small") -_TTS_VOICE = os.getenv("TTS_VOICE", "Yuna") +_TTS_VOICE = os.getenv("TTS_VOICE", "Yuna") # macOS say 보이스 +_TTS_EDGE_VOICE = os.getenv("TTS_EDGE_VOICE", "ko-KR-SunHiNeural") # edge-tts 보이스 def _get_whisper(): @@ -44,18 +46,46 @@ def transcribe_audio(filepath: str) -> str: return result["text"].strip() -def tts_speak(text: str) -> str | None: - """macOS say 명령어로 TTS, 재생용 aiff 파일 경로 반환.""" +async def tts_speak(text: str) -> str | None: + """크로스플랫폼 TTS. macOS: say→edge-tts→pyttsx3 / Windows: edge-tts→pyttsx3""" if not text: return None + + # macOS: say 우선 (오프라인, 내장 한국어) + if platform.system() == "Darwin": + try: + tmp = tempfile.NamedTemporaryFile(suffix=".aiff", delete=False) + tmp.close() + await asyncio.to_thread( + subprocess.run, + ["say", "-v", _TTS_VOICE, "-o", tmp.name, text], + check=True, + capture_output=True, + ) + return tmp.name + except Exception: + pass + + # Windows 1순위 / macOS say 실패 시: edge-tts (온라인) try: - tmp = tempfile.NamedTemporaryFile(suffix=".aiff", delete=False) + import edge_tts + tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) tmp.close() - subprocess.run( - ["say", "-v", _TTS_VOICE, "-o", tmp.name, text], - check=True, - capture_output=True, - ) + await edge_tts.Communicate(text, _TTS_EDGE_VOICE).save(tmp.name) + return tmp.name + except Exception: + pass + + # 최종 폴백: pyttsx3 (오프라인) + try: + import pyttsx3 + tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) + tmp.close() + def _save(): + engine = pyttsx3.init() + engine.save_to_file(text, tmp.name) + engine.runAndWait() + await asyncio.to_thread(_save) return tmp.name except Exception: return None @@ -90,7 +120,7 @@ async def respond(message, history, show_thinking, user_id, use_tts, run_ids): run_ids.append(collected_run_id) if use_tts: - audio_path = tts_speak(history[-1]["content"]) + audio_path = await tts_speak(history[-1]["content"]) yield history, "", audio_path, run_ids else: yield history, "", None, run_ids @@ -100,12 +130,13 @@ def handle_feedback(like_data: gr.LikeData, history, run_ids, user_id): idx = like_data.index if isinstance(idx, (list, tuple)): idx = idx[0] - if not isinstance(idx, int) or idx >= len(history): + if not isinstance(idx, int) or idx < 0 or idx >= len(history): return if history[idx].get("role") != "assistant": return + # idx 위치까지 등장한 assistant 메시지 수 = 이 메시지의 0-based 턴 번호 asst_turn = sum(1 for m in history[:idx] if m.get("role") == "assistant") - run_id = run_ids[asst_turn] if asst_turn < len(run_ids) else None + run_id = run_ids[asst_turn] if run_ids and asst_turn < len(run_ids) else None user_msg = str(history[idx - 1]["content"]) if idx > 0 else "" asst_msg = str(history[idx]["content"]) diff --git a/requirements.txt b/requirements.txt index 16bcfc0..5168873 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ -gradio>=4.0.0 +gradio>=6.0.0 httpx>=0.27.0 python-dotenv>=1.0.0 openai-whisper>=20231117 +edge-tts>=6.1.9 +pyttsx3>=2.90