Spaces:

limkang
/

vitstest

No application file

App Files Files Community

limkang committed on Oct 28

Commit

82a65d8

verified ·

1 Parent(s): 085b201

Upload main.py

Browse files

Files changed (1) hide show

main.py +108 -0

main.py ADDED Viewed

	@@ -0,0 +1,108 @@

+# coding: utf-8
+import os
+import io
+import torch
+import tempfile
+from fastapi import FastAPI, HTTPException, Form, UploadFile, File
+from fastapi.responses import StreamingResponse
+# OpenVoice V2 관련 라이브러리 임포트
+from openvoice import se_extractor
+from openvoice.api import ToneColorConverter
+# MeloTTS 관련 라이브러리 임포트
+from melo.api import TTS
+# -------------------------------------------------------------------
+# 1. FastAPI 앱 초기화 및 모델 로드
+# -------------------------------------------------------------------
+app = FastAPI()
+print("🚀 Loading models...")
+try:
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    # OpenVoice 모델 로드
+    # Tone Color Extractor: 음색 특징을 추출하는 모델
+    # Tone Color Converter: 음색을 변환하는 모델
+    print("Loading OpenVoice V2 models...")
+    tone_color_converter = ToneColorConverter('checkpoints/converter', device=device)
+    print("✅ OpenVoice V2 loaded.")
+    # Melotts 모델 로드 (한국어 지원)
+    print("Loading Melotts model...")
+    melotts_model = TTS(language='KR', device=device)
+    speaker_ids = melotts_model.hps.data.spk2id
+    print("✅ Melotts loaded.")
+except Exception as ex:
+    print(f"❌ Failed to load models. Error: {ex}")
+    tone_color_converter = None
+    melotts_model = None
+# -------------------------------------------------------------------
+# 2. API 엔드포인트 생성
+# -------------------------------------------------------------------
+@app.post("/generate-cloned-speech/")
+async def generate_cloned_speech(
+    text: str = Form(...),
+    reference_audio: UploadFile = File(...)
+):
+    if not tone_color_converter or not melotts_model:
+        raise HTTPException(status_code=500, detail="Models are not loaded.")
+    # 임시 파일 경로를 관리하기 위한 변수
+    reference_path = None
+    source_path = None
+    save_path = None
+    try:
+        # 1. 참조 오디오(목소리 주인)를 임시 파일로 저장
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_ref_file:
+            content = await reference_audio.read()
+            temp_ref_file.write(content)
+            reference_path = temp_ref_file.name
+        # 2. 참조 오디오에서 음색 특징(Tone Color) 추출
+        target_se, audio_name = se_extractor.get_se(reference_path, tone_color_converter, target_dir='_outputs/form_clone', vad=True)
+        # 3. Melotts를 사용해 텍스트로 기본(Source) 음성 생성
+        # 속도 조절 가능 (speed)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_src_file:
+            source_path = temp_src_file.name
+        melotts_model.tts_to_file(text, speaker_ids['KR'], source_path, speed=1.0)
+        # 4. OpenVoice를 사용해 기본 음성에 추출한 음색을 입힘 (변환)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_save_file:
+            save_path = temp_save_file.name
+        # 핵심 변환 과정
+        tone_color_converter.convert(
+            audio_src_path=source_path,
+            src_se=None, # 소스 음성의 특징은 사용 안 함
+            tgt_se=target_se, # 목표(참조) 음성의 특징을 사용
+            output_path=save_path,
+            message="@MyShell"
+        )
+        # 5. 생성된 파일을 읽어 스트리밍으로 반환
+        with open(save_path, 'rb') as f:
+            audio_data = f.read()
+        return StreamingResponse(
+            io.BytesIO(audio_data),
+            media_type="audio/wav",
+            headers={"Content-Disposition": "inline; filename=cloned_speech.wav"}
+        )
+    except Exception as e:
+        error_msg = f"Error during speech generation: {str(e)}"
+        print(f"❌ {error_msg}")
+        raise HTTPException(status_code=500, detail=error_msg)
+    finally:
+        # 6. 작업이 끝나면 모든 임시 파일 삭제
+        for path in [reference_path, source_path, save_path]:
+            if path and os.path.exists(path):
+                os.remove(path)