Spaces:
Paused
Paused
| import os | |
| import gc | |
| import torch | |
| import soundfile as sf | |
| import logging | |
| import gradio as gr | |
| import librosa | |
| import numpy as np | |
| import spaces | |
| from datetime import datetime | |
| from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig | |
| from ncodec.codec import TTSCodec | |
# ---------------- Logging ----------------
# Timestamped, level-tagged log lines for the whole app.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# ---------------- Globals ----------------
MODEL_ID = "rahul7star/mir-TTS"  # HF repo the lmdeploy pipeline loads
GPU_PIPE = None  # lazily-created lmdeploy pipeline (GPU side)
CODEC = None  # lazily-created TTSCodec (CPU side)
# ---------------- CPU Init (SAFE) ----------------
def initialize_cpu():
    """Create the CPU-side codec exactly once; later calls are no-ops."""
    global CODEC
    if CODEC is not None:
        return
    logging.info("Initializing CPU components")
    CODEC = TTSCodec()
# ---------------- Audio Utils ----------------
def validate_audio_input(audio_path):
    """Load, validate, and normalize a reference audio file.

    Loads at most 30 s of audio, resamples to 16 kHz when needed,
    peak-normalizes to [-1, 1], and writes the result to a temp WAV.

    Args:
        audio_path: Path to the user-supplied reference audio file.

    Returns:
        Path to the processed 16 kHz WAV file under /tmp.

    Raises:
        ValueError: If the file is missing, empty, or completely silent.
    """
    if not audio_path or not os.path.exists(audio_path):
        raise ValueError("Audio file not found")
    audio, sr = librosa.load(audio_path, sr=None, duration=30)
    if len(audio) == 0:
        raise ValueError("Audio file is empty")
    if sr != 16000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        sr = 16000
    # BUGFIX: an all-zero (silent) clip previously divided by zero here,
    # producing NaNs that poisoned everything downstream.
    peak = np.max(np.abs(audio))
    if peak == 0:
        raise ValueError("Audio file is silent")
    audio = audio / peak
    tmp_path = f"/tmp/processed_{os.path.basename(audio_path)}"
    sf.write(tmp_path, audio, sr)
    return tmp_path
# ---------------- GPU TTS ----------------
def generate_speech(text, audio_path):
    """Synthesize *text* in the voice of the reference *audio_path*.

    Lazily builds the lmdeploy GPU pipeline on first call, encodes the
    reference audio with the codec, generates speech tokens, and decodes
    them back to a waveform.

    Args:
        text: Text to synthesize (must be non-empty).
        audio_path: Path to the reference audio file.

    Returns:
        Tuple of (audio ndarray, sample_rate). The 48000 return value
        assumes the codec decodes at 48 kHz — TODO confirm against
        TTSCodec.decode.

    Raises:
        ValueError: If *text* is empty or the audio is invalid.
    """
    global GPU_PIPE, CODEC
    if not text or not text.strip():
        raise ValueError("Text input is empty")
    initialize_cpu()
    # Load GPU pipeline lazily so startup stays CPU-only (ZeroGPU-safe).
    if GPU_PIPE is None:
        logging.info("Loading MiraTTS pipeline on GPU")
        backend_config = TurbomindEngineConfig(
            tp=1,
            device="cuda",
            dtype="bfloat16",
            enable_prefix_caching=False,
            cache_max_entry_count=0.1,
        )
        GPU_PIPE = pipeline(
            MODEL_ID,
            backend_config=backend_config,
        )
    processed_audio = validate_audio_input(audio_path)
    try:
        context_tokens = CODEC.encode(processed_audio)
        prompt = CODEC.format_prompt(text, context_tokens, None)
        gen_cfg = GenerationConfig(
            top_p=0.95,
            top_k=50,
            temperature=0.8,
            max_new_tokens=1024,
            repetition_penalty=1.2,
            do_sample=True,
        )
        response = GPU_PIPE(
            [prompt],
            gen_config=gen_cfg,
            do_preprocess=False,
        )
        audio = CODEC.decode(response[0].text, context_tokens)
        if torch.is_tensor(audio):
            audio = audio.float().cpu().numpy()  # force float32 for sf.write
    finally:
        # 🧹 BUGFIX: cleanup previously ran only on success, leaking the
        # temp file (and GPU cache) whenever encode/generate/decode raised.
        if os.path.exists(processed_audio):
            os.remove(processed_audio)
        gc.collect()
        if torch.cuda.is_available():  # BUGFIX: no-op safely on CPU-only hosts
            torch.cuda.empty_cache()
    return audio, 48000
# ---------------- Gradio ----------------
def voice_clone_interface(text, upload_audio, record_audio):
    """Gradio callback: clone the reference voice and speak *text*.

    Args:
        text: Text to synthesize.
        upload_audio: Filepath from the upload widget (or None).
        record_audio: Filepath from the microphone widget (or None).

    Returns:
        Tuple of (output wav path or None, status message string).
    """
    try:
        # Prefer the uploaded file; fall back to the recording.
        audio_path = upload_audio or record_audio
        if not audio_path:
            return None, "Upload or record reference audio"
        audio, sr = generate_speech(text, audio_path)
        os.makedirs("outputs", exist_ok=True)
        out_path = f"outputs/mira_{datetime.now():%Y%m%d_%H%M%S}.wav"
        sf.write(out_path, audio, sr)
        return out_path, "✅ Generation successful"
    except Exception as e:
        # BUGFIX: logging.error(e) dropped the traceback; logging.exception
        # records it, which is what you want at a UI error boundary.
        logging.exception("Generation failed")
        return None, f"❌ {str(e)}"
def build_interface():
    """Build and return the Gradio Blocks UI for voice cloning."""
    with gr.Blocks(title="MiraTTS Voice Cloning") as demo:
        gr.Markdown("# 🎤 MiraTTS – Voice Cloning")
        with gr.Row():
            # Left column: reference-voice inputs.
            with gr.Column():
                ref_upload = gr.Audio(
                    sources="upload",
                    type="filepath",
                    label="Upload Reference Audio",
                )
                ref_record = gr.Audio(
                    sources="microphone",
                    type="filepath",
                    label="Record Reference Audio",
                )
            # Right column: text input, trigger, and results.
            with gr.Column():
                tts_text = gr.Textbox(lines=4, label="Text to Synthesize")
                run_btn = gr.Button("Generate", variant="primary")
                synth_audio = gr.Audio(type="filepath", autoplay=True)
                status_box = gr.Textbox(label="Status")
        run_btn.click(
            voice_clone_interface,
            inputs=[tts_text, ref_upload, ref_record],
            outputs=[synth_audio, status_box],
        )
    return demo
# ---------------- Main ----------------
if __name__ == "__main__":
    initialize_cpu()  # warm the CPU codec before serving requests
    app = build_interface()
    app.launch(server_name="0.0.0.0", server_port=7860)