import os
import gc
import logging
from datetime import datetime

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import spaces  # noqa: F401  NOTE(review): imported for HF ZeroGPU side effects; confirm whether @spaces.GPU decoration was intended
import torch
from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig

from ncodec.codec import TTSCodec

# ---------------- Logging ----------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# ---------------- Globals ----------------
GPU_PIPE = None  # lazily-created lmdeploy pipeline (GPU); see generate_speech()
CODEC = None     # lazily-created TTSCodec (CPU); see initialize_cpu()
MODEL_ID = "rahul7star/mir-TTS"


# ---------------- CPU Init (SAFE) ----------------
def initialize_cpu():
    """Create the CPU-side codec exactly once (idempotent)."""
    global CODEC
    if CODEC is None:
        logging.info("Initializing CPU components")
        CODEC = TTSCodec()


# ---------------- Audio Utils ----------------
def validate_audio_input(audio_path):
    """Load, validate, and peak-normalize reference audio to 16 kHz.

    Returns the path of a temp WAV file holding the processed audio.
    Raises ValueError on a missing, empty, or silent file.
    """
    if not audio_path or not os.path.exists(audio_path):
        raise ValueError("Audio file not found")

    # Cap at 30 s so an oversized upload cannot blow up memory/encode time.
    audio, sr = librosa.load(audio_path, sr=None, duration=30)
    if len(audio) == 0:
        raise ValueError("Audio file is empty")

    if sr != 16000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        sr = 16000

    # Peak-normalize; guard against all-zero (silent) input, which would
    # otherwise divide by zero and fill the buffer with NaNs.
    peak = np.max(np.abs(audio))
    if peak == 0:
        raise ValueError("Audio file is silent")
    audio = audio / peak

    tmp_path = f"/tmp/processed_{os.path.basename(audio_path)}"
    sf.write(tmp_path, audio, sr)
    return tmp_path


# ---------------- GPU TTS ----------------
def generate_speech(text, audio_path):
    """Synthesize `text` in the voice of the reference `audio_path`.

    Returns (audio_array, sample_rate). The GPU pipeline is loaded lazily
    on first call and cached in the module-global GPU_PIPE.
    Raises ValueError on empty text or bad reference audio.
    """
    global GPU_PIPE, CODEC

    if not text or not text.strip():
        raise ValueError("Text input is empty")

    initialize_cpu()

    # 🔥 Load GPU pipeline lazily (keeps import/startup cheap and GPU-free)
    if GPU_PIPE is None:
        logging.info("Loading MiraTTS pipeline on GPU")
        backend_config = TurbomindEngineConfig(
            tp=1,
            device="cuda",
            dtype="bfloat16",
            enable_prefix_caching=False,
            cache_max_entry_count=0.1,  # small KV-cache: one prompt at a time
        )
        GPU_PIPE = pipeline(MODEL_ID, backend_config=backend_config)

    processed_audio = validate_audio_input(audio_path)
    try:
        context_tokens = CODEC.encode(processed_audio)
        prompt = CODEC.format_prompt(text, context_tokens, None)

        gen_cfg = GenerationConfig(
            top_p=0.95,
            top_k=50,
            temperature=0.8,
            max_new_tokens=1024,
            repetition_penalty=1.2,
            do_sample=True,
        )

        response = GPU_PIPE([prompt], gen_config=gen_cfg, do_preprocess=False)
        audio = CODEC.decode(response[0].text, context_tokens)
        if torch.is_tensor(audio):
            audio = audio.float().cpu().numpy()  # force float32 for soundfile
    finally:
        # 🧹 Cleanup runs even if encode/generate/decode raised, so the temp
        # file and GPU scratch memory never leak across failed requests.
        if os.path.exists(processed_audio):
            os.remove(processed_audio)
        gc.collect()
        torch.cuda.empty_cache()

    # NOTE(review): decode output assumed to be 48 kHz — confirm with TTSCodec.
    return audio, 48000


# ---------------- Gradio ----------------
def voice_clone_interface(text, upload_audio, record_audio):
    """Gradio callback: returns (output_wav_path_or_None, status_message)."""
    try:
        audio_path = upload_audio or record_audio
        if not audio_path:
            return None, "Upload or record reference audio"

        audio, sr = generate_speech(text, audio_path)

        os.makedirs("outputs", exist_ok=True)
        out_path = f"outputs/mira_{datetime.now():%Y%m%d_%H%M%S}.wav"
        sf.write(out_path, audio, sr)
        return out_path, "✅ Generation successful"
    except Exception as e:
        # logging.exception keeps the traceback; plain logging.error(e) lost it.
        logging.exception("Generation failed")
        return None, f"❌ {str(e)}"


def build_interface():
    """Build and return the Gradio Blocks UI (does not launch it)."""
    with gr.Blocks(title="MiraTTS Voice Cloning") as demo:
        gr.Markdown("# 🎤 MiraTTS – Voice Cloning")
        with gr.Row():
            with gr.Column():
                # Gradio 4 declares `sources` as list[str]; pass lists, not
                # bare strings, to stay within the documented API.
                upload = gr.Audio(sources=["upload"], type="filepath",
                                  label="Upload Reference Audio")
                record = gr.Audio(sources=["microphone"], type="filepath",
                                  label="Record Reference Audio")
            with gr.Column():
                text = gr.Textbox(lines=4, label="Text to Synthesize")
                generate = gr.Button("Generate", variant="primary")
                output_audio = gr.Audio(type="filepath", autoplay=True)
                status = gr.Textbox(label="Status")

        generate.click(
            voice_clone_interface,
            inputs=[text, upload, record],
            outputs=[output_audio, status],
        )
    return demo


# ---------------- Main ----------------
if __name__ == "__main__":
    initialize_cpu()
    demo = build_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860)