# Clone-Voice / main.py
# (HuggingFace Space, commit f15bf15 by rahul7star)
import os
import gc
import torch
import soundfile as sf
import logging
import gradio as gr
import librosa
import numpy as np
import spaces
from datetime import datetime
from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig
from ncodec.codec import TTSCodec
# ---------------- Logging ----------------
# Basic process-wide logging at INFO so the lazy model loads below are
# visible in the Space's container logs.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
# ---------------- Globals ----------------
# Both heavyweight components are created lazily: the lmdeploy pipeline on
# first generation request (GPU), the codec on first use (CPU).
GPU_PIPE = None  # lmdeploy pipeline; populated by generate_speech()
CODEC = None     # TTSCodec instance; populated by initialize_cpu()
MODEL_ID = "rahul7star/mir-TTS"  # HF model repo loaded by lmdeploy
# ---------------- CPU Init (SAFE) ----------------
def initialize_cpu():
    """Lazily construct the shared TTSCodec; idempotent and CPU-only."""
    global CODEC
    if CODEC is not None:
        return  # already initialized — nothing to do
    logging.info("Initializing CPU components")
    CODEC = TTSCodec()
# ---------------- Audio Utils ----------------
def validate_audio_input(audio_path):
    """Load, resample to 16 kHz, peak-normalize and persist a reference clip.

    Parameters
    ----------
    audio_path : str
        Path to the caller-supplied reference audio file.

    Returns
    -------
    str
        Path of the processed copy written under /tmp (caller must delete).

    Raises
    ------
    ValueError
        If the path is missing, or the clip is empty or completely silent.
    """
    if not audio_path or not os.path.exists(audio_path):
        raise ValueError("Audio file not found")
    # Cap at 30 s so oversized uploads cannot blow up encode time.
    audio, sr = librosa.load(audio_path, sr=None, duration=30)
    if len(audio) == 0:
        raise ValueError("Audio file is empty")
    if sr != 16000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        sr = 16000
    # Peak-normalize. Guard the all-zero (silent) case, which previously
    # divided by zero and propagated NaNs into the codec.
    peak = np.max(np.abs(audio))
    if peak == 0:
        raise ValueError("Audio file is silent")
    audio = audio / peak
    tmp_path = f"/tmp/processed_{os.path.basename(audio_path)}"
    sf.write(tmp_path, audio, sr)
    return tmp_path
# ---------------- GPU TTS ----------------
def generate_speech(text, audio_path):
    """Clone the voice in *audio_path* and synthesize *text*.

    Parameters
    ----------
    text : str
        Text to speak. Must be non-empty after stripping.
    audio_path : str
        Reference audio file used as the voice prompt.

    Returns
    -------
    tuple[numpy.ndarray, int]
        Decoded waveform (float32) and its sample rate (48000).

    Raises
    ------
    ValueError
        If the text is empty or the reference audio is invalid.
    """
    global GPU_PIPE, CODEC
    if not text or not text.strip():
        raise ValueError("Text input is empty")
    initialize_cpu()
    # NOTE(review): `spaces` is imported at module level but this function is
    # not decorated with @spaces.GPU — confirm whether ZeroGPU allocation is
    # handled elsewhere.
    # Load the GPU pipeline lazily so import/startup stays CPU-only.
    if GPU_PIPE is None:
        logging.info("Loading MiraTTS pipeline on GPU")
        backend_config = TurbomindEngineConfig(
            tp=1,
            device="cuda",
            dtype="bfloat16",
            enable_prefix_caching=False,
            cache_max_entry_count=0.1,  # keep KV-cache footprint small
        )
        GPU_PIPE = pipeline(
            MODEL_ID,
            backend_config=backend_config
        )
    processed_audio = validate_audio_input(audio_path)
    try:
        context_tokens = CODEC.encode(processed_audio)
        prompt = CODEC.format_prompt(text, context_tokens, None)
        gen_cfg = GenerationConfig(
            top_p=0.95,
            top_k=50,
            temperature=0.8,
            max_new_tokens=1024,
            repetition_penalty=1.2,
            do_sample=True,
        )
        response = GPU_PIPE(
            [prompt],
            gen_config=gen_cfg,
            do_preprocess=False
        )
        audio = CODEC.decode(response[0].text, context_tokens)
        if torch.is_tensor(audio):
            audio = audio.float().cpu().numpy()  # force float32 for soundfile
    finally:
        # 🧹 Cleanup — previously only ran on success, leaking the /tmp copy
        # and holding GPU cache when encode/generate/decode raised.
        if os.path.exists(processed_audio):
            os.remove(processed_audio)
        gc.collect()
        torch.cuda.empty_cache()
    return audio, 48000
# ---------------- Gradio ----------------
def voice_clone_interface(text, upload_audio, record_audio):
    """Gradio callback: clone the reference voice and speak *text*.

    Returns a (wav_path_or_None, status_message) pair for the UI outputs.
    """
    try:
        # Prefer the uploaded file; fall back to the microphone recording.
        audio_path = upload_audio if upload_audio else record_audio
        if not audio_path:
            return None, "Upload or record reference audio"
        audio, sr = generate_speech(text, audio_path)
        os.makedirs("outputs", exist_ok=True)
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_path = f"outputs/mira_{stamp}.wav"
        sf.write(out_path, audio, sr)
        return out_path, "✅ Generation successful"
    except Exception as e:
        # Surface the failure to the UI instead of crashing the worker.
        logging.error(e)
        return None, f"❌ {str(e)}"
def build_interface():
    """Assemble the Gradio Blocks UI and return it (caller launches it)."""
    with gr.Blocks(title="MiraTTS Voice Cloning") as demo:
        gr.Markdown("# 🎤 MiraTTS – Voice Cloning")
        with gr.Row():
            # Left column: the two ways to provide a reference voice.
            with gr.Column():
                upload_ref = gr.Audio(sources="upload", type="filepath", label="Upload Reference Audio")
                mic_ref = gr.Audio(sources="microphone", type="filepath", label="Record Reference Audio")
            # Right column: text input, trigger, and results.
            with gr.Column():
                text_in = gr.Textbox(lines=4, label="Text to Synthesize")
                run_btn = gr.Button("Generate", variant="primary")
                result_audio = gr.Audio(type="filepath", autoplay=True)
                status_box = gr.Textbox(label="Status")
        run_btn.click(
            fn=voice_clone_interface,
            inputs=[text_in, upload_ref, mic_ref],
            outputs=[result_audio, status_box],
        )
    return demo
# ---------------- Main ----------------
if __name__ == "__main__":
    # Warm the CPU-side codec before serving, then bind on all interfaces.
    initialize_cpu()
    app = build_interface()
    app.launch(server_name="0.0.0.0", server_port=7860)