switch to llama demo space, remove short prompt
- app.py +156 -104
- src/generate.py +155 -98
- src/prompts.py +3 -11
app.py
CHANGED

@@ -1,32 +1,44 @@
+import inspect
 import gradio as gr
+
 from gradio_client import Client, handle_file

 import src.generate as generate
 import src.process as process

-# ------------------- Globals -------------------
 global client

+# TODO: Ideally, instead of the Client method we're using for an external voice cloning app, we use the .load() function and pass in arguments to it directly while displaying the developer's desired UI.
+#chatterbox_space = gr.load("spaces/ResembleAI/Chatterbox")
+# ------------------- UI printing functions -------------------
 def clear_all():
-    return "", "", "", "", "",
+    # target, user_transcript, score_html, result_html, diff_html, tts_ui
+    return "", "", "", "", "", gr.Row.update(visible=False)


 def make_result_html(pass_threshold, passed, ratio):
-    """Returns HTML summarizing results.
+    """Returns HTML summarizing results.
+    Parameters:
+        pass_threshold: Minimum percentage of match between target and recognized user utterance that counts as passing.
+        passed: Whether the recognized user utterance is >= `pass_threshold`.
+        ratio: Sequence match ratio.
+    """
     summary = (
         f"✅ Correct (≥ {int(pass_threshold * 100)}%)"
-        if passed
+        if passed else
+        f"❌ Not a match (need ≥ {int(pass_threshold * 100)}%)"
     )
     score = f"Similarity: {ratio * 100:.1f}%"
     return summary, score


 def make_alignment_html(ref_tokens, hyp_tokens, alignments):
-    """Returns HTML showing alignment between target and recognized user audio.
+    """Returns HTML showing alignment between the target and recognized user audio.
+    Parameters:
+        ref_tokens: Target sentence for the user to say, tokenized.
+        hyp_tokens: Recognized utterance from the user, tokenized.
+        alignments: Tuples of alignment pattern (equal, delete, insert) and corresponding indices in `hyp_tokens`.
+    """
     out = []
     no_match_html = ' <span style="background:#ffe0e0;text-decoration:line-through;">'
     match_html = ' <span style="background:#e0ffe0;">'
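Note: the alignment markup above is driven by opcode-style tuples ("equal", "replace", "delete", "insert"). A minimal sketch of how such alignments are typically produced with Python's difflib, assuming `process.SentenceMatcher` (defined in src/process.py, which this commit does not touch) wraps something like SequenceMatcher:

```python
# Sketch: word-level alignment opcodes, as consumed by make_alignment_html.
# Assumption: SentenceMatcher tokenizes both sentences and exposes
# SequenceMatcher-style opcodes plus a ratio compared against pass_threshold.
from difflib import SequenceMatcher

ref_tokens = "i consent to cloning my voice with chatterbox".split()
hyp_tokens = "i consent to clone my voice with chatterbox today".split()

matcher = SequenceMatcher(a=ref_tokens, b=hyp_tokens)
for op, i1, i2, j1, j2 in matcher.get_opcodes():
    # op is "equal", "replace", "delete", or "insert";
    # ref_tokens[i1:i2] aligns with hyp_tokens[j1:j2].
    print(op, ref_tokens[i1:i2], hyp_tokens[j1:j2])

print(f"similarity ratio: {matcher.ratio():.2f}")  # the score shown in the UI
```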
@@ -43,118 +55,148 @@ def make_alignment_html(ref_tokens, hyp_tokens, alignments):
     elif op == "replace":
         out.append(no_match_html + ref_string + "</span>")
         out.append(match_html + hyp_string + "</span>")
-    html = '<div style="line-height:1.6;font-size:1rem;">' + "".join(
+    html = '<div style="line-height:1.6;font-size:1rem;">' + "".join(
+        out).strip() + "</div>"
     return html


 def make_html(sentence_match):
-    """Creates the HTML
-        sentence_match
+    """Creates the HTML written out to the UI based on the results.
+    Parameters:
+        sentence_match: Class that stores the features of the target - user utterance alignment
+    Returns:
+        diff_html: An HTML string showing how the target sentence and recognized user utterance matches.
+        result_html: An HTML string summarizing the results of the match between target and user utterance.
+    """
+    diff_html = make_alignment_html(sentence_match.target_tokens,
+                                    sentence_match.user_tokens,
+                                    sentence_match.alignments)
+    result_html, score_html = make_result_html(sentence_match.pass_threshold,
+                                               sentence_match.passed,
+                                               sentence_match.ratio)
+
     return score_html, result_html, diff_html


 # ------------------- Core Check (English-only) -------------------
+# @spaces.GPU
 def get_user_transcript(audio_path: gr.Audio, target_sentence: str,
-                        model_id: str, device_pref: str):
-    """
+                        model_id: str, device_pref: str) -> (str, str):
+    """ASR for the input audio and basic validation.
+    Uses the selected ASR model `model_id` to recognize words in the input `audio_path`.
+    Parameters:
+        audio_path: Processed audio file returned from gradio Audio component.
+        target_sentence: Sentence the user needs to say.
+        model_id: Desired ASR model.
+        device_pref: Preferred ASR processing device. Can be "auto", "cpu", "cuda".
+    Returns:
+        error_msg: If there's an error, a string describing what happened.
+        user_transcript: The recognized user utterance.
+    """
+    # Handles user interaction errors.
     if not target_sentence:
         return "Please generate a sentence first.", ""
+    # TODO: Automatically stop the recording if someone presses the Transcribe & Check button.
     if audio_path is None:
-        return (
-            "Please start, record, then stop the audio recording before trying to transcribe.",
-            "",
-        )
+        return "Please start, record, then stop the audio recording before trying to transcribe.", ""

+    # Runs the automatic speech recognition
     user_transcript = process.run_asr(audio_path, model_id, device_pref)

+    # Handles processing errors.
     if isinstance(user_transcript, Exception):
         return f"Transcription failed: {user_transcript}", ""
     return "", user_transcript


-def transcribe_check(audio_path, target_sentence, model_id, device_pref,
+def transcribe_check(audio_path, target_sentence, model_id, device_pref,
+                     pass_threshold):
+    """Transcribe user, calculate match to target sentence, create results HTML.
+    Parameters:
+        audio_path: Local path to recorded audio.
+        target_sentence: Sentence the user needs to say.
+        model_id: Desired ASR model.
+        device_pref: Preferred ASR processing device. Can be "auto", "cpu", "cuda".
+    Returns:
+        user_transcript: The recognized user utterance
+        score_html: HTML string to display the score
+        diff_html: HTML string for displaying the differences between target and user utterance
+        result_html: HTML string describing the results, or an error message
+        clone_audio: Bool for whether to allow audio cloning: This makes the audio cloning components visible
+    """
     clone_audio = False
-    error_msg, user_transcript = get_user_transcript(
+    # Transcribe user input
+    error_msg, user_transcript = get_user_transcript(audio_path,
+                                                     target_sentence, model_id,
+                                                     device_pref)
     if error_msg:
         score_html = ""
         diff_html = ""
         result_html = error_msg
     else:
+        # Calculate match details between the target and recognized user input
+        sentence_match = process.SentenceMatcher(target_sentence,
+                                                 user_transcript,
+                                                 pass_threshold)
         if sentence_match.passed:
             clone_audio = True
+        # Create the output to print out
        score_html, result_html, diff_html = make_html(sentence_match)

     return user_transcript, score_html, result_html, diff_html, gr.Row(visible=clone_audio)

-def clone_voice(audio_input, text_input):
-    """Calls Chatterbox Space to clone the voice."""
+def clone_voice(audio_input, text_input, exaggeration_input, cfgw_input, seed_num_input, temperature_input):
     global client
+    # Additional specifications for Chatterbox include:
+    #   exaggeration_input=0.5,
+    #   temperature_input=0.8,
+    #   seed_num_input=0,
+    #   cfgw_input=0.5,
+    #   api_name="/generate_tts_audio"
+    return client.predict(text_input=text_input,
+                          audio_prompt_path_input=handle_file(audio_input),
+                          exaggeration_input=exaggeration_input,
+                          cfgw_input=cfgw_input,
+                          seed_num_input=seed_num_input, temperature_input=temperature_input)
+
+
+# ------------------- UI -------------------
-with gr.Blocks(title="Say the Sentence (English)") as demo:
+with gr.Blocks(title="Voice Consent Gate") as demo:
+    gr.Markdown("# Voice Consent Gate: Demo")
     gr.Markdown(
         """
+        ## 🎤 Say the Sentence (English)
         1) Generate a sentence.
         2) Record yourself reading it.
         3) Transcribe & check your accuracy.
         4) If matched, clone your voice to speak any sentence you enter.
         """
     )
+    with gr.Accordion(label="Further Details", open=False):
+        gr.Markdown("""
+        To create a basic consented voice cloning system, you need 2 parts:
+        1. An automatic speech recognition (ASR) system that recognizes a sentence conveying consent from the person whose voice will be cloned.
+        2. A voice-cloning text-to-speech (TTS) system that takes as input text and the speaker’s speech snippets to generate speech.
+
+        Some voice-cloning TTS systems can now generate speech similar to a speaker’s voice using _just one sentence_. This means that a sentence used for consent can **also** be used for voice cloning. We demonstrate one way to do that here.
+        """)
     with gr.Row():
-        choices=["qwen-instruct", "llama-instruct"],
-        value="llama-instruct",
-        label="Sentence generator model",
-    )
+        target = gr.Textbox(label="Target sentence", interactive=False,
+                            placeholder="Click 'Generate sentence'")

     with gr.Row():
         btn_gen = gr.Button("🎲 Generate sentence", variant="primary")
         btn_clear = gr.Button("🧹 Clear")

-    # --- Recording section ---
     with gr.Row():
-        consent_audio = gr.Audio(
-            sources=["microphone"], type="filepath", label="Record your voice", key='consent_audio'
-        )
+        consent_audio = gr.Audio(sources=["microphone"], type="filepath", label="Record your voice", key='consent_audio')

-    # --- Advanced settings ---
     with gr.Accordion("Advanced settings", open=False):
         model_id = gr.Dropdown(
             choices=[
                 "openai/whisper-tiny.en",   # fastest (CPU-friendly)
-                "openai/whisper-base.en",   # better accuracy
-                "distil-whisper/distil-small.en"
+                "openai/whisper-base.en",   # better accuracy, a bit slower
+                "distil-whisper/distil-small.en"  # optional distil English model
             ],
             value="openai/whisper-tiny.en",
             label="ASR model (English only)",
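Note: `clone_voice` above is a thin wrapper over the public ResembleAI/Chatterbox Space. A standalone sketch of the same call, reusing the endpoint name and default values that appear in the diff's own comments (the local file path is illustrative):

```python
# Sketch: calling the Chatterbox Space directly with gradio_client,
# mirroring clone_voice above. "consent_clip.wav" is a placeholder path.
from gradio_client import Client, handle_file

client = Client("ResembleAI/Chatterbox")
result = client.predict(
    text_input="Any sentence to speak in the cloned voice.",
    audio_prompt_path_input=handle_file("consent_clip.wav"),
    exaggeration_input=0.5,   # defaults noted in the comments above
    cfgw_input=0.5,
    seed_num_input=0,         # 0 = random seed
    temperature_input=0.8,
    api_name="/generate_tts_audio",
)
print(result)  # typically a path to the generated audio file
```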
@@ -162,80 +204,90 @@ with gr.Blocks(title="Say the Sentence (English)") as demo:
         device_pref = gr.Radio(
             choices=["auto", "cpu", "cuda"],
             value="auto",
-            label="Device preference"
-        )
-        pass_threshold = gr.Slider(
-            0.50, 1.00, value=0.85, step=0.01, label="Match threshold"
+            label="Device preference"
         )
+        pass_threshold = gr.Slider(0.50, 1.00, value=0.85, step=0.01,
+                                   label="Match threshold")

-    # --- Transcription + comparison section ---
     with gr.Row():
         btn_check = gr.Button("✅ Transcribe & Check", variant="primary")
     with gr.Row():
         user_transcript = gr.Textbox(label="Transcription", interactive=False)
     with gr.Row():
         score_html = gr.Label(label="Score")
         result_html = gr.Label(label="Result")
     diff_html = gr.HTML(
-        label="Word-level diff (red = expected but missing / green = extra or replacement)"
-    )
+        label="Word-level diff (red = expected but missing / green = extra or replacement)")

+    gr.Markdown("## 🔁 Voice Consent Gate (opens upon consent)")
+    # TODO: Ideally this is gr.Blocks, but that seems to have a visibility-change bug.
     with gr.Row(visible=False) as tts_ui:
+        # Using the render decorator so that we can access consent audio after it's recorded.
         @gr.render(inputs=consent_audio)
         def show_tts(audio_input):
             global client
             if audio_input:
                 client = Client("ResembleAI/Chatterbox")
-                with gr.Row():
-                    gr.Markdown("# 🔁 Voice cloning")
                 with gr.Row():
                     with gr.Column():
                         gr.Markdown("## Audio input")
-                        tts_audio = gr.Audio(
-                            audio_input, interactive=True, type="filepath"
-                        )
+                        tts_audio = gr.Audio(audio_input, type="filepath")
                 with gr.Row():
                     with gr.Column():
                         gr.Markdown("## Text input")
                         tts_text = gr.Textbox(
-                            "Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible.",
+                            "Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible.", interactive=True)
+                with gr.Row():
+                    with gr.Accordion("More options", open=False):
+                        exaggeration = gr.Slider(
+                            0.25, 2, step=.05,
+                            label="Exaggeration (Neutral = 0.5, extreme values can be unstable)",
+                            value=.5
                         )
+                        cfg_weight = gr.Slider(
+                            0.2, 1, step=.05, label="CFG/Pace", value=0.5
+                        )
+                        seed_num = gr.Number(value=0,
+                                             label="Random seed (0 for random)")
+                        temp = gr.Slider(0.05, 5, step=.05,
+                                         label="Temperature", value=.8)
                 with gr.Row():
                     clone_btn = gr.Button("Clone!")
                     cloned_audio = gr.Audio()
-                clone_btn.click(
+                clone_btn.click(fn=clone_voice,
+                                inputs=[tts_audio, tts_text, exaggeration,
+                                        cfg_weight, seed_num, temp],
+                                outputs=[cloned_audio])
+
+    def gen_sentence_action():
+        # chatterbox model name, detailed prompt (short_prompt=False)
+        try:
+            return generate.gen_sentence_llm(
+                "chatterbox",
+                fallback_on_error=False  # ← show errors during testing
+            )
+        except Exception as e:
+            # Show a helpful message directly in the Target sentence box
+            return f"[ERROR calling LLM] {type(e).__name__}: {e}"

+    # -------- Events --------
+    # Generate sentence: fixed model name + detailed prompt
     btn_gen.click(
-        fn=
-        outputs=target,
+        fn=gen_sentence_action,
+        outputs=target
     )

-    # 🧹 Clear button
     btn_clear.click(
         fn=clear_all,
-        outputs=[target, user_transcript, score_html, result_html, diff_html]
+        outputs=[target, user_transcript, score_html, result_html, diff_html, tts_ui]
     )

-    # ✅ Transcribe & Check
     btn_check.click(
         fn=transcribe_check,
         inputs=[consent_audio, target, model_id, device_pref, pass_threshold],
-        outputs=[user_transcript, score_html, result_html, diff_html, tts_ui]
+        outputs=[user_transcript, score_html, result_html, diff_html, tts_ui]
     )

+
 if __name__ == "__main__":
-    demo.launch(show_error=True)
+    demo.launch(show_error=True)
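Note: the gating mechanic added here rests on two Gradio patterns: an event handler that returns `gr.Row(visible=...)` to toggle a hidden row, and `@gr.render` so the cloning controls are rebuilt once `consent_audio` holds a recording. A minimal self-contained sketch of that pattern against Gradio 4.x, with illustrative component names rather than the app's:

```python
# Sketch: reveal a hidden Row from an event handler, as app.py does with
# tts_ui and transcribe_check's final gr.Row(visible=clone_audio) value.
import gradio as gr

with gr.Blocks() as demo:
    audio = gr.Audio(sources=["microphone"], type="filepath")
    check = gr.Button("Check")

    with gr.Row(visible=False) as gated_row:
        @gr.render(inputs=audio)
        def build_controls(path):
            # Re-runs whenever `audio` changes; components created here
            # replace the previous render of this block.
            if path:
                gr.Markdown(f"Consent clip recorded: {path}")

    def gate(path):
        # Returning a component instance acts as an update in Gradio 4.x.
        return gr.Row(visible=path is not None)

    check.click(fn=gate, inputs=audio, outputs=gated_row)

demo.launch()
```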
src/generate.py
CHANGED

@@ -1,16 +1,32 @@
+# src/generate.py
+"""
+Module: generate
+----------------
+Handles the generation of "consent sentences" for the Voice Consent Gate demo.
+
+This module connects to an external language model (in this case, the public
+Hugging Face Space for Llama 3.2 3B Instruct) to generate natural-sounding
+sentences that users can read aloud to give informed consent for voice cloning.
+
+If the model call fails (e.g., due to rate limits or network issues),
+a fallback sentence is chosen from a small built-in sentence bank.
+
+Functions:
+- _extract_llama_text(): Normalize the API output from the Llama demo.
+- gen_sentence_llm(): Generate a consent sentence from the Llama model Space.
+- gen_sentence_set(): Select a random prewritten sentence (for fallback/testing).
+"""
+
+import os
+import random
+from typing import Any
+from gradio_client import Client

 import src.process as process
 from src.prompts import get_consent_generation_prompt

-HF_TOKEN = os.getenv("HF_TOKEN")

-# ------------------- Sentence Bank (
+# ------------------- Sentence Bank (unchanged) -------------------
 SENTENCE_BANK = [
     "The quick brown fox jumps over the lazy dog.",
     "I promise to speak clearly and at a steady pace.",

@@ -24,101 +40,142 @@ SENTENCE_BANK = [
     "This microphone test checks my pronunciation accuracy.",
 ]

-    text = re.sub(r"\s+", " ", text).strip()
-    return text
-
-# ------------------- Generators -------------------
-def _clean(text: str) -> str:
-    # Remove prompt echo if present and tidy whitespace/quotes
-    text = text.strip().strip('`"\' ')
-    text = re.sub(r"\s+", " ", text)
-    return text
-
-def gen_sentence_llm_chat():
-    model_id = "Qwen/Qwen2.5-1.5B-Instruct"
-    prompt = get_consent_generation_prompt("chatterbox")
-
-    tok = AutoTokenizer.from_pretrained(model_id)
-    gen = pipeline(
-        "text-generation",
-        model=model_id,
-        tokenizer=tok,
-        device_map="auto",
-        torch_dtype="auto",
-    )
-
-    out = gen(
-        prompt,
-        max_new_tokens=60,
-        temperature=0.6,
-        repetition_penalty=1.08,
-        pad_token_id=tok.eos_token_id,
-    )[0]["generated_text"]
-
-    # strip prompt echo if model returns prompt+completion
-    if out.startswith(prompt):
-        out = out[len(prompt):]
-
-    return process.normalize_text(_clean(out), lower=False)
-
-def gen_sentence_llm_instruct() -> str:
-    """
-    """
-
-def gen_sentence(model_choice: Literal["qwen-instruct", "llama-instruct"]) -> str:
-    """
-    """
+
+# ------------------- Model / Space Configuration -------------------
+# The demo connects to the Llama 3.2 3B Instruct Space on Hugging Face.
+# You can override these defaults by setting environment variables in your Space.
+LLAMA_SPACE_ID = os.getenv(
+    "LLAMA_SPACE_ID", "huggingface-projects/llama-3.2-3B-Instruct"
+)
+LLAMA_API_NAME = "/chat"  # The Space exposes a single /chat endpoint.
+HF_TOKEN = os.getenv("HF_TOKEN")  # Optional; not required for public Spaces.
+
+
+def _extract_llama_text(result: Any) -> str:
+    """
+    Normalize the API response from the Llama 3.2 3B demo Space into plain text.
+
+    The Space’s `/chat` endpoint may return different shapes depending on how
+    the Gradio app is structured — sometimes a string, other times a dictionary
+    or list. This function recursively traverses and extracts the first
+    meaningful text string it finds.
+
+    Parameters
+    ----------
+    result : Any
+        The raw output returned by `client.predict()`.
+
+    Returns
+    -------
+    str
+        Cleaned text output (may be empty string if extraction fails).
+    """
+    if isinstance(result, str):
+        return result.strip()
+    if isinstance(result, (int, float, bool)):
+        return str(result)
+    if isinstance(result, list):
+        # If multiple segments are returned (e.g., multiple sentences),
+        # join them into one string.
+        parts = []
+        for x in result:
+            s = _extract_llama_text(x)
+            if s:
+                parts.append(s)
+        return " ".join(parts).strip()
+    if isinstance(result, dict):
+        # Common key names used in Gradio JSON responses
+        for key in ("text", "response", "content", "generated_text", "message"):
+            v = result.get(key)
+            if isinstance(v, str) and v.strip():
+                return v.strip()
+    return ""
+
+
+def gen_sentence_llm(
+    audio_model_name: str = "chatterbox",
+    *,
+    fallback_on_error: bool = False  # Set True for production to avoid crashes
+) -> str:
+    """
+    Generate a consent sentence using the Llama 3.2 3B Instruct demo Space.
+
+    This function constructs a prompt describing the linguistic and ethical
+    requirements for a consent sentence (via `get_consent_generation_prompt`)
+    and sends it to the Llama demo hosted on Hugging Face Spaces.
+
+    The response is normalized into a single English sentence suitable
+    for reading aloud.
+
+    Parameters
+    ----------
+    audio_model_name : str, optional
+        The name of the voice-cloning model to mention in the sentence.
+        Defaults to "chatterbox".
+    fallback_on_error : bool, optional
+        If True, return a random fallback sentence instead of raising
+        an error when the Space call fails. Default is False for debugging.
+
+    Returns
+    -------
+    str
+        A clean, human-readable consent sentence.
+
+    Raises
+    ------
+    Exception
+        Re-raises the underlying error if `fallback_on_error` is False.
+    """
+    # Generate the full natural-language prompt that the LLM will receive
+    prompt = get_consent_generation_prompt(audio_model_name)
+
+    try:
+        # Initialize Gradio client for the Llama demo Space
+        client = Client(LLAMA_SPACE_ID, hf_token=HF_TOKEN)
+
+        # The Llama demo exposes a simple /chat endpoint with standard decoding params
+        result = client.predict(
+            message=prompt,
+            max_new_tokens=128,
+            temperature=0.6,
+            top_p=0.9,
+            top_k=50,
+            repetition_penalty=1.2,
+            api_name=LLAMA_API_NAME,
+        )
+
+        # Normalize and clean up model output
+        text = _extract_llama_text(result)
+        text = process.normalize_text(text, lower=False)
+
+        # Handle empty or malformed outputs
+        if not text:
+            raise ValueError("Empty response from Llama Space")
+
+        # In case the model produces multiple lines or options, pick the first full sentence
+        first_line = next((ln.strip() for ln in text.splitlines() if ln.strip()), "")
+        return first_line or text
+
+    except Exception as e:
+        print(f"[gen_sentence_llm] Llama Space call failed: {type(e).__name__}: {e}")
+        if fallback_on_error:
+            # If fallback is enabled, use a predefined sentence instead
+            return random.choice(SENTENCE_BANK)
+        # Otherwise propagate the exception so the UI displays it
+        raise


 def gen_sentence_set() -> str:
-    """
+    """
+    Return a sentence from a predefined static list.
+
+    This is used as a simple fallback generator when model-based
+    generation is unavailable or for testing the ASR pipeline
+    without network access.
+
+    Returns
+    -------
+    str
+        A single English sentence from the fallback bank.
+    """
+    return random.choice(SENTENCE_BANK)
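Note: a quick local exercise of the new module. The dict and list inputs below are hypothetical shapes a Space could return; they are not captured outputs of the Llama demo:

```python
# Sketch: exercising the new src/generate.py helpers.
import src.generate as generate

# _extract_llama_text flattens the response shapes it knows about:
assert generate._extract_llama_text("  hello  ") == "hello"
assert generate._extract_llama_text({"response": "hi"}) == "hi"
assert generate._extract_llama_text(["one", {"text": "two"}]) == "one two"

# Network call: with fallback_on_error=True a Space failure degrades
# to a sentence from SENTENCE_BANK instead of raising.
print(generate.gen_sentence_llm("chatterbox", fallback_on_error=True))
```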
src/prompts.py
CHANGED

@@ -1,6 +1,6 @@
 # src/prompts.py

-def get_consent_generation_prompt(audio_model_name: str, short_prompt: bool = False):
+def get_consent_generation_prompt(audio_model_name: str) -> str:
     """
     Returns a text prompt instructing the model to generate a natural-sounding
     consent sentence for voice cloning with the specified model.

@@ -14,15 +14,6 @@
         str: The prompt text.
     """

-    if short_prompt:
-        return (
-            f"Generate one natural, spoken-style English sentence (10–20 words) in which a person "
-            f"clearly gives informed consent to use their voice for generating synthetic audio "
-            f"with the model {audio_model_name}. The sentence should sound conversational, include "
-            f"a clear consent phrase like 'I give my consent' or 'I agree', mention {audio_model_name} "
-            f"by name, and be phonetically varied but neutral in tone. Output only the final sentence."
-        )
-
     return f"""
 Generate a short, natural-sounding English sentence (10–20 words) that a person could say aloud
 to clearly state their informed consent to use their voice for generating synthetic audio with

@@ -43,5 +34,6 @@
 - “I give my consent to use my voice for generating audio with the model {audio_model_name}. This statement is made freely and clearly.”
 - “Good afternoon. I agree to the use of my recorded voice for audio generation with the model {audio_model_name}.”

-The output should be one to three natural sentences ready to be spoken aloud for recording purposes.
+The output should be one to three natural sentences ready to be spoken aloud for recording purposes.
+Only output the sentences that the speaker should read, no extra information, no justifications, no formatting or lists. Only the suggested sentence.
 """
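Note: with `short_prompt` removed there is a single code path through the prompt builder; a quick check that it only needs the model name and carries the new output-only instruction:

```python
# Sketch: the prompt builder after this commit.
from src.prompts import get_consent_generation_prompt

prompt = get_consent_generation_prompt("chatterbox")
assert "chatterbox" in prompt
assert "Only output the sentences" in prompt
print(prompt)
```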