frimelle HF Staff committed on
Commit 794256c · 1 Parent(s): 14f5917

switch to llama demo space, remove short prompt

Files changed (3)
  1. app.py +156 -104
  2. src/generate.py +155 -98
  3. src/prompts.py +3 -11
app.py CHANGED
@@ -1,32 +1,44 @@
 
1
  import gradio as gr
 
2
  from gradio_client import Client, handle_file
3
 
4
  import src.generate as generate
5
  import src.process as process
6
 
7
- # ------------------- Globals -------------------
8
  global client
9
 
10
-
11
- # ------------------- UI helper functions -------------------
 
12
  def clear_all():
13
- """Reset all displayed fields."""
14
- return "", "", "", "", "", "", "", None,
15
 
16
 
17
  def make_result_html(pass_threshold, passed, ratio):
18
- """Returns HTML summarizing results."""
 
 
 
 
 
19
  summary = (
20
  f"✅ Correct (≥ {int(pass_threshold * 100)}%)"
21
- if passed
22
- else f"❌ Not a match (need ≥ {int(pass_threshold * 100)}%)"
23
  )
24
  score = f"Similarity: {ratio * 100:.1f}%"
25
  return summary, score
26
 
27
 
28
  def make_alignment_html(ref_tokens, hyp_tokens, alignments):
29
- """Returns HTML showing alignment between target and recognized user audio."""
 
 
 
 
 
30
  out = []
31
  no_match_html = ' <span style="background:#ffe0e0;text-decoration:line-through;">'
32
  match_html = ' <span style="background:#e0ffe0;">'
@@ -43,118 +55,148 @@ def make_alignment_html(ref_tokens, hyp_tokens, alignments):
43
  elif op == "replace":
44
  out.append(no_match_html + ref_string + "</span>")
45
  out.append(match_html + hyp_string + "</span>")
46
- html = '<div style="line-height:1.6;font-size:1rem;">' + "".join(out).strip() + "</div>"
 
47
  return html
48
 
49
 
50
  def make_html(sentence_match):
51
- """Creates the HTML for UI based on the sentence match."""
52
- diff_html = make_alignment_html(
53
- sentence_match.target_tokens,
54
- sentence_match.user_tokens,
55
- sentence_match.alignments,
56
- )
57
- result_html, score_html = make_result_html(
58
- sentence_match.pass_threshold, sentence_match.passed, sentence_match.ratio
59
- )
 
 
 
 
 
60
  return score_html, result_html, diff_html
61
 
62
 
63
  # ------------------- Core Check (English-only) -------------------
 
64
  def get_user_transcript(audio_path: gr.Audio, target_sentence: str,
65
- model_id: str, device_pref: str):
66
- """Runs ASR for the input audio."""
 
 
 
 
 
 
 
 
 
 
 
67
  if not target_sentence:
68
  return "Please generate a sentence first.", ""
 
69
  if audio_path is None:
70
- return (
71
- "Please start, record, then stop the audio recording before trying to transcribe.",
72
- "",
73
- )
74
 
 
75
  user_transcript = process.run_asr(audio_path, model_id, device_pref)
76
 
 
77
  if isinstance(user_transcript, Exception):
78
  return f"Transcription failed: {user_transcript}", ""
79
  return "", user_transcript
80
 
81
 
82
- def transcribe_check(audio_path, target_sentence, model_id, device_pref, pass_threshold):
83
- """Transcribe user audio, compare with target, and generate HTML."""
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  clone_audio = False
85
-
86
- error_msg, user_transcript = get_user_transcript(
87
- audio_path, target_sentence, model_id, device_pref
88
- )
89
  if error_msg:
90
  score_html = ""
91
  diff_html = ""
92
  result_html = error_msg
93
  else:
94
- sentence_match = process.SentenceMatcher(
95
- target_sentence, user_transcript, pass_threshold
96
- )
 
97
  if sentence_match.passed:
98
  clone_audio = True
99
-
100
  score_html, result_html, diff_html = make_html(sentence_match)
101
 
102
  return user_transcript, score_html, result_html, diff_html, gr.Row(visible=clone_audio)
103
 
104
-
105
- def clone_voice(audio_input, text_input):
106
- """Calls Chatterbox Space to clone the voice."""
107
  global client
108
- return client.predict(
109
- text_input=text_input, audio_prompt_path_input=handle_file(audio_input)
110
- )
111
-
112
-
113
- # ------------------- Gradio UI -------------------
114
- with gr.Blocks(title="Say the Sentence (English)") as demo:
 
 
 
 
 
 
 
 
 
115
  gr.Markdown(
116
  """
117
- # 🎤 Say the Sentence (English)
118
  1) Generate a sentence.
119
  2) Record yourself reading it.
120
  3) Transcribe & check your accuracy.
121
  4) If matched, clone your voice to speak any sentence you enter.
122
  """
123
  )
124
-
125
- # --- Sentence generation section ---
126
- with gr.Row():
127
- target = gr.Textbox(
128
- label="Target sentence",
129
- interactive=False,
130
- placeholder="Click 'Generate sentence'"
131
- )
132
-
133
  with gr.Row():
134
- # 🔽 New: sentence generator model selector
135
- sentence_gen_model = gr.Dropdown(
136
- choices=["qwen-instruct", "llama-instruct"],
137
- value="llama-instruct",
138
- label="Sentence generator model",
139
- )
140
 
141
  with gr.Row():
142
  btn_gen = gr.Button("🎲 Generate sentence", variant="primary")
143
  btn_clear = gr.Button("🧹 Clear")
144
 
145
- # --- Recording section ---
146
  with gr.Row():
147
- consent_audio = gr.Audio(
148
- sources=["microphone"], type="filepath", label="Record your voice", key='consent_audio'
149
- )
150
 
151
- # --- Advanced settings ---
152
  with gr.Accordion("Advanced settings", open=False):
153
  model_id = gr.Dropdown(
154
  choices=[
155
  "openai/whisper-tiny.en", # fastest (CPU-friendly)
156
- "openai/whisper-base.en", # better accuracy
157
- "distil-whisper/distil-small.en",
158
  ],
159
  value="openai/whisper-tiny.en",
160
  label="ASR model (English only)",
@@ -162,80 +204,90 @@ with gr.Blocks(title="Say the Sentence (English)") as demo:
162
  device_pref = gr.Radio(
163
  choices=["auto", "cpu", "cuda"],
164
  value="auto",
165
- label="Device preference",
166
- )
167
- pass_threshold = gr.Slider(
168
- 0.50, 1.00, value=0.85, step=0.01, label="Match threshold"
169
  )
 
 
170
 
171
- # --- Transcription + comparison section ---
172
  with gr.Row():
173
  btn_check = gr.Button("✅ Transcribe & Check", variant="primary")
174
-
175
  with gr.Row():
176
  user_transcript = gr.Textbox(label="Transcription", interactive=False)
177
-
178
  with gr.Row():
179
  score_html = gr.Label(label="Score")
180
  result_html = gr.Label(label="Result")
181
-
182
  diff_html = gr.HTML(
183
- label="Word-level diff (red = expected but missing / green = extra or replacement)"
184
- )
185
 
186
- # --- Voice cloning UI (appears only on match) ---
 
187
  with gr.Row(visible=False) as tts_ui:
 
188
  @gr.render(inputs=consent_audio)
189
  def show_tts(audio_input):
190
  global client
191
  if audio_input:
192
  client = Client("ResembleAI/Chatterbox")
193
- with gr.Row():
194
- gr.Markdown("# 🔁 Voice cloning")
195
-
196
  with gr.Row():
197
  with gr.Column():
198
  gr.Markdown("## Audio input")
199
- tts_audio = gr.Audio(
200
- audio_input, interactive=True, type="filepath"
201
- )
202
-
203
  with gr.Row():
204
  with gr.Column():
205
  gr.Markdown("## Text input")
206
  tts_text = gr.Textbox(
207
- "Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible.",
208
- interactive=True,
 
 
 
 
 
209
  )
210
-
 
 
 
 
 
 
211
  with gr.Row():
212
  clone_btn = gr.Button("Clone!")
213
  cloned_audio = gr.Audio()
214
- clone_btn.click(
215
- fn=clone_voice, inputs=[tts_audio, tts_text], outputs=[cloned_audio]
216
- )
217
-
218
- # ------------------- Event wiring -------------------
219
- # 🎲 Generate sentence using selected LLM
 
 
 
 
 
 
 
 
 
 
 
 
220
  btn_gen.click(
221
- fn=generate.gen_sentence,
222
- inputs=[sentence_gen_model],
223
- outputs=target,
224
  )
225
 
226
- # 🧹 Clear button
227
  btn_clear.click(
228
  fn=clear_all,
229
- outputs=[target, user_transcript, score_html, result_html, diff_html],
230
  )
231
 
232
- # ✅ Transcribe & Check
233
  btn_check.click(
234
  fn=transcribe_check,
235
  inputs=[consent_audio, target, model_id, device_pref, pass_threshold],
236
- outputs=[user_transcript, score_html, result_html, diff_html, tts_ui],
237
  )
238
 
239
- # ------------------- Run the app -------------------
240
  if __name__ == "__main__":
241
- demo.launch(show_error=True)
 
1
+ import inspect
2
  import gradio as gr
3
+
4
  from gradio_client import Client, handle_file
5
 
6
  import src.generate as generate
7
  import src.process as process
8
 
 
9
  global client
10
 
11
+ # TODO: Ideally, instead of the Client approach used here for the external voice-cloning app, we would use gr.load() and pass arguments to it directly while displaying the developer's desired UI.
12
+ #chatterbox_space = gr.load("spaces/ResembleAI/Chatterbox")
13
+ # ------------------- UI printing functions -------------------
14
  def clear_all():
15
+ # target, user_transcript, score_html, result_html, diff_html, tts_ui
16
+ return "", "", "", "", "", gr.Row.update(visible=False)
17
 
18
 
19
  def make_result_html(pass_threshold, passed, ratio):
20
+ """Returns HTML summarizing results.
21
+ Parameters:
22
+ pass_threshold: Minimum match ratio between the target and the recognized user utterance that counts as passing.
+ passed: Whether the match ratio is >= `pass_threshold`.
24
+ ratio: Sequence match ratio.
25
+ """
26
  summary = (
27
  f"✅ Correct (≥ {int(pass_threshold * 100)}%)"
28
+ if passed else
29
+ f"❌ Not a match (need ≥ {int(pass_threshold * 100)}%)"
30
  )
31
  score = f"Similarity: {ratio * 100:.1f}%"
32
  return summary, score
33
 
34
 
35
  def make_alignment_html(ref_tokens, hyp_tokens, alignments):
36
+ """Returns HTML showing alignment between the target and recognized user audio.
37
+ Parameters:
38
+ ref_tokens: Target sentence for the user to say, tokenized.
39
+ hyp_tokens: Recognized utterance from the user, tokenized.
40
+ alignments: Tuples of alignment operation (equal, replace, delete, insert) and the corresponding index ranges in `ref_tokens` and `hyp_tokens`.
41
+ """
42
  out = []
43
  no_match_html = ' <span style="background:#ffe0e0;text-decoration:line-through;">'
44
  match_html = ' <span style="background:#e0ffe0;">'
 
55
  elif op == "replace":
56
  out.append(no_match_html + ref_string + "</span>")
57
  out.append(match_html + hyp_string + "</span>")
58
+ html = '<div style="line-height:1.6;font-size:1rem;">' + "".join(
59
+ out).strip() + "</div>"
60
  return html
61
 
62
 
63
  def make_html(sentence_match):
64
+ """Creates the HTML written out to the UI based on the results.
65
+ Parameters:
66
+ sentence_match: Class that stores the features of the target - user utterance alignment
67
+ Returns:
68
+ diff_html: An HTML string showing how the target sentence and recognized user utterance matches.
69
+ result_html: An HTML string summarizing the results of the match between target and user utterance.
70
+ """
71
+ diff_html = make_alignment_html(sentence_match.target_tokens,
72
+ sentence_match.user_tokens,
73
+ sentence_match.alignments)
74
+ result_html, score_html = make_result_html(sentence_match.pass_threshold,
75
+ sentence_match.passed,
76
+ sentence_match.ratio)
77
+
78
  return score_html, result_html, diff_html
79
 
80
 
81
  # ------------------- Core Check (English-only) -------------------
82
+ # @spaces.GPU
83
  def get_user_transcript(audio_path: gr.Audio, target_sentence: str,
84
+ model_id: str, device_pref: str) -> tuple[str, str]:
85
+ """ASR for the input audio and basic validation.
86
+ Uses the selected ASR model `model_id` to recognize words in the input `audio_path`.
87
+ Parameters:
88
+ audio_path: Processed audio file returned from gradio Audio component.
89
+ target_sentence: Sentence the user needs to say.
90
+ model_id: Desired ASR model.
91
+ device_pref: Preferred ASR processing device. Can be "auto", "cpu", "cuda".
92
+ Returns:
93
+ error_msg: If there's an error, a string describing what happened.
94
+ user_transcript: The recognized user utterance.
95
+ """
96
+ # Handles user interaction errors.
97
  if not target_sentence:
98
  return "Please generate a sentence first.", ""
99
+ # TODO: Automatically stop the recording if someone presses the Transcribe & Check button.
100
  if audio_path is None:
101
+ return "Please start, record, then stop the audio recording before trying to transcribe.", ""
 
 
 
102
 
103
+ # Runs the automatic speech recognition
104
  user_transcript = process.run_asr(audio_path, model_id, device_pref)
105
 
106
+ # Handles processing errors.
107
  if isinstance(user_transcript, Exception):
108
  return f"Transcription failed: {user_transcript}", ""
109
  return "", user_transcript
110
 
111
 
112
+ def transcribe_check(audio_path, target_sentence, model_id, device_pref,
113
+ pass_threshold):
114
+ """Transcribe user, calculate match to target sentence, create results HTML.
115
+ Parameters:
116
+ audio_path: Local path to recorded audio.
117
+ target_sentence: Sentence the user needs to say.
118
+ model_id: Desired ASR model.
119
+ device_pref: Preferred ASR processing device. Can be "auto", "cpu", "cuda".
120
+ Returns:
121
+ user_transcript: The recognized user utterance
122
+ score_html: HTML string to display the score
123
+ diff_html: HTML string for displaying the differences between target and user utterance
124
+ result_html: HTML string describing the results, or an error message
125
+ clone_audio: Bool for whether to allow audio cloning: This makes the audio cloning components visible
126
+ """
127
  clone_audio = False
128
+ # Transcribe user input
129
+ error_msg, user_transcript = get_user_transcript(audio_path,
130
+ target_sentence, model_id,
131
+ device_pref)
132
  if error_msg:
133
  score_html = ""
134
  diff_html = ""
135
  result_html = error_msg
136
  else:
137
+ # Calculate match details between the target and recognized user input
138
+ sentence_match = process.SentenceMatcher(target_sentence,
139
+ user_transcript,
140
+ pass_threshold)
141
  if sentence_match.passed:
142
  clone_audio = True
143
+ # Create the output to print out
144
  score_html, result_html, diff_html = make_html(sentence_match)
145
 
146
  return user_transcript, score_html, result_html, diff_html, gr.Row(visible=clone_audio)
147
 
148
+ def clone_voice(audio_input, text_input, exaggeration_input, cfgw_input, seed_num_input, temperature_input):
 
 
149
  global client
150
+ # Additional specifications for Chatterbox include:
151
+ # exaggeration_input=0.5,
152
+ # temperature_input=0.8,
153
+ # seed_num_input=0,
154
+ # cfgw_input=0.5,
155
+ # api_name="/generate_tts_audio"
156
+ return client.predict(text_input=text_input,
157
+ audio_prompt_path_input=handle_file(audio_input),
158
+ exaggeration_input=exaggeration_input,
159
+ cfgw_input=cfgw_input,
160
+ seed_num_input=seed_num_input, temperature_input=temperature_input)
161
+
162
+
163
+ # ------------------- UI -------------------
164
+ with gr.Blocks(title="Voice Consent Gate") as demo:
165
+ gr.Markdown("# Voice Consent Gate: Demo")
166
  gr.Markdown(
167
  """
168
+ ## 🎤 Say the Sentence (English)
169
  1) Generate a sentence.
170
  2) Record yourself reading it.
171
  3) Transcribe & check your accuracy.
172
  4) If matched, clone your voice to speak any sentence you enter.
173
  """
174
  )
175
+ with gr.Accordion(label="Further Details", open=False):
176
+ gr.Markdown("""
177
+ To create a basic consent-gated voice cloning system, you need two parts:
178
+ 1. An automatic speech recognition (ASR) system that recognizes a sentence conveying consent from the person whose voice will be cloned.
179
+ 2. A voice-cloning text-to-speech (TTS) system that takes as input text and the speaker’s speech snippets to generate speech.
180
+
181
+ Some voice-cloning TTS systems can now generate speech similar to a speaker’s voice using _just one sentence_. This means that a sentence used for consent can **also** be used for voice cloning. We demonstrate one way to do that here.
182
+ """)
 
183
  with gr.Row():
184
+ target = gr.Textbox(label="Target sentence", interactive=False,
185
+ placeholder="Click 'Generate sentence'")
 
 
 
 
186
 
187
  with gr.Row():
188
  btn_gen = gr.Button("🎲 Generate sentence", variant="primary")
189
  btn_clear = gr.Button("🧹 Clear")
190
 
 
191
  with gr.Row():
192
+ consent_audio = gr.Audio(sources=["microphone"], type="filepath", label="Record your voice", key='consent_audio')
 
 
193
 
 
194
  with gr.Accordion("Advanced settings", open=False):
195
  model_id = gr.Dropdown(
196
  choices=[
197
  "openai/whisper-tiny.en", # fastest (CPU-friendly)
198
+ "openai/whisper-base.en", # better accuracy, a bit slower
199
+ "distil-whisper/distil-small.en" # optional distil English model
200
  ],
201
  value="openai/whisper-tiny.en",
202
  label="ASR model (English only)",
 
204
  device_pref = gr.Radio(
205
  choices=["auto", "cpu", "cuda"],
206
  value="auto",
207
+ label="Device preference"
 
 
 
208
  )
209
+ pass_threshold = gr.Slider(0.50, 1.00, value=0.85, step=0.01,
210
+ label="Match threshold")
211
 
 
212
  with gr.Row():
213
  btn_check = gr.Button("✅ Transcribe & Check", variant="primary")
 
214
  with gr.Row():
215
  user_transcript = gr.Textbox(label="Transcription", interactive=False)
 
216
  with gr.Row():
217
  score_html = gr.Label(label="Score")
218
  result_html = gr.Label(label="Result")
 
219
  diff_html = gr.HTML(
220
+ label="Word-level diff (red = expected but missing / green = extra or replacement)")
 
221
 
222
+ gr.Markdown("## 🔁 Voice Consent Gate (opens upon consent)")
223
+ # TODO: Ideally this would be gr.Blocks, but that seems to have a visibility-change bug.
224
  with gr.Row(visible=False) as tts_ui:
225
+ # Using the render decorator so that we can access consent audio after it's recorded.
226
  @gr.render(inputs=consent_audio)
227
  def show_tts(audio_input):
228
  global client
229
  if audio_input:
230
  client = Client("ResembleAI/Chatterbox")
 
 
 
231
  with gr.Row():
232
  with gr.Column():
233
  gr.Markdown("## Audio input")
234
+ tts_audio = gr.Audio(audio_input, type="filepath")
 
 
 
235
  with gr.Row():
236
  with gr.Column():
237
  gr.Markdown("## Text input")
238
  tts_text = gr.Textbox(
239
+ "Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible.", interactive=True)
240
+ with gr.Row():
241
+ with gr.Accordion("More options", open=False):
242
+ exaggeration = gr.Slider(
243
+ 0.25, 2, step=.05,
244
+ label="Exaggeration (Neutral = 0.5, extreme values can be unstable)",
245
+ value=.5
246
  )
247
+ cfg_weight = gr.Slider(
248
+ 0.2, 1, step=.05, label="CFG/Pace", value=0.5
249
+ )
250
+ seed_num = gr.Number(value=0,
251
+ label="Random seed (0 for random)")
252
+ temp = gr.Slider(0.05, 5, step=.05,
253
+ label="Temperature", value=.8)
254
  with gr.Row():
255
  clone_btn = gr.Button("Clone!")
256
  cloned_audio = gr.Audio()
257
+ clone_btn.click(fn=clone_voice,
258
+ inputs=[tts_audio, tts_text, exaggeration,
259
+ cfg_weight, seed_num, temp],
260
+ outputs=[cloned_audio])
261
+
262
+ def gen_sentence_action():
263
+ # "chatterbox" is the audio model name inserted into the consent prompt (the short_prompt option was removed).
264
+ try:
265
+ return generate.gen_sentence_llm(
266
+ "chatterbox",
267
+ fallback_on_error=False # ← show errors during testing
268
+ )
269
+ except Exception as e:
270
+ # Show a helpful message directly in the Target sentence box
271
+ return f"[ERROR calling LLM] {type(e).__name__}: {e}"
272
+
273
+ # -------- Events --------
274
+ # Generate sentence: fixed model name + detailed prompt
275
  btn_gen.click(
276
+ fn=gen_sentence_action,
277
+ outputs=target
 
278
  )
279
 
 
280
  btn_clear.click(
281
  fn=clear_all,
282
+ outputs=[target, user_transcript, score_html, result_html, diff_html, tts_ui]
283
  )
284
 
 
285
  btn_check.click(
286
  fn=transcribe_check,
287
  inputs=[consent_audio, target, model_id, device_pref, pass_threshold],
288
+ outputs=[user_transcript, score_html, result_html, diff_html, tts_ui]
289
  )
290
 
291
+
292
  if __name__ == "__main__":
293
+ demo.launch(show_error=True)
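Note: for reference, below is a minimal standalone sketch of the Chatterbox call that `clone_voice` makes through `gradio_client`, outside the Gradio UI. The `/generate_tts_audio` endpoint name and the keyword arguments are taken from the comment inside `clone_voice`; the local audio path is hypothetical, and the endpoint and parameter names should be confirmed with `Client.view_api()` before relying on them.

```python
# Minimal sketch (not part of the committed app code): call the Chatterbox Space directly.
# Assumptions: the "/generate_tts_audio" endpoint and keyword names below come from the
# comment in clone_voice(); "consent_recording.wav" is a hypothetical local file.
from gradio_client import Client, handle_file

client = Client("ResembleAI/Chatterbox")
client.view_api()  # prints the available endpoints and parameter names for verification

cloned_path = client.predict(
    text_input="I give my consent to use my voice for generating audio.",
    audio_prompt_path_input=handle_file("consent_recording.wav"),  # hypothetical recording
    exaggeration_input=0.5,   # neutral exaggeration
    cfgw_input=0.5,           # CFG / pace
    seed_num_input=0,         # 0 means random seed
    temperature_input=0.8,
    api_name="/generate_tts_audio",
)
print(cloned_path)  # local path to the generated audio returned by the Space
```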
src/generate.py CHANGED
@@ -1,16 +1,32 @@
1
- import random
2
- import re
3
- from typing import Literal
4
- import os
 
 
 
 
 
 
 
 
5
 
6
- from transformers import pipeline, AutoTokenizer
 
 
 
 
 
 
 
 
 
7
 
8
  import src.process as process
9
  from src.prompts import get_consent_generation_prompt
10
 
11
- HF_TOKEN = os.getenv("HF_TOKEN")
12
 
13
- # ------------------- Sentence Bank (customize freely) -------------------
14
  SENTENCE_BANK = [
15
  "The quick brown fox jumps over the lazy dog.",
16
  "I promise to speak clearly and at a steady pace.",
@@ -24,101 +40,142 @@ SENTENCE_BANK = [
24
  "This microphone test checks my pronunciation accuracy.",
25
  ]
26
 
27
- # ------------------- Model IDs -------------------
28
- CHAT_MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct" # changed from chat format to instruct format
29
- INSTRUCT_MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct" # plain prompt
30
-
31
- # ------------------- Helpers -------------------
32
- def _clean_output(text: str) -> str:
33
- """Trim prompt echoes / role tags / quotes and keep it neat."""
34
- # Remove typical chat role prefixes if present
35
- text = re.sub(r"^\s*(assistant|system|user)\s*[::]\s*", "", text, flags=re.I)
36
- # Drop surrounding quotes/backticks
37
- text = text.strip().strip('`"\' ')
38
- # Collapse whitespace
39
- text = re.sub(r"\s+", " ", text).strip()
40
- return text
41
-
42
- # ------------------- Generators -------------------
43
- def _clean(text: str) -> str:
44
- # Remove prompt echo if present and tidy whitespace/quotes
45
- text = text.strip().strip('`"\' ')
46
- text = re.sub(r"\s+", " ", text)
47
- return text
48
-
49
- def gen_sentence_llm_chat():
50
- model_id = "Qwen/Qwen2.5-1.5B-Instruct"
51
- prompt = get_consent_generation_prompt("chatterbox")
52
-
53
- tok = AutoTokenizer.from_pretrained(model_id)
54
- gen = pipeline(
55
- "text-generation",
56
- model=model_id,
57
- tokenizer=tok,
58
- device_map="auto",
59
- torch_dtype="auto",
60
- )
61
-
62
- out = gen(
63
- prompt,
64
- max_new_tokens=60,
65
- temperature=0.6,
66
- repetition_penalty=1.08,
67
- pad_token_id=tok.eos_token_id,
68
- )[0]["generated_text"]
69
-
70
- # strip prompt echo if model returns prompt+completion
71
- if out.startswith(prompt):
72
- out = out[len(prompt):]
73
-
74
- return process.normalize_text(_clean(out), lower=False)
75
-
76
-
77
- def gen_sentence_llm_instruct() -> str:
78
  """
79
- Llama Instruct (plain prompt): pass the instruction directly.
80
- Returns a cleaned sentence.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  """
82
- prompt_text = get_consent_generation_prompt("chatterbox")
83
-
84
- tokenizer = AutoTokenizer.from_pretrained(INSTRUCT_MODEL_ID, token=HF_TOKEN)
85
- generator = pipeline(
86
- "text-generation",
87
- model=INSTRUCT_MODEL_ID,
88
- tokenizer=tokenizer,
89
- device_map="auto",
90
- torch_dtype="auto",
91
- token=HF_TOKEN
92
- )
93
-
94
- out = generator(
95
- prompt_text,
96
- max_new_tokens=80,
97
- temperature=0.7,
98
- num_return_sequences=1,
99
- pad_token_id=tokenizer.eos_token_id,
100
- )[0]["generated_text"]
101
-
102
- # Some instruct models return prompt+completion
103
- if out.startswith(prompt_text):
104
- out = out[len(prompt_text):]
105
-
106
- return _clean_output(out)
107
-
108
-
109
- def gen_sentence(model_choice: Literal["qwen-instruct", "llama-instruct"]) -> str:
110
  """
111
- Switcher: call the appropriate generator by a simple string key.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  """
113
- if model_choice == "qwen-instruct":
114
- return gen_sentence_llm_chat()
115
- elif model_choice == "llama-instruct":
116
- return gen_sentence_llm_instruct()
117
- else:
118
- # Fallback to instruct to avoid prompt-echoing on unknown keys
119
- return gen_sentence_llm_instruct()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
 
122
  def gen_sentence_set() -> str:
123
- """Returns a sentence for the user to say using a prespecified set of options."""
124
- return random.choice(SENTENCE_BANK)
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/generate.py
2
+ """
3
+ Module: generate
4
+ ----------------
5
+ Handles the generation of "consent sentences" for the Voice Consent Gate demo.
6
+
7
+ This module connects to an external language model (in this case, the public
8
+ Hugging Face Space for Llama 3.2 3B Instruct) to generate natural-sounding
9
+ sentences that users can read aloud to give informed consent for voice cloning.
10
+
11
+ If the model call fails (e.g., due to rate limits or network issues),
12
+ a fallback sentence is chosen from a small built-in sentence bank.
13
 
14
+ Functions:
15
+ - _extract_llama_text(): Normalize the API output from the Llama demo.
16
+ - gen_sentence_llm(): Generate a consent sentence from the Llama model Space.
17
+ - gen_sentence_set(): Select a random prewritten sentence (for fallback/testing).
18
+ """
19
+
20
+ import os
21
+ import random
22
+ from typing import Any
23
+ from gradio_client import Client
24
 
25
  import src.process as process
26
  from src.prompts import get_consent_generation_prompt
27
 
 
28
 
29
+ # ------------------- Sentence Bank (unchanged) -------------------
30
  SENTENCE_BANK = [
31
  "The quick brown fox jumps over the lazy dog.",
32
  "I promise to speak clearly and at a steady pace.",
 
40
  "This microphone test checks my pronunciation accuracy.",
41
  ]
42
 
43
+
44
+ # ------------------- Model / Space Configuration -------------------
45
+ # The demo connects to the Llama 3.2 3B Instruct Space on Hugging Face.
46
+ # You can override these defaults by setting environment variables in your Space.
47
+ LLAMA_SPACE_ID = os.getenv(
48
+ "LLAMA_SPACE_ID", "huggingface-projects/llama-3.2-3B-Instruct"
49
+ )
50
+ LLAMA_API_NAME = "/chat" # The Space exposes a single /chat endpoint.
51
+ HF_TOKEN = os.getenv("HF_TOKEN") # Optional; not required for public Spaces.
52
+
53
+
54
+ def _extract_llama_text(result: Any) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  """
56
+ Normalize the API response from the Llama 3.2 3B demo Space into plain text.
57
+
58
+ The Space’s `/chat` endpoint may return different shapes depending on how
59
+ the Gradio app is structured — sometimes a string, other times a dictionary
60
+ or list. This function recursively traverses and extracts the first
61
+ meaningful text string it finds.
62
+
63
+ Parameters
64
+ ----------
65
+ result : Any
66
+ The raw output returned by `client.predict()`.
67
+
68
+ Returns
69
+ -------
70
+ str
71
+ Cleaned text output (may be empty string if extraction fails).
72
  """
73
+ if isinstance(result, str):
74
+ return result.strip()
75
+ if isinstance(result, (int, float, bool)):
76
+ return str(result)
77
+ if isinstance(result, list):
78
+ # If multiple segments are returned (e.g., multiple sentences),
79
+ # join them into one string.
80
+ parts = []
81
+ for x in result:
82
+ s = _extract_llama_text(x)
83
+ if s:
84
+ parts.append(s)
85
+ return " ".join(parts).strip()
86
+ if isinstance(result, dict):
87
+ # Common key names used in Gradio JSON responses
88
+ for key in ("text", "response", "content", "generated_text", "message"):
89
+ v = result.get(key)
90
+ if isinstance(v, str) and v.strip():
91
+ return v.strip()
92
+ return ""
93
+
94
+
95
+ def gen_sentence_llm(
96
+ audio_model_name: str = "chatterbox",
97
+ *,
98
+ fallback_on_error: bool = False # Set True for production to avoid crashes
99
+ ) -> str:
 
100
  """
101
+ Generate a consent sentence using the Llama 3.2 3B Instruct demo Space.
102
+
103
+ This function constructs a prompt describing the linguistic and ethical
104
+ requirements for a consent sentence (via `get_consent_generation_prompt`)
105
+ and sends it to the Llama demo hosted on Hugging Face Spaces.
106
+
107
+ The response is normalized into a single English sentence suitable
108
+ for reading aloud.
109
+
110
+ Parameters
111
+ ----------
112
+ audio_model_name : str, optional
113
+ The name of the voice-cloning model to mention in the sentence.
114
+ Defaults to "chatterbox".
115
+ fallback_on_error : bool, optional
116
+ If True, return a random fallback sentence instead of raising
117
+ an error when the Space call fails. Default is False for debugging.
118
+
119
+ Returns
120
+ -------
121
+ str
122
+ A clean, human-readable consent sentence.
123
+
124
+ Raises
125
+ ------
126
+ Exception
127
+ Re-raises the underlying error if `fallback_on_error` is False.
128
  """
129
+ # Generate the full natural-language prompt that the LLM will receive
130
+ prompt = get_consent_generation_prompt(audio_model_name)
131
+
132
+ try:
133
+ # Initialize Gradio client for the Llama demo Space
134
+ client = Client(LLAMA_SPACE_ID, hf_token=HF_TOKEN)
135
+
136
+ # The Llama demo exposes a simple /chat endpoint with standard decoding params
137
+ result = client.predict(
138
+ message=prompt,
139
+ max_new_tokens=128,
140
+ temperature=0.6,
141
+ top_p=0.9,
142
+ top_k=50,
143
+ repetition_penalty=1.2,
144
+ api_name=LLAMA_API_NAME,
145
+ )
146
+
147
+ # Normalize and clean up model output
148
+ text = _extract_llama_text(result)
149
+ text = process.normalize_text(text, lower=False)
150
+
151
+ # Handle empty or malformed outputs
152
+ if not text:
153
+ raise ValueError("Empty response from Llama Space")
154
+
155
+ # In case the model produces multiple lines or options, pick the first full sentence
156
+ first_line = next((ln.strip() for ln in text.splitlines() if ln.strip()), "")
157
+ return first_line or text
158
+
159
+ except Exception as e:
160
+ print(f"[gen_sentence_llm] Llama Space call failed: {type(e).__name__}: {e}")
161
+ if fallback_on_error:
162
+ # If fallback is enabled, use a predefined sentence instead
163
+ return random.choice(SENTENCE_BANK)
164
+ # Otherwise propagate the exception so the UI displays it
165
+ raise
166
 
167
 
168
  def gen_sentence_set() -> str:
169
+ """
170
+ Return a sentence from a predefined static list.
171
+
172
+ This is used as a simple fallback generator when model-based
173
+ generation is unavailable or for testing the ASR pipeline
174
+ without network access.
175
+
176
+ Returns
177
+ -------
178
+ str
179
+ A single English sentence from the fallback bank.
180
+ """
181
+ return random.choice(SENTENCE_BANK)
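Note: a minimal sketch for exercising the new generator outside the app, assuming the repository is on the Python path. Because `LLAMA_SPACE_ID` is read when `src.generate` is imported, any override must be set before the import; with `fallback_on_error=True`, a failed Space call degrades to a sentence from `SENTENCE_BANK` instead of raising.

```python
# Minimal sketch (not part of this commit): exercise gen_sentence_llm locally.
import os

# LLAMA_SPACE_ID is read at import time, so set it first if you want to point at a
# different Space; the default is the public Llama 3.2 3B Instruct demo Space.
os.environ.setdefault("LLAMA_SPACE_ID", "huggingface-projects/llama-3.2-3B-Instruct")

from src.generate import gen_sentence_llm

# With fallback_on_error=True, a rate-limit or network failure returns a random
# sentence from SENTENCE_BANK rather than propagating the exception.
sentence = gen_sentence_llm("chatterbox", fallback_on_error=True)
print(sentence)
```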
src/prompts.py CHANGED
@@ -1,6 +1,6 @@
1
  # src/prompts.py
2
 
3
- def get_consent_generation_prompt(audio_model_name: str, short_prompt: bool = False) -> str:
4
  """
5
  Returns a text prompt instructing the model to generate a natural-sounding
6
  consent sentence for voice cloning with the specified model.
@@ -14,15 +14,6 @@ def get_consent_generation_prompt(audio_model_name: str, short_prompt: bool = Fa
14
  str: The prompt text.
15
  """
16
 
17
- if short_prompt:
18
- return (
19
- f"Generate one natural, spoken-style English sentence (10–20 words) in which a person "
20
- f"clearly gives informed consent to use their voice for generating synthetic audio "
21
- f"with the model {audio_model_name}. The sentence should sound conversational, include "
22
- f"a clear consent phrase like 'I give my consent' or 'I agree', mention {audio_model_name} "
23
- f"by name, and be phonetically varied but neutral in tone. Output only the final sentence."
24
- )
25
-
26
  return f"""
27
  Generate a short, natural-sounding English sentence (10–20 words) that a person could say aloud
28
  to clearly state their informed consent to use their voice for generating synthetic audio with
@@ -43,5 +34,6 @@ def get_consent_generation_prompt(audio_model_name: str, short_prompt: bool = Fa
43
  - “I give my consent to use my voice for generating audio with the model {audio_model_name}. This statement is made freely and clearly.”
44
  - “Good afternoon. I agree to the use of my recorded voice for audio generation with the model {audio_model_name}.”
45
 
46
- The output should be one to three natural sentences ready to be spoken aloud for recording purposes.
 
47
  """
 
1
  # src/prompts.py
2
 
3
+ def get_consent_generation_prompt(audio_model_name: str) -> str:
4
  """
5
  Returns a text prompt instructing the model to generate a natural-sounding
6
  consent sentence for voice cloning with the specified model.
 
14
  str: The prompt text.
15
  """
16
 
 
 
 
 
 
 
 
 
 
17
  return f"""
18
  Generate a short, natural-sounding English sentence (10–20 words) that a person could say aloud
19
  to clearly state their informed consent to use their voice for generating synthetic audio with
 
34
  - “I give my consent to use my voice for generating audio with the model {audio_model_name}. This statement is made freely and clearly.”
35
  - “Good afternoon. I agree to the use of my recorded voice for audio generation with the model {audio_model_name}.”
36
 
37
+ The output should be one to three natural sentences ready to be spoken aloud for recording purposes.
38
+ Only output the sentences that the speaker should read: no extra information, no justifications, no formatting or lists. Only the suggested sentence(s).
39
  """