| """Røst speech-to-text demo.""" | |
| import logging | |
| import warnings | |
| import gradio as gr | |
| import numpy as np | |
| import samplerate | |
| import torch | |
| from punctfix import PunctFixer | |
| from transformers import pipeline | |
| from dotenv import load_dotenv | |
| import torch_audiomentations as ta | |
| import os | |
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s ⋅ %(name)s ⋅ %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("roest-asr-demo")

load_dotenv()
warnings.filterwarnings("ignore", category=FutureWarning)
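# Hugging Face Hub ID of the Danish wav2vec2 ASR model used for transcription.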
MODEL_ID = "CoRal-project/roest-wav2vec2-315m-v2"
TITLE = "Røst Speech-to-text Demo"
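# Pre-filled (Danish) subject and body for the "contact us" mailto link in the
# description; spaces and newlines are URL-encoded so the link renders correctly.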
EMAIL_SUBJECT = "Røst tale-til-tekst demo".replace(" ", "+")
EMAIL_BODY = """
Hej,
Jeg har lige prøvet jeres Røst tale-til-tekst demo, og jeg er imponeret!
Jeg kunne godt tænke mig at høre mere om jeres talegenkendelsesløsninger.
Min use case er [indsæt use case her].
Venlig hilsen,
[dit navn]
""".strip().replace(" ", "+").replace("\n", "%0D")
ICON = """
<svg xmlns="http://www.w3.org/2000/svg" width="25px" height="25px" viewBox="0 0 24 24"
fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round"
stroke-linejoin="round" style="display: inline;">
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
<polyline points="17 8 12 3 7 8"/>
<line x1="12" y1="3" x2="12" y2="15"/>
</svg>
"""
DESCRIPTION = f"""
This is a demo of the Danish speech recognition model
[{MODEL_ID}](https://huggingface.co/{MODEL_ID}).

Press "Record" to record your own voice. When you're done, press "Stop" to stop the
recording and "Submit" to send the audio to the model for transcription. You can also
upload an audio file by pressing the {ICON} button.

_If you like what you see and are interested in integrating speech-to-text solutions
into your products, feel free to
[contact us](mailto:[email protected]?subject={EMAIL_SUBJECT}&body={EMAIL_BODY})._
"""
| logger.info("Loading the ASR model...") | |
| device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") | |
| transcriber = pipeline( | |
| task="automatic-speech-recognition", | |
| model=MODEL_ID, | |
| device=device, | |
| token=os.getenv("HUGGINGFACE_HUB_TOKEN") | |
| ) | |
| logger.info("Loading the punctuation fixer model...") | |
| transcription_fixer = PunctFixer(language="da", device=device) | |
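# Peak-normalise every clip before transcription (p=1.0 means the transform is
# applied to every input).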
normaliser = ta.PeakNormalization(p=1.0)

logger.info("Models loaded, ready to transcribe audio.")
def transcribe_audio(sampling_rate_and_audio: tuple[int, np.ndarray] | None) -> str:
    """Transcribe the audio.

    Args:
        sampling_rate_and_audio:
            A tuple with the sampling rate and the audio, or None if no audio was
            provided.

    Returns:
        The transcription.
    """
    if sampling_rate_and_audio is None:
        return (
            "No audio was provided. Please record or upload an audio clip, and try "
            "again."
        )

    sampling_rate, audio = sampling_rate_and_audio
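    # Collapse multi-channel (e.g. stereo) audio to mono by averaging the channels.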
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)
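    # Resample to the 16 kHz rate the model expects; `samplerate` takes the ratio of
    # target rate to source rate.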
    audio = samplerate.resample(audio, 16_000 / sampling_rate, "sinc_best")
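    # Peak-normalise; the transform expects a (batch, channels, samples) tensor.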
    audio = normaliser(torch.tensor(audio).unsqueeze(0).unsqueeze(0)).squeeze().numpy()

    logger.info(f"Transcribing audio clip of {len(audio) / 16_000:.2f} seconds...")
    transcription = transcriber(
        inputs=audio, generate_kwargs=dict(language="danish", task="transcribe")
    )
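    # The pipeline returns a dict with a "text" key for a single clip; bail out on
    # anything unexpected.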
    if not isinstance(transcription, dict):
        return ""
| logger.info(f"Raw transcription is {transcription['text']!r}. Cleaning it up...") | |
| cleaned_transcription = transcription_fixer.punctuate( | |
| text=transcription["text"] | |
| ) | |
| logger.info(f"Final transcription: {cleaned_transcription!r}") | |
| return cleaned_transcription | |
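# Build the Gradio interface: record or upload audio in, transcribed text out. The
# example clips are remotely hosted recordings of different Danish dialects and accents.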
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(sources=["microphone", "upload"], show_label=False),
    outputs="textbox",
    title=TITLE,
    description=DESCRIPTION,
    examples=[
        "https://filedn.com/lRBwPhPxgV74tO0rDoe8SpH/audio-examples/bornholmsk.wav",
        "https://filedn.com/lRBwPhPxgV74tO0rDoe8SpH/audio-examples/soenderjysk.wav",
        "https://filedn.com/lRBwPhPxgV74tO0rDoe8SpH/audio-examples/nordjysk.wav",
        "https://filedn.com/lRBwPhPxgV74tO0rDoe8SpH/audio-examples/accent.wav",
    ],
    cache_examples=False,
)
demo.launch()