| """Røst speech-to-text demo.""" | |
| import logging | |
| import warnings | |
| import gradio as gr | |
| import numpy as np | |
| import samplerate | |
| import torch | |
| from punctfix import PunctFixer | |
| from transformers import pipeline | |
| from dotenv import load_dotenv | |
| import torch_audiomentations as ta | |
| import os | |
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s ⋅ %(name)s ⋅ %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("roest-asr-demo")

load_dotenv()
warnings.filterwarnings("ignore", category=FutureWarning)
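# Hugging Face Hub ID of the Danish wav2vec2 ASR model used for transcription.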
MODEL_ID = "CoRal-project/roest-wav2vec2-315m-v2"
TITLE = "Røst Speech-to-text Demo"
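# Pre-filled (Danish) subject and body for the "contact us" mailto link in the
# description; spaces and newlines are URL-encoded so the link renders correctly.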
EMAIL_SUBJECT = "Røst tale-til-tekst demo".replace(" ", "+")
EMAIL_BODY = """
Hej,
Jeg har lige prøvet jeres Røst tale-til-tekst demo, og jeg er imponeret!
Jeg kunne godt tænke mig at høre mere om jeres talegenkendelsesløsninger.
Min use case er [indsæt use case her].
Venlig hilsen,
[dit navn]
""".strip().replace(" ", "+").replace("\n", "%0D")
ICON = """
<svg xmlns="http://www.w3.org/2000/svg" width="25px" height="25px" viewBox="0 0 24 24"
fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round"
stroke-linejoin="round" style="display: inline;">
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
<polyline points="17 8 12 3 7 8"/>
<line x1="12" y1="3" x2="12" y2="15"/>
</svg>
"""
DESCRIPTION = f"""
This is a demo of the Danish speech recognition model
[{MODEL_ID}](https://huggingface.co/{MODEL_ID}).

Press "Record" to record your own voice. When you're done, press "Stop" to stop the
recording and "Submit" to send the audio to the model for transcription. You can also
upload an audio file by pressing the {ICON} button.

_If you like what you see and are interested in integrating speech-to-text solutions
into your products, feel free to
[contact us](mailto:[email protected]?subject={EMAIL_SUBJECT}&body={EMAIL_BODY})._
"""
| logger.info("Loading the ASR model...") | |
| device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") | |
| transcriber = pipeline( | |
| task="automatic-speech-recognition", | |
| model=MODEL_ID, | |
| device=device, | |
| token=os.getenv("HUGGINGFACE_HUB_TOKEN") | |
| ) | |
| logger.info("Loading the punctuation fixer model...") | |
| transcription_fixer = PunctFixer(language="da", device=device) | |
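# Peak-normalise every clip before transcription (p=1.0 means the transform is
# applied to every input).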
normaliser = ta.PeakNormalization(p=1.0)

logger.info("Models loaded, ready to transcribe audio.")
def transcribe_audio(sampling_rate_and_audio: tuple[int, np.ndarray] | None) -> str:
    """Transcribe the audio.

    Args:
        sampling_rate_and_audio:
            A tuple with the sampling rate and the audio, or None if no audio was
            provided.

    Returns:
        The transcription.
    """
    if sampling_rate_and_audio is None:
        return (
            "No audio was provided. Please record or upload an audio clip, and try "
            "again."
        )

    sampling_rate, audio = sampling_rate_and_audio
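    # Collapse multi-channel (e.g. stereo) audio to mono by averaging the channels.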
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)
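    # Resample to the 16 kHz rate the model expects; `samplerate` takes the ratio of
    # target rate to source rate.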
    audio = samplerate.resample(audio, 16_000 / sampling_rate, "sinc_best")
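    # Peak-normalise; the transform expects a (batch, channels, samples) tensor.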
    audio = normaliser(torch.tensor(audio).unsqueeze(0).unsqueeze(0)).squeeze().numpy()

    logger.info(f"Transcribing audio clip of {len(audio) / 16_000:.2f} seconds...")
    transcription = transcriber(
        inputs=audio, generate_kwargs=dict(language="danish", task="transcribe")
    )
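    # The pipeline returns a dict with a "text" key for a single clip; bail out on
    # anything unexpected.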
    if not isinstance(transcription, dict):
        return ""
| logger.info(f"Raw transcription is {transcription['text']!r}. Cleaning it up...") | |
| cleaned_transcription = transcription_fixer.punctuate( | |
| text=transcription["text"] | |
| ) | |
| logger.info(f"Final transcription: {cleaned_transcription!r}") | |
| return cleaned_transcription | |
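# Build the Gradio interface: record or upload audio in, transcribed text out. The
# example clips are remotely hosted recordings of different Danish dialects and accents.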
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(sources=["microphone", "upload"], show_label=False),
    outputs="textbox",
    title=TITLE,
    description=DESCRIPTION,
    examples=[
        "https://filedn.com/lRBwPhPxgV74tO0rDoe8SpH/audio-examples/bornholmsk.wav",
        "https://filedn.com/lRBwPhPxgV74tO0rDoe8SpH/audio-examples/soenderjysk.wav",
        "https://filedn.com/lRBwPhPxgV74tO0rDoe8SpH/audio-examples/nordjysk.wav",
        "https://filedn.com/lRBwPhPxgV74tO0rDoe8SpH/audio-examples/accent.wav",
    ],
    cache_examples=False,
)
demo.launch()