Spaces:

alexandrainst
/

roest-demo

Sleeping

App Files Community

saattrupdan committed 26 days ago

Commit

7089c67

1 Parent(s): ae07559

chore: Update deps

Browse files

Files changed (2) hide show

app.py +5 -5
requirements.txt +21 -14

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
 """Røst speech-to-text demo."""
 import logging
-import os
 import warnings
 import gradio as gr
@@ -11,6 +10,7 @@ import torch
 from punctfix import PunctFixer
 from transformers import pipeline
 from dotenv import load_dotenv
 logging.basicConfig(
     level=logging.INFO,
@@ -69,6 +69,8 @@ transcriber = pipeline(
 logger.info("Loading the punctuation fixer model...")
 transcription_fixer = PunctFixer(language="da", device=device)
 logger.info("Models loaded, ready to transcribe audio.")
 def transcribe_audio(sampling_rate_and_audio: tuple[int, np.ndarray] | None) -> str:
@@ -92,6 +94,7 @@ def transcribe_audio(sampling_rate_and_audio: tuple[int, np.ndarray] | None) ->
     if audio.ndim > 1:
         audio = np.mean(audio, axis=1)
     audio = samplerate.resample(audio, 16_000 / sampling_rate, "sinc_best")
     logger.info(f"Transcribing audio clip of {len(audio) / 16_000:.2f} seconds...")
     transcription = transcriber(
@@ -111,13 +114,11 @@ def transcribe_audio(sampling_rate_and_audio: tuple[int, np.ndarray] | None) ->
 demo = gr.Interface(
     fn=transcribe_audio,
     inputs=gr.Audio(
-        sources=["microphone", "upload"], show_label=False, min_length=1, max_length=60
     ),
     outputs="textbox",
     title=TITLE,
     description=DESCRIPTION,
-    css="p { font-size: 1.0rem; }",
-    allow_flagging="never",
     examples=[
         "https://filedn.com/lRBwPhPxgV74tO0rDoe8SpH/audio-examples/bornholmsk.wav",
         "https://filedn.com/lRBwPhPxgV74tO0rDoe8SpH/audio-examples/soenderjysk.wav",
@@ -125,7 +126,6 @@ demo = gr.Interface(
         "https://filedn.com/lRBwPhPxgV74tO0rDoe8SpH/audio-examples/accent.wav",
     ],
     cache_examples=False,
-    theme=gr.themes.Soft(primary_hue="orange"),
 )
 demo.launch()

 """Røst speech-to-text demo."""
 import logging
 import warnings
 import gradio as gr
 from punctfix import PunctFixer
 from transformers import pipeline
 from dotenv import load_dotenv
+import torch_audiomentations as ta
 logging.basicConfig(
     level=logging.INFO,
 logger.info("Loading the punctuation fixer model...")
 transcription_fixer = PunctFixer(language="da", device=device)
+normaliser = ta.PeakNormalization(p=1.0)
 logger.info("Models loaded, ready to transcribe audio.")
 def transcribe_audio(sampling_rate_and_audio: tuple[int, np.ndarray] | None) -> str:
     if audio.ndim > 1:
         audio = np.mean(audio, axis=1)
     audio = samplerate.resample(audio, 16_000 / sampling_rate, "sinc_best")
+    audio = normaliser(torch.tensor(audio).unsqueeze(0).unsqueeze(0)).squeeze().numpy()
     logger.info(f"Transcribing audio clip of {len(audio) / 16_000:.2f} seconds...")
     transcription = transcriber(
 demo = gr.Interface(
     fn=transcribe_audio,
     inputs=gr.Audio(
+        sources=["microphone", "upload"], show_label=False
     ),
     outputs="textbox",
     title=TITLE,
     description=DESCRIPTION,
     examples=[
         "https://filedn.com/lRBwPhPxgV74tO0rDoe8SpH/audio-examples/bornholmsk.wav",
         "https://filedn.com/lRBwPhPxgV74tO0rDoe8SpH/audio-examples/soenderjysk.wav",
         "https://filedn.com/lRBwPhPxgV74tO0rDoe8SpH/audio-examples/accent.wav",
     ],
     cache_examples=False,
 )
 demo.launch()

requirements.txt CHANGED Viewed

@@ -1,29 +1,33 @@
 aiofiles==23.2.1
 annotated-types==0.7.0
 anyio==4.4.0
 attrs==24.2.0
 certifi==2024.8.30
 charset-normalizer==3.3.2
 click==8.1.7
 contourpy==1.3.0
 cycler==0.12.1
 exceptiongroup==1.2.2
-fastapi==0.115.0
 ffmpy==0.4.0
 filelock==3.16.1
 fonttools==4.53.1
 fsspec==2024.9.0
-gradio==4.44.0
-gradio_client==1.3.0
 h11==0.14.0
 httpcore==1.0.5
 httpx==0.27.2
-huggingface-hub==0.25.0
 hypothesis==6.112.1
 idna==3.10
 importlib_resources==6.4.5
 Jinja2==3.1.4
-kenlm @ https://github.com/kpu/kenlm/archive/master.zip
 kiwisolver==1.4.7
 markdown-it-py==3.0.0
 MarkupSafe==2.1.5
@@ -38,21 +42,22 @@ pandas==2.2.2
 pillow==10.4.0
 punctfix==0.11.1
 pyctcdecode==0.5.0
-pydantic==2.9.2
-pydantic_core==2.23.4
 pydub==0.25.1
 Pygments==2.18.0
 pygtrie==2.5.0
 pyparsing==3.1.4
 python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
-python-multipart==0.0.9
 pytz==2024.2
 PyYAML==6.0.2
 regex==2024.9.11
 requests==2.32.3
 rich==13.8.1
 ruff==0.6.5
 safetensors==0.4.5
 samplerate==0.2.1
 semantic-version==2.10.0
@@ -60,15 +65,17 @@ shellingham==1.5.4
 six==1.16.0
 sniffio==1.3.1
 sortedcontainers==2.4.0
-starlette==0.38.5
-sympy==1.13.2
-tokenizers==0.19.1
 tomlkit==0.12.0
-torch==2.4.1
 tqdm==4.66.5
-transformers==4.44.2
 typer==0.12.5
-typing_extensions==4.12.2
 tzdata==2024.1
 urllib3==2.2.3
 uvicorn==0.30.6

 aiofiles==23.2.1
+annotated-doc==0.0.4
 annotated-types==0.7.0
 anyio==4.4.0
 attrs==24.2.0
+brotli==1.2.0
 certifi==2024.8.30
 charset-normalizer==3.3.2
 click==8.1.7
 contourpy==1.3.0
 cycler==0.12.1
 exceptiongroup==1.2.2
+fastapi==0.122.0
 ffmpy==0.4.0
 filelock==3.16.1
 fonttools==4.53.1
 fsspec==2024.9.0
+gradio==6.0.1
+gradio_client==2.0.0
+groovy==0.1.2
 h11==0.14.0
+hf-xet==1.2.0
 httpcore==1.0.5
 httpx==0.27.2
+huggingface-hub==0.36.0
 hypothesis==6.112.1
 idna==3.10
 importlib_resources==6.4.5
 Jinja2==3.1.4
+kenlm @ https://github.com/kpu/kenlm/archive/master.zip#sha256=d23d300d559a45a5e3ede958dbbf2395231119c0b8cd97a1ea43480625894ff4
 kiwisolver==1.4.7
 markdown-it-py==3.0.0
 MarkupSafe==2.1.5
 pillow==10.4.0
 punctfix==0.11.1
 pyctcdecode==0.5.0
+pydantic==2.12.4
+pydantic_core==2.41.5
 pydub==0.25.1
 Pygments==2.18.0
 pygtrie==2.5.0
 pyparsing==3.1.4
 python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
+python-multipart==0.0.20
 pytz==2024.2
 PyYAML==6.0.2
 regex==2024.9.11
 requests==2.32.3
 rich==13.8.1
 ruff==0.6.5
+safehttpx==0.1.7
 safetensors==0.4.5
 samplerate==0.2.1
 semantic-version==2.10.0
 six==1.16.0
 sniffio==1.3.1
 sortedcontainers==2.4.0
+starlette==0.50.0
+sympy==1.14.0
+tokenizers==0.22.1
 tomlkit==0.12.0
+torch==2.9.1
 tqdm==4.66.5
+transformers==4.57.3
 typer==0.12.5
+typer-slim==0.20.0
+typing-inspection==0.4.2
+typing_extensions==4.15.0
 tzdata==2024.1
 urllib3==2.2.3
 uvicorn==0.30.6