File size: 5,599 Bytes
a00d269 5bc7dec e95056a 5bc7dec a00d269 5bc7dec e95056a 5bc7dec a00d269 5bc7dec e95056a 5bc7dec e95056a 5bc7dec e95056a a00d269 5bc7dec a00d269 5bc7dec e95056a a00d269 5bc7dec a00d269 5bc7dec a00d269 5bc7dec a00d269 5bc7dec a00d269 5bc7dec a00d269 e95056a 5bc7dec a00d269 5bc7dec a00d269 5bc7dec e95056a 5bc7dec a00d269 5bc7dec a00d269 5bc7dec a00d269 5bc7dec a00d269 5bc7dec |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
import gradio as gr
import requests
import os
from typing import Optional
# Hugging Face Inference API
API_URL = "https://huggingface.co/proxy/api-inference.huggingface.co/models/ibm-granite/granite-speech-3.3-2b"
def query_inference_api(audio_file_path: str, hf_token: Optional[str] = None) -> str:
    """Send an audio file to the Hugging Face Inference API for transcription.

    Args:
        audio_file_path: Path to the audio file to transcribe (raw bytes are
            POSTed to the model endpoint).
        hf_token: Optional HF access token; when given, sent as a Bearer token.

    Returns:
        The transcribed text on success, or a human-readable error message
        prefixed with "β".  NOTE: callers (see process_audio) detect failure
        via that "β" prefix, so every error path must start with it.
    """
    headers: dict = {}
    if hf_token:
        headers["Authorization"] = f"Bearer {hf_token}"
    try:
        with open(audio_file_path, "rb") as f:
            data = f.read()
        response = requests.post(API_URL, headers=headers, data=data, timeout=60)
        if response.status_code == 200:
            # A 200 body is not guaranteed to be JSON (gateways can return
            # HTML error pages); guard the decode instead of crashing into
            # the generic handler below.
            try:
                result = response.json()
            except ValueError:
                return f"β Error: unexpected non-JSON response: {response.text[:200]}"
            if isinstance(result, dict) and 'text' in result:
                return result['text']
            elif isinstance(result, list) and len(result) > 0:
                return result[0].get('generated_text', str(result))
            else:
                return str(result)
        elif response.status_code == 503:
            # 503 from the Inference API means the model is still loading.
            return "β The model is loading on the server. Please try again in a few minutes."
        else:
            # Bug fix: errors must carry the "β" prefix, otherwise the caller
            # treats the error text as a successful transcription.
            return f"β API Error {response.status_code}: {response.text}"
    except requests.exceptions.Timeout:
        return "β Request timed out. The model might be loading. Please try again in a few minutes."
    except Exception as e:
        return f"β Error: {str(e)}"
def transcribe_with_local_processing(audio_file_path: str) -> str:
    """Lightweight fallback when no HF token is available.

    Reads basic metadata from the audio file with soundfile (imported lazily
    so the app still starts when the package is absent) and returns a
    human-readable summary.  Any failure — missing package, unreadable file —
    is reported as a "β"-prefixed message rather than raised.
    """
    try:
        import soundfile as sf

        samples, rate = sf.read(audio_file_path)
        seconds = len(samples) / rate
        channels = 'Mono' if len(samples.shape) == 1 else 'Stereo'
        return f"""
π **Audio File Analysis:**
- Duration: {seconds:.2f} seconds
- Sample Rate: {rate} Hz
- Channels: {channels}
β οΈ **For actual transcription**:
This demo shows the file was processed successfully.
For full transcription, you would need:
1. A Hugging Face token (free to get)
2. Or run this on hardware with more resources
The Granite Speech 3.3-2B model supports:
- English, French, German, Spanish, Portuguese
- Speech-to-text transcription
- Speech translation to English
"""
    except Exception as e:
        return f"β Error processing audio: {str(e)}"
def process_audio(audio_file, hf_token):
    """Entry point wired to the Gradio button.

    Tries the remote Inference API when a token was supplied; on any API
    failure (signalled by the "β" prefix) or when no token is given, falls
    back to local file analysis.
    """
    # Guard clause: nothing uploaded yet.
    if audio_file is None:
        return "β Please upload an audio file."

    token = (hf_token or "").strip()
    if token:
        api_result = query_inference_api(audio_file, token)
        # Error messages from the API helper always start with "β".
        if not api_result.startswith("β"):
            return f"π€ **Transcription Result:**\n\n{api_result}"

    return transcribe_with_local_processing(audio_file)
def create_interface():
    """Build and return the Gradio Blocks UI.

    Layout: a two-column row (inputs on the left, results on the right) with
    markdown headers/footers.  Component declaration order determines the
    rendered layout, so do not reorder the constructors below.
    """
    with gr.Blocks(
        title="Granite Speech Demo",
        theme=gr.themes.Soft(),
        css="footer {visibility: hidden}"  # hide the default Gradio footer
    ) as demo:
        # Page header / usage instructions.
        gr.Markdown("""
# π€ IBM Granite Speech 3.3-2B Demo
**Two ways to use this demo:**
1. **With HF Token** (recommended): Get free token from [Hugging Face Settings](https://huggingface.co/settings/tokens)
2. **Without Token**: Basic audio file analysis
**Supported Languages**: English, French, German, Spanish, Portuguese
""")
        with gr.Row():
            with gr.Column(scale=1):
                # Token input — optional; empty token triggers the local fallback.
                hf_token = gr.Textbox(
                    label="π Hugging Face Token (Optional)",
                    placeholder="hf_xxx... (get from huggingface.co/settings/tokens)",
                    type="password",  # mask the token in the UI
                    info="Paste your free HF token for full transcription"
                )
                # Audio input — type="filepath" so process_audio receives a path string.
                audio_input = gr.Audio(
                    label="π Upload Audio File",
                    type="filepath",
                    format="wav"
                )
                # Process button
                process_btn = gr.Button("π― Process Audio", variant="primary", size="lg")
                # Example info
                gr.Markdown("""
### π‘ Tips:
- **Get HF Token**: [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) β "New token" β "Read" access
- **Audio format**: WAV, MP3, M4A supported
- **Length**: Keep under 1 minute for best results
- **Quality**: Clear speech works best
""")
            with gr.Column(scale=2):
                # Output textbox; read-only so users can't edit results.
                output = gr.Textbox(
                    label="π Results",
                    lines=12,
                    interactive=False,
                    placeholder="Upload audio and click 'Process Audio' to see transcription..."
                )
        # Wire the button to the processing function (defined at module level).
        process_btn.click(
            fn=process_audio,
            inputs=[audio_input, hf_token],
            outputs=output
        )
        # Footer info
        gr.Markdown("""
---
**About**: This demo uses IBM's Granite Speech 3.3-2B model for automatic speech recognition.
Model supports multilingual transcription and translation capabilities.
""")
    return demo
# Script entry point: build the UI and serve it.
if __name__ == "__main__":
    # Bind to all interfaces on the conventional Gradio/Spaces port.
    create_interface().launch(server_name="0.0.0.0", server_port=7860)