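"""Gradio demo for IBM Granite Speech 3.3-2B.

Transcribes uploaded audio through the Hugging Face Inference API when a token
is provided, and falls back to a lightweight local audio-file analysis
otherwise.
"""
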
import gradio as gr
import requests
from typing import Optional

# Hugging Face Inference API
API_URL = "https://api-inference.huggingface.co/models/ibm-granite/granite-speech-3.3-2b"

def query_inference_api(audio_file_path: str, hf_token: Optional[str] = None) -> str:
    """
    Query the Hugging Face Inference API for speech transcription
    """
    headers = {}
    if hf_token:
        headers["Authorization"] = f"Bearer {hf_token}"
    
    try:
        with open(audio_file_path, "rb") as f:
            data = f.read()
        
        response = requests.post(API_URL, headers=headers, data=data, timeout=60)
        
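        # The response shape can vary: the branches below handle a dict with a
        # "text" field (the usual ASR output) and a list of
        # {"generated_text": ...} items, falling back to the raw payload.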
        if response.status_code == 200:
            result = response.json()
            if isinstance(result, dict) and 'text' in result:
                return result['text']
            elif isinstance(result, list) and len(result) > 0:
                return result[0].get('generated_text', str(result))
            else:
                return str(result)
        else:
            # Prefix with ❌ so process_audio() treats API errors as failures
            # and falls back to local analysis.
            return f"❌ API Error {response.status_code}: {response.text}"
            
    except requests.exceptions.Timeout:
        return "❌ Request timed out. The model might be loading. Please try again in a few minutes."
    except Exception as e:
        return f"❌ Error: {str(e)}"

def transcribe_with_local_processing(audio_file_path: str) -> str:
    """
    Fallback: Simple local audio processing without heavy models
    """
    try:
        import soundfile as sf
        
        # Read audio file info
        data, samplerate = sf.read(audio_file_path)
        duration = len(data) / samplerate
        
        return f"""
📊 **Audio File Analysis:**
- Duration: {duration:.2f} seconds
- Sample Rate: {samplerate} Hz
- Channels: {'Mono' if len(data.shape) == 1 else 'Stereo'}

⚠️ **For actual transcription**: 
This demo shows the file was processed successfully. 
For full transcription, you would need either:
1. A Hugging Face token (free to get), or
2. Hardware with enough resources to run the model locally

The Granite Speech 3.3-2B model supports:
- English, French, German, Spanish, Portuguese
- Speech-to-text transcription
- Speech translation to English
        """
        
    except Exception as e:
        return f"❌ Error processing audio: {str(e)}"

def process_audio(audio_file, hf_token):
    """Main processing function"""
    if audio_file is None:
        return "❌ Please upload an audio file."
    
    # Try Inference API first if token provided
    if hf_token and hf_token.strip():
        result = query_inference_api(audio_file, hf_token.strip())
        if not result.startswith("❌"):
            return f"🎤 **Transcription Result:**\n\n{result}"
    
    # Fallback to local processing
    return transcribe_with_local_processing(audio_file)

def create_interface():
    """Create the Gradio interface"""
    
    with gr.Blocks(
        title="Granite Speech Demo", 
        theme=gr.themes.Soft(),
        css="footer {visibility: hidden}"
    ) as demo:
        
        gr.Markdown("""
        # 🎤 IBM Granite Speech 3.3-2B Demo
        
        **Two ways to use this demo:**
        1. **With HF Token** (recommended): Get a free token from [Hugging Face Settings](https://huggingface.co/settings/tokens)
        2. **Without Token**: Basic audio file analysis
        
        **Supported Languages**: English, French, German, Spanish, Portuguese
        """)
        
        with gr.Row():
            with gr.Column(scale=1):
                # Token input
                hf_token = gr.Textbox(
                    label="🔑 Hugging Face Token (Optional)",
                    placeholder="hf_xxx... (get from huggingface.co/settings/tokens)",
                    type="password",
                    info="Paste your free HF token for full transcription"
                )
                
                # Audio input
                audio_input = gr.Audio(
                    label="📁 Upload Audio File",
                    type="filepath",
                    format="wav"
                )
                
                # Process button
                process_btn = gr.Button("🎯 Process Audio", variant="primary", size="lg")
                
                # Example info
                gr.Markdown("""
                ### 💡 Tips:
                - **Get HF Token**: [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) → "New token" → "Read" access
                - **Audio format**: WAV, MP3, M4A supported
                - **Length**: Keep under 1 minute for best results
                - **Quality**: Clear speech works best
                """)
            
            with gr.Column(scale=2):
                # Output
                output = gr.Textbox(
                    label="📝 Results",
                    lines=12,
                    interactive=False,
                    placeholder="Upload audio and click 'Process Audio' to see transcription..."
                )
        
        # Event handler
        process_btn.click(
            fn=process_audio,
            inputs=[audio_input, hf_token],
            outputs=output
        )
        
        # Footer info
        gr.Markdown("""
        ---
        **About**: This demo uses IBM's Granite Speech 3.3-2B model for automatic speech recognition.
        The model supports multilingual transcription and speech translation to English.
        """)
    
    return demo

# Launch the app
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860)
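
# To run locally (a sketch; the file name and package pins are assumptions):
#   pip install gradio requests soundfile
#   python app.py
# The interface is then served on http://localhost:7860.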