Inconsistencies with EOU token; can anyone check?

#7
by m-aliabbas1 - opened
# NeMo's ASR collection; restore_from deserializes a saved .nemo checkpoint.
import nemo.collections.asr as nemo_asr
# Load the EOU-capable Parakeet model from a local .nemo archive (disk I/O at import time).
asr_model = nemo_asr.models.ASRModel.restore_from("models/parakeet_realtime_eou_120m-v1/parakeet_realtime_eou_120m-v1.nemo")

# Create a simplified streaming demo
# Note: True streaming with this model requires the NeMo streaming API
# This demonstrates batch processing with configurable chunk sizes

class StreamingASR:
    """Simulate streaming ASR by transcribing an audio file chunk-by-chunk.

    This performs batch transcription of successive fixed-size chunks; it is
    NOT true streaming inference (that requires the NeMo cache-aware
    streaming API), so words split across chunk boundaries may be lost or
    mis-recognized.
    """

    def __init__(self, model, chunk_size_ms=1000):
        """
        Args:
            model: loaded ASR model; must expose ``cfg.sample_rate`` and a
                ``transcribe(list, batch_size=...)`` method.
            chunk_size_ms: default chunk duration in milliseconds, used when
                ``stream_file`` is called without an explicit chunk size.
        """
        self.model = model
        self.sample_rate = model.cfg.sample_rate
        # Remember the default chunk size in both units. BUG FIX: the original
        # computed chunk_size_samples here but never used it, because
        # stream_file recomputed its own size from its own default — so the
        # constructor's chunk_size_ms was silently ignored.
        self.chunk_size_ms = chunk_size_ms
        self.chunk_size_samples = int(chunk_size_ms * self.sample_rate / 1000)

    def stream_file(self, audio_file, chunk_size_ms=None):
        """
        Process an audio file in chunks to simulate streaming.

        Args:
            audio_file: path to the audio file.
            chunk_size_ms: chunk duration in milliseconds. Defaults to the
                value passed to the constructor (fixes the constructor value
                being ignored in the original implementation).

        Returns:
            str: all non-empty chunk transcripts joined with spaces.
        """
        # BUG FIX: the original referenced a module-level ``sf`` that is never
        # imported in this file (NameError at runtime). Import locally so
        # merely defining/instantiating the class does not require soundfile.
        import soundfile as sf

        if chunk_size_ms is None:
            chunk_size_ms = self.chunk_size_ms

        # Load audio file
        audio, sr = sf.read(audio_file)

        # Down-mix multi-channel audio to mono: the model expects 1-D input
        # (sf.read returns a 2-D array for stereo/multichannel files).
        if getattr(audio, "ndim", 1) > 1:
            audio = audio.mean(axis=1)

        # Resample if needed
        if sr != self.sample_rate:
            try:
                import librosa
                audio = librosa.resample(audio, orig_sr=sr, target_sr=self.sample_rate)
            except ImportError:
                # Best-effort: continue at the original rate, but recognition
                # quality will degrade when sr != model rate.
                print("Warning: librosa not installed")

        print(f"Processing audio file: {audio_file}")
        print(f"Audio length: {len(audio)/self.sample_rate:.2f} seconds")
        print(f"Processing in {chunk_size_ms}ms chunks...")
        print("-" * 60)

        # Calculate chunk size; ceiling division for the chunk count so the
        # final partial chunk is still processed.
        chunk_samples = int(chunk_size_ms * self.sample_rate / 1000)
        num_chunks = (len(audio) + chunk_samples - 1) // chunk_samples

        all_transcripts = []

        # Process each chunk independently — no acoustic context is carried
        # across chunk boundaries.
        for i in range(num_chunks):
            start_idx = i * chunk_samples
            end_idx = min(start_idx + chunk_samples, len(audio))
            chunk = audio[start_idx:end_idx]

            # Transcribe chunk
            try:
                result = self.model.transcribe([chunk], batch_size=1)

                if result and len(result) > 0:
                    # Newer NeMo returns Hypothesis objects exposing .text;
                    # older versions return plain strings.
                    text = result[0].text if hasattr(result[0], 'text') else str(result[0])

                    # Check for an end-of-utterance marker emitted by the model.
                    has_eou = "<EOU>" in text or "</s>" in text

                    if text:
                        eou_marker = " [EOU detected]" if has_eou else ""
                        chunk_time = start_idx / self.sample_rate
                        print(f"[{chunk_time:.2f}s] Chunk {i+1}/{num_chunks}: {text}{eou_marker}")
                        all_transcripts.append(text)
            except Exception as e:
                # Keep going on a per-chunk failure so one bad chunk does not
                # abort the whole file.
                print(f"Error in chunk {i+1}: {e}")

        final_transcript = " ".join(all_transcripts)
        print("-" * 60)
        print(f"Final transcript: {final_transcript}")

        return final_transcript

# Confirmation that the class definition above executed without errors.
print("βœ… StreamingASR class defined")

# Create the streaming processor (using 1-second chunks for better accuracy)
streaming_asr = StreamingASR(asr_model, chunk_size_ms=1000)
print("βœ… Streaming ASR processor initialized")

Output
```
Processing audio file: 2086-149220-0033.wav
Audio length: 7.43 seconds
Processing in 1000ms chunks...

Transcribing: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.04it/s]
Transcribing: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.04it/s]
[0.00s] Chunk 1/8: well i don't [EOU detected]
Transcribing: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.25it/s]
Transcribing: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.25it/s]
[1.00s] Chunk 2/8: wish to see it anymore [EOU detected]
Transcribing: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.21it/s]
Transcribing: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.21it/s]
[2.00s] Chunk 3/8: observed phoebe [EOU detected]
Transcribing: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 5.58it/s]
Transcribing: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 5.58it/s]
[3.00s] Chunk 4/8: turning away her
Transcribing: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 5.61it/s]
Transcribing: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 5.61it/s]
[4.00s] Chunk 5/8: eyes [EOU detected]
Transcribing: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 5.69it/s]
Transcribing: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 5.69it/s]
[5.00s] Chunk 6/8: it is certainly very [EOU detected]
Transcribing: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 5.55it/s]
Transcribing: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 5.55it/s]
[6.00s] Chunk 7/8: like the old portrait [EOU detected]
Transcribing: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 6.28it/s]
[7.00s] Chunk 8/8: [EOU detected]

Final transcript: well i don't wish to see it anymore observed phoebe turning away her eyes it is certainly very like the old portrait


Hi @m-aliabbas1 ,
We recommend using the following script for streaming chunk-wise inference with the parakeet_realtime_eou_120m-v1 model - https://github.com/NVIDIA-NeMo/NeMo/blob/main/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py. The parakeet_realtime_eou_120m-v1 model processes audio in 80msec chunks by design. Can you please try the above inference script and let us know if you run into any issues?

Sign up or log in to comment