Updating the README.md with diarization model links
Browse files
README.md
CHANGED
|
@@ -257,23 +257,21 @@ The model is available for use in the NeMo Framework[7], and can be used as a pr
|
|
| 257 |
|
| 258 |
### Method 1. Code snippet
|
| 259 |
|
| 260 |
-
Load
|
| 261 |
-
|
| 262 |
-
|
| 263 |
```python
|
| 264 |
-
from nemo.collections.asr.models import SortformerEncLabelModel
|
| 265 |
import torch
|
| 266 |
-
|
| 267 |
diar_model = SortformerEncLabelModel.from_pretrained("nvidia/diar_streaming_sortformer_4spk-v2.1")
|
| 268 |
diar_model.eval().to(torch.device("cuda"))
|
| 269 |
|
| 270 |
asr_model = ASRModel.from_pretrained("nvidia/multitalker-parakeet-streaming-0.6b-v1.nemo")
|
| 271 |
asr_model.eval().to(torch.device("cuda"))
|
| 272 |
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
Configure the diarization model using streaming parameters:
|
| 276 |
-
"""
|
| 277 |
from multitalker_transcript_config import MultitalkerTranscriptionConfig
|
| 278 |
from omegaconf import OmegaConf
|
| 279 |
cfg = OmegaConf.structured(MultitalkerTranscriptionConfig())
|
|
@@ -282,9 +280,7 @@ cfg.output_path = "/path/to/output_transcription.json"
|
|
| 282 |
|
| 283 |
diar_model = MultitalkerTranscriptionConfig.init_diar_model(cfg, diar_model)
|
| 284 |
|
| 285 |
-
|
| 286 |
-
Load a streaming audio buffer to simulate a real-time audio session.
|
| 287 |
-
"""
|
| 288 |
from nemo.collections.asr.parts.utils.streaming_utils import CacheAwareStreamingAudioBuffer
|
| 289 |
|
| 290 |
samples = [{'audio_filepath': cfg.audio_file}]
|
|
@@ -295,9 +291,8 @@ streaming_buffer = CacheAwareStreamingAudioBuffer(
|
|
| 295 |
)
|
| 296 |
streaming_buffer.append_audio_file(audio_filepath=cfg.audio_file, stream_id=-1)
|
| 297 |
streaming_buffer_iter = iter(streaming_buffer)
|
| 298 |
-
|
| 299 |
-
Use
|
| 300 |
-
"""
|
| 301 |
from nemo.collections.asr.parts.utils.multispk_transcribe_utils import SpeakerTaggedASR
|
| 302 |
multispk_asr_streamer = SpeakerTaggedASR(cfg, asr_model, diar_model)
|
| 303 |
|
|
@@ -324,25 +319,25 @@ print(seglst_dict_list)
|
|
| 324 |
```
|
| 325 |
|
| 326 |
### Method 2. Use NeMo example file in NVIDIA/NeMo
|
|
|
|
|
|
|
| 327 |
```
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
audio_file="/path/to/example.wav" \
|
| 333 |
output_path="/path/to/example_output.json" \ # where to save the output seglst file
|
| 334 |
```
|
| 335 |
|
| 336 |
-
Or the audio_file can be replaced with the manifest_file
|
| 337 |
```
|
| 338 |
-
|
| 339 |
-
python [NEMO_GIT_FOLDER]/examples/asr/asr_cache_aware_streaming/speech_to_text_multitalker_streaming_infer.py \
|
| 340 |
... \
|
| 341 |
-
manifest_file=example.json \
|
| 342 |
... \
|
| 343 |
```
|
| 344 |
|
| 345 |
-
|
| 346 |
```
|
| 347 |
{
|
| 348 |
"audio_filepath": "/path/to/multispeaker_audio1.wav", # path to the input audio file
|
|
@@ -408,10 +403,11 @@ Data collection methods vary across individual datasets. The training datasets i
|
|
| 408 |
* All evaluations include overlapping speech.
|
| 409 |
* Collar tolerance is 0s for DIHARD III Eval, and 0.25s for CALLHOME-part2 and CH109.
|
| 410 |
* Post-Processing (PP) can be optimized on different held-out dataset splits to improve diarization performance.
|
|
|
|
| 411 |
|
| 412 |
-
| **
|
| 413 |
-
|-------------|-------------|-------------|-----------|-------------|
|
| 414 |
-
|
|
| 415 |
|
| 416 |
## References
|
| 417 |
|
|
|
|
| 257 |
|
| 258 |
### Method 1. Code snippet
|
| 259 |
|
| 260 |
+
Load one of the NeMo speaker diarization models:
|
| 261 |
+
[Streaming Sortformer Diarizer v2](https://huggingface.co/nvidia/diar_streaming_sortformer_4spk-v2),
|
| 262 |
+
[Streaming Sortformer Diarizer v2.1](https://huggingface.co/nvidia/diar_streaming_sortformer_4spk-v2.1)
|
| 263 |
```python
|
| 264 |
+
from nemo.collections.asr.models import SortformerEncLabelModel, ASRModel
|
| 265 |
import torch
|
| 266 |
+
# A speaker diarization model is needed for tracking the speech activity of each speaker.
|
| 267 |
diar_model = SortformerEncLabelModel.from_pretrained("nvidia/diar_streaming_sortformer_4spk-v2.1")
|
| 268 |
diar_model.eval().to(torch.device("cuda"))
|
| 269 |
|
| 270 |
asr_model = ASRModel.from_pretrained("nvidia/multitalker-parakeet-streaming-0.6b-v1.nemo")
|
| 271 |
asr_model.eval().to(torch.device("cuda"))
|
| 272 |
|
| 273 |
+
# Use the pre-defined dataclass template `MultitalkerTranscriptionConfig` from `multitalker_transcript_config.py`.
|
| 274 |
+
# Configure the diarization model using streaming parameters:
|
|
|
|
|
|
|
| 275 |
from multitalker_transcript_config import MultitalkerTranscriptionConfig
|
| 276 |
from omegaconf import OmegaConf
|
| 277 |
cfg = OmegaConf.structured(MultitalkerTranscriptionConfig())
|
|
|
|
| 280 |
|
| 281 |
diar_model = MultitalkerTranscriptionConfig.init_diar_model(cfg, diar_model)
|
| 282 |
|
| 283 |
+
# Load your audio file into a streaming audio buffer to simulate a real-time audio session.
|
|
|
|
|
|
|
| 284 |
from nemo.collections.asr.parts.utils.streaming_utils import CacheAwareStreamingAudioBuffer
|
| 285 |
|
| 286 |
samples = [{'audio_filepath': cfg.audio_file}]
|
|
|
|
| 291 |
)
|
| 292 |
streaming_buffer.append_audio_file(audio_filepath=cfg.audio_file, stream_id=-1)
|
| 293 |
streaming_buffer_iter = iter(streaming_buffer)
|
| 294 |
+
|
| 295 |
+
# Use the helper class `SpeakerTaggedASR`, which handles all ASR and diarization cache data for streaming.
|
|
|
|
| 296 |
from nemo.collections.asr.parts.utils.multispk_transcribe_utils import SpeakerTaggedASR
|
| 297 |
multispk_asr_streamer = SpeakerTaggedASR(cfg, asr_model, diar_model)
|
| 298 |
|
|
|
|
| 319 |
```
|
| 320 |
|
| 321 |
### Method 2. Use NeMo example file in NVIDIA/NeMo
|
| 322 |
+
|
| 323 |
+
Use [a multitalker streaming ASR example script file](https://github.com/NVIDIA-NeMo/NeMo/blob/main/examples/asr/asr_cache_aware_streaming/speech_to_text_multitalker_streaming_infer.py) in the [NVIDIA NeMo Framework](https://github.com/NVIDIA-NeMo/NeMo) to launch inference.
|
| 324 |
```
|
| 325 |
+
python ${NEMO_ROOT}/examples/asr/asr_cache_aware_streaming/speech_to_text_multitalker_streaming_infer.py \
|
| 326 |
+
asr_model=nvidia/multitalker-parakeet-streaming-0.6b-v1 \ # Multitalker ASR model
|
| 327 |
+
diar_model=nvidia/diar_streaming_sortformer_4spk-v2 \ # Diarization model
|
| 328 |
+
audio_file="/path/to/example.wav" \ # Your audio file for transcription
|
|
|
|
| 329 |
output_path="/path/to/example_output.json" \ # where to save the output seglst file
|
| 330 |
```
|
| 331 |
|
| 332 |
+
Or the `audio_file` argument can be replaced with the `manifest_file` to handle multiple files in batch mode:
|
| 333 |
```
|
| 334 |
+
python ${NEMO_ROOT}/examples/asr/asr_cache_aware_streaming/speech_to_text_multitalker_streaming_infer.py \
|
|
|
|
| 335 |
... \
|
| 336 |
+
manifest_file=example.json \ # NeMo style manifest file
|
| 337 |
... \
|
| 338 |
```
|
| 339 |
|
| 340 |
+
In the `example.json` file, each line is a dictionary containing the following fields:
|
| 341 |
```
|
| 342 |
{
|
| 343 |
"audio_filepath": "/path/to/multispeaker_audio1.wav", # path to the input audio file
|
|
|
|
| 403 |
* All evaluations include overlapping speech.
|
| 404 |
* Collar tolerance is 0s for DIHARD III Eval, and 0.25s for CALLHOME-part2 and CH109.
|
| 405 |
* Post-Processing (PP) can be optimized on different held-out dataset splits to improve diarization performance.
|
| 406 |
+
* Latency is 1.12s with 13+1 lookahead frames.
|
| 407 |
|
| 408 |
+
| **Diarization Model** | **AMI IHM** | **AMI SDM** | **CH109** | **Mixer 6** |
|
| 409 |
+
|-----------------------|-------------|-------------|-----------|-------------|
|
| 410 |
+
| [Streaming Sortformer v2](https://huggingface.co/nvidia/diar_streaming_sortformer_4spk-v2) | 21.26 | 37.44 | 15.81 | 23.81 |
|
| 411 |
|
| 412 |
## References
|
| 413 |
|