Updating the README.md with diarization model links
Browse files
README.md
CHANGED
|
@@ -257,23 +257,21 @@ The model is available for use in the NeMo Framework[7], and can be used as a pr
|
|
| 257 |
|
| 258 |
### Method 1. Code snippet
|
| 259 |
|
| 260 |
-
Load
|
| 261 |
-
|
| 262 |
-
|
| 263 |
```python
|
| 264 |
-
from nemo.collections.asr.models import SortformerEncLabelModel
|
| 265 |
import torch
|
| 266 |
-
|
| 267 |
diar_model = SortformerEncLabelModel.from_pretrained("nvidia/diar_streaming_sortformer_4spk-v2.1")
|
| 268 |
diar_model.eval().to(torch.device("cuda"))
|
| 269 |
|
| 270 |
asr_model = ASRModel.from_pretrained("nvidia/multitalker-parakeet-streaming-0.6b-v1.nemo")
|
| 271 |
asr_model.eval().to(torch.device("cuda"))
|
| 272 |
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
Configure the diarization model using streaming parameters:
|
| 276 |
-
"""
|
| 277 |
from multitalker_transcript_config import MultitalkerTranscriptionConfig
|
| 278 |
from omegaconf import OmegaConf
|
| 279 |
cfg = OmegaConf.structured(MultitalkerTranscriptionConfig())
|
|
@@ -282,9 +280,7 @@ cfg.output_path = "/path/to/output_transcription.json"
|
|
| 282 |
|
| 283 |
diar_model = MultitalkerTranscriptionConfig.init_diar_model(cfg, diar_model)
|
| 284 |
|
| 285 |
-
|
| 286 |
-
Load a streaming audio buffer to simulate a real-time audio session.
|
| 287 |
-
"""
|
| 288 |
from nemo.collections.asr.parts.utils.streaming_utils import CacheAwareStreamingAudioBuffer
|
| 289 |
|
| 290 |
samples = [{'audio_filepath': cfg.audio_file}]
|
|
@@ -295,9 +291,8 @@ streaming_buffer = CacheAwareStreamingAudioBuffer(
|
|
| 295 |
)
|
| 296 |
streaming_buffer.append_audio_file(audio_filepath=cfg.audio_file, stream_id=-1)
|
| 297 |
streaming_buffer_iter = iter(streaming_buffer)
|
| 298 |
-
|
| 299 |
-
Use
|
| 300 |
-
"""
|
| 301 |
from nemo.collections.asr.parts.utils.multispk_transcribe_utils import SpeakerTaggedASR
|
| 302 |
multispk_asr_streamer = SpeakerTaggedASR(cfg, asr_model, diar_model)
|
| 303 |
|
|
@@ -324,25 +319,25 @@ print(seglst_dict_list)
|
|
| 324 |
```
|
| 325 |
|
| 326 |
### Method 2. Use NeMo example file in NVIDIA/NeMo
|
|
|
|
|
|
|
| 327 |
```
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
audio_file="/path/to/example.wav" \
|
| 333 |
output_path="/path/to/example_output.json" \ # where to save the output seglst file
|
| 334 |
```
|
| 335 |
|
| 336 |
-
Or the audio_file can be replaced with the manifest_file
|
| 337 |
```
|
| 338 |
-
|
| 339 |
-
python [NEMO_GIT_FOLDER]/examples/asr/asr_cache_aware_streaming/speech_to_text_multitalker_streaming_infer.py \
|
| 340 |
... \
|
| 341 |
-
manifest_file=example.json \
|
| 342 |
... \
|
| 343 |
```
|
| 344 |
|
| 345 |
-
|
| 346 |
```
|
| 347 |
{
|
| 348 |
"audio_filepath": "/path/to/multispeaker_audio1.wav", # path to the input audio file
|
|
@@ -408,10 +403,11 @@ Data collection methods vary across individual datasets. The training datasets i
|
|
| 408 |
* All evaluations include overlapping speech.
|
| 409 |
* Collar tolerance is 0s for DIHARD III Eval, and 0.25s for CALLHOME-part2 and CH109.
|
| 410 |
* Post-Processing (PP) can be optimized on different held-out dataset splits to improve diarization performance.
|
|
|
|
| 411 |
|
| 412 |
-
| **
|
| 413 |
-
|-------------|-------------|-------------|-----------|-------------|
|
| 414 |
-
|
|
| 415 |
|
| 416 |
## References
|
| 417 |
|
|
|
|
| 257 |
|
| 258 |
### Method 1. Code snippet
|
| 259 |
|
| 260 |
+
Load one of the NeMo speaker diarization models:
|
| 261 |
+
[Streaming Sortformer Diarizer v2](https://huggingface.co/nvidia/diar_streaming_sortformer_4spk-v2),
|
| 262 |
+
[Streaming Sortformer Diarizer v2.1](https://huggingface.co/nvidia/diar_streaming_sortformer_4spk-v2.1)
|
| 263 |
```python
|
| 264 |
+
from nemo.collections.asr.models import SortformerEncLabelModel, ASRModel
|
| 265 |
import torch
|
| 266 |
+
# A speaker diarization model is needed for tracking the speech activity of each speaker.
|
| 267 |
diar_model = SortformerEncLabelModel.from_pretrained("nvidia/diar_streaming_sortformer_4spk-v2.1")
|
| 268 |
diar_model.eval().to(torch.device("cuda"))
|
| 269 |
|
| 270 |
asr_model = ASRModel.from_pretrained("nvidia/multitalker-parakeet-streaming-0.6b-v1.nemo")
|
| 271 |
asr_model.eval().to(torch.device("cuda"))
|
| 272 |
|
| 273 |
+
# Use the pre-defined dataclass template `MultitalkerTranscriptionConfig` from `multitalker_transcript_config.py`.
|
| 274 |
+
# Configure the diarization model using streaming parameters:
|
|
|
|
|
|
|
| 275 |
from multitalker_transcript_config import MultitalkerTranscriptionConfig
|
| 276 |
from omegaconf import OmegaConf
|
| 277 |
cfg = OmegaConf.structured(MultitalkerTranscriptionConfig())
|
|
|
|
| 280 |
|
| 281 |
diar_model = MultitalkerTranscriptionConfig.init_diar_model(cfg, diar_model)
|
| 282 |
|
| 283 |
+
# Load your audio file into a streaming audio buffer to simulate a real-time audio session.
|
|
|
|
|
|
|
| 284 |
from nemo.collections.asr.parts.utils.streaming_utils import CacheAwareStreamingAudioBuffer
|
| 285 |
|
| 286 |
samples = [{'audio_filepath': cfg.audio_file}]
|
|
|
|
| 291 |
)
|
| 292 |
streaming_buffer.append_audio_file(audio_filepath=cfg.audio_file, stream_id=-1)
|
| 293 |
streaming_buffer_iter = iter(streaming_buffer)
|
| 294 |
+
|
| 295 |
+
# Use the helper class `SpeakerTaggedASR`, which handles all ASR and diarization cache data for streaming.
|
|
|
|
| 296 |
from nemo.collections.asr.parts.utils.multispk_transcribe_utils import SpeakerTaggedASR
|
| 297 |
multispk_asr_streamer = SpeakerTaggedASR(cfg, asr_model, diar_model)
|
| 298 |
|
|
|
|
| 319 |
```
|
| 320 |
|
| 321 |
### Method 2. Use NeMo example file in NVIDIA/NeMo
|
| 322 |
+
|
| 323 |
+
Use [a multitalker streaming ASR example script file](https://github.com/NVIDIA-NeMo/NeMo/blob/main/examples/asr/asr_cache_aware_streaming/speech_to_text_multitalker_streaming_infer.py) in the [NVIDIA NeMo Framework](https://github.com/NVIDIA-NeMo/NeMo) to launch inference.
|
| 324 |
```
|
| 325 |
+
python ${NEMO_ROOT}/examples/asr/asr_cache_aware_streaming/speech_to_text_multitalker_streaming_infer.py \
|
| 326 |
+
asr_model=nvidia/multitalker-parakeet-streaming-0.6b-v1 \ # Multitalker ASR model
|
| 327 |
+
diar_model=nvidia/diar_streaming_sortformer_4spk-v2 \ # Diarization model
|
| 328 |
+
audio_file="/path/to/example.wav" \ # Your audio file for transcription
|
|
|
|
| 329 |
output_path="/path/to/example_output.json" \ # where to save the output seglst file
|
| 330 |
```
|
| 331 |
|
| 332 |
+
Or the `audio_file` argument can be replaced with the `manifest_file` to handle multiple files in batch mode:
|
| 333 |
```
|
| 334 |
+
python ${NEMO_ROOT}/examples/asr/asr_cache_aware_streaming/speech_to_text_multitalker_streaming_infer.py \
|
|
|
|
| 335 |
... \
|
| 336 |
+
manifest_file=example.json \ # NeMo style manifest file
|
| 337 |
... \
|
| 338 |
```
|
| 339 |
|
| 340 |
+
In the `example.json` file, each line is a dictionary containing the following fields:
|
| 341 |
```
|
| 342 |
{
|
| 343 |
"audio_filepath": "/path/to/multispeaker_audio1.wav", # path to the input audio file
|
|
|
|
| 403 |
* All evaluations include overlapping speech.
|
| 404 |
* Collar tolerance is 0s for DIHARD III Eval, and 0.25s for CALLHOME-part2 and CH109.
|
| 405 |
* Post-Processing (PP) can be optimized on different held-out dataset splits to improve diarization performance.
|
| 406 |
+
* Latency is 1.12s with 13+1 lookahead frames.
|
| 407 |
|
| 408 |
+
| **Diarization Model** | **AMI IHM** | **AMI SDM** | **CH109** | **Mixer 6** |
|
| 409 |
+
|-----------------------|-------------|-------------|-----------|-------------|
|
| 410 |
+
| [Streaming Sortformer v2](https://huggingface.co/nvidia/diar_streaming_sortformer_4spk-v2) | 21.26 | 37.44 | 15.81 | 23.81 |
|
| 411 |
|
| 412 |
## References
|
| 413 |
|