README.md
| 1 | --- |
| 2 | language: ko |
| 3 | datasets: |
| 4 | - kresnik/zeroth_korean |
| 5 | tags: |
| 6 | - speech |
| 7 | - audio |
| 8 | - automatic-speech-recognition |
| 9 | license: apache-2.0 |
| 10 | |
| 11 | model-index: |
| 12 | - name: 'Wav2Vec2 XLSR Korean' |
| 13 | results: |
| 14 | - task: |
| 15 | name: Automatic Speech Recognition |
| 16 | type: automatic-speech-recognition |
| 17 | dataset: |
| 18 | name: Zeroth Korean |
| 19 | type: kresnik/zeroth_korean |
| 20 | args: clean |
| 21 | metrics: |
| 22 | - name: Test WER |
| 23 | type: wer |
| 24 | value: 4.74 |
| 25 | - name: Test CER |
| 26 | type: cer |
| 27 | value: 1.78 |
| 28 | |
| 29 | --- |
| 30 | |
| 31 | |
| 32 | ## Evaluation on Zeroth-Korean ASR corpus |
| 33 | |
[Google Colab notebook (Korean)](https://colab.research.google.com/github/indra622/tutorials/blob/master/wav2vec2_korean_tutorial.ipynb)
| 35 | |
```python
| 37 | from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor |
| 38 | from datasets import load_dataset |
| 39 | import soundfile as sf |
| 40 | import torch |
| 41 | from jiwer import wer |
| 42 | |
# Load the pretrained processor (feature extractor + tokenizer) for the Korean model.
processor = Wav2Vec2Processor.from_pretrained("kresnik/wav2vec2-large-xlsr-korean")

# Load the fine-tuned CTC model and move it to the GPU.
model = Wav2Vec2ForCTC.from_pretrained("kresnik/wav2vec2-large-xlsr-korean").to('cuda')

# Zeroth-Korean corpus, "clean" configuration; evaluation uses the test split only.
ds = load_dataset("kresnik/zeroth_korean", "clean")

test_ds = ds['test']
| 50 | |
def map_to_array(batch):
    """Decode the audio file referenced by an example and store the waveform."""
    # sf.read returns (samples, sample_rate); the rate is discarded here —
    # the model is fed assuming 16 kHz audio downstream.
    waveform, _rate = sf.read(batch["file"])
    batch["speech"] = waveform
    return batch
| 55 | |
# Attach a decoded waveform ("speech" column) to every test example.
test_ds = test_ds.map(map_to_array)
| 57 | |
def map_to_pred(batch):
    """Transcribe a batch of waveforms with greedy (argmax) CTC decoding."""
    # Pad the batch to the longest utterance and build model-ready tensors.
    features = processor(batch["speech"], sampling_rate=16000, return_tensors="pt", padding="longest")
    gpu_inputs = features.input_values.to("cuda")

    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = model(gpu_inputs).logits

    # Greedy decode: most likely token per frame, then collapse to text.
    best_ids = torch.argmax(logits, dim=-1)
    batch["transcription"] = processor.batch_decode(best_ids)
    return batch
| 69 | |
# Run batched inference over the test split; drop the raw waveforms from the
# result to keep memory usage down.
result = test_ds.map(map_to_pred, batched=True, batch_size=16, remove_columns=["speech"])

# jiwer.wer(reference, hypothesis): references come from the dataset's "text"
# column, hypotheses from the model transcriptions.
print("WER:", wer(result["text"], result["transcription"]))
| 73 | |
| 74 | ``` |
| 75 | |
| 76 | ### Expected WER: 4.74% |
| 77 | ### Expected CER: 1.78% |