README.md
2.6 KB · 234 lines · markdown Raw
1 ---
2 language:
3 - ab
4 - af
5 - ak
6 - am
7 - ar
8 - as
9 - av
10 - ay
11 - az
12 - ba
13 - bm
14 - be
15 - bn
16 - bi
17 - bo
18 - sh
19 - br
20 - bg
21 - ca
22 - cs
23 - ce
24 - cv
25 - ku
26 - cy
27 - da
28 - de
29 - dv
30 - dz
31 - el
32 - en
33 - eo
34 - et
35 - eu
36 - ee
37 - fo
38 - fa
39 - fj
40 - fi
41 - fr
42 - fy
43 - ff
44 - ga
45 - gl
46 - gn
47 - gu
48 - zh
49 - ht
50 - ha
51 - he
52 - hi
53 - sh
54 - hu
55 - hy
56 - ig
57 - ia
58 - ms
59 - is
60 - it
61 - jv
62 - ja
63 - kn
64 - ka
65 - kk
66 - kr
67 - km
68 - ki
69 - rw
70 - ky
71 - ko
72 - kv
73 - lo
74 - la
75 - lv
76 - ln
77 - lt
78 - lb
79 - lg
80 - mh
81 - ml
82 - mr
83 - ms
84 - mk
85 - mg
86 - mt
87 - mn
88 - mi
89 - my
90 - zh
91 - nl
92 - 'no'
93 - 'no'
94 - ne
95 - ny
96 - oc
97 - om
98 - or
99 - os
100 - pa
101 - pl
102 - pt
103 - ms
104 - ps
105 - qu
106 - qu
107 - qu
108 - qu
109 - qu
110 - qu
111 - qu
112 - qu
113 - qu
114 - qu
115 - qu
116 - qu
117 - qu
118 - qu
119 - qu
120 - qu
121 - qu
122 - qu
123 - qu
124 - qu
125 - qu
126 - qu
127 - ro
128 - rn
129 - ru
130 - sg
131 - sk
132 - sl
133 - sm
134 - sn
135 - sd
136 - so
137 - es
138 - sq
139 - su
140 - sv
141 - sw
142 - ta
143 - tt
144 - te
145 - tg
146 - tl
147 - th
148 - ti
149 - ts
150 - tr
151 - uk
152 - ms
153 - vi
154 - wo
155 - xh
156 - ms
157 - yo
158 - ms
159 - zu
160 - za
161 license: cc-by-nc-4.0
162 tags:
163 - mms
164 - wav2vec2
165 - audio
166 - voice
167 - speech
168 - forced-alignment
169 pipeline_tag: automatic-speech-recognition
170 ---
171
172 # Forced Alignment with Hugging Face CTC Models
173 This Python package provides an efficient way to perform forced alignment between text and audio using Hugging Face's pretrained models. It also features an improved implementation that uses much less memory than the TorchAudio forced alignment API.
174
175 The model checkpoint uploaded here is a conversion from torchaudio to HF Transformers of the MMS-300M checkpoint trained on a forced alignment dataset.
176
177 ## Installation
178
179 ```bash
180 pip install git+https://github.com/MahmoudAshraf97/ctc-forced-aligner.git
181 ```
182 ## Usage
183
184 ```python
185 import torch
186 from ctc_forced_aligner import (
187 load_audio,
188 load_alignment_model,
189 generate_emissions,
190 preprocess_text,
191 get_alignments,
192 get_spans,
193 postprocess_results,
194 )
195
196 audio_path = "your/audio/path"
197 text_path = "your/text/path"
198 language = "iso" # ISO-639-3 Language code
199 device = "cuda" if torch.cuda.is_available() else "cpu"
200 batch_size = 16
201
202
203 alignment_model, alignment_tokenizer = load_alignment_model(
204 device,
205 dtype=torch.float16 if device == "cuda" else torch.float32,
206 )
207
208 audio_waveform = load_audio(audio_path, alignment_model.dtype, alignment_model.device)
209
210
211 with open(text_path, "r") as f:
212 lines = f.readlines()
213 text = "".join(line for line in lines).replace("\n", " ").strip()
214
215 emissions, stride = generate_emissions(
216 alignment_model, audio_waveform, batch_size=batch_size
217 )
218
219 tokens_starred, text_starred = preprocess_text(
220 text,
221 romanize=True,
222 language=language,
223 )
224
225 segments, scores, blank_token = get_alignments(
226 emissions,
227 tokens_starred,
228 alignment_tokenizer,
229 )
230
231 spans = get_spans(tokens_starred, segments, blank_token)
232
233 word_timestamps = postprocess_results(text_starred, spans, stride, scores)
234 ```