create_vocab.py
878 B · 38 lines · python Raw
1 #!/usr/bin/env python3
2 import os
3 import json
# Directory whose files are read as vocabulary files; the path is relative
# to the current working directory.
folder_path = "./vocabs"

# Combined result written to the output JSON. Keys are the part of each file
# name before its first dot; values are the dictionaries returned by
# parse_file for that file.
all_dict = {}
7
def parse_file(filename):
    """Build a token-to-id mapping from a vocabulary file.

    The mapping starts with four reserved entries ("<pad>": 0, "<s>": 1,
    "</s>": 2, "<unk>": 3). Each non-blank line of the file then contributes
    its first whitespace-separated field as a token; tokens are numbered
    consecutively from 4 in the order their lines appear. Any further fields
    on a line are ignored.

    Args:
        filename: Path of the vocabulary file to read (decoded as UTF-8).

    Returns:
        A dict mapping each token string to its integer id.

    Note:
        A token that appears on more than one line (including one of the
        reserved tokens) ends up with the id of its last occurrence, and the
        counter still advances for every occurrence.
    """
    dictionary = {
        "</s>": 2,
        "<pad>": 0,
        "<s>": 1,
        "<unk>": 3,
    }
    value = 4  # First id after the reserved entries above.

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding when tokens contain non-ASCII characters.
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            # split() with no arguments already ignores leading/trailing
            # whitespace and yields [] for a blank line.
            fields = line.split()
            if fields:
                dictionary[fields[0]] = value
                value += 1

    return dictionary
26
# Build one vocabulary per regular file in folder_path, keyed by the part of
# the file name before its first dot.
# os.listdir() returns entries in arbitrary order; iterating in sorted order
# makes the outcome reproducible when two files share the same stem (the
# later name in sort order wins).
for filename in sorted(os.listdir(folder_path)):
    filepath = os.path.join(folder_path, filename)
    if not os.path.isfile(filepath):
        # Sub-directories and other non-file entries are not vocabularies.
        continue
    lang = filename.split(".")[0]
    if not lang:
        # Names starting with a dot (e.g. ".gitkeep") have an empty stem and
        # would otherwise be stored under the key "".
        continue
    all_dict[lang] = parse_file(filepath)


output_path = "vocab.json"  # Path of the combined JSON file written below.

# sort_keys=True orders both the language keys and the tokens alphabetically
# in the written file.
with open(output_path, 'w') as output_file:
    json.dump(all_dict, output_file, indent=4, sort_keys=True)
38