create_vocab.py
| 1 | #!/usr/bin/env python3 |
| 2 | import os |
| 3 | import json |
# Directory that is scanned for vocabulary files (relative to the current
# working directory).
folder_path = "./vocabs"

# Maps each language key (the file name up to its first ".") to the
# token-to-id dictionary built from that file; filled in by the loop below.
all_dict = {}
| 7 | |
def parse_file(filename):
    """Build a token-to-id mapping from a whitespace-delimited vocabulary file.

    Four special tokens are pre-assigned fixed ids: ``<pad>`` = 0,
    ``<s>`` = 1, ``</s>`` = 2 and ``<unk>`` = 3. Every non-blank line of the
    file then contributes its first whitespace-separated field as a token;
    any further fields on the line are ignored. Tokens receive consecutive
    ids starting at 4, in file order.

    A token that appears on more than one line, or that is spelled like one
    of the special tokens, ends up with the id of its last occurrence because
    later assignments overwrite earlier ones.

    Args:
        filename: Path of the vocabulary file to read.

    Returns:
        A dict mapping each token string to its integer id.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    dictionary = {
        "</s>": 2,
        "<pad>": 0,
        "<s>": 1,
        "<unk>": 3,
    }
    value = 4

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    # NOTE(review): assumes the vocabulary files are UTF-8 -- confirm.
    with open(filename, "r", encoding="utf-8") as file:
        for line in file:
            # split() with no argument already discards leading/trailing
            # whitespace and yields [] for a blank line.
            fields = line.split()
            if fields:
                dictionary[fields[0]] = value
                value += 1

    return dictionary
| 26 | |
# Parse every regular file in the vocabulary folder. The language key is the
# part of the file name before its first ".". Entries are visited in sorted
# order because os.listdir() returns them in arbitrary order; sorting makes
# the winner deterministic when several files share a language key.
for filename in sorted(os.listdir(folder_path)):
    filepath = os.path.join(folder_path, filename)
    lang = filename.split(".")[0]
    # Skip anything that is not a regular file, and dotfiles (for example
    # ".DS_Store") whose language key would be the empty string.
    if not lang or not os.path.isfile(filepath):
        continue
    all_dict[lang] = parse_file(filepath)


output_path = "vocab.json"  # Change this to write the result elsewhere.

# Dump all vocabularies as one JSON object keyed by language. sort_keys=True
# orders both the languages and the tokens alphabetically in the output.
with open(output_path, "w", encoding="utf-8") as output_file:
    json.dump(all_dict, output_file, indent=4, sort_keys=True)
| 38 | |