-
Notifications
You must be signed in to change notification settings - Fork 0
/
words-pair-extractor.py
56 lines (42 loc) · 1.65 KB
/
words-pair-extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import os
import json
from bs4 import BeautifulSoup
from collections import defaultdict
# Directory containing the HTML input files. It is listed with os.listdir
# below and only entries whose names end in ".html" are parsed. The path is
# relative to the current working directory.
html_folder = './files/'
def extract_words(text):
    """Return ``text`` converted to lowercase.

    Only case normalization is performed at the moment, so the result is a
    single string rather than a list of words.
    """
    # NOTE: additional processing will be added later
    return text.lower()
def break_sentence_into_pairs(sentence):
    """Return hyphen-joined pairs of adjacent words from ``sentence``.

    The text is lowercased and split on whitespace, and every word is paired
    with the word that follows it as ``"first-second"``. When a pair begins
    with the character 'የ' or 'በ', that single leading character is dropped.
    Only the start of the pair (i.e. the first word) is affected; the second
    word is left untouched.

    Args:
        sentence: Text to split into word pairs.

    Returns:
        A list of ``"first-second"`` strings in order of appearance. Input
        with fewer than two words yields an empty list.
    """
    tokens = sentence.lower().split()
    pairs = []
    # Zipping the list against itself shifted by one walks adjacent words
    # without manual index arithmetic.
    for first, second in zip(tokens, tokens[1:]):
        pair = f"{first}-{second}"
        # Tokens produced by split() are never empty, so a pair always has at
        # least three characters and needs no separate length check.
        if pair.startswith(('የ', 'በ')):
            pair = pair[1:]
        pairs.append(pair)
    return pairs
# Number of occurrences of each word pair, accumulated over all parsed files.
word_dict = defaultdict(int)

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore the key order of the JSON output) the same on every run.
for filename in sorted(os.listdir(html_folder)):
    if not filename.endswith(".html"):
        continue
    print(filename)
    filepath = os.path.join(html_folder, filename)

    # Open and parse the HTML file
    with open(filepath, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
    text = soup.get_text()  # Extract text from the HTML

    words = break_sentence_into_pairs(text)
    print(len(words))

    # Update word count dictionary
    for word in words:
        word_dict[word] += 1

# Convert defaultdict to regular dict
word_dict = dict(word_dict)

# Save the word dictionary to a JSON file
output_file = 'outputs/word_pair_dictionary.json'
# Create the output directory first so the write does not fail with
# FileNotFoundError after all the parsing work has already been done.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as json_file:
    json.dump(word_dict, json_file, ensure_ascii=False, indent=4)

print(f"Word pair dictionary saved to {output_file}")