-
Notifications
You must be signed in to change notification settings - Fork 7
/
produce_txt.py
31 lines (27 loc) · 940 Bytes
/
produce_txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import csv
from nltk.tokenize import sent_tokenize, word_tokenize
# Dataset split being processed; the input CSV is named after it.
goal = 'test'
data_path = f'{goal}_ai2_gorc.csv'
def process(text):
    """Split *text* into sentences and return them as a list.

    Thin wrapper around NLTK's ``sent_tokenize``; one list element per
    detected sentence, in document order.
    """
    return [sentence for sentence in sent_tokenize(text=text)]
# Stream each CSV row, sentence-split the two text columns, and emit:
#   .txt   — one sentence per line, all rows concatenated
#   .index — two lines per row: the sentence count of each text column,
#            so the flat .txt stream can be re-partitioned later.
# Fix over the original: the output handles were opened with bare open()
# and never closed; `with` guarantees they are flushed and closed even
# if a row fails to parse.
with open('_{}_ai2_gorc.txt'.format(goal), 'w') as f, \
        open('_{}_ai2_gorc.index'.format(goal), 'w') as g, \
        open(data_path) as csv_file:
    reader = csv.reader(csv_file, quotechar='"')
    for idx, line in enumerate(reader):
        # Strip stray SOH (\001) control characters left by the upstream export.
        line[1] = line[1].replace('\001', '')
        line[2] = line[2].replace('\001', '')
        # Parsed but unused downstream; kept so a non-integer label column
        # still fails loudly here rather than being silently ignored.
        label = int(line[0])
        text_1 = process(line[1])
        text_2 = process(line[2])
        idx_1 = len(text_1)
        idx_2 = len(text_2)
        g.write('{}\n'.format(idx_1))
        g.write('{}\n'.format(idx_2))
        for sentence in text_1:
            f.write(sentence + '\n')
        for sentence in text_2:
            f.write(sentence + '\n')