-
Notifications
You must be signed in to change notification settings - Fork 39
/
preprocessing.py
31 lines (23 loc) · 1015 Bytes
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import os
class TSVGenerator(object):
    """Merge a per-phase corpus file and label file into one TSV file.

    For a phase ``p`` the generator reads ``<root_dir>/p.sen`` and
    ``<root_dir>/p.lab`` line by line and writes ``<root_dir>/p.tsv``, where
    each output line is ``<corpus line>\\t<label line>``.
    """

    def __init__(self, root_dir='data'):
        """Remember the data directory and build the input path templates.

        Args:
            root_dir: Directory containing the ``.sen`` / ``.lab`` inputs; the
                ``.tsv`` output is written to the same directory.
        """
        self.root_dir = root_dir
        self.phase = ['train', 'dev', 'test']
        # "{}" is filled in with the phase name on each call.
        self.corpus_path = os.path.join(root_dir, "{}.sen")
        self.label_path = os.path.join(root_dir, "{}.lab")

    def __call__(self, phase):
        """Write ``<root_dir>/<phase>.tsv`` from the phase's two input files.

        Args:
            phase: One of the names in ``self.phase``.

        Raises:
            AssertionError: If ``phase`` is not a known phase.
            OSError: If an input file cannot be opened or the output cannot
                be written.
        """
        # Raised explicitly (rather than via ``assert``) so the check still
        # runs under ``python -O``; the exception type and message are the
        # same as before, so existing callers are unaffected.
        if phase not in self.phase:
            raise AssertionError('Unable phase')

        corpus_path = self.corpus_path.format(phase)
        label_path = self.label_path.format(phase)
        output_path = os.path.join(self.root_dir, f'{phase}.tsv')

        # Inputs are opened before the output, so a missing input fails
        # before any output file is created. The ``with`` block guarantees
        # all three handles are closed, even on error.
        with open(corpus_path, 'r') as corpus, \
                open(label_path, 'r') as label, \
                open(output_path, 'w') as f:
            # zip() stops at the shorter file: surplus lines in the longer
            # input are dropped without error.
            for sen, lab in zip(corpus, label):
                # strip() removes the trailing newline along with any
                # surrounding whitespace.
                f.write(f'{sen.strip()}\t{lab.strip()}\n')
if __name__ == '__main__':
    # Script entry point: build one TSV per phase using the default
    # data directory of TSVGenerator.
    make_tsv = TSVGenerator()
    for split in ('train', 'dev', 'test'):
        make_tsv(split)