-
Notifications
You must be signed in to change notification settings - Fork 0
/
labelled_data_processing.py
95 lines (55 loc) · 2.39 KB
/
labelled_data_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# This file transforms the original data from the DEFT tweet classification challenge into a csv file
# this csv contains : the id of the tweet, its polarity (pos, neg, neutral or mixt), the nature of the tweets and a finer classification of the sentiment or emotion
import pandas as pd
# opening the original files
def from_text_to_csv(in_path, out_path):
# simple functions which turns a space separated text file into a csv
txt_file = open(in_path, 'r')
out_file = open(out_path, 'w')
txt_file.readline()
for line in txt_file:
(id, el) = line.strip().split('\t')
out_file.write(','.join([id, el]) + '\n')
txt_file.close()
out_file.close()
polarity = open('Data/labelled_data/Test_References/T1.txt', 'r')
polarity_csv = open('Data/labelled_data/Test_References/test_polarity.csv', 'w')
polarity.readline() # skip first line
for line in polarity:
(id, pol) = line.strip().split('\t')
polarity_csv.write(','.join([id, pol]) + '\n')
polarity.close()
polarity_csv.close()
#polarity = pd.read_csv('Data/Train_References-22042015/Train_References/T1.csv')
#print(polarity)
nature = open('Data/labelled_data/Test_References/test_nature.txt', 'r')
nature_csv = open('Data/labelled_data/Test_References/test_nature.csv', 'w')
nature.readline() # skip first line
for line in nature:
print(line.strip().split('\t'))
(id, nat) = line.strip().split('\t')
nature_csv.write(','.join([id, nat]) + '\n')
nature.close()
nature_csv.close()
from_text_to_csv('Data/labelled_data/Test_References/test_emotion.txt', 'Data/labelled_data/Test_References/test_emotion.csv')
polarity = pd.read_csv('Data/labelled_data/Test_References/test_polarity.csv')
print(polarity)
nature = pd.read_csv('Data/labelled_data/Test_References/test_nature.csv')
print(nature)
emotion = pd.read_csv('Data/labelled_data/Test_References/test_emotion.csv')
print(emotion)
"""data_list = []
tweet_list = open('Data/TRAIN_TWEETS_ID-03042015/TRAIN_TWEETS_ID.txt')
print(tweet_list)
for tweet in tweet_list:
pol = polarity[polarity[] == tweet].to_dict('records')
print(pol)
#nat =
#em ="""
polarity.columns = ['id', 'polarity']
nature.columns = ['id', 'nature']
emotion.columns = ['id', 'emotion']
all_data = pd.merge(polarity, nature, how='outer', on='id')
all_data = pd.merge(all_data, emotion, how='outer', on='id')
print(all_data)
all_data.to_csv('labelled_test_data.csv')