import pandas as pd
import spacy
from spacy_arguing_lexicon import ArguingLexiconParser

# Download the NRC emotion lexicon from http://sentiment.nrc.ca/lexicons-for-research/NRC-Emotion-Lexicon.zip,
# unzip it, and put it under "lexicon" in the same folder as this file, OR
# put it wherever you want and pass the full path to the word-level file
# ("NRC-Emotion-Lexicon-Wordlevel-v0.92.txt") via the 'filepath' parameter of load_nrc_emotions().
NRC_FILEPATH = "lexicon/NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"

def load_nrc_emotions(filepath=NRC_FILEPATH):
    # each line of the word-level lexicon is "word<TAB>emotion<TAB>association" (0 or 1)
    emolex_df = pd.read_csv(filepath, names=["word", "emotion", "association"], sep='\t')
    # pivot to one row per word, one column per emotion/sentiment category
    emolex_words = emolex_df.pivot(index='word', columns='emotion', values='association').reset_index()
    emolex_words.set_index(['word'], inplace=True)
    return emolex_df, emolex_words
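
# Quick sanity check once the file is in place ('abandon' is an illustrative
# entry; the exact rows depend on your copy of the lexicon):
#
#   _, nrc_words = load_nrc_emotions()
#   nrc_words.loc['abandon']   # 0/1 association flags, one column per category
#   list(nrc_words.columns)    # 8 emotions plus 'negative' and 'positive'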

def count_lexicons_one(text, categories, lexicon_words_df):
    # one counter per category, plus running totals for the coverage ratio
    counts_dict = {}
    total_words = 0
    found = 0
    for category in categories:
        counts_dict[category] = 0
    for word in text.split():
        total_words += 1
        if word in lexicon_words_df.index:
            # count the matched word once, then add its score to every category
            # (the original incremented 'found' inside the category loop, which
            # inflated the ratio by a factor of len(categories))
            found += 1
            for category in categories:
                counts_dict[category] += lexicon_words_df.loc[word][category]
    # share of words that appear in the lexicon at all
    counts_dict['ratio'] = round(float(found) / float(total_words), 5)
    return counts_dict
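
# Example, assuming the frames from load_nrc_emotions (values depend on the
# lexicon version, so the output shown is only schematic):
#
#   _, nrc_words = load_nrc_emotions()
#   count_lexicons_one("the crowd cheered", ['joy', 'fear'], nrc_words)
#   # -> {'joy': ..., 'fear': ..., 'ratio': ...}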

def _count_nrc_emotions_and_sentiments(row, emotions, sentiments, nrc_df,
                                       text_column='text', prefix='nrc_'):
    text = row[text_column]
    emotions_count_dict = count_lexicons_one(text, emotions, nrc_df)
    sentiments_count_dict = count_lexicons_one(text, sentiments, nrc_df)
    # note: both dicts carry a 'ratio' key, so the sentiment pass wins for nrc_ratio
    for k, v in emotions_count_dict.items():
        row[prefix + k] = v
    for k, v in sentiments_count_dict.items():
        row[prefix + k] = v
    return row

# 'df': the dataframe that contains the data
# 'text_column': the name of the column that contains the text to analyze
# 'path': the path to the NRC lexicon. Default: NRC_FILEPATH (see above)
# 'prefix': the prefix to attach to the result column names. Default: 'nrc_'
def count_nrc_emotions_and_sentiments(df, text_column='text', path=NRC_FILEPATH, prefix='nrc_'):
    nrc_df, nrc_words_df = load_nrc_emotions(path)
    # the lexicon mixes 8 emotions with 2 sentiment polarities; split them apart
    emotions = list(nrc_df.emotion.unique())
    emotions.remove('positive')
    emotions.remove('negative')
    sentiments = ['negative', 'positive']
    result_df = df.copy()
    result_df = result_df.apply(_count_nrc_emotions_and_sentiments, axis=1,
                                args=(emotions, sentiments, nrc_words_df, text_column, prefix))
    return result_df
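
# A short usage sketch (the frame below is illustrative):
#
#   articles = pd.DataFrame({'text': ["The verdict sparked outrage.",
#                                     "Volunteers celebrated the win."]})
#   scored = count_nrc_emotions_and_sentiments(articles)
#   # adds nrc_anger, nrc_fear, ..., nrc_negative, nrc_positive and nrc_ratio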

# MPQA subjectivity. Download OpinionFinder v2.0 and run it over a doclist of
# your documents, e.g.:
#
#   cd ~/lexicons/opinionfinderv2.0
#   java -Xmx1g -classpath ./lib/weka.jar:./lib/stanford-postagger.jar:opinionfinder.jar \
#       opin.main.RunOpinionFinder {path}/corpus.doclist -d
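
# A hedged sketch of preparing OpinionFinder's input: one plain-text file per
# document plus a doclist of their paths. The directory and file names here are
# assumptions chosen to match the command above, not fixed by OpinionFinder.
def write_opinionfinder_doclist(df, text_column='text', corpus_dir='corpus',
                                doclist_path='corpus.doclist'):
    import os
    os.makedirs(corpus_dir, exist_ok=True)
    with open(doclist_path, 'w') as doclist:
        for doc_id, row in df.iterrows():
            # the row index becomes the document id, matching row.name below
            doc_path = os.path.join(corpus_dir, '{}.txt'.format(doc_id))
            with open(doc_path, 'w') as f:
                f.write(str(row[text_column]))
            doclist.write(doc_path + '\n')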

# Parse OpinionFinder's output: per-document sentence subjectivity annotations.
def apply_mpqa_sentences_subjectivity(row, path='annotations/opinionfinder'):
    # OpinionFinder writes its annotations next to each input document, under
    # <doc_id>.txt_auto_anns/; sent_subj.txt labels each sentence subj or obj
    opinion_finder_output = '{}/{}.txt_auto_anns/'
    sentence_subjectivity_file_path = opinion_finder_output + 'sent_subj.txt'
    # the row index is the document id used when the corpus files were written
    doc_id = row.name
    annotation_path = sentence_subjectivity_file_path.format(path, doc_id)
    annotations = pd.read_csv(annotation_path, header=None, sep='\t', names=['id', 'sent_polarity'])
    # count objective and subjective sentences for this document
    counts_dic = annotations['sent_polarity'].value_counts()
    for key in counts_dic.keys():
        row['mpqa_subjobj_' + key] = float(counts_dic[key])
    return row

def count_mpqa_subj_obj(df, annotations_path='annotations/opinionfinder'):
    df = df.apply(apply_mpqa_sentences_subjectivity, args=(annotations_path,), axis=1)
    return df
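
# Usage sketch, assuming OpinionFinder has already been run so the *_auto_anns
# folders sit under 'annotations/opinionfinder' (the 'subj'/'obj' labels are
# what sent_subj.txt typically contains):
#
#   scored = count_mpqa_subj_obj(articles)
#   # adds per-row sentence counts such as mpqa_subjobj_subj / mpqa_subjobj_obj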

def load_arg_lexicon():
    arg_lexicon_labels = ['wants', 'contrast', 'assessments', 'doubt',
                          'authority', 'emphasis', 'necessity', 'causation',
                          'generalization', 'structure', 'conditionals',
                          'inconsistency', 'possibility', 'priority', 'difficulty',
                          'inyourshoes', 'rhetoricalquestion']
    # the "en" shorthand and add_pipe with a component instance are spaCy 2.x
    # API, which is what spacy_arguing_lexicon was written against
    arg_lex_nlp = spacy.load("en")
    arg_lex_nlp.add_pipe(ArguingLexiconParser(lang=arg_lex_nlp.lang))
    return arg_lex_nlp, arg_lexicon_labels

def _count_mpqa_arg(row, arg_lex_nlp, arg_lexicon_labels, text_column, prefix='mpqa_arg_'):
    text = row[text_column]
    doc = arg_lex_nlp(text)
    arguments = list(doc._.arguments.get_argument_spans_and_matches())
    total = len(arguments)
    # initialize every label so rows with no matches still get all the columns
    for label in arg_lexicon_labels:
        row[prefix + label] = 0
    # one count per matched span, filed under the span's lexicon label
    for arg in arguments:
        arg_span = arg[0]
        row[prefix + arg_span.label_] += 1
    # matched argument spans relative to the document's word count
    row[prefix + 'lexicon_ratio'] = round(float(total) / float(len(text.split())), 5)
    return row

def count_mpqa_arg(df, text_column='text', prefix='mpqa_arg_'):
    arg_lex_nlp, arg_lexicon_labels = load_arg_lexicon()
    result = df.apply(_count_mpqa_arg, axis=1, args=(arg_lex_nlp, arg_lexicon_labels, text_column, prefix))
    return result
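
# A minimal end-to-end demo, hedged: it assumes the NRC lexicon file and a
# spaCy 2.x English model ("en") are in place; the two example texts are
# purely illustrative.
if __name__ == '__main__':
    demo = pd.DataFrame({'text': ["The verdict sparked outrage and fear.",
                                  "Volunteers joyfully celebrated the victory."]})
    demo = count_nrc_emotions_and_sentiments(demo)  # nrc_* emotion/sentiment counts
    demo = count_mpqa_arg(demo)                     # mpqa_arg_* arguing-lexicon counts
    print(demo.filter(regex='^(nrc_|mpqa_arg_)').head())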