# -*- coding: utf-8 -*-
"""nlp_proj_utils.py: shared helpers for the NLP project.

Covers dataset/model downloading, GloVe loading, emoji dataset utilities,
one-hot encoding, and training-history plotting.
"""
import csv
import os
import tarfile
from functools import partial
from typing import Dict, NamedTuple, Tuple

import emoji
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wget
from pandas import DataFrame

DataMeta = NamedTuple("DataMeta", [("name", str), ("url", str)])

_IMDB = DataMeta(
    name="imdb", url="https://www.dropbox.com/s/l9pj9hy2ans3phi/imdb.tar.gz?dl=1"
)
_GLOVE = DataMeta(
    name="glove", url="https://www.dropbox.com/s/g5pkso42wq2ipti/glove.tar.gz?dl=1"
)
_EMOJI_MODEL = DataMeta(
    name="emoji_model_best",
    url="https://www.dropbox.com/s/93frkdrhs33ihfz/emoji_model_best.tar.gz?dl=1",
)

def _download_data_if_needed(data_meta: DataMeta) -> str:
    """
    Download and extract the dataset if it is not already present,
    then return the path to the extracted data.
    """
    path = os.path.join("resources", data_meta.name)
    zip_path = path + ".tar.gz"
    if os.path.exists(path):
        print("data already available, skip downloading.")
    else:
        os.makedirs("resources", exist_ok=True)  # wget needs the target dir to exist
        print("start downloading...")
        wget.download(data_meta.url, zip_path)
        print("start extracting compressed files...")
        with tarfile.open(zip_path) as tar:
            tar.extractall("resources")
        os.remove(zip_path)
        print("data files are now available at %s" % path)
    return path

def download_best_emoji_model() -> Tuple[str, str]:
    """
    Download the pretrained emoji model (skipped if already downloaded)
    and return the paths to the network JSON and the weights file.
    """
    path = _download_data_if_needed(_EMOJI_MODEL)
    return os.path.join(path, "network.json"), os.path.join(path, "weights.h5")
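
# The helper below is an illustrative sketch, not part of the original module:
# it shows one way to load the files returned above, assuming TensorFlow/Keras
# is installed (the network.json + weights.h5 pair matches Keras's
# model_from_json / load_weights convention).
def _load_emoji_model_sketch():
    """Illustrative only: load the downloaded emoji model with Keras."""
    from tensorflow.keras.models import model_from_json

    json_path, weights_path = download_best_emoji_model()
    with open(json_path) as f:
        model = model_from_json(f.read())
    model.load_weights(weights_path)
    return model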

def _get_train_test_df(data_meta: DataMeta) -> Tuple[DataFrame, DataFrame]:
    path = _download_data_if_needed(data_meta)
    train, test = tuple(
        pd.read_csv(os.path.join(path, file)) for file in ["train.csv", "test.csv"]
    )
    print("{} loaded successfully.".format(data_meta.name))
    return train, test


get_imdb_dataset = partial(_get_train_test_df, _IMDB)
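
# Illustrative usage (assumes the archive's train.csv/test.csv carry the
# columns the project expects; nothing here asserts their names):
#
#   train_df, test_df = get_imdb_dataset()
#   print(train_df.shape, test_df.shape)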

def load_glove_vecs() -> Tuple[Dict[str, int], Dict[str, np.ndarray]]:
    """
    Download (if necessary) and read GloVe. Two mappings are returned:
    1. word to index: maps a word to its index in the vocabulary
       (needed for building an Embedding layer in Keras);
    2. word to vector: maps a word to its GloVe vector.
    """
    path = _download_data_if_needed(_GLOVE)
    path = os.path.join(path, "glove.6B.50d.txt")
    print("loading glove... this may take a while...")
    with open(path, encoding="utf-8") as f:
        words = set()
        word_to_vec = {}
        for line in f:
            line_components = line.strip().split()
            curr_word = line_components[0]
            words.add(curr_word)
            word_to_vec[curr_word] = np.array(line_components[1:], dtype=np.float64)
    # Indices start at 1 so that 0 stays free for padding / out-of-vocabulary.
    word_to_index = {}
    for i, w in enumerate(sorted(words), start=1):
        word_to_index[w] = i
    print("glove loaded successfully.")
    return word_to_index, word_to_vec
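
# The sketch below shows one way to feed the two mappings into a Keras
# Embedding layer, as the docstring above suggests. The helper name and the
# TensorFlow dependency are assumptions, not part of the original module.
def _build_glove_embedding_layer_sketch(word_to_index, word_to_vec):
    """Illustrative only: build a frozen Keras Embedding layer from GloVe."""
    from tensorflow.keras.layers import Embedding

    vocab_size = len(word_to_index) + 1  # +1 because indices start at 1
    emb_dim = next(iter(word_to_vec.values())).shape[0]  # 50 for glove.6B.50d
    emb_matrix = np.zeros((vocab_size, emb_dim))
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec[word]
    layer = Embedding(vocab_size, emb_dim, trainable=False)
    layer.build((None,))  # build before set_weights on a fresh layer
    layer.set_weights([emb_matrix])
    return layer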

def load_emoji():
    """Load the emoji train/test splits from resources/emoji/."""

    def _read_csv(filename):
        phrases = []
        labels = []  # renamed from `emoji` to avoid shadowing the emoji module
        with open(filename, encoding="utf-8") as csv_data_file:
            csv_reader = csv.reader(csv_data_file)
            for row in csv_reader:
                phrases.append(row[0])
                labels.append(row[1])
        X = np.asarray(phrases)
        Y = np.asarray(labels, dtype=int)
        return X, Y

    train_x, train_y = _read_csv("resources/emoji/train.csv")
    test_x, test_y = _read_csv("resources/emoji/test.csv")
    # Note the return order: both input arrays first, then both label arrays.
    return train_x, test_x, train_y, test_y

def plot_history(his, metrics):
    """
    Given a history object returned from `fit` and the names of metrics,
    plot each metric's curve against the number of epochs.
    """
    for metric in metrics:
        plt.plot(his.history[metric], label=metric)
    plt.legend()
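
# Illustrative usage with a Keras History object (the metric names depend on
# how the model was compiled; "loss"/"val_loss" are the common defaults):
#
#   history = model.fit(X_train, Y_train, validation_split=0.1, epochs=20)
#   plot_history(history, ["loss", "val_loss"])
#   plt.show()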

def convert_to_one_hot(Y, C):
    # Row-index an identity matrix: row Y[i] of np.eye(C) is exactly the
    # one-hot encoding of class Y[i].
    Y = np.eye(C)[Y.reshape(-1)]
    return Y
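
# Worked example: convert_to_one_hot(np.array([1, 0, 3]), C=4) returns
#   [[0., 1., 0., 0.],
#    [1., 0., 0., 0.],
#    [0., 0., 0., 1.]]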

EMOJI_DICT = {
    "0": "\u2764\uFE0F",  # :heart: may render black instead of red depending on the font
    "1": ":baseball:",
    "2": ":smile:",
    "3": ":disappointed:",
    "4": ":fork_and_knife:",
}

def label_to_emoji(label):
    """Convert an integer label (0-4) into its emoji character."""
    return emoji.emojize(EMOJI_DICT[str(label)], language="alias")

def sentences_to_indices(X, word_to_index, max_len, oov=0):
    """
    Return an array of word indices for each given sentence. Each sentence
    is trimmed/padded to max_len.
    Args:
        X (np.ndarray): Input array of sentences of shape (m,), where m is the
            number of sentences and each sentence is a str.
            Example X: array(['Sentence 1', 'Sentence 2'])
        word_to_index (dict[str, int]): map from a word to its index in the vocabulary
        max_len (int): length to which every sentence is trimmed/padded
        oov (int): index used for out-of-vocabulary words
    Return:
        indices (np.ndarray): array of shape (m, max_len)
    """
    m = X.shape[0]
    X_indices = np.zeros((m, max_len))  # zeros double as padding
    for i in range(m):
        # Trim to max_len so long sentences do not overflow the row.
        sentence_words = X[i].lower().split()[:max_len]
        for j, w in enumerate(sentence_words):
            X_indices[i, j] = word_to_index.get(w, oov)
    return X_indices
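
# Illustrative usage (the sentences are made up; the exact indices depend on
# the GloVe vocabulary loaded by load_glove_vecs):
#
#   word_to_index, word_to_vec = load_glove_vecs()
#   X = np.array(["i love food", "baseball is fun"])
#   X_indices = sentences_to_indices(X, word_to_index, max_len=10)
#   # shape (2, 10); out-of-vocabulary words map to `oov`, padding stays 0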