-
Notifications
You must be signed in to change notification settings - Fork 2
/
preprocessing.py
75 lines (67 loc) · 2.68 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import pandas as pd
import re
import string
from string import punctuation
from nltk.corpus import stopwords
import json
import numpy as np
class Preprocessing:
    """Cleaning helpers for tweet text and for polygon coordinate strings."""

    def __init__(self):
        print("Initializing preprocessing...")

    def clean_text(self, text):
        """Drop every whitespace-delimited token containing '@', 'http' or '#'.

        Whitespace between the surviving tokens is left as it was; only the
        ends of the result are stripped.

        Args:
            text: Raw text. ``None`` is treated as empty; any other value is
                converted with ``str()``.

        Returns:
            The cleaned text, or ``''`` when nothing is left.
        """
        if text is None:
            return ''

        def _keep_or_drop(match):
            token = match.group()
            if '@' in token:
                print("removing " + token)
                return ''
            if 'http' in token or '#' in token:
                return ''
            return token

        # Substitute token by token. A plain str.replace(token, '') also hits
        # longer tokens that merely start with the same characters (removing
        # "@bob" used to turn "@bobby" into "by").
        return re.sub(r'\S+', _keep_or_drop, str(text)).strip()

    def clean_csv(self, csv_file, output_file="data_cleaned.csv"):
        """Clean the ``text`` column of a CSV file and write it back out.

        Rows containing any missing value are dropped before cleaning. The
        output has no header row and keeps the original row index as its
        first column.

        Args:
            csv_file: Path (or buffer) accepted by ``pandas.read_csv``; must
                contain a ``text`` column.
            output_file: Destination path; defaults to ``data_cleaned.csv``
                in the current working directory.
        """
        data = pd.read_csv(csv_file)
        data = data.dropna(axis=0, how='any')
        cleaned_text = data['text'].apply(self.clean_text)
        data_cleaned = pd.DataFrame(data={'text': cleaned_text})
        data_cleaned.to_csv(output_file, header=False, index=True, encoding="utf-8")

    def processTweet(self, tweet):
        """Normalize a tweet into lower-case words separated by single spaces.

        Removes HTML entities, @mentions, $tickers, URLs, hashtags,
        punctuation, words of one or two characters and characters outside
        the Basic Multilingual Plane (e.g. most emoji).

        Args:
            tweet: Raw tweet text.

        Returns:
            The normalized tweet with no leading or trailing whitespace.
        """
        tweet = re.sub(r'&\w*;', '', tweet)        # HTML entities such as &amp;
        tweet = re.sub(r'@\S+', '', tweet)         # @mentions
        tweet = re.sub(r'\$\w*', '', tweet)        # $tickers
        tweet = tweet.lower()
        # Non-greedy by construction: stop at the first whitespace so text
        # after the URL survives, and URLs without a path are matched too.
        tweet = re.sub(r'https?://\S+', '', tweet)
        tweet = re.sub(r'#\w*', '', tweet)         # hashtags
        # Escape the punctuation so ']', '\\', '^' and '-' are literal members
        # of the character class rather than class syntax.
        punct_class = '[' + re.escape(punctuation.replace('@', '')) + ']+'
        tweet = re.sub(punct_class, ' ', tweet)
        tweet = re.sub(r'\b\w{1,2}\b', '', tweet)  # words of 1-2 characters
        # Drop non-BMP characters before tidying whitespace, otherwise the
        # gaps they leave behind end up in the result.
        tweet = ''.join(c for c in tweet if c <= '\uFFFF')
        tweet = re.sub(r'\s\s+', ' ', tweet)
        return tweet.strip()

    def text_process(self, raw_text):
        """Tokenize text into lower-case words without punctuation or stopwords.

        Args:
            raw_text: Text to tokenize.

        Returns:
            List of lower-cased words that are not NLTK English stopwords.
        """
        # Load the stopword list once per call, as a set, instead of once per
        # word as a list.
        stop_words = set(stopwords.words('english'))
        nopunc = ''.join(ch for ch in raw_text if ch not in string.punctuation)
        return [word for word in nopunc.lower().split() if word not in stop_words]

    def geo_mean(self, x):
        """Average the first four points of the first ring in a JSON string.

        Args:
            x: JSON text of a nested list indexed as ``[ring][point]``.
                Presumably a bounding-box polygon -- confirm with the caller.

        Returns:
            numpy array holding the mean point, rounded to 6 decimals.
        """
        ring = np.asarray(json.loads(x))[0]
        first_pair_mean = np.add(ring[0], ring[1]) / 2
        second_pair_mean = np.add(ring[2], ring[3]) / 2
        mean_geoloc = np.add(first_pair_mean, second_pair_mean) / 2
        print("HASIL GEOMEAN " + str(mean_geoloc))
        return np.around(mean_geoloc, decimals=6)

    def find_centroid(self, row):
        """Return the mean (latitude, longitude) of all points in ``row``.

        Args:
            row: String holding a Python literal shaped as a list of rings,
                each ring a list of points whose first item is read as
                longitude and second item as latitude.

        Returns:
            ``(mean_latitude, mean_longitude)``, or ``None`` when the row
            cannot be evaluated or processed (the error is printed).
        """
        try:
            # SECURITY: eval() executes arbitrary code. Only safe while `row`
            # comes from a trusted source; consider ast.literal_eval or
            # json.loads if the data can be attacker-controlled.
            rings = eval(row)
            points = [point for ring in rings for point in ring]
            longitude = [p[0] for p in points]
            latitude = [p[1] for p in points]
            # Result order is (lat, lng).
            hasil = (sum(latitude) / float(len(latitude)),
                     sum(longitude) / float(len(longitude)))
            print(hasil)
            return hasil
        except Exception as e:
            # Deliberate best-effort: report the problem and signal "no centroid".
            print(e)
            return None