-
Notifications
You must be signed in to change notification settings - Fork 0
/
twitter_data_preprocessing.py
123 lines (92 loc) · 3.11 KB
/
twitter_data_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import csv
import re
import string
import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from spellchecker import SpellChecker
from textblob import TextBlob
# Load the raw tweets into the module-level DataFrame ``twitter``; the cleaning
# steps below read and overwrite its "Text" column.
# NOTE(review): the path is absolute and machine-specific -- adjust it before
# running this script on another machine.
twitter = pd.read_csv(
r"C:\Users\pawan_300\Desktop\Project work\ml files\ml project\tweets.csv"
)
# Print the first five rows as a quick sanity check of the load.
print(twitter.head(5))
# Cleaning
def stopword():
    """Drop English stop words from every entry of ``twitter["Text"]``.

    Each text is split on whitespace, tokens found in NLTK's English
    stop-word list are discarded, and the remaining tokens are re-joined
    with single spaces. The comparison is exact (case-sensitive, with any
    attached punctuation still part of the token).

    The module-level ``twitter`` DataFrame is updated in place; nothing is
    returned.
    """
    # Build a set once: membership tests are O(1), whereas the original list
    # was scanned from the start for every single token.
    stop = set(stopwords.words("english"))
    twitter["Text"] = twitter["Text"].apply(
        lambda x: " ".join(word for word in x.split() if word not in stop)
    )  # Stopword removal
def remove():
    """Strip URLs, e-mail addresses, @mentions, #hashtags and $-words.

    The patterns are applied to every entry of ``twitter["Text"]`` in the
    order listed below; each match is replaced by the empty string. The
    order matters: e-mail addresses must be removed before the @mention
    pattern, which would otherwise eat only the part after the ``@``.

    The module-level ``twitter`` DataFrame is updated in place; nothing is
    returned.
    """
    # Raw strings keep the regex escapes intact without relying on Python
    # passing unknown string escapes through (which triggers a warning).
    patterns = (
        re.compile(r"http\S+"),  # for url
        # for email -- NOTE: the character classes are lowercase-only.
        re.compile(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"),
        re.compile(r"@[^\s]+"),  # for username
        re.compile(r"#[^\s]+"),  # for trending # words
        re.compile(r"\$\w*"),  # for "$" plus any word characters after it
    )

    def scrub(text):
        """Apply every pattern to *text* in sequence and return the result."""
        for pattern in patterns:
            text = pattern.sub("", text)
        return text

    # One pass over the column instead of five.
    twitter["Text"] = twitter["Text"].apply(scrub)
# Run the two in-place cleaning passes on twitter["Text"]: stop-word removal
# first, then URL / e-mail / @mention / #hashtag / $-word removal.
# NOTE(review): stop words are filtered before punctuation is stripped further
# down, so a token such as "the," is compared with its comma still attached --
# confirm this ordering is intended.
stopword()
remove()
# Characters to strip from tweets: punctuation, digits, newline, backslash and a
# few non-ASCII characters. The backslash is written explicitly as "\\" so the
# literal no longer relies on an invalid "\*" escape sequence.
punct_num = '''!()-àÂ[]{};:\n,<>./?@#%^"&\\*_~0123456789=\x92\x92\x96\x85+|'"''' # for punctuation
# Deletion table built once from punct_num so each text is cleaned in one pass.
_PUNCT_TABLE = str.maketrans("", "", punct_num)


def punctuation(x):
    """Return *x* with every character listed in ``punct_num`` removed.

    Args:
        x: The text to clean.

    Returns:
        A copy of *x* without the characters in ``punct_num``; all other
        characters (including spaces and ``$``) are kept in their original
        order.
    """
    # str.translate deletes in a single pass, avoiding the quadratic cost of
    # growing a string one character at a time.
    return x.translate(_PUNCT_TABLE)
# Strip the punct_num characters from every tweet.
twitter["Text"] = twitter["Text"].apply(punctuation)
fileName = r"C:\Users\pawan_300\Desktop\Project work\ml files\ml project\slang.txt"
accessMode = "r"
# Parsed slang tables keyed by file path, so each file is read only once
# instead of once per word.
_slang_cache = {}
# Everything except letters, digits, "-", "_" and "." is dropped from a word
# before it is looked up.
_SLANG_KEY_JUNK = re.compile("[^a-zA-Z0-9-_.]")


def _load_slang(path):
    """Return the ``{ABBREVIATION: expansion}`` table parsed from *path*."""
    table = _slang_cache.get(path)
    if table is None:
        table = {}
        with open(path, accessMode, newline="") as handle:
            for row in csv.reader(handle, delimiter="="):
                # Blank or malformed lines yield fewer than two fields; the
                # original indexed them blindly and could raise IndexError.
                if len(row) >= 2:
                    # Later duplicates overwrite earlier ones, matching the
                    # original "last matching row wins" scan.
                    table[row[0]] = row[1]
        _slang_cache[path] = table
    return table


def slang_translator(user_string, slang_file=None):
    """Expand slang abbreviations in *user_string*.

    The text is split on single spaces. Each word is stripped of characters
    other than letters, digits, ``-``, ``_`` and ``.``, upper-cased, and
    looked up in the slang table; on a hit the whole original word is
    replaced by the expansion.

    Args:
        user_string: The text to translate.
        slang_file: Optional path to an ``ABBREVIATION=expansion`` file, one
            entry per line. Defaults to the module-level ``fileName``.

    Returns:
        The text with known abbreviations expanded, words joined by single
        spaces.

    Raises:
        OSError: If the slang file cannot be opened.
    """
    table = _load_slang(fileName if slang_file is None else slang_file)
    words = user_string.split(" ")
    for index, word in enumerate(words):
        key = _SLANG_KEY_JUNK.sub("", word).upper()
        if key in table:
            words[index] = table[key]
    return " ".join(words)
# Expand slang abbreviations in every tweet.
twitter["Text"] = twitter["Text"].apply(slang_translator)
spell = SpellChecker()


def spellcheck(x):
    """Return *x* with words unknown to the spell checker replaced.

    The text is split on whitespace, the tokens the checker does not know
    are collected, and each one is replaced (case-insensitively) by the
    checker's suggested correction.

    Args:
        x: The text to correct.

    Returns:
        The corrected text. Tokens for which the checker offers no
        suggestion are left unchanged.
    """
    corrections = {}
    for word in spell.unknown(x.split()):
        suggestion = spell.correction(word)
        # correction() can return None when it has no candidate; passing
        # None to re.sub as the replacement would raise TypeError.
        if suggestion is not None:
            corrections[word] = suggestion
    for word, suggestion in corrections.items():
        # Escape the word so regex metacharacters in it are taken literally,
        # and anchor it to whitespace / string edges so only whole tokens
        # are rewritten, not fragments inside longer words.
        pattern = r"(?<!\S)" + re.escape(word) + r"(?!\S)"
        # A callable replacement inserts the suggestion verbatim instead of
        # interpreting it as a replacement template.
        x = re.sub(
            pattern,
            lambda _match, replacement=suggestion: replacement,
            x,
            flags=re.IGNORECASE,
        )
    return x
# Spell-correct every tweet.
twitter["Text"] = twitter["Text"].apply(spellcheck)  # this will take some time
# Polarity
def polarity():
    """Score and label the sentiment of every entry in ``twitter["Text"]``.

    Each text is scored with TextBlob's polarity; a score above zero is
    labelled "positive", below zero "negative", and exactly zero "neutral".

    Returns:
        A ``(labels, scores)`` tuple of two lists, both in the same order as
        the rows of ``twitter["Text"]``.
    """
    scores = [TextBlob(text).sentiment.polarity for text in twitter["Text"]]
    labels = [
        "positive" if score > 0 else "negative" if score < 0 else "neutral"
        for score in scores
    ]
    return (labels, scores)
# Compute a sentiment label and polarity score per tweet and attach both to the
# DataFrame as new columns.
pole, temp = polarity()
twitter["Sentiment"] = pole
twitter["Sentiment_score"] = temp
print("Distribution :")
# Histogram of the sentiment labels ("positive" / "negative" / "neutral").
# NOTE(review): no plt.show() or plt.savefig() follows in the visible source, so
# outside an interactive/inline backend the figure may never be displayed --
# confirm how this script is run.
plt.hist(pole, histtype="bar", align="mid")