-
-
Notifications
You must be signed in to change notification settings - Fork 107
/
Social Network Analysis of Tweets Using R
124 lines (109 loc) · 3.11 KB
/
Social Network Analysis of Tweets Using R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# Read file
apple <- read.csv(file.choose(), header = T)
# Build corpus
library(tm)
corpus <- iconv(apple$text, to = "utf-8-mac")
corpus <- Corpus(VectorSource(corpus))
# Clean text
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
cleanset <- tm_map(corpus, removeWords, stopwords('english'))
removeURL <- function(x) gsub('http[[:alnum:]]*', '', x)
cleanset <- tm_map(cleanset, content_transformer(removeURL))
cleanset <- tm_map(cleanset, removeWords, c('aapl', 'apple'))
cleanset <- tm_map(cleanset, gsub,
pattern = 'stocks',
replacement = 'stock')
cleanset <- tm_map(cleanset, stripWhitespace)
# Term document matrix
tdm <- TermDocumentMatrix(cleanset)
tdm <- as.matrix(tdm)
tdm <- tdm[rowSums(tdm)>30,]
tdm[1:10,1:10]
# Network of terms
library(igraph)
tdm[tdm>1] <- 1
termM <- tdm %*% t(tdm)
termM[1:10,1:10]
g <- graph.adjacency(termM, weighted = T, mode = 'undirected')
g
g <- simplify(g)
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# Histogram of node degree
hist(V(g)$degree,
breaks = 100,
col = 'green',
main = 'Histogram of Node Degree',
ylab = 'Frequency',
xlab = 'Degree of Vertices')
# Network diagram
set.seed(222)
plot(g)
plot(g,
vertex.color='green',
vertex.size = 4,
vertex.label.dist = 1.5,
vertex.label = NA)
# Community detection
comm <- cluster_edge_betweenness(g)
plot(comm, g)
prop <- cluster_label_prop(g)
plot(prop, g)
greed <- cluster_fast_greedy(as.undirected(g))
plot(greed, as.undirected(g))
# Hub and authorities
hs <- hub_score(g, weights = NA)$vector
as <- authority_score(g, weights=NA)$vector
par(mfrow=c(1,2))
plot(g, vertex.size=hs*50, main='Hubs',
vertex.label=NA,
vertex.color=rainbow(50))
plot(g, vertex.size=as*30, main='Authorities',
vertex.label=NA,
vertex.color=rainbow(50))
par(mfrow=c(1,1))
# Highlighting degrees
V(g)$label.cex <- 2.2*V(g)$degree / max(V(g)$degree) + 0.3
V(g)$label.color <- rgb(0, 0, .2, .8)
V(g)$frame.color <- NA
egam <- (log(E(g)$weight)+.4) / max(log(E(g)$weight) + .4)
E(g)$color <- rgb(.5, .5, 0, egam)
E(g)$width <- egam
plot(g,
vertex.color='green',
vertex.size = V(g)$degree*.5)
# Network of tweets
tweetM <- t(tdm) %*% tdm
g <- graph.adjacency(tweetM, weighted = T, mode = 'undirected')
V(g)$degree <- degree(g)
g <- simplify(g)
hist(V(g)$degree,
breaks = 100,
col = 'green',
main = 'Histogram of Degree',
ylab = 'Freuqency',
xlab = 'Degree')
# Set labels of vertices to tweet IDs
V(g)$label <- V(g)$name
V(g)$label.cex <- 1
V(g)$label.color <- rgb(.4, 0, 0, .7)
V(g)$size <- 2
V(g)$frame.color <- NA
plot(g, vertex.label=NA, vertex.size=6)
# Delete vertices
egam <- (log(E(g)$weight)+.2)/ max(log(E(g)$weight)+ .2)
E(g)$color <- rgb(.5, .5, 0, egam)
E(g)$width <- egam
g2 <- delete.vertices(g, V(g)[degree(g)<40])
plot(g2,
vertex.label.cex = .9,
vertex.label.color = 'black')
# Delete edges
E(g)$color <- rgb(.5, .5, 0, egam)
E(g)$width <- egam
g3 <- delete.edges(g, E(g)$weight <- 1)
g3 <- delete.vertices(g3, V(g3)[degree(g3)<20])
plot(g3)
apple$text[c(747, 430)]