# -*- coding: utf-8 -*-
"""
fb_embedded_comment_scraper.py
@author: methylDragon
. .
. |\-^-/| .
/| } O.=.O { |\
/´ \ \_ ~ _/ / `\
/´ | \-/ ~ \-/ | `\
| | /\\ //\ | |
\|\|\/-""-""-\/|/|/
______/ /
'------'
_ _ _ ___
_ __ ___| |_| |_ _ _| || \ _ _ __ _ __ _ ___ _ _
| ' \/ -_) _| ' \ || | || |) | '_/ _` / _` / _ \ ' \
|_|_|_\___|\__|_||_\_, |_||___/|_| \__,_\__, \___/_||_|
|__/ |___/
-------------------------------------------------------
github.com/methylDragon
Description:
This script extracts JSON data that Facebook uses to power its embedded comment
platform, picking out comments. (TO A MAXIMUM OF 100 COMMENTS PER LINK!)
It can display nested replies, all in order too!
It writes the parsed data to a .csv file (one per source, comma-delimited) with the columns:
Reply Depth |@@@| Link |@@@| Description |@@@| Total Comments |@@@| Username |@@@| City |@@@| Date-Time |@@@| Comment |@@@| Likes |@@@| Replies
"""
from datetime import datetime
import requests
import json
import csv
from domain_link_scraper import get_domain_links
import sys
import logging
import socket
import time
from timeit import default_timer as timer
# =============================================================================
# SET PARAMETERS HERE!
# =============================================================================
if __name__ == "__main__":
"""
NOTE: Make sure there's no http:// or https:// in the source_URLs list
If you want to crawl something like
"troublesome_site.com/page=<??>/other_stuff"
Write it as a tuple! ("troublesome_site.com/page=", "/other_stuff")
"""
# [[INSERT YOUR DOMAINS HERE!]]
# SG, ASIAPACIFIC, HEALTH, COMMENTARY,
source_URLs = [# [CNA] SG
#("www.channelnewsasia.com/archives/8396078/news?pageNum=","&channelId=7469254"),
# [CNA] ASIA PACIFIC
("www.channelnewsasia.com/archives/8395764/news?pageNum=","&channelId=7469252"),
# [CNA] HEALTH
("www.channelnewsasia.com/archives/8395790/news?pageNum=","&channelId=7469578"),
# [CNA] COMMENTARY
("www.channelnewsasia.com/archives/8550254/news?pageNum=","&channelId=8396306")]
# If you want to pull all pages from a site, ensure the site's page
# structure is numeric (Eg. somesite.com/page/<NUMBER> )
start_page = 0
end_page = 99999
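# Illustrative note (my assumption about how get_domain_links builds page URLs from the
# tuple format described above, not something stated in the original): a source like
# ("site.com/archives/123/news?pageNum=", "&channelId=456") is presumably joined around
# the page number, giving e.g. "site.com/archives/123/news?pageNum=3&channelId=456" for page 3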
# =============================================================================
# SETTING ENVIRONMENT VARIABLES
# =============================================================================
# Timer!
start = timer()
# Set time format depending on your system architecture
# This works for me, but it might not work for you! Make sure to check!
if sys.platform[0:3] == "win":
time_format = "%d %B %Y %H:%M"
else:
time_format = "%b %d, %Y %I:%M%p"
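# For reference, illustrative examples of what each strptime format accepts
# (my own examples, not timestamps taken from Facebook):
# "%d %B %Y %H:%M"    parses strings like "05 March 2018 14:30"
# "%b %d, %Y %I:%M%p" parses strings like "Mar 05, 2018 02:30PM"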
# Agent header for reducing 403 errors (makes you look like a user)
# THIS ONE IS DEPRECATED, but I'll keep it around just in case
# user_agent_header = {'headers':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:51.0) Gecko/20100101 Firefox/51.0'}
# Global comment total
overall_comment_counter = 0
# User agent header setting
user_agent_header = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
# Let's turn off annoying messages, shall we?
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("bs4").setLevel(logging.ERROR)
# This is for sites that don't throw a 404 after you go past their last content
# page. The domain_link_scraper will still return values, but they'll already
# have been parsed.
skip_loop_flag = False
skip_loop_counter = 0
# This is for breaking out of a domain if too many pages were parsed with
# no new comments found. (It's the second layer of safeguarding against
# being stuck in a domain forever)
no_comment_flag = False
no_comment_counter = 0
# MEMOISATION~~~~
parsed_urls = []
skips = 0
# =============================================================================
# Useful Functions
# =============================================================================
# Check for internet
def internet(host="8.8.8.8", port=53, timeout=3):
"""
Host: 8.8.8.8 (google-public-dns-a.google.com)
OpenPort: 53/tcp
Service: domain (DNS/TCP)
"""
try:
socket.setdefaulttimeout(timeout)
socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect((host, port))
return True
except Exception as ex:
print(ex)
return False
# Check JSON title output
def description(output, url):
if output == "":
p = requests.get(url, headers = user_agent_header)
pt = p.text
return pt[pt.find('<title>') + 7 : pt.find('</title>')]
else:
return output
#SWAG
methylDragon = '''
. .
. |\-^-/| .
/| } O.=.O { |\\
/´ \ \_ ~ _/ / `\\
/´ | \-/ ~ \-/ | `\\
| | /\\\ //\\ | |
\|\|\/-""-""-\/|/|/
______/ /
'------'
_ _ _ ___
_ __ ___| |_| |_ _ _| || \ _ _ __ _ __ _ ___ _ _
| ' \/ -_) _| ' \ || | || |) | '_/ _` / _` / _ \ ' \\
|_|_|_\___|\__|_||_\_, |_||___/|_| \__,_\__, \___/_||_|
|__/ |___/
-------------------------------------------------------
github.com/methylDragon\n'''
print(methylDragon)
print("Facebook Embedded Comment Scraper by methylDragon!")
for i in range(5):
print("." * (5 - i))
time.sleep(1)
print("\nINITIALISED!\n")
time.sleep(1)
# =============================================================================
# CORE CODE
# =============================================================================
# This function fetches a single comment from the nested dictionary parsed out of each URL's JSON!
def fetch_comment(comment_id, reply_depth):
user_id = parsed_json[0]["props"]["comments"]["idMap"][str(comment_id)]["authorID"]
# reply_depth stays at 0 unless the comment is a child of a parent comment
try:
username = parsed_json[0]["props"]["comments"]["idMap"][str(user_id)]["name"]
except (KeyError, TypeError):
username = "<ERROR> no_name"
try:
city = parsed_json[0]["props"]["comments"]["idMap"][str(user_id)]["bio"]["stats"]["city"]["name"]
except (KeyError, TypeError):
city = "<ERROR> no_city"
comment_text = parsed_json[0]["props"]["comments"]["idMap"][str(comment_id)]["body"]["text"]
timestamp = parsed_json[0]["props"]["comments"]["idMap"][str(comment_id)]["timestamp"]["text"]
# NOTE: You might have to change the time format depending on your system
parsed_timestamp = datetime.strptime(timestamp, time_format)
likes = parsed_json[0]["props"]["comments"]["idMap"][str(comment_id)]["likeCount"]
try:
replies = parsed_json[0]["props"]["comments"]["idMap"][str(comment_id)]["public_replies"]["totalCount"]
except (KeyError, TypeError):
replies = 0
# We return the reply depth, username, city, timestamp, comment text, like count and reply count
return {"reply_depth": reply_depth * "*", "username": username, "city": city, "date_time": parsed_timestamp, "comment_text": comment_text, "likes": likes, "replies": replies}
# This function appends comments to a comment list, and recursively appends comment replies and their replies to the same list, all in order!
def append_comment(comment_list, comment_id, reply_depth, comment_counter):
# Fetch comment data
comment = fetch_comment(comment_id, reply_depth)
comment_list.append(comment)
# Print user feedback and increase comment counter
comment_counter.append("*" * reply_depth)
#print("\nCompleted analysing comment #", str(len(comment_counter)) + str(" *" * reply_depth), sep="")
#print(comment)
# Track replies for counting total
if reply_depth > 0:
reply_tracker.append("*")
# Check for replies
try:
reply_ids = parsed_json[0]["props"]["comments"]["idMap"][str(comment_id)]["public_replies"]["commentIDs"]
# Increment reply_depth
reply_depth += 1
# Report back if found
print("\nReplies found! New Depth:", reply_depth)
# Append replies in order to main comment list
for reply in reply_ids:
# Recursively do it for replies to replies to replies ad nauseam
append_comment(comment_list, reply, reply_depth, comment_counter)
except (KeyError, TypeError):
if reply_depth > 0:
print("\nEnd of reply thread! Reply Depth returning to:", reply_depth - 1)
return
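# Illustrative example of the resulting ordering (not from the original): for a thread
# A -> [A1 -> [A1a], A2], comment_list ends up as A, A1, A1a, A2 with reply_depth
# markers "", "*", "**", "*" respectively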
# =============================================================================
# LOOP THROUGH EACH STATED DOMAIN URL
# =============================================================================
# For each source (domain) stated
for source in source_URLs:
# Initialise counter values
section_counter = 0 # Unique sections
populated_section_counter = 0 # Useful sections
page_counter = 0 # Pages parsed
comment_counter = 0 # Comments pulled from source
no_comment_counter = 0 # Source Escape counter reset
# Strip http:// and https://
if source[:7] == "http://":
source = source[7:]
elif source[:8] == "https://":
source = source[8:]
# Clean source name (removing invalid characters for the saved file)
if type(source) == str:
cleaned_source = str(source)
else:
cleaned_source = str(source[0])
# Invalid characters for file names here:
rep_chars = ['\\', '/', ':', '*', '?', '"', '<', '>', '|']
for char in rep_chars:
cleaned_source = cleaned_source.replace(char,"_")
# Prepare .csv for each source
f = cleaned_source + '.csv'
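# Worked example (mine, not from the original): the [CNA] ASIA PACIFIC tuple above should
# yield f == "www.channelnewsasia.com_archives_8395764_news_pageNum=.csv"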
# Open .csv for the current source
with open(f, 'w', encoding='utf-8', newline="") as file:
writer = csv.writer(file, delimiter=",")
writer.writerow("Reply Depth|@@@|Link|@@@|Description|@@@|Total Comments|@@@|Username|@@@|City|@@@|Date-Time|@@@|Comment|@@@|Likes|@@@|Replies".split(sep="|@@@|"))
# =============================================================================
# LOOP THROUGH EACH LIST OF URLs IN EACH PAGE OF THE DOMAIN
# =============================================================================
# Generate url_lists for every page in source
# get_domain_links is a generator function!
for url_list, current_page in get_domain_links(source, start_page, end_page):
# If the URL list is empty, just go to the next page
if url_list is None:
continue
urls = url_list
page_counter += 1
# Count the number of consecutive times no new articles were found in an entire
# page's url list
if skip_loop_flag == True:
skip_loop_counter += 1
print("{! WARNING !} No New Pages! Streak:", skip_loop_counter)
else:
skip_loop_counter = 0
# Count the number of times no new comments were found in an entire page's url list
if no_comment_flag == True:
no_comment_counter += 1
else:
no_comment_counter = 0
# If it crosses 5 consecutive times, stop searching the domain and move on
if skip_loop_counter >= 5:
print("\n\nSTOPPING SEARCH: TOO MANY CONSECUTIVE PAGES WITH NO NEW ARTICLES FOUND")
print("\nMoving on to next domain...")
skip_loop_counter = 0
break
# If it crosses 100 consecutive times, stop searching the domain and move on
if no_comment_counter >= 100:
print("\n\nSTOPPING SEARCH: TOO MANY CONSECUTIVE PAGES WITH NO NEW COMMENTS FOUND")
print("\nMoving on to next domain...\n\n")
no_comment_counter = 0
break
skip_loop_flag = True
no_comment_flag = True
# =============================================================================
# LOOP THROUGH EACH LINK IN THE LIST OF URLs
# =============================================================================
# And run through every Facebook comment widget in every link in each page
for url in urls:
# If we've parsed a comment section before, skip it
if url in parsed_urls:
print("Skipping parsed comment section", url)
continue
else:
# Otherwise, add to memoisation list
skip_loop_flag = False # New article found in page!
parsed_urls.append(str(url))
section_counter += 1
# Internet connectivity check
# Blocks here (retrying every 5 seconds) until the connection comes back
while not internet():
print("NO INTERNET. RECONNECTING...")
# Wait 5s to check again
time.sleep(5)
# Timing checkpoint!
checkpoint = timer()
# Print running summary
print("\n\nRunning Summary for: " + str(cleaned_source))
print("------------------")
print("Unique Comment Sections: " + str(section_counter)
+ " | Useful Sections: " + str(populated_section_counter)
+ " | Pages Parsed: " + str(page_counter)
+ "\n------------------"
+ "\nTime Elapsed: " + str(round(checkpoint - start, 2))
+ " | Current Page: #" + str(current_page)
+ " | Total Comments: " + str(comment_counter))
print("------------------\n")
if skip_loop_flag == True:
print("{! WARNING !} No New Pages! Streak:", skip_loop_counter)
if no_comment_counter > 1:
print("{! WARNING !} No New Comments! Streak:", no_comment_counter)
# Empty variables before each iteration
page = None
raw_source = None
comments = []
parsed_json = None
# Download the page
page = requests.get('https://www.facebook.com/plugins/feedback.php?api_key&href=' + url + '&numposts=100', headers = user_agent_header)
# Extract the HTML source code
raw_source = page.text
# Find where the embedded JSON section starts
json_prefix = raw_source.find('[{"constructor":{"__m":"CommentsPlugin.react"},"props":')
# Future-proofing this in the short-term, just in case Facebook adds stuff to the JSON suffix
json_suffix_start = raw_source.find('{"__m":"__elem_fc9f538f_0_0"}')
# Find the end of the tag, wherever it is
json_suffix = json_suffix_start + raw_source[json_suffix_start:].find(']') + 1
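# My reading of the slicing above (not a comment from the original): the slice
# raw_source[json_prefix:json_suffix] runs from the '[{"constructor":...' opening of the
# embedded JSON to just past the first ']' after the '__elem_fc9f538f_0_0' marker,
# which may change if Facebook alters its markup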
# Store the JSON section, and parse it
json_data = raw_source[json_prefix:json_suffix]
# Attempt to parse JSON (This breaks easily)
try:
parsed_json = json.loads(json_data)
# Skip if it fails
except ValueError:
print("Failed to parse JSON, Skipping")
skips += 1
continue
# =============================================================================
# OUTPUT AND WRITE RELEVANT OUTPUTS
# =============================================================================
# Check if parsed data is empty
if parsed_json is not None:
# Extract comment and page IDs
comment_ids = parsed_json[0]["props"]["comments"]["commentIDs"]
page_id = parsed_json[0]["props"]["meta"]["targetFBID"]
# Print comment URL and details
#print("\n------------------")
#print(parsed_json[0]["props"]["meta"]["href"])
#print("\nDescription: ",end="")
print("\nArticle: " + url + "\n")
try:
# Print name
print(parsed_json[0]["props"]["comments"]["idMap"][str(page_id)]["name"])
print()
populated_section_counter += 1
no_comment_flag = False # Comments were found in page!
except (KeyError, TypeError):
print("{ NO COMMENTS FOUND }")
#print("------------------")
#print("Total Comments: ",end="")
#print(parsed_json[0]["props"]["meta"]["totalCount"])
#print("------------------")
# List individual comments
comments = []
comment_count = []
reply_tracker = []
# Analyse comments and populate the comment list with the results
for comment_id in comment_ids:
reply_depth = 0
append_comment(comments, comment_id, reply_depth, comment_count)
#print("\nComment analysis complete!")
#print("Initialising comment writing\n")
# Write comment data to .csv
for comment_index, comment in enumerate(comments, 1):
# Append a line to the .csv per comment
line = (# Reply depth
str(comment["reply_depth"])
+ "|@@@|"
# Link
+ str(parsed_json[0]["props"]["meta"]["href"])
+ "|@@@|"
# Description
+ description(str(parsed_json[0]["props"]["comments"]["idMap"][str(page_id)]["name"]), url)
+ "|@@@|"
# Comment number (out of total comments in post)
+ str(comment_index)
+ " of "
+ str(parsed_json[0]["props"]["meta"]["totalCount"]
+ len(reply_tracker))
+ "|@@@|"
# Commenter name
+ str(comment["username"])
+ "|@@@|"
# Commenter city
+ str(comment["city"])
+ "|@@@|"
# Timestamp
+ str(comment["date_time"])
+ "|@@@|"
# Comment text
+ str(comment["comment_text"])
+ "|@@@|"
# Likes comment has
+ str(comment["likes"])
+ "|@@@|"
# Replies comment has
+ str(comment["replies"])
)
# Increment intra-source comment total
comment_counter += 1
# Write the comment into the .csv
writer.writerow(line.split(sep="|@@@|"))
# Report back
print("Completed writing comment #" + str(comment_index))
else:
# Report error if article data is empty
print("ERROR: Could not parse article!")
writer.writerow(["Error: " + str(url) + " could not be parsed!"])
# Global comment total
overall_comment_counter += comment_counter
# Timing checkpoint!
checkpoint = timer()
print(methylDragon[:101])
# Print concluding summary
print("\n------------------")
print("Concluding Summary:")
print("------------------\n")
print("Total Parsed URLs:", str(len(parsed_urls)))
print("Error Skips:", skips)
print("Total Time:", str(round(checkpoint - start, 2)))
print("Overall Total Scraped Comments:", str(overall_comment_counter))