forked from christopherkullenberg/blogspotscraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
blogspotscraper.py
41 lines (35 loc) · 1.58 KB
/
blogspotscraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
"""
This script scrapes a Blogspot blog by iterating back in its history.
Usage:
1. Set url variable to the Blogspot URL you want as point of departure.
2. Press CTRL-C when you want to stop it.
Note: Your IP-number may be temporarily banned from the Blogger service if over-used.
Use on your own risk.
"""
import requests
import io
import re
from bs4 import BeautifulSoup
#Set url to Blogger blog between the single quotation marks
#For examle url = 'http://german-podcast.blogspot.se/2013/01/new-vocitach-android-app-with-more-than.html'
url = ''
counter = 0
while True:
r = requests.get(url)
file_like_obj = io.StringIO(r.text) #Turns the requested output into a file like objet
lines = file_like_obj.read()
for l in lines:
counter += 1 #Update the counter from proper filenames
soup = BeautifulSoup(lines)
findID = re.findall(r'post-body-(.*)\' itemprop', lines) #This retrieves each post's unique ID-number for the text.
print(findID[0]) #for debugging
div = soup.find(id="post-body-" + findID[0]) #This retrieves each post content
print(div)
with open(str(counter) + ".html", "w") as outputfile: #open file
outputfile.write(str(div)) #write to file
matchObj = re.findall(r'blog-pager-older-link\' href=\'(.*)\' id', lines) # This extract the "Older" post
next_url = matchObj[0]
print("Next URL for scraping: " + next_url)
print("Press CTRL-C to exit the program.")
url = next_url # This changes the variable in the beginning of the script
break