-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
71 lines (62 loc) · 2.33 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import requests, time, re, os, configparser, sys
from bs4 import BeautifulSoup
# Load the forum credentials from config.ini in the current working directory.
# ConfigParser.read() silently skips files it cannot find, so the existence
# check has to be done explicitly before reading.
config = configparser.ConfigParser()
if not os.path.isfile('config.ini'):
    # sys.exit(<str>) prints the message to stderr and exits with status 1,
    # so a failed start-up is no longer reported as success (status 0).
    sys.exit("config.ini is missing!")
config.read('config.ini')

# Both credentials must be present and non-empty; a missing key and an empty
# value are handled the same way.
username = config["DEFAULT"].get("username", fallback="")
password = config["DEFAULT"].get("password", fallback="")
if not username or not password:
    sys.exit("username and password must be present in config.ini.")

# Form fields sent with the login POST further down in this script.
info = {
    "username": username,
    "password": password,
    "action": "login",
}
# Directory that receives the filtered pages. exist_ok=True replaces the
# separate isdir() test and its check-then-create race.
os.makedirs('pages', mode=0o755, exist_ok=True)

# Log in once on a shared session; the same session object is reused for
# every page request later in the script.
s = requests.Session()
q = s.post(
    "https://forums.somethingawful.com/account.php",
    data=info,
    timeout=30,  # never hang indefinitely on an unresponsive server
)
# Stop early on an HTTP-level failure (4xx/5xx) instead of scraping without
# a valid login.
# NOTE(review): a rejected username/password presumably still comes back as
# a normal 200 page and is not detected here - confirm what the forum returns
# and add a check on q.text if needed.
q.raise_for_status()

# Resume from the page number stored under "lastpage" in config.ini, or start
# at page 1 when that entry is missing or empty.
saved_page = config["DEFAULT"].get("lastpage", fallback="")
lastpage = int(saved_page) if saved_page else 1
i = lastpage
# Matches "gaybie" (optionally "gaybies"), a space, "nom" plus any run of
# non-space characters, a space, and then at least one more character up to
# the end of the line; case-insensitive. Compiled once here because the
# pattern is the same for every page.
matcher = re.compile(r'[g]aybie[s]? [n]om\S{0,} (.+)$', flags=re.IGNORECASE|re.MULTILINE)

# Walk the thread page by page, starting at `i`, until the forum reports that
# the requested page does not exist.
while True:
    time.sleep(0.1)  # short pause between consecutive requests
    payload = {'threadid': '3908778', 'pagenumber': str(i)}
    r = s.get(
        "https://forums.somethingawful.com/showthread.php",
        params=payload,
        timeout=30,  # never hang indefinitely on an unresponsive server
    )

    # This text in the response is treated as "past the last page": step back
    # to the previous page number and stop.
    if "The page number you requested" in r.text:
        i -= 1
        break

    if matcher.search(r.text) is None:
        print("Page {} has no nominations.".format(i))
    else:
        print("Page {} has a nomination.".format(i))
        soup = BeautifulSoup(r.text, 'html.parser')
        # Remove every table row in which no descendant's markup matches the
        # pattern, so only rows containing a match remain in the saved page.
        for tag in soup.find_all('tr'):
            if not any(matcher.search(str(child)) for child in tag.descendants):
                tag.decompose()
        # Write as UTF-8 explicitly: the platform default encoding may be
        # unable to represent characters found in the page.
        with open("pages/page{}.html".format(i), "w", encoding="utf-8") as file:
            file.write(str(soup))
    i += 1

# Store the last page that was fetched successfully so the next run resumes
# from it.
# NOTE(review): the source this was recovered from had lost its indentation;
# saving once after the loop is assumed because it is the only placement in
# which the `i -= 1` before `break` has any effect - confirm.
config["DEFAULT"]["lastpage"] = str(i)
with open("config.ini", "w") as file:
    config.write(file)