-
Notifications
You must be signed in to change notification settings - Fork 4
/
category.py
47 lines (38 loc) · 1.44 KB
/
category.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import gc
from page import Page
class Category:
    """A paginated listing of one category, made of ``Page`` objects.

    Typical use is ``create_pages()`` followed by ``parse_pages()``.

    Attributes:
        url: URL of the category's first page.
        category: Category name; passed to every ``Page`` and printed
            when parsing starts.
        proxies: Proxy configuration handed to every ``Page``.
        next_page_appendix: String inserted between ``url`` and the page
            number when building the URLs of pages 2..N.
        pages: ``Page`` instances still waiting to be parsed.
        first_page: ``Page`` instance for ``url`` (``None`` until fetched).
        last_page_no: Number of the last page (``None`` until computed).
        db_lock: Lock object handed to every ``Page``.
    """

    def __init__(self, url, category, proxies=None, appendix="", db_lock=None):
        """Store the configuration; no network access happens here.

        Args:
            url: URL of the category's first page.
            category: Category name.
            proxies: Optional proxy mapping. Defaults to a fresh empty dict
                per instance (a ``{}`` default would be shared by all
                instances).
            appendix: String placed between ``url`` and the page number.
            db_lock: Optional lock object forwarded to the pages.
        """
        self.url = url
        self.category = category
        self.proxies = {} if proxies is None else proxies
        self.next_page_appendix = appendix
        # pages hold Page instances
        self.pages = []
        # first_page is Page instance
        self.first_page = None
        self.last_page_no = None
        self.db_lock = db_lock

    def _find_last_page(self):
        """Return the last page number advertised by the first page.

        Looks up the ``<a class="emos_invisible lastPage">`` element in
        ``self.first_page.soup`` and parses the number following ``page=``
        in its ``href``.

        Returns:
            The page number as an int, or 2 when no such link exists.
        """
        last_page = self.first_page.soup.find(
            "a", {"class": "emos_invisible lastPage"}
        )
        if last_page is None:
            # NOTE(review): fallback of 2 is kept from the original code;
            # presumably a category without a "last page" link has at most
            # two pages - confirm against the target site.
            return 2
        return int(last_page["href"].split("page=")[1])

    def fetch_first_page(self):
        """Create the first ``Page``, queue it in ``pages`` and fetch it."""
        self.first_page = Page(
            self.url, self.category, self.proxies, self.db_lock
        )
        self.pages.append(self.first_page)
        self.first_page.fetch_page()

    def create_pages(self):
        """Fetch the first page and queue ``Page`` objects for pages 2..N.

        N is determined by ``_find_last_page`` and stored in
        ``last_page_no``. The queued pages are not fetched here.
        """
        self.fetch_first_page()
        self.last_page_no = self._find_last_page()
        for page_no in range(2, self.last_page_no + 1):
            url = self.url + self.next_page_appendix + str(page_no)
            # Same positional call as in fetch_first_page so that follow-up
            # pages use the configured proxies as well.
            page = Page(url, self.category, self.proxies, self.db_lock)
            self.pages.append(page)

    def parse_pages(self):
        """Fetch and parse every queued page, draining ``pages``.

        Each page is removed from ``pages`` once it has been processed and
        a garbage collection is triggered to keep memory usage down.
        """
        print("Fetching category: " + self.category)
        # Drain from the front instead of removing items inside a
        # ``for page in self.pages`` loop: mutating the list while iterating
        # it makes the iterator skip every other page.
        while self.pages:
            page = self.pages[0]
            # NOTE(review): the first page was already fetched by
            # fetch_first_page and is fetched again here, as in the original.
            page.fetch_page()
            page.fetch_items()
            # Dequeue only after success, so a page whose processing raised
            # stays in the queue (same as the original behaviour).
            del self.pages[0]
            page = None
            gc.collect()