driver.py
import json
import time
import undetected_chromedriver as uc
from scraper_service.catalog_builder import Catalog, JsonBuilder
from scraper_service.scraper import Scraper
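
# Pipeline: collect the index pages, extract the product links from them,
# scrape every product page and dump the collected data to a JSON file.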


def initial_object_collection(source_url, driver):
    """
    Collect the total number of pages and store the first-page object in the
    pool for subsequent scraping.
    """
start_page = Scraper(source_url, driver)
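    # get_pagination_number is assumed to be a property on Scraper that
    # returns the number of the last pagination page.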
total_pages = start_page.get_pagination_number
index_catalog = [start_page]
return total_pages, index_catalog


def urls_pull_object_collection(
    total_pages, delay_time, source_url, index_catalog, scraper_class, driver
):
    """Build a list of scraper objects holding the index-page data."""
pattern = "{}?page={}"
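    # e.g. "https://leroymerlin.ru/catalogue/parketnye-laki/?page=2"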
for pagination in range(2, total_pages + 1):
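        # wait between requests to avoid hammering the site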
time.sleep(delay_time)
        url = pattern.format(source_url, pagination)
index_catalog.append(scraper_class(url, driver))
return index_catalog


def urls_pull_list_collection(index_catalog, catalog):
    """Build the catalog with the list of product links."""
for html in index_catalog:
catalog.index_catalog_builder(html.soup)


def pages_pull_objects_collection(catalog, delay_time, scraper_class, driver):
    """Build a list of scraper objects holding the product data."""
list_to_scrape = []
total_products = len(catalog.index_catalog)
    for counter, url in enumerate(catalog.index_catalog, start=1):
        time.sleep(delay_time)
        list_to_scrape.append(scraper_class(url, driver))
print(
f"\rProcessing {counter} of {total_products}", end="", flush=True
)
return list_to_scrape


def product_data_builder(json_builder, list_to_scrape):
    """Collect the product data into a dictionary."""
for page in list_to_scrape:
json_builder.collect_product_content(page.soup)


def json_generator(json_builder):
    """Write a JSON file from the dictionary data."""
with open("leroymerlin_data.json", "w", encoding="utf-8") as outfile:
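        # ensure_ascii=False keeps Cyrillic characters readable in the output file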
json.dump(
json_builder.product_file, outfile, indent=4, ensure_ascii=False
)


def main():
    """Main entry point of the project."""
delay_time: int = 5
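    # undetected_chromedriver launches a Chrome instance patched to evade
    # common bot-detection checks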
driver = uc.Chrome()
driver.set_window_size(800, 600)
source_url = "https://leroymerlin.ru/catalogue/parketnye-laki/"
total_pages, index_catalog = initial_object_collection(source_url, driver)
full_catalog = urls_pull_object_collection(
total_pages, delay_time, source_url, index_catalog, Scraper, driver
)
catalog = Catalog()
urls_pull_list_collection(full_catalog, catalog)
list_to_scrape = pages_pull_objects_collection(
catalog, delay_time, Scraper, driver
)
json_builder = JsonBuilder()
product_data_builder(json_builder, list_to_scrape)
json_generator(json_builder)
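    # close the current window, then shut the browser session down completely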
driver.close()
driver.quit()
    print("\rData saved to the .json file.")


if __name__ == "__main__":
main()