From 4b5b35a6c9f003f50080bbc81d4a803ca179aa13 Mon Sep 17 00:00:00 2001 From: iamatulsingh Date: Tue, 21 May 2024 10:44:32 +0200 Subject: [PATCH] fix: now handling multiple type of ids to scrape --- e2e.py | 4 ++-- pinscrape/_version.py | 2 +- pinscrape/pinscrape.py | 23 ++++++++++++++--------- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/e2e.py b/e2e.py index 355e431..1706251 100644 --- a/e2e.py +++ b/e2e.py @@ -1,7 +1,7 @@ from pinscrape import scraper -details = scraper.scrape("messi", "output", {}, 10, 15) +details = scraper.scrape("messi", "output", {}, 10, 1) def test_single_data(): @@ -11,6 +11,6 @@ def test_single_data(): print(f"\nTotal images downloaded (including duplicate images): {len(details['url_list'])}") print(details) else: - print("\nNothing to download !!") + print("\nNothing to download !!", details) assert len(details['extracted_urls']) > 0 diff --git a/pinscrape/_version.py b/pinscrape/_version.py index f5f41e5..1173108 100644 --- a/pinscrape/_version.py +++ b/pinscrape/_version.py @@ -1 +1 @@ -__version__ = "3.1.0" +__version__ = "3.2.0" diff --git a/pinscrape/pinscrape.py b/pinscrape/pinscrape.py index d2e40a7..7813366 100644 --- a/pinscrape/pinscrape.py +++ b/pinscrape/pinscrape.py @@ -39,6 +39,9 @@ def get_source(self, url: str, proxies: dict) -> None: return html = soup(res.text, 'html.parser') json_data = html.find_all("script", attrs={"id": "__PWS_INITIAL_PROPS__"}) + if not len(json_data): + json_data = html.find_all("script", attrs={"id": "__PWS_DATA__"}) + self.json_data_list.append(json.loads(json_data[0].string)) if len(json_data) else self.json_data_list.append({}) # --------------------------- READ JSON OF PINTEREST WEBSITE ---------------------- @@ -48,14 +51,15 @@ def save_image_url(self, max_images: int) -> list: try: data = DotMap(js) urls = [] - if not data.initialReduxState: + if not data.initialReduxState and not data.props: return [] - for pin in data.initialReduxState.pins: - if isinstance(data.initialReduxState.pins[pin].images.get("orig"), list): - for i in data.initialReduxState.pins[pin].images.get("orig"): + pins = data.initialReduxState.pins if data.initialReduxState else data.props.initialReduxState.pins + for pin in pins: + if isinstance(pins[pin].images.get("orig"), list): + for i in pins[pin].images.get("orig"): urls.append(i.get("url")) else: - urls.append(data.initialReduxState.pins[pin].images.get("orig").get("url")) + urls.append(pins[pin].images.get("orig").get("url")) for url in urls: url_list.append(url) @@ -98,18 +102,18 @@ def download(self, url_list, num_of_workers, output_folder): # -------------------------- get user keyword and google search for that keywords --------------------- @staticmethod - def start_scraping(max_images, key=None, proxies={}): + def start_scraping(max_images, key=None, proxies: dict = {}): assert key is not None, "Please provide keyword for searching images" keyword = key + " pinterest" keyword = keyword.replace("+", "%20") - url = f'https://www.bing.com/search?q={keyword}&pq=messi+pinterest&first=1&FORM=PERE' + url = f'https://www.bing.com/search?q={keyword}&first=1&FORM=PERE' res = get(url, proxies=proxies) searched_urls = PinterestImageScraper.get_pinterest_links(res.content, max_images) - return searched_urls, key.replace(" ", "_") + return searched_urls, key.replace(" ", "_"), res.status_code def scrape(self, key: str = None, output_folder: str = "", proxies: dict = {}, threads: int = 10, max_images: int = None) -> dict: - extracted_urls, keyword = PinterestImageScraper.start_scraping(max_images, key, proxies) + extracted_urls, keyword, search_engine_status_code = PinterestImageScraper.start_scraping(max_images, key, proxies) self.unique_img = [] self.json_data_list = [] @@ -121,6 +125,7 @@ def scrape(self, key: str = None, output_folder: str = "", proxies: dict = {}, t return_data = { "isDownloaded": False, + "search_engine_status_code": search_engine_status_code, "url_list": url_list, "extracted_urls": extracted_urls, "keyword": key