Skip to content

Commit

Permalink
fix: now handling multiple types of IDs to scrape
Browse files Browse the repository at this point in the history
  • Loading branch information
iamatulsingh committed May 21, 2024
1 parent a02a411 commit 4b5b35a
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 12 deletions.
4 changes: 2 additions & 2 deletions e2e.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from pinscrape import scraper


details = scraper.scrape("messi", "output", {}, 10, 15)
details = scraper.scrape("messi", "output", {}, 10, 1)


def test_single_data():
Expand All @@ -11,6 +11,6 @@ def test_single_data():
print(f"\nTotal images downloaded (including duplicate images): {len(details['url_list'])}")
print(details)
else:
print("\nNothing to download !!")
print("\nNothing to download !!", details)

assert len(details['extracted_urls']) > 0
2 changes: 1 addition & 1 deletion pinscrape/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "3.1.0"
__version__ = "3.2.0"
23 changes: 14 additions & 9 deletions pinscrape/pinscrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ def get_source(self, url: str, proxies: dict) -> None:
return
html = soup(res.text, 'html.parser')
json_data = html.find_all("script", attrs={"id": "__PWS_INITIAL_PROPS__"})
if not len(json_data):
json_data = html.find_all("script", attrs={"id": "__PWS_DATA__"})

self.json_data_list.append(json.loads(json_data[0].string)) if len(json_data) else self.json_data_list.append({})

# --------------------------- READ JSON OF PINTEREST WEBSITE ----------------------
Expand All @@ -48,14 +51,15 @@ def save_image_url(self, max_images: int) -> list:
try:
data = DotMap(js)
urls = []
if not data.initialReduxState:
if not data.initialReduxState and not data.props:
return []
for pin in data.initialReduxState.pins:
if isinstance(data.initialReduxState.pins[pin].images.get("orig"), list):
for i in data.initialReduxState.pins[pin].images.get("orig"):
pins = data.initialReduxState.pins if data.initialReduxState else data.props.initialReduxState.pins
for pin in pins:
if isinstance(pins[pin].images.get("orig"), list):
for i in pins[pin].images.get("orig"):
urls.append(i.get("url"))
else:
urls.append(data.initialReduxState.pins[pin].images.get("orig").get("url"))
urls.append(pins[pin].images.get("orig").get("url"))

for url in urls:
url_list.append(url)
Expand Down Expand Up @@ -98,18 +102,18 @@ def download(self, url_list, num_of_workers, output_folder):

# -------------------------- get user keyword and google search for that keywords ---------------------
@staticmethod
def start_scraping(max_images, key=None, proxies: dict = None):
    """Search Bing for Pinterest pages matching *key*.

    Args:
        max_images: cap on how many Pinterest links to collect from the
            search results.
        key: search keyword; must not be None.
        proxies: optional proxy mapping forwarded to ``requests.get``.
            Defaults to no proxies.

    Returns:
        A 3-tuple ``(searched_urls, normalized_keyword, status_code)``:
        the extracted Pinterest links, *key* with spaces replaced by
        underscores, and the HTTP status code of the Bing response.
    """
    assert key is not None, "Please provide keyword for searching images"
    # Use a None sentinel instead of a mutable {} default so the dict is
    # never shared between calls (classic mutable-default pitfall).
    if proxies is None:
        proxies = {}
    keyword = key + " pinterest"
    # Replace literal '+' separators (if the caller used them) with the
    # URL-encoded space expected by Bing's query string.
    keyword = keyword.replace("+", "%20")
    url = f'https://www.bing.com/search?q={keyword}&first=1&FORM=PERE'
    res = get(url, proxies=proxies)
    searched_urls = PinterestImageScraper.get_pinterest_links(res.content, max_images)
    return searched_urls, key.replace(" ", "_"), res.status_code

def scrape(self, key: str = None, output_folder: str = "", proxies: dict = {}, threads: int = 10, max_images: int = None) -> dict:
extracted_urls, keyword = PinterestImageScraper.start_scraping(max_images, key, proxies)
extracted_urls, keyword, search_engine_status_code = PinterestImageScraper.start_scraping(max_images, key, proxies)
self.unique_img = []
self.json_data_list = []

Expand All @@ -121,6 +125,7 @@ def scrape(self, key: str = None, output_folder: str = "", proxies: dict = {}, t

return_data = {
"isDownloaded": False,
"search_engine_status_code": search_engine_status_code,
"url_list": url_list,
"extracted_urls": extracted_urls,
"keyword": key
Expand Down

0 comments on commit 4b5b35a

Please sign in to comment.