Commit 6c70034 (1 parent: 924ea2c)
Completed scrape and save architectures for 4 websites - Sakthe
Showing 7 changed files with 299 additions and 5 deletions.
@@ -0,0 +1,6 @@
# AWS Credentials
AWS_ACCESS_KEY_ID=YOUR_AWS_ACCESS_KEY_ID
AWS_SECRET_ACCESS_KEY=YOUR_AWS_SECRET_ACCESS_KEY
AWS_REGION=YOUR_AWS_REGION

# Additional environment variables can be added here as needed
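For context, a minimal sketch (illustrative, not part of this commit) of how these placeholders are consumed: both spiders below load them with python-dotenv and hand them to boto3.

import os
import boto3
from dotenv import load_dotenv

load_dotenv()  # reads the AWS_* values from the .env file in the working directory

s3 = boto3.client(
    's3',
    aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
    region_name=os.getenv('AWS_REGION'),
)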
@@ -0,0 +1,142 @@
import os
import json
import boto3
import scrapy
from dotenv import load_dotenv
from scrapy import Request
from math import ceil
from datetime import datetime, timedelta

# Load environment variables from .env file
load_dotenv()

# Module-level collector of external link hrefs (seeded with '/all'; extended
# during parsing but not otherwise consumed by this spider)
links = ['/all']


class Spider3(scrapy.Spider):
    name = 'spider3'
    start_urls = ['https://www.producthunt.com/']
    output_file = 'final.json'
    bucket_name = 'dinostomach'
    folder_name = 'producthunt'

    def parse(self, response):
        # Start date
        start_date = datetime(2024, 4, 8)
        # End date
        end_date = datetime(2023, 1, 1)

        # Loop through each date, decreasing from start_date to end_date
        current_date = start_date
        while current_date >= end_date:
            # Construct the date part of the URL
            date_part = current_date.strftime("%Y/%m/%d")
            # Construct the URL
            url = f'https://www.producthunt.com/leaderboard/daily/{date_part}/all'
            # Make a request to the URL with custom_parse_method callback
            yield Request(url, callback=self.custom_parse_method)
            # Decrease the date by one day
            current_date -= timedelta(days=1)

    def custom_parse_method(self, response):
        # Loop through each div with the specified class name
        # (raw string keeps the escaped ':' in the Tailwind-style class name)
        for div in response.css(r'div.styles_item__Dk_nz.my-2.flex.flex-1.flex-row.gap-2.py-2.sm\:gap-4'):
            # Extracting title
            title = div.css('strong::text').get()
            print(title)
            # Extracting image_url
            image_url = div.css('img.styles_mediaThumbnail__NCzNO::attr(src)').get()
            print(image_url)
            # Extracting website
            website = 'https://www.producthunt.com/' + div.css('a.styles_externalLinkIcon__vjPDi::attr(href)').get()
            print(website)
            # Create the meta dict passed along to the detail-page request
            mets = {'title': title, 'image_url': image_url, 'website': website}
            titles = title.lower().replace(" ", "-")

            # Request the detailed page
            detailed_url = 'https://www.producthunt.com/' + f'posts/{titles}'
            print(detailed_url)
            links.extend(response.css('a.text-14.font-semibold.text-light-grey::attr(href)').extract())
            yield Request(detailed_url, callback=self.parse_detailed_page, meta=mets)

    def parse_detailed_page(self, response):
        mets = response.meta
        # Extracting description (fall back to an alternate selector when empty)
        description = response.css('div.styles_htmlText__eYPgj.text-16.font-normal.text-dark-grey::text').get()
        if description in (None, "", "null"):
            description = response.css('div.text-16.font-normal.text-light-grey.mb-6::text').get()

        mets['description'] = description
        print(description)

        reviews_url = response.url + '/reviews'

        yield Request(reviews_url, callback=self.parse_reviews, meta=mets)

    def parse_reviews(self, response):
        mets = response.meta
        reviews = response.css('div.text-18.font-normal.text-dark-grey.text-center.mt-4::text').get()
        # Check if reviews is None or an empty string
        if reviews is None or reviews == "":
            # Try a different selector
            reviews = response.css('div.styles_htmlText__eYPgj.text-18.font-normal.text-light-grey.italic.styles_format__8NeQe.styles_overallExperience__x7Gqf::text').get()

        # If reviews is still None or an empty string, fall back to a default value
        if reviews is None or reviews == "":
            reviews = "No reviews"
        print(reviews)
        # Add reviews to meta
        mets['reviews'] = [reviews]
        meta_data = {
            'title': mets['title'],
            'image_url': mets['image_url'],
            'website': mets['website'],
            'description': mets['description'],
            'reviews': mets['reviews']
        }
        self.save_data(meta_data)

    def save_data(self, data):
        file_exists = os.path.exists(self.output_file) and os.path.getsize(self.output_file) > 0

        with open(self.output_file, 'a') as json_file:
            if not file_exists:
                json_file.write('[')
            else:
                json_file.write(',')

            json.dump(data, json_file, indent=4)

    def closed(self, reason):
        # Custom logic to execute after the spider is closed
        # Example: Upload the JSON file to S3
        with open(self.output_file, 'a') as json_file:
            json_file.write(']')  # Add closing square bracket to indicate the end of JSON array

        s3 = boto3.client('s3',
                          aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
                          aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
                          region_name=os.getenv('AWS_REGION'))

        if os.path.exists(self.output_file) and os.path.getsize(self.output_file) > 0:
            with open(self.output_file, 'r') as json_file:
                existing_data = json.load(json_file)

            total_items = len(existing_data)
            chunk_size = 1000
            num_chunks = ceil(total_items / chunk_size)

            for i in range(num_chunks):
                chunk = existing_data[i * chunk_size: (i + 1) * chunk_size]
                chunk_file = f'custom_output_chunk_{i + 1}.json'

                with open(chunk_file, 'w') as chunk_json_file:
                    json.dump(chunk, chunk_json_file, indent=4)

                s3.upload_file(chunk_file, self.bucket_name, f'{self.folder_name}/{chunk_file}')
                self.logger.info(f'{chunk_file} uploaded to {self.bucket_name}/{self.folder_name}/{chunk_file}')

                os.remove(chunk_file)
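For reference, a minimal sketch (an illustration, not part of this commit) of running the spider above standalone with Scrapy's CrawlerProcess; the module name spider3 is an assumption about where the file is saved.

from scrapy.crawler import CrawlerProcess

from spider3 import Spider3  # assumes the file above is saved as spider3.py

process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(Spider3)
process.start()  # blocks until the crawl finishes; closed() then uploads the chunked JSON to S3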
@@ -0,0 +1,93 @@
import os
import json
import boto3
from dotenv import load_dotenv
from scrapy import Spider, Request
from math import ceil

# Instructions for integrating with FastAPI
# ----------------------------------------
# 1. Import this spider into your FastAPI application's main.py file.
# 2. Use the Scrapy CrawlerProcess to run the spider.
# 3. Define an endpoint in your FastAPI application that triggers the spider.
# 4. Call the spider using its name ('custom_scraper' in this case) when the endpoint is hit.
# Replace 'YOUR_START_URL_HERE', 'YOUR_CSS_SELECTOR_HERE', 'YOUR_BUCKET_NAME', and 'YOUR_FOLDER_NAME' with actual values.
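#
# Example of the integration described above (an illustrative sketch, not part of
# this file's runtime code; the import path, the '/scrape' route, and the FastAPI
# app layout are assumptions):
#
#     from fastapi import FastAPI
#     from scrapy.crawler import CrawlerProcess
#     from custom_scraper import CustomScraper  # adjust to your module name
#
#     app = FastAPI()
#
#     @app.post('/scrape')
#     def trigger_scrape():
#         process = CrawlerProcess()
#         process.crawl(CustomScraper)
#         process.start()  # blocking; Twisted's reactor cannot be restarted in the same process
#         return {'status': 'scrape complete'}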

# Load environment variables from .env file
load_dotenv()


class CustomScraper(Spider):
    name = 'custom_scraper'
    start_urls = ['YOUR_START_URL_HERE']  # Define your start URLs here
    output_file = 'custom_output.json'  # Output file name
    bucket_name = 'YOUR_BUCKET_NAME'  # S3 bucket name
    folder_name = 'YOUR_FOLDER_NAME'  # S3 folder name

    def parse(self, response):
        # Custom parsing logic goes here
        # Example: Extract links from the page
        links = response.css('YOUR_CSS_SELECTOR_HERE').extract()

        for link in links:
            # Customize the request to the extracted links
            yield Request(link, callback=self.custom_parse_method)

    def custom_parse_method(self, response):
        # Custom parsing logic for each extracted link
        # Example: Extract product details
        product_data = {
            'title': response.css('YOUR_CSS_SELECTOR_HERE').get(),
            'description': response.css('YOUR_CSS_SELECTOR_HERE').get(),
            # Add more fields as needed
        }

        # Custom processing or storage logic
        # Example: Save product data to a file or database
        self.save_data(product_data)

    def save_data(self, data):
        # Custom logic to save data
        # Example: Save to a JSON file
        file_exists = os.path.exists(self.output_file) and os.path.getsize(self.output_file) > 0

        with open(self.output_file, 'a') as json_file:
            if not file_exists:
                json_file.write('[')  # Add opening square bracket if the file is empty
            else:
                json_file.write(',')  # Add comma to separate JSON objects

            json.dump(data, json_file, indent=4)

    def closed(self, reason):
        # Custom logic to execute after the spider is closed
        # Example: Upload the JSON file to S3
        with open(self.output_file, 'a') as json_file:
            json_file.write(']')  # Add closing square bracket to indicate the end of JSON array

        s3 = boto3.client('s3',
                          aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
                          aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
                          region_name=os.getenv('AWS_REGION'))

        if os.path.exists(self.output_file) and os.path.getsize(self.output_file) > 0:
            with open(self.output_file, 'r') as json_file:
                existing_data = json.load(json_file)

            total_items = len(existing_data)
            chunk_size = 1000
            num_chunks = ceil(total_items / chunk_size)

            for i in range(num_chunks):
                chunk = existing_data[i * chunk_size: (i + 1) * chunk_size]
                chunk_file = f'custom_output_chunk_{i + 1}.json'

                with open(chunk_file, 'w') as chunk_json_file:
                    json.dump(chunk, chunk_json_file, indent=4)

                s3.upload_file(chunk_file, self.bucket_name, f'{self.folder_name}/{chunk_file}')
                self.logger.info(f'{chunk_file} uploaded to {self.bucket_name}/{self.folder_name}/{chunk_file}')

                os.remove(chunk_file)