Completed scrape and save architectures for
4 websites - Sakthe
Sakthe-Balan committed Apr 8, 2024
1 parent 924ea2c commit 6c70034
Showing 7 changed files with 299 additions and 5 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -41,4 +41,5 @@ __pycache__/
# Ignore files generated by testing frameworks
coverage/
products.json
outputs.json
outputs.json
final.json
2 changes: 2 additions & 0 deletions server/dino/dino/settings.py
@@ -101,3 +101,5 @@
HTTPCOMPRESSION_ENABLED = True
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
# Define the headers to be used for making requests to the website
# RETRY_HTTP_CODES = [500, 503, 504, 400, 408, 307, 403]
# RETRY_TIMES=5
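# A minimal sketch of enabling Scrapy's built-in RetryMiddleware, in case the
# commented settings above are ever needed (the status codes and retry count
# shown here are illustrative, not values required by this project):
# RETRY_ENABLED = True
# RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]
# RETRY_TIMES = 5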
6 changes: 6 additions & 0 deletions server/dino/dino/spiders/.env.template
@@ -0,0 +1,6 @@
# AWS Credentials
AWS_ACCESS_KEY_ID=YOUR_AWS_ACCESS_KEY_ID
AWS_SECRET_ACCESS_KEY=YOUR_AWS_SECRET_ACCESS_KEY
AWS_REGION=YOUR_AWS_REGION

# Additional environment variables can be added here as needed
3 changes: 2 additions & 1 deletion server/dino/dino/spiders/spider2.py
@@ -1734,7 +1734,7 @@ def parse_software_page(self, response):
'price': section.css('span.star_new_background::text').get(),
'image_url': section.css('img.soft_logo.ls-is-cached.lazyloaded::attr(src)').get(),
'profile_href': section.css('a.ga_track_soft_profile.view_profile_inline::attr(href)').get(),
'website':section.css('a.ga_track_vwl_vwb.ripple_btn.cat_vwbft_fdfcgp_btn.vwb_btn.d-flex.align-items-center.justify-content-center::attr(href)')[0].get()

}
print(software_info)

@@ -1747,6 +1747,7 @@ def parse_software_page(self, response):
def parse_software_profile(self, response):
software_info = response.meta['software_info']
# Extract the overview
software_info['website']=response.css('a.ga_track_vwl_comp_d.specification_visit_website::attr(href)').get()
software_info['image_url']=response.xpath('//img[not(@class)]')[2].attrib.get('src', None)
software_info['description'] = response.css('p.read_more_text_overview::text').get()

142 changes: 142 additions & 0 deletions server/dino/dino/spiders/spider3.py
@@ -0,0 +1,142 @@
import os
import json
import boto3
import scrapy
from dotenv import load_dotenv
from scrapy import Request
from math import ceil
from datetime import datetime, timedelta

# Load environment variables from .env file
load_dotenv()
links=['/all']
class Spider3(scrapy.Spider):
name = 'spider3'
start_urls = ['https://www.producthunt.com/']
output_file = 'final.json'
bucket_name = 'dinostomach'
folder_name = 'producthunt'

def parse(self, response):
# Start date
start_date = datetime(2024, 4, 8)
# End date
end_date = datetime(2023, 1, 1)

# Loop through each date, decreasing from start_date to end_date
current_date = start_date
while current_date >= end_date:
# Construct the date part of the URL
date_part = current_date.strftime("%Y/%m/%d")
# Construct the URL
url = f'https://www.producthunt.com/leaderboard/daily/{date_part}/all'
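# e.g. https://www.producthunt.com/leaderboard/daily/2024/04/08/all for the start date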
# Make a request to the URL with custom_parse_method callback
yield Request(url, callback=self.custom_parse_method)
# Decrease the date by one day
current_date -= timedelta(days=1)

def custom_parse_method(self, response):
# Loop through each div with the specified class name
for div in response.css('div.styles_item__Dk_nz.my-2.flex.flex-1.flex-row.gap-2.py-2.sm\:gap-4'):
# Extracting title
title = div.css('strong::text').get()
print(title)
# Extracting image_url
image_url = div.css('img.styles_mediaThumbnail__NCzNO::attr(src)').get()
print(image_url)
# Extracting website
website = 'https://www.producthunt.com/' + div.css('a.styles_externalLinkIcon__vjPDi::attr(href)').get()
print(website)
# Build the meta dict to pass along to the detail-page request
mets = {'title': title, 'image_url': image_url, 'website': website}
titles = title.lower().replace(" ", "-")

# Request the detailed page
detailed_url = 'https://www.producthunt.com/' + f'posts/{titles}'
print(detailed_url)
links.extend(response.css('a.text-14.font-semibold.text-light-grey::attr(href)').extract())
yield Request(detailed_url, callback=self.parse_detailed_page, meta=mets)

def parse_detailed_page(self, response):
mets = response.meta
# Extracting description
description = response.css('div.styles_htmlText__eYPgj.text-16.font-normal.text-dark-grey::text').get()
if description == "null" or description is None or description == "":
description = response.css('div.text-16.font-normal.text-light-grey.mb-6::text').get()


mets['description'] = description
print(description)

reviews_url = response.url + '/reviews'

yield Request(reviews_url, callback=self.parse_reviews, meta=mets)

def parse_reviews(self, response):
mets = response.meta
reviews = response.css('div.text-18.font-normal.text-dark-grey.text-center.mt-4::text').get()
# Check if reviews is None or an empty string
if reviews is None or reviews == "":
# Try a different selector
reviews = response.css('div.styles_htmlText__eYPgj.text-18.font-normal.text-light-grey.italic.styles_format__8NeQe.styles_overallExperience__x7Gqf::text').get()

# If reviews is still None or an empty string, set it to "No reviews" or any other suitable default value
if reviews is None or reviews == "":
reviews = "No reviews"
print(reviews)
# Add reviews to meta
mets['reviews'] = [reviews]
meta_data = {
'title': mets['title'],
'image_url': mets['image_url'],
'website': mets['website'],
'description': mets['description'],
'reviews': mets['reviews']
}
self.save_data(meta_data)

def save_data(self, data):
file_exists = os.path.exists(self.output_file) and os.path.getsize(self.output_file) > 0

with open(self.output_file, 'a') as json_file:
if not file_exists:
json_file.write('[')
else:
json_file.write(',')

json.dump(data, json_file, indent=4)


def closed(self, reason):
# Custom logic to execute after the spider is closed
# Example: Upload the JSON file to S3
with open(self.output_file, 'a') as json_file:
json_file.write(']') # Add closing square bracket to indicate the end of JSON array

s3 = boto3.client('s3',
aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
region_name=os.getenv('AWS_REGION'))

if os.path.exists(self.output_file) and os.path.getsize(self.output_file) > 0:
with open(self.output_file, 'r') as json_file:
existing_data = json.load(json_file)

total_items = len(existing_data)
chunk_size = 1000
num_chunks = ceil(total_items / chunk_size)
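# e.g. 2,500 items -> ceil(2500 / 1000) = 3 chunks of 1000, 1000 and 500 items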

for i in range(num_chunks):
chunk = existing_data[i * chunk_size: (i + 1) * chunk_size]
chunk_file = f'custom_output_chunk_{i + 1}.json'

with open(chunk_file, 'w') as chunk_json_file:
json.dump(chunk, chunk_json_file, indent=4)

s3.upload_file(chunk_file, self.bucket_name, f'{self.folder_name}/{chunk_file}')
self.logger.info(f'{chunk_file} uploaded to {self.bucket_name}/{self.folder_name}/{chunk_file}')

os.remove(chunk_file)



93 changes: 93 additions & 0 deletions server/dino/dino/spiders/spider_template.py
@@ -0,0 +1,93 @@
import os
import json
import boto3
from dotenv import load_dotenv
from scrapy import Spider, Request
from math import ceil

# Instructions for integrating with FastAPI
# ----------------------------------------
# 1. Import this spider into your FastAPI application's main.py file.
# 2. Use the Scrapy CrawlerProcess to run the spider.
# 3. Define an endpoint in your FastAPI application that triggers the spider.
# 4. Call the spider using its name ('custom_scraper' in this case) when the endpoint is hit.
# Replace 'YOUR_START_URL_HERE', 'YOUR_CSS_SELECTOR_HERE', 'YOUR_BUCKET_NAME', and 'YOUR_FOLDER_NAME' with actual values.
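# For reference, a minimal sketch of steps 1-4 (the endpoint path, import path
# and process handling below are illustrative assumptions, not fixed parts of
# this template):
#
#   from multiprocessing import Process
#   from fastapi import FastAPI
#   from scrapy.crawler import CrawlerProcess
#   from scrapy.utils.project import get_project_settings
#   from dino.dino.spiders.spider_template import CustomScraper
#
#   app = FastAPI()
#
#   def _run_spider(spider_class):
#       # CrawlerProcess blocks until the crawl finishes, so it is run in a
#       # separate OS process to keep the FastAPI event loop responsive
#       crawler = CrawlerProcess(get_project_settings())
#       crawler.crawl(spider_class)
#       crawler.start()
#
#   @app.get("/run_custom_scraper")
#   async def run_custom_scraper():
#       Process(target=_run_spider, args=(CustomScraper,)).start()
#       return {"message": "custom_scraper started"}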


# Load environment variables from .env file
load_dotenv()

class CustomScraper(Spider):
name = 'custom_scraper'
start_urls = ['YOUR_START_URL_HERE'] # Define your start URLs here
output_file = 'custom_output.json' # Output file name
bucket_name = 'YOUR_BUCKET_NAME' # S3 bucket name
folder_name = 'YOUR_FOLDER_NAME' # S3 folder name

def parse(self, response):
# Custom parsing logic goes here
# Example: Extract links from the page
links = response.css('YOUR_CSS_SELECTOR_HERE').extract()

for link in links:
# Customize the request to the extracted links
yield Request(link, callback=self.custom_parse_method)

def custom_parse_method(self, response):
# Custom parsing logic for each extracted link
# Example: Extract product details
product_data = {
'title': response.css('YOUR_CSS_SELECTOR_HERE').get(),
'description': response.css('YOUR_CSS_SELECTOR_HERE').get(),
# Add more fields as needed
}

# Custom processing or storage logic
# Example: Save product data to a file or database
self.save_data(product_data)

def save_data(self, data):
# Custom logic to save data
# Example: Save to a JSON file
file_exists = os.path.exists(self.output_file) and os.path.getsize(self.output_file) > 0

with open(self.output_file, 'a') as json_file:
if not file_exists:
json_file.write('[') # Add opening square bracket if the file is empty
else:
json_file.write(',') # Add comma to separate JSON objects

json.dump(data, json_file, indent=4)

def closed(self, reason):
# Custom logic to execute after the spider is closed
# Example: Upload the JSON file to S3
with open(self.output_file, 'a') as json_file:
json_file.write(']') # Add closing square bracket to indicate the end of JSON array

s3 = boto3.client('s3',
aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
region_name=os.getenv('AWS_REGION'))

if os.path.exists(self.output_file) and os.path.getsize(self.output_file) > 0:
with open(self.output_file, 'r') as json_file:
existing_data = json.load(json_file)

total_items = len(existing_data)
chunk_size = 1000
num_chunks = ceil(total_items / chunk_size)

for i in range(num_chunks):
chunk = existing_data[i * chunk_size: (i + 1) * chunk_size]
chunk_file = f'custom_output_chunk_{i + 1}.json'

with open(chunk_file, 'w') as chunk_json_file:
json.dump(chunk, chunk_json_file, indent=4)

s3.upload_file(chunk_file, self.bucket_name, f'{self.folder_name}/{chunk_file}')
self.logger.info(f'{chunk_file} uploaded to {self.bucket_name}/{self.folder_name}/{chunk_file}')

os.remove(chunk_file)


55 changes: 52 additions & 3 deletions server/main.py
@@ -8,8 +8,10 @@
from scrapy.utils.project import get_project_settings
from dino.dino.spiders.spider1 import Spider1
from dino.dino.spiders.spider2 import Spider2
from dino.dino.spiders.spider3 import Spider3
from multiprocessing import Process

from typing import Optional, Dict
from importlib import import_module


from typing import Optional
@@ -19,11 +21,15 @@
# MongoDB connection settings
MONGO_URI = os.getenv("MONGO_URI")


# Initialize MongoDB client with server API version 1
client = MongoClient(MONGO_URI, server_api=ServerApi('1'))

app = FastAPI()

# Global dictionary to store running spider processes
running_spiders: Dict[str, Process] = {}

# Default endpoint
@app.get("/")
async def base_function():
@@ -50,20 +56,63 @@ def _run_spider(spider_class):
def run_spider(spider_class):
process = Process(target=_run_spider, args=(spider_class,))
process.start()
# Store the process in the global dictionary using the spider class name as the key
running_spiders[spider_class.__name__] = process
return process

@app.get("/scrape")
async def scrape_data():
# Start all three spiders in separate processes
process1 = run_spider(Spider1)
process2 = run_spider(Spider2)

process3 = run_spider(Spider3)
# Wait for all three processes to complete
process1.join()
process2.join()

process3.join()
return {"message": "Scraping completed"}

@app.post("/stop_spider/{spider_name}")
async def stop_spider(spider_name: str):
"""
Endpoint to stop a running spider.
Parameters:
spider_name (str): The name of the class of the spider to stop.
Returns:
dict: A message indicating whether the spider was stopped successfully.
"""
if spider_name in running_spiders:
process = running_spiders[spider_name]
process.terminate()
process.join() # Wait for the process to terminate
del running_spiders[spider_name] # Remove the process from the dictionary
return {"message": f"Spider {spider_name} stopped successfully."}
else:
return {"message": f"No spider with the name {spider_name} is currently running."}

@app.get("/run_spider/{spider_name}")
async def run_specific_spider(spider_name: str):
"""
Endpoint to run a specific spider by name.
Parameters:
spider_name (str): The name of the class of the spider to be run.
Returns:
dict: A message indicating whether the spider was started successfully.
"""
try:
spider_class = globals().get(spider_name)
if spider_class and isinstance(spider_class, type):
run_spider(spider_class)
return {"message": f"{spider_name} started successfully."}
else:
raise HTTPException(status_code=404, detail="Spider not found")
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
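# A minimal client-side sketch of exercising the two endpoints above (it assumes
# the app is served on localhost:8000, which is not shown in this diff, and that
# the `requests` package is installed):
#
#   import requests
#   requests.get("http://localhost:8000/run_spider/Spider3")
#   requests.post("http://localhost:8000/stop_spider/Spider3")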


# Run the FastAPI app using Uvicorn server
if __name__ == "__main__":
