
Commit

initial scraper implemented
Sakthe-Balan committed Apr 6, 2024
1 parent b7581f9 commit 2af58a6
Showing 3 changed files with 47 additions and 15 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -40,3 +40,4 @@ __pycache__/

# Ignore files generated by testing frameworks
coverage/
products.json
53 changes: 43 additions & 10 deletions server/dino/dino/spiders/spider1.py
@@ -1,27 +1,60 @@
import os
import json
import boto3
from dotenv import load_dotenv
from scrapy import Spider, Request

# Load environment variables from .env file
load_dotenv()


class Spider1(Spider):
    name = 'spider1'
    start_urls = ['https://www.softwareadvice.com/categories/']
    output_file = 'products.json'
    bucket_name = 'dinostomach'
    folder_name = 'softwareadvice'

    def parse(self, response):
        # Extract category links from the page
        links = response.css('a::attr(href)').extract()

        for link in links:
            # Append '/p/all' to reach the full product listing for the category
            link += '/p/all'

            # Follow the category URL
            yield Request(link, callback=self.parse_category)

    def parse_category(self, response):
        # Extract product cards from the category page
        product_cards = response.css('.ProductCardComponent.alternatives-card.mb-4.rounded-lg.border.border-solid.border-grey-100.shadow')

        products = []

        for card in product_cards:
            # Extract data from each product card
            product_data = {
                'title': card.css('h3::text').get(),
                'description': card.css('p::text').get(),
                'price': card.css('strong::text').get(),
                'image_url': card.css('img::attr(src)').get()
            }
            products.append(product_data)

        # Append this category's products to the JSON file.
        # Note: appending one JSON array per category means the file ends up
        # holding several concatenated arrays rather than a single JSON document.
        with open(self.output_file, 'a') as json_file:
            json.dump(products, json_file, indent=4)

        self.logger.info(f'Product data appended to {self.output_file}')

    def closed(self, reason):
        # Upload the JSON file to S3 after the spider finishes
        s3 = boto3.client('s3',
                          aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
                          aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
                          region_name=os.getenv('AWS_REGION'))

        s3.upload_file(self.output_file, self.bucket_name, f'{self.folder_name}/{self.output_file}')
        self.logger.info(f'{self.output_file} uploaded to {self.bucket_name}/{self.folder_name}/{self.output_file}')
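
For local testing, one way to run this spider outside the FastAPI server is through Scrapy's CrawlerProcess, mirroring the settings lookup used in server/main.py. This is a minimal sketch, assuming the script is run from the Scrapy project directory and that the .env file provides AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_REGION; it is a hypothetical helper, not part of this commit.

# run_spider1_local.py (hypothetical helper script)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from dino.dino.spiders.spider1 import Spider1

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl(Spider1)
    process.start()  # blocks until the crawl (and the closed() S3 upload) finishes

Running scrapy crawl spider1 from inside the Scrapy project should do the same without the helper script.
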
8 changes: 3 additions & 5 deletions server/main.py
@@ -1,5 +1,4 @@
import os
import uvicorn
from dotenv import load_dotenv
from pymongo.mongo_client import MongoClient
@@ -9,9 +8,8 @@
from scrapy.utils.project import get_project_settings
from dino.dino.spiders.spider1 import Spider1
from dino.dino.spiders.spider2 import Spider2
from multiprocessing import Process



from typing import Optional
@@ -58,11 +56,11 @@ def run_spider(spider_class):
async def scrape_data():
    # Start the spider in a separate process (Spider2 is disabled for now)
    process1 = run_spider(Spider1)
    # process2 = run_spider(Spider2)

    # Wait for the process to complete
    process1.join()
    # process2.join()

    return {"message": "Scraping completed"}
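
The run_spider helper itself sits in the collapsed part of this diff and is not shown above. The imports that remain (Process from multiprocessing, plus Scrapy's get_project_settings) suggest the usual pattern of launching each crawl in its own process so a fresh Twisted reactor can be started per run. The following is an illustrative sketch of that pattern only, not the repository's actual run_spider implementation:

from multiprocessing import Process

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def _crawl(spider_class):
    # Runs inside the child process; each process gets its own reactor
    crawler = CrawlerProcess(get_project_settings())
    crawler.crawl(spider_class)
    crawler.start()

def run_spider_sketch(spider_class):
    # Hypothetical stand-in for the collapsed run_spider helper
    process = Process(target=_crawl, args=(spider_class,))
    process.start()
    return process  # the caller joins it, as scrape_data does above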
