Completed two scraper bots
Sakthe-Balan committed Apr 7, 2024
1 parent 6320085 commit e75ec7c
Showing 8 changed files with 1,880 additions and 114 deletions.
9 changes: 4 additions & 5 deletions server/ScraperList.txt
@@ -1,12 +1,11 @@
https://www.capterra.com/categories/
https://www.softwareadvice.com/categories/
https://sourceforge.net/software/
https://sourceforge.net/software/windows/
https://www.trustradius.com/categories
https://www.getapp.com/browse/
https://sourceforge.net/software/ -out
https://www.trustradius.com/categories out
https://www.getapp.com/browse/ -out
https://www.softwaresuggest.com/all-categories
https://www.producthunt.com/categories
https://clutch.co/sitemap
https://clutch.co/sitemap -out



41 changes: 41 additions & 0 deletions server/chunk.py
@@ -0,0 +1,41 @@
import os
import json
import boto3
from math import ceil
import uuid
from dotenv import load_dotenv

load_dotenv()

output_file = 'products.json'
bucket_name = 'dinostomach' # Update with your AWS S3 bucket name
folder_name = 'getapp' # Update with your desired folder name in S3
chunk_size = 1000

# Load existing data from the output_file; fall back to an empty list if it is missing or empty
existing_data = []
if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
    with open(output_file, 'r') as json_file:
        existing_data = json.load(json_file)

total_items = len(existing_data)
num_chunks = ceil(total_items / chunk_size)

s3 = boto3.client('s3',
                  aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
                  aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
                  region_name=os.getenv('AWS_REGION'))

for i in range(num_chunks):
    chunk = existing_data[i * chunk_size: (i + 1) * chunk_size]
    new_uuid = uuid.uuid4()
    chunk_file = f'products_chunk_{new_uuid}.json'

    # Save chunk to a separate JSON file
    with open(chunk_file, 'w') as chunk_json_file:
        json.dump(chunk, chunk_json_file, indent=4)

    # Upload the chunk file to S3
    s3.upload_file(chunk_file, bucket_name, f'{folder_name}/{chunk_file}')

    # Remove the local chunk file after upload
    os.remove(chunk_file)
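
A quick way to verify the result of chunk.py is to list the uploaded chunk objects back from S3. This is a minimal sketch, assuming the same 'dinostomach' bucket and 'getapp' prefix and that the AWS credentials are available via the same .env variables; note that list_objects_v2 returns at most 1,000 keys per call, so very large runs would need pagination.

import os
import boto3
from dotenv import load_dotenv

load_dotenv()

s3 = boto3.client('s3',
                  aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
                  aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
                  region_name=os.getenv('AWS_REGION'))

# List the chunk files uploaded under the getapp/ prefix
response = s3.list_objects_v2(Bucket='dinostomach', Prefix='getapp/')
for obj in response.get('Contents', []):
    print(obj['Key'], obj['Size'])
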
14 changes: 12 additions & 2 deletions server/dino/dino/settings.py
@@ -17,7 +17,7 @@
#USER_AGENT = "dino (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -88,6 +88,16 @@
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"




TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
# In your Scrapy project's settings.py file
# settings.py
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.6'
DOWNLOAD_DELAY = 2
HTTPCOMPRESSION_ENABLED = True
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
# Define the headers to be used for making requests to the website
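
Note that the added settings define REQUEST_FINGERPRINTER_IMPLEMENTATION twice ("2.7" near the top of the hunk and '2.6' further down); since settings.py is a plain Python module, the later assignment wins and Scrapy will use '2.6'. A deduplicated view of the same additions might look like the sketch below; the values are taken from the diff, and which fingerprinter version is actually intended is an assumption.

# settings.py (consolidated additions; keep each setting once)
ROBOTSTXT_OBEY = False
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"  # the diff sets both "2.7" and '2.6'; pick one
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
DOWNLOAD_DELAY = 2
HTTPCOMPRESSION_ENABLED = True
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36')
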
7 changes: 4 additions & 3 deletions server/dino/dino/spiders/spider1.py
@@ -54,6 +54,8 @@ def parse_product_details(self, response):
        product_data['additional_info'] = additional_info.strip() if additional_info else None

        # Request to the review page
        product_data['website'] = response.css('a::attr(data-href)').get()

        review_link = product_data['link'] + '/reviews/'
        yield Request(review_link, callback=self.parse_reviews, meta={'product_data': product_data})

@@ -74,7 +76,7 @@ def parse_reviews(self, response):

        # Adding reviews to product_data
        product_data['reviews'] = reviews

        file_exists = os.path.exists(self.output_file) and os.path.getsize(self.output_file) > 0

        # Append product data to the JSON file
@@ -122,5 +124,4 @@ def closed(self, reason):
            # Remove the chunk file
            os.remove(chunk_file)

        # Remove the original JSON file
        os.remove(self.output_file)

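The hunks above show only fragments of spider1.py, so the end-to-end flow is easier to see in one place. The sketch below is a reconstruction under assumptions, not the committed file: the class name ProductSpider and the review selector are hypothetical, and the closed() chunking mirrors chunk.py. parse_reviews appends each product to self.output_file, and closed() splits that file into UUID-named chunks and uploads them to S3, now leaving the original products.json on disk (this commit drops its removal).

import os
import json
import uuid
from math import ceil

import boto3
import scrapy


class ProductSpider(scrapy.Spider):  # hypothetical name for illustration
    name = 'spider1'
    output_file = 'products.json'

    def parse_reviews(self, response):
        product_data = response.meta['product_data']
        product_data['reviews'] = response.css('div.review p::text').getall()  # selector is an assumption

        # Append the product to the shared JSON file (read-modify-write)
        file_exists = os.path.exists(self.output_file) and os.path.getsize(self.output_file) > 0
        data = []
        if file_exists:
            with open(self.output_file, 'r') as f:
                data = json.load(f)
        data.append(product_data)
        with open(self.output_file, 'w') as f:
            json.dump(data, f, indent=4)

    def closed(self, reason):
        # Split the accumulated JSON into 1000-item chunks and upload each to S3,
        # keeping the original products.json in place.
        if not (os.path.exists(self.output_file) and os.path.getsize(self.output_file) > 0):
            return
        with open(self.output_file, 'r') as f:
            data = json.load(f)

        s3 = boto3.client('s3')  # assumes credentials from the environment
        chunk_size = 1000
        for i in range(ceil(len(data) / chunk_size)):
            chunk_file = f'products_chunk_{uuid.uuid4()}.json'
            with open(chunk_file, 'w') as f:
                json.dump(data[i * chunk_size:(i + 1) * chunk_size], f, indent=4)
            s3.upload_file(chunk_file, 'dinostomach', f'getapp/{chunk_file}')
            os.remove(chunk_file)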