Completed two scraper bots
Sakthe-Balan committed Apr 7, 2024
1 parent 6320085 commit e75ec7c
Showing 8 changed files with 1,880 additions and 114 deletions.
9 changes: 4 additions & 5 deletions server/ScraperList.txt
@@ -1,12 +1,11 @@
https://www.capterra.com/categories/
https://www.softwareadvice.com/categories/
https://sourceforge.net/software/
https://sourceforge.net/software/windows/
https://www.trustradius.com/categories
https://www.getapp.com/browse/
https://sourceforge.net/software/ -out
https://www.trustradius.com/categories out
https://www.getapp.com/browse/ -out
https://www.softwaresuggest.com/all-categories
https://www.producthunt.com/categories
https://clutch.co/sitemap
https://clutch.co/sitemap -out



41 changes: 41 additions & 0 deletions server/chunk.py
@@ -0,0 +1,41 @@
import os
import json
import boto3
from math import ceil
import uuid
from dotenv import load_dotenv

load_dotenv()

output_file = 'products.json'
bucket_name = 'dinostomach' # Update with your AWS S3 bucket name
folder_name = 'getapp' # Update with your desired folder name in S3
chunk_size = 1000

# Load existing data from the output_file; fall back to an empty list if it is missing or empty
existing_data = []
if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
    with open(output_file, 'r') as json_file:
        existing_data = json.load(json_file)

total_items = len(existing_data)
num_chunks = ceil(total_items / chunk_size)

s3 = boto3.client('s3',
                  aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
                  aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
                  region_name=os.getenv('AWS_REGION'))

for i in range(num_chunks):
    chunk = existing_data[i * chunk_size: (i + 1) * chunk_size]
    new_uuid = uuid.uuid4()
    chunk_file = f'products_chunk_{new_uuid}.json'

    # Save chunk to a separate JSON file
    with open(chunk_file, 'w') as chunk_json_file:
        json.dump(chunk, chunk_json_file, indent=4)

    # Upload the chunk file to S3
    s3.upload_file(chunk_file, bucket_name, f'{folder_name}/{chunk_file}')

    # Remove the local chunk file after upload
    os.remove(chunk_file)
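
A quick way to verify the result of chunk.py is to list the uploaded chunk objects back from S3. This is a minimal sketch, assuming the same 'dinostomach' bucket and 'getapp' prefix and that the AWS credentials are available via the same .env variables; note that list_objects_v2 returns at most 1,000 keys per call, so very large runs would need pagination.

import os
import boto3
from dotenv import load_dotenv

load_dotenv()

s3 = boto3.client('s3',
                  aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
                  aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
                  region_name=os.getenv('AWS_REGION'))

# List the chunk files uploaded under the getapp/ prefix
response = s3.list_objects_v2(Bucket='dinostomach', Prefix='getapp/')
for obj in response.get('Contents', []):
    print(obj['Key'], obj['Size'])
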
14 changes: 12 additions & 2 deletions server/dino/dino/settings.py
@@ -17,7 +17,7 @@
#USER_AGENT = "dino (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -88,6 +88,16 @@
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"




TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
# In your Scrapy project's settings.py file
# settings.py
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.6'
DOWNLOAD_DELAY = 2
HTTPCOMPRESSION_ENABLED = True
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
# Define the headers to be used for making requests to the website
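
Note that the added settings define REQUEST_FINGERPRINTER_IMPLEMENTATION twice ("2.7" near the top of the hunk and '2.6' further down); since settings.py is a plain Python module, the later assignment wins and Scrapy will use '2.6'. A deduplicated view of the same additions might look like the sketch below; the values are taken from the diff, and which fingerprinter version is actually intended is an assumption.

# settings.py (consolidated additions; keep each setting once)
ROBOTSTXT_OBEY = False
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"  # the diff sets both "2.7" and '2.6'; pick one
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
DOWNLOAD_DELAY = 2
HTTPCOMPRESSION_ENABLED = True
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36')
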
7 changes: 4 additions & 3 deletions server/dino/dino/spiders/spider1.py
@@ -54,6 +54,8 @@ def parse_product_details(self, response):
        product_data['additional_info'] = additional_info.strip() if additional_info else None

        # Request to the review page
        product_data['website'] = response.css('a::attr(data-href)').get()

        review_link = product_data['link'] + '/reviews/'
        yield Request(review_link, callback=self.parse_reviews, meta={'product_data': product_data})

@@ -74,7 +76,7 @@ def parse_reviews(self, response):

        # Adding reviews to product_data
        product_data['reviews'] = reviews

        file_exists = os.path.exists(self.output_file) and os.path.getsize(self.output_file) > 0

        # Append product data to the JSON file
@@ -122,5 +124,4 @@ def closed(self, reason):
            # Remove the chunk file
            os.remove(chunk_file)

        # Remove the original JSON file
        os.remove(self.output_file)

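The hunks above show only fragments of spider1.py, so the end-to-end flow is easier to see in one place. The sketch below is a reconstruction under assumptions, not the committed file: the class name ProductSpider and the review selector are hypothetical, and the closed() chunking mirrors chunk.py. parse_reviews appends each product to self.output_file, and closed() splits that file into UUID-named chunks and uploads them to S3, now leaving the original products.json on disk (this commit drops its removal).

import os
import json
import uuid
from math import ceil

import boto3
import scrapy


class ProductSpider(scrapy.Spider):  # hypothetical name for illustration
    name = 'spider1'
    output_file = 'products.json'

    def parse_reviews(self, response):
        product_data = response.meta['product_data']
        product_data['reviews'] = response.css('div.review p::text').getall()  # selector is an assumption

        # Append the product to the shared JSON file (read-modify-write)
        file_exists = os.path.exists(self.output_file) and os.path.getsize(self.output_file) > 0
        data = []
        if file_exists:
            with open(self.output_file, 'r') as f:
                data = json.load(f)
        data.append(product_data)
        with open(self.output_file, 'w') as f:
            json.dump(data, f, indent=4)

    def closed(self, reason):
        # Split the accumulated JSON into 1000-item chunks and upload each to S3,
        # keeping the original products.json in place.
        if not (os.path.exists(self.output_file) and os.path.getsize(self.output_file) > 0):
            return
        with open(self.output_file, 'r') as f:
            data = json.load(f)

        s3 = boto3.client('s3')  # assumes credentials from the environment
        chunk_size = 1000
        for i in range(ceil(len(data) / chunk_size)):
            chunk_file = f'products_chunk_{uuid.uuid4()}.json'
            with open(chunk_file, 'w') as f:
                json.dump(data[i * chunk_size:(i + 1) * chunk_size], f, indent=4)
            s3.upload_file(chunk_file, 'dinostomach', f'getapp/{chunk_file}')
            os.remove(chunk_file)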