
Commit

initial scraper implemented
Sakthe-Balan committed Apr 6, 2024
1 parent b7581f9 commit 2af58a6
Showing 3 changed files with 47 additions and 15 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -40,3 +40,4 @@ __pycache__/

# Ignore files generated by testing frameworks
coverage/
products.json
53 changes: 43 additions & 10 deletions server/dino/dino/spiders/spider1.py
@@ -1,27 +1,60 @@
import os
import json
import boto3
from dotenv import load_dotenv
from scrapy import Spider, Request

# Load environment variables from .env file
load_dotenv()


class Spider1(Spider):
    name = 'spider1'
    start_urls = ['https://www.softwareadvice.com/categories/']
    output_file = 'products.json'
    bucket_name = 'dinostomach'
    folder_name = 'softwareadvice'

    def parse(self, response):
        # Extract category links from the page
        links = response.css('a::attr(href)').extract()

        for link in links:
            # Append '/p/all' to reach the full product listing for the category
            link += '/p/all'

            # Follow the category URL
            yield Request(link, callback=self.parse_category)

    def parse_category(self, response):
        # Extract product cards from the category page
        product_cards = response.css('.ProductCardComponent.alternatives-card.mb-4.rounded-lg.border.border-solid.border-grey-100.shadow')

        products = []

        for card in product_cards:
            # Extract data from each product card
            product_data = {
                'title': card.css('h3::text').get(),
                'description': card.css('p::text').get(),
                'price': card.css('strong::text').get(),
                'image_url': card.css('img::attr(src)').get()
            }
            products.append(product_data)

        # Append this category's products to the JSON file.
        # Note: appending one JSON array per category means the file ends up
        # holding several concatenated arrays rather than a single JSON document.
        with open(self.output_file, 'a') as json_file:
            json.dump(products, json_file, indent=4)

        self.logger.info(f'Product data appended to {self.output_file}')

    def closed(self, reason):
        # Upload the JSON file to S3 after the spider finishes
        s3 = boto3.client('s3',
                          aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
                          aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
                          region_name=os.getenv('AWS_REGION'))

        s3.upload_file(self.output_file, self.bucket_name, f'{self.folder_name}/{self.output_file}')
        self.logger.info(f'{self.output_file} uploaded to {self.bucket_name}/{self.folder_name}/{self.output_file}')
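
For local testing, one way to run this spider outside the FastAPI server is through Scrapy's CrawlerProcess, mirroring the settings lookup used in server/main.py. This is a minimal sketch, assuming the script is run from the Scrapy project directory and that the .env file provides AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_REGION; it is a hypothetical helper, not part of this commit.

# run_spider1_local.py (hypothetical helper script)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from dino.dino.spiders.spider1 import Spider1

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl(Spider1)
    process.start()  # blocks until the crawl (and the closed() S3 upload) finishes

Running scrapy crawl spider1 from inside the Scrapy project should do the same without the helper script.
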
8 changes: 3 additions & 5 deletions server/main.py
@@ -1,5 +1,4 @@
import os
import uvicorn
from dotenv import load_dotenv
from pymongo.mongo_client import MongoClient
@@ -9,9 +8,8 @@
from scrapy.utils.project import get_project_settings
from dino.dino.spiders.spider1 import Spider1
from dino.dino.spiders.spider2 import Spider2
from multiprocessing import Process



from typing import Optional
@@ -58,11 +56,11 @@ def run_spider(spider_class):
async def scrape_data():
    # Start the spider in a separate process (Spider2 is disabled for now)
    process1 = run_spider(Spider1)
    # process2 = run_spider(Spider2)

    # Wait for the process to complete
    process1.join()
    # process2.join()

    return {"message": "Scraping completed"}
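
The run_spider helper itself sits in the collapsed part of this diff and is not shown above. The imports that remain (Process from multiprocessing, plus Scrapy's get_project_settings) suggest the usual pattern of launching each crawl in its own process so a fresh Twisted reactor can be started per run. The following is an illustrative sketch of that pattern only, not the repository's actual run_spider implementation:

from multiprocessing import Process

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def _crawl(spider_class):
    # Runs inside the child process; each process gets its own reactor
    crawler = CrawlerProcess(get_project_settings())
    crawler.crawl(spider_class)
    crawler.start()

def run_spider_sketch(spider_class):
    # Hypothetical stand-in for the collapsed run_spider helper
    process = Process(target=_crawl, args=(spider_class,))
    process.start()
    return process  # the caller joins it, as scrape_data does above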
