Completed scrape and save architectures for
4 websites - Sakthe
Sakthe-Balan committed Apr 8, 2024
1 parent 924ea2c commit 6c70034
Showing 7 changed files with 299 additions and 5 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -41,4 +41,5 @@ __pycache__/
# Ignore files generated by testing frameworks
coverage/
products.json
outputs.json
outputs.json
final.json
2 changes: 2 additions & 0 deletions server/dino/dino/settings.py
@@ -101,3 +101,5 @@
HTTPCOMPRESSION_ENABLED = True
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
# Define the headers to be used for making requests to the website
# RETRY_HTTP_CODES = [500, 503, 504, 400, 408, 307, 403]
# RETRY_TIMES=5
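# A minimal sketch of enabling Scrapy's built-in RetryMiddleware, in case the
# commented settings above are ever needed (the status codes and retry count
# shown here are illustrative, not values required by this project):
# RETRY_ENABLED = True
# RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]
# RETRY_TIMES = 5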
6 changes: 6 additions & 0 deletions server/dino/dino/spiders/.env.template
@@ -0,0 +1,6 @@
# AWS Credentials
AWS_ACCESS_KEY_ID=YOUR_AWS_ACCESS_KEY_ID
AWS_SECRET_ACCESS_KEY=YOUR_AWS_SECRET_ACCESS_KEY
AWS_REGION=YOUR_AWS_REGION

# Additional environment variables can be added here as needed
3 changes: 2 additions & 1 deletion server/dino/dino/spiders/spider2.py
@@ -1734,7 +1734,7 @@ def parse_software_page(self, response):
'price': section.css('span.star_new_background::text').get(),
'image_url': section.css('img.soft_logo.ls-is-cached.lazyloaded::attr(src)').get(),
'profile_href': section.css('a.ga_track_soft_profile.view_profile_inline::attr(href)').get(),
'website':section.css('a.ga_track_vwl_vwb.ripple_btn.cat_vwbft_fdfcgp_btn.vwb_btn.d-flex.align-items-center.justify-content-center::attr(href)')[0].get()

}
print(software_info)

@@ -1747,6 +1747,7 @@ def parse_software_page(self, response):
def parse_software_profile(self, response):
software_info = response.meta['software_info']
# Extract the overview
software_info['website']=response.css('a.ga_track_vwl_comp_d.specification_visit_website::attr(href)').get()
software_info['image_url']=response.xpath('//img[not(@class)]')[2].attrib.get('src', None)
software_info['description'] = response.css('p.read_more_text_overview::text').get()

142 changes: 142 additions & 0 deletions server/dino/dino/spiders/spider3.py
@@ -0,0 +1,142 @@
import os
import json
import boto3
import scrapy
from dotenv import load_dotenv
from scrapy import Request
from math import ceil
from datetime import datetime, timedelta

# Load environment variables from .env file
load_dotenv()
links=['/all']
class Spider3(scrapy.Spider):
name = 'spider3'
start_urls = ['https://www.producthunt.com/']
output_file = 'final.json'
bucket_name = 'dinostomach'
folder_name = 'producthunt'

def parse(self, response):
# Start date
start_date = datetime(2024, 4, 8)
# End date
end_date = datetime(2023, 1, 1)

# Loop through each date, decreasing from start_date to end_date
current_date = start_date
while current_date >= end_date:
# Construct the date part of the URL
date_part = current_date.strftime("%Y/%m/%d")
# Construct the URL
url = f'https://www.producthunt.com/leaderboard/daily/{date_part}/all'
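# e.g. https://www.producthunt.com/leaderboard/daily/2024/04/08/all for the start date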
# Make a request to the URL with custom_parse_method callback
yield Request(url, callback=self.custom_parse_method)
# Decrease the date by one day
current_date -= timedelta(days=1)

def custom_parse_method(self, response):
# Loop through each div with the specified class name
for div in response.css('div.styles_item__Dk_nz.my-2.flex.flex-1.flex-row.gap-2.py-2.sm\:gap-4'):
# Extracting title
title = div.css('strong::text').get()
print(title)
# Extracting image_url
image_url = div.css('img.styles_mediaThumbnail__NCzNO::attr(src)').get()
print(image_url)
# Extracting website
website = 'https://www.producthunt.com/' + div.css('a.styles_externalLinkIcon__vjPDi::attr(href)').get()
print(website)
# Build the meta dict to pass along to the detail-page request
mets = {'title': title, 'image_url': image_url, 'website': website}
titles = title.lower().replace(" ", "-")

# Request the detailed page
detailed_url = 'https://www.producthunt.com/' + f'posts/{titles}'
print(detailed_url)
links.extend(response.css('a.text-14.font-semibold.text-light-grey::attr(href)').extract())
yield Request(detailed_url, callback=self.parse_detailed_page, meta=mets)

def parse_detailed_page(self, response):
mets = response.meta
# Extracting description
description = response.css('div.styles_htmlText__eYPgj.text-16.font-normal.text-dark-grey::text').get()
if description == "null" or description is None or description == "":
description = response.css('div.text-16.font-normal.text-light-grey.mb-6::text').get()


mets['description'] = description
print(description)

reviews_url = response.url + '/reviews'

yield Request(reviews_url, callback=self.parse_reviews, meta=mets)

def parse_reviews(self, response):
mets = response.meta
reviews = response.css('div.text-18.font-normal.text-dark-grey.text-center.mt-4::text').get()
# Check if reviews is None or an empty string
if reviews is None or reviews == "":
# Try a different selector
reviews = response.css('div.styles_htmlText__eYPgj.text-18.font-normal.text-light-grey.italic.styles_format__8NeQe.styles_overallExperience__x7Gqf::text').get()

# If reviews is still None or an empty string, set it to "No reviews" or any other suitable default value
if reviews is None or reviews == "":
reviews = "No reviews"
print(reviews)
# Add reviews to meta
mets['reviews'] = [reviews]
meta_data = {
'title': mets['title'],
'image_url': mets['image_url'],
'website': mets['website'],
'description': mets['description'],
'reviews': mets['reviews']
}
self.save_data(meta_data)

def save_data(self, data):
file_exists = os.path.exists(self.output_file) and os.path.getsize(self.output_file) > 0

with open(self.output_file, 'a') as json_file:
if not file_exists:
json_file.write('[')
else:
json_file.write(',')

json.dump(data, json_file, indent=4)


def closed(self, reason):
# Custom logic to execute after the spider is closed
# Example: Upload the JSON file to S3
with open(self.output_file, 'a') as json_file:
json_file.write(']') # Add closing square bracket to indicate the end of JSON array

s3 = boto3.client('s3',
aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
region_name=os.getenv('AWS_REGION'))

if os.path.exists(self.output_file) and os.path.getsize(self.output_file) > 0:
with open(self.output_file, 'r') as json_file:
existing_data = json.load(json_file)

total_items = len(existing_data)
chunk_size = 1000
num_chunks = ceil(total_items / chunk_size)
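# e.g. 2,500 items -> ceil(2500 / 1000) = 3 chunks of 1000, 1000 and 500 items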

for i in range(num_chunks):
chunk = existing_data[i * chunk_size: (i + 1) * chunk_size]
chunk_file = f'custom_output_chunk_{i + 1}.json'

with open(chunk_file, 'w') as chunk_json_file:
json.dump(chunk, chunk_json_file, indent=4)

s3.upload_file(chunk_file, self.bucket_name, f'{self.folder_name}/{chunk_file}')
self.logger.info(f'{chunk_file} uploaded to {self.bucket_name}/{self.folder_name}/{chunk_file}')

os.remove(chunk_file)



93 changes: 93 additions & 0 deletions server/dino/dino/spiders/spider_template.py
@@ -0,0 +1,93 @@
import os
import json
import boto3
from dotenv import load_dotenv
from scrapy import Spider, Request
from math import ceil

# Instructions for integrating with FastAPI
# ----------------------------------------
# 1. Import this spider into your FastAPI application's main.py file.
# 2. Use the Scrapy CrawlerProcess to run the spider.
# 3. Define an endpoint in your FastAPI application that triggers the spider.
# 4. Call the spider using its name ('custom_scraper' in this case) when the endpoint is hit.
# Replace 'YOUR_START_URL_HERE', 'YOUR_CSS_SELECTOR_HERE', 'YOUR_BUCKET_NAME', and 'YOUR_FOLDER_NAME' with actual values.
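# For reference, a minimal sketch of steps 1-4 (the endpoint path, import path
# and process handling below are illustrative assumptions, not fixed parts of
# this template):
#
#   from multiprocessing import Process
#   from fastapi import FastAPI
#   from scrapy.crawler import CrawlerProcess
#   from scrapy.utils.project import get_project_settings
#   from dino.dino.spiders.spider_template import CustomScraper
#
#   app = FastAPI()
#
#   def _run_spider(spider_class):
#       # CrawlerProcess blocks until the crawl finishes, so it is run in a
#       # separate OS process to keep the FastAPI event loop responsive
#       crawler = CrawlerProcess(get_project_settings())
#       crawler.crawl(spider_class)
#       crawler.start()
#
#   @app.get("/run_custom_scraper")
#   async def run_custom_scraper():
#       Process(target=_run_spider, args=(CustomScraper,)).start()
#       return {"message": "custom_scraper started"}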


# Load environment variables from .env file
load_dotenv()

class CustomScraper(Spider):
name = 'custom_scraper'
start_urls = ['YOUR_START_URL_HERE'] # Define your start URLs here
output_file = 'custom_output.json' # Output file name
bucket_name = 'YOUR_BUCKET_NAME' # S3 bucket name
folder_name = 'YOUR_FOLDER_NAME' # S3 folder name

def parse(self, response):
# Custom parsing logic goes here
# Example: Extract links from the page
links = response.css('YOUR_CSS_SELECTOR_HERE').extract()

for link in links:
# Customize the request to the extracted links
yield Request(link, callback=self.custom_parse_method)

def custom_parse_method(self, response):
# Custom parsing logic for each extracted link
# Example: Extract product details
product_data = {
'title': response.css('YOUR_CSS_SELECTOR_HERE').get(),
'description': response.css('YOUR_CSS_SELECTOR_HERE').get(),
# Add more fields as needed
}

# Custom processing or storage logic
# Example: Save product data to a file or database
self.save_data(product_data)

def save_data(self, data):
# Custom logic to save data
# Example: Save to a JSON file
file_exists = os.path.exists(self.output_file) and os.path.getsize(self.output_file) > 0

with open(self.output_file, 'a') as json_file:
if not file_exists:
json_file.write('[') # Add opening square bracket if the file is empty
else:
json_file.write(',') # Add comma to separate JSON objects

json.dump(data, json_file, indent=4)

def closed(self, reason):
# Custom logic to execute after the spider is closed
# Example: Upload the JSON file to S3
with open(self.output_file, 'a') as json_file:
json_file.write(']') # Add closing square bracket to indicate the end of JSON array

s3 = boto3.client('s3',
aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
region_name=os.getenv('AWS_REGION'))

if os.path.exists(self.output_file) and os.path.getsize(self.output_file) > 0:
with open(self.output_file, 'r') as json_file:
existing_data = json.load(json_file)

total_items = len(existing_data)
chunk_size = 1000
num_chunks = ceil(total_items / chunk_size)

for i in range(num_chunks):
chunk = existing_data[i * chunk_size: (i + 1) * chunk_size]
chunk_file = f'custom_output_chunk_{i + 1}.json'

with open(chunk_file, 'w') as chunk_json_file:
json.dump(chunk, chunk_json_file, indent=4)

s3.upload_file(chunk_file, self.bucket_name, f'{self.folder_name}/{chunk_file}')
self.logger.info(f'{chunk_file} uploaded to {self.bucket_name}/{self.folder_name}/{chunk_file}')

os.remove(chunk_file)


55 changes: 52 additions & 3 deletions server/main.py
@@ -8,8 +8,10 @@
from scrapy.utils.project import get_project_settings
from dino.dino.spiders.spider1 import Spider1
from dino.dino.spiders.spider2 import Spider2
from dino.dino.spiders.spider3 import Spider3
from multiprocessing import Process

from typing import Optional, Dict
from importlib import import_module


from typing import Optional
@@ -19,11 +21,15 @@
# MongoDB connection settings
MONGO_URI = os.getenv("MONGO_URI")


# Initialize MongoDB client with server API version 1
client = MongoClient(MONGO_URI, server_api=ServerApi('1'))

app = FastAPI()

# Global dictionary to store running spider processes
running_spiders: Dict[str, Process] = {}

# Default endpoint
@app.get("/")
async def base_function():
@@ -50,20 +56,63 @@ def _run_spider(spider_class):
def run_spider(spider_class):
process = Process(target=_run_spider, args=(spider_class,))
process.start()
# Store the process in the global dictionary using the spider class name as the key
running_spiders[spider_class.__name__] = process
return process

@app.get("/scrape")
async def scrape_data():
# Start all three spiders in separate processes
process1 = run_spider(Spider1)
process2 = run_spider(Spider2)

process3 = run_spider(Spider3)
# Wait for all three processes to complete
process1.join()
process2.join()

process3.join()
return {"message": "Scraping completed"}

@app.post("/stop_spider/{spider_name}")
async def stop_spider(spider_name: str):
"""
Endpoint to stop a running spider.
Parameters:
spider_name (str): The name of the class of the spider to stop.
Returns:
dict: A message indicating whether the spider was stopped successfully.
"""
if spider_name in running_spiders:
process = running_spiders[spider_name]
process.terminate()
process.join() # Wait for the process to terminate
del running_spiders[spider_name] # Remove the process from the dictionary
return {"message": f"Spider {spider_name} stopped successfully."}
else:
return {"message": f"No spider with the name {spider_name} is currently running."}

@app.get("/run_spider/{spider_name}")
async def run_specific_spider(spider_name: str):
"""
Endpoint to run a specific spider by name.
Parameters:
spider_name (str): The name of the class of the spider to be run.
Returns:
dict: A message indicating whether the spider was started successfully.
"""
try:
spider_class = globals().get(spider_name)
if spider_class and isinstance(spider_class, type):
run_spider(spider_class)
return {"message": f"{spider_name} started successfully."}
else:
raise HTTPException(status_code=404, detail="Spider not found")
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
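# A minimal client-side sketch of exercising the two endpoints above (it assumes
# the app is served on localhost:8000, which is not shown in this diff, and that
# the `requests` package is installed):
#
#   import requests
#   requests.get("http://localhost:8000/run_spider/Spider3")
#   requests.post("http://localhost:8000/stop_spider/Spider3")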


# Run the FastAPI app using Uvicorn server
if __name__ == "__main__":
