
Commit

Implemented Parallel architecture
Sakthe-Balan committed Apr 6, 2024
1 parent 73eca93 commit b7581f9
Showing 5 changed files with 92 additions and 18 deletions.
10 changes: 0 additions & 10 deletions server/dino/dino/spiders/my_spider.py

This file was deleted.

27 changes: 27 additions & 0 deletions server/dino/dino/spiders/spider1.py
@@ -0,0 +1,27 @@
import os
import boto3
from dotenv import load_dotenv
from scrapy.spiders import Spider

# Load environment variables from .env file
load_dotenv()

class Spider1(Spider):
name = 'spider1'
start_urls = ['https://www.softwareadvice.com/categories/']

def parse(self, response):
# Extract the entire HTML content of the page
page_content = response.text

# Print the entire HTML content
print(1, page_content)

        # Write the HTML content to the S3 bucket
s3 = boto3.client('s3',
aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
region_name=os.getenv('AWS_REGION'))
bucket_name = 'dinostomach'
object_key = 'example.html'
s3.put_object(Body=page_content, Bucket=bucket_name, Key=object_key)
27 changes: 27 additions & 0 deletions server/dino/dino/spiders/spider2.py
@@ -0,0 +1,27 @@
import os
import boto3
from dotenv import load_dotenv
from scrapy.spiders import Spider

# Load environment variables from .env file
load_dotenv()

class Spider2(Spider):
name = 'spider2'
start_urls = ['https://www.softwareadvice.com/categories/']

def parse(self, response):
        # Extract only the <body> element of the page
        page_content = response.css('body').get()

        # Print the extracted content
        print(1, page_content)

        # Write the extracted content to the S3 bucket
s3 = boto3.client('s3',
aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
region_name=os.getenv('AWS_REGION'))
bucket_name = 'dinostomach'
object_key = 'example1.html'
s3.put_object(Body=page_content, Bucket=bucket_name, Key=object_key)
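Both spiders write the scraped markup straight into the dinostomach bucket under fixed keys (example.html and example1.html). As a quick way to confirm an upload landed, the object can be read back with the same boto3 client — a minimal sketch, assuming the same .env credentials as the spiders above; this check is not part of the commit:

import os

import boto3
from dotenv import load_dotenv

# Same credential setup as the spiders: keys come from the .env file.
load_dotenv()

s3 = boto3.client('s3',
                  aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
                  aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
                  region_name=os.getenv('AWS_REGION'))

# Fetch the object spider1 wrote and report its size.
obj = s3.get_object(Bucket='dinostomach', Key='example.html')
html = obj['Body'].read().decode('utf-8')
print(len(html), 'characters downloaded from S3')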
39 changes: 31 additions & 8 deletions server/main.py
@@ -4,10 +4,16 @@
from dotenv import load_dotenv
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
-from fastapi import FastAPI, UploadFile, File, HTTPException
+from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
-from dino.dino.spiders.my_spider import MySpider
+from dino.dino.spiders.spider1 import Spider1
+from dino.dino.spiders.spider2 import Spider2
+import asyncio
+from multiprocessing import Process
+import threading


from typing import Optional

load_dotenv()
@@ -37,14 +43,31 @@ async def base_function():
except Exception as e:
return {"message": f"The following exception occurred: {e}", "status": 404}

-@app.get("/scrape")
-def run_scraper():

+def _run_spider(spider_class):
    process = CrawlerProcess(get_project_settings())
-    process.crawl(MySpider)
-    process.start() # This will block until the crawling is finished
+    process.crawl(spider_class)
+    process.start()

def run_spider(spider_class):
process = Process(target=_run_spider, args=(spider_class,))
process.start()
return process

@app.get("/scrape")
async def scrape_data():
# Start both spiders in separate processes
process1 = run_spider(Spider1)
process2 = run_spider(Spider2)

# Wait for both processes to complete
process1.join()
process2.join()

return {"message": "Scraping completed"}



# Run the FastAPI app using Uvicorn server
if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=8000)
+    uvicorn.run("main:app", host="0.0.0.0", port=8000)

7 changes: 7 additions & 0 deletions server/test.py
@@ -0,0 +1,7 @@
import requests

url = "http://localhost:8000/scrape"

response = requests.get(url)

print(response.json())
