
Commit

Implemented Parallel architecture
Sakthe-Balan committed Apr 6, 2024
1 parent 73eca93 commit b7581f9
Showing 5 changed files with 92 additions and 18 deletions.
10 changes: 0 additions & 10 deletions server/dino/dino/spiders/my_spider.py

This file was deleted.

27 changes: 27 additions & 0 deletions server/dino/dino/spiders/spider1.py
@@ -0,0 +1,27 @@
import os
import boto3
from dotenv import load_dotenv
from scrapy.spiders import Spider

# Load environment variables from .env file
load_dotenv()

class Spider1(Spider):
name = 'spider1'
start_urls = ['https://www.softwareadvice.com/categories/']

def parse(self, response):
# Extract the entire HTML content of the page
page_content = response.text

# Print the entire HTML content
print(1, page_content)

        # Write the HTML content to the S3 bucket
s3 = boto3.client('s3',
aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
region_name=os.getenv('AWS_REGION'))
bucket_name = 'dinostomach'
object_key = 'example.html'
s3.put_object(Body=page_content, Bucket=bucket_name, Key=object_key)
27 changes: 27 additions & 0 deletions server/dino/dino/spiders/spider2.py
@@ -0,0 +1,27 @@
import os
import boto3
from dotenv import load_dotenv
from scrapy.spiders import Spider

# Load environment variables from .env file
load_dotenv()

class Spider2(Spider):
name = 'spider2'
start_urls = ['https://www.softwareadvice.com/categories/']

def parse(self, response):
        # Extract only the <body> element of the page
        page_content = response.css('body').get()

        # Print the extracted content
        print(1, page_content)

        # Write the extracted content to the S3 bucket
s3 = boto3.client('s3',
aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
region_name=os.getenv('AWS_REGION'))
bucket_name = 'dinostomach'
object_key = 'example1.html'
s3.put_object(Body=page_content, Bucket=bucket_name, Key=object_key)
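Both spiders write the scraped markup straight into the dinostomach bucket under fixed keys (example.html and example1.html). As a quick way to confirm an upload landed, the object can be read back with the same boto3 client — a minimal sketch, assuming the same .env credentials as the spiders above; this check is not part of the commit:

import os

import boto3
from dotenv import load_dotenv

# Same credential setup as the spiders: keys come from the .env file.
load_dotenv()

s3 = boto3.client('s3',
                  aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
                  aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
                  region_name=os.getenv('AWS_REGION'))

# Fetch the object spider1 wrote and report its size.
obj = s3.get_object(Bucket='dinostomach', Key='example.html')
html = obj['Body'].read().decode('utf-8')
print(len(html), 'characters downloaded from S3')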
39 changes: 31 additions & 8 deletions server/main.py
@@ -4,10 +4,16 @@
from dotenv import load_dotenv
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
-from fastapi import FastAPI, UploadFile, File, HTTPException
+from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
-from dino.dino.spiders.my_spider import MySpider
+from dino.dino.spiders.spider1 import Spider1
+from dino.dino.spiders.spider2 import Spider2
+import asyncio
+from multiprocessing import Process
+import threading


from typing import Optional

load_dotenv()
@@ -37,14 +43,31 @@ async def base_function():
except Exception as e:
return {"message": f"The following exception occurred: {e}", "status": 404}

-@app.get("/scrape")
-def run_scraper():

+def _run_spider(spider_class):
    process = CrawlerProcess(get_project_settings())
-    process.crawl(MySpider)
-    process.start() # This will block until the crawling is finished
+    process.crawl(spider_class)
+    process.start()

def run_spider(spider_class):
process = Process(target=_run_spider, args=(spider_class,))
process.start()
return process

@app.get("/scrape")
async def scrape_data():
# Start both spiders in separate processes
process1 = run_spider(Spider1)
process2 = run_spider(Spider2)

# Wait for both processes to complete
process1.join()
process2.join()

return {"message": "Scraping completed"}



# Run the FastAPI app using Uvicorn server
if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=8000)
+    uvicorn.run("main:app", host="0.0.0.0", port=8000)

7 changes: 7 additions & 0 deletions server/test.py
@@ -0,0 +1,7 @@
import requests

url = "http://localhost:8000/scrape"

response = requests.get(url)

print(response.json())
