Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

asos_app #11

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
97 changes: 31 additions & 66 deletions items/utils.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,22 @@
import json
import os
from typing import Any, Dict

from urllib.parse import urlparse, parse_qsl, urlunparse, urlencode

import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup

from items.debugging import app_logger as log

load_dotenv()


def get_next_url(url: str, param: str, nxt: int) -> str:
    """Return *url* with the integer query parameter *param* advanced by *nxt*.

    Example: get_next_url('https://x.com/a?page=2', 'page', 1)
             -> 'https://x.com/a?page=3'

    Raises KeyError if *param* is missing from the query string and
    ValueError if its value is not an integer.
    """
    parts = urlparse(url)
    # parse_qsl yields (str, str) pairs, so every value in this dict is a
    # plain string — the old isinstance(..., list) branch was unreachable.
    query: Dict[str, Any] = dict(parse_qsl(parts.query))
    query[param] = int(query[param]) + nxt
    return urlunparse(parts._replace(query=urlencode(query)))

def global_headers():
    """Browser-like HTTP headers shared by every spider.

    Mimics Chrome 96 on macOS so target sites serve the normal desktop
    HTML rather than a bot-detection/challenge page.
    """
    headers = {}
    headers['sec-ch-ua'] = '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"'
    headers['sec-ch-ua-mobile'] = '?0'
    headers['sec-ch-ua-platform'] = '"macOS"'
    headers['user-agent'] = (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    )
    headers['sec-fetch-user'] = '?1'
    headers['sec-fetch-dest'] = 'document'
    headers['accept-encoding'] = 'gzip, deflate, br'
    headers['accept-language'] = 'en-US,en;q=0.9,ar;q=0.8,fr;q=0.7,de;q=0.6'
    return headers

def get_ld_json(response: requests.Response):
soup = BeautifulSoup(response.content, 'html.parser')
Expand All @@ -40,59 +29,35 @@ def get_ld_json(response: requests.Response):
log.info(f'ld+json not found for {response.url}')
return None


# This should be a standard template for all shopify websites
def get_shopify_variants(response: requests.Response):
    """Extract (product id, product type, variants) from a Shopify product page.

    Shopify pages embed a ``var meta = {...};`` script whose JSON payload
    carries the product's variant list. Raises IndexError/StopIteration if
    no such script is present.
    """
    soup = BeautifulSoup(response.content, 'html.parser')
    candidates = [tag.text for tag in soup.findAll('script') if '"variants":' in tag.text]
    statement = next(part for part in candidates[0].split(';') if '"variants":' in part)
    meta = json.loads(statement.strip().replace('var meta = ', ''))
    product = meta['product']
    return product['id'], product['type'], product['variants']


# Parsing reviews from stamped.io
def parse_api_reviews(self):
    """Collect all written reviews for ``self.product_id`` from the ASOS review API.

    Pages through the JSON API 100 reviews at a time and keeps only entries
    that actually contain review text (rating-only entries are skipped).

    Returns a list of dicts with keys: date, author, location, header, body.
    """
    product_id = self.product_id
    reviews = []

    def _get_total_results(pid):
        # One probe request just to learn how many reviews exist in total.
        url = f'https://www.asos.com/api/product/reviews/v1/products/{pid}?offset=1&limit=100&include=Products&store=US&lang=en-US&filteredStats=reviews&sort=SubmissionTime:desc'
        response = requests.get(url)
        return response.json()['totalResults']

    for offset in range(1, int(_get_total_results(product_id)), 100):
        url = f'https://www.asos.com/api/product/reviews/v1/products/{product_id}?offset={offset}&limit=100&include=Products&store=US&lang=en-US&filteredStats=reviews&sort=SubmissionTime:desc'
        response = requests.get(url)
        data = response.json()
        for ele in data['results']:
            # Skip rating-only entries with no written review text.
            if ele['reviewText']:
                reviews.append({
                    'date': ele['submissionTime'],
                    'author': ele['userNickname'],
                    'location': ele['contentLocale'],
                    'header': ele['title'],
                    'body': ele['reviewText'],
                })
    return reviews
7 changes: 4 additions & 3 deletions run.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from spiders.ninewest import NineWest
from spiders.asos import Asos

if __name__ == '__main__':
    # Manual smoke test: scrape one known ASOS product page and dump
    # its parsed product info and reviews to stdout.
    url = 'https://www.asos.com/adidas-originals/adidas-originals-streetball-ii-trainers-in-grey-tones-and-cream/prd/201271050?ctaref=we+recommend+grid_6&featureref1=we+recommend+pers'
    p = Asos(product_url=url)
    print(p.get_product_info())
    print(p.get_product_review())
78 changes: 78 additions & 0 deletions spiders/asos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import requests
import json
from typing import List, Dict
from dotenv import load_dotenv
from items.utils import get_ld_json, global_headers, parse_api_reviews
load_dotenv()
from datetime import datetime

class Asos:
    """Scraper for ASOS product pages and their public review API."""

    # Cached results of the most recent get_product_info / get_product_review.
    product_info = None
    product_review = None

    def __init__(self, product_url, product_name=None, product_sku=None, product_id=None):
        """Store identifiers and normalise *product_url*.

        Normalisation drops a trailing slash and, for canonical
        ``.../prd/<id>`` URLs, strips the query string (it only carries
        tracking parameters such as ``ctaref``/``featureref1``).
        The original elif chain skipped query-stripping whenever the URL
        also ended with '/'; both rules now apply independently.
        """
        if product_url.endswith('/'):
            product_url = product_url[:-1]
        if 'prd' in product_url:
            product_url = product_url.split('?')[0]
        self.product_url = product_url
        self.product_name = product_name
        self.product_sku = product_sku
        self.product_id = product_id

    @staticmethod
    def _parse_json(ld: Dict) -> Dict:
        """Map a schema.org ld+json Product dict to our flat product schema.

        Expected *ld* keys (from the page's ``application/ld+json`` script):
        name, sku, image, url, productID, brand.name; description is optional.
        """
        now = str(datetime.now())
        return {
            'title': ld['name'],
            'sku': ld['sku'],
            'description': ld.get('description'),
            'image': ld['image'],
            'url': ld['url'],
            'brand': ld['brand']['name'],
            'product_id': ld['productID'],
            'product_url': ld['url'],
            'created': now,
            'last_updated': now,
        }

    def get_product_info(self, proxy=False) -> Dict:
        """Fetch the product page and return the parsed product dict.

        Caches the result on ``self.product_info`` and records
        ``self.product_id`` for later review lookups. *proxy* is accepted
        for interface compatibility but currently unused.
        """
        response = requests.get(self.product_url, headers=global_headers())
        ld_json = get_ld_json(response)
        data = self._parse_json(ld_json)
        # Prefer the normalised URL we were constructed with, and tag the
        # record with the spider's name.
        data['product_url'] = self.product_url
        data['spider'] = Asos.__name__.lower()
        self.product_info = data
        self.product_id = data['product_id']
        return data

    def get_product_review(self):
        """Return the product's reviews, resolving the product id first.

        parse_api_reviews needs ``self.product_id``; when it is unknown,
        one get_product_info call discovers it from the page's ld+json.
        """
        if not self.product_id:
            self.product_info = self.get_product_info()
        return parse_api_reviews(self)

101 changes: 0 additions & 101 deletions spiders/ninewest.py

This file was deleted.

68 changes: 0 additions & 68 deletions spiders/sephora.py

This file was deleted.

Loading