-
Notifications
You must be signed in to change notification settings - Fork 5
/
Web Scraping_new.py
92 lines (68 loc) · 3.44 KB
/
Web Scraping_new.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 24 12:30:20 2021
@author: Chandramouli
"""
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
# Browser-like request headers. The original script built this mapping but
# never passed it to requests.get(); it is now sent with every page request.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    ),
    'Accept-Language': 'en-US, en;q=0.5',
}

# Search-result page URL; ``start`` is the 1-based index of the first title.
SEARCH_URL = (
    "https://www.imdb.com/search/title/"
    "?title_type=feature&primary_language=en&start={start}&ref_=adv_nxt"
)

# Start offsets requested by default: 1, 51, ..., 951 (step of 50).
PAGE_STARTS = range(1, 1000, 50)

# Seconds to wait for a response before giving up on a request.
REQUEST_TIMEOUT = 30

# Placeholder stored when a field is absent from a result card.
MISSING = "NA"

# Spreadsheet columns, in output order.
COLUMNS = [
    "Title",
    "Year of Release",
    "Watch Time",
    "Genre",
    "Movie Rating",
    "Metascore of movie",
    "Director",
    "Cast",
    "Votes",
    "Description",
]

OUTPUT_FILE = "movie data_new1.xlsx"


def build_page_url(start):
    """Return the search-result URL whose first listed title is *start*."""
    return SEARCH_URL.format(start=start)


def text_or_na(tag):
    """Return ``tag.text``, or the "NA" placeholder when *tag* is None."""
    return tag.text if tag is not None else MISSING


def clean_year(raw):
    """Return *raw* with every '(' and ')' removed, e.g. "(2021)" -> "2021"."""
    return raw.replace('(', '').replace(')', '')


def parse_movie(store):
    """Extract one spreadsheet row from a single result card.

    *store* is the BeautifulSoup tag of one 'lister-item mode-advanced' div.
    Returns a dict keyed by the names in COLUMNS. Every field except the
    title falls back to "NA" (or an empty cast string) when its element is
    missing, instead of raising as the unguarded lookups used to.

    NOTE(review): the selectors assume the 'lister-item' search-result
    markup -- confirm that it still matches the live page.
    """
    year_tag = store.h3.find(
        'span', class_="lister-item-year text-muted unbold")
    rating_tag = store.find('div', class_="inline-block ratings-imdb-rating")

    # class_='' selects the <p> that carries no class attribute. Its first
    # link is stored as the director and the remaining links as the cast.
    credits_tag = store.find('p', class_='')
    people = []
    if credits_tag is not None:
        people = [link.text for link in credits_tag.find_all('a')]

    vote_tags = store.find_all('span', attrs={'name': 'nv'})

    # The description is the second 'text-muted' paragraph of the card.
    muted_paragraphs = store.find_all('p', class_='text-muted')
    if len(muted_paragraphs) > 1:
        summary = muted_paragraphs[1].text.replace('\n', '')
    else:
        summary = MISSING

    return {
        "Title": store.h3.a.text,
        "Year of Release": clean_year(text_or_na(year_tag)),
        "Watch Time": text_or_na(store.find('span', class_='runtime')),
        "Genre": text_or_na(store.find('span', class_='genre')),
        "Movie Rating": text_or_na(rating_tag).replace('\n', ''),
        "Metascore of movie": text_or_na(
            store.find('span', class_="metascore")),
        "Director": people[0] if people else MISSING,
        "Cast": ','.join(people[1:]),
        "Votes": vote_tags[0].text if vote_tags else MISSING,
        "Description": summary,
    }


def scrape_movies(page_starts=PAGE_STARTS):
    """Download the result pages and return every movie as a DataFrame.

    *page_starts* is an iterable of start offsets, one request per offset.
    A page answered with an HTTP error status contributes no rows and is
    reported on stdout; connection errors and timeouts propagate.
    """
    records = []
    for start in page_starts:
        response = requests.get(
            build_page_url(start), headers=HEADERS, timeout=REQUEST_TIMEOUT)
        if not response.ok:
            # An error page holds no result cards; say so instead of
            # silently parsing it into zero rows.
            print("Skipping start={}: HTTP {}".format(
                start, response.status_code))
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all(
            'div', attrs={'class': 'lister-item mode-advanced'})
        records.extend(parse_movie(card) for card in cards)
    return pd.DataFrame(records, columns=COLUMNS)


if __name__ == "__main__":
    movie_list = scrape_movies()
    movie_list.to_excel(OUTPUT_FILE)