-
Notifications
You must be signed in to change notification settings - Fork 1
/
scrap.py
59 lines (47 loc) · 1.98 KB
/
scrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import urllib
from bs4 import BeautifulSoup
import requests
import re
import pymongo
# MongoDB handles: documents are written to the "big_col1" collection of the
# "travelbig1" database on the MongoDB server at localhost:27017.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient.get_database("travelbig1")
mycol = mydb.get_collection("big_col1")

# Seed pages for the crawl. Each one is downloaded and the links found on it
# are the candidates for scraping.
urls = [
    'https://www.keralatourism.org/destination/munnar/202',
    'https://www.keralatourism.org/destination/periyar-tiger-reserve-idukki/192',
    'https://www.keralatourism.org/destination/fort-kochi/422',
    'https://www.keralatourism.org/destination/alappuzha-beach/60',
    'https://www.keralatourism.org/destination/bekal-kasaragod/259',
    'https://www.keralatourism.org/destination/vagamon-idukki/324',
    'https://www.keralatourism.org/destination/muzhapilangad-beach/85',
    'https://www.holidify.com/state/kerala/',
    'https://www.makemytrip.com/holidays-india/kerala-tourism.html',
    'https://www.ekeralatourism.net/',
]
# Crawl: for every seed page, follow the Kerala-related links found on it and
# store the title plus the first few paragraphs of each linked page in MongoDB.

# A link is followed only if its href contains one of these substrings.
_FOLLOW_MARKERS = (
    "www.keralatourism.org",
    "www.makemytrip.com/holidays-india/kerala",
    "www.holidify.com/pages/kerala",
    "www.ekeralatourism.net",
)
# Seconds to wait for a server before giving up; without a timeout a single
# unresponsive host would block the crawl indefinitely.
_REQUEST_TIMEOUT = 30
# Number of leading <p> elements whose text is stored for each page.
_MAX_PARAGRAPHS = 4

for url in urls:
    try:
        page = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # One unreachable seed must not abort the remaining seeds.
        print("skipping seed page", url, "-", exc)
        continue

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    seed_soup = BeautifulSoup(page.text, 'html.parser')

    # A set removes hrefs that are repeated on the same seed page.
    links = {anchor.get('href') for anchor in seed_soup.find_all('a')}

    for link in links:
        # Anchors without an href yield None; everything else must mention
        # one of the followed sites.
        if link is None or not any(marker in link for marker in _FOLLOW_MARKERS):
            continue

        try:
            r = requests.get(link, timeout=_REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # Covers connection failures, timeouts and malformed hrefs
            # (e.g. a missing scheme); move on to the next link.
            print("skipping link", link, "-", exc)
            continue

        page_soup = BeautifulSoup(r.text, 'html.parser')

        # Reset per page: a page without <title> must not inherit the title
        # of the previously scraped page (or fail on the very first one).
        title = page_soup.title.string if page_soup.title else None

        paragraphs = page_soup.find_all('p')
        if not paragraphs:
            continue

        # Join the leading paragraphs and collapse every run of whitespace
        # into a single space.
        raw_text = " ".join(p.text for p in paragraphs[:_MAX_PARAGRAPHS])
        text = " ".join(raw_text.split())

        # NOTE(review): "url" holds the seed page, not the link that was
        # actually scraped. Kept as-is for compatibility with existing
        # documents; confirm whether `link` was intended.
        mycol.insert_one({"url": url, "title": title, "text": text})