This repository has been archived by the owner on Oct 22, 2018. It is now read-only.
forked from minimalparts/PeARS
-
Notifications
You must be signed in to change notification settings - Fork 10
/
retrieve_pages.py
112 lines (90 loc) · 3.52 KB
/
retrieve_pages.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import os
import platform
import re
import requests
import sqlite3
import sys
from urllib2 import HTTPError
from create_history_db import create_history_db
# Path of the Firefox history database; stays empty until the script's
# __main__ section locates it.
HISTORY_DB = ''

# Report the platform this module is running on.
print('%s %s' % (platform.system(), platform.release()))

# The current user's home directory is the root of the Firefox profile search.
home_directory = os.path.expanduser('~')
print(home_directory)
def get_firefox_history_db(in_dir):
    """Locate Firefox's ``places.sqlite`` history database under a home directory.

    Walks ``<in_dir>/.mozilla/firefox`` and returns the first directory that
    contains a file named exactly ``places.sqlite``. Only this profile
    layout is searched; other locations are not considered.

    :param in_dir: home directory to search, e.g. ``os.path.expanduser('~')``.
    :returns: path to ``places.sqlite`` (directory part resolved with
        ``os.path.realpath``), or ``None`` when no such file is found.
    """
    firefox_directory = os.path.join(in_dir, '.mozilla', 'firefox')
    print(firefox_directory)
    for dirpath, _dirnames, filenames in os.walk(firefox_directory):
        # Require the exact file name. A substring/regex match would also
        # accept siblings such as 'places.sqlite-wal' and hand back a path to
        # a database file that does not actually exist.
        if 'places.sqlite' in filenames:
            history_db = os.path.join(os.path.realpath(dirpath), 'places.sqlite')
            print(history_db)
            return history_db
    return None
# Echo the module-level HISTORY_DB value as it stands when this line runs.
print(HISTORY_DB)
# Lower-case file extensions (each with its leading dot) of downloads that
# are skipped instead of being fetched and stored as text.
_BINARY_EXTENSIONS = (
    '.gz', '.zip', '.exe', '.pdf', '.jpeg', '.jpg', '.7z', '.iso', '.img',
    '.png', '.svg', '.mp4', '.mp3', '.ogg', '.avi', '.wma', '.wmv', '.gif',
    '.rpm', '.deb', '.mkv', '.rar', '.m4a', '.tgz', '.tar', '.webm',
)


def retrieve_pages(db_path='history.db', timeout=30):
    """Fetch every URL in the ``History`` table and store its title and text.

    For each row the URL is downloaded (redirects are not followed); pages
    answering with HTTP 200 that have a ``<title>`` element get their
    ``title`` and ``body`` columns updated, one commit per page. URLs that
    look like binary downloads, fail to download, or return another status
    are reported on stdout and skipped.

    :param db_path: SQLite file containing the ``History`` table.
    :param timeout: seconds to wait on each request before giving up on
        that URL, so a single unresponsive server cannot stall the run.
    """
    con = sqlite3.connect(db_path)
    try:
        con.row_factory = sqlite3.Row
        cur = con.cursor()
        cur.execute("SELECT * FROM History;")
        for row in cur.fetchall():
            row_id = row['Id']
            url = str(row['URL'])

            # Compare case-insensitively and with the dot included, so that
            # 'PHOTO.JPG' is skipped but a page such as '/blogg' is not.
            if url.lower().endswith(_BINARY_EXTENSIONS):
                print(url + ' is a binary - omitted from database')
                continue

            try:
                r = requests.get(url, allow_redirects=False, timeout=timeout)
                r.encoding = 'utf-8'
                print(str(r.status_code) + ' - ' + str(r.url))
                if r.status_code != 200:
                    print(str(r.url) + ' has a status code of: '
                          + str(r.status_code) + ' omitted from database.')
                    continue
            except Exception:
                # Best effort: one bad URL must not abort the whole crawl.
                # KeyboardInterrupt/SystemExit are deliberately not caught.
                print("Error - %s" % sys.exc_info()[0])
                continue

            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            bs_obj = BeautifulSoup(r.text, 'html.parser')
            if bs_obj.title is None or not url.startswith('http'):
                continue

            # .string is None for an empty <title>; converting that to text
            # would store the literal title u'None'.
            raw_title = bs_obj.title.string
            title = u'Untitled' if raw_title is None else unicode(raw_title)

            # Drop non-content elements before extracting the visible text.
            for element in bs_obj.find_all(['script', 'style', 'meta']):
                element.extract()
            body_str = bs_obj.get_text().strip()
            print(body_str)

            cur.execute("UPDATE History SET title=?, body=? WHERE ID=?",
                        (title, body_str, row_id))
            con.commit()
            print(str(r.status_code) + ' - ' + title + ' - Committed.')
    finally:
        # Release the database even if a row raised part-way through.
        con.close()
if __name__ == '__main__':
    # Script entry point: locate the Firefox history, copy it into our own
    # database, then download the pages it lists.
    HISTORY_DB = get_firefox_history_db(home_directory)
    if HISTORY_DB is None:
        print('Error - Cannot find the Firefox history database.\n\nExiting...')
        sys.exit(1)

    # The Firefox history db exists, so create our own version from it, minus
    # the domains specified in the user's ~/.pearsignore. HISTORY_DB is always
    # a non-empty path here, so this step must not be guarded by
    # `not HISTORY_DB` (that condition could never be true).
    # NOTE(review): confirm create_history_db is safe to re-run when a
    # history.db from an earlier run already exists.
    create_history_db(HISTORY_DB)

    retrieve_pages()