-
Notifications
You must be signed in to change notification settings - Fork 0
/
ex_12_chapter.py
143 lines (120 loc) · 4.73 KB
/
ex_12_chapter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# SIMPLE WEB CONNECTION
import socket
# socket.socket creates a point ready to connect, but not yet connected
mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# .socket can be used to connect to other servers,
#just need to follow the appropiate protocol
mysock.connect(('data.pr4e.org', 80)) #this is the actual connection
cmd = 'GET http://data.pr4e.org/romeo.txt HTTP/1.0\r\n\r\n'.encode() #needs an empty line
# .encode() transforms string into a BYTES OBJECT
# which can be done with b'<string>' also #DIFFERENT TYPE OF OBJECT
mysock.send(cmd)
while True:
data = mysock.recv(512) #receiving packets
if len(data) < 1: #finish when it is nothing left coming in
break
print(data.decode(),end='') # transforms BYTES -> STRINGS
mysock.close()
# RETRIEVING AN IMAGE
import socket
import time
HOST = 'data.pr4e.org'
PORT = 80
mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
mysock.connect((HOST, PORT))
mysock.sendall(b'GET http://data.pr4e.org/cover3.jpg HTTP/1.0\r\n\r\n')
count = 0
picture = b""
while True:
data = mysock.recv(5120)
if len(data) < 1: break
#time.sleep(0.25) #this is for FLOW CONTROL
count = count + len(data)
print(len(data), count) #this is to see how the data has been sent
picture = picture + data
mysock.close()
# Look for the end of the header (2 CRLF)
pos = picture.find(b"\r\n\r\n")
print('Header length', pos)
print(picture[:pos].decode())
# Skip past the header and save the picture data
picture = picture[pos+4:]
fhand = open("stuff.jpg", "wb")
fhand.write(picture)
fhand.close()
# URLLIB - a library that has coded in methods with the previos stuff
import urllib.request
fhand = urllib.request.urlopen('http://data.pr4e.org/romeo.txt')
for line in fhand: #the for loop is to read the file
print(line.decode().strip()) # READING THE FILE ONLY
#the urllib code cleans the data received and only shows the contents of the file
# another way to use the urllib
import urllib.request, urllib.parse, urllib.error
fhand = urllib.request.urlopen('http://data.pr4e.org/romeo.txt')
# This is like if we read a file on our own computer
counts = dict() # this time we are counting the words of the text
for line in fhand:
words = line.decode().split() # IMPORTANT TO DECODE as it is coming from outside on UTF-8
for word in words:
counts[word] = counts.get(word, 0) + 1
print(counts)
#this saves the file in your disk
import urllib.request, urllib.parse, urllib.error
img = urllib.request.urlopen('http://data.pr4e.org/cover3.jpg').read()
fhand = open('cover3.jpg', 'wb')
fhand.write(img) #if the file is too big, it might give you problems
fhand.close()
# so similar to the control flow, we download it in blocks, to reduce errors
import urllib.request, urllib.parse, urllib.error
img = urllib.request.urlopen('http://data.pr4e.org/cover3.jpg')
fhand = open('cover3.jpg', 'wb')
size = 0
while True: # for the data blocks
info = img.read(100000) # instead of .recv
if len(info) < 1: break
size = size + len(info)
fhand.write(info) # writing it to disk, block by block
print(size, 'characters copied.')
fhand.close()
# URLLIB AND PARSING WITH RE ------------------------
"""# we can construct a RE to parse
<h1>The First Page</h1>
<p>
If you like, you can switch to the
<a href="http://www.dr-chuck.com/page2.htm"> Second Page</a>.
</p>
# to get the website from the page
href="http[s]?://.+?" #[s]? means 0 or 1 s
# (.+?) means one or more characters till " non-greedy"""
# complete code looks like:
import urllib.request, urllib.parse, urllib.error
import re
import ssl #libary to access HTTPS site
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = input('Enter - ') # https or http format website
html = urllib.request.urlopen(url, context=ctx).read()
links = re.findall(b'href="(http[s]?://.*?)"', html)
# list of all strings that match our RE , returning the link text between espressions
for link in links:
print(link.decode())
# LIBRARY BeautifulSoup for broken HTML code
# There are a number of Python libraries which can help you parse HTML and extract data from the pages
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = input('Enter - ')
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser') # this cleans the data IT IS A SOUP OBJECT
# Retrieve all of the anchor tags
tags = soup('a') # this gives us a list of the anchor tags
# READ THE BeautifulSoup PAGE FOR MORE INFO
for tag in tags:
print(tag.get('href', None))
# there is another way to print things you need, in the book