-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrapping.py
51 lines (43 loc) · 1.63 KB
/
scrapping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import os

import html2text
import requests
from bs4 import BeautifulSoup
from dateutil import parser
from fastprogress.fastprogress import master_bar, progress_bar
# Lookup table from lowercase Spanish month name to its zero-padded,
# two-digit month number ('enero' -> '01', ..., 'diciembre' -> '12').
# The numbers are derived from each name's position in the calendar-ordered
# tuple below.
meses = {
    nombre: f'{numero:02d}'
    for numero, nombre in enumerate(
        (
            'enero',
            'febrero',
            'marzo',
            'abril',
            'mayo',
            'junio',
            'julio',
            'agosto',
            'septiembre',
            'octubre',
            'noviembre',
            'diciembre',
        ),
        start=1,
    )
}
# Step 1: walk every page of the archive listing, fetch each linked article
# and save its body as Markdown in ./corpus/<year>-<month>-<day>.md.
url = 'https://www.gob.mx/presidencia/es/archivo/articulos?category=764&filter_origin=archive&idiom=es&order=DESC&page='

# Seconds to wait for a response; without a timeout a single stalled
# connection would block the whole run indefinitely.
REQUEST_TIMEOUT = 30

# Create the output directory up front so the first write cannot fail with
# FileNotFoundError after the pages have already been downloaded.
os.makedirs('./corpus', exist_ok=True)

mb = master_bar(range(1, 258))
for i in mb:
    response = requests.get(url + str(i), timeout=REQUEST_TIMEOUT)
    # Fail loudly on HTTP errors instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    titulos = soup.find_all('h2')
    for titulo in progress_bar(titulos, parent=mb):
        anchor = titulo.find('a')
        if anchor is None or not anchor.get('href'):
            # Heading without a link: there is no article to fetch.
            continue
        # Drop the first and last two characters of the href.
        # NOTE(review): presumably the listing wraps the path in escaped
        # quotes (\"...\") -- confirm against a live response.
        link = anchor['href'][2:-2]
        article_url = 'https://www.gob.mx' + link
        article_response = requests.get(article_url, timeout=REQUEST_TIMEOUT)
        article_response.raise_for_status()
        article_soup = BeautifulSoup(article_response.content, 'html.parser')
        # The article body is taken from the first <div tabindex="0">.
        article = article_soup.find_all('div', {'tabindex': '0'})[0]
        # The date is the text after the last '|' in the first <p>, expected
        # in the form '<day> de <month name> de <year>'.
        date_text = article_soup.find('p').text.split('|')[-1].strip()
        day, month_name, year = date_text.split(' de ')
        month = meses[month_name.lower()]
        formatted_date = f"{year}-{month}-{day.strip()}"
        markdown = html2text.html2text(article.prettify())
        # NOTE: the file name depends only on the date, so two articles
        # published on the same day overwrite each other.
        # Explicit UTF-8 keeps accented characters intact regardless of the
        # platform's default encoding.
        with open(f'./corpus/{formatted_date}.md', 'w', encoding='utf-8') as f:
            f.write(markdown)