jun_kazama_scraper.py
import bs4
import requests
from bs4 import BeautifulSoup
from loguru import logger

logger.add('../tekken_8.log',
           format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {name} | {module} | {function} | {line} | {message}",
           mode='w')


class JunKazamaScraper:
    def __init__(self):
        logger.info("JunKazamaScraper initialized")

    def _web_scrape(self, html_content: str) -> dict[str, list]:
        """
        Web-scrape HTML content.
        :param html_content: HTML as String.
        :return: Dictionary containing all scraped data.
        """
        logger.info('Starting Web-scraping process...')
        soup = BeautifulSoup(html_content, 'html.parser')
        tbody_content = soup.find('tbody')
        if tbody_content is None:
            logger.error('No <tbody> element found in the page HTML')
            raise ValueError('Expected a <tbody> element in the frame-data table')
        td_contents = tbody_content.find_all('td')
        frame_data_dictionary = self._append_data_to_dict(td_contents)
        return frame_data_dictionary
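
    # For reference, the returned dictionary maps each column header to a
    # list of cell strings, one entry per table row, e.g. (placeholder
    # values shown, not real frame data):
    #     {"Command": ["<move 1>", "<move 2>", ...], "Hit Level": [...], ...}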

    @staticmethod
    def _append_data_to_dict(td_contents: bs4.ResultSet) -> dict[str, list]:
        """
        Append bs4 ResultSet data to dictionary.
        :param td_contents: Bs4 ResultSet of the target data.
        :return: Dictionary containing all scraped data.
        """
        logger.info('Appending data to dictionary...')
        frame_data_dict = {
            "Command": [],
            "Hit Level": [],
            "Damage": [],
            "Start Up Frame": [],
            "Block Frame": [],
            "Hit Frame": [],
            "Counter Hit Frame": [],
            "Notes": []
        }
        # The table cells arrive as one flat sequence, so cell i belongs to
        # column i % num_columns of the current row.
        columns = list(frame_data_dict)
        num_columns = len(columns)
        for i, td_content in enumerate(td_contents):
            frame_data_dict[columns[i % num_columns]].append(td_content.text)
        return frame_data_dict

    @staticmethod
    def _get_page_html() -> str:
        """
        Get page's HTML content.
        :return: Page's HTML content as String.
        """
        logger.info('Getting Jun Kazama page HTML...')
        url = 'https://rbnorway.org/jun-t8-frames/'
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }
        try:
            logger.info(f'Sending GET request to URL: {url}')
            response = requests.get(url, headers=headers, timeout=10)
            # Raise an HTTPError for bad responses (4xx and 5xx)
            response.raise_for_status()
        except requests.RequestException as e:
            logger.error(f'Failed to fetch the page HTML: {e}')
            # Re-raise so the caller never receives None where a str is expected
            raise
        logger.info('Successfully fetched the page HTML')
        return response.text
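
    # A minimal sketch of a retry-enabled variant, assuming transient
    # failures are worth retrying for this endpoint. The method name
    # `_get_page_html_with_retries` is hypothetical, not part of the
    # original scraper; it uses the standard requests/urllib3 retry machinery.
    @staticmethod
    def _get_page_html_with_retries(url: str = 'https://rbnorway.org/jun-t8-frames/') -> str:
        from requests.adapters import HTTPAdapter
        from urllib3.util.retry import Retry

        session = requests.Session()
        # Retry up to 3 times with exponential backoff on transient errors
        retries = Retry(total=3, backoff_factor=1,
                        status_forcelist=[429, 500, 502, 503, 504])
        session.mount('https://', HTTPAdapter(max_retries=retries))
        response = session.get(url, timeout=10)
        response.raise_for_status()
        return response.text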

    def start_jun_scraper(self) -> dict[str, list]:
        """
        Start Jun Kazama data scraper.
        :return: Dictionary containing all scraped data.
        """
        logger.info('Starting Jun Kazama scraper...')
        html_content = self._get_page_html()
        return self._web_scrape(html_content)
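

# A minimal sketch of one way to persist the result, assuming every column
# list has the same length. `save_frame_data_to_csv` and its default path
# are hypothetical, not part of the original scraper.
def save_frame_data_to_csv(frame_data: dict[str, list],
                           path: str = 'jun_kazama_frames.csv') -> None:
    import csv

    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(frame_data.keys())           # header row
        writer.writerows(zip(*frame_data.values()))  # transpose columns to rows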


if __name__ == '__main__':
    scraper = JunKazamaScraper()
    frame_data = scraper.start_jun_scraper()
    logger.info(f'Scraped {len(frame_data["Command"])} moves for Jun Kazama')