forked from binref/refinery
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run-crawl.py
39 lines (33 loc) · 1.05 KB
/
run-crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Generates the lists of top-level domains and URL specifiers.
"""
import pprint
import os.path
import re
import requests
from refinery.lib.patterns.tlds import tlds as old_tlds
# Source text of the generated module. The {variable} and {contents} placeholders
# are filled in with str.format before the text is written to disk.
template = (
    '#!/usr/bin/env python3\n'
    '# -*- coding: utf-8 -*-\n'
    '{variable} = {contents}\n'
)
def normalize(data, *required):
    """
    Add every item of `required` to `data` in place and return `data`.

    `data` can be any object whose `update` method accepts an iterable, for
    example a set. For a dict, each required item has to be a key/value pair.
    The same object that was passed in is returned, which allows the call to
    be used inside an expression.
    """
    # The varargs tuple is already an iterable; no intermediate list is needed.
    data.update(required)
    return data
if __name__ == '__main__':
    # Suffixes that are always part of the output, regardless of what the
    # downloaded IANA list contains.
    extra_tlds = {'bit', 'onion', 'sys', 'bazar', 'coin'}

    # The session is closed when the block exits. A timeout and an explicit
    # status check make the script fail loudly: without them a stalled
    # connection hangs forever, and the body of an HTTP error page would be
    # parsed as if it were the TLD list and end up in the generated module.
    with requests.Session() as session:
        response = session.get(
            'https://data.iana.org/TLD/tlds-alpha-by-domain.txt', timeout=30)
        response.raise_for_status()
        listing = response.text

    # Every line containing a '#' is skipped (presumably the header comment of
    # the IANA file - confirm against the download); blank entries are dropped
    # and all names are lower-cased.
    names = {line.strip() for line in listing.split('\n') if '#' not in line}
    names |= extra_tlds

    # Entries are stored regex-escaped.
    tlds = {re.escape(name.lower()) for name in names if name}

    # Merge in the previously generated list, so that an entry which is already
    # in tlds.py is never lost by regenerating it.
    tlds.update(old_tlds)

    # Longest entries first, alphabetical among entries of equal length. This
    # is the same order the former "sort, then stable sort by length" produced.
    # NOTE(review): presumably longest-first so that a regex alternation built
    # from this list prefers the longest suffix - confirm against the consumer.
    tlds = sorted(tlds, key=lambda tld: (-len(tld), tld))

    # The path is relative to the current working directory. The generated file
    # declares utf-8 in its coding line, so it is written with that encoding.
    target = os.path.join('.', 'refinery', 'lib', 'patterns', 'tlds.py')
    with open(target, 'w', encoding='utf-8') as stream:
        stream.write(template.format(
            variable='tlds',
            contents=pprint.pformat(tlds),
        ))