-
Notifications
You must be signed in to change notification settings - Fork 0
/
helpers_lxml.py
106 lines (85 loc) · 4.14 KB
/
helpers_lxml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import lxml
import lxml.html
from lxml import etree
import helpers_web as hw
def remove_attributes(tree, tag, attributes):
    """Strip the given attributes from every ``<tag>`` element found via ``tree.xpath``.

    Args:
        tree: Object exposing ``xpath()`` whose results carry an ``attrib``
            mapping (presumably an lxml tree/element). It is modified in place.
        tag: Element name interpolated into the query ``//{tag}[@{attribute}]``.
        attributes: Attribute names to remove. Every ``@`` character is dropped
            from each name, so ``"@style"`` and ``"style"`` are equivalent.
            A single string is treated as ONE attribute name.

    Returns:
        The same ``tree`` object that was passed in.
    """
    print(f"remove_attributes: <{tag}> {attributes} | {tree}")
    # A bare string would otherwise be iterated character by character,
    # producing queries such as //a[@s], //a[@t], ... instead of //a[@style].
    if isinstance(attributes, str):
        attributes = [attributes]
    cnt = 0
    for attribute in attributes:
        attribute = attribute.replace('@', '')
        _xpath = f"//{tag}[@{attribute}]"
        for item in tree.xpath(_xpath):
            # The default of None keeps pop() from raising if the key is missing.
            item.attrib.pop(attribute, None)
            cnt += 1
        # Sanity check: re-running the query must now match nothing.
        assert tree.xpath(_xpath) == []  # make sure all popped
    print(f"\t removed: {cnt} attributes from <{tag}> ")
    return tree
"""
<div id="simple-banner" class="simple-banner">
<div class="simple-banner-text">
<span>This site is run on sustainable energy and reduces image quality & quantity for a low carbon footprint. <br>Videos load after clicking to avoid tracking. <a href="http://openresource.1001suns.com/pyramis-niger.php">Why?</a><!--br/>Access the low carbon version of this site <a href="http://static.1001suns.com/">here</a--></span>
</div>
</div>
CSS will be applied directly to the simple-banner class,
the simple-banner-scrolling class for scrolling styles,
the simple-banner-text class for text specific styles,
and the simple-banner-button class for close button specific styles.
Be very careful, bad CSS can break the banner.
"""
def __get_content(text, id, class_banner, class_text):
    """Return the banner markup: an outer div, an inner div and a span.

    All four values are interpolated verbatim (nothing is escaped), so
    ``text`` may itself contain HTML. The result starts and ends with a
    newline.
    """
    markup_lines = (
        "",  # leading newline
        f'<div id="{id}" class="{class_banner}">',
        f'<div class="{class_text}">',
        f"<span>{text}</span>",
        "</div>",
        "</div>",
        "",  # trailing newline
    )
    return "\n".join(markup_lines)
def banner_header(text, id="banner_header", class_banner="banner_header", class_text="banner_header_text"):
    """Build the header banner and return it as a parsed HTML fragment.

    The markup comes from ``__get_content`` and is parsed with
    ``lxml.html.fragment_fromstring``; ``id``, ``class_banner`` and
    ``class_text`` default to the header-specific names.
    """
    markup = __get_content(text, id, class_banner, class_text)
    return lxml.html.fragment_fromstring(markup)
def banner_footer(text, id="banner_footer", class_banner="banner_footer", class_text="banner_footer_text"):
    """Build the footer banner and return it as a parsed HTML fragment.

    The markup comes from ``__get_content`` and is parsed with
    ``lxml.html.fragment_fromstring``; ``id``, ``class_banner`` and
    ``class_text`` default to the footer-specific names.
    """
    markup = __get_content(text, id, class_banner, class_text)
    return lxml.html.fragment_fromstring(markup)
def remove_by_xpath(tree, sxpath):
    """Detach every node matched by ``sxpath`` from its parent.

    One line naming the XPath is printed per removed node. Nothing is
    returned; ``tree`` is modified in place.
    """
    matches = tree.xpath(sxpath)
    for node in matches:
        print("\t removing:", hw.CYAN + sxpath + hw.RESET)
        parent = node.getparent()
        parent.remove(node)
def remove_children_by_xpath(tree, sxpath):
    """Call ``drop_tree()`` on every node matched by ``sxpath``.

    One line naming the XPath is printed per dropped node. Nothing is
    returned; ``tree`` is modified in place.
    """
    matches = tree.xpath(sxpath)
    for node in matches:
        print("\t drop_tree:", hw.CYAN + sxpath + hw.RESET)
        # NOTE(review): drop_tree() is presumably the lxml.html element
        # method, which keeps the tail text - confirm against callers.
        node.drop_tree()
def set_text_by_xpath(tree, sxpath, text):
    """Assign ``text`` to the ``.text`` of every node matched by ``sxpath``.

    One line showing the new text and the XPath is printed per node.
    Nothing is returned; ``tree`` is modified in place.
    """
    matches = tree.xpath(sxpath)
    for node in matches:
        print("\t set text:", text, hw.CYAN + sxpath + hw.RESET)
        node.text = text
def set_tail_by_xpath(tree, sxpath, text):
    """Assign ``text`` to the ``.tail`` of every node matched by ``sxpath``.

    One line showing the new tail and the XPath is printed per node.
    Nothing is returned; ``tree`` is modified in place.
    """
    matches = tree.xpath(sxpath)
    for node in matches:
        print("\t set tail:", text, hw.CYAN + sxpath + hw.RESET)
        node.tail = text
def replace_xpath_with_fragment(tree, sxpath, html_string):
    """Swap every node matched by ``sxpath`` for the parsed ``html_string``.

    ``html_string`` is parsed with ``lxml.html.fragment_fromstring`` once
    per match, so each replaced node receives its own new element. One line
    naming the XPath is printed per replacement. Nothing is returned.
    """
    for target in tree.xpath(sxpath):
        print("\t replacing:", hw.CYAN + sxpath + hw.RESET)
        parent = target.getparent()
        parent.replace(target, lxml.html.fragment_fromstring(html_string))
def append_xpath_with_fragment(tree, sxpath, html_string):
    """Append the parsed ``html_string`` as last child of each ``sxpath`` match.

    ``html_string`` is parsed with ``lxml.html.fragment_fromstring`` once
    per match, so every matched node gets its own new child. One line
    naming the XPath is printed per append. Nothing is returned.
    """
    for host in tree.xpath(sxpath):
        print("\t append:", hw.CYAN + sxpath + hw.RESET)
        add_child = host.append
        add_child(lxml.html.fragment_fromstring(html_string))
def replace_xpath_with_fragment_from_file(tree, sxpath, frag_file_path):
    """Swap every node matched by ``sxpath`` for a fragment loaded from a file.

    The markup is read once through ``hw.string_from_file`` with
    ``sanitize=False``, then parsed with ``lxml.html.fragment_fromstring``
    once per match. One line naming the XPath is printed per replacement.
    Nothing is returned.
    """
    markup = hw.string_from_file(frag_file_path, sanitize=False)
    for target in tree.xpath(sxpath):
        print("\t replacing:", hw.YELLOW + sxpath + hw.RESET)
        parent = target.getparent()
        parent.replace(target, lxml.html.fragment_fromstring(markup))
if __name__ == "__main__":
    # Manual smoke run: build a header banner from text containing inline HTML.
    sample_text = "text with html<br> end."
    banner_header(sample_text)