This repository has been archived by the owner on Mar 4, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4
/
html2markdown.py
executable file
·77 lines (69 loc) · 2.21 KB
/
html2markdown.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#! /usr/bin/env python3.2
import re
def _subpre(text):
list=re.split('(<pre>|</pre>)',text)
for i in range(len(list)):
# begin of pre
if i%4==1:
list[i]='\n\n '
# in pre
elif i%4==2:
list[i]=re.sub('<p>|<br>|\n\n', '\n\n ',list[i])
# end of pre
elif i%4==3:
list[i]='\n\n'
return ''.join(list)
def _subblock(text):
list=re.split('(<blockquote>|</blockquote>)',text)
for i in range(len(list)):
# begin of blockquote
if i%4==1:
list[i]='\n\n> '
# in blockquote
elif i%4==2:
list[i]=re.sub('<p>|<br>|\n\n', '\n\n> ',list[i])
# end of blockquote
elif i%4==3:
list[i]='\n\n'
return ''.join(list)
def _sublinks(text):
    """Rewrite HTML anchors in *text* as Markdown links.

    Every ``<a href="...">...</a>`` becomes ``[label](target)``: the anchor
    text is passed through ``_markdownify_linktext`` and the href through
    ``_fefe_linksintern``.
    """
    anchor = '<a href=\"(?P<link>.*?)\">(?P<linktext>.*?)</a>'

    def as_markdown(match):
        # Build the Markdown form of one matched anchor.
        label = _markdownify_linktext(match.group('linktext'))
        target = _fefe_linksintern(match.group('link'))
        return '[' + label + '](' + target + ')'

    return re.sub(anchor, as_markdown, text)
def _markdownify(text):
list=re.split('(\[.*\]\(.*\))',text)
# only change when not a link
for i in range(0,len(list),2):
list[i]=re.sub('\*','\\*',list[i])
list[i]=re.sub('_','\\_',list[i])
list[i]=re.sub('<b>','**',list[i])
list[i]=re.sub('</b>','**',list[i])
list[i]=re.sub('<i>','_',list[i])
list[i]=re.sub('</i>','_',list[i])
list[i]=re.sub('<u>','\n',list[i])
list[i]=re.sub('</u>','\n',list[i])
list[i]=re.sub('<li>','\n - ',list[i])
list[i]=re.sub('</li>','\n',list[i])
list[i]=re.sub('<p>','\n\n',list[i])
list[i]=re.sub('</p>','\n\n',list[i])
list[i]=re.sub('<br>','\n\n',list[i])
return ''.join(list)
def _markdownify_linktext(text):
list=re.split('(\[.*\]\(.*\))',text)
# only change when not a link
for i in range(0,len(list),2):
list[i]=re.sub('\*','\\*',list[i])
list[i]=re.sub('_','\\_',list[i])
list[i]=re.sub('<b>','**',list[i])
list[i]=re.sub('</b>','**',list[i])
list[i]=re.sub('<i>','_',list[i])
list[i]=re.sub('</i>','_',list[i])
return ''.join(list)
def _fefe_linksintern(text):
text=re.sub('^\/\?ts=','https://blog.fefe.de/?ts=',text)
text=re.sub('^\/\?q=','https://blog.fefe.de/?q=',text)
return text
def html2md(html):
    """Convert an HTML fragment to Markdown and return the result.

    The conversion stages run in a fixed order: ``<pre>`` sections,
    ``<blockquote>`` sections, anchors, and finally the remaining inline
    markup.
    """
    stages = (_subpre, _subblock, _sublinks, _markdownify)
    for stage in stages:
        html = stage(html)
    return html