-
Notifications
You must be signed in to change notification settings - Fork 2
/
newsletter_to_markdown.py
executable file
·110 lines (91 loc) · 3.42 KB
/
newsletter_to_markdown.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python2
from collections import OrderedDict
from lxml import html as xhtml
import requests
import shutil
import sys
import os
posts_dir = "_posts"
images_dir = "images"
#Taken from https://github.com/matthewwithanm/python-markdownify
from markdownify import MarkdownConverter
def md(html, **kwargs):
class CustomMarkdownConverter(MarkdownConverter):
def convert_hn(self, n, el, text):
hashes = "#"*n
ret = "\n---\n\n{} {}\n\n".format(hashes, text.strip().encode('ascii', 'ignore'))
return ret
return CustomMarkdownConverter(**kwargs).convert(html)
print("Enter the URL for the newsletter page on MailChimp")
print("(the 'see in browser' URL from the email that MailChimp sent out)")
url = raw_input("URL: ")
if len(url) < 10:
print("Do you have a filename to read from?")
filename = raw_input("Enter filename: ")
with open(filename, 'r') as f:
html = f.read()
else:
req = requests.get(url)
if req.status_code != 200:
print("Can't get the newsletter HTML!")
sys.exit(1)
html = req.text
#Getting article content
content_xpath = "/html/body/center/table/tr/td/table/tr[3]//tbody[@class='mcnTextBlockOuter']//td[@class='mcnTextContent']"
#TODO: shorten (XPATH: https://msdn.microsoft.com/en-us/library/ms256086(v=vs.110).aspx )
tree = xhtml.fromstring(html)
content_element = tree.xpath(content_xpath)[0]
#Asking newsletter date and number from user
print("Enter date in YYYY-MM-DD format")
date = raw_input("Date: ")
print("Enter newsletter number")
newsletter_number = raw_input("Number: ")
#Generating filename
post_filename = "{0}-ZeroPhone-Weekly-No.-{1}.md".format(date, newsletter_number)
print("Filename: {}".format(post_filename))
#Finding images
image_xpath_1 = "//img[@align='center']"
image_xpath_2 = "//img[@data-file-id]"
images = tree.xpath(image_xpath_1) + tree.xpath(image_xpath_2)
image_filenames = []
for i, image in enumerate(images):
extension = image.get("src").rsplit('.', 1)[1]
image_filename = "{0}_{1}_{2}.{3}".format(date, newsletter_number, i+1, extension)
print(image_filename)
image_filenames.append(image_filename)
first_image_filename = image_filenames[0]
#Downloading images
for i, image in enumerate(images):
print("Downloading {}".format(image.get("src")))
r = requests.get(image.get("src"), stream=True)
if r.status_code == 200:
img_path = os.path.join(images_dir, image_filenames[i])
print("Done, writing into {}".format(img_path))
with open(img_path, 'wb') as f:
r.raw.decode_content = True
shutil.copyfileobj(r.raw, f)
image.set("src", os.path.join("..", img_path))
else:
print("Failed!")
#Getting title
title_xpath = "/html/head/title"
title = tree.xpath(title_xpath)[0].text
#Filtering title for non-ASCII characters
title = title.encode('ascii', 'ignore')
#Generating post header
post_header = """---
layout: post
title: {0}
img: {1}""".format(title, first_image_filename)
# Getting HTML text from the tree
content_html = xhtml.tostring(content_element)
# Filtering and converting HTML
content_html = content_html.encode('ascii', 'ignore')
unfiltered_md = md(content_html)
#Writing everything into a post file
post_path = os.path.join(posts_dir, post_filename)
with open(post_path, "w") as f:
f.write(post_header.encode("utf8"))
f.write(unfiltered_md.encode("utf8"))
#Done!
print("Article written to {}".format(post_path))