Skip to content

Commit

Permalink
Merge commit '6cb3640b29b987978fec0525908a5e3d0921bb4f' of https://gi…
Browse files Browse the repository at this point in the history
  • Loading branch information
KaiHuaDou committed Jun 5, 2024
2 parents 5f6e521 + 6cb3640 commit baddf26
Show file tree
Hide file tree
Showing 47 changed files with 3,369 additions and 439 deletions.
31 changes: 31 additions & 0 deletions Changelog.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,37 @@
# - title by author
# }}}

{{{ 7.10.0 2024-05-03

:: new features

- Export of calibre data: Ensure individual part files in the exported data are no larger than one gigabyte even
if the library contains individual files larger than that size.

Note that this means that exports created by calibre from this version
on will not be importable by earlier versions. However, exports from
earlier versions should still be importable.

- Edit book: Spell check: Add options to exclude words in ALL CAPS or with numbers or in camelCase/snake_case from the list of words

- Allow easily inverting the current search via the right-click menu on the search box

:: bug fixes

- [2064546] Kobo driver: Fix database unsupported error with newest firmware

- [2063301] DOCX Input: Fix text elements containing only whitespace being incorrectly ignored

- Bulk metadata dialog: Do not fail when setting covers from ebook files and some of the files have invalid covers

:: improved recipes
- Economist
- The Week
- Caravan Magazine
- Financial Times

}}}

{{{ 7.9.0 2024-04-19

:: new features
Expand Down
17 changes: 13 additions & 4 deletions bypy/sources.json
Original file line number Diff line number Diff line change
Expand Up @@ -323,8 +323,8 @@
{
"name": "libxml2",
"unix": {
"filename": "libxml2-2.12.1.tar.xz",
"hash": "sha256:8982b9ccdf7f456e30d8f7012d50858c6623e495333b6191def455c7e95427eb",
"filename": "libxml2-2.12.6.tar.xz",
"hash": "sha256:889c593a881a3db5fdd96cc9318c87df34eb648edfc458272ad46fd607353fbb",
"urls": ["https://download.gnome.org/sources/libxml2/2.12/{filename}"]
}
},
Expand Down Expand Up @@ -620,8 +620,8 @@
{
"name": "lxml",
"unix": {
"filename": "lxml-4.9.3.tar.gz",
"hash": "sha256:48628bd53a426c9eb9bc066a923acaa0878d1e86129fd5359aee99285f4eed9c",
"filename": "lxml-5.2.1.tar.gz",
"hash": "sha256:3f7765e69bbce0906a7c74d5fe46d2c7a7596147318dbc08e4a2431f3060e306",
"urls": ["pypi"]
}
},
Expand Down Expand Up @@ -968,6 +968,15 @@
}
},

{
"name": "lxml-html-clean",
"unix": {
"filename": "lxml_html_clean-0.1.1-py3-none-any.whl",
"hash": "sha256:58c04176593c9caf72ec92e033d2f38859e918b3eff0cc0f8051ad27dc2ab8ef",
"urls": ["pypi"]
}
},

{
"name": "ply",
"comment": "Needed for sip (build time dependency)",
Expand Down
12 changes: 0 additions & 12 deletions bypy/windows/site.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,6 @@ def set_quit():
builtins.exit = _sitebuiltins.Quitter('exit', eof)


def workaround_lxml_bug():
    """Initialise libxml2's parser before lxml is imported for the first time.

    Windows-only: the shared library is loaded through ``ctypes.WinDLL``.
    The statement order matters -- ``xmlInitParser()`` has to run before
    the ``lxml`` import below.
    """
    # Without calling xmlInitParser() import lxml causes a segfault
    import ctypes
    x = ctypes.WinDLL('libxml2.dll')
    x.xmlInitParser()
    # Stash the handle on the function object; presumably this keeps the
    # loaded DLL referenced for the life of the process -- TODO confirm.
    workaround_lxml_bug.libxml2 = x
    # Import lxml.etree now that the parser is initialised, then drop the
    # local name; only the import side effect is wanted here.
    from lxml import etree
    del etree


def main():
sys.meta_path.insert(0, PydImporter())
os.add_dll_directory(os.path.abspath(os.path.join(sys.app_dir, 'app', 'bin')))
Expand All @@ -85,8 +75,6 @@ def fake_getline(filename, lineno, module_globals=None):
set_helper()
set_quit()

workaround_lxml_bug()

return run_entry_point()


Expand Down
144 changes: 111 additions & 33 deletions recipes/caravan_magazine.recipe
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>

import json
from urllib.parse import quote, urlparse

from calibre.web.feeds.news import BasicNewsRecipe, classes
from calibre.web.feeds.news import BasicNewsRecipe
from mechanize import Request


Expand All @@ -21,6 +18,45 @@ def safe_dict(data, *names):
ans = ans.get(x) or ''
return ans


def parse_body(x):
    """Yield HTML fragments for one node of the article's JSON document tree.

    ``x`` is a dict with an optional ``type`` and an optional ``content``
    list of child nodes. Handled node types:

    * ``paragraph``              -> ``<p>`` wrapping the inline children
    * ``blockquote``/``pullquote`` -> ``<blockquote>`` wrapping nested nodes
    * ``figure``                 -> ``<img>`` followed by the nested nodes
    * ``caption``/``credit``     -> ``<div class="sub">`` wrapping inline children
    * any other non-empty type   -> ``<p>`` wrapping nested nodes, but only
      when the node has a ``content`` key

    Nodes without a type yield nothing. Inline children are rendered with
    ``parse_p`` and image URLs are resolved with ``absurl``; both are
    defined elsewhere in this file.
    """
    kind = x.get('type', '')
    # ``content`` may be missing *or* explicitly null in the JSON; treat both
    # as "no children" instead of failing while iterating None.
    children = x.get('content') or ()

    if kind == 'paragraph':
        yield '<p>'
        for p in children:
            yield ''.join(parse_p(p))
        yield '</p>\n'
    elif kind in {'blockquote', 'pullquote'}:
        yield '<blockquote>'
        for p in children:
            yield from parse_body(p)
        yield '</blockquote>'
    elif kind == 'figure':
        # A figure without a usable source must not abort the whole article:
        # skip the <img> but still render its caption/credit children.
        src = (x.get('attrs') or {}).get('src') or ''
        if src:
            # '=s0' -> '=s768-rw' rewrites the size suffix of the image URL
            # (presumably selecting a smaller rendition -- TODO confirm).
            yield '<img src="{}">'.format(absurl(src.replace('=s0', '=s768-rw')))
        for p in children:
            yield from parse_body(p)
    elif kind in {'caption', 'credit'}:
        yield '<div class="sub">'
        for div in children:
            yield ''.join(parse_p(div))
        yield '</div>\n'
    elif kind != '':
        # Unknown container types are rendered as a generic paragraph, but
        # only when they actually carry a ``content`` key.
        if 'content' in x:
            yield '<p>'
            for p in children:
                yield from parse_body(p)
            yield '</p>'

def parse_p(p):
    """Yield the HTML fragments for a single inline node.

    Only nodes whose ``type`` is ``'text'`` produce output. When the node
    carries ``marks``, the ``type`` of the first mark is used verbatim as
    the tag name wrapping the text; otherwise the bare text is yielded.
    A missing/null ``text`` is rendered as an empty string and an empty
    ``marks`` list is treated the same as no marks at all.
    """
    if p.get('type', '') != 'text':
        return
    text = p.get('text') or ''
    # ``marks`` can be absent, null or an empty list; none of those should
    # raise when looking up the first mark.
    marks = p.get('marks') or ()
    if marks:
        tag = marks[0]['type']
        yield '<' + tag + '>'
        yield text
        yield '</' + tag + '>'
    else:
        yield text


class CaravanMagazine(BasicNewsRecipe):

title = 'Caravan Magazine'
Expand All @@ -40,23 +76,26 @@ class CaravanMagazine(BasicNewsRecipe):
remove_attributes = ['style', 'height', 'width']
ignore_duplicate_articles = {'url'}
resolve_internal_links = True
needs_subscription = 'optional'
logged = False

extra_css = '''
img {display:block; margin:0 auto;}
blockquote, em {color:#202020;}
.article_subtitle {font-style:italic; color:#202020;}
#fig-c, .photo_wrapper, .cover_figure_element {text-align:center; font-size:small;}
.pre-title, .text_wrapper {font-size:small; color:#404040;}
.desc {font-style:italic; color:#202020;}
.sub {text-align:center; font-size:small;}
.cat, .auth {font-size:small; color:#404040;}
'''

def get_browser(self, *args, **kw):
br = BasicNewsRecipe.get_browser(self, *args, **kw)
if not self.username or not self.password:
return br
data = json.dumps({'email': self.username, 'name': '', 'password': self.password})
data = json.dumps({"0":{"json":{"email":self.username,"password":self.password}}})
if not isinstance(data, bytes):
data = data.encode('utf-8')
rq = Request(
url='https://caravanmagazine.in/api/users/login',
url='https://caravanmagazine.in/api/trpc/users.login?batch=1',
data=data,
headers={
'Accept': 'application/json, text/plain, */*',
Expand All @@ -66,37 +105,33 @@ class CaravanMagazine(BasicNewsRecipe):
},
method='POST'
)
res = br.open(rq).read()
res = res.decode('utf-8')
self.log('Login request response: {}'.format(res))
res = json.loads(res)
if res['code'] != 200 or res['message'] != "Login success":
raise ValueError('Login failed, check your username and password')
try:
res = br.open(rq).read()
res = res.decode('utf-8')
res = json.loads(res)
self.log(safe_dict(res[0], 'result', 'data', 'json', 'message'))
self.logged = True
except:
self.log.warn('\n**Login failed, check your username and password\n')
return br
return br

keep_only_tags = [
classes('text_wrapper cover_figure_element article_content')
]

def preprocess_html(self, soup):
    """Normalise the parsed article markup and return the same soup.

    Every ``figcaption`` gets the id ``fig-c`` and the first ``h2`` (if
    one is found) is renamed to a ``p`` element.
    """
    for caption in soup.findAll('figcaption'):
        caption['id'] = 'fig-c'

    heading = soup.find('h2')
    if heading:
        heading.name = 'p'

    return soup

def parse_index(self):
self.log(
'\n***\nif this recipe fails, report it on: '
'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n'
)

api = 'https://api.caravanmagazine.in/api/trpc/magazines.getLatestIssue'
# api = 'https://api.caravanmagazine.in/api/trpc/magazines.getForMonthAndYear?batch=1&' + \
# 'input=%7B%220%22%3A%7B%22json%22%3A%7B%22month%22%3A' + '2' + '%2C%22year%22%3A' + '2024' + '%7D%7D%7D'
# input={"0":{"json":{"month":2,"year":2024}}}
raw = self.index_to_soup(api, raw=True)
data = json.loads(raw)['result']['data']['json']
# for past editions
# inp = json.dumps({"0":{"json":{"month":6,"year":2023}}})
# api = 'https://api.caravanmagazine.in/api/trpc/magazines.getForMonthAndYear?batch=1&input=' + quote(inp, safe='')

raw = json.loads(self.index_to_soup(api, raw=True))
if isinstance(raw, list):
data = raw[0]['result']['data']['json']
else:
data = raw['result']['data']['json']
cover = safe_dict(data, 'issue', 'cover', 'data', 'url').replace('=s0', '=s768-rw')
self.cover_url = absurl(cover)

Expand All @@ -122,3 +157,46 @@ class CaravanMagazine(BasicNewsRecipe):
if articles:
feeds.append((section, articles))
return feeds

def print_version(self, url):
    """Return the ``articles.getFromCache`` API URL for an article URL.

    The path component of ``url`` is used as the article slug, wrapped in
    the batched JSON input structure and fully percent-encoded into the
    ``input`` query parameter.
    """
    payload = {"0": {"json": {"slug": urlparse(url).path}}}
    encoded_input = quote(json.dumps(payload), safe='')
    endpoint = 'https://api.caravanmagazine.in/api/trpc/articles.getFromCache?batch=1&input='
    return endpoint + encoded_input

def preprocess_raw_html(self, raw, url):
    """Build the article HTML from the JSON returned by the article API.

    ``raw`` is the JSON text of a batched response; the first batch entry
    holds the article under ``result.data.json``. The generated page
    contains, in order: category, title, description, author/date line,
    the cover (lede) block, the free content and -- only when
    ``self.logged`` is true -- the premium content fetched from the
    paywall endpoint for this article id.

    ``url`` is not used by this method.

    Raises ``KeyError``/``IndexError`` if the response lacks the expected
    structure (``articleId``, ``data``, ``data.data.content``).
    """
    # Local import, mirroring the original block, so the file-level import
    # section does not need to change.
    from datetime import datetime, timezone

    payload = json.loads(raw)[0]['result']['data']['json']
    art_id = payload['articleId']
    prim_data = payload['data']

    cat = '<div class="cat">' + safe_dict(prim_data, 'printTitle') + '</div>\n'
    title = '<h1>' + safe_dict(prim_data, 'title') + '</h1>\n'
    desc = '<p class="desc">' + safe_dict(prim_data, 'description') + '</p>\n'

    authors = [safe_dict(q, 'name') for q in prim_data.get('authors') or ()]

    dt = ''
    written_at = prim_data.get('writtenAt') or ''
    if written_at:
        # The timestamp is assumed to be UTC with a trailing 'Z' (the
        # original code stripped the final character unconditionally).
        stamp = datetime.fromisoformat(written_at.rstrip('Z'))
        if stamp.tzinfo is None:
            stamp = stamp.replace(tzinfo=timezone.utc)
        # astimezone() converts to the system's local zone, including DST.
        # Adding time.timezone (seconds *west* of UTC) shifted the time in
        # the wrong direction.
        dt = stamp.astimezone().strftime('%b %d, %Y, %I:%M %p')
    auth = '<p class="auth">' + ', '.join(authors) + ' | ' + dt + '</p>\n'
    lede = ''.join(parse_body(prim_data.get('cover') or {}))

    free_cont = ''.join(
        '\n' + ''.join(parse_body(x)) for x in prim_data['data']['content']
    )

    premium_cont = ''
    if self.logged:
        # Subscriber-only part of the article; requested only after a
        # successful login.
        cont_url = 'https://api.caravanmagazine.in/api/paywall/check-article?articleId='
        art_cont = json.loads(self.index_to_soup(cont_url + str(art_id), raw=True))
        premium_cont = ''.join(
            '\n' + ''.join(parse_body(x)) for x in art_cont['premiumContent']
        )

    return '<html><body><div>' \
        + cat + title + desc + auth + lede + free_cont + premium_cont + \
        '</div></body></html>'
Loading

0 comments on commit baddf26

Please sign in to comment.