Merge commit '6cb3640b29b987978fec0525908a5e3d0921bb4f' of https://github.com/kovidgoyal/calibre
KaiHuaDou · Jun 5, 2024 · baddf26
2 parents 5f6e521 + 6cb3640
commit baddf26
Show file tree

Hide file tree

Showing 47 changed files with 3,369 additions and 439 deletions.
diff --git a/Changelog.txt b/Changelog.txt
@@ -23,6 +23,37 @@
 # - title by author
 # }}}
 
+{{{ 7.10.0 2024-05-03
+
+:: new features
+
+- Export of calibre data: Ensure individual part files in the exported data are no larger than one gigabyte even
+  if the library contains individual files larger than that size.
+
+  Note that this means that exports created by calibre from this version
+  on will not be importable by earlier versions. However, exports from
+  earlier versions should still be importable.
+
+- Edit book: Spell check: Add options to exclude words in ALL CAPS or with numbers or in camelCase/snake_case from the list of words
+
+- Allow easily inverting the current search via the right click menu on the search box
+
+:: bug fixes
+
+- [2064546] Kobo driver: Fix database unsupported error with newest firmware
+
+- [2063301] DOCX Input: Fix text elements containing only whitespace being incorrectly ignored
+
+- Bulk metadata dialog: Do not fail when setting covers from ebook files and some of the files have invalid covers
+
+:: improved recipes
+- Economist
+- The Week
+- Caravan Magazine
+- Financial Times
+
+}}}
+
 {{{ 7.9.0 2024-04-19
 
 :: new features

diff --git a/bypy/sources.json b/bypy/sources.json
@@ -323,8 +323,8 @@
     {
         "name": "libxml2",
         "unix": {
-            "filename": "libxml2-2.12.1.tar.xz",
-            "hash": "sha256:8982b9ccdf7f456e30d8f7012d50858c6623e495333b6191def455c7e95427eb",
+            "filename": "libxml2-2.12.6.tar.xz",
+            "hash": "sha256:889c593a881a3db5fdd96cc9318c87df34eb648edfc458272ad46fd607353fbb",
             "urls": ["https://download.gnome.org/sources/libxml2/2.12/{filename}"]
         }
     },
@@ -620,8 +620,8 @@
     {
         "name": "lxml",
         "unix": {
-            "filename": "lxml-4.9.3.tar.gz",
-            "hash": "sha256:48628bd53a426c9eb9bc066a923acaa0878d1e86129fd5359aee99285f4eed9c",
+            "filename": "lxml-5.2.1.tar.gz",
+            "hash": "sha256:3f7765e69bbce0906a7c74d5fe46d2c7a7596147318dbc08e4a2431f3060e306",
             "urls": ["pypi"]
         }
     },
@@ -968,6 +968,15 @@
         }
     },
 
+    {
+        "name": "lxml-html-clean",
+        "unix": {
+            "filename": "lxml_html_clean-0.1.1-py3-none-any.whl",
+            "hash": "sha256:58c04176593c9caf72ec92e033d2f38859e918b3eff0cc0f8051ad27dc2ab8ef",
+            "urls": ["pypi"]
+        }
+    },
+
     {
         "name": "ply",
 		"comment": "Needed for sip (build time dependency)",

diff --git a/bypy/windows/site.py b/bypy/windows/site.py
@@ -60,16 +60,6 @@ def set_quit():
     builtins.exit = _sitebuiltins.Quitter('exit', eof)
 
 
-def workaround_lxml_bug():
-    # Without calling xmlInitParser() import lxml causes a segfault
-    import ctypes
-    x = ctypes.WinDLL('libxml2.dll')
-    x.xmlInitParser()
-    workaround_lxml_bug.libxml2 = x
-    from lxml import etree
-    del etree
-
-
 def main():
     sys.meta_path.insert(0, PydImporter())
     os.add_dll_directory(os.path.abspath(os.path.join(sys.app_dir, 'app', 'bin')))
@@ -85,8 +75,6 @@ def fake_getline(filename, lineno, module_globals=None):
     set_helper()
     set_quit()
 
-    workaround_lxml_bug()
-
     return run_entry_point()
 
 

diff --git a/recipes/caravan_magazine.recipe b/recipes/caravan_magazine.recipe
@@ -1,10 +1,7 @@
-#!/usr/bin/env python
-# vim:fileencoding=utf-8
-# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
-
 import json
+from urllib.parse import quote, urlparse
 
-from calibre.web.feeds.news import BasicNewsRecipe, classes
+from calibre.web.feeds.news import BasicNewsRecipe
 from mechanize import Request
 
 
@@ -21,6 +18,45 @@ def safe_dict(data, *names):
         ans = ans.get(x) or ''
     return ans
 
+
+def parse_body(x):
+    if x.get('type', '') == 'paragraph':
+        yield '<p>'
+        for p in x.get('content', {}):
+            yield ''.join(parse_p(p))
+        yield '</p>\n'
+    elif x.get('type', '') in {'blockquote', 'pullquote'}:
+        yield '<blockquote>'
+        for p in x.get('content', {}):
+            yield from parse_body(p)
+        yield '</blockquote>'
+    elif x.get('type', '') == 'figure':
+        yield '<img src="{}">'.format(absurl(x['attrs']['src'].replace('=s0', '=s768-rw')))
+        for p in x.get('content', {}):
+            yield from parse_body(p)
+    elif x.get('type', '') in {'caption', 'credit'}:
+        yield '<div class="sub">'
+        for div in x.get('content', {}):
+            yield ''.join(parse_p(div))
+        yield '</div>\n'
+    elif x.get('type', '') != '':
+        if 'content' in x:
+            yield '<p>'
+            for p in x.get('content', {}):
+                yield from parse_body(p)
+            yield '</p>'
+
+def parse_p(p):
+    if p.get('type', '') == 'text':
+        if 'marks' in p:
+            tag = p['marks'][0]['type']
+            yield '<' + tag + '>'
+            yield p['text']
+            yield '</' + tag + '>'
+        else:
+            yield p['text']
+
+
 class CaravanMagazine(BasicNewsRecipe):
 
     title = 'Caravan Magazine'
@@ -40,23 +76,26 @@ class CaravanMagazine(BasicNewsRecipe):
     remove_attributes = ['style', 'height', 'width']
     ignore_duplicate_articles = {'url'}
     resolve_internal_links = True
+    needs_subscription = 'optional'
+    logged = False
 
     extra_css = '''
+        img {display:block; margin:0 auto;}
         blockquote, em {color:#202020;}
-        .article_subtitle {font-style:italic; color:#202020;}
-        #fig-c, .photo_wrapper, .cover_figure_element {text-align:center; font-size:small;}
-        .pre-title, .text_wrapper {font-size:small; color:#404040;}
+        .desc {font-style:italic; color:#202020;}
+        .sub {text-align:center; font-size:small;}
+        .cat, .auth {font-size:small; color:#404040;}
     '''
 
     def get_browser(self, *args, **kw):
         br = BasicNewsRecipe.get_browser(self, *args, **kw)
         if not self.username or not self.password:
             return br
-        data = json.dumps({'email': self.username, 'name': '', 'password': self.password})
+        data = json.dumps({"0":{"json":{"email":self.username,"password":self.password}}})
         if not isinstance(data, bytes):
             data = data.encode('utf-8')
         rq = Request(
-            url='https://caravanmagazine.in/api/users/login',
+            url='https://caravanmagazine.in/api/trpc/users.login?batch=1',
             data=data,
             headers={
                 'Accept': 'application/json, text/plain, */*',
@@ -66,37 +105,33 @@ class CaravanMagazine(BasicNewsRecipe):
             },
             method='POST'
         )
-        res = br.open(rq).read()
-        res = res.decode('utf-8')
-        self.log('Login request response: {}'.format(res))
-        res = json.loads(res)
-        if res['code'] != 200 or res['message'] != "Login success":
-            raise ValueError('Login failed, check your username and password')
+        try:
+            res = br.open(rq).read()
+            res = res.decode('utf-8')
+            res = json.loads(res)
+            self.log(safe_dict(res[0], 'result', 'data', 'json', 'message'))
+            self.logged = True
+        except:
+            self.log.warn('\n**Login failed, check your username and password\n')
+            return br
         return br
 
-    keep_only_tags = [
-        classes('text_wrapper cover_figure_element article_content')
-    ]
-
-    def preprocess_html(self, soup):
-        h2 = soup.find('h2')
-        if h2:
-            h2.name = 'p'
-        for fc in soup.findAll('figcaption'):
-            fc['id'] = 'fig-c'
-        return soup
-
     def parse_index(self):
         self.log(
             '\n***\nif this recipe fails, report it on: '
             'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n'
         )
+
         api = 'https://api.caravanmagazine.in/api/trpc/magazines.getLatestIssue'
-        # api = 'https://api.caravanmagazine.in/api/trpc/magazines.getForMonthAndYear?batch=1&' + \
-        # 'input=%7B%220%22%3A%7B%22json%22%3A%7B%22month%22%3A' + '2' + '%2C%22year%22%3A' + '2024' + '%7D%7D%7D'
-        # input={"0":{"json":{"month":2,"year":2024}}}
-        raw = self.index_to_soup(api, raw=True)
-        data = json.loads(raw)['result']['data']['json']
+        # for past editions
+        # inp = json.dumps({"0":{"json":{"month":6,"year":2023}}})
+        # api = 'https://api.caravanmagazine.in/api/trpc/magazines.getForMonthAndYear?batch=1&input=' + quote(inp, safe='')
+
+        raw = json.loads(self.index_to_soup(api, raw=True))
+        if isinstance(raw, list):
+            data = raw[0]['result']['data']['json']
+        else:
+            data = raw['result']['data']['json']
         cover = safe_dict(data, 'issue', 'cover', 'data', 'url').replace('=s0', '=s768-rw')
         self.cover_url = absurl(cover)
 
@@ -122,3 +157,46 @@ class CaravanMagazine(BasicNewsRecipe):
             if articles:
                 feeds.append((section, articles))
         return feeds
+
+    def print_version(self, url):
+        slug = urlparse(url).path
+        inp = json.dumps({"0":{"json":{"slug":slug}}})
+        return 'https://api.caravanmagazine.in/api/trpc/articles.getFromCache?batch=1&input=' + quote(inp, safe='')
+
+    def preprocess_raw_html(self, raw, url):
+        cache_data = json.loads(raw)[0]
+        art_id = cache_data['result']['data']['json']['articleId']
+        prim_data = cache_data['result']['data']['json']['data']
+
+        cat = desc = lede = auth = ''
+
+        cat = '<div class="cat">' + safe_dict(prim_data, 'printTitle') + '</div>\n'
+        title = '<h1>' + safe_dict(prim_data, 'title') + '</h1>\n'
+        desc = '<p class="desc">' + safe_dict(prim_data, 'description') + '</p>\n'
+
+        authors = []
+        for q in prim_data.get('authors', {}):
+            authors.append(safe_dict(q, 'name'))
+        dt = ''
+        if prim_data.get('writtenAt', '') != '':
+            import time
+            from datetime import datetime, timedelta
+            dt = datetime.fromisoformat(prim_data['writtenAt'][:-1]) + timedelta(seconds=time.timezone)
+            dt = dt.strftime('%b %d, %Y, %I:%M %p')
+        auth ='<p class="auth">' + ', '.join(authors) + ' | ' + dt + '</p>\n'
+        lede = ''.join(parse_body(prim_data.get('cover', {})))
+
+        free_cont = ''
+        for x in prim_data['data']['content']:
+            free_cont += '\n'+ ''.join(parse_body(x))
+
+        premium_cont = ''
+        if self.logged:
+            cont_url = 'https://api.caravanmagazine.in/api/paywall/check-article?articleId='
+            art_cont = json.loads(self.index_to_soup(cont_url + str(art_id), raw=True))
+            for x in art_cont['premiumContent']:
+                premium_cont += '\n' + ''.join(parse_body(x))
+
+        return '<html><body><div>' \
+                    + cat + title + desc + auth + lede + free_cont + premium_cont + \
+                        '</div></body></html>'