
Commit

Merge commit 'f1e57a86f12b668602825c03f1ff93fe8b8d34a9' of https://gi…
KaiHuaDou committed Aug 14, 2024
2 parents 9cd7a8c + f1e57a8 commit 5e98b8f
Showing 49 changed files with 1,117 additions and 584 deletions.
40 changes: 38 additions & 2 deletions Changelog.txt
@@ -23,6 +23,42 @@
# - title by author
# }}}

{{{ 7.14.0 2024-07-12

:: new features

- [2072442] Book details: When dropping files add an option to add them to the book as data files

- Edit book: A new action to toggle line wrapping mode in all code editors. Can be assigned via Preferences->Keyboard shortcuts->Global actions or added to the toolbar via Preferences->Toolbars->Book wide actions

- Kobo driver: Add an option to force the SeriesID for all books in a series to have the same value

:: bug fixes

- [2071458] Fix a regression in the previous release that broke merging of books when the confirmation for the merge was disabled

- [2072412] E-book viewer: Allow some header and footer items such as progress to overflow instead of being truncated

- [2072405] HTMLZ output: Make renaming of image files happen in filename order

- [2072384] MTP driver: Fix infinite loop when connecting to some devices with more than 65K objects in their filesystem

- Fix shortcut editing widget when Qt is using a theme that inserts accelerators into push button labels automatically

:: improved recipes
- Instapaper
- MIT Tech Review
- Guardian
- Liberation
- The Times and Sunday Times
- Bloomberg Businessweek
- Times Literary Supplement

:: new recipes
- Ancient Egypt Magazine, Minerva Magazine, Military History Magazine and World Archaeology Magazine by unkn0wn

}}}

{{{ 7.13.0 2024-06-28

:: new features
@@ -43,7 +79,7 @@

- Amazon metadata: When filtering search engine results by title ignore words of the title that are purely punctuation

- When matching books on a device to book in the library assume a match if the title and any one author match, dont require all authors to match
- When matching books on a device to book in the library assume a match if the title and any one author match, don't require all authors to match

- [2069553] Update Google Images cover download plugin for website changes

@@ -749,7 +785,7 @@

- MTP driver: Ignore top level folders whose names start with a leading dot. Also ignore AppleDouble files, top level system and fonts folders and sdr folders on Kindle devices

- [2033074] FB2 Input: use the <p> tag for paragraphs that dont contain other block content
- [2033074] FB2 Input: use the <p> tag for paragraphs that don't contain other block content

- [2033118] E-book viewer: Fix clicking on the back/forward buttons not working in some situations

2 changes: 1 addition & 1 deletion manual/faq.rst
@@ -660,7 +660,7 @@ then import it on another computer. First let's see how to export the data:
that case, right-click the calibre icon in the toolbar and point it to the
newly copied folder. You will now have two calibre libraries on your
computer and you can switch between them by clicking the calibre icon on
the toolbar. Transferring your library in this manner preserver all your
the toolbar. Transferring your library in this manner preserves all your
metadata, tags, custom columns, etc.


2 changes: 1 addition & 1 deletion manual/mathjax.html
@@ -86,7 +86,7 @@ <h2>Maxwell's Equations</h2>
\end{align}
</p>

<h2>In-line Mathematics</h2>
<h2>Inline Mathematics</h2>

<p>While display equations look good for a page of samples, the
ability to mix math and text in a paragraph is also important. This
103 changes: 103 additions & 0 deletions recipes/ancient_egypt.recipe
@@ -0,0 +1,103 @@
'''
https://ancientegyptmagazine.com
'''
from calibre import browser
from calibre.web.feeds.news import BasicNewsRecipe


class ancientegypt(BasicNewsRecipe):
title = 'The Past: Ancient Egypt Magazine'
language = 'en'
__author__ = 'unkn0wn'
description = (
'Ancient Egypt is the world\'s leading Egyptology magazine, exploring the history, people and culture of the Nile Valley. '
'Now in a larger format with a fresh new design, AE brings you the latest news and discoveries, and feature articles covering '
'more than 5000 years of Egyptian history. Published bimonthly.'
)
no_stylesheets = True
use_embedded_content = False
remove_attributes = ['style', 'height', 'width']
ignore_duplicate_articles = {'url'}
resolve_internal_links = True
masthead_url = 'https://ancientegyptmagazine.com/media/website/ae-logo-2.png'
simultaneous_downloads = 1

extra_css = '''
[class^="meta"] { font-size:small; }
.post-subtitle { font-style: italic; color:#202020; }
.wp-block-image { font-size:small; text-align:center; }
'''

keep_only_tags = [
dict(attrs={'class':lambda x: x and '__header' in x}),
dict(attrs={'class':lambda x: x and '__background' in x}),
dict(attrs={'class':lambda x: x and '__body_area' in x}),
]

remove_tags = [
dict(attrs={'class':'ad-break'}),
dict(attrs={'class':lambda x: x and 'avatar' in x.split()}),
dict(attrs={'class':lambda x: x and '--share' in x})
]

def preprocess_html(self, soup):
exp = soup.find(attrs={'class':lambda x: x and 'post-subtitle' in x.split()})
if exp:
exp.name = 'p'
return soup

def parse_index(self):
soup = self.index_to_soup('https://the-past.com/category/magazines/ae/')
art = soup.find('article', attrs={'class':lambda x: x and 'tag-magazines' in x.split()})
url = art.h2.a['href']

# for past editions, add url
# url = ''

issue = self.index_to_soup(url)
ti = issue.find('h1', attrs={'class':lambda x: x and 'post-title' in x.split()})
if ti:
self.title = self.tag_to_string(ti).strip()
dt = soup.find(attrs={'class':lambda x: x and '__date' in x})
if dt:
self.timefmt = ' [' + self.tag_to_string(dt).strip() + ']'
edit = issue.find('h2', attrs={'id':'from-the-editor'})
if edit and edit.findParent('div'):
self.description = self.tag_to_string(edit.findParent('div'))
cov = issue.find('figure', attrs={'class':lambda x: x and 'wp-block-image' in x.split()})
if cov:
self.cover_url = cov.img['src']
div = issue.find('div', attrs={'class':lambda x: x and 'entry-content' in x.split()})

feeds = []

h2 = div.findAll('h2', attrs={'class':lambda x: x and 'wp-block-heading' in x.split()})
lt = div.findAll(attrs={'class':'display-posts-listing'})
for x, y in zip(h2, lt):
section = self.tag_to_string(x).strip()
self.log(section)
articles = []
for a in y.findAll('a', href=True, attrs={'class':'title'}):
url = a['href']
title = self.tag_to_string(a).strip()
desc = ''
exp = a.findNext(attrs={'class':'excerpt'})
if exp:
desc = self.tag_to_string(exp).strip()
self.log('\t', title, '\n\t', desc, '\n\t\t', url)
articles.append({'title': title, 'description':desc, 'url': url})
if articles:
feeds.append((section, articles))
return feeds

def get_browser(self, *args, **kwargs):
return self

def clone_browser(self, *args, **kwargs):
return self.get_browser()

def open_novisit(self, *args, **kwargs):
br = browser()
return br.open_novisit(*args, **kwargs)

open = open_novisit
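
The three overrides at the end of this recipe route every download through a brand-new calibre browser: get_browser hands back the recipe object itself, so the download machinery ends up calling the recipe's own open/open_novisit, which build a fresh browser for each request with no shared cookies or connection state, and simultaneous_downloads = 1 keeps those requests sequential. A minimal sketch of the same pattern in isolation; the recipe name and title below are illustrative only, not part of this commit:

from calibre import browser
from calibre.web.feeds.news import BasicNewsRecipe


class FreshBrowserRecipe(BasicNewsRecipe):
    # Hypothetical recipe showing the per-request browser pattern used above
    title = 'Fresh Browser Example'
    simultaneous_downloads = 1

    def get_browser(self, *args, **kwargs):
        # Return the recipe itself so its own open()/open_novisit() get called
        return self

    def clone_browser(self, *args, **kwargs):
        return self.get_browser()

    def open_novisit(self, *args, **kwargs):
        # A brand-new browser for every request: nothing is reused between fetches
        return browser().open_novisit(*args, **kwargs)

    open = open_novisit
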
45 changes: 22 additions & 23 deletions recipes/bloomberg-business-week.recipe
@@ -1,8 +1,9 @@
import json
import random
import time
from collections import defaultdict

from calibre.web.feeds.news import BasicNewsRecipe, classes
from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes
from html5_parser import parse


@@ -106,30 +107,28 @@ class Bloomberg(BasicNewsRecipe):
self.log('Downloading ', edition)
self.cover_url = bw.find('img')['src'].replace('25x19', '600x800')
soup = self.index_to_soup(edition)
if timefmt := soup.find(attrs={'class':lambda x: x and x.startswith('styles_MagazineTitle__')}):
if timefmt := soup.find(**prefixed_classes('styles_TableOfContentsTitle__')):
self.timefmt = ' [' + (self.tag_to_string(timefmt).replace(' Issue', '')).strip() + ']'

feeds = []
for div in soup.findAll(attrs={'class':lambda x: x and x.startswith(
('styles_MagazineFeatures__', 'styles_MagazineStoryList__')
)}):
h3 = div.find(attrs={'class':lambda x: x and x.startswith(
('styles_featuresTitle__', 'styles_magazineSectionTitle__')
)})
sec = self.tag_to_string(h3)
self.log(sec)
articles = []
for art in div.findAll(attrs={'data-component':'headline'}):
a = art.find('a', href=True)
url = a['href']
if url.startswith('http') is False:
url = 'https://www.bloomberg.com' + a['href']
title = self.tag_to_string(a)
articles.append({'title': title, 'url': url})
self.log('\t', title, '\n\t\t', url)
if articles:
feeds.append((sec, articles))
return feeds
feeds_dict = defaultdict(list)

sec = ''
toc = soup.find('section', attrs={'id':'toc-archive-businessweek'})
for div in toc.findAll(**prefixed_classes('MagazinePageMagazineArchive_itemContainer__')):
h3 = div.find(**prefixed_classes('MagazinePageMagazineArchive_itemSection__'))
if h3 and h3.text:
sec = self.tag_to_string(h3)
self.log(sec)
a = div.find(**prefixed_classes('MagazinePageMagazineArchive_storyLink__'))
url = a['href']
if url.startswith('http') is False:
url = 'https://www.bloomberg.com' + a['href']
title = self.tag_to_string(a)
byl = div.find(**prefixed_classes('Byline_phoenix__'))
desc = self.tag_to_string(byl)
self.log('\t', title, '\n\t', desc, '\n\t\t', url)
feeds_dict[sec].append({"title": title, "url": url, "description": desc})
return [(sec, articles) for sec, articles in feeds_dict.items()]

def preprocess_raw_html(self, raw, *a):
root = parse(raw)
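
The rewritten parse_index (the version built around feeds_dict, which replaces the loop shown directly above it) walks the issue's table of contents once, carries the most recently seen section heading forward, and groups each story under it with a defaultdict before flattening the result into the list of (section, articles) pairs calibre expects; prefixed_classes is the calibre helper that matches class attributes starting with a given prefix, which suits Bloomberg's hashed CSS-module class names. A self-contained sketch of just the grouping step, using made-up headings and URLs:

from collections import defaultdict

# Illustrative stand-ins for the (optional heading, title, url) triples pulled
# from the real table-of-contents markup
items = [
    ('Features', 'Story A', 'https://example.com/a'),
    (None, 'Story B', 'https://example.com/b'),       # no new heading: stays under 'Features'
    ('Economics', 'Story C', 'https://example.com/c'),
]

feeds_dict = defaultdict(list)
sec = ''
for heading, title, url in items:
    if heading:
        sec = heading  # remember the latest section title
    feeds_dict[sec].append({'title': title, 'url': url})

# calibre expects a list of (section, list-of-article-dicts) pairs
feeds = [(sec, articles) for sec, articles in feeds_dict.items()]
# -> [('Features', [2 articles]), ('Economics', [1 article])]
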
6 changes: 6 additions & 0 deletions recipes/guardian.recipe
@@ -114,3 +114,9 @@ class Guardian(BasicNewsRecipe):
feeds = list(self.parse_section(self.base_url))
feeds += list(self.parse_section('https://www.theguardian.com/uk/sport'))
return feeds

def preprocess_html(self, soup):
for table in soup.findAll('table'):
if len(table.findAll('tr')) > 20:
table.decompose()
return soup
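
The Guardian addition above removes any table with more than twenty rows, presumably to drop large listing or results tables that convert poorly to e-book pages while leaving small layout tables alone. The same behaviour in isolation, using BeautifulSoup directly on made-up HTML:

from bs4 import BeautifulSoup

html = '<body><table>' + '<tr><td>row</td></tr>' * 25 + '</table><p>kept</p></body>'
soup = BeautifulSoup(html, 'html.parser')
for table in soup.findAll('table'):
    if len(table.findAll('tr')) > 20:
        table.decompose()  # drop the oversized table and everything inside it

print(soup.find('table'))  # None
print(soup.find('p'))      # <p>kept</p>
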
8 changes: 4 additions & 4 deletions recipes/harpers.recipe
@@ -51,7 +51,7 @@ class Harpers(BasicNewsRecipe):
for img in soup.findAll('img', attrs={'srcset':True}):
for src in img['srcset'].split(','):
if '768w' in src:
img['src'] = img['src'].split()[0]
img['src'] = src.split()[0]
return soup

def parse_index(self):
@@ -67,12 +67,12 @@
for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith(url + '/')}):
if not a.find('img') and a.find(['h1', 'h2', 'h3', 'h4']):
url = a['href']
title = self.tag_to_string(a)
title = self.tag_to_string(a).strip()
desc = ''
div = a.findParent('div').find('div', attrs={'class':'byline'})
if div:
desc = self.tag_to_string(div)
self.log('\t', title, '\n\t', desc, '\n\t', url)
desc = self.tag_to_string(div).strip()
self.log(' ', title, '\n\t', desc[:-1], '\n\t', url)
ans.append({'title': title, 'description': desc, 'url': url})
return [('Articles', ans)]

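
The one-character Harpers fix above, replacing img['src'].split()[0] with src.split()[0], matters because every srcset entry has the form "URL WIDTHw": once the 768w candidate is found, its own URL must be taken rather than the value already sitting in the tag's src attribute. A small sketch with an invented srcset value:

srcset = ('https://example.com/photo-480.jpg 480w, '
          'https://example.com/photo-768.jpg 768w, '
          'https://example.com/photo-1200.jpg 1200w')

chosen = None
for src in srcset.split(','):
    if '768w' in src:
        chosen = src.split()[0]  # the URL half of "URL 768w"

print(chosen)  # https://example.com/photo-768.jpg
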
Binary file added recipes/icons/ancient_egypt.png
Binary file added recipes/icons/military_history.png
Binary file added recipes/icons/minerva_magazine.png
Binary file removed recipes/icons/sunday_times_magazine.png
Binary file modified recipes/icons/times_online.png
Binary file added recipes/icons/world_archeology.png
Binary file added recipes/icons/wsj_mag.png
