diff --git a/Changelog.txt b/Changelog.txt index 351c8a5e6a0a..45d2c10c4165 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -23,6 +23,37 @@ # - title by author # }}} +{{{ 7.10.0 2024-05-03 + +:: new features + +- Export of calibre data: Ensure individual part files in the exported data are no larger than one gigabyte even + if the library contains individual files larger than that size. + + Note that this means that exports created by calibre from this version + on will not be importable by earlier versions. However, exports from + earlier versions should still be importable. + +- Edit book: Spell check: Add options to exclude words in ALL CAPS or with numbers or in camelCase/snake_case from the list of words + +- Allow easily inverting the current search via the right click menu on the search box + +:: bug fixes + +- [2064546] Kobo driver: Fix database unsupported error with newest firmware + +- [2063301] DOCX Input: Fix text elements containing only whitespace being incorrectly ignored + +- Bulk metadata dialog: Do not fail when setting covers from ebook files and some of the files have invalid covers + +:: improved recipes +- Economist +- The Week +- Caravan Magazine +- Financial Times + +}}} + {{{ 7.9.0 2024-04-19 :: new features diff --git a/bypy/sources.json b/bypy/sources.json index 0e1a8ae071d2..506a79bab504 100644 --- a/bypy/sources.json +++ b/bypy/sources.json @@ -323,8 +323,8 @@ { "name": "libxml2", "unix": { - "filename": "libxml2-2.12.1.tar.xz", - "hash": "sha256:8982b9ccdf7f456e30d8f7012d50858c6623e495333b6191def455c7e95427eb", + "filename": "libxml2-2.12.6.tar.xz", + "hash": "sha256:889c593a881a3db5fdd96cc9318c87df34eb648edfc458272ad46fd607353fbb", "urls": ["https://download.gnome.org/sources/libxml2/2.12/{filename}"] } }, @@ -620,8 +620,8 @@ { "name": "lxml", "unix": { - "filename": "lxml-4.9.3.tar.gz", - "hash": "sha256:48628bd53a426c9eb9bc066a923acaa0878d1e86129fd5359aee99285f4eed9c", + "filename": "lxml-5.2.1.tar.gz", + "hash": "sha256:3f7765e69bbce0906a7c74d5fe46d2c7a7596147318dbc08e4a2431f3060e306", "urls": ["pypi"] } }, @@ -968,6 +968,15 @@ } }, + { + "name": "lxml-html-clean", + "unix": { + "filename": "lxml_html_clean-0.1.1-py3-none-any.whl", + "hash": "sha256:58c04176593c9caf72ec92e033d2f38859e918b3eff0cc0f8051ad27dc2ab8ef", + "urls": ["pypi"] + } + }, + { "name": "ply", "comment": "Needed for sip (build time dependency)", diff --git a/bypy/windows/site.py b/bypy/windows/site.py index 2871855025b3..fb0e6450b073 100644 --- a/bypy/windows/site.py +++ b/bypy/windows/site.py @@ -60,16 +60,6 @@ def set_quit(): builtins.exit = _sitebuiltins.Quitter('exit', eof) -def workaround_lxml_bug(): - # Without calling xmlInitParser() import lxml causes a segfault - import ctypes - x = ctypes.WinDLL('libxml2.dll') - x.xmlInitParser() - workaround_lxml_bug.libxml2 = x - from lxml import etree - del etree - - def main(): sys.meta_path.insert(0, PydImporter()) os.add_dll_directory(os.path.abspath(os.path.join(sys.app_dir, 'app', 'bin'))) @@ -85,8 +75,6 @@ def fake_getline(filename, lineno, module_globals=None): set_helper() set_quit() - workaround_lxml_bug() - return run_entry_point() diff --git a/recipes/caravan_magazine.recipe b/recipes/caravan_magazine.recipe index c271c3a92c14..98e86d58f613 100644 --- a/recipes/caravan_magazine.recipe +++ b/recipes/caravan_magazine.recipe @@ -1,10 +1,7 @@ -#!/usr/bin/env python -# vim:fileencoding=utf-8 -# License: GPLv3 Copyright: 2015, Kovid Goyal - import json +from urllib.parse import quote, urlparse -from calibre.web.feeds.news import 
BasicNewsRecipe, classes
+from calibre.web.feeds.news import BasicNewsRecipe
 from mechanize import Request
@@ -21,6 +18,45 @@ def safe_dict(data, *names):
         ans = ans.get(x) or ''
     return ans
+
+def parse_body(x):
+    if x.get('type', '') == 'paragraph':
+        yield '<p>'
+        for p in x.get('content', {}):
+            yield ''.join(parse_p(p))
+        yield '</p>\n'
+    elif x.get('type', '') in {'blockquote', 'pullquote'}:
+        yield '<blockquote>'
+        for p in x.get('content', {}):
+            yield from parse_body(p)
+        yield '</blockquote>'
+    elif x.get('type', '') == 'figure':
+        yield '<img src="{}">'.format(absurl(x['attrs']['src'].replace('=s0', '=s768-rw')))
+        for p in x.get('content', {}):
+            yield from parse_body(p)
+    elif x.get('type', '') in {'caption', 'credit'}:
+        yield '<div class="sub">'
+        for div in x.get('content', {}):
+            yield ''.join(parse_p(div))
+        yield '</div>\n'
+    elif x.get('type', '') != '':
+        if 'content' in x:
+            yield '<p>'
+            for p in x.get('content', {}):
+                yield from parse_body(p)
+            yield '</p>'
+
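+# A leaf node is plain text, optionally wrapped in the HTML tag named by its first "mark".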
+def parse_p(p):
+    if p.get('type', '') == 'text':
+        if 'marks' in p:
+            tag = p['marks'][0]['type']
+            yield '<' + tag + '>'
+            yield p['text']
+            yield '</' + tag + '>'
+        else:
+            yield p['text']
+
+
 class CaravanMagazine(BasicNewsRecipe):
 
     title = 'Caravan Magazine'
 
@@ -40,23 +76,26 @@ class CaravanMagazine(BasicNewsRecipe):
     remove_attributes = ['style', 'height', 'width']
     ignore_duplicate_articles = {'url'}
     resolve_internal_links = True
+    needs_subscription = 'optional'
+    logged = False
 
     extra_css = '''
+        img {display:block; margin:0 auto;}
         blockquote, em {color:#202020;}
-        .article_subtitle {font-style:italic; color:#202020;}
-        #fig-c, .photo_wrapper, .cover_figure_element {text-align:center; font-size:small;}
-        .pre-title, .text_wrapper {font-size:small; color:#404040;}
+        .desc {font-style:italic; color:#202020;}
+        .sub {text-align:center; font-size:small;}
+        .cat, .auth {font-size:small; color:#404040;}
     '''
 
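+    # Login goes through the site's tRPC endpoint; the payload is a numbered "batch" envelope.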
     def get_browser(self, *args, **kw):
         br = BasicNewsRecipe.get_browser(self, *args, **kw)
         if not self.username or not self.password:
             return br
-        data = json.dumps({'email': self.username, 'name': '', 'password': self.password})
+        data = json.dumps({"0":{"json":{"email":self.username,"password":self.password}}})
         if not isinstance(data, bytes):
             data = data.encode('utf-8')
         rq = Request(
-            url='https://caravanmagazine.in/api/users/login',
+            url='https://caravanmagazine.in/api/trpc/users.login?batch=1',
             data=data,
             headers={
                 'Accept': 'application/json, text/plain, */*',
             },
             method='POST'
         )
-        res = br.open(rq).read()
-        res = res.decode('utf-8')
-        self.log('Login request response: {}'.format(res))
-        res = json.loads(res)
-        if res['code'] != 200 or res['message'] != "Login success":
-            raise ValueError('Login failed, check your username and password')
+        try:
+            res = br.open(rq).read()
+            res = res.decode('utf-8')
+            res = json.loads(res)
+            self.log(safe_dict(res[0], 'result', 'data', 'json', 'message'))
+            self.logged = True
+        except:
+            self.log.warn('\n**Login failed, check your username and password\n')
+            return br
         return br
 
-    keep_only_tags = [
-        classes('text_wrapper cover_figure_element article_content')
-    ]
-
-    def preprocess_html(self, soup):
-        h2 = soup.find('h2')
-        if h2:
-            h2.name = 'p'
-        for fc in soup.findAll('figcaption'):
-            fc['id'] = 'fig-c'
-        return soup
-
     def parse_index(self):
         self.log(
             '\n***\nif this recipe fails, report it on: '
             'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n'
         )
+        api = 'https://api.caravanmagazine.in/api/trpc/magazines.getLatestIssue'
-        # api = 'https://api.caravanmagazine.in/api/trpc/magazines.getForMonthAndYear?batch=1&' + \
-        #     'input=%7B%220%22%3A%7B%22json%22%3A%7B%22month%22%3A' + '2' + '%2C%22year%22%3A' + '2024' + '%7D%7D%7D'
-        # input={"0":{"json":{"month":2,"year":2024}}}
-        raw = self.index_to_soup(api, raw=True)
-        data = json.loads(raw)['result']['data']['json']
+        # for past editions
+        # inp = json.dumps({"0":{"json":{"month":6,"year":2023}}})
+        # api = 'https://api.caravanmagazine.in/api/trpc/magazines.getForMonthAndYear?batch=1&input=' + quote(inp, safe='')
+
+        raw = json.loads(self.index_to_soup(api, raw=True))
+        if isinstance(raw, list):
+            data = raw[0]['result']['data']['json']
+        else:
+            data = raw['result']['data']['json']
 
         cover = safe_dict(data, 'issue', 'cover', 'data', 'url').replace('=s0', '=s768-rw')
         self.cover_url = absurl(cover)
 
@@ -122,3 +157,46 @@ class CaravanMagazine(BasicNewsRecipe):
             if articles:
                 feeds.append((section, articles))
         return feeds
+
+    def print_version(self, url):
+        slug = urlparse(url).path
+        inp = json.dumps({"0":{"json":{"slug":slug}}})
+        return 'https://api.caravanmagazine.in/api/trpc/articles.getFromCache?batch=1&input=' + quote(inp, safe='')
+
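+    # The cache endpoint returns article JSON rather than HTML; rebuild a minimal page from it.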
+    def preprocess_raw_html(self, raw, url):
+        cache_data = json.loads(raw)[0]
+        art_id = cache_data['result']['data']['json']['articleId']
+        prim_data = cache_data['result']['data']['json']['data']
+
+        cat = desc = lede = auth = ''
+
+        cat = '<div class="cat">' + safe_dict(prim_data, 'printTitle') + '</div>\n'
+        title = '<h1>' + safe_dict(prim_data, 'title') + '</h1>\n'
+        desc = '<p class="desc">' + safe_dict(prim_data, 'description') + '</p>\n'
+
+        authors = []
+        for q in prim_data.get('authors', {}):
+            authors.append(safe_dict(q, 'name'))
+        dt = ''
+        if prim_data.get('writtenAt', '') != '':
+            import time
+            from datetime import datetime, timedelta
+            dt = datetime.fromisoformat(prim_data['writtenAt'][:-1]) + timedelta(seconds=time.timezone)
+            dt = dt.strftime('%b %d, %Y, %I:%M %p')
+        auth = '<p class="auth">' + ', '.join(authors) + ' | ' + dt + '</p>\n'
+        lede = ''.join(parse_body(prim_data.get('cover', {})))
+
+        free_cont = ''
+        for x in prim_data['data']['content']:
+            free_cont += '\n' + ''.join(parse_body(x))
+
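+        # A successful login unlocks the paywalled remainder, fetched separately and appended below.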
+        premium_cont = ''
+        if self.logged:
+            cont_url = 'https://api.caravanmagazine.in/api/paywall/check-article?articleId='
+            art_cont = json.loads(self.index_to_soup(cont_url + str(art_id), raw=True))
+            for x in art_cont['premiumContent']:
+                premium_cont += '\n' + ''.join(parse_body(x))
+
+        return '<html><body>' \
+            + cat + title + desc + auth + lede + free_cont + premium_cont + \
+            '</body></html>
' diff --git a/recipes/dilema.recipe b/recipes/dilema.recipe new file mode 100644 index 000000000000..1a64701880c4 --- /dev/null +++ b/recipes/dilema.recipe @@ -0,0 +1,107 @@ +#!/usr/bin/env python +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class Volkskrant(BasicNewsRecipe): + title = 'Dilema' + __author__ = 'Cristi Ghera' + max_articles_per_feed = 100 + description = '"Sint vechi, domnule!" (I.L. Caragiale)' + needs_subscription = False + language = 'ro' + country = 'RO' + category = 'politics, culture, Romania' + resolve_internal_links = True + remove_tags_before = { 'class': 'post' } + remove_tags_after = { 'class': 'post_content' } + remove_tags = [ + dict( + attrs={ + 'class': [ + 'single_meta_category', + 'avatar', + 'jm-post-like', + 'fa', + ] + } + ), + dict( + name=['div'], + attrs={ + 'class': ['mb-2'] + } + ), + dict(id=['like', 'dlik']), + dict(name=['script', 'noscript', 'style']), + ] + remove_attributes = ["class", "id", "name", "style"] + encoding = 'utf-8' + no_stylesheets = True + ignore_duplicate_articles = {'url'} + + def parse_index(self): + homepage_url = 'https://www.dilema.ro/' + soup = self.index_to_soup(homepage_url) + + articles = [] + + # .banner-container + banner_container = soup.find('div', attrs={'class': 'banner-container'}) + container = banner_container.find('h5') + a = container.find('a') + url = homepage_url + a.attrs['href'] + articles.append( + dict( + title=self.tag_to_string(container).strip(), + url=url, + date=self.tag_to_string(banner_container.find(attrs={'class': 'post-date'})).strip(), + description='', + content='' + ) + ) + + # .homepage_builder_3grid_post + containers = soup.findAll('div', attrs={'class': 'homepage_builder_3grid_post'}) + for container in containers: + if self.tag_to_string(container.find('h2')) in ['CELE MAI RECENTE', 'CELE MAI CITITE']: + continue + for article in container.findAll('div', attrs={'class': 'blog_grid_post_style'}): + title_container = article.find('h3') + if not title_container: + continue + url = title_container.find('a')['href'] + url = homepage_url + url + article_title = self.tag_to_string(title_container).strip() + author = self.tag_to_string( + article.find('a', attrs={'rel': 'author'}) + ).strip() + summary = self.tag_to_string(article.find('p')).strip() + pubdate = self.tag_to_string(article.find(attrs={'class': 'post-date'})) + description = author + ' - ' + summary + articles.append( + dict( + title=article_title, + url=url, + date=pubdate, + description=description, + content='' + ) + ) + + sections = [("Numărul curent", articles)] + return sections + + def preprocess_html(self, soup): + main_carousel = soup.find(attrs={'id': 'main-carousel'}) + if main_carousel: + img = main_carousel.find('img') + body = soup.find('body') + body.clear() + body.append(img) + return soup + + def get_cover_url(self): + url = 'https://www.dilema.ro/coperta-saptaminii/' + soup = self.index_to_soup(url) + img = soup.find(attrs={'id': 'main-carousel'}).find('img') + return url + img.attrs['src'] diff --git a/recipes/economist.recipe b/recipes/economist.recipe index e6dcd3acb851..960df8f62422 100644 --- a/recipes/economist.recipe +++ b/recipes/economist.recipe @@ -2,17 +2,22 @@ # License: GPLv3 Copyright: 2008, Kovid Goyal import json +import time from collections import defaultdict +from datetime import datetime, timedelta +from urllib.parse import quote, urlencode from calibre import replace_entities from calibre.ebooks.BeautifulSoup import NavigableString, Tag +from calibre.ptempfile import 
PersistentTemporaryFile from calibre.utils.date import parse_only_date from calibre.web.feeds.news import BasicNewsRecipe from html5_parser import parse from lxml import etree -# For past editions, set date to, for example, '2020-11-28' +# For past editions, set date to, for example, '2020-11-28'. edition_date = None +use_archive = True def E(parent, name, text='', **attrs): @@ -52,31 +57,63 @@ class JSONHasNoContent(ValueError): pass -def load_article_from_json(raw, root): - # open('/t/raw.json', 'w').write(raw) - try: - data = json.loads(raw)['props']['pageProps']['content'] - except KeyError as e: - raise JSONHasNoContent(e) - if isinstance(data, list): - data = data[0] - body = root.xpath('//body')[0] - for child in tuple(body): - body.remove(child) - article = E(body, 'article') - E(article, 'h4', data['subheadline'], style='color: red; margin: 0') - E(article, 'h1', data['headline'], style='font-size: x-large') - E(article, 'div', data['description'], style='font-style: italic') - E(article, 'div', (data['datePublishedString'] or '') + ' | ' + (data['dateline'] or ''), style='color: gray; margin: 1em') - main_image_url = safe_dict(data, 'image', 'main', 'url').get('canonical') - if main_image_url: - div = E(article, 'div') +if use_archive: + def load_article_from_json(raw, root): + # open('/t/raw.json', 'w').write(raw) + data = json.loads(raw) + body = root.xpath('//body')[0] + article = E(body, 'article') + E(article, 'div', data['flyTitle'] , style='color: red; font-size:small; font-weight:bold;') + E(article, 'h1', data['title'], title=safe_dict(data, "url", "canonical") or '') + E(article, 'div', data['rubric'], style='font-style: italic; color:#202020;') try: - E(div, 'img', src=main_image_url) + date = data['dateModified'] except Exception: - pass - for node in data.get('text') or (): - process_node(node, article) + date = data['datePublished'] + dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone) + dt = dt.strftime('%b %d, %Y, %I:%M %p') + if data['dateline'] is None: + E(article, 'p', dt, style='color: gray; font-size:small;') + else: + E(article, 'p', dt + ' | ' + (data['dateline']), style='color: gray; font-size:small;') + main_image_url = safe_dict(data, 'image', 'main', 'url').get('canonical') + if main_image_url: + div = E(article, 'div') + try: + E(div, 'img', src=main_image_url) + except Exception: + pass + for node in data.get('text') or (): + process_node(node, article) +else: + def load_article_from_json(raw, root): + # open('/t/raw.json', 'w').write(raw) + try: + data = json.loads(raw)['props']['pageProps']['content'] + except KeyError as e: + raise JSONHasNoContent(e) + if isinstance(data, list): + data = data[0] + body = root.xpath('//body')[0] + for child in tuple(body): + body.remove(child) + article = E(body, 'article') + E(article, 'div', replace_entities(data['subheadline']) , style='color: red; font-size:small; font-weight:bold;') + E(article, 'h1', replace_entities(data['headline'])) + E(article, 'div', replace_entities(data['description']), style='font-style: italic; color:#202020;') + if data['dateline'] is None: + E(article, 'p', (data['datePublishedString'] or ''), style='color: gray; font-size:small;') + else: + E(article, 'p', (data['datePublishedString'] or '') + ' | ' + (data['dateline']), style='color: gray; font-size:small;') + main_image_url = safe_dict(data, 'image', 'main', 'url').get('canonical') + if main_image_url: + div = E(article, 'div') + try: + E(div, 'img', src=main_image_url) + except Exception: + pass + for 
node in data.get('text') or (): + process_node(node, article) def cleanup_html_article(root): @@ -129,31 +166,9 @@ class Economist(BasicNewsRecipe): ' perspective. Best downloaded on Friday mornings (GMT)' ) extra_css = ''' - .headline {font-size: x-large;} - h2 { font-size: small; } - h1 { font-size: medium; } - em.Bold {font-weight:bold;font-style:normal;} - em.Italic {font-style:italic;} - p.xhead {font-weight:bold;} - .pullquote { - float: right; - font-size: larger; - font-weight: bold; - font-style: italic; - page-break-inside:avoid; - border-bottom: 3px solid black; - border-top: 3px solid black; - width: 228px; - margin: 0px 0px 10px 15px; - padding: 7px 0px 9px; - } - .flytitle-and-title__flytitle { - display: block; - font-size: smaller; - color: red; - } + em { color:#202020; } img {display:block; margin:0 auto;} - ''' + ''' oldest_article = 7.0 resolve_internal_links = True remove_tags = [ @@ -186,15 +201,6 @@ class Economist(BasicNewsRecipe): needs_subscription = False - def __init__(self, *args, **kwargs): - BasicNewsRecipe.__init__(self, *args, **kwargs) - if self.output_profile.short_name.startswith('kindle'): - # Reduce image sizes to get file size below amazon's email - # sending threshold - self.web2disk_options.compress_news_images = True - self.web2disk_options.compress_news_images_auto_size = 5 - self.log.warn('Kindle Output profile being used, reducing image quality to keep file size below amazon email threshold') - def get_browser(self, *args, **kwargs): # Needed to bypass cloudflare kwargs['user_agent'] = 'common_words/based' @@ -202,19 +208,170 @@ class Economist(BasicNewsRecipe): br.addheaders += [('Accept-Language', 'en-GB,en-US;q=0.9,en;q=0.8')] return br + def publication_date(self): + if edition_date: + return parse_only_date(edition_date, as_utc=False) + url = self.browser.open("https://www.economist.com/printedition").geturl() + return parse_only_date(url.split("/")[-1], as_utc=False) + + def economist_test_article(self): + return [('Articles', [{'title':'test', + 'url':'https://www.economist.com/the-americas/2024/04/14/elon-musk-is-feuding-with-brazils-powerful-supreme-court' + }])] + + def economist_return_index(self, ans): + if not ans: + raise NoArticles( + 'Could not find any articles, either the ' + 'economist.com server is having trouble and you should ' + 'try later or the website format has changed and the ' + 'recipe needs to be updated.' 
+ ) + return ans + + if use_archive: + def parse_index(self): + # return self.economist_test_article() + url = 'https://www.economist.com/weeklyedition/archive' + if edition_date: + url = 'https://www.economist.com/weeklyedition/' + edition_date + soup = self.index_to_soup(url) + script_tag = soup.find("script", id="__NEXT_DATA__") + if script_tag is None: + raise ValueError('No script tag with JSON data found in the weeklyedition archive') + data = json.loads(script_tag.string) + content_id = data['props']['pageProps']['content']['id'].split('/')[-1] + query = { + 'query': 'query LatestWeeklyAutoEditionQuery($ref:String!){canonical(ref:$ref){hasPart(from:0 size:1 sort:"datePublished:desc"){parts{...WeeklyEditionFragment __typename}__typename}__typename}}fragment WeeklyEditionFragment on Content{id type datePublished image{...ImageCoverFragment __typename}url{canonical __typename}hasPart(size:100 sort:"publication.context.position"){parts{...ArticleFragment __typename}__typename}__typename}fragment ArticleFragment on Content{ad{grapeshot{channels{name __typename}__typename}__typename}articleSection{internal{id title:headline __typename}__typename}audio{main{id duration(format:"seconds")source:channel{id __typename}url{canonical __typename}__typename}__typename}byline dateline dateModified datePublished dateRevised flyTitle:subheadline id image{...ImageInlineFragment ...ImageMainFragment ...ImagePromoFragment __typename}print{title:headline flyTitle:subheadline rubric:description section{id title:headline __typename}__typename}publication{id tegID title:headline flyTitle:subheadline datePublished regionsAllowed url{canonical __typename}__typename}rubric:description source:channel{id __typename}tegID text(format:"json")title:headline type url{canonical __typename}topic contentIdentity{forceAppWebview mediaType articleType __typename}__typename}fragment ImageInlineFragment on Media{inline{url{canonical __typename}width height __typename}__typename}fragment ImageMainFragment on Media{main{url{canonical __typename}width height __typename}__typename}fragment ImagePromoFragment on Media{promo{url{canonical __typename}id width height __typename}__typename}fragment ImageCoverFragment on Media{cover{headline width height url{canonical __typename}regionsAllowed __typename}__typename}', # noqa + 'operationName': 'LatestWeeklyAutoEditionQuery', + 'variables': '{{"ref":"/content/{}"}}'.format(content_id), + } + if edition_date: + query = { + 'query': 'query SpecificWeeklyEditionQuery($path:String!){section:canonical(ref:$path){...WeeklyEditionFragment __typename}}fragment WeeklyEditionFragment on Content{id type datePublished image{...ImageCoverFragment __typename}url{canonical __typename}hasPart(size:100 sort:"publication.context.position"){parts{...ArticleFragment __typename}__typename}__typename}fragment ArticleFragment on Content{ad{grapeshot{channels{name __typename}__typename}__typename}articleSection{internal{id title:headline __typename}__typename}audio{main{id duration(format:"seconds")source:channel{id __typename}url{canonical __typename}__typename}__typename}byline dateline dateModified datePublished dateRevised flyTitle:subheadline id image{...ImageInlineFragment ...ImageMainFragment ...ImagePromoFragment __typename}print{title:headline flyTitle:subheadline rubric:description section{id title:headline __typename}__typename}publication{id tegID title:headline flyTitle:subheadline datePublished regionsAllowed url{canonical __typename}__typename}rubric:description source:channel{id 
__typename}tegID text(format:"json")title:headline type url{canonical __typename}topic contentIdentity{forceAppWebview mediaType articleType __typename}__typename}fragment ImageInlineFragment on Media{inline{url{canonical __typename}width height __typename}__typename}fragment ImageMainFragment on Media{main{url{canonical __typename}width height __typename}__typename}fragment ImagePromoFragment on Media{promo{url{canonical __typename}id width height __typename}__typename}fragment ImageCoverFragment on Media{cover{headline width height url{canonical __typename}regionsAllowed __typename}__typename}', # noqa + 'operationName': 'SpecificWeeklyEditionQuery', + 'variables': '{{"path":"/content/{}"}}'.format(content_id), + } + url = 'https://cp2-graphql-gateway.p.aws.economist.com/graphql?' + urlencode(query, safe='()!', quote_via=quote) + raw = self.index_to_soup(url, raw=True) + ans = self.economist_parse_index(raw) + return self.economist_return_index(ans) + + def economist_parse_index(self, raw): + if edition_date: + data = json.loads(raw)['data']['section'] + else: + data = json.loads(raw)['data']['canonical']['hasPart']['parts'][0] + self.description = data['image']['cover'][0]['headline'] + dt = datetime.fromisoformat(data['datePublished'][:-1]) + timedelta(seconds=time.timezone) + dt = dt.strftime('%b %d, %Y') + self.timefmt = ' [' + dt + ']' + self.cover_url = data['image']['cover'][0]['url']['canonical'].replace('economist.com/', + 'economist.com/cdn-cgi/image/width=960,quality=80,format=auto/') + self.log('Got cover:', self.cover_url) + + feeds_dict = defaultdict(list) + for part in safe_dict(data, "hasPart", "parts"): + try: + section = part['articleSection']['internal'][0]['title'] + except Exception: + section = safe_dict(part, 'print', 'section', 'title') or 'section' + if section not in feeds_dict: + self.log(section) + title = safe_dict(part, "title") + desc = safe_dict(part, "rubric") or '' + sub = safe_dict(part, "flyTitle") or '' + if sub and section != sub: + desc = sub + ' :: ' + desc + pt = PersistentTemporaryFile('.html') + pt.write(json.dumps(part).encode('utf-8')) + pt.close() + url = 'file:///' + pt.name + feeds_dict[section].append({"title": title, "url": url, "description": desc}) + self.log('\t', title, '\n\t\t', desc) + return [(section, articles) for section, articles in feeds_dict.items()] + + def populate_article_metadata(self, article, soup, first): + article.url = soup.find('h1')['title'] + + def preprocess_html(self, soup): + for img in soup.findAll('img', src=True): + img['src'] = img['src'].replace('economist.com/', + 'economist.com/cdn-cgi/image/width=600,quality=80,format=auto/') + return soup + + else: # Load articles from individual article pages {{{ + + def __init__(self, *args, **kwargs): + BasicNewsRecipe.__init__(self, *args, **kwargs) + if self.output_profile.short_name.startswith('kindle'): + # Reduce image sizes to get file size below amazon's email + # sending threshold + self.web2disk_options.compress_news_images = True + self.web2disk_options.compress_news_images_auto_size = 5 + self.log.warn('Kindle Output profile being used, reducing image quality to keep file size below amazon email threshold') + + def parse_index(self): + # return self.economist_test_article() + if edition_date: + url = 'https://www.economist.com/weeklyedition/' + edition_date + self.timefmt = ' [' + edition_date + ']' + else: + url = 'https://www.economist.com/weeklyedition' + soup = self.index_to_soup(url) + ans = self.economist_parse_index(soup) + return 
self.economist_return_index(ans) + + def economist_parse_index(self, soup): + script_tag = soup.find("script", id="__NEXT_DATA__") + if script_tag is not None: + data = json.loads(script_tag.string) + # open('/t/raw.json', 'w').write(json.dumps(data, indent=2, sort_keys=True)) + self.description = safe_dict(data, "props", "pageProps", "content", "image", "main", "headline") + self.timefmt = ' [' + safe_dict(data, "props", "pageProps", "content", "datePublishedString") + ']' + self.cover_url = safe_dict(data, "props", "pageProps", "content", "image", "main", "url", "canonical") + self.log('Got cover:', self.cover_url) + + feeds_dict = defaultdict(list) + for part in safe_dict(data, "props", "pageProps", "content", "hasPart", "parts"): + section = safe_dict(part, "print", "section", "headline") or '' + title = safe_dict(part, "headline") or '' + url = safe_dict(part, "url", "canonical") or '' + if not section or not title or not url: + continue + desc = safe_dict(part, "description") or '' + sub = safe_dict(part, "subheadline") or '' + if sub and section != sub: + desc = sub + ' :: ' + desc + feeds_dict[section].append({"title": title, "url": url, "description": desc}) + self.log(' ', title, url, '\n ', desc) + return [(section, articles) for section, articles in feeds_dict.items()] + else: + return [] + + # }}} + + def preprocess_raw_html(self, raw, url): # open('/t/raw.html', 'wb').write(raw.encode('utf-8')) - root = parse(raw) + if use_archive: + body = '
<html><body><article></article></body></html>'
+            root = parse(body)
+            load_article_from_json(raw, root)
+        else:
+            root = parse(raw)
+            script = root.xpath('//script[@id="__NEXT_DATA__"]')
+            if script:
+                try:
+                    load_article_from_json(script[0].text, root)
+                except JSONHasNoContent:
+                    cleanup_html_article(root)
+
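+        # Interactive pieces are rendered client-side in the browser; return a short placeholder page.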
         if '/interactive/' in url:
-            return '<html><body><article><h1>' + root.xpath('//h1')[0].text + '</h1><em>' \
+            return '<html><body><article><h1>' + root.xpath('//h1')[0].text + '</h1><em>' \
                 + 'This article is supposed to be read in a browser' \
                 + '</em></article></body></html>
' - script = root.xpath('//script[@id="__NEXT_DATA__"]') - if script: - try: - load_article_from_json(script[0].text, root) - except JSONHasNoContent: - cleanup_html_article(root) + for div in root.xpath('//div[@class="lazy-image"]'): noscript = list(div.iter('noscript')) if noscript and noscript[0].text: @@ -227,11 +384,15 @@ class Economist(BasicNewsRecipe): for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'): x.getparent().remove(x) # the economist uses for small caps with a custom font + for init in root.xpath('//span[@data-caps="initial"]'): + init.set('style', 'font-weight:bold;') for x in root.xpath('//small'): if x.text and len(x) == 0: x.text = x.text.upper() x.tag = 'span' x.set('style', 'font-variant: small-caps') + for h2 in root.xpath('//h2'): + h2.tag = 'h4' for x in root.xpath('//figcaption'): x.set('style', 'text-align:center; font-size:small;') for x in root.xpath('//cite'): @@ -239,17 +400,8 @@ class Economist(BasicNewsRecipe): x.set('style', 'color:#404040;') raw = etree.tostring(root, encoding='unicode') return raw - - def publication_date(self): - if edition_date: - return parse_only_date(edition_date, as_utc=False) - url = self.browser.open("https://www.economist.com/printedition").geturl() - return parse_only_date(url.split("/")[-1], as_utc=False) - - def parse_index(self): - # return [('Articles', [{'title':'test', - # 'url':'https://www.economist.com/interactive/briefing/2022/06/11/huge-foundation-models-are-turbo-charging-ai-progress' - # }])] + def parse_index_from_printedition(self): + # return self.economist_test_article() if edition_date: url = 'https://www.economist.com/weeklyedition/' + edition_date self.timefmt = ' [' + edition_date + ']' @@ -276,33 +428,6 @@ class Economist(BasicNewsRecipe): ) return ans - def economist_parse_index(self, soup): - script_tag = soup.find("script", id="__NEXT_DATA__") - if script_tag is not None: - data = json.loads(script_tag.string) - # open('/t/raw.json', 'w').write(json.dumps(data, indent=2, sort_keys=True)) - # self.title = 'The Economist | ' + safe_dict(data, "props", "pageProps", "content", "image", "main", "headline") - self.timefmt = ' [' + safe_dict(data, "props", "pageProps", "content", "datePublishedString") + ']' - self.cover_url = safe_dict(data, "props", "pageProps", "content", "image", "main", "url", "canonical") - self.log('Got cover:', self.cover_url) - - feeds_dict = defaultdict(list) - for part in safe_dict(data, "props", "pageProps", "content", "hasPart", "parts"): - section = safe_dict(part, "print", "section", "headline") or '' - title = safe_dict(part, "headline") or '' - url = safe_dict(part, "url", "canonical") or '' - if not section or not title or not url: - continue - desc = safe_dict(part, "description") or '' - sub = safe_dict(part, "subheadline") or '' - if sub and section != sub: - desc = sub + ' :: ' + desc - feeds_dict[section].append({"title": title, "url": url, "description": desc}) - self.log(' ', title, url, '\n ', desc) - return [(section, articles) for section, articles in feeds_dict.items()] - else: - return [] - def eco_find_image_tables(self, soup): for x in soup.findAll('table', align=['right', 'center']): if len(x.findAll('font')) in (1, 2) and len(x.findAll('img')) == 1: @@ -330,3 +455,12 @@ class Economist(BasicNewsRecipe): if url.endswith('/print'): url = url.rpartition('/')[0] return BasicNewsRecipe.canonicalize_internal_url(self, url, is_link=is_link) + + +def get_login_cookies(username, password): + print(33333333333, username, 
password) + + +if __name__ == '__main__': + import sys + get_login_cookies(sys.argv[-2], sys.argv[-1]) diff --git a/recipes/economist_espresso.recipe b/recipes/economist_espresso.recipe index 512bd7dfcfb8..522fe4a715f1 100644 --- a/recipes/economist_espresso.recipe +++ b/recipes/economist_espresso.recipe @@ -56,6 +56,9 @@ class Espresso(BasicNewsRecipe): ), ] + def print_version(self, url): + return 'https://webcache.googleusercontent.com/search?q=cache:' + url + def preprocess_html(self, soup): if h1 := soup.find('h1'): if p := h1.find_next_sibling('p'): diff --git a/recipes/economist_free.recipe b/recipes/economist_free.recipe index e6dcd3acb851..960df8f62422 100644 --- a/recipes/economist_free.recipe +++ b/recipes/economist_free.recipe @@ -2,17 +2,22 @@ # License: GPLv3 Copyright: 2008, Kovid Goyal import json +import time from collections import defaultdict +from datetime import datetime, timedelta +from urllib.parse import quote, urlencode from calibre import replace_entities from calibre.ebooks.BeautifulSoup import NavigableString, Tag +from calibre.ptempfile import PersistentTemporaryFile from calibre.utils.date import parse_only_date from calibre.web.feeds.news import BasicNewsRecipe from html5_parser import parse from lxml import etree -# For past editions, set date to, for example, '2020-11-28' +# For past editions, set date to, for example, '2020-11-28'. edition_date = None +use_archive = True def E(parent, name, text='', **attrs): @@ -52,31 +57,63 @@ class JSONHasNoContent(ValueError): pass -def load_article_from_json(raw, root): - # open('/t/raw.json', 'w').write(raw) - try: - data = json.loads(raw)['props']['pageProps']['content'] - except KeyError as e: - raise JSONHasNoContent(e) - if isinstance(data, list): - data = data[0] - body = root.xpath('//body')[0] - for child in tuple(body): - body.remove(child) - article = E(body, 'article') - E(article, 'h4', data['subheadline'], style='color: red; margin: 0') - E(article, 'h1', data['headline'], style='font-size: x-large') - E(article, 'div', data['description'], style='font-style: italic') - E(article, 'div', (data['datePublishedString'] or '') + ' | ' + (data['dateline'] or ''), style='color: gray; margin: 1em') - main_image_url = safe_dict(data, 'image', 'main', 'url').get('canonical') - if main_image_url: - div = E(article, 'div') +if use_archive: + def load_article_from_json(raw, root): + # open('/t/raw.json', 'w').write(raw) + data = json.loads(raw) + body = root.xpath('//body')[0] + article = E(body, 'article') + E(article, 'div', data['flyTitle'] , style='color: red; font-size:small; font-weight:bold;') + E(article, 'h1', data['title'], title=safe_dict(data, "url", "canonical") or '') + E(article, 'div', data['rubric'], style='font-style: italic; color:#202020;') try: - E(div, 'img', src=main_image_url) + date = data['dateModified'] except Exception: - pass - for node in data.get('text') or (): - process_node(node, article) + date = data['datePublished'] + dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone) + dt = dt.strftime('%b %d, %Y, %I:%M %p') + if data['dateline'] is None: + E(article, 'p', dt, style='color: gray; font-size:small;') + else: + E(article, 'p', dt + ' | ' + (data['dateline']), style='color: gray; font-size:small;') + main_image_url = safe_dict(data, 'image', 'main', 'url').get('canonical') + if main_image_url: + div = E(article, 'div') + try: + E(div, 'img', src=main_image_url) + except Exception: + pass + for node in data.get('text') or (): + process_node(node, article) +else: 
+ def load_article_from_json(raw, root): + # open('/t/raw.json', 'w').write(raw) + try: + data = json.loads(raw)['props']['pageProps']['content'] + except KeyError as e: + raise JSONHasNoContent(e) + if isinstance(data, list): + data = data[0] + body = root.xpath('//body')[0] + for child in tuple(body): + body.remove(child) + article = E(body, 'article') + E(article, 'div', replace_entities(data['subheadline']) , style='color: red; font-size:small; font-weight:bold;') + E(article, 'h1', replace_entities(data['headline'])) + E(article, 'div', replace_entities(data['description']), style='font-style: italic; color:#202020;') + if data['dateline'] is None: + E(article, 'p', (data['datePublishedString'] or ''), style='color: gray; font-size:small;') + else: + E(article, 'p', (data['datePublishedString'] or '') + ' | ' + (data['dateline']), style='color: gray; font-size:small;') + main_image_url = safe_dict(data, 'image', 'main', 'url').get('canonical') + if main_image_url: + div = E(article, 'div') + try: + E(div, 'img', src=main_image_url) + except Exception: + pass + for node in data.get('text') or (): + process_node(node, article) def cleanup_html_article(root): @@ -129,31 +166,9 @@ class Economist(BasicNewsRecipe): ' perspective. Best downloaded on Friday mornings (GMT)' ) extra_css = ''' - .headline {font-size: x-large;} - h2 { font-size: small; } - h1 { font-size: medium; } - em.Bold {font-weight:bold;font-style:normal;} - em.Italic {font-style:italic;} - p.xhead {font-weight:bold;} - .pullquote { - float: right; - font-size: larger; - font-weight: bold; - font-style: italic; - page-break-inside:avoid; - border-bottom: 3px solid black; - border-top: 3px solid black; - width: 228px; - margin: 0px 0px 10px 15px; - padding: 7px 0px 9px; - } - .flytitle-and-title__flytitle { - display: block; - font-size: smaller; - color: red; - } + em { color:#202020; } img {display:block; margin:0 auto;} - ''' + ''' oldest_article = 7.0 resolve_internal_links = True remove_tags = [ @@ -186,15 +201,6 @@ class Economist(BasicNewsRecipe): needs_subscription = False - def __init__(self, *args, **kwargs): - BasicNewsRecipe.__init__(self, *args, **kwargs) - if self.output_profile.short_name.startswith('kindle'): - # Reduce image sizes to get file size below amazon's email - # sending threshold - self.web2disk_options.compress_news_images = True - self.web2disk_options.compress_news_images_auto_size = 5 - self.log.warn('Kindle Output profile being used, reducing image quality to keep file size below amazon email threshold') - def get_browser(self, *args, **kwargs): # Needed to bypass cloudflare kwargs['user_agent'] = 'common_words/based' @@ -202,19 +208,170 @@ class Economist(BasicNewsRecipe): br.addheaders += [('Accept-Language', 'en-GB,en-US;q=0.9,en;q=0.8')] return br + def publication_date(self): + if edition_date: + return parse_only_date(edition_date, as_utc=False) + url = self.browser.open("https://www.economist.com/printedition").geturl() + return parse_only_date(url.split("/")[-1], as_utc=False) + + def economist_test_article(self): + return [('Articles', [{'title':'test', + 'url':'https://www.economist.com/the-americas/2024/04/14/elon-musk-is-feuding-with-brazils-powerful-supreme-court' + }])] + + def economist_return_index(self, ans): + if not ans: + raise NoArticles( + 'Could not find any articles, either the ' + 'economist.com server is having trouble and you should ' + 'try later or the website format has changed and the ' + 'recipe needs to be updated.' 
+ ) + return ans + + if use_archive: + def parse_index(self): + # return self.economist_test_article() + url = 'https://www.economist.com/weeklyedition/archive' + if edition_date: + url = 'https://www.economist.com/weeklyedition/' + edition_date + soup = self.index_to_soup(url) + script_tag = soup.find("script", id="__NEXT_DATA__") + if script_tag is None: + raise ValueError('No script tag with JSON data found in the weeklyedition archive') + data = json.loads(script_tag.string) + content_id = data['props']['pageProps']['content']['id'].split('/')[-1] + query = { + 'query': 'query LatestWeeklyAutoEditionQuery($ref:String!){canonical(ref:$ref){hasPart(from:0 size:1 sort:"datePublished:desc"){parts{...WeeklyEditionFragment __typename}__typename}__typename}}fragment WeeklyEditionFragment on Content{id type datePublished image{...ImageCoverFragment __typename}url{canonical __typename}hasPart(size:100 sort:"publication.context.position"){parts{...ArticleFragment __typename}__typename}__typename}fragment ArticleFragment on Content{ad{grapeshot{channels{name __typename}__typename}__typename}articleSection{internal{id title:headline __typename}__typename}audio{main{id duration(format:"seconds")source:channel{id __typename}url{canonical __typename}__typename}__typename}byline dateline dateModified datePublished dateRevised flyTitle:subheadline id image{...ImageInlineFragment ...ImageMainFragment ...ImagePromoFragment __typename}print{title:headline flyTitle:subheadline rubric:description section{id title:headline __typename}__typename}publication{id tegID title:headline flyTitle:subheadline datePublished regionsAllowed url{canonical __typename}__typename}rubric:description source:channel{id __typename}tegID text(format:"json")title:headline type url{canonical __typename}topic contentIdentity{forceAppWebview mediaType articleType __typename}__typename}fragment ImageInlineFragment on Media{inline{url{canonical __typename}width height __typename}__typename}fragment ImageMainFragment on Media{main{url{canonical __typename}width height __typename}__typename}fragment ImagePromoFragment on Media{promo{url{canonical __typename}id width height __typename}__typename}fragment ImageCoverFragment on Media{cover{headline width height url{canonical __typename}regionsAllowed __typename}__typename}', # noqa + 'operationName': 'LatestWeeklyAutoEditionQuery', + 'variables': '{{"ref":"/content/{}"}}'.format(content_id), + } + if edition_date: + query = { + 'query': 'query SpecificWeeklyEditionQuery($path:String!){section:canonical(ref:$path){...WeeklyEditionFragment __typename}}fragment WeeklyEditionFragment on Content{id type datePublished image{...ImageCoverFragment __typename}url{canonical __typename}hasPart(size:100 sort:"publication.context.position"){parts{...ArticleFragment __typename}__typename}__typename}fragment ArticleFragment on Content{ad{grapeshot{channels{name __typename}__typename}__typename}articleSection{internal{id title:headline __typename}__typename}audio{main{id duration(format:"seconds")source:channel{id __typename}url{canonical __typename}__typename}__typename}byline dateline dateModified datePublished dateRevised flyTitle:subheadline id image{...ImageInlineFragment ...ImageMainFragment ...ImagePromoFragment __typename}print{title:headline flyTitle:subheadline rubric:description section{id title:headline __typename}__typename}publication{id tegID title:headline flyTitle:subheadline datePublished regionsAllowed url{canonical __typename}__typename}rubric:description source:channel{id 
__typename}tegID text(format:"json")title:headline type url{canonical __typename}topic contentIdentity{forceAppWebview mediaType articleType __typename}__typename}fragment ImageInlineFragment on Media{inline{url{canonical __typename}width height __typename}__typename}fragment ImageMainFragment on Media{main{url{canonical __typename}width height __typename}__typename}fragment ImagePromoFragment on Media{promo{url{canonical __typename}id width height __typename}__typename}fragment ImageCoverFragment on Media{cover{headline width height url{canonical __typename}regionsAllowed __typename}__typename}', # noqa + 'operationName': 'SpecificWeeklyEditionQuery', + 'variables': '{{"path":"/content/{}"}}'.format(content_id), + } + url = 'https://cp2-graphql-gateway.p.aws.economist.com/graphql?' + urlencode(query, safe='()!', quote_via=quote) + raw = self.index_to_soup(url, raw=True) + ans = self.economist_parse_index(raw) + return self.economist_return_index(ans) + + def economist_parse_index(self, raw): + if edition_date: + data = json.loads(raw)['data']['section'] + else: + data = json.loads(raw)['data']['canonical']['hasPart']['parts'][0] + self.description = data['image']['cover'][0]['headline'] + dt = datetime.fromisoformat(data['datePublished'][:-1]) + timedelta(seconds=time.timezone) + dt = dt.strftime('%b %d, %Y') + self.timefmt = ' [' + dt + ']' + self.cover_url = data['image']['cover'][0]['url']['canonical'].replace('economist.com/', + 'economist.com/cdn-cgi/image/width=960,quality=80,format=auto/') + self.log('Got cover:', self.cover_url) + + feeds_dict = defaultdict(list) + for part in safe_dict(data, "hasPart", "parts"): + try: + section = part['articleSection']['internal'][0]['title'] + except Exception: + section = safe_dict(part, 'print', 'section', 'title') or 'section' + if section not in feeds_dict: + self.log(section) + title = safe_dict(part, "title") + desc = safe_dict(part, "rubric") or '' + sub = safe_dict(part, "flyTitle") or '' + if sub and section != sub: + desc = sub + ' :: ' + desc + pt = PersistentTemporaryFile('.html') + pt.write(json.dumps(part).encode('utf-8')) + pt.close() + url = 'file:///' + pt.name + feeds_dict[section].append({"title": title, "url": url, "description": desc}) + self.log('\t', title, '\n\t\t', desc) + return [(section, articles) for section, articles in feeds_dict.items()] + + def populate_article_metadata(self, article, soup, first): + article.url = soup.find('h1')['title'] + + def preprocess_html(self, soup): + for img in soup.findAll('img', src=True): + img['src'] = img['src'].replace('economist.com/', + 'economist.com/cdn-cgi/image/width=600,quality=80,format=auto/') + return soup + + else: # Load articles from individual article pages {{{ + + def __init__(self, *args, **kwargs): + BasicNewsRecipe.__init__(self, *args, **kwargs) + if self.output_profile.short_name.startswith('kindle'): + # Reduce image sizes to get file size below amazon's email + # sending threshold + self.web2disk_options.compress_news_images = True + self.web2disk_options.compress_news_images_auto_size = 5 + self.log.warn('Kindle Output profile being used, reducing image quality to keep file size below amazon email threshold') + + def parse_index(self): + # return self.economist_test_article() + if edition_date: + url = 'https://www.economist.com/weeklyedition/' + edition_date + self.timefmt = ' [' + edition_date + ']' + else: + url = 'https://www.economist.com/weeklyedition' + soup = self.index_to_soup(url) + ans = self.economist_parse_index(soup) + return 
self.economist_return_index(ans) + + def economist_parse_index(self, soup): + script_tag = soup.find("script", id="__NEXT_DATA__") + if script_tag is not None: + data = json.loads(script_tag.string) + # open('/t/raw.json', 'w').write(json.dumps(data, indent=2, sort_keys=True)) + self.description = safe_dict(data, "props", "pageProps", "content", "image", "main", "headline") + self.timefmt = ' [' + safe_dict(data, "props", "pageProps", "content", "datePublishedString") + ']' + self.cover_url = safe_dict(data, "props", "pageProps", "content", "image", "main", "url", "canonical") + self.log('Got cover:', self.cover_url) + + feeds_dict = defaultdict(list) + for part in safe_dict(data, "props", "pageProps", "content", "hasPart", "parts"): + section = safe_dict(part, "print", "section", "headline") or '' + title = safe_dict(part, "headline") or '' + url = safe_dict(part, "url", "canonical") or '' + if not section or not title or not url: + continue + desc = safe_dict(part, "description") or '' + sub = safe_dict(part, "subheadline") or '' + if sub and section != sub: + desc = sub + ' :: ' + desc + feeds_dict[section].append({"title": title, "url": url, "description": desc}) + self.log(' ', title, url, '\n ', desc) + return [(section, articles) for section, articles in feeds_dict.items()] + else: + return [] + + # }}} + + def preprocess_raw_html(self, raw, url): # open('/t/raw.html', 'wb').write(raw.encode('utf-8')) - root = parse(raw) + if use_archive: + body = '
<html><body><article></article></body></html>'
+            root = parse(body)
+            load_article_from_json(raw, root)
+        else:
+            root = parse(raw)
+            script = root.xpath('//script[@id="__NEXT_DATA__"]')
+            if script:
+                try:
+                    load_article_from_json(script[0].text, root)
+                except JSONHasNoContent:
+                    cleanup_html_article(root)
+
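+        # Interactive pieces are rendered client-side in the browser; return a short placeholder page.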
         if '/interactive/' in url:
-            return '<html><body><article><h1>' + root.xpath('//h1')[0].text + '</h1><em>' \
+            return '<html><body><article><h1>' + root.xpath('//h1')[0].text + '</h1><em>' \
                 + 'This article is supposed to be read in a browser' \
                 + '</em></article></body></html>
' - script = root.xpath('//script[@id="__NEXT_DATA__"]') - if script: - try: - load_article_from_json(script[0].text, root) - except JSONHasNoContent: - cleanup_html_article(root) + for div in root.xpath('//div[@class="lazy-image"]'): noscript = list(div.iter('noscript')) if noscript and noscript[0].text: @@ -227,11 +384,15 @@ class Economist(BasicNewsRecipe): for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'): x.getparent().remove(x) # the economist uses for small caps with a custom font + for init in root.xpath('//span[@data-caps="initial"]'): + init.set('style', 'font-weight:bold;') for x in root.xpath('//small'): if x.text and len(x) == 0: x.text = x.text.upper() x.tag = 'span' x.set('style', 'font-variant: small-caps') + for h2 in root.xpath('//h2'): + h2.tag = 'h4' for x in root.xpath('//figcaption'): x.set('style', 'text-align:center; font-size:small;') for x in root.xpath('//cite'): @@ -239,17 +400,8 @@ class Economist(BasicNewsRecipe): x.set('style', 'color:#404040;') raw = etree.tostring(root, encoding='unicode') return raw - - def publication_date(self): - if edition_date: - return parse_only_date(edition_date, as_utc=False) - url = self.browser.open("https://www.economist.com/printedition").geturl() - return parse_only_date(url.split("/")[-1], as_utc=False) - - def parse_index(self): - # return [('Articles', [{'title':'test', - # 'url':'https://www.economist.com/interactive/briefing/2022/06/11/huge-foundation-models-are-turbo-charging-ai-progress' - # }])] + def parse_index_from_printedition(self): + # return self.economist_test_article() if edition_date: url = 'https://www.economist.com/weeklyedition/' + edition_date self.timefmt = ' [' + edition_date + ']' @@ -276,33 +428,6 @@ class Economist(BasicNewsRecipe): ) return ans - def economist_parse_index(self, soup): - script_tag = soup.find("script", id="__NEXT_DATA__") - if script_tag is not None: - data = json.loads(script_tag.string) - # open('/t/raw.json', 'w').write(json.dumps(data, indent=2, sort_keys=True)) - # self.title = 'The Economist | ' + safe_dict(data, "props", "pageProps", "content", "image", "main", "headline") - self.timefmt = ' [' + safe_dict(data, "props", "pageProps", "content", "datePublishedString") + ']' - self.cover_url = safe_dict(data, "props", "pageProps", "content", "image", "main", "url", "canonical") - self.log('Got cover:', self.cover_url) - - feeds_dict = defaultdict(list) - for part in safe_dict(data, "props", "pageProps", "content", "hasPart", "parts"): - section = safe_dict(part, "print", "section", "headline") or '' - title = safe_dict(part, "headline") or '' - url = safe_dict(part, "url", "canonical") or '' - if not section or not title or not url: - continue - desc = safe_dict(part, "description") or '' - sub = safe_dict(part, "subheadline") or '' - if sub and section != sub: - desc = sub + ' :: ' + desc - feeds_dict[section].append({"title": title, "url": url, "description": desc}) - self.log(' ', title, url, '\n ', desc) - return [(section, articles) for section, articles in feeds_dict.items()] - else: - return [] - def eco_find_image_tables(self, soup): for x in soup.findAll('table', align=['right', 'center']): if len(x.findAll('font')) in (1, 2) and len(x.findAll('img')) == 1: @@ -330,3 +455,12 @@ class Economist(BasicNewsRecipe): if url.endswith('/print'): url = url.rpartition('/')[0] return BasicNewsRecipe.canonicalize_internal_url(self, url, is_link=is_link) + + +def get_login_cookies(username, password): + print(33333333333, username, 
password) + + +if __name__ == '__main__': + import sys + get_login_cookies(sys.argv[-2], sys.argv[-1]) diff --git a/recipes/el_correo.recipe b/recipes/el_correo.recipe index 4f03835d4684..f83ee410df07 100644 --- a/recipes/el_correo.recipe +++ b/recipes/el_correo.recipe @@ -19,6 +19,8 @@ class elcorreo(BasicNewsRecipe): encoding = 'utf-8' remove_empty_feeds = True resolve_internal_links = True + max_articles_per_feed = 25 # articles + compress_news_images = True extra_css = ''' .v-mdl-ath__inf, .v-mdl-ath__p--2, .v-mdl-ath__p {font-size:small; color:#404040;} diff --git a/recipes/financial_times.recipe b/recipes/financial_times.recipe index a4c6dfc507d7..62384d6aedcb 100644 --- a/recipes/financial_times.recipe +++ b/recipes/financial_times.recipe @@ -146,6 +146,13 @@ class ft(BasicNewsRecipe): return html def preprocess_html(self, soup): + p = soup.find(**classes('o-topper__standfirst')) + if p: + p.name = 'p' + for table in soup.findAll('table'): + if len(table.find('tbody').findAll('tr')) > 20: + table.find('tbody').decompose() + table.string = '** a table that was supposed to be here has been removed.' for con in soup.findAll(attrs={'class':'n-content-layout__slot'}): if con.find('figure'): con['id'] = 'fig' diff --git a/recipes/icons/dilema.png b/recipes/icons/dilema.png new file mode 100644 index 000000000000..86df3de293df Binary files /dev/null and b/recipes/icons/dilema.png differ diff --git a/recipes/icons/internazionale.png b/recipes/icons/internazionale.png new file mode 100644 index 000000000000..09447924625d Binary files /dev/null and b/recipes/icons/internazionale.png differ diff --git a/recipes/icons/parool.png b/recipes/icons/parool.png new file mode 100644 index 000000000000..27af33f1ddd2 Binary files /dev/null and b/recipes/icons/parool.png differ diff --git a/recipes/icons/revista22.png b/recipes/icons/revista22.png new file mode 100644 index 000000000000..41b27a753353 Binary files /dev/null and b/recipes/icons/revista22.png differ diff --git a/recipes/icons/volksrant.png b/recipes/icons/volksrant.png index 57349203ab8e..ec3d3c8a0f1a 100644 Binary files a/recipes/icons/volksrant.png and b/recipes/icons/volksrant.png differ diff --git a/recipes/internazionale.recipe b/recipes/internazionale.recipe new file mode 100644 index 000000000000..d2c1a0bbfc07 --- /dev/null +++ b/recipes/internazionale.recipe @@ -0,0 +1,121 @@ +#!/usr/bin/env python +from calibre.web.feeds.recipes import BasicNewsRecipe + + +class Volkskrant(BasicNewsRecipe): + title = 'Internazionale' + __author__ = 'Cristi Ghera' + max_articles_per_feed = 100 + description = 'Internazionale - Notizie dall’Italia e dal mondo' + needs_subscription = False + language = 'it' + country = 'IT' + category = 'news, politics, Italy, world' + resolve_internal_links = True + remove_tags_before = { 'name': 'article' } + remove_tags_after = { 'name': 'article' } + remove_tags = [ + dict( + attrs={ + 'class': [ + 'item-banner', + 'hentryfeed__side', + 'magazine-article-share-tools', + 'magazine-article-share-popup', + 'article_next', + 'cta_nl_ext_container', + ] + } + ), + dict(name=['script', 'style']), + ] + remove_attributes = ["class", "id", "name", "style"] + encoding = 'utf-8' + no_stylesheets = True + ignore_duplicate_articles = {'url'} + + current_number_url = "https://www.internazionale.it/sommario" + home_url = "https://www.internazionale.it" + cover_url = None + + def extract_article(self, article): + url = article.find('a')['href'] + if url[0] == '/': + url = self.home_url + url + title_parts = [] + tag = 
article.find('div', {'class': 'abstract-article__tag'}) + if tag: + title_parts.append(self.tag_to_string(tag).upper()) + title_parts.append(self.tag_to_string(article.find('div', {'class': 'abstract-article__title'}))) + article_title = ' \u2022 '.join(title_parts) + pubdate='' + description_parts = [] + author = article.find('div', {'class': 'abstract-article__author'}) + if author: + description_parts.append(self.tag_to_string(author)) + summary = article.find('div', {'class': 'abstract-article__content'}) + if summary: + description_parts.append(self.tag_to_string(summary)) + description = ' \u2022 '.join(description_parts) + return dict( + title=article_title, + url=url, + date=pubdate, + description=description, + content='' + ) + + def parse_index(self): + soup = self.index_to_soup(self.current_number_url) + self.cover_url = soup.find('span', { 'class': 'img_expand' })['data-src'] + main_container = soup.find('div', { 'class': 'content_data' }) + children = main_container.findAll('div', recursive=False) + sections = [] + current_section = None + for container in children: + if 'abstract-testatina' in container['class'] or 'abstract-testatina-cultura' in container['class']: + if current_section: + sections.append(current_section) + current_section = (self.tag_to_string(container), []) + continue + + if 'masonry-items' in container['class']: + for article in container.findAll('div', {'class': 'abstract-article'}): + current_section[1].append(self.extract_article(article)) + continue + + if 'abstract-article' in container['class']: + current_section[1].append(self.extract_article(container)) + continue + + # print(container['class']) + if current_section: + sections.append(current_section) + return sections + + def preprocess_html(self, soup): + for node in soup.findAll('figure'): + img_src = None + image_attributes = [ + 'data-media1024', + 'data-media1025', + 'data-media641', + 'data-media321', + 'data-media', + ] + for attr in image_attributes: + if node.has_attr(attr): + img_src = node[attr] + break + node.name = 'div' + if img_src: + img = soup.new_tag('img', src=img_src) + node.insert(0, img) + for node in soup.findAll('figcaption'): + node.name = 'div' + # if self.browser.cookiejar: + # self.browser.cookiejar.clear() + return soup + + def get_cover_url(self): + return self.cover_url diff --git a/recipes/mit_technology_review.recipe b/recipes/mit_technology_review.recipe index 709962c0ba8d..08bbd0b9fc6a 100644 --- a/recipes/mit_technology_review.recipe +++ b/recipes/mit_technology_review.recipe @@ -76,7 +76,9 @@ class MitTechnologyReview(BasicNewsRecipe): soup = self.index_to_soup(self.INDEX) issue = soup.find(attrs={'class':lambda x: x and x.startswith('magazineHero__title')}) time = soup.find(attrs={'class': lambda x: x and x.startswith('magazineHero__date')}) + desc = soup.find(attrs={'class': lambda x: x and x.startswith('magazineHero__description')}) self.title = 'MIT Tech Review ' + self.tag_to_string(issue) + self.description = self.tag_to_string(desc) self.timefmt = ' [' + self.tag_to_string(time) + ']' self.log('Downloading issue: ', self.timefmt) diff --git a/recipes/parool.recipe b/recipes/parool.recipe new file mode 100644 index 000000000000..4f19e35d3975 --- /dev/null +++ b/recipes/parool.recipe @@ -0,0 +1,99 @@ +#!/usr/bin/env python +import json +import uuid +from contextlib import closing + +from calibre.web.feeds.recipes import BasicNewsRecipe +from mechanize import Request + + +class Parool(BasicNewsRecipe): + title = 'Het Parool' + __author__ = 'Cristi Ghera' + 
max_articles_per_feed = 100
+    description = 'Het Parool - Vrij, Onverveerd'
+    needs_subscription = False
+    language = 'nl'
+    country = 'NL'
+    category = 'news, politics, Netherlands'
+    resolve_internal_links = True
+    remove_tags_before = dict(id='main-content')
+    remove_tags_after = dict(id='main-content')
+    remove_tags = [
+        dict(attrs={'class':['article-footer__sharing', 'artstyle__editorial-tips', 'artstyle__advertisement',
+                             'artstyle__container__icon','artstyle__disabled-embed','container__title__icon',]}),
+        dict(attrs={'data-element-id': ['article-element-authors']}),
+        dict(name=['script', 'noscript', 'style']),
+    ]
+    remove_attributes = ["class", "id", "name", "style"]
+    encoding = 'utf-8'
+    no_stylesheets = True
+    ignore_duplicate_articles = {'url'}
+
+    def parse_index(self):
+        soup = self.index_to_soup('https://www.parool.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId=' + str(uuid.uuid4()))
+        containers = soup.findAll('section', attrs={'class': 'section--horizontal'})
+        sections = []
+        for container in containers:
+            section_title = self.tag_to_string(container.find('h2')).strip()
+            articles = []
+
+            for art in container.findAll('article'):
+                a = art.find('a')
+                url = a['href']
+                if url[0] == '/':
+                    url = 'https://www.parool.nl' + url
+                if '/editie/' not in url:
+                    continue
+                header = a.find('header')
+                teaser_label = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__label'})).strip()
+                teaser_sublabel = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__sublabel'})).strip()
+                teaser_title = self.tag_to_string(header.find('h3').find('span', attrs={'class': 'teaser__title__value--short'})).strip()
+                ignore = {"dirkjan", "s1ngle", "pukkels", "hein de kort"}
+                if teaser_label.lower() in ignore:
+                    continue
+                parts = []
+                if teaser_label:
+                    parts.append(teaser_label.upper())
+                if teaser_sublabel:
+                    parts.append(teaser_sublabel)
+                if teaser_title:
+                    parts.append(teaser_title)
+                article_title = ' \u2022 '.join(parts)
+                articles.append(dict(title=article_title,
+                                     url=url,
+                                     content=''))
+
+            sections.append((section_title, articles))
+        return sections
+
+    def preprocess_html(self, soup):
+        for tag in soup():
+            if tag.name == 'img':
+                if tag['src'][0] == '/':
+                    tag['src'] = 'https://www.parool.nl' + tag['src']
+        for tag in soup():
+            if tag.name == "picture":
+                tag.replaceWith(tag.find("img"))
+        comic_articles = {
+            "Alle strips van Dirkjan",
+            "S1NGLE",
+            "Pukkels",
+            "Bekijk hier alle cartoons van Hein de Kort",
+        }
+        if self.tag_to_string(soup.find('h1')).strip() in comic_articles:
+            for node in soup.find('figure').find_next_siblings():
+                node.extract()
+        return soup
+
+    def get_cover_url(self):
+        headers = {
+            'X-Requested-With': 'XMLHttpRequest',
+            'Accept': 'application/json, text/javascript, */*; q=0.01',
+            'DNT': '1',
+        }
+        url = "https://login-api.e-pages.dk/v1/krant.parool.nl/folders"
+        with closing(self.browser.open(Request(url, None, headers))) as r:
+            folders = json.loads(r.read())
+            return folders["objects"][0]["teaser_medium"]
+        return None
diff --git a/recipes/revista22.recipe b/recipes/revista22.recipe
new file mode 100644
index 000000000000..7d2a55b2f1ab
--- /dev/null
+++ b/recipes/revista22.recipe
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+
+class Revista22(BasicNewsRecipe):
+    title = 'Revista 22'
+    __author__ = 'Cristi Ghera'
+    max_articles_per_feed = 100
+    description = 'Revista 22'
+    needs_subscription = False
+    language = 'ro'
+    country = 'RO'
+
category = 'news, politics, Romania' + resolve_internal_links = True + remove_tags_before = { 'class': 'col-span-8' } + remove_tags_after = { 'class': 'col-span-8' } + remove_tags = [ + dict( + attrs={ + 'class': [ + 'icons', + 'float-left', + 'samesection', + ] + } + ), + dict( + name=['div'], + attrs={ + 'class': ['mb-2'] + } + ), + dict(id=['comments']), + dict(name=['script', 'noscript', 'style']), + ] + remove_attributes = ["class", "id", "name", "style"] + encoding = 'utf-8' + no_stylesheets = True + ignore_duplicate_articles = {'url'} + + def parse_index(self): + soup = self.index_to_soup('https://revista22.ro') + url = soup.find('div', attrs={'class': 'uppercase'}).find('a').attrs['href'] + if url[0] == '/': + url = 'https://revista22.ro' + url + soup = self.index_to_soup(url) + main_container = soup.find('div', attrs={'class': 'col-span-8'}) + containers = main_container.findAll(attrs={'class': 'mb-4'}) + articles = [] + for container in containers: + if 'pb-4' not in container.attrs['class']: + continue + a = container.find('a') + url = a['href'] + if url[0] == '/': + url = 'https://revista22.ro' + url + article_title = self.tag_to_string(a.find('h3')).strip() + author = self.tag_to_string( + container.find('span', attrs={'class': 'text-red'}) + ).strip() + summary = self.tag_to_string(container.find('p')).strip() + pubdate = self.tag_to_string(a.find('span')) + description = author + ' - ' + summary + articles.append( + dict( + title=article_title, + url=url, + date=pubdate, + description=description, + content='' + ) + ) + + sections = [('Numărul curent', articles)] + return sections diff --git a/recipes/the_week_magazine_free.recipe b/recipes/the_week_magazine_free.recipe index c2d3717c3058..3f1f0b2dc2d1 100644 --- a/recipes/the_week_magazine_free.recipe +++ b/recipes/the_week_magazine_free.recipe @@ -65,7 +65,7 @@ class TheWeek(BasicNewsRecipe): ] remove_tags = [ - dict(name='aside'), + dict(name=['aside', 'source']), classes( 'blueconic-article__wrapper ad-unit van_vid_carousel tag-links' ) diff --git a/recipes/the_week_uk.recipe b/recipes/the_week_uk.recipe index 0ccf8b9902a9..7652ce29bfdd 100644 --- a/recipes/the_week_uk.recipe +++ b/recipes/the_week_uk.recipe @@ -65,7 +65,7 @@ class TheWeek(BasicNewsRecipe): ] remove_tags = [ - dict(name='aside'), + dict(name=['aside', 'source']), classes( 'blueconic-article__wrapper ad-unit van_vid_carousel tag-links' ) diff --git a/recipes/volksrant.recipe b/recipes/volksrant.recipe index 9a116aa7cee8..da3b850ec483 100644 --- a/recipes/volksrant.recipe +++ b/recipes/volksrant.recipe @@ -1,7 +1,10 @@ #!/usr/bin/env python +import json import uuid +from contextlib import closing from calibre.web.feeds.recipes import BasicNewsRecipe +from mechanize import Request class Volkskrant(BasicNewsRecipe): @@ -95,4 +98,25 @@ class Volkskrant(BasicNewsRecipe): if tag.name == 'img': if tag['src'][0] == '/': tag['src'] = 'https://www.volkskrant.nl' + tag['src'] + + for tag in soup(): + if tag.name == "picture": + tag.replaceWith(tag.find("img")) + + comic_articles = { "Bas van der Schot", "Poldermodellen", "Gummbah", "Sigmund" } + if self.tag_to_string(soup.find('h1')).strip() in comic_articles: + for node in soup.find('figure').find_next_siblings(): + node.extract() return soup + + def get_cover_url(self): + headers = { + 'X-Requested-With': 'XMLHttpRequest', + 'Accept': 'application/json, text/javascript, */*; q=0.01', + 'DNT': '1', + } + url = "https://login-api.e-pages.dk/v1/krant.volkskrant.nl/folders" + with 
closing(self.browser.open(Request(url, None, headers))) as r:
+            folders = json.loads(r.read())
+            return folders["objects"][0]["teaser_medium"]
+        return None
diff --git a/setup/polib.py b/setup/polib.py
new file mode 100644
index 000000000000..f45caa2e1e60
--- /dev/null
+++ b/setup/polib.py
@@ -0,0 +1,1821 @@
+# -*- coding: utf-8 -*-
+#
+# License: MIT (see LICENSE file provided)
+# vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
+
+"""
+**polib** allows you to manipulate, create, modify gettext files (pot, po and
+mo files). You can load existing files, iterate through its entries, add,
+modify entries, comments or metadata, etc. or create new po files from scratch.
+
+**polib** provides a simple and pythonic API via the :func:`~polib.pofile` and
+:func:`~polib.mofile` convenience functions.
+"""
+
+import array
+import codecs
+import os
+import re
+import struct
+import sys
+import textwrap
+import io
+
+
+__author__ = 'David Jean Louis '
+__version__ = '1.2.0'
+__all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry',
+           'default_encoding', 'escape', 'unescape', 'detect_encoding', ]
+
+
+# the default encoding to use when encoding cannot be detected
+default_encoding = 'utf-8'
+
+# python 2/3 compatibility helpers {{{
+
+
+if sys.version_info < (3,):
+    PY3 = False
+    text_type = unicode
+
+    def b(s):
+        return s
+
+    def u(s):
+        return unicode(s, "unicode_escape")
+
+else:
+    PY3 = True
+    text_type = str
+
+    def b(s):
+        return s.encode("latin-1")
+
+    def u(s):
+        return s
+# }}}
+# _pofile_or_mofile {{{
+
+
+def _pofile_or_mofile(f, type, **kwargs):
+    """
+    Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to
+    honor the DRY concept.
+    """
+    # get the file encoding
+    enc = kwargs.get('encoding')
+    if enc is None:
+        enc = detect_encoding(f, type == 'mofile')
+
+    # parse the file
+    kls = type == 'pofile' and _POFileParser or _MOFileParser
+    parser = kls(
+        f,
+        encoding=enc,
+        check_for_duplicates=kwargs.get('check_for_duplicates', False),
+        klass=kwargs.get('klass')
+    )
+    instance = parser.parse()
+    instance.wrapwidth = kwargs.get('wrapwidth', 78)
+    return instance
+# }}}
+# _is_file {{{
+
+
+def _is_file(filename_or_contents):
+    """
+    Safely returns the value of os.path.isfile(filename_or_contents).
+
+    Arguments:
+
+    ``filename_or_contents``
+        either a filename, or a string holding the contents of some file.
+        In the latter case, this function will always return False.
+    """
+    try:
+        return os.path.isfile(filename_or_contents)
+    except (TypeError, ValueError, UnicodeEncodeError):
+        return False
+# }}}
+# function pofile() {{{
+
+
+def pofile(pofile, **kwargs):
+    """
+    Convenience function that parses the po or pot file ``pofile`` and returns
+    a :class:`~polib.POFile` instance.
+
+    Arguments:
+
+    ``pofile``
+        string, full or relative path to the po/pot file or its content (data).
+
+    ``wrapwidth``
+        integer, the wrap width, only useful when the ``-w`` option was passed
+        to xgettext (optional, default: ``78``).
+
+    ``encoding``
+        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
+        encoding will be auto-detected).
+
+    ``check_for_duplicates``
+        whether to check for duplicate entries when adding entries to the
+        file (optional, default: ``False``).
+
+    ``klass``
+        class which is used to instantiate the return value (optional,
+        default: ``None``, the return value will be a :class:`~polib.POFile`
+        instance).
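+
+    A minimal usage sketch (the path ``locale/app.po`` is a hypothetical
+    example)::
+
+        import polib
+        po = polib.pofile('locale/app.po')
+        for entry in po.translated_entries():
+            print(entry.msgid, '->', entry.msgstr)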
+    """
+    return _pofile_or_mofile(pofile, 'pofile', **kwargs)
+# }}}
+# function mofile() {{{
+
+
+def mofile(mofile, **kwargs):
+    """
+    Convenience function that parses the mo file ``mofile`` and returns a
+    :class:`~polib.MOFile` instance.
+
+    Arguments:
+
+    ``mofile``
+        string, full or relative path to the mo file or its content (string
+        or bytes).
+
+    ``wrapwidth``
+        integer, the wrap width, only useful when the ``-w`` option was passed
+        to xgettext to generate the po file that was used to format the mo file
+        (optional, default: ``78``).
+
+    ``encoding``
+        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
+        encoding will be auto-detected).
+
+    ``check_for_duplicates``
+        whether to check for duplicate entries when adding entries to the
+        file (optional, default: ``False``).
+
+    ``klass``
+        class which is used to instantiate the return value (optional,
+        default: ``None``, the return value will be a :class:`~polib.MOFile`
+        instance).
+    """
+    return _pofile_or_mofile(mofile, 'mofile', **kwargs)
+# }}}
+# function detect_encoding() {{{
+
+
+def detect_encoding(file, binary_mode=False):
+    """
+    Try to detect the encoding used by the ``file``. The ``file`` argument can
+    be a PO or MO file path or a string containing the contents of the file.
+    If the encoding cannot be detected, the function will return the value of
+    ``default_encoding``.
+
+    Arguments:
+
+    ``file``
+        string, full or relative path to the po/mo file or its content.
+
+    ``binary_mode``
+        boolean, set this to True if ``file`` is a mo file.
+    """
+    PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
+    rxt = re.compile(u(PATTERN))
+    rxb = re.compile(b(PATTERN))
+
+    def charset_exists(charset):
+        """Check whether ``charset`` is valid or not."""
+        try:
+            codecs.lookup(charset)
+        except LookupError:
+            return False
+        return True
+
+    if not _is_file(file):
+        try:
+            match = rxt.search(file)
+        except TypeError:
+            match = rxb.search(file)
+        if match:
+            enc = match.group(1).strip()
+            if not isinstance(enc, text_type):
+                enc = enc.decode('utf-8')
+            if charset_exists(enc):
+                return enc
+    else:
+        # For PY3, always treat as binary
+        if binary_mode or PY3:
+            mode = 'rb'
+            rx = rxb
+        else:
+            mode = 'r'
+            rx = rxt
+        with open(file, mode) as f:
+            for line in f.readlines():
+                match = rx.search(line)
+                if match:
+                    f.close()
+                    enc = match.group(1).strip()
+                    if not isinstance(enc, text_type):
+                        enc = enc.decode('utf-8')
+                    if charset_exists(enc):
+                        return enc
+    return default_encoding
+# }}}
+# function escape() {{{
+
+
+def escape(st):
+    """
+    Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r``, ``\\v``,
+    ``\\b``, ``\\f`` and ``"`` in the given string ``st`` and returns it.
+    """
+    return st.replace('\\', r'\\')\
+             .replace('\t', r'\t')\
+             .replace('\r', r'\r')\
+             .replace('\n', r'\n')\
+             .replace('\v', r'\v')\
+             .replace('\b', r'\b')\
+             .replace('\f', r'\f')\
+             .replace('\"', r'\"')
+# }}}
+# function unescape() {{{
+
+
+def unescape(st):
+    """
+    Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r``, ``\\v``,
+    ``\\b``, ``\\f`` and ``"`` in the given string ``st`` and returns it.
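+
+    For example (illustrative)::
+
+        >>> unescape('double quote: \\"')
+        'double quote: "'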
+ """ + def unescape_repl(m): + m = m.group(1) + if m == 'n': + return '\n' + if m == 't': + return '\t' + if m == 'r': + return '\r' + if m == 'v': + return '\v' + if m == 'b': + return '\b' + if m == 'f': + return '\f' + if m == '\\': + return '\\' + return m # handles escaped double quote + return re.sub(r'\\(\\|n|t|r|v|b|f|")', unescape_repl, st) +# }}} +# function natural_sort() {{{ + + +def natural_sort(lst): + """ + Sort naturally the given list. + Credits: http://stackoverflow.com/a/4836734 + """ + def convert(text): + return int(text) if text.isdigit() else text.lower() + + def alphanum_key(key): + return [convert(c) for c in re.split('([0-9]+)', key)] + + return sorted(lst, key=alphanum_key) + +# }}} +# class _BaseFile {{{ + + +class _BaseFile(list): + """ + Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile` + classes. This class should **not** be instantiated directly. + """ + + def __init__(self, *args, **kwargs): + """ + Constructor, accepts the following keyword arguments: + + ``pofile`` + string, the path to the po or mo file, or its content as a string. + + ``wrapwidth`` + integer, the wrap width, only useful when the ``-w`` option was + passed to xgettext (optional, default: ``78``). + + ``encoding`` + string, the encoding to use, defaults to ``default_encoding`` + global variable (optional). + + ``check_for_duplicates`` + whether to check for duplicate entries when adding entries to the + file, (optional, default: ``False``). + """ + list.__init__(self) + # the opened file handle + pofile = kwargs.get('pofile', None) + if pofile and _is_file(pofile): + self.fpath = pofile + else: + self.fpath = kwargs.get('fpath') + # the width at which lines should be wrapped + self.wrapwidth = kwargs.get('wrapwidth', 78) + # the file encoding + self.encoding = kwargs.get('encoding', default_encoding) + # whether to check for duplicate entries or not + self.check_for_duplicates = kwargs.get('check_for_duplicates', False) + # header + self.header = '' + # both po and mo files have metadata + self.metadata = {} + self.metadata_is_fuzzy = 0 + + def __unicode__(self): + """ + Returns the unicode representation of the file. + """ + ret = [] + entries = [self.metadata_as_entry()] + \ + [e for e in self if not e.obsolete] + for entry in entries: + ret.append(entry.__unicode__(self.wrapwidth)) + for entry in self.obsolete_entries(): + ret.append(entry.__unicode__(self.wrapwidth)) + ret = u('\n').join(ret) + return ret + + if PY3: + def __str__(self): + return self.__unicode__() + else: + def __str__(self): + """ + Returns the string representation of the file. + """ + return unicode(self).encode(self.encoding) + + def __contains__(self, entry): + """ + Overridden ``list`` method to implement the membership test (in and + not in). + The method considers that an entry is in the file if it finds an entry + that has the same msgid (the test is **case sensitive**) and the same + msgctxt (or none for both entries). + + Argument: + + ``entry`` + an instance of :class:`~polib._BaseEntry`. + """ + return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) \ + is not None + + def __eq__(self, other): + return str(self) == str(other) + + def append(self, entry): + """ + Overridden method to check for duplicates entries, if a user tries to + add an entry that is already in the file, the method will raise a + ``ValueError`` exception. + + Argument: + + ``entry`` + an instance of :class:`~polib._BaseEntry`. + """ + # check_for_duplicates may not be defined (yet) when unpickling. 
+ # But if pickling, we never want to check for duplicates anyway. + if getattr(self, 'check_for_duplicates', False) and entry in self: + raise ValueError('Entry "%s" already exists' % entry.msgid) + super(_BaseFile, self).append(entry) + + def insert(self, index, entry): + """ + Overridden method to check for duplicates entries, if a user tries to + add an entry that is already in the file, the method will raise a + ``ValueError`` exception. + + Arguments: + + ``index`` + index at which the entry should be inserted. + + ``entry`` + an instance of :class:`~polib._BaseEntry`. + """ + if self.check_for_duplicates and entry in self: + raise ValueError('Entry "%s" already exists' % entry.msgid) + super(_BaseFile, self).insert(index, entry) + + def metadata_as_entry(self): + """ + Returns the file metadata as a :class:`~polib.POFile` instance. + """ + e = POEntry(msgid='') + mdata = self.ordered_metadata() + if mdata: + strs = [] + for name, value in mdata: + # Strip whitespace off each line in a multi-line entry + strs.append('%s: %s' % (name, value)) + e.msgstr = '\n'.join(strs) + '\n' + if self.metadata_is_fuzzy: + e.flags.append('fuzzy') + return e + + def save(self, fpath=None, repr_method='__unicode__', newline=None): + """ + Saves the po file to ``fpath``. + If it is an existing file and no ``fpath`` is provided, then the + existing file is rewritten with the modified data. + + Keyword arguments: + + ``fpath`` + string, full or relative path to the file. + + ``repr_method`` + string, the method to use for output. + + ``newline`` + string, controls how universal newlines works + """ + if self.fpath is None and fpath is None: + raise IOError('You must provide a file path to save() method') + contents = getattr(self, repr_method)() + if fpath is None: + fpath = self.fpath + if repr_method == 'to_binary': + with open(fpath, 'wb') as fhandle: + fhandle.write(contents) + else: + with io.open( + fpath, + 'w', + encoding=self.encoding, + newline=newline + ) as fhandle: + if not isinstance(contents, text_type): + contents = contents.decode(self.encoding) + fhandle.write(contents) + + # set the file path if not set + if self.fpath is None and fpath: + self.fpath = fpath + + def find(self, st, by='msgid', include_obsolete_entries=False, + msgctxt=False): + """ + Find the entry which msgid (or property identified by the ``by`` + argument) matches the string ``st``. + + Keyword arguments: + + ``st`` + string, the string to search for. + + ``by`` + string, the property to use for comparison (default: ``msgid``). + + ``include_obsolete_entries`` + boolean, whether to also search in entries that are obsolete. + + ``msgctxt`` + string, allows specifying a specific message context for the + search. + """ + if include_obsolete_entries: + entries = self[:] + else: + entries = [e for e in self if not e.obsolete] + matches = [] + for e in entries: + if getattr(e, by) == st: + if msgctxt is not False and e.msgctxt != msgctxt: + continue + matches.append(e) + if len(matches) == 1: + return matches[0] + elif len(matches) > 1: + if not msgctxt: + # find the entry with no msgctx + e = None + for m in matches: + if not m.msgctxt: + e = m + if e: + return e + # fallback to the first entry found + return matches[0] + return None + + def ordered_metadata(self): + """ + Convenience method that returns an ordered version of the metadata + dictionary. The return value is list of tuples (metadata name, + metadata_value). 
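+
+        Well-known header keys such as ``Project-Id-Version`` and
+        ``Content-Type`` come first, in canonical PO header order; any
+        remaining keys follow, sorted naturally.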
+ """ + # copy the dict first + metadata = self.metadata.copy() + data_order = [ + 'Project-Id-Version', + 'Report-Msgid-Bugs-To', + 'POT-Creation-Date', + 'PO-Revision-Date', + 'Last-Translator', + 'Language-Team', + 'Language', + 'MIME-Version', + 'Content-Type', + 'Content-Transfer-Encoding', + 'Plural-Forms' + ] + ordered_data = [] + for data in data_order: + try: + value = metadata.pop(data) + ordered_data.append((data, value)) + except KeyError: + pass + # the rest of the metadata will be alphabetically ordered since there + # are no specs for this AFAIK + for data in natural_sort(metadata.keys()): + value = metadata[data] + ordered_data.append((data, value)) + return ordered_data + + def to_binary(self): + """ + Return the binary representation of the file. + """ + offsets = [] + entries = self.translated_entries() + + # the keys are sorted in the .mo file + def cmp(_self, other): + # msgfmt compares entries with msgctxt if it exists + self_msgid = _self.msgctxt and _self.msgctxt or _self.msgid + other_msgid = other.msgctxt and other.msgctxt or other.msgid + if self_msgid > other_msgid: + return 1 + elif self_msgid < other_msgid: + return -1 + else: + return 0 + # add metadata entry + entries.sort(key=lambda o: o.msgid_with_context.encode('utf-8')) + mentry = self.metadata_as_entry() + entries = [mentry] + entries + entries_len = len(entries) + ids, strs = b(''), b('') + for e in entries: + # For each string, we need size and file offset. Each string is + # NUL terminated; the NUL does not count into the size. + msgid = b('') + if e.msgctxt: + # Contexts are stored by storing the concatenation of the + # context, a byte, and the original string + msgid = self._encode(e.msgctxt + '\4') + if e.msgid_plural: + msgstr = [] + for index in sorted(e.msgstr_plural.keys()): + msgstr.append(e.msgstr_plural[index]) + msgid += self._encode(e.msgid + '\0' + e.msgid_plural) + msgstr = self._encode('\0'.join(msgstr)) + else: + msgid += self._encode(e.msgid) + msgstr = self._encode(e.msgstr) + offsets.append((len(ids), len(msgid), len(strs), len(msgstr))) + ids += msgid + b('\0') + strs += msgstr + b('\0') + + # The header is 7 32-bit unsigned integers. + keystart = 7 * 4 + 16 * entries_len + # and the values start after the keys + valuestart = keystart + len(ids) + koffsets = [] + voffsets = [] + # The string table first has the list of keys, then the list of values. + # Each entry has first the size of the string, then the file offset. + for o1, l1, o2, l2 in offsets: + koffsets += [l1, o1 + keystart] + voffsets += [l2, o2 + valuestart] + offsets = koffsets + voffsets + + output = struct.pack( + "Iiiiiii", + # Magic number + MOFile.MAGIC, + # Version + 0, + # number of entries + entries_len, + # start of key index + 7 * 4, + # start of value index + 7 * 4 + entries_len * 8, + # size and offset of hash table, we don't use hash tables + 0, keystart + + ) + if PY3 and sys.version_info.minor > 1: # python 3.2 or superior + output += array.array("i", offsets).tobytes() + else: + output += array.array("i", offsets).tostring() + output += ids + output += strs + return output + + def _encode(self, mixed): + """ + Encodes the given ``mixed`` argument with the file encoding if and + only if it's an unicode string and returns the encoded string. + """ + if isinstance(mixed, text_type): + mixed = mixed.encode(self.encoding) + return mixed +# }}} +# class POFile {{{ + + +class POFile(_BaseFile): + """ + Po (or Pot) file reader/writer. 
+ This class inherits the :class:`~polib._BaseFile` class and, by extension, + the python ``list`` type. + """ + + def __unicode__(self): + """ + Returns the unicode representation of the po file. + """ + ret, headers = '', self.header.split('\n') + for header in headers: + if not len(header): + ret += "#\n" + elif header[:1] in [',', ':']: + ret += '#%s\n' % header + else: + ret += '# %s\n' % header + + if not isinstance(ret, text_type): + ret = ret.decode(self.encoding) + + return ret + _BaseFile.__unicode__(self) + + def save_as_mofile(self, fpath): + """ + Saves the binary representation of the file to given ``fpath``. + + Keyword argument: + + ``fpath`` + string, full or relative path to the mo file. + """ + _BaseFile.save(self, fpath, 'to_binary') + + def percent_translated(self): + """ + Convenience method that returns the percentage of translated + messages. + """ + total = len([e for e in self if not e.obsolete]) + if total == 0: + return 100 + translated = len(self.translated_entries()) + return int(translated * 100 / float(total)) + + def translated_entries(self): + """ + Convenience method that returns the list of translated entries. + """ + return [e for e in self if e.translated()] + + def untranslated_entries(self): + """ + Convenience method that returns the list of untranslated entries. + """ + return [e for e in self if not e.translated() and not e.obsolete + and not e.fuzzy] + + def fuzzy_entries(self): + """ + Convenience method that returns the list of fuzzy entries. + """ + return [e for e in self if e.fuzzy and not e.obsolete] + + def obsolete_entries(self): + """ + Convenience method that returns the list of obsolete entries. + """ + return [e for e in self if e.obsolete] + + def merge(self, refpot): + """ + Convenience method that merges the current pofile with the pot file + provided. It behaves exactly as the gettext msgmerge utility: + + * comments of this file will be preserved, but extracted comments and + occurrences will be discarded; + * any translations or comments in the file will be discarded, however, + dot comments and file positions will be preserved; + * the fuzzy flags are preserved. + + Keyword argument: + + ``refpot`` + object POFile, the reference catalog. + """ + # Store entries in dict/set for faster access + self_entries = dict( + (entry.msgid_with_context, entry) for entry in self + ) + refpot_msgids = set(entry.msgid_with_context for entry in refpot) + # Merge entries that are in the refpot + for entry in refpot: + e = self_entries.get(entry.msgid_with_context) + if e is None: + e = POEntry() + self.append(e) + e.merge(entry) + # ok, now we must "obsolete" entries that are not in the refpot anymore + for entry in self: + if entry.msgid_with_context not in refpot_msgids: + entry.obsolete = True +# }}} +# class MOFile {{{ + + +class MOFile(_BaseFile): + """ + Mo file reader/writer. + This class inherits the :class:`~polib._BaseFile` class and, by + extension, the python ``list`` type. + """ + MAGIC = 0x950412de + MAGIC_SWAPPED = 0xde120495 + + def __init__(self, *args, **kwargs): + """ + Constructor, accepts all keywords arguments accepted by + :class:`~polib._BaseFile` class. + """ + _BaseFile.__init__(self, *args, **kwargs) + self.magic_number = None + self.version = 0 + + def save_as_pofile(self, fpath): + """ + Saves the mofile as a pofile to ``fpath``. + + Keyword argument: + + ``fpath`` + string, full or relative path to the file. + """ + _BaseFile.save(self, fpath) + + def save(self, fpath=None): + """ + Saves the mofile to ``fpath``. 
+ + Keyword argument: + + ``fpath`` + string, full or relative path to the file. + """ + _BaseFile.save(self, fpath, 'to_binary') + + def percent_translated(self): + """ + Convenience method to keep the same interface with POFile instances. + """ + return 100 + + def translated_entries(self): + """ + Convenience method to keep the same interface with POFile instances. + """ + return self + + def untranslated_entries(self): + """ + Convenience method to keep the same interface with POFile instances. + """ + return [] + + def fuzzy_entries(self): + """ + Convenience method to keep the same interface with POFile instances. + """ + return [] + + def obsolete_entries(self): + """ + Convenience method to keep the same interface with POFile instances. + """ + return [] +# }}} +# class _BaseEntry {{{ + + +class _BaseEntry(object): + """ + Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes. + This class should **not** be instantiated directly. + """ + + def __init__(self, *args, **kwargs): + """ + Constructor, accepts the following keyword arguments: + + ``msgid`` + string, the entry msgid. + + ``msgstr`` + string, the entry msgstr. + + ``msgid_plural`` + string, the entry msgid_plural. + + ``msgstr_plural`` + dict, the entry msgstr_plural lines. + + ``msgctxt`` + string, the entry context (msgctxt). + + ``obsolete`` + bool, whether the entry is "obsolete" or not. + + ``encoding`` + string, the encoding to use, defaults to ``default_encoding`` + global variable (optional). + """ + self.msgid = kwargs.get('msgid', '') + self.msgstr = kwargs.get('msgstr', '') + self.msgid_plural = kwargs.get('msgid_plural', '') + self.msgstr_plural = kwargs.get('msgstr_plural', {}) + self.msgctxt = kwargs.get('msgctxt', None) + self.obsolete = kwargs.get('obsolete', False) + self.encoding = kwargs.get('encoding', default_encoding) + + def __unicode__(self, wrapwidth=78): + """ + Returns the unicode representation of the entry. + """ + if self.obsolete: + delflag = '#~ ' + else: + delflag = '' + ret = [] + # write the msgctxt if any + if self.msgctxt is not None: + ret += self._str_field("msgctxt", delflag, "", self.msgctxt, + wrapwidth) + # write the msgid + ret += self._str_field("msgid", delflag, "", self.msgid, wrapwidth) + # write the msgid_plural if any + if self.msgid_plural: + ret += self._str_field("msgid_plural", delflag, "", + self.msgid_plural, wrapwidth) + if self.msgstr_plural: + # write the msgstr_plural if any + msgstrs = self.msgstr_plural + keys = list(msgstrs) + keys.sort() + for index in keys: + msgstr = msgstrs[index] + plural_index = '[%s]' % index + ret += self._str_field("msgstr", delflag, plural_index, msgstr, + wrapwidth) + else: + # otherwise write the msgstr + ret += self._str_field("msgstr", delflag, "", self.msgstr, + wrapwidth) + ret.append('') + ret = u('\n').join(ret) + return ret + + if PY3: + def __str__(self): + return self.__unicode__() + else: + def __str__(self): + """ + Returns the string representation of the entry. 
+ """ + return unicode(self).encode(self.encoding) + + def __eq__(self, other): + return str(self) == str(other) + + def _str_field(self, fieldname, delflag, plural_index, field, + wrapwidth=78): + lines = field.splitlines(True) + if len(lines) > 1: + lines = [''] + lines # start with initial empty line + else: + escaped_field = escape(field) + specialchars_count = 0 + for c in ['\\', '\n', '\r', '\t', '\v', '\b', '\f', '"']: + specialchars_count += field.count(c) + # comparison must take into account fieldname length + one space + # + 2 quotes (eg. msgid "") + flength = len(fieldname) + 3 + if plural_index: + flength += len(plural_index) + real_wrapwidth = wrapwidth - flength + specialchars_count + if wrapwidth > 0 and len(field) > real_wrapwidth: + # Wrap the line but take field name into account + lines = [''] + [unescape(item) for item in textwrap.wrap( + escaped_field, + wrapwidth - 2, # 2 for quotes "" + drop_whitespace=False, + break_long_words=False + )] + else: + lines = [field] + if fieldname.startswith('previous_'): + # quick and dirty trick to get the real field name + fieldname = fieldname[9:] + + ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index, + escape(lines.pop(0)))] + for line in lines: + ret.append('%s"%s"' % (delflag, escape(line))) + return ret + + @property + def msgid_with_context(self): + if self.msgctxt: + return '%s%s%s' % (self.msgctxt, "\x04", self.msgid) + return self.msgid +# }}} +# class POEntry {{{ + + +class POEntry(_BaseEntry): + """ + Represents a po file entry. + """ + + def __init__(self, *args, **kwargs): + """ + Constructor, accepts the following keyword arguments: + + ``comment`` + string, the entry comment. + + ``tcomment`` + string, the entry translator comment. + + ``occurrences`` + list, the entry occurrences. + + ``flags`` + list, the entry flags. + + ``previous_msgctxt`` + string, the entry previous context. + + ``previous_msgid`` + string, the entry previous msgid. + + ``previous_msgid_plural`` + string, the entry previous msgid_plural. + + ``linenum`` + integer, the line number of the entry + """ + _BaseEntry.__init__(self, *args, **kwargs) + self.comment = kwargs.get('comment', '') + self.tcomment = kwargs.get('tcomment', '') + self.occurrences = kwargs.get('occurrences', []) + self.flags = kwargs.get('flags', []) + self.previous_msgctxt = kwargs.get('previous_msgctxt', None) + self.previous_msgid = kwargs.get('previous_msgid', None) + self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None) + self.linenum = kwargs.get('linenum', None) + + def __unicode__(self, wrapwidth=78): + """ + Returns the unicode representation of the entry. + """ + ret = [] + # comments first, if any (with text wrapping as xgettext does) + if self.obsolete: + comments = [('tcomment', '# ')] + else: + comments = [('tcomment', '# '), ('comment', '#. 
')] + for c in comments: + val = getattr(self, c[0]) + if val: + for comment in val.split('\n'): + if wrapwidth > 0 and len(comment) + len(c[1]) > wrapwidth: + ret += textwrap.wrap( + comment, + wrapwidth, + initial_indent=c[1], + subsequent_indent=c[1], + break_long_words=False + ) + else: + ret.append('%s%s' % (c[1], comment)) + + # occurrences (with text wrapping as xgettext does) + if not self.obsolete and self.occurrences: + filelist = [] + for fpath, lineno in self.occurrences: + if lineno: + filelist.append('%s:%s' % (fpath, lineno)) + else: + filelist.append(fpath) + filestr = ' '.join(filelist) + if wrapwidth > 0 and len(filestr) + 3 > wrapwidth: + # textwrap split words that contain hyphen, this is not + # what we want for filenames, so the dirty hack is to + # temporally replace hyphens with a char that a file cannot + # contain, like "*" + ret += [line.replace('*', '-') for line in textwrap.wrap( + filestr.replace('-', '*'), + wrapwidth, + initial_indent='#: ', + subsequent_indent='#: ', + break_long_words=False + )] + else: + ret.append('#: ' + filestr) + + # flags (TODO: wrapping ?) + if self.flags: + ret.append('#, %s' % ', '.join(self.flags)) + + # previous context and previous msgid/msgid_plural + fields = ['previous_msgctxt', 'previous_msgid', + 'previous_msgid_plural'] + if self.obsolete: + prefix = "#~| " + else: + prefix = "#| " + for f in fields: + val = getattr(self, f) + if val is not None: + ret += self._str_field(f, prefix, "", val, wrapwidth) + + ret.append(_BaseEntry.__unicode__(self, wrapwidth)) + ret = u('\n').join(ret) + return ret + + def __cmp__(self, other): + """ + Called by comparison operations if rich comparison is not defined. + """ + # First: Obsolete test + if self.obsolete != other.obsolete: + if self.obsolete: + return -1 + else: + return 1 + # Work on a copy to protect original + occ1 = sorted(self.occurrences[:]) + occ2 = sorted(other.occurrences[:]) + if occ1 > occ2: + return 1 + if occ1 < occ2: + return -1 + # Compare context + msgctxt = self.msgctxt or '0' + othermsgctxt = other.msgctxt or '0' + if msgctxt > othermsgctxt: + return 1 + elif msgctxt < othermsgctxt: + return -1 + # Compare msgid_plural + msgid_plural = self.msgid_plural or '0' + othermsgid_plural = other.msgid_plural or '0' + if msgid_plural > othermsgid_plural: + return 1 + elif msgid_plural < othermsgid_plural: + return -1 + # Compare msgstr_plural + if self.msgstr_plural and isinstance(self.msgstr_plural, dict): + msgstr_plural = list(self.msgstr_plural.values()) + else: + msgstr_plural = [] + if other.msgstr_plural and isinstance(other.msgstr_plural, dict): + othermsgstr_plural = list(other.msgstr_plural.values()) + else: + othermsgstr_plural = [] + if msgstr_plural > othermsgstr_plural: + return 1 + elif msgstr_plural < othermsgstr_plural: + return -1 + # Compare msgid + if self.msgid > other.msgid: + return 1 + elif self.msgid < other.msgid: + return -1 + # Compare msgstr + if self.msgstr > other.msgstr: + return 1 + elif self.msgstr < other.msgstr: + return -1 + return 0 + + def __gt__(self, other): + return self.__cmp__(other) > 0 + + def __lt__(self, other): + return self.__cmp__(other) < 0 + + def __ge__(self, other): + return self.__cmp__(other) >= 0 + + def __le__(self, other): + return self.__cmp__(other) <= 0 + + def __eq__(self, other): + return self.__cmp__(other) == 0 + + def __ne__(self, other): + return self.__cmp__(other) != 0 + + def translated(self): + """ + Returns ``True`` if the entry has been translated or ``False`` + otherwise. 
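+
+        An entry counts as translated when its ``msgstr`` (or every plural
+        form, for plural entries) is non-empty and it is neither fuzzy nor
+        obsolete.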
+ """ + if self.obsolete or self.fuzzy: + return False + if self.msgstr != '': + return True + if self.msgstr_plural: + for pos in self.msgstr_plural: + if self.msgstr_plural[pos] == '': + return False + return True + return False + + def merge(self, other): + """ + Merge the current entry with the given pot entry. + """ + self.msgid = other.msgid + self.msgctxt = other.msgctxt + self.occurrences = other.occurrences + self.comment = other.comment + fuzzy = self.fuzzy + self.flags = other.flags[:] # clone flags + if fuzzy: + self.flags.append('fuzzy') + self.msgid_plural = other.msgid_plural + self.obsolete = other.obsolete + self.previous_msgctxt = other.previous_msgctxt + self.previous_msgid = other.previous_msgid + self.previous_msgid_plural = other.previous_msgid_plural + if other.msgstr_plural: + for pos in other.msgstr_plural: + try: + # keep existing translation at pos if any + self.msgstr_plural[pos] + except KeyError: + self.msgstr_plural[pos] = '' + + @property + def fuzzy(self): + return 'fuzzy' in self.flags + + @fuzzy.setter + def fuzzy(self, value): + if value and not self.fuzzy: + self.flags.insert(0, 'fuzzy') + elif not value and self.fuzzy: + self.flags.remove('fuzzy') + + def __hash__(self): + return hash((self.msgid, self.msgstr)) +# }}} +# class MOEntry {{{ + + +class MOEntry(_BaseEntry): + """ + Represents a mo file entry. + """ + def __init__(self, *args, **kwargs): + """ + Constructor, accepts the following keyword arguments, + for consistency with :class:`~polib.POEntry`: + + ``comment`` + ``tcomment`` + ``occurrences`` + ``flags`` + ``previous_msgctxt`` + ``previous_msgid`` + ``previous_msgid_plural`` + + Note: even though these keyword arguments are accepted, + they hold no real meaning in the context of MO files + and are simply ignored. + """ + _BaseEntry.__init__(self, *args, **kwargs) + self.comment = '' + self.tcomment = '' + self.occurrences = [] + self.flags = [] + self.previous_msgctxt = None + self.previous_msgid = None + self.previous_msgid_plural = None + + def __hash__(self): + return hash((self.msgid, self.msgstr)) + +# }}} +# class _POFileParser {{{ + + +class _POFileParser(object): + """ + A finite state machine to efficiently and correctly parse po + file format. + """ + + def __init__(self, pofile, *args, **kwargs): + """ + Constructor. + + Keyword arguments: + + ``pofile`` + string, path to the po file or its content + + ``encoding`` + string, the encoding to use, defaults to ``default_encoding`` + global variable (optional). + + ``check_for_duplicates`` + whether to check for duplicate entries when adding entries to the + file (optional, default: ``False``). + """ + enc = kwargs.get('encoding', default_encoding) + if _is_file(pofile): + try: + self.fhandle = io.open(pofile, 'rt', encoding=enc) + except LookupError: + enc = default_encoding + self.fhandle = io.open(pofile, 'rt', encoding=enc) + else: + self.fhandle = pofile.splitlines() + + klass = kwargs.get('klass') + if klass is None: + klass = POFile + self.instance = klass( + pofile=pofile, + encoding=enc, + check_for_duplicates=kwargs.get('check_for_duplicates', False) + ) + self.transitions = {} + self.current_line = 0 + self.current_entry = POEntry(linenum=self.current_line) + self.current_state = 'st' + self.current_token = None + # two memo flags used in handlers + self.msgstr_index = 0 + self.entry_obsolete = 0 + # Configure the state machine, by adding transitions. 
+ # Signification of symbols: + # * ST: Beginning of the file (start) + # * HE: Header + # * TC: a translation comment + # * GC: a generated comment + # * OC: a file/line occurrence + # * FL: a flags line + # * CT: a message context + # * PC: a previous msgctxt + # * PM: a previous msgid + # * PP: a previous msgid_plural + # * MI: a msgid + # * MP: a msgid plural + # * MS: a msgstr + # * MX: a msgstr plural + # * MC: a msgid or msgstr continuation line + all = ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'pc', 'pm', 'pp', 'tc', + 'ms', 'mp', 'mx', 'mi'] + + self.add('tc', ['st', 'he'], 'he') + self.add('tc', ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', + 'mp', 'mx', 'mi'], 'tc') + self.add('gc', all, 'gc') + self.add('oc', all, 'oc') + self.add('fl', all, 'fl') + self.add('pc', all, 'pc') + self.add('pm', all, 'pm') + self.add('pp', all, 'pp') + self.add('ct', ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm', + 'pp', 'ms', 'mx'], 'ct') + self.add('mi', ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc', + 'pm', 'pp', 'ms', 'mx'], 'mi') + self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'], 'mp') + self.add('ms', ['mi', 'mp', 'tc'], 'ms') + self.add('mx', ['mi', 'mx', 'mp', 'tc'], 'mx') + self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc') + + def parse(self): + """ + Run the state machine, parse the file line by line and call process() + with the current matched symbol. + """ + try: + keywords = { + 'msgctxt': 'ct', + 'msgid': 'mi', + 'msgstr': 'ms', + 'msgid_plural': 'mp', + } + prev_keywords = { + 'msgid_plural': 'pp', + 'msgid': 'pm', + 'msgctxt': 'pc', + } + tokens = [] + fpath = '%s ' % self.instance.fpath if self.instance.fpath else '' + for line in self.fhandle: + self.current_line += 1 + if self.current_line == 1: + BOM = codecs.BOM_UTF8.decode('utf-8') + if line.startswith(BOM): + line = line[len(BOM):] + line = line.strip() + if line == '': + continue + + tokens = line.split(None, 2) + nb_tokens = len(tokens) + + if tokens[0] == '#~|': + continue + + if tokens[0] == '#~' and nb_tokens > 1: + line = line[3:].strip() + tokens = tokens[1:] + nb_tokens -= 1 + self.entry_obsolete = 1 + else: + self.entry_obsolete = 0 + + # Take care of keywords like + # msgid, msgid_plural, msgctxt & msgstr. 
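+                # A line like: msgid "Some text"
+                # splits into the keyword token and the quoted string that
+                # follows it; the string becomes the current token below.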
+ if tokens[0] in keywords and nb_tokens > 1: + line = line[len(tokens[0]):].lstrip() + if re.search(r'([^\\]|^)"', line[1:-1]): + raise IOError('Syntax error in po file %s(line %s): ' + 'unescaped double quote found' % + (fpath, self.current_line)) + self.current_token = line + self.process(keywords[tokens[0]]) + continue + + self.current_token = line + + if tokens[0] == '#:': + if nb_tokens <= 1: + continue + # we are on a occurrences line + self.process('oc') + + elif line[:1] == '"': + # we are on a continuation line + if re.search(r'([^\\]|^)"', line[1:-1]): + raise IOError('Syntax error in po file %s(line %s): ' + 'unescaped double quote found' % + (fpath, self.current_line)) + self.process('mc') + + elif line[:7] == 'msgstr[': + # we are on a msgstr plural + self.process('mx') + + elif tokens[0] == '#,': + if nb_tokens <= 1: + continue + # we are on a flags line + self.process('fl') + + elif tokens[0] == '#' or tokens[0].startswith('##'): + if line == '#': + line += ' ' + # we are on a translator comment line + self.process('tc') + + elif tokens[0] == '#.': + if nb_tokens <= 1: + continue + # we are on a generated comment line + self.process('gc') + + elif tokens[0] == '#|': + if nb_tokens <= 1: + raise IOError('Syntax error in po file %s(line %s)' % + (fpath, self.current_line)) + + # Remove the marker and any whitespace right after that. + line = line[2:].lstrip() + self.current_token = line + + if tokens[1].startswith('"'): + # Continuation of previous metadata. + self.process('mc') + continue + + if nb_tokens == 2: + # Invalid continuation line. + raise IOError('Syntax error in po file %s(line %s): ' + 'invalid continuation line' % + (fpath, self.current_line)) + + # we are on a "previous translation" comment line, + if tokens[1] not in prev_keywords: + # Unknown keyword in previous translation comment. + raise IOError('Syntax error in po file %s(line %s): ' + 'unknown keyword %s' % + (fpath, self.current_line, + tokens[1])) + + # Remove the keyword and any whitespace + # between it and the starting quote. + line = line[len(tokens[1]):].lstrip() + self.current_token = line + self.process(prev_keywords[tokens[1]]) + + else: + raise IOError('Syntax error in po file %s(line %s)' % + (fpath, self.current_line)) + + if self.current_entry and len(tokens) > 0 and \ + not tokens[0].startswith('#'): + # since entries are added when another entry is found, we must + # add the last entry here (only if there are lines). Trailing + # comments are ignored + self.instance.append(self.current_entry) + + # before returning the instance, check if there's metadata and if + # so extract it in a dict + metadataentry = self.instance.find('') + if metadataentry: # metadata found + # remove the entry + self.instance.remove(metadataentry) + self.instance.metadata_is_fuzzy = metadataentry.flags + key = None + for msg in metadataentry.msgstr.splitlines(): + try: + key, val = msg.split(':', 1) + self.instance.metadata[key] = val.strip() + except (ValueError, KeyError): + if key is not None: + self.instance.metadata[key] += '\n' + msg.strip() + finally: + # close opened file + if not isinstance(self.fhandle, list): # must be file + self.fhandle.close() + return self.instance + + def add(self, symbol, states, next_state): + """ + Add a transition to the state machine. + + Keywords arguments: + + ``symbol`` + string, the matched token (two chars symbol). + + ``states`` + list, a list of states (two chars symbols). + + ``next_state`` + the next state the fsm will have after the action. 
+ """ + for state in states: + action = getattr(self, 'handle_%s' % next_state) + self.transitions[(symbol, state)] = (action, next_state) + + def process(self, symbol): + """ + Process the transition corresponding to the current state and the + symbol provided. + + Keywords arguments: + + ``symbol`` + string, the matched token (two chars symbol). + + ``linenum`` + integer, the current line number of the parsed file. + """ + try: + (action, state) = self.transitions[(symbol, self.current_state)] + if action(): + self.current_state = state + except Exception: + fpath = '%s ' % self.instance.fpath if self.instance.fpath else '' + if hasattr(self.fhandle, 'close'): + self.fhandle.close() + raise IOError('Syntax error in po file %s(line %s)' % + (fpath, self.current_line)) + + # state handlers + + def handle_he(self): + """Handle a header comment.""" + if self.instance.header != '': + self.instance.header += '\n' + self.instance.header += self.current_token[2:] + return 1 + + def handle_tc(self): + """Handle a translator comment.""" + if self.current_state in ['mc', 'ms', 'mx']: + self.instance.append(self.current_entry) + self.current_entry = POEntry(linenum=self.current_line) + if self.current_entry.tcomment != '': + self.current_entry.tcomment += '\n' + tcomment = self.current_token.lstrip('#') + if tcomment.startswith(' '): + tcomment = tcomment[1:] + self.current_entry.tcomment += tcomment + return True + + def handle_gc(self): + """Handle a generated comment.""" + if self.current_state in ['mc', 'ms', 'mx']: + self.instance.append(self.current_entry) + self.current_entry = POEntry(linenum=self.current_line) + if self.current_entry.comment != '': + self.current_entry.comment += '\n' + self.current_entry.comment += self.current_token[3:] + return True + + def handle_oc(self): + """Handle a file:num occurrence.""" + if self.current_state in ['mc', 'ms', 'mx']: + self.instance.append(self.current_entry) + self.current_entry = POEntry(linenum=self.current_line) + occurrences = self.current_token[3:].split() + for occurrence in occurrences: + if occurrence != '': + try: + fil, line = occurrence.rsplit(':', 1) + if not line.isdigit(): + fil = occurrence + line = '' + self.current_entry.occurrences.append((fil, line)) + except (ValueError, AttributeError): + self.current_entry.occurrences.append((occurrence, '')) + return True + + def handle_fl(self): + """Handle a flags line.""" + if self.current_state in ['mc', 'ms', 'mx']: + self.instance.append(self.current_entry) + self.current_entry = POEntry(linenum=self.current_line) + self.current_entry.flags += [c.strip() for c in + self.current_token[3:].split(',')] + return True + + def handle_pp(self): + """Handle a previous msgid_plural line.""" + if self.current_state in ['mc', 'ms', 'mx']: + self.instance.append(self.current_entry) + self.current_entry = POEntry(linenum=self.current_line) + self.current_entry.previous_msgid_plural = \ + unescape(self.current_token[1:-1]) + return True + + def handle_pm(self): + """Handle a previous msgid line.""" + if self.current_state in ['mc', 'ms', 'mx']: + self.instance.append(self.current_entry) + self.current_entry = POEntry(linenum=self.current_line) + self.current_entry.previous_msgid = \ + unescape(self.current_token[1:-1]) + return True + + def handle_pc(self): + """Handle a previous msgctxt line.""" + if self.current_state in ['mc', 'ms', 'mx']: + self.instance.append(self.current_entry) + self.current_entry = POEntry(linenum=self.current_line) + self.current_entry.previous_msgctxt = \ + 
unescape(self.current_token[1:-1]) + return True + + def handle_ct(self): + """Handle a msgctxt.""" + if self.current_state in ['mc', 'ms', 'mx']: + self.instance.append(self.current_entry) + self.current_entry = POEntry(linenum=self.current_line) + self.current_entry.msgctxt = unescape(self.current_token[1:-1]) + return True + + def handle_mi(self): + """Handle a msgid.""" + if self.current_state in ['mc', 'ms', 'mx']: + self.instance.append(self.current_entry) + self.current_entry = POEntry(linenum=self.current_line) + self.current_entry.obsolete = self.entry_obsolete + self.current_entry.msgid = unescape(self.current_token[1:-1]) + return True + + def handle_mp(self): + """Handle a msgid plural.""" + self.current_entry.msgid_plural = unescape(self.current_token[1:-1]) + return True + + def handle_ms(self): + """Handle a msgstr.""" + self.current_entry.msgstr = unescape(self.current_token[1:-1]) + return True + + def handle_mx(self): + """Handle a msgstr plural.""" + index = self.current_token[7] + value = self.current_token[self.current_token.find('"') + 1:-1] + self.current_entry.msgstr_plural[int(index)] = unescape(value) + self.msgstr_index = int(index) + return True + + def handle_mc(self): + """Handle a msgid or msgstr continuation line.""" + token = unescape(self.current_token[1:-1]) + if self.current_state == 'ct': + self.current_entry.msgctxt += token + elif self.current_state == 'mi': + self.current_entry.msgid += token + elif self.current_state == 'mp': + self.current_entry.msgid_plural += token + elif self.current_state == 'ms': + self.current_entry.msgstr += token + elif self.current_state == 'mx': + self.current_entry.msgstr_plural[self.msgstr_index] += token + elif self.current_state == 'pp': + self.current_entry.previous_msgid_plural += token + elif self.current_state == 'pm': + self.current_entry.previous_msgid += token + elif self.current_state == 'pc': + self.current_entry.previous_msgctxt += token + # don't change the current state + return False +# }}} +# class _MOFileParser {{{ + + +class _MOFileParser(object): + """ + A class to parse binary mo files. + """ + + def __init__(self, mofile, *args, **kwargs): + """ + Constructor. + + Keyword arguments: + + ``mofile`` + string, path to the mo file or its content + + ``encoding`` + string, the encoding to use, defaults to ``default_encoding`` + global variable (optional). + + ``check_for_duplicates`` + whether to check for duplicate entries when adding entries to the + file (optional, default: ``False``). + """ + if _is_file(mofile): + self.fhandle = open(mofile, 'rb') + else: + self.fhandle = io.BytesIO(mofile) + + klass = kwargs.get('klass') + if klass is None: + klass = MOFile + self.instance = klass( + fpath=mofile, + encoding=kwargs.get('encoding', default_encoding), + check_for_duplicates=kwargs.get('check_for_duplicates', False) + ) + + def __del__(self): + """ + Make sure the file is closed, this prevents warnings on unclosed file + when running tests with python >= 3.2. + """ + if self.fhandle and hasattr(self.fhandle, 'close'): + self.fhandle.close() + + def parse(self): + """ + Build the instance with the file handle provided in the + constructor. 
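+
+        The data read here follows the GNU gettext MO layout: a magic
+        number, 32-bit header fields, then offset tables for the original
+        and the translated strings.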
+        """
+        # parse magic number
+        magic_number = self._readbinary('<I', 4)
+        if magic_number == MOFile.MAGIC:
+            ii = '<II'
+        elif magic_number == MOFile.MAGIC_SWAPPED:
+            ii = '>II'
+        else:
+            raise IOError('Invalid mo file, magic number is incorrect !')
+        self.instance.magic_number = magic_number
+        # parse the version number and the number of strings
+        version, numofstrings = self._readbinary(ii, 8)
+        # from MO file format specs: "A program seeing an unexpected major
+        # revision number should stop reading the MO file entirely"
+        if version >> 16 not in (0, 1):
+            raise IOError('Invalid mo file, unexpected major revision number')
+        self.instance.version = version
+        # original strings and translation strings hash table offset
+        msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8)
+        # move to msgid hash table and read length and offset of msgids
+        self.fhandle.seek(msgids_hash_offset)
+        msgids_index = []
+        for i in range(numofstrings):
+            msgids_index.append(self._readbinary(ii, 8))
+        # move to msgstr hash table and read length and offset of msgstrs
+        self.fhandle.seek(msgstrs_hash_offset)
+        msgstrs_index = []
+        for i in range(numofstrings):
+            msgstrs_index.append(self._readbinary(ii, 8))
+        # build entries
+        encoding = self.instance.encoding
+        for i in range(numofstrings):
+            self.fhandle.seek(msgids_index[i][1])
+            msgid = self.fhandle.read(msgids_index[i][0])
+
+            self.fhandle.seek(msgstrs_index[i][1])
+            msgstr = self.fhandle.read(msgstrs_index[i][0])
+            if i == 0 and not msgid:  # metadata
+                raw_metadata, metadata = msgstr.split(b('\n')), {}
+                for line in raw_metadata:
+                    tokens = line.split(b(':'), 1)
+                    if tokens[0] != b(''):
+                        try:
+                            k = tokens[0].decode(encoding)
+                            v = tokens[1].decode(encoding)
+                            metadata[k] = v.strip()
+                        except IndexError:
+                            metadata[k] = u('')
+                self.instance.metadata = metadata
+                continue
+            # test if we have a plural entry
+            msgid_tokens = msgid.split(b('\0'))
+            if len(msgid_tokens) > 1:
+                entry = self._build_entry(
+                    msgid=msgid_tokens[0],
+                    msgid_plural=msgid_tokens[1],
+                    msgstr_plural=dict((k, v) for k, v in
+                                       enumerate(msgstr.split(b('\0'))))
+                )
+            else:
+                entry = self._build_entry(msgid=msgid, msgstr=msgstr)
+            self.instance.append(entry)
+        # close opened file
+        self.fhandle.close()
+        return self.instance
+
+    def _build_entry(self, msgid, msgstr=None, msgid_plural=None,
+                     msgstr_plural=None):
+        msgctxt_msgid = msgid.split(b('\x04'))
+        encoding = self.instance.encoding
+        if len(msgctxt_msgid) > 1:
+            kwargs = {
+                'msgctxt': msgctxt_msgid[0].decode(encoding),
+                'msgid': msgctxt_msgid[1].decode(encoding),
+            }
+        else:
+            kwargs = {'msgid': msgid.decode(encoding)}
+        if msgstr:
+            kwargs['msgstr'] = msgstr.decode(encoding)
+        if msgid_plural:
+            kwargs['msgid_plural'] = msgid_plural.decode(encoding)
+        if msgstr_plural:
+            for k in msgstr_plural:
+                msgstr_plural[k] = msgstr_plural[k].decode(encoding)
+            kwargs['msgstr_plural'] = msgstr_plural
+        return MOEntry(**kwargs)
+
+    def _readbinary(self, fmt, numbytes):
+        """
+        Private method that unpack n bytes of data using format <fmt>.
+        It returns a tuple or a mixed value if the tuple length is 1.
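+
+        For example, ``self._readbinary('<I', 4)`` reads a single
+        little-endian unsigned 32-bit integer from the file handle.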
+        """
+        bytes = self.fhandle.read(numbytes)
+        tup = struct.unpack(fmt, bytes)
+        if len(tup) == 1:
+            return tup[0]
+        return tup
+# }}}
diff --git a/setup/publish.py b/setup/publish.py
index eb831302437c..968860a8f558 100644
--- a/setup/publish.py
+++ b/setup/publish.py
@@ -110,7 +110,8 @@ def pre_sub_commands(self, opts):
         if 'PUBLISH_BUILD_DONE' not in os.environ:
             subprocess.check_call([sys.executable, 'setup.py', 'check'])
             subprocess.check_call([sys.executable, 'setup.py', 'build'])
-            subprocess.check_call([sys.executable, 'setup.py', 'test'])
+            if 'SKIP_CALIBRE_TESTS' not in os.environ:
+                subprocess.check_call([sys.executable, 'setup.py', 'test'])
             subprocess.check_call([sys.executable, 'setup.py', 'pot'])
             subprocess.check_call([sys.executable, 'setup.py', 'translations'])
             os.environ['PUBLISH_BUILD_DONE'] = '1'
@@ -194,7 +195,9 @@ def run(self, opts):
         languages = opts.language or list(
             json.load(open(self.j(base, 'locale', 'completed.json'), 'rb'))
         )
-        languages = ['en'] + list(set(languages) - {'en'})
+        languages = set(languages) - {'en'}
+        languages.discard('ta')  # Tamil translations break Sphinx
+        languages = ['en'] + list(languages)
         os.environ['ALL_USER_MANUAL_LANGUAGES'] = ' '.join(languages)
         for language in languages:
             jobs.append(create_job([
@@ -284,8 +287,9 @@ def build_man_pages(self, dest, compress=False):
             shutil.rmtree(dest)
         os.makedirs(dest)
         base = self.j(self.d(self.SRC), 'manual')
-        languages = list(available_translations())
-        languages = ['en'] + list(set(languages) - {'en', 'en_GB'})
+        languages = set(available_translations())
+        languages.discard('ta')  # Tamil translations are completely broken and break Sphinx
+        languages = ['en'] + list(languages - {'en', 'en_GB'})
         os.environ['ALL_USER_MANUAL_LANGUAGES'] = ' '.join(languages)
         try:
             os.makedirs(dest)
diff --git a/setup/translations.py b/setup/translations.py
index 79a30beca66c..81fd3da758f1 100644
--- a/setup/translations.py
+++ b/setup/translations.py
@@ -642,9 +642,11 @@ def check_all(self):
         self.upload_to_vcs('Fixed translations')

     def check_for_user_manual_errors(self):
+        sys.path.insert(0, self.j(self.d(self.SRC), 'setup'))
+        import polib
+        del sys.path[0]
         self.info('Checking user manual translations...')
         srcbase = self.j(self.d(self.SRC), 'translations', 'manual')
-        import polib
         changes = defaultdict(set)
         for lang in os.listdir(srcbase):
             if lang.startswith('en_') or lang == 'en':
diff --git a/src/calibre/constants.py b/src/calibre/constants.py
index 3b44abf9f148..25f8a8df9b63 100644
--- a/src/calibre/constants.py
+++ b/src/calibre/constants.py
@@ -11,7 +11,7 @@ from polyglot.builtins import environ_item, hasenv

 __appname__ = 'calibre'
-numeric_version = (7, 9, 0)
+numeric_version = (7, 10, 0)
 __version__ = '.'.join(map(str, numeric_version))
 git_version = None
 __author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
diff --git a/src/calibre/db/cache.py b/src/calibre/db/cache.py
index b7881d0ce06f..aae4e85de0fb 100644
--- a/src/calibre/db/cache.py
+++ b/src/calibre/db/cache.py
@@ -15,7 +15,6 @@ import weakref
 from collections import defaultdict
 from collections.abc import MutableSet, Set
-from contextlib import closing
 from functools import partial, wraps
 from io import DEFAULT_BUFFER_SIZE, BytesIO
 from queue import Queue
@@ -46,7 +45,7 @@
 from calibre.ebooks.metadata.opf2 import metadata_to_opf
 from calibre.ptempfile import PersistentTemporaryFile, SpooledTemporaryFile, base_dir
 from calibre.utils.config import prefs, tweaks
-from calibre.utils.date import UNDEFINED_DATE, utcnow
+from calibre.utils.date import UNDEFINED_DATE, timestampfromdt, utcnow
from calibre.utils.date import now as nowf from calibre.utils.filenames import make_long_path_useable from calibre.utils.icu import lower as icu_lower @@ -3158,11 +3157,14 @@ def report_progress(fname): mdata = self.format_metadata(book_id, fmt) key = f'{key_prefix}:{book_id}:{fmt}' fm[fmt] = key - with exporter.start_file(key, mtime=mdata.get('mtime')) as dest: - self._copy_format_to(book_id, fmt, dest, report_file_size=dest.ensure_space) + mtime = mdata.get('mtime') + if mtime is not None: + mtime = timestampfromdt(mtime) + with exporter.start_file(key, mtime=mtime) as dest: + self._copy_format_to(book_id, fmt, dest) cover_key = '{}:{}:{}'.format(key_prefix, book_id, '.cover') with exporter.start_file(cover_key) as dest: - if not self.copy_cover_to(book_id, dest, report_file_size=dest.ensure_space): + if not self.copy_cover_to(book_id, dest): dest.discard() else: fm['.cover'] = cover_key @@ -3439,6 +3441,7 @@ def is_null_date(x): dest_value.extend(src_value) self._set_field(field, {dest_id: dest_value}) + def import_library(library_key, importer, library_path, progress=None, abort=None): from calibre.db.backend import DB metadata = importer.metadata[library_key] @@ -3452,27 +3455,22 @@ def report_progress(fname): report_progress('metadata.db') if abort is not None and abort.is_set(): return - with open(os.path.join(library_path, 'metadata.db'), 'wb') as f: - src = importer.start_file(metadata['metadata.db'], 'metadata.db for ' + library_path) - shutil.copyfileobj(src, f) - src.close() + importer.save_file(metadata['metadata.db'], 'metadata.db for ' + library_path, os.path.join(library_path, 'metadata.db')) if 'full-text-search.db' in metadata: if progress is not None: progress('full-text-search.db', 1, total) if abort is not None and abort.is_set(): return poff += 1 - with open(os.path.join(library_path, 'full-text-search.db'), 'wb') as f: - src = importer.start_file(metadata['full-text-search.db'], 'full-text-search.db for ' + library_path) - shutil.copyfileobj(src, f) - src.close() + importer.save_file(metadata['full-text-search.db'], 'full-text-search.db for ' + library_path, + os.path.join(library_path, 'full-text-search.db')) if abort is not None and abort.is_set(): return if 'notes.db' in metadata: import zipfile notes_dir = os.path.join(library_path, NOTES_DIR_NAME) os.makedirs(notes_dir, exist_ok=True) - with closing(importer.start_file(metadata['notes.db'], 'notes.db for ' + library_path)) as stream: + with importer.start_file(metadata['notes.db'], 'notes.db for ' + library_path) as stream: stream.check_hash = False with zipfile.ZipFile(stream) as zf: for zi in zf.infolist(): @@ -3481,6 +3479,8 @@ def report_progress(fname): os.utime(tpath, (date_time, date_time)) if abort is not None and abort.is_set(): return + if importer.corrupted_files: + raise ValueError('Corrupted files:\n' + '\n'.join(importer.corrupted_files)) cache = Cache(DB(library_path, load_user_formatter_functions=False)) cache.init() @@ -3493,20 +3493,22 @@ def report_progress(fname): if progress is not None: progress(title, i + poff, total) cache._update_path((book_id,), mark_as_dirtied=False) - for fmt, fmtkey in iteritems(fmt_key_map): + for fmt, fmtkey in fmt_key_map.items(): if fmt == '.cover': - with closing(importer.start_file(fmtkey, _('Cover for %s') % title)) as stream: + with importer.start_file(fmtkey, _('Cover for %s') % title) as stream: path = cache._field_for('path', book_id).replace('/', os.sep) cache.backend.set_cover(book_id, path, stream, no_processing=True) else: - with 
closing(importer.start_file(fmtkey, _('{0} format for {1}').format(fmt.upper(), title))) as stream:
+                with importer.start_file(fmtkey, _('{0} format for {1}').format(fmt.upper(), title)) as stream:
                     size, fname = cache._do_add_format(book_id, fmt, stream, mtime=stream.mtime)
                     cache.fields['formats'].table.update_fmt(book_id, fmt, fname, size, cache.backend)
             for relpath, efkey in extra_files.get(book_id, {}).items():
-                with closing(importer.start_file(efkey, _('Extra file {0} for book {1}').format(relpath, title))) as stream:
+                with importer.start_file(efkey, _('Extra file {0} for book {1}').format(relpath, title)) as stream:
                     path = cache._field_for('path', book_id).replace('/', os.sep)
                     cache.backend.add_extra_file(relpath, stream, path)
             cache.dump_metadata({book_id})
+    if importer.corrupted_files:
+        raise ValueError('Corrupted files:\n' + '\n'.join(importer.corrupted_files))
     if progress is not None:
         progress(_('Completed'), total, total)
     return cache
diff --git a/src/calibre/db/tests/filesystem.py b/src/calibre/db/tests/filesystem.py
index 2cae7f0c3530..3dc5bd5607ab 100644
--- a/src/calibre/db/tests/filesystem.py
+++ b/src/calibre/db/tests/filesystem.py
@@ -246,6 +246,21 @@ def test_fname_change(self):
     def test_export_import(self):
         from calibre.db.cache import import_library
         from calibre.utils.exim import Exporter, Importer
+        with TemporaryDirectory('export_lib') as tdir:
+            for part_size in (8, 1, 1024):
+                exporter = Exporter(tdir, part_size=part_size + Exporter.tail_size())
+                files = {
+                    'a': b'a' * 7, 'b': b'b' * 7, 'c': b'c' * 2, 'd': b'd' * 9, 'e': b'e' * 3,
+                }
+                for key, data in files.items():
+                    exporter.add_file(BytesIO(data), key)
+                exporter.commit()
+                importer = Importer(tdir)
+                for key, expected in files.items():
+                    with importer.start_file(key, key) as f:
+                        actual = f.read()
+                    self.assertEqual(expected, actual, key)
+                self.assertFalse(importer.corrupted_files)
         cache = self.init_cache()
         bookdir = os.path.dirname(cache.format_abspath(1, '__COVER_INTERNAL__'))
         with open(os.path.join(bookdir, 'exf'), 'w') as f:
@@ -255,13 +270,14 @@ def test_export_import(self):
             f.write('recurse')
         self.assertEqual({ef.relpath for ef in cache.list_extra_files(1, pattern='sub/**/*')}, {'sub/recurse'})
         self.assertEqual({ef.relpath for ef in cache.list_extra_files(1)}, {'exf', 'sub/recurse'})
-        for part_size in (1 << 30, 100, 1):
+        for part_size in (512, 1027, None):
             with TemporaryDirectory('export_lib') as tdir, TemporaryDirectory('import_lib') as idir:
-                exporter = Exporter(tdir, part_size=part_size)
+                exporter = Exporter(tdir, part_size=part_size if part_size is None else (part_size + Exporter.tail_size()))
                 cache.export_library('l', exporter)
                 exporter.commit()
                 importer = Importer(tdir)
                 ic = import_library('l', importer, idir)
+                self.assertFalse(importer.corrupted_files)
                 self.assertEqual(cache.all_book_ids(), ic.all_book_ids())
                 for book_id in cache.all_book_ids():
                     self.assertEqual(cache.cover(book_id), ic.cover(book_id), 'Covers not identical for book: %d' % book_id)
@@ -290,6 +306,7 @@ def test_export_import(self):
             exporter.commit()
             importer = Importer(tdir)
             ic = import_library('l', importer, idir)
+            self.assertFalse(importer.corrupted_files)
             self.assertEqual(ic.fts_search('exim')[0]['id'], 1)
             self.assertEqual(cache.notes_for('authors', 2), ic.notes_for('authors', 2))
             a, b = cache.get_notes_resource(r1), ic.get_notes_resource(r1)
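Note: the new round-trip test above passes part_size + Exporter.tail_size()
because, in the reworked format, part_size is the total size of a part file
including the fixed tail record, so each part's payload capacity is part_size
minus the tail. A small illustrative check of that arithmetic, not part of
the patch:

    import struct

    TAIL_FMT = b'!II?'  # part number, version, is-last flag, from the patch
    tail_size = struct.calcsize(TAIL_FMT)  # 4 + 4 + 1 == 9 bytes

    # For the test's part_size=8 case to fit exactly 8 payload bytes per part,
    # the Exporter is constructed with 8 + tail_size:
    total = 8 + tail_size
    assert total - tail_size == 8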
diff --git a/src/calibre/devices/eb600/driver.py b/src/calibre/devices/eb600/driver.py
index 3db68bbfe740..2c0d65772ad1 100644
--- a/src/calibre/devices/eb600/driver.py
+++ b/src/calibre/devices/eb600/driver.py
@@ -427,7 +427,7 @@ def windows_sort_drives(self, drives):

 class POCKETBOOK740(USBMS):

-    name = 'PocketBook 701 Device Interface'
+    name = 'PocketBook 740 Device Interface'
     gui_name = 'PocketBook'
     description = _('Communicate with the PocketBook 740')
     supported_platforms = ['windows', 'osx', 'linux']
diff --git a/src/calibre/devices/kobo/driver.py b/src/calibre/devices/kobo/driver.py
index ef5de175490f..875471cf4a22 100644
--- a/src/calibre/devices/kobo/driver.py
+++ b/src/calibre/devices/kobo/driver.py
@@ -1412,7 +1412,7 @@ class KOBOTOUCH(KOBO):
     # Starting with firmware version 3.19.x, the last number appears to be a
     # build number. A number will be recorded here but it can be safely ignored
     # when testing the firmware version.
-    max_supported_fwversion = (4, 38, 21908)
+    max_supported_fwversion = (4, 39, 22861)
     # The following document firmware versions where new functionality or devices were added.
     # Not all are used, but this feels like a good place to record it.
     min_fwversion_shelves = (2, 0, 0)
diff --git a/src/calibre/ebooks/docx/to_html.py b/src/calibre/ebooks/docx/to_html.py
index ee5fb6d3d68c..a41daae32e4e 100644
--- a/src/calibre/ebooks/docx/to_html.py
+++ b/src/calibre/ebooks/docx/to_html.py
@@ -671,8 +671,11 @@ def convert_run(self, run):
             ctext = child.text
             if space != 'preserve':
                 # Remove leading and trailing whitespace. Word ignores
-                # leading and trailing whitespace without preserve
-                ctext = ctext.strip(' \n\r\t')
+                # leading and trailing whitespace without preserve unless
+                # the element is only whitespace.
+                stripped = ctext.strip(' \n\r\t')
+                if stripped:
+                    ctext = stripped
                 # Only use a <span> with white-space:pre-wrap if this element
                 # actually needs it, i.e. if it has more than one
                 # consecutive space or it has newlines or tabs.
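Note: the DOCX change above fixes whitespace-only text runs being dropped
entirely. Word ignores leading and trailing whitespace in runs without
xml:space="preserve", but a run consisting only of whitespace must survive or
the space between adjacent runs disappears. A minimal illustration of the
rule, not part of the patch:

    def normalize_run_text(ctext, preserve=False):
        # Mirrors the patched logic: strip unless stripping would empty the run.
        if preserve:
            return ctext
        stripped = ctext.strip(' \n\r\t')
        return stripped if stripped else ctext

    assert normalize_run_text('  word  ') == 'word'
    assert normalize_run_text(' ') == ' '          # whitespace-only run is kept
    assert normalize_run_text(' x ', preserve=True) == ' x '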
diff --git a/src/calibre/ebooks/readability/cleaners.py b/src/calibre/ebooks/readability/cleaners.py index fee5aec96fd5..cd52fe2580b0 100644 --- a/src/calibre/ebooks/readability/cleaners.py +++ b/src/calibre/ebooks/readability/cleaners.py @@ -1,7 +1,10 @@ # strip out a set of nuisance html attributes that can mess up rendering in RSS feeds import re -from lxml.html.clean import Cleaner +try: + from lxml_html_clean import Cleaner +except ImportError: + from lxml.html.clean import Cleaner bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*'] single_quoted = "'[^']+'" diff --git a/src/calibre/gui2/__init__.py b/src/calibre/gui2/__init__.py index fd501990dca4..04716a3cb77d 100644 --- a/src/calibre/gui2/__init__.py +++ b/src/calibre/gui2/__init__.py @@ -1189,10 +1189,9 @@ def __init__(self, args=(), force_calibre_style=False, override_program_name=Non args = [override_program_name] + args[1:] self.palette_manager = PaletteManager(force_calibre_style, headless) if headless: - args.extend(('-platformpluginpath', plugins_loc, '-platform', 'headless')) + args.extend(('-platformpluginpath', plugins_loc, '-platform', os.environ.get('CALIBRE_HEADLESS_PLATFORM', 'headless'))) else: args.extend(self.palette_manager.args_to_qt) - self.headless = headless from calibre_extensions import progress_indicator self.pi = progress_indicator @@ -1583,7 +1582,7 @@ def ensure_app(headless=True): args = sys.argv[:1] has_headless = ismacos or islinux or isbsd if headless and has_headless: - args += ['-platformpluginpath', plugins_loc, '-platform', 'headless'] + args += ['-platformpluginpath', plugins_loc, '-platform', os.environ.get('CALIBRE_HEADLESS_PLATFORM', 'headless')] if ismacos: os.environ['QT_MAC_DISABLE_FOREGROUND_APPLICATION_TRANSFORM'] = '1' if headless and iswindows: diff --git a/src/calibre/gui2/dialogs/metadata_bulk.py b/src/calibre/gui2/dialogs/metadata_bulk.py index 85d475829404..12e7021e0534 100644 --- a/src/calibre/gui2/dialogs/metadata_bulk.py +++ b/src/calibre/gui2/dialogs/metadata_bulk.py @@ -224,10 +224,13 @@ def read_file_metadata(self, args): else: db.set_metadata(book_id, mi, allow_case_change=True) if cdata is not None: - db.set_cover({book_id: cdata}) + try: + db.set_cover({book_id: cdata}) + except Exception: + import traceback + traceback.print_exc() self.progress_update.emit(1) self.progress_finished_cur_step.emit() - finally: worker.shutdown() diff --git a/src/calibre/gui2/library/views.py b/src/calibre/gui2/library/views.py index 2d60f4cdfba2..971a5746d2cd 100644 --- a/src/calibre/gui2/library/views.py +++ b/src/calibre/gui2/library/views.py @@ -42,7 +42,7 @@ from calibre import force_unicode from calibre.constants import filesystem_encoding, islinux -from calibre.gui2 import BOOK_DETAILS_DISPLAY_DEBOUNCE_DELAY, FunctionDispatcher, error_dialog, gprefs +from calibre.gui2 import BOOK_DETAILS_DISPLAY_DEBOUNCE_DELAY, FunctionDispatcher, error_dialog, gprefs, show_restart_warning from calibre.gui2.dialogs.enum_values_edit import EnumValuesEdit from calibre.gui2.gestures import GestureManager from calibre.gui2.library import DEFAULT_SORT @@ -68,6 +68,7 @@ ) from calibre.gui2.library.models import BooksModel, DeviceBooksModel from calibre.gui2.pin_columns import PinTableView +from calibre.gui2.preferences.create_custom_column import CreateNewCustomColumn from calibre.utils.config import prefs, tweaks from calibre.utils.icu import primary_sort_key from polyglot.builtins import iteritems @@ -567,6 +568,18 @@ def column_header_context_handler(self, action=None, 
column=None, view=None): view.apply_state(view.get_default_state()) elif action == 'addcustcol': self.add_column_signal.emit() + elif action == 'editcustcol': + def show_restart_dialog(): + from calibre.gui2.preferences.main import must_restart_message + if show_restart_warning(must_restart_message): + self.gui.quit(restart=True) + col_manager = CreateNewCustomColumn(self.gui) + if col_manager.must_restart(): + show_restart_dialog() + else: + res = col_manager.edit_existing_column(column) + if res[0] == CreateNewCustomColumn.Result.COLUMN_EDITED: + show_restart_dialog() elif action.startswith('align_'): alignment = action.partition('_')[-1] self._model.change_alignment(column, alignment) @@ -631,6 +644,13 @@ def create_context_menu(self, col, name, view): ans.addAction(QIcon.ic('width.png'), _('Adjust width of column {0}').format(name), partial(self.manually_adjust_column_size, view, col, name)) + if not isinstance(view, DeviceBooksView): + col_manager = CreateNewCustomColumn(self.gui) + if self.can_add_columns and self.model().is_custom_column(col): + act = ans.addAction(QIcon.ic('edit_input.png'), _('Edit column definition for %s') % name, + partial(handler, action='editcustcol')) + if col_manager.must_restart(): + act.setEnabled(False) if self.is_library_view: if self._model.db.field_metadata[col]['is_category']: act = ans.addAction(QIcon.ic('quickview.png'), _('Quickview column %s') % name, @@ -664,8 +684,10 @@ def create_context_menu(self, col, name, view): partial(handler, action='reset_ondevice_width')) ans.addAction(_('Restore default layout'), partial(handler, action='defaults')) if self.can_add_columns: - ans.addAction( - QIcon.ic('column.png'), _('Add your own columns'), partial(handler, action='addcustcol')) + act = ans.addAction(QIcon.ic('column.png'), _('Add your own columns'), + partial(handler, action='addcustcol')) + col_manager = CreateNewCustomColumn(self.gui) + act.setEnabled(not col_manager.must_restart()) return ans def show_row_header_context_menu(self, pos): diff --git a/src/calibre/gui2/linux_file_dialogs.py b/src/calibre/gui2/linux_file_dialogs.py index f67191344143..126b550db267 100644 --- a/src/calibre/gui2/linux_file_dialogs.py +++ b/src/calibre/gui2/linux_file_dialogs.py @@ -25,17 +25,30 @@ def get_winid(widget=None): return widget.effectiveWinId() +def to_known_dialog_provider_name(q: str) -> str: + uq = q.upper() + if uq in ('KDE', 'LXQT', 'LXDE'): + return 'KDE' + if uq in ('GNOME', 'GNOME-FLASHBACK', 'GNOME-FLASHBACK:GNOME', 'MATE', 'XFCE'): + return 'GNOME' + return '' + + def detect_desktop_environment(): de = os.getenv('XDG_CURRENT_DESKTOP') if de: - return de.upper().split(':', 1)[0] + for x in de.split(':'): + q = to_known_dialog_provider_name(x) + if q: + return q if os.getenv('KDE_FULL_SESSION') == 'true': return 'KDE' if os.getenv('GNOME_DESKTOP_SESSION_ID'): return 'GNOME' ds = os.getenv('DESKTOP_SESSION') if ds and ds.upper() in {'GNOME', 'XFCE'}: - return ds.upper() + return 'GNOME' + return '' def is_executable_present(name): @@ -343,9 +356,9 @@ def check_for_linux_native_dialogs(): if ans is None: de = detect_desktop_environment() order = ('zenity', 'kdialog') - if de in {'GNOME', 'UNITY', 'MATE', 'XFCE'}: + if de == 'GNOME': order = ('zenity',) - elif de in {'KDE', 'LXDE'}: + elif de == 'KDE': order = ('kdialog',) for exe in order: if is_executable_present(exe): diff --git a/src/calibre/gui2/preferences/create_custom_column.py b/src/calibre/gui2/preferences/create_custom_column.py index 3480936c2ca3..981f2e01c853 100644 --- 
a/src/calibre/gui2/preferences/create_custom_column.py +++ b/src/calibre/gui2/preferences/create_custom_column.py @@ -292,9 +292,15 @@ def setup_ui(self): # {{{ self.g = g = QGridLayout() l.addLayout(g) l.addStretch(10) + bbl = QHBoxLayout() + txt = QLabel(_('Pressing OK will require restarting calibre even if nothing was changed')) + txt.setWordWrap(True) + bbl.addWidget(txt) + bbl.addStretch(1) self.button_box = bb = QDialogButtonBox(QDialogButtonBox.StandardButton.Ok | QDialogButtonBox.StandardButton.Cancel, self) bb.accepted.connect(self.accept), bb.rejected.connect(self.reject) - l.addWidget(bb) + bbl.addWidget(bb) + l.addLayout(bbl) def add_row(text, widget): if text is None: @@ -932,6 +938,7 @@ class Result(Enum): INVALID_DISPLAY = 7 EXCEPTION_RAISED = 8 MUST_RESTART = 9 + COLUMN_EDITED = 11 def __init__(self, gui): self.gui = gui @@ -991,20 +998,40 @@ def create_column(self, lookup_name, column_heading, datatype, is_multiple, 'colnum': self.created_count, 'is_multiple': is_multiple, } + + return self._create_or_edit_column(lookup_name, freeze_lookup_name=freeze_lookup_name, + operation='create') + + def edit_existing_column(self, lookup_name): + if lookup_name not in self.custcols: + return self.Result.INVALID_KEY + return self._create_or_edit_column(lookup_name, freeze_lookup_name=False, operation='edit') + + def _create_or_edit_column(self, lookup_name, freeze_lookup_name, operation=None): try: dialog = CreateCustomColumn(self.gui, self, lookup_name, self.gui.library_view.model().orig_headers, freeze_lookup_name=freeze_lookup_name) if dialog.result() == QDialog.DialogCode.Accepted and self.cc_column_key is not None: cc = self.custcols[lookup_name] - self.db.create_custom_column( - label=cc['label'], - name=cc['name'], - datatype=cc['datatype'], - is_multiple=cc['is_multiple'], - display=cc['display']) - self.gui.must_restart_before_config = True - return (self.Result.COLUMN_ADDED, self.cc_column_key) + if operation == 'create': + self.db.create_custom_column( + label=cc['label'], + name=cc['name'], + datatype=cc['datatype'], + is_multiple=bool(cc['is_multiple']), + display=cc['display']) + self.gui.must_restart_before_config = True + return (self.Result.COLUMN_ADDED, self.cc_column_key) + # editing/viewing + if operation == 'edit': + self.db.set_custom_column_metadata(cc['colnum'], name=cc['name'], + label=cc['label'], display=cc['display'], + notify=False) + if '*must_restart' in cc: + self.gui.must_restart_before_config = True + return (self.Result.COLUMN_EDITED, self.cc_column_key) + return (self.Result.CANCELED, self.cc_column_key) except Exception as e: import traceback traceback.print_exc() diff --git a/src/calibre/gui2/preferences/main.py b/src/calibre/gui2/preferences/main.py index 8b78916b040f..51a5913c46e1 100644 --- a/src/calibre/gui2/preferences/main.py +++ b/src/calibre/gui2/preferences/main.py @@ -218,6 +218,12 @@ def __init__(self, parent=None): # }}} + +must_restart_message = _('The changes you have made require calibre be ' + 'restarted immediately. You will not be allowed to ' + 'set any more preferences, until you restart.') + + class Preferences(QDialog): run_wizard_requested = pyqtSignal() @@ -394,13 +400,11 @@ def commit(self, *args): do_restart = False if must_restart: self.must_restart = True - msg = _('Some of the changes you made require a restart.' - ' Please restart calibre as soon as possible.') if rc: - msg = _('The changes you have made require calibre be ' - 'restarted immediately. 
You will not be allowed to ' - 'set any more preferences, until you restart.') - + msg = must_restart_message + else: + msg = _('Some of the changes you made require a restart.' + ' Please restart calibre as soon as possible.') do_restart = show_restart_warning(msg, parent=self) self.showing_widget.refresh_gui(self.gui) diff --git a/src/calibre/gui2/search_box.py b/src/calibre/gui2/search_box.py index 92953f306769..7e03a0a88da0 100644 --- a/src/calibre/gui2/search_box.py +++ b/src/calibre/gui2/search_box.py @@ -57,6 +57,12 @@ def contextMenuEvent(self, ev): else: menu.addAction(ac) menu.addSeparator() + ac = menu.addAction(_('Invert current search')) + ac.setEnabled(bool(self.text().strip())) + ac.setIcon(QIcon.ic('search.png')) + ac.triggered.connect(self.invert_search) + menu.addAction(ac) + menu.addSeparator() if self.as_url is not None: url = self.as_url(self.text()) if url: @@ -64,6 +70,17 @@ def contextMenuEvent(self, ev): menu.addAction(_('&Clear search history')).triggered.connect(self.clear_history) menu.exec(ev.globalPos()) + def invert_search(self): + q = self.text().strip() + if q: + if q.startswith('NOT ( ') and q.endswith(' )'): + q = q[6:-2] + else: + q = f'NOT ( {q} )' + self.setText(q) + ev = QKeyEvent(QEvent.Type.KeyPress, Qt.Key.Key_Enter, Qt.KeyboardModifier.NoModifier) + self.keyPressEvent(ev) + def paste_and_search(self): self.paste() ev = QKeyEvent(QEvent.Type.KeyPress, Qt.Key.Key_Enter, Qt.KeyboardModifier.NoModifier) @@ -452,6 +469,7 @@ def add_action(current_menu, whole_name, last_component, func=None): use_hierarchy = 'search' in db.new_api.pref('categories_using_hierarchy', []) submenus = {} for name in sorted(db.saved_search_names(), key=lambda x: primary_sort_key(x.strip())): + display_name = name.replace('&', '&&') current_menu = menu if use_hierarchy: components = tuple(n.strip() for n in name.split('.')) @@ -461,14 +479,16 @@ def add_action(current_menu, whole_name, last_component, func=None): for i,c in enumerate(hierarchy, start=1): hierarchical_prefix = '.'.join(hierarchy[:i]) if hierarchical_prefix not in submenus: - current_menu = current_menu.addMenu(c) + current_menu = current_menu.addMenu(c.replace('&', '&&')) current_menu.setIcon(folder_icon) submenus[hierarchical_prefix] = current_menu else: current_menu = submenus[hierarchical_prefix] - ac = add_action(current_menu, name, last, partial(self.search.set_search_string, 'search:"='+name+'"')) + ac = add_action(current_menu, display_name, last.replace('&', '&&'), + partial(self.search.set_search_string, 'search:"='+name+'"')) else: - ac = add_action(current_menu, name, name, partial(self.search.set_search_string, 'search:"='+name+'"')) + ac = add_action(current_menu, display_name, display_name, + partial(self.search.set_search_string, 'search:"='+name+'"')) if ac.icon().isNull(): ac.setIcon(search_icon) diff --git a/src/calibre/gui2/search_restriction_mixin.py b/src/calibre/gui2/search_restriction_mixin.py index 3fdcef881239..b91aed7e6422 100644 --- a/src/calibre/gui2/search_restriction_mixin.py +++ b/src/calibre/gui2/search_restriction_mixin.py @@ -590,14 +590,16 @@ def build_search_restriction_list(self): dex = 0 def add_action(current_menu, name, last): nonlocal dex + def compare_fix_amps(name1, name2): + return (self._trim_restriction_name(name1).replace('&&', '&') == + self._trim_restriction_name(name2).replace('&&', '&')) self.search_restriction.addItem(name) txt = self._trim_restriction_name(last) - if self._trim_restriction_name(name) == self._trim_restriction_name(current_restriction): + if 
compare_fix_amps(name, current_restriction): a = current_menu.addAction(self.checked, txt if txt else self.no_restriction) else: a = current_menu.addAction(txt if txt else self.no_restriction) - a.triggered.connect(partial(self.search_restriction_triggered, - action=a, index=dex)) + a.triggered.connect(partial(self.search_restriction_triggered, action=a, index=dex)) dex += 1 return a @@ -649,10 +651,9 @@ def apply_search_restriction(self, i): if i == 1: self.apply_text_search_restriction(str(self.search.currentText())) elif i == 2 and str(self.search_restriction.currentText()).startswith('*'): - self.apply_text_search_restriction( - str(self.search_restriction.currentText())[1:]) + self.apply_text_search_restriction(str(self.search_restriction.currentText())[1:]) else: - r = str(self.search_restriction.currentText()) + r = str(self.search_restriction.currentText()).replace('&&', '&') if r is not None and r != '': restriction = 'search:"%s"'%(r) else: diff --git a/src/calibre/gui2/tweak_book/spell.py b/src/calibre/gui2/tweak_book/spell.py index ffcc777bd2e0..67a0f0f22f65 100644 --- a/src/calibre/gui2/tweak_book/spell.py +++ b/src/calibre/gui2/tweak_book/spell.py @@ -11,6 +11,7 @@ from itertools import chain from threading import Thread +import regex from qt.core import ( QT_VERSION_STR, QAbstractItemView, @@ -75,7 +76,7 @@ ) from calibre.spell.import_from import import_from_online, import_from_oxt from calibre.startup import connect_lambda -from calibre.utils.icu import contains, primary_contains, primary_sort_key, sort_key +from calibre.utils.icu import contains, primary_contains, primary_sort_key, sort_key, upper from calibre.utils.localization import calibre_langcode_to_name, canonicalize_lang, get_lang, get_language from calibre.utils.resources import get_path as P from calibre_extensions.progress_indicator import set_no_activate_on_click @@ -726,6 +727,7 @@ class WordsModel(QAbstractTableModel): def __init__(self, parent=None): QAbstractTableModel.__init__(self, parent) self.counts = (0, 0) + self.all_caps = self.with_numbers = self.camel_case = self.snake_case = False self.words = {} # Map of (word, locale) to location data for the word self.spell_map = {} # Map of (word, locale) to dictionaries.recognized(word, locale) self.sort_on = (0, False) @@ -734,6 +736,9 @@ def __init__(self, parent=None): self.show_only_misspelt = True self.headers = (_('Word'), _('Count'), _('Language'), _('Misspelled?')) self.alignments = Qt.AlignmentFlag.AlignLeft, Qt.AlignmentFlag.AlignRight, Qt.AlignmentFlag.AlignLeft, Qt.AlignmentFlag.AlignHCenter + self.num_pat = regex.compile(r'\d', flags=regex.UNICODE) + self.camel_case_pat = regex.compile(r'[a-z][A-Z]', flags=regex.UNICODE) + self.snake_case_pat = regex.compile(r'\w_\w', flags=regex.UNICODE) def rowCount(self, parent=QModelIndex()): return len(self.items) @@ -794,8 +799,10 @@ def sort(self, column, order=Qt.SortOrder.AscendingOrder): self.do_sort() self.endResetModel() - def filter(self, filter_text): + def filter(self, filter_text, *, all_caps=False, with_numbers=False, camel_case=False, snake_case=False): self.filter_expression = filter_text or None + self.all_caps, self.with_numbers = all_caps, with_numbers + self.camel_case, self.snake_case = camel_case, snake_case self.beginResetModel() self.do_filter() self.do_sort() @@ -839,7 +846,16 @@ def filter_item(self, x): if self.show_only_misspelt and self.spell_map[x]: return False func = contains if tprefs['spell_check_case_sensitive_search'] else primary_contains - if self.filter_expression is not 
None and not func(self.filter_expression, x[0]):
+        word = x[0]
+        if self.filter_expression is not None and not func(self.filter_expression, word):
+            return False
+        if self.all_caps and upper(word) == word:
+            return False
+        if self.with_numbers and self.num_pat.search(word) is not None:
+            return False
+        if self.camel_case and self.camel_case_pat.search(word) is not None:
+            return False
+        if self.snake_case and self.snake_case_pat.search(word) is not None:
             return False
         return True

@@ -1149,6 +1165,27 @@ def setup_ui(self):
         t.textChanged.connect(self.do_filter)
         t.setClearButtonEnabled(True)
         l.addWidget(t)
+        h = QHBoxLayout()
+        l.addLayout(h)
+        h.addWidget(QLabel(_('Also hide words:')))
+        any_hide_checked = False
+        def hw(name, title, tooltip):
+            nonlocal any_hide_checked
+            ac = QCheckBox(title)
+            pref_name = f'spell-check-hide-words-{name}'
+            ac.setObjectName(pref_name)
+            ac.setChecked(tprefs.get(pref_name, False))
+            if ac.isChecked():
+                any_hide_checked = True
+            ac.toggled.connect(self.hide_words_toggled)
+            ac.setToolTip(tooltip)
+            h.addWidget(ac)
+            return ac
+        self.all_caps = hw('all-caps', _('ALL CAPS'), _('Hide words with all capital letters'))
+        self.with_numbers = hw('with-numbers', _('with numbers'), _('Hide words that contain numbers'))
+        self.camel_case = hw('camel-case', _('camelCase'), _('Hide words in camelCase'))
+        self.snake_case = hw('snake-case', _('snake_case'), _('Hide words in snake_case'))
+        h.addStretch(10)

         m.h2 = h = QHBoxLayout()
         l.addLayout(h)
@@ -1252,6 +1289,14 @@ def button_action(sc, tt, button):
         self.action_change_word = button_action('ctrl+right', _('Change all occurrences of this word'), self.change_button)
         self.action_show_next_occurrence = button_action('alt+right', _('Show next occurrence of this word in the book'), self.next_occurrence)
+        if any_hide_checked:
+            QTimer.singleShot(0, self.do_filter)
+
+    def hide_words_toggled(self, checked):
+        cb = self.sender()
+        pref_name = cb.objectName()
+        tprefs.set(pref_name, checked)
+        self.do_filter()

     def next_word(self):
         v = self.suggested_list if self.focusWidget() is self.suggested_list else self.words_view
@@ -1465,7 +1510,9 @@ def __exit__(self, *args):

     def do_filter(self):
         text = str(self.filter_text.text()).strip()
         with self:
-            self.words_model.filter(text)
+            self.words_model.filter(
+                text, all_caps=self.all_caps.isChecked(), with_numbers=self.with_numbers.isChecked(),
+                camel_case=self.camel_case.isChecked(), snake_case=self.snake_case.isChecked())

     def refresh(self, change_request=None):
         if not self.isVisible():
diff --git a/src/calibre/headless/headless_integration.cpp b/src/calibre/headless/headless_integration.cpp
index e87bf2c4ca5a..77b6ab0503ef 100644
--- a/src/calibre/headless/headless_integration.cpp
+++ b/src/calibre/headless/headless_integration.cpp
@@ -127,12 +127,10 @@ HeadlessIntegration *HeadlessIntegration::instance()
     return static_cast<HeadlessIntegration *>(QGuiApplicationPrivate::platformIntegration());
 }

-static QString themeName() { return QStringLiteral("headless"); }
-QStringList HeadlessIntegration::themeNames() const
-{
-    return QStringList(themeName());
-}
+#define THEME_NAME "headless"
+
+QStringList HeadlessIntegration::themeNames() const { return QStringList(THEME_NAME); }

 // Restrict the styles to "fusion" to prevent native styles requiring native
 // window handles (eg Windows Vista style) from being used.
@@ -155,7 +153,7 @@ class HeadlessTheme : public QPlatformTheme

 QPlatformTheme *HeadlessIntegration::createPlatformTheme(const QString &name) const
 {
-    return name == themeName() ? new HeadlessTheme() : nullptr;
+    return name == THEME_NAME ? new HeadlessTheme() : nullptr;
 }

 QT_END_NAMESPACE
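Note: the exim.py rework that follows replaces the old grow-on-demand layout
with fixed-capacity part files, each terminated by a tail record packed as
TAIL_FMT. As an illustration only, not part of the patch and assuming a
hypothetical part file path, this is how such a tail can be decoded; the
Importer in the patch does effectively the same thing:

    import os
    import struct

    TAIL_FMT = b'!II?'  # part number, format version, is-last flag, from the patch
    TAIL_SIZE = struct.calcsize(TAIL_FMT)

    def read_part_tail(path):
        # Read the fixed-size record the Exporter appends to every part file.
        with open(path, 'rb') as f:
            f.seek(-TAIL_SIZE, os.SEEK_END)
            return struct.unpack(TAIL_FMT, f.read(TAIL_SIZE))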
diff --git a/src/calibre/utils/exim.py b/src/calibre/utils/exim.py
index 21413045cfda..4e12dde715ea 100644
--- a/src/calibre/utils/exim.py
+++ b/src/calibre/utils/exim.py
@@ -4,6 +4,7 @@

 import errno
 import hashlib
+import io
 import json
 import os
 import shutil
@@ -13,6 +14,7 @@
 import time
 import uuid
 from collections import Counter
+from typing import NamedTuple

 from calibre import prints
 from calibre.constants import config_dir, filesystem_encoding, iswindows
@@ -25,47 +27,33 @@

 # Export {{{

-
-def send_file(from_obj, to_obj, chunksize=1<<20):
-    m = hashlib.sha1()
-    while True:
-        raw = from_obj.read(chunksize)
-        if not raw:
-            break
-        m.update(raw)
-        to_obj.write(raw)
-    return str(m.hexdigest())
-
-
 class FileDest:

     def __init__(self, key, exporter, mtime=None):
         self.exporter, self.key = exporter, key
         self.hasher = hashlib.sha1()
-        self.start_pos = exporter.f.tell()
+        self.start_part_number, self.start_pos = exporter.current_pos()
         self._discard = False
-        self.mtime = None
+        self.mtime = mtime
+        self.size = 0

     def discard(self):
         self._discard = True

-    def ensure_space(self, size):
-        if size > 0:
-            self.exporter.ensure_space(size)
-            self.start_pos = self.exporter.f.tell()
-
     def write(self, data):
+        self.size += len(data)
+        written = self.exporter.write(data)
+        if len(data) != written:
+            raise RuntimeError(f'Exporter failed to write all data: {len(data)} != {written}')
         self.hasher.update(data)
-        self.exporter.f.write(data)

     def flush(self):
         pass

     def close(self):
         if not self._discard:
-            size = self.exporter.f.tell() - self.start_pos
             digest = str(self.hasher.hexdigest())
-            self.exporter.file_metadata[self.key] = (len(self.exporter.parts), self.start_pos, size, digest, self.mtime)
+            self.exporter.file_metadata[self.key] = (self.start_part_number, self.start_pos, self.size, digest, self.mtime)
         del self.exporter, self.hasher

     def __enter__(self):
@@ -77,17 +65,23 @@ def __exit__(self, *args):

 class Exporter:

-    VERSION = 0
+    VERSION = 1
     TAIL_FMT = b'!II?'
# part_num, version, is_last MDATA_SZ_FMT = b'!Q' EXT = '.calibre-data' - def __init__(self, path_to_export_dir, part_size=(1 << 30)): - self.part_size = part_size + @classmethod + def tail_size(cls): + return struct.calcsize(cls.TAIL_FMT) + + def __init__(self, path_to_export_dir, part_size=None): + # default part_size is 1 GB + self.part_size = (1 << 30) if part_size is None else part_size self.base = os.path.abspath(path_to_export_dir) - self.parts = [] - self.new_part() + self.commited_parts = [] + self.current_part = None self.file_metadata = {} + self.tail_sz = self.tail_size() self.metadata = {'file_metadata': self.file_metadata} def set_metadata(self, key, val): @@ -95,47 +89,61 @@ def set_metadata(self, key, val): raise KeyError('The metadata already contains the key: %s' % key) self.metadata[key] = val - @property - def f(self): - return self.parts[-1] + def current_pos(self): + pos = 0 + if self.current_part is not None: + pos = self.current_part.tell() + if pos >= self.part_size - self.tail_sz: + self.new_part() + pos = 0 + return len(self.commited_parts) + 1, pos + + def write(self, data: bytes) -> int: + written = 0 + data = memoryview(data) + while len(data) > 0: + if self.current_part is None: + self.new_part() + max_size = self.part_size - self.tail_sz - self.current_part.tell() + if max_size <= 0: + self.new_part() + max_size = self.part_size - self.tail_sz + chunk = data[:max_size] + w = self.current_part.write(chunk) + data = data[w:] + written += w + return written def new_part(self): - self.parts.append(open(os.path.join( - self.base, f'part-{len(self.parts) + 1:04d}{self.EXT}'), 'wb')) + self.commit_part() + self.current_part = open(os.path.join( + self.base, f'part-{len(self.commited_parts) + 1:04d}{self.EXT}'), 'wb') def commit_part(self, is_last=False): - self.f.write(struct.pack(self.TAIL_FMT, len(self.parts), self.VERSION, is_last)) - self.f.close() - self.parts[-1] = self.f.name - - def ensure_space(self, size): - try: - if size + self.f.tell() < self.part_size: - return - except AttributeError: - raise RuntimeError('This exporter has already been committed, cannot add to it') - self.commit_part() - self.new_part() + if self.current_part is not None: + self.current_part.write(struct.pack(self.TAIL_FMT, len(self.commited_parts) + 1, self.VERSION, is_last)) + self.current_part.close() + self.commited_parts.append(self.current_part.name) + self.current_part = None def commit(self): raw = json.dumps(self.metadata, ensure_ascii=False) if not isinstance(raw, bytes): raw = raw.encode('utf-8') - self.ensure_space(len(raw)) - self.f.write(raw) - self.f.write(struct.pack(self.MDATA_SZ_FMT, len(raw))) + self.new_part() + orig, self.part_size = self.part_size, sys.maxsize + self.write(raw) + self.write(struct.pack(self.MDATA_SZ_FMT, len(raw))) + self.part_size = orig self.commit_part(is_last=True) def add_file(self, fileobj, key): - fileobj.seek(0, os.SEEK_END) - size = fileobj.tell() - fileobj.seek(0) - self.ensure_space(size) - pos = self.f.tell() - digest = send_file(fileobj, self.f) - size = self.f.tell() - pos - mtime = os.fstat(fileobj.fileno()).st_mtime - self.file_metadata[key] = (len(self.parts), pos, size, digest, mtime) + try: + mtime = os.fstat(fileobj.fileno()).st_mtime + except (io.UnsupportedOperation, OSError): + mtime = None + with self.start_file(key, mtime=mtime) as dest: + shutil.copyfileobj(fileobj, dest) def start_file(self, key, mtime=None): return FileDest(key, self, mtime=mtime) @@ -217,47 +225,135 @@ def export(destdir, library_paths=None, 
dbmap=None, progress1=None, progress2=No # Import {{{ +class Chunk(NamedTuple): + part_num: int + pos_in_part: int + size: int + pos_in_file: int + + +class Pos: + + def __init__(self, part, pos_in_part, size, importer): + self.size = size + self.pos_in_file = 0 + self.chunks = chunks = [] + self.open_part = importer.open_part + self.currently_open_part = None + self.currently_open_chunk_index = -1 + + pos = 0 + while size > 0: + part_size = importer.size_of_part(part) + chunk_size = min(size, part_size - pos_in_part) + if chunk_size > 0: + chunks.append(Chunk(part, pos_in_part, chunk_size, pos)) + size -= chunk_size + pos += chunk_size + part += 1 + pos_in_part = 0 + + def close(self): + if self.currently_open_part is not None: + self.currently_open_part.close() + self.currently_open_part = None + self.currently_open_chunk_index = -1 + + def tell(self) -> int: + return self.pos_in_file + + def seek(self, amt, whence=os.SEEK_SET) -> int: + if whence == os.SEEK_SET: + new_pos_in_file = amt + if whence == os.SEEK_END: + new_pos_in_file = self.size + amt + if whence == os.SEEK_CUR: + new_pos_in_file = self.pos_in_file + amt + self.pos_in_file = max(0, min(new_pos_in_file, self.size)) + return self.pos_in_file + + def read(self, size=None): + if size is None or size < 0: + size = self.size + size = min(size, self.size) + amt_left = max(0, self.size - self.pos_in_file) + amt_to_read = min(amt_left, size) + if amt_to_read <= 0: + return b'' + start_chunk = max(0, self.currently_open_chunk_index) + num = len(self.chunks) + ans = [] + chunk_idx = -1 + for i in range(num): + chunk_idx = (start_chunk + i) % num + chunk = self.chunks[chunk_idx] + if chunk.pos_in_file <= self.pos_in_file < chunk.pos_in_file + chunk.size: + break + else: + raise ValueError(f'No chunk found containing {self.pos_in_file=}') + + while amt_to_read > 0: + try: + chunk = self.chunks[chunk_idx] + except IndexError: + break + ans.append(self._read_chunk(chunk, amt_to_read, chunk_idx)) + amt_to_read -= len(ans[-1]) + chunk_idx += 1 + return b''.join(ans) + + def _read_chunk(self, chunk, size, chunk_idx): + if self.currently_open_chunk_index != chunk_idx or self.currently_open_part is None: + self.close() + self.currently_open_part = self.open_part(chunk.part_num) + self.currently_open_chunk_index = chunk_idx + offset_from_start_of_chunk = self.pos_in_file - chunk.pos_in_file + self.currently_open_part.seek(chunk.pos_in_part + offset_from_start_of_chunk, os.SEEK_SET) + size = min(size, chunk.size - offset_from_start_of_chunk) + ans = self.currently_open_part.read(size) + self.pos_in_file += len(ans) + return ans + class FileSource: - def __init__(self, f, size, digest, description, mtime, importer): - self.f, self.size, self.digest, self.description = f, size, digest, description - self.seekable = self.f.seekable + def __init__(self, start_partnum, start_pos, size, digest, description, mtime, importer): + self.size, self.digest, self.description = size, digest, description self.mtime = mtime - self.start = f.tell() - self.end = self.start + size + self.start = start_pos + self.start_partnum = start_partnum + self.pos = Pos(start_partnum, start_pos, size, importer) self.hasher = hashlib.sha1() self.importer = importer self.check_hash = True + def seekable(self): + return False + def seek(self, amt, whence=os.SEEK_SET): - if whence == os.SEEK_SET: - return self.f.seek(self.start + amt, os.SEEK_SET) - if whence == os.SEEK_END: - return self.f.seek(self.end + amt, os.SEEK_SET) - if whence == os.SEEK_CUR: - return self.f.seek(amt, 
whence) + return self.pos.seek(amt, whence) def tell(self): - return self.f.tell() - self.start + return self.pos.tell() def read(self, size=None): - if size is not None and size < 1: - return b'' - left = self.end - self.f.tell() - amt = min(left, size or left) - if amt < 1: - return b'' - ans = self.f.read(amt) - if self.check_hash: + ans = self.pos.read(size) + if self.check_hash and ans: self.hasher.update(ans) return ans def close(self): if self.check_hash and self.hasher.hexdigest() != self.digest: self.importer.corrupted_files.append(self.description) - self.f.close() - self.hasher = self.f = None + self.hasher = None + self.pos.close() + self.pos = None + + def __enter__(self): + return self + + def __exit__(self, *a): + self.close() class Importer: @@ -265,11 +361,14 @@ class Importer: def __init__(self, path_to_export_dir): self.corrupted_files = [] part_map = {} - tail_size = struct.calcsize(Exporter.TAIL_FMT) + self.tail_size = tail_size = struct.calcsize(Exporter.TAIL_FMT) + self.version = -1 for name in os.listdir(path_to_export_dir): if name.lower().endswith(Exporter.EXT): path = os.path.join(path_to_export_dir, name) with open(path, 'rb') as f: + f.seek(0, os.SEEK_END) + size_of_part = f.tell() f.seek(-tail_size, os.SEEK_END) raw = f.read() if len(raw) != tail_size: @@ -279,7 +378,11 @@ def __init__(self, path_to_export_dir): raise ValueError('The exported data in %s is not valid,' ' version (%d) is higher than maximum supported version.' ' You might need to upgrade calibre first.' % (name, version)) - part_map[part_num] = path, is_last + part_map[part_num] = path, is_last, size_of_part + if self.version == -1: + self.version = version + if version != self.version: + raise ValueError(f'The exported data in {name} is not valid as it contains a mix of parts with versions: {self.version} and {version}') nums = sorted(part_map) if not nums: raise ValueError('No exported data found in: %s' % path_to_export_dir) @@ -289,37 +392,44 @@ def __init__(self, path_to_export_dir): raise ValueError('The last part of this exported data set is missing') if len(nums) != nums[-1]: raise ValueError('There are some parts of the exported data set missing') - self.part_map = {num:path for num, (path, is_last) in iteritems(part_map)} + self.part_map, self.part_size_map = {}, {} + for part_num, (path, is_last, size_of_part) in part_map.items(): + self.part_map[part_num] = path + self.part_size_map[part_num] = size_of_part msf = struct.calcsize(Exporter.MDATA_SZ_FMT) offset = tail_size + msf - with self.part(nums[-1]) as f: + with self.open_part(nums[-1]) as f: f.seek(-offset, os.SEEK_END) sz, = struct.unpack(Exporter.MDATA_SZ_FMT, f.read(msf)) f.seek(- sz - offset, os.SEEK_END) self.metadata = json.loads(f.read(sz)) self.file_metadata = self.metadata['file_metadata'] - def part(self, num): + def size_of_part(self, num): + return self.part_size_map[num] - self.tail_size + + def open_part(self, num): return open(self.part_map[num], 'rb') def start_file(self, key, description): partnum, pos, size, digest, mtime = self.file_metadata[key] - f = self.part(partnum) - f.seek(pos) - return FileSource(f, size, digest, description, mtime, self) + return FileSource(partnum, pos, size, digest, description, mtime, self) + + def save_file(self, key, description, output_path): + with open(output_path, 'wb') as dest, self.start_file(key, description) as src: + shutil.copyfileobj(src, dest) def export_config(self, base_dir, library_usage_stats): for key, relpath in self.metadata['config_dir']: - f = 
self.start_file(key, relpath)
-            path = os.path.join(base_dir, relpath.replace('/', os.sep))
-            try:
-                with open(path, 'wb') as dest:
-                    shutil.copyfileobj(f, dest)
-            except OSError:
-                os.makedirs(os.path.dirname(path))
-                with open(path, 'wb') as dest:
-                    shutil.copyfileobj(f, dest)
-            f.close()
+            with self.start_file(key, relpath) as f:
+                path = os.path.join(base_dir, relpath.replace('/', os.sep))
+                try:
+                    with open(path, 'wb') as dest:
+                        shutil.copyfileobj(f, dest)
+                except OSError:
+                    os.makedirs(os.path.dirname(path))
+                    with open(path, 'wb') as dest:
+                        shutil.copyfileobj(f, dest)
         gpath = os.path.join(base_dir, 'global.py')
         try:
             with open(gpath, 'rb') as f:
diff --git a/src/calibre/utils/hyphenation/dictionaries.py b/src/calibre/utils/hyphenation/dictionaries.py
index 000f0b78258b..92f011ec57bd 100644
--- a/src/calibre/utils/hyphenation/dictionaries.py
+++ b/src/calibre/utils/hyphenation/dictionaries.py
@@ -72,7 +72,10 @@ def extract_dicts(cache_path):
         buf.seek(0)
         tf = tarfile.TarFile(fileobj=buf)
         with tf:
-            tf.extractall(tdir)
+            try:
+                tf.extractall(tdir, filter='data')
+            except TypeError:
+                tf.extractall(tdir)
         with open(os.path.join(tdir, 'sha1sum'), 'wb') as f:
             f.write(expected_hash())
         dest = os.path.join(cache_path, 'f')
diff --git a/src/calibre/utils/icu.c b/src/calibre/utils/icu.c
index c67449018e3a..58053bb520b4 100644
--- a/src/calibre/utils/icu.c
+++ b/src/calibre/utils/icu.c
@@ -1231,9 +1231,16 @@ icu_set_filesystem_encoding(PyObject *self, PyObject *args) {
     char *encoding;
     if (!PyArg_ParseTuple(args, "s:setfilesystemencoding", &encoding)) return NULL;
+#if PY_VERSION_HEX < 0x030c0000
+    // The nitwits at Python deprecated this in 3.12 claiming we should use
+    // PyConfig.filesystem_encoding instead. But that can only be used if we
+    // control the interpreter, which we do not in Linux distro builds. Sigh.
+    // Well, if this causes issues we just continue to tell people not to use
+    // Linux distro builds. On frozen aka non-distro builds we set
+    // PyPreConfig.utf8_mode = 1 which supposedly sets this to utf-8 anyway.
     Py_FileSystemDefaultEncoding = strdup(encoding);
+#endif
     Py_RETURN_NONE;
-
 }
 // }}}
diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py
index 044a0402e59a..7835cdfc73f8 100644
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@@ -17,6 +17,7 @@
 import threading
 import time
 import traceback
+from base64 import standard_b64decode
 from urllib.request import urlopen

 from calibre import browser, relpath, unicode_path
@@ -248,13 +249,18 @@ def fetch_url(self, url):
             ans = response(q)
             ans.newurl = url
             return ans
-        self.log.debug('Fetching', url)
         st = time.monotonic()
+        is_data_url = url.startswith('data:')
+        if not is_data_url:
+            self.log.debug('Fetching', url)
         # Check for a URL pointing to the local filesystem and special case it
         # for efficiency and robustness. Bypasses delay checking as it does not
         # apply to local fetches. Ensures that unicode paths that are not
         # representable in the filesystem_encoding work.
+ if is_data_url: + payload = url.partition(',')[2] + return standard_b64decode(payload) is_local = 0 if url.startswith('file://'): is_local = 7 diff --git a/src/calibre/web/site_parsers/nytimes.py b/src/calibre/web/site_parsers/nytimes.py index c78e3edc08cf..919cabe61a86 100644 --- a/src/calibre/web/site_parsers/nytimes.py +++ b/src/calibre/web/site_parsers/nytimes.py @@ -9,7 +9,7 @@ from calibre.utils.iso8601 import parse_iso8601 -module_version = 4 # needed for live updates +module_version = 5 # needed for live updates pprint @@ -185,15 +185,12 @@ def extract_html(soup): return json_to_html(raw) -def download_url(url=None, br=None): - # Get the URL from the Wayback machine +def download_url_from_wayback(category, url, br=None): from mechanize import Request host = 'http://localhost:8090' host = 'https://wayback1.calibre-ebook.com' - if url is None: - url = sys.argv[-1] rq = Request( - host + '/nytimes', + host + '/' + category, data=json.dumps({"url": url}), headers={'User-Agent': 'calibre', 'Content-Type': 'application/json'} ) @@ -204,6 +201,13 @@ def download_url(url=None, br=None): return br.open_novisit(rq, timeout=3 * 60).read() +def download_url(url=None, br=None): + # Get the URL from the Wayback machine + if url is None: + url = sys.argv[-1] + return download_url_from_wayback('nytimes', url, br) + + if __name__ == '__main__': f = sys.argv[-1] raw = open(f).read()
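Note: the fetch_url() change above short-circuits data: URLs instead of trying
to fetch them over the network. A minimal illustration of the decoding branch,
not part of the patch; it assumes the base64 form of RFC 2397 data URLs, since
a non-base64 (percent-encoded) data URL would need urllib.parse.unquote instead:

    from base64 import standard_b64decode

    def decode_data_url(url):
        # 'data:<mediatype>;base64,<payload>' -> the raw payload bytes
        payload = url.partition(',')[2]
        return standard_b64decode(payload)

    assert decode_data_url('data:text/plain;base64,aGVsbG8=') == b'hello'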