Skip to content

Commit

Permalink
Merge commit 'b2b885117b2d176a8f2f8d79f69138daa743433b' of https://gi…
Browse files Browse the repository at this point in the history
  • Loading branch information
KaiHuaDou committed Oct 7, 2023
2 parents ee8900e + b2b8851 commit e3d417c
Show file tree
Hide file tree
Showing 63 changed files with 1,146 additions and 344 deletions.
50 changes: 50 additions & 0 deletions Changelog.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,56 @@
# - title by author
# }}}

{{{ 6.28.0 2023-10-06

:: new features

- Edit book: Show full path to book being edited in the status bar

- Edit book: When adding dictionaries, allow directly adding a LibreOffice dictionary just by choosing the language

- Edit book: When saving a copy add some convenience actions to edit the copy immediately either in the current editor window or a new window

- E-book viewer: Highlights panel: Allow right clicking to export only selected highlights

:: bug fixes

- [2034900] Edit book: Fix a regression in the previous release that caused Text search to sometimes not select matches correctly

- [2037198] Edit book: When copying files do it in order so that the files are pasted in the same order when pasting into another editor instance

- Edit book: Fix smart tag insert not working correctly if the selected text starts with the closing angle bracket of a tag

- [2038238] Save to disk: Do not limit the total path length to 240 characters on non-Windows platforms

- [2037898] Fix incorrect cover for AZW3 version of calibre User Manual

- [2029723] Content server: Change formatting of book details to match new de-emphasized titles based formatting in the main calibre program

- PDF metadata: Fix a regression that broke updating metadata in PDF files without an /Info dictionary

- NOOK driver: For the Glowlight 2023 write the calibre metadata files into the NOOK sub-folder as the firmware does not allow writing files into the root folder

- NOOK driver: Fix Glowlight 2023 not being detected on Linux and macOS

- [2037454] E-book viewer: Make CFI calculation more robust especially on pages with very little content

- [2037543] E-book viewer: Workaround bug in Chromium where getBoundingClientRect() fails sometimes leading to incorrect calculation of anchor positions

- [2037237] Fix errors caused by .DS_Store files inserted into the .caltrash directory on macOS if the user happens to open .caltrash in Finder

:: improved recipes
- National Geographic
- Bloomberg
- Engadget
- Times of India
- Horizons

:: new recipes
- Business Standard Print Edition by unkn0wn

}}}

{{{ 6.27.0 2023-09-22

:: new features
Expand Down
16 changes: 9 additions & 7 deletions recipes/bloomberg-business-week.recipe
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ def get_contents(x):
if otype == 'text':
if 'attributes' in x:
if 'strong' in x['attributes']:
return '<b>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</b>'
return '<strong>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</strong>'
if 'emphasis' in x['attributes']:
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
return x.get('value', '') + ''.join(map(get_contents, x.get('content', '')))
return '<em>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</em>'
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
return x.get('value', '') + ''.join(map(get_contents, x.get('content', '')))
elif otype == 'br':
return '<br>'
Expand All @@ -30,8 +30,8 @@ def get_contents(x):
return '<blockquote class="col">' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</blockquote>'
elif otype == 'media':
if x['subType'] == 'photo':
return '<div><div class="img"><img src="{}"></div><div class="cap">{}</div></div>'.format(
x['data']['photo']['src'], x['data']['photo']['caption'])
return '<div><div class="img"><img src="{}"></div><div class="cap">{}<div>{}</div></div></div>'.format(
x['data']['photo']['src'], x['data']['photo']['caption'], x['data']['photo']['credit'])
elif x['subType'] == 'chart':
if x['data'] and x['data']['chart']:
return '<div class="img"><img src="{}"></div>'.format(x['data']['chart']['fallback'])
Expand All @@ -50,6 +50,8 @@ def get_contents(x):
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
elif otype in {'div', 'callout'}:
return '<div>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</div>'
elif not any(x == otype for x in ['', 'ad', 'inline-newsletter', 'tabularData']):
if any(b in x for b in ['value', 'content']):
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
Expand Down Expand Up @@ -77,7 +79,7 @@ class Bloomberg(BasicNewsRecipe):
.auth {font-size:small; font-weight:bold;}
.time, .chart, .css--lede-byline, .css--lede-timestamp {font-size:small;}
.subhead {font-style:italic; color:#404040;}
i, .col {color:#202020;}
em, .col {color:#202020;}
.cat {font-size:small; color:gray;}
.news-figure-caption-text, .cap, .img, .css--caption-outer-wrapper {font-size:small; text-align:center;}
.news-figure-credit {font-size:small; text-align:center; color:#202020;}
Expand Down Expand Up @@ -179,7 +181,7 @@ class Bloomberg(BasicNewsRecipe):
body_data = data['body']['content']
for x in body_data:
body += get_contents(x)
pause = random.choice((3, 4, 5, 6))
pause = random.choice((5, 6, 7, 8, 9))
self.log('Delay: ', pause, ' seconds')
time.sleep(pause)
return '<html><body>' + cat + title + subhead + auth + lede + caption + '<div>' + body + '</div></body></html>'
Expand Down
16 changes: 9 additions & 7 deletions recipes/bloomberg.recipe
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ def get_contents(x):
if otype == 'text':
if 'attributes' in x:
if 'strong' in x['attributes']:
return '<b>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</b>'
return '<strong>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</strong>'
if 'emphasis' in x['attributes']:
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
return x.get('value', '') + ''.join(map(get_contents, x.get('content', '')))
return '<em>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</em>'
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
return x.get('value', '') + ''.join(map(get_contents, x.get('content', '')))
elif otype == 'br':
return '<br>'
Expand All @@ -31,8 +31,8 @@ def get_contents(x):
return '<blockquote class="col">' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</blockquote>'
elif otype == 'media':
if x['subType'] == 'photo':
return '<div><div class="img"><img src="{}"></div><div class="cap">{}</div></div>'.format(
x['data']['photo']['src'], x['data']['photo']['caption'])
return '<div><div class="img"><img src="{}"></div><div class="cap">{}<div>{}</div></div></div>'.format(
x['data']['photo']['src'], x['data']['photo']['caption'], x['data']['photo']['credit'])
elif x['subType'] == 'chart':
if x['data'] and x['data']['chart']:
return '<div class="img"><img src="{}"></div>'.format(x['data']['chart']['fallback'])
Expand All @@ -51,6 +51,8 @@ def get_contents(x):
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
elif otype in {'div', 'callout'}:
return '<div>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</div>'
elif not any(x == otype for x in ['', 'ad', 'inline-newsletter', 'tabularData']):
if any(b in x for b in ['value', 'content']):
return '<i>' + x.get('value', '') + ''.join(map(get_contents, x.get('content', ''))) + '</i>'
Expand All @@ -76,7 +78,7 @@ class Bloomberg(BasicNewsRecipe):
.auth {font-size:small; font-weight:bold;}
.time, .chart {font-size:small;}
.subhead {font-style:italic; color:#404040;}
i, .col {color:#202020;}
em, .col {color:#202020;}
.cat {font-size:small; color:gray;}
.news-figure-caption-text, .cap, .img {font-size:small; text-align:center;}
.news-figure-credit {font-size:small; text-align:center; color:#202020;}
Expand Down Expand Up @@ -177,7 +179,7 @@ class Bloomberg(BasicNewsRecipe):
body_data = data['body']['content']
for x in body_data:
body += get_contents(x)
pause = random.choice((3, 4, 5, 6))
pause = random.choice((5, 6, 7, 8, 9))
self.log('Delay: ', pause, ' seconds')
time.sleep(pause)
return '<html><body>' + cat + title + subhead + auth + lede + caption + '<div>' + body + '</div></body></html>'
Expand Down
117 changes: 117 additions & 0 deletions recipes/business_standard_print.recipe
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from calibre.web.feeds.news import BasicNewsRecipe
from html5_parser import parse
from datetime import datetime
import json

# Edition date as a DD-MM-YYYY string; this exact string is appended to the
# API query in parse_index().
today = datetime.today().strftime('%d-%m-%Y')

# To fetch a specific past edition, override the date string here:
# today = '20-09-2023'

# Derive the datetime (midnight of that day) and its numeric components from
# the string, so that a manual override above is reflected everywhere.
dt = datetime.strptime(today, '%d-%m-%Y')
day, month, year = dt.day, dt.month, dt.year

class BusinessStandardPrint(BasicNewsRecipe):
    """Recipe for the Business Standard "Today's Paper" print edition.

    The section/article index is read from the site's JSON API, and every
    article is rebuilt from the ``__NEXT_DATA__`` JSON blob embedded in its
    page instead of from the rendered markup.
    """

    title = 'Business Standard Print Edition'
    __author__ = 'unkn0wn'
    description = "India's most respected business daily, Articles from Today's Paper"
    language = 'en_IN'
    masthead_url = 'https://bsmedia.business-standard.com/include/_mod/site/html5/images/business-standard-logo.png'
    encoding = 'utf-8'
    timefmt = ' [' + dt.strftime('%b %d, %Y') + ']'

    no_stylesheets = True
    remove_javascript = True
    remove_attributes = ['width', 'height', 'style']

    ignore_duplicate_articles = {'title', 'url'}
    remove_empty_feeds = True
    resolve_internal_links = True

    extra_css = '''
        img {display:block; margin:0 auto;}
        .auth, .cat { font-size:small; color:#202020; }
        .cap { font-size:small; text-align:center; }
    '''

    def __init__(self, *args, **kwargs):
        """Initialise the recipe; on Kindle profiles put the date in the title."""
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        if self.output_profile.short_name.startswith('kindle'):
            self.title = 'Business Standard ' + dt.strftime('%b %d, %Y')

    def get_browser(self):
        """Return the base-class browser configured with a fixed User-Agent."""
        return BasicNewsRecipe.get_browser(self, user_agent='common_words/based')

    def parse_index(self):
        """Build the feed list for the edition dated ``today``.

        Returns:
            A list of ``(section_name, articles)`` tuples, where each article
            is a dict with ``title``, ``description`` and ``url`` keys.
            Sections without articles are omitted.
        """
        # datetime.weekday() counts Monday as 0, so 6 is Sunday.
        if dt.weekday() == 6:
            self.log.warn(
                'Business Standard Does Not Have A Print Publication On Sunday. The Reports'
                ' And Columns On This Page Today Appeared In The Newspaper\'s Saturday Edition.'
            )
        url = 'https://apibs.business-standard.com/category/today-paper?sortBy=' + today
        raw = self.index_to_soup(url, raw=True)
        data = json.loads(raw)['data']

        feeds = []
        for section, entries in data.items():
            # This key carries the e-paper front page image rather than a
            # list of articles; use it as the cover.
            if section == 'EpaperImage':
                self.cover_url = entries['url']
                continue
            self.log(section)
            articles = []
            for article in entries:
                title = article['heading1']
                desc = article['sub_heading']
                url = 'https://www.business-standard.com' + article['article_url']
                self.log('\t', title, '\n\t', desc, '\n\t\t', url)
                articles.append({'title': title, 'description': desc, 'url': url})
            if articles:
                feeds.append((section, articles))
        return feeds

    def preprocess_raw_html(self, raw, *a):
        """Rebuild an article page as minimal HTML from its embedded JSON.

        Args:
            raw: The downloaded article page markup.
            *a: Extra positional arguments passed by the caller; unused.

        Returns:
            An HTML document string containing, in order, the category,
            title, sub-heading, byline, lead image with caption, and body.

        Raises:
            IndexError: If the page has no ``__NEXT_DATA__`` script element.
            KeyError: If a required field is missing from the embedded JSON.
        """
        root = parse(raw)
        m = root.xpath('//script[@id="__NEXT_DATA__"]')
        page_props = json.loads(m[0].text)['props']['pageProps']

        # Image URL from the article schema; when present it is preferred
        # over the one in featuredImageObj below.
        img_url = page_props['articleSchema'].get('articleImageUrl')
        art_url = 'https://www.business-standard.com' + page_props['url']

        data = page_props['data']

        title = '<h1 title="{}">'.format(art_url) + data['pageTitle'] + '</h1>'

        cat = subhead = auth = figure = ''

        category = data.get('defaultArticleCat') or {}
        if category.get('h1_tag') is not None:
            cat = '<div><p class="cat">' + category['h1_tag'] + '</p></div>'

        if data.get('metaDescription') is not None:
            subhead = '<h3>' + data['metaDescription'] + '</h3>'
            self.art_desc = data['metaDescription']

        date = (datetime.fromtimestamp(int(data['publishDate']))).strftime('%b %d, %Y | %I:%M %p')

        if 'multiple_authors_name' in data:
            # Skip missing or empty parts so an absent place name neither
            # raises nor leaves a dangling separator in the byline.
            byline = (data['multiple_authors_name'], data.get('placeName'), date)
            auth = '<div><p class="auth">' + ' | '.join(part for part in byline if part) + '</p></div>'

        # The image and its caption share one <p class="cap"> element; build
        # them together so the tag is always opened and closed exactly once,
        # whether or not alt text is available.
        image = data.get('featuredImageObj') or {}
        if 'url' in image:
            src = img_url if img_url is not None else image['url']
            figure = '<p class="cap"><img src="{}">'.format(src)
            if image.get('alt_text'):
                figure += '<span>' + image['alt_text'] + '</span>'
            figure += '</p>'

        body = data['htmlContent']

        return '<html><body>' + cat + title + subhead + auth + figure + '<div><p></p>' + body + '</div></body></html>'
Loading

0 comments on commit e3d417c

Please sign in to comment.