From 8b1edd25dbc4ba1fc4f8917449c912f3ef753e20 Mon Sep 17 00:00:00 2001 From: rleed Date: Wed, 18 Oct 2023 11:02:12 -0300 Subject: [PATCH] simplify and don't use side effects --- api/resolvers/item.js | 5 +- lib/timedate-scraper.js | 119 +++++++++++++++++++--------------------- 2 files changed, 57 insertions(+), 67 deletions(-) diff --git a/api/resolvers/item.js b/api/resolvers/item.js index fa5510cbd..b62e5c4fc 100644 --- a/api/resolvers/item.js +++ b/api/resolvers/item.js @@ -3,6 +3,7 @@ import { ensureProtocol, removeTracking } from '../../lib/url' import { serializeInvoicable } from './serial' import { decodeCursor, LIMIT, nextCursorEncoded } from '../../lib/cursor' import { getMetadata, metadataRuleSets } from 'page-metadata-parser' +import { ruleSet as publicationDateRuleSet } from '../../lib/timedate-scraper' import domino from 'domino' import { ITEM_SPAM_INTERVAL, ITEM_FILTER_THRESHOLD, @@ -17,7 +18,6 @@ import { advSchema, amountSchema, bountySchema, commentSchema, discussionSchema, import { sendUserNotification } from '../webPush' import { defaultCommentSort } from '../../lib/item' import { notifyItemParents, notifyUserSubscribers, notifyZapped } from '../../lib/push-notifications' -import { initDateRule } from '../../lib/timedate-scraper' import { datePivot } from '../../lib/time' export async function commentFilterClause (me, models) { @@ -542,8 +542,7 @@ export default { const response = await fetch(ensureProtocol(url), { redirect: 'follow' }) const html = await response.text() const doc = domino.createWindow(html).document - initDateRule() - const metadata = getMetadata(doc, url, { title: metadataRuleSets.title, publicationDate: metadataRuleSets.publicationDate }) + const metadata = getMetadata(doc, url, { title: metadataRuleSets.title, publicationDate: publicationDateRuleSet }) const dateHint = ` (${metadata.publicationDate?.getFullYear()})` const moreThanOneYearAgo = metadata.publicationDate && metadata.publicationDate < datePivot(new Date(), { years: 
-1 }) diff --git a/lib/timedate-scraper.js b/lib/timedate-scraper.js index 87eb37f38..0774b05fa 100644 --- a/lib/timedate-scraper.js +++ b/lib/timedate-scraper.js @@ -1,8 +1,6 @@ -import { metadataRuleSets } from 'page-metadata-parser' - -// Module to extend page-metadata-parser with date rules. +// Date rule for use with page-metadata-parser. // Based on https://github.com/Webhose/article-date-extractor/blob/master/articleDateExtractor/__init__.py -// Usage: import and call initDateRule() before doing a metadata query for publicationDate. +// Usage: import ruleSet and use in a call similar to: getMetadata(doc, url, { publicationDate: ruleSet }) // Some example URLs for testing purposes: // ld+json example from 2018: @@ -31,67 +29,60 @@ export function extractFromURL (url) { } } -let ruleSet = null +function asDate (str) { + if (str) { + try { + const d = new Date(str) + if (!isNaN(d)) return d + } catch { } + } +} -export function initDateRule () { - if (!ruleSet) { - console.log('Building ruleset for article publication dates') - function asDate (str) { - if (str) { - try { - const d = new Date(str) - if (!isNaN(d)) return d - } catch { } - } - } - ruleSet = { - // note meta names are case sensitive, and scorers must not favor rules when they will not return good results. - rules: [ - ['script[type="application/ld+json"]', node => asDate(JSON.parse(node.innerHTML)?.datePublished)], - ['script[type="application/ld+json"]', node => asDate(JSON.parse(node.innerHTML)?.dateCreated)], - ['script[type="application/ld+json"]', node => asDate(JSON.parse(node.innerHTML)?.dateModified)], +export const ruleSet = { + // note meta names are case sensitive, and scorers must not favor rules when they will not return good results. 
+ rules: [ + ['script[type="application/ld+json"]', node => asDate(JSON.parse(node.innerHTML)?.datePublished)], + ['script[type="application/ld+json"]', node => asDate(JSON.parse(node.innerHTML)?.dateCreated)], + ['script[type="application/ld+json"]', node => asDate(JSON.parse(node.innerHTML)?.dateModified)], - ['meta[property="article:published_time"]', node => asDate(node.getAttribute('content'))], - ['meta[name="pubdate"]', node => asDate(node.getAttribute('content'))], - ['meta[name="publishdate"]', node => asDate(node.getAttribute('content'))], - ['meta[name="timestamp"]', node => asDate(node.getAttribute('content'))], - ['meta[name="dc.date.issued"]', node => asDate(node.getAttribute('content'))], - ['meta[name="date"]', node => asDate(node.getAttribute('content'))], - ['meta[property="bt:pubdate"]', node => asDate(node.getAttribute('content'))], - ['meta[name="parsely-pub-date"]', node => asDate(node.getAttribute('content'))], - ['meta[name="sailthru.date"]', node => asDate(node.getAttribute('content'))], - ['meta[name="article.published"]', node => asDate(node.getAttribute('content'))], - ['meta[name="published-date"]', node => asDate(node.getAttribute('content'))], - ['meta[name="article.created"]', node => asDate(node.getAttribute('content'))], - ['meta[name="article_date_original"]', node => asDate(node.getAttribute('content'))], - ['meta[name="cxenseparse:recs:publishtime"]', node => asDate(node.getAttribute('content'))], - ['meta[name="date_published"]', node => asDate(node.getAttribute('content'))], - ['meta[itemprop="datePublished"]', node => asDate(node.getAttribute('content'))], - ['meta[itemprop="datepublished"]', node => asDate(node.getAttribute('content'))], - ['meta[itemprop="datecreated"]', node => asDate(node.getAttribute('content'))], - ['meta[http-equiv="date"]', node => asDate(node.getAttribute('content'))], - ['meta[property="og:image"]', node => asDate(extractFromURL(node.getAttribute('content')))], - ['meta[itemprop="image"]', node => 
asDate(extractFromURL(node.getAttribute('content')))], + ['meta[property="article:published_time"]', node => asDate(node.getAttribute('content'))], + ['meta[name="pubdate"]', node => asDate(node.getAttribute('content'))], + ['meta[name="publishdate"]', node => asDate(node.getAttribute('content'))], + ['meta[name="timestamp"]', node => asDate(node.getAttribute('content'))], + ['meta[name="dc.date.issued"]', node => asDate(node.getAttribute('content'))], + ['meta[name="date"]', node => asDate(node.getAttribute('content'))], + ['meta[property="bt:pubdate"]', node => asDate(node.getAttribute('content'))], + ['meta[name="parsely-pub-date"]', node => asDate(node.getAttribute('content'))], + ['meta[name="sailthru.date"]', node => asDate(node.getAttribute('content'))], + ['meta[name="article.published"]', node => asDate(node.getAttribute('content'))], + ['meta[name="published-date"]', node => asDate(node.getAttribute('content'))], + ['meta[name="article.created"]', node => asDate(node.getAttribute('content'))], + ['meta[name="article_date_original"]', node => asDate(node.getAttribute('content'))], + ['meta[name="cxenseparse:recs:publishtime"]', node => asDate(node.getAttribute('content'))], + ['meta[name="date_published"]', node => asDate(node.getAttribute('content'))], + ['meta[itemprop="datePublished"]', node => asDate(node.getAttribute('content'))], + ['meta[itemprop="datepublished"]', node => asDate(node.getAttribute('content'))], + ['meta[itemprop="datecreated"]', node => asDate(node.getAttribute('content'))], + ['meta[http-equiv="date"]', node => asDate(node.getAttribute('content'))], + ['meta[property="og:image"]', node => asDate(extractFromURL(node.getAttribute('content')))], + ['meta[itemprop="image"]', node => asDate(extractFromURL(node.getAttribute('content')))], - ['time', node => asDate(node.getAttribute('datetime') || (node.getAttribute('class') === 'timestamp' && node.innerHTML))], - ['span[itemprop="datePublished"]', node => 
asDate(node.getAttribute('content') || cleanDateStr(node.innerHTML))], - ...['span', 'p', 'div'].map(tag => { - return ['pubdate', 'timestamp', 'article_date', 'articledate', 'date'].map(className => { - return [`${tag}[class="${className}"]`, node => asDate(cleanDateStr(node.innerHTML))] - }) - }).flat() - ], - scorers: [ - (el, score) => { - if (el.localName === 'script' && el.getAttribute('type') === 'application/ld+json' && el.innerHTML) { - const data = JSON.parse(el.innerHTML) - return data?.datePublished || data?.dateCreated || data?.dateModified ? 1000000 + score : 0 - } - }, - (el, score) => el.localName === 'meta' && el.getAttribute('content') && cleanDateStr(el.getAttribute('content')) ? 1000 + score : 0, - (el, score) => !['script', 'meta'].includes(el.localName) ? score : 0 - ] - } - metadataRuleSets.publicationDate = ruleSet - } + ['time', node => asDate(node.getAttribute('datetime') || (node.getAttribute('class') === 'timestamp' && node.innerHTML))], + ['span[itemprop="datePublished"]', node => asDate(node.getAttribute('content') || cleanDateStr(node.innerHTML))], + ...['span', 'p', 'div'].map(tag => { + return ['pubdate', 'timestamp', 'article_date', 'articledate', 'date'].map(className => { + return [`${tag}[class="${className}"]`, node => asDate(cleanDateStr(node.innerHTML))] + }) + }).flat() + ], + scorers: [ + (el, score) => { + if (el.localName === 'script' && el.getAttribute('type') === 'application/ld+json' && el.innerHTML) { + const data = JSON.parse(el.innerHTML) + return data?.datePublished || data?.dateCreated || data?.dateModified ? 1000000 + score : 0 + } + }, + (el, score) => el.localName === 'meta' && el.getAttribute('content') && cleanDateStr(el.getAttribute('content')) ? 1000 + score : 0, + (el, score) => !['script', 'meta'].includes(el.localName) ? score : 0 + ] }