Skip to content

Commit

Permalink
simplify and don't use side effects
Browse files Browse the repository at this point in the history
  • Loading branch information
rleed committed Oct 20, 2023
1 parent 193ec48 commit 8b1edd2
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 67 deletions.
5 changes: 2 additions & 3 deletions api/resolvers/item.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import { ensureProtocol, removeTracking } from '../../lib/url'
import { serializeInvoicable } from './serial'
import { decodeCursor, LIMIT, nextCursorEncoded } from '../../lib/cursor'
import { getMetadata, metadataRuleSets } from 'page-metadata-parser'
import { ruleSet as publicationDateRuleSet } from '../../lib/timedate-scraper'
import domino from 'domino'
import {
ITEM_SPAM_INTERVAL, ITEM_FILTER_THRESHOLD,
Expand All @@ -17,7 +18,6 @@ import { advSchema, amountSchema, bountySchema, commentSchema, discussionSchema,
import { sendUserNotification } from '../webPush'
import { defaultCommentSort } from '../../lib/item'
import { notifyItemParents, notifyUserSubscribers, notifyZapped } from '../../lib/push-notifications'
import { initDateRule } from '../../lib/timedate-scraper'
import { datePivot } from '../../lib/time'

export async function commentFilterClause (me, models) {
Expand Down Expand Up @@ -542,8 +542,7 @@ export default {
const response = await fetch(ensureProtocol(url), { redirect: 'follow' })
const html = await response.text()
const doc = domino.createWindow(html).document
initDateRule()
const metadata = getMetadata(doc, url, { title: metadataRuleSets.title, publicationDate: metadataRuleSets.publicationDate })
const metadata = getMetadata(doc, url, { title: metadataRuleSets.title, publicationDate: publicationDateRuleSet })
const dateHint = ` (${metadata.publicationDate?.getFullYear()})`
const moreThanOneYearAgo = metadata.publicationDate && metadata.publicationDate < datePivot(new Date(), { years: -1 })

Expand Down
119 changes: 55 additions & 64 deletions lib/timedate-scraper.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import { metadataRuleSets } from 'page-metadata-parser'

// Module to extend page-metadata-parser with date rules.
// Date rule for use with page-metadata-parser.
// Based on https://github.com/Webhose/article-date-extractor/blob/master/articleDateExtractor/__init__.py
// Usage: import and call initDateRule() before doing a metadata query for publicationDate.
// Usage: import ruleSet and use in a call similar to: getMetadata(doc, url, { publicationDate: ruleSet })
// Some example URLs for testing purposes:

// ld+json example from 2018:
Expand Down Expand Up @@ -31,67 +29,60 @@ export function extractFromURL (url) {
}
}

let ruleSet = null
// Convert a date-like value into a Date object.
// Yields undefined when the input is falsy, when Date construction throws,
// or when the resulting Date is invalid (NaN time value).
function asDate (str) {
  if (!str) return undefined
  try {
    const parsed = new Date(str)
    // An unparseable input produces an "Invalid Date", which coerces to NaN.
    return isNaN(parsed) ? undefined : parsed
  } catch {
    // Best effort: a value the Date constructor rejects is treated as "no date".
    return undefined
  }
}

export function initDateRule () {
if (!ruleSet) {
console.log('Building ruleset for article publication dates')
function asDate (str) {
if (str) {
try {
const d = new Date(str)
if (!isNaN(d)) return d
} catch { }
}
}
ruleSet = {
// note meta names are case sensitive, and scorers must not favor rules when they will not return good results.
rules: [
['script[type="application/ld+json"]', node => asDate(JSON.parse(node.innerHTML)?.datePublished)],
['script[type="application/ld+json"]', node => asDate(JSON.parse(node.innerHTML)?.dateCreated)],
['script[type="application/ld+json"]', node => asDate(JSON.parse(node.innerHTML)?.dateModified)],
export const ruleSet = {
// note meta names are case sensitive, and scorers must not favor rules when they will not return good results.
rules: [
['script[type="application/ld+json"]', node => asDate(JSON.parse(node.innerHTML)?.datePublished)],
['script[type="application/ld+json"]', node => asDate(JSON.parse(node.innerHTML)?.dateCreated)],
['script[type="application/ld+json"]', node => asDate(JSON.parse(node.innerHTML)?.dateModified)],

['meta[property="article:published_time"]', node => asDate(node.getAttribute('content'))],
['meta[name="pubdate"]', node => asDate(node.getAttribute('content'))],
['meta[name="publishdate"]', node => asDate(node.getAttribute('content'))],
['meta[name="timestamp"]', node => asDate(node.getAttribute('content'))],
['meta[name="dc.date.issued"]', node => asDate(node.getAttribute('content'))],
['meta[name="date"]', node => asDate(node.getAttribute('content'))],
['meta[property="bt:pubdate"]', node => asDate(node.getAttribute('content'))],
['meta[name="parsely-pub-date"]', node => asDate(node.getAttribute('content'))],
['meta[name="sailthru.date"]', node => asDate(node.getAttribute('content'))],
['meta[name="article.published"]', node => asDate(node.getAttribute('content'))],
['meta[name="published-date"]', node => asDate(node.getAttribute('content'))],
['meta[name="article.created"]', node => asDate(node.getAttribute('content'))],
['meta[name="article_date_original"]', node => asDate(node.getAttribute('content'))],
['meta[name="cxenseparse:recs:publishtime"]', node => asDate(node.getAttribute('content'))],
['meta[name="date_published"]', node => asDate(node.getAttribute('content'))],
['meta[itemprop="datePublished"]', node => asDate(node.getAttribute('content'))],
['meta[itemprop="datepublished"]', node => asDate(node.getAttribute('content'))],
['meta[itemprop="datecreated"]', node => asDate(node.getAttribute('content'))],
['meta[http-equiv="date"]', node => asDate(node.getAttribute('content'))],
['meta[property="og:image"]', node => asDate(extractFromURL(node.getAttribute('content')))],
['meta[itemprop="image"]', node => asDate(extractFromURL(node.getAttribute('content')))],
['meta[property="article:published_time"]', node => asDate(node.getAttribute('content'))],
['meta[name="pubdate"]', node => asDate(node.getAttribute('content'))],
['meta[name="publishdate"]', node => asDate(node.getAttribute('content'))],
['meta[name="timestamp"]', node => asDate(node.getAttribute('content'))],
['meta[name="dc.date.issued"]', node => asDate(node.getAttribute('content'))],
['meta[name="date"]', node => asDate(node.getAttribute('content'))],
['meta[property="bt:pubdate"]', node => asDate(node.getAttribute('content'))],
['meta[name="parsely-pub-date"]', node => asDate(node.getAttribute('content'))],
['meta[name="sailthru.date"]', node => asDate(node.getAttribute('content'))],
['meta[name="article.published"]', node => asDate(node.getAttribute('content'))],
['meta[name="published-date"]', node => asDate(node.getAttribute('content'))],
['meta[name="article.created"]', node => asDate(node.getAttribute('content'))],
['meta[name="article_date_original"]', node => asDate(node.getAttribute('content'))],
['meta[name="cxenseparse:recs:publishtime"]', node => asDate(node.getAttribute('content'))],
['meta[name="date_published"]', node => asDate(node.getAttribute('content'))],
['meta[itemprop="datePublished"]', node => asDate(node.getAttribute('content'))],
['meta[itemprop="datepublished"]', node => asDate(node.getAttribute('content'))],
['meta[itemprop="datecreated"]', node => asDate(node.getAttribute('content'))],
['meta[http-equiv="date"]', node => asDate(node.getAttribute('content'))],
['meta[property="og:image"]', node => asDate(extractFromURL(node.getAttribute('content')))],
['meta[itemprop="image"]', node => asDate(extractFromURL(node.getAttribute('content')))],

['time', node => asDate(node.getAttribute('datetime') || (node.getAttribute('class') === 'timestamp' && node.innerHTML))],
['span[itemprop="datePublished"]', node => asDate(node.getAttribute('content') || cleanDateStr(node.innerHTML))],
...['span', 'p', 'div'].map(tag => {
return ['pubdate', 'timestamp', 'article_date', 'articledate', 'date'].map(className => {
return [`${tag}[class="${className}"]`, node => asDate(cleanDateStr(node.innerHTML))]
})
}).flat()
],
scorers: [
(el, score) => {
if (el.localName === 'script' && el.getAttribute('type') === 'application/ld+json' && el.innerHTML) {
const data = JSON.parse(el.innerHTML)
return data?.datePublished || data?.dateCreated || data?.dateModified ? 1000000 + score : 0
}
},
(el, score) => el.localName === 'meta' && el.getAttribute('content') && cleanDateStr(el.getAttribute('content')) ? 1000 + score : 0,
(el, score) => !['script', 'meta'].includes(el.localName) ? score : 0
]
}
metadataRuleSets.publicationDate = ruleSet
}
['time', node => asDate(node.getAttribute('datetime') || (node.getAttribute('class') === 'timestamp' && node.innerHTML))],
['span[itemprop="datePublished"]', node => asDate(node.getAttribute('content') || cleanDateStr(node.innerHTML))],
...['span', 'p', 'div'].map(tag => {
return ['pubdate', 'timestamp', 'article_date', 'articledate', 'date'].map(className => {
return [`${tag}[class="${className}"]`, node => asDate(cleanDateStr(node.innerHTML))]
})
}).flat()
],
scorers: [
(el, score) => {
if (el.localName === 'script' && el.getAttribute('type') === 'application/ld+json' && el.innerHTML) {
const data = JSON.parse(el.innerHTML)
return data?.datePublished || data?.dateCreated || data?.dateModified ? 1000000 + score : 0
}
},
(el, score) => el.localName === 'meta' && el.getAttribute('content') && cleanDateStr(el.getAttribute('content')) ? 1000 + score : 0,
(el, score) => !['script', 'meta'].includes(el.localName) ? score : 0
]
}

0 comments on commit 8b1edd2

Please sign in to comment.