Skip to content

Commit

Permalink
bug fixes and cleanup
Browse files: browse the repository at this point in the history
  • Loading branch information
rleed committed Sep 24, 2023
1 parent b6c0ee1 commit 126570e
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 14 deletions.
1 change: 0 additions & 1 deletion api/resolvers/item.js
Original file line number Diff line number Diff line change
Expand Up @@ -499,7 +499,6 @@ export default {
const doc = domino.createWindow(html).document
const metadata = getMetadata(doc, url, { title: metadataRuleSets.title })
const datedata = extractArticlePublishedDate({ url, doc })
console.log(datedata, (new Date() - datedata.date) / (1000 * 60 * 60 * 24))
const dateHint = (datedata && (new Date() - datedata.date) / (1000 * 60 * 60 * 24) > 365)
? ` (${datedata.date.getFullYear()})`
: ''
Expand Down
32 changes: 19 additions & 13 deletions lib/timedate-scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ exports.extractFromLDJson = function ({ url, doc }) {
// returns { date, source } or undefined
exports.extractFromMeta = function ({ url, doc }) {
const dateRules = {
// note a couple of sad things:
// 1. meta names are case sensitive
// 2. rules stop if selector matches, even when callback returns false
rules: [
['meta[property="article:published_time"]', node => node.getAttribute('content')],
['meta[name="pubdate"]', node => node.getAttribute('content')],
Expand All @@ -63,11 +66,11 @@ exports.extractFromMeta = function ({ url, doc }) {
['meta[name="article_date_original"]', node => node.getAttribute('content')],
['meta[name="cxenseparse:recs:publishtime"]', node => node.getAttribute('content')],
['meta[name="date_published"]', node => node.getAttribute('content')],
['meta[itemprop="datePublished"]', node => node.getAttribute('content')],
['meta[itemprop="datepublished"]', node => node.getAttribute('content')],
['meta[itemprop="datecreated"]', node => node.getAttribute('content')],
['meta[http-equiv="date"]', node => node.getAttribute('content')],

// note: rules stop if selector matches, even when callback returns false
['meta[property="og:image"]', node => exports.extractFromURL(node.getAttribute('content'))],
['meta[itemprop="image"]', node => exports.extractFromURL(node.getAttribute('content'))]
]
Expand All @@ -91,11 +94,11 @@ exports.extractFromHTMLTag = function ({ url, doc }) {
({ date } = getMetadata(doc, url, {
date: {
rules: [
['span[itemprop]="datePublished"]', node => node.getAttribute('content') || exports.parseStrDate(node.innerHTML)]
['span[itemprop="datePublished"]', node => node.getAttribute('content') || exports.parseStrDate(node.innerHTML)]
]
}
}))
if (date) return { date, source: 'span[itemprop]="datePublished"]' }
if (date) return { date, source: 'span[itemprop="datePublished"]' }

for (const tag of ['span', 'p', 'div']) {
for (const className of ['pubdate', 'timestamp', 'article_date', 'articledate', 'date']) {
Expand All @@ -112,22 +115,25 @@ exports.extractFromHTMLTag = function ({ url, doc }) {
// returns { date, source } or undefined
exports.extractArticlePublishedDate = function ({ url, doc }) {
console.log('Extracting date from', url)
let articleDate
let foundDate
try {
articleDate = { date: exports.extractFromURL(url), source: 'url' }
let possibleDate = exports.extractFromLDJson({ url, doc })
// establish a default from the URL if possible
let possibleDate = exports.extractFromURL(url)
if (possibleDate) foundDate = { date: possibleDate, source: 'url' }

// try to get date from various sources in order of precedence
possibleDate = exports.extractFromLDJson({ url, doc })
if (!possibleDate) possibleDate = exports.extractFromMeta({ url, doc })
if (!possibleDate) possibleDate = exports.extractFromHTMLTag({ url, doc })
if (possibleDate) articleDate = possibleDate
if (possibleDate) foundDate = possibleDate
} catch (e) {
console.log('Exception in extractArticlePublishedDate for', url)
console.log(e)
}
if (articleDate) {
if (foundDate) {
try {
const d = new Date(articleDate.date)
if (!isNaN(d)) articleDate.date = d
} catch {}
const d = new Date(foundDate.date)
if (!isNaN(d)) foundDate.date = d
} catch { }
}
return articleDate
return foundDate
}

0 comments on commit 126570e

Please sign in to comment.