Skip to content

Commit

Permalink
Delete old raw docs by service+api_version (fix #79)
Browse files Browse the repository at this point in the history
See issue for details.

The change in dbqueries.js is not strictly necessary, but I made it anyway for consistency (now ALL grouping is done on `service`+`api_version`, and nothing on `backend` anymore).
  • Loading branch information
christophfriedrich committed Sep 18, 2020
1 parent 6735911 commit cdf8a88
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 3 deletions.
9 changes: 7 additions & 2 deletions crawl.js
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ mongo.connect(async (error, client) => {
response.data.versions
.filter(b => ! b.api_version.startsWith('0.3.')) // the Hub doesn't support openEO API v0.3.x anymore
.forEach(b => individualBackends[b.api_version] = b.url.replace(/\/$/, '')); // URL always without trailing slash
allIndividualBackends = allIndividualBackends.concat(Object.values(individualBackends));
allIndividualBackends = allIndividualBackends.concat(Object.keys(individualBackends).map(version => serviceUrl+'@'+version));
}
catch(error) {
console.log('An error occurred while getting or reading ' + url + ' (' + error.name + ': ' + error.message + ')');
Expand Down Expand Up @@ -204,7 +204,12 @@ mongo.connect(async (error, client) => {

// Delete all entries that don't belong to one of the backends that are listed in the currently configured services' well-known documents
// But exempt those that failed to download. The two conditions are implicitly connected with AND.
await collection.deleteMany({ backend: { $not: { $in: allIndividualBackends }}, service: { $not: { $in: allFailedServices }} });
// See also issue #79, https://stackoverflow.com/q/63937811, and the MongoDB docs for "$expr" and "$in (aggregation)"
// Note that the two "$in" are NOT exactly the same operator (one is from the query language, one from the aggregation framework)
await collection.deleteMany({
$expr: { $not: { $in: [ {$concat:["$service","@","$api_version"]}, allIndividualBackends ] } },
service: { $not: { $in: allFailedServices } }
});

// Increase `unsuccessfulCrawls` counter of items that were not updated in this run
await collection.updateMany({retrieved: {$lt: starttimestamp}}, {$inc: {unsuccessfulCrawls: 1}});
Expand Down
2 changes: 1 addition & 1 deletion src/dbqueries.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ module.exports = {
// But since the endpoints are hardcoded anyway there's no benefit, especially not when considering regex slowness.
{ $sort: { backend: 1, path: 1 } },
{ $group: {
_id: '$backend',
_id: {$concat: ['$service', '@', '$api_version']},
service: { $first: '$service' },
api_version: { $first: '$api_version' },
backend: { $first: '$backend' },
Expand Down

0 comments on commit cdf8a88

Please sign in to comment.