From 591358c5a54919b7b8fc0819679c11ffe1d3472e Mon Sep 17 00:00:00 2001 From: MohamedBassem Date: Tue, 5 Mar 2024 00:33:59 +0000 Subject: [PATCH] fix: Use puppeteer adblocker to block cookies notices --- packages/workers/crawler.ts | 6 ++ packages/workers/package.json | 1 + pnpm-lock.yaml | 113 ++++++++++++++++++++++++++++++++++ 3 files changed, 120 insertions(+) diff --git a/packages/workers/crawler.ts b/packages/workers/crawler.ts index f1ee07f3..fbbee730 100644 --- a/packages/workers/crawler.ts +++ b/packages/workers/crawler.ts @@ -18,6 +18,7 @@ import { db } from "@hoarder/db"; import { Browser } from "puppeteer"; import puppeteer from "puppeteer-extra"; import StealthPlugin from "puppeteer-extra-plugin-stealth"; +import AdblockerPlugin from "puppeteer-extra-plugin-adblocker"; import metascraper from "metascraper"; @@ -70,6 +71,11 @@ async function launchBrowser() { export class CrawlerWorker { static async build() { puppeteer.use(StealthPlugin()); + puppeteer.use( + AdblockerPlugin({ + blockTrackersAndAnnoyances: true, + }), + ); await launchBrowser(); logger.info("Starting crawler worker ..."); diff --git a/packages/workers/package.json b/packages/workers/package.json index a7b62462..f2fc164c 100644 --- a/packages/workers/package.json +++ b/packages/workers/package.json @@ -26,6 +26,7 @@ "openai": "^4.26.1", "puppeteer": "^22.0.0", "puppeteer-extra": "^3.3.6", + "puppeteer-extra-plugin-adblocker": "^2.13.6", "puppeteer-extra-plugin-stealth": "^2.11.2", "tsx": "^4.7.1", "typescript": "^5", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index ea3891da..4f7a22a6 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -420,6 +420,9 @@ importers: puppeteer-extra: specifier: ^3.3.6 version: 3.3.6(puppeteer@22.3.0) + puppeteer-extra-plugin-adblocker: + specifier: ^2.13.6 + version: 2.13.6(puppeteer-extra@3.3.6)(puppeteer@22.3.0) puppeteer-extra-plugin-stealth: specifier: ^2.11.2 version: 2.11.2(puppeteer-extra@3.3.6) @@ -1721,6 +1724,40 @@ packages: '@babel/helper-validator-identifier': 7.22.20 to-fast-properties: 2.0.0 + /@cliqz/adblocker-content@1.26.16: + resolution: {integrity: sha512-N1pKg1gxfpnz47w2Sjs2sg3fxFZb113ClUhitgAFSVXeIhZ+S+bCaQtvwtP0mJT+SDfUx2NsPiLwZoPjVRI3wQ==} + dependencies: + '@cliqz/adblocker-extended-selectors': 1.26.16 + dev: false + + /@cliqz/adblocker-extended-selectors@1.26.16: + resolution: {integrity: sha512-ePXS3aD1R+0XfCnOj0L2ms0NA5AxKHfFLfw92cZ87IPY8ZEZK/sWwQCv5wawbwBmXksr0YkMfFVCiH/IQgUNBQ==} + dev: false + + /@cliqz/adblocker-puppeteer@1.23.8(puppeteer@22.3.0): + resolution: {integrity: sha512-Ca1/DBqQXsOpKTFVAHX6OpLTSEupXmUkUWHj6iXhLLleC7RPISN5B0b801VDmaGRqoC5zKRxn0vYbIfpgCWVug==} + peerDependencies: + puppeteer: '>5' + dependencies: + '@cliqz/adblocker': 1.26.16 + '@cliqz/adblocker-content': 1.26.16 + puppeteer: 22.3.0(typescript@5.3.3) + tldts-experimental: 5.7.112 + dev: false + + /@cliqz/adblocker@1.26.16: + resolution: {integrity: sha512-NQ5WdNeiWiggDhhT/IXbsjKgH44nA9k5GlW00gUWRUpfKHCCInyDJYjM5pbHqxhgC3LkMVmXmU5vIsMUZ4RxFQ==} + dependencies: + '@cliqz/adblocker-content': 1.26.16 + '@cliqz/adblocker-extended-selectors': 1.26.16 + '@remusao/guess-url-type': 1.2.1 + '@remusao/small': 1.2.1 + '@remusao/smaz': 1.9.1 + '@types/chrome': 0.0.260 + '@types/firefox-webext-browser': 120.0.1 + tldts-experimental: 6.1.11 + dev: false + /@colors/colors@1.6.0: resolution: {integrity: sha512-Ir+AOibqzrIsL6ajt3Rz3LskB7OiMVHqltZmspbW/TJuTVuyOMirVqAkjfY6JISiLHgyNqicAC8AyHHGzNd/dA==} engines: {node: '>=0.1.90'} @@ -3375,6 +3412,35 @@ packages: engines: {node: '>=14.0.0'} dev: false + /@remusao/guess-url-type@1.2.1: + resolution: {integrity: sha512-rbOqre2jW8STjheOsOaQHLgYBaBZ9Owbdt8NO7WvNZftJlaG3y/K9oOkl8ZUpuFBisIhmBuMEW6c+YrQl5inRA==} + dev: false + + /@remusao/small@1.2.1: + resolution: {integrity: sha512-7MjoGt0TJMVw1GPKgWq6SJPws1SLsUXQRa43Umht+nkyw2jnpy3WpiLNqGdwo5rHr5Wp9B2W/Pm5RQp656UJdw==} + dev: false + + /@remusao/smaz-compress@1.9.1: + resolution: {integrity: sha512-E2f48TwloQu3r6BdLOGF2aczeH7bJ/32oJGqvzT9SKur0cuUnLcZ7ZXP874E2fwmdE+cXzfC7bKzp79cDnmeyw==} + dependencies: + '@remusao/trie': 1.4.1 + dev: false + + /@remusao/smaz-decompress@1.9.1: + resolution: {integrity: sha512-TfjKKprYe3n47od8auhvJ/Ikj9kQTbDTe71ynKlxslrvvUhlIV3VQSuwYuMWMbdz1fIs0H/fxCN1Z8/H3km6/A==} + dev: false + + /@remusao/smaz@1.9.1: + resolution: {integrity: sha512-e6BLuP8oaXCZ9+v46Is4ilAZ/Vq6YLgmBP204Ixgk1qTjXmqvFYG7+AS7v9nsZdGOy96r9DWGFbbDVgMxwu1rA==} + dependencies: + '@remusao/smaz-compress': 1.9.1 + '@remusao/smaz-decompress': 1.9.1 + dev: false + + /@remusao/trie@1.4.1: + resolution: {integrity: sha512-yvwa+aCyYI/UjeD39BnpMypG8N06l86wIDW1/PAc6ihBRnodIfZDwccxQN3n1t74wduzaz74m4ZMHZnB06567Q==} + dev: false + /@rollup/plugin-babel@5.3.1(@babel/core@7.23.9)(rollup@2.79.1): resolution: {integrity: sha512-WFfdLWU/xVWKeRQnKmIAQULUI7Il0gZnBIH/ZFO069wYIfPu+8zrfp/KMW0atmELoRDq8FbiP3VCss9MhCut7Q==} engines: {node: '>= 10.0.0'} @@ -3946,6 +4012,10 @@ packages: resolution: {integrity: sha512-xFU8ZXTw4gd358lb2jw25nxY9QAgqn2+bKKjKOYfNCzN4DKCFetK7sPtrlpg66Ywe3vWY9FNxprZawAh9wfJ3g==} dev: false + /@types/firefox-webext-browser@120.0.1: + resolution: {integrity: sha512-IR+NpPC+/o9TSTelcvT/w3fXTanX3LrpVxC5EQrlQyTjyWOKFz8O2mCJQ9VuejBz4NtovCGGKacXQ/VyY63L0A==} + dev: false + /@types/glob@7.2.0: resolution: {integrity: sha512-ZUxbzKl0IfJILTS6t7ip5fQQM/J3TJYubDm3nMbgubNNYS62eXeUpoLUC8/7fJNiFYHTrGPQn7hspDUzIHX3UA==} dependencies: @@ -9651,6 +9721,33 @@ packages: - utf-8-validate dev: false + /puppeteer-extra-plugin-adblocker@2.13.6(puppeteer-extra@3.3.6)(puppeteer@22.3.0): + resolution: {integrity: sha512-AftgnUZ1rg2RPe9RpX6rkYAxEohwp3iFeGIyjsAuTaIiw4VLZqOb1LSY8/S60vAxpeat60fbCajxoUetmLy4Dw==} + engines: {node: '>=8'} + peerDependencies: + puppeteer: '*' + puppeteer-core: '*' + puppeteer-extra: '*' + peerDependenciesMeta: + puppeteer: + optional: true + puppeteer-core: + optional: true + puppeteer-extra: + optional: true + dependencies: + '@cliqz/adblocker-puppeteer': 1.23.8(puppeteer@22.3.0) + debug: 4.3.4 + node-fetch: 2.7.0 + puppeteer: 22.3.0(typescript@5.3.3) + puppeteer-extra: 3.3.6(puppeteer@22.3.0) + puppeteer-extra-plugin: 3.2.3(puppeteer-extra@3.3.6) + transitivePeerDependencies: + - encoding + - playwright-extra + - supports-color + dev: false + /puppeteer-extra-plugin-stealth@2.11.2(puppeteer-extra@3.3.6): resolution: {integrity: sha512-bUemM5XmTj9i2ZerBzsk2AN5is0wHMNE6K0hXBzBXOzP5m5G3Wl0RHhiqKeHToe/uIH8AoZiGhc1tCkLZQPKTQ==} engines: {node: '>=8'} @@ -10883,10 +10980,26 @@ packages: hasBin: true dev: false + /tldts-core@5.7.112: + resolution: {integrity: sha512-mutrEUgG2sp0e/MIAnv9TbSLR0IPbvmAImpzqul5O/HJ2XM1/I1sajchQ/fbj0fPdA31IiuWde8EUhfwyldY1Q==} + dev: false + /tldts-core@6.1.11: resolution: {integrity: sha512-ZFcT+/fdEc5VRndQIJtArNBHsaq4udRoeE4E6cwLzGaH0dq7Ng2L7cAoea6riM2uhNFD09EDa1bN8lrfrOBCLg==} dev: false + /tldts-experimental@5.7.112: + resolution: {integrity: sha512-Nq5qWN4OiLziAOOOEoSME7cZI4Hz8Srt+9q6cl8mZ5EAhCfmeE6l7K5XjuIKN+pySuGUvthE5aPiD185YU1/lg==} + dependencies: + tldts-core: 5.7.112 + dev: false + + /tldts-experimental@6.1.11: + resolution: {integrity: sha512-4Ij/BzPUYS33PcAo9cprPm8qmKNBeYw2U7WsBAMtseqbQvCIyDsnXlOWy/SKmldalPdMPsL2CLjt27+KlWBH7g==} + dependencies: + tldts-core: 6.1.11 + dev: false + /tldts@6.1.11: resolution: {integrity: sha512-AAgE/IWvbsg4Lr4KGFNR7bL/MhQfBlgGV9UBg2uy5mCwSGi5f12eZ7ZydAqv4ACys6pUYjNoV2qfZdcCn4RS+Q==} hasBin: true