diff --git a/package.json b/package.json index 18decb3..2bd655d 100644 --- a/package.json +++ b/package.json @@ -8,13 +8,15 @@ "scripts": { "eslint": "eslint --ext .js --fix --ignore-path .gitignore .", "predev": "npm run eslint", - "watch": "nodemon --inspect-brk=5858 --exec babel-node src/index.js --watch src", "dev": "babel-node src/index.js --inspect-brk=5858", "build": "babel src --out-dir dist", "start": "NODE_ENV=production node dist/index.js", - "csv-votaciones": "for file in ./$npm_package_config_folder/*.json; do json2csv -i $file >> ./$npm_package_config_folder/votaciones.csv; done", + "csv-votaciones": "for file in ./$npm_package_config_folder/*.json; do json2csv -i $file --fields-config=$npm_package_config_fields_config >> ./$npm_package_config_folder/votaciones.csv; done", "csv-votos": "for file in ./$npm_package_config_folder/votos/$npm_package_config_year/*.json; do json2csv -i $file >> ./$npm_package_config_folder/votaciones_votos-$npm_package_config_year.csv; done", - "diputados-csv-votaciones": "npm run csv-votaciones --app:folder=data/diputados", + "csv-expedientes": "for file in ./$npm_package_config_folder/expedientes/*.json; do json2csv -i $file >> ./$npm_package_config_folder/votaciones_expedientes.csv; done", + "diputados-csv-votaciones": "npm run csv-votaciones --app:folder=data/diputados --app:fields_config=src/providers/ar-diputados/votingsCsv.json", + "diputados-csv-expedientes": "npm run csv-expedientes --app:folder=data/diputados", + "diputados-csv-votos": "npm run csv-votos --app:folder=data/diputados", "senadores-csv-votaciones": "npm run csv-votaciones --app:folder=data/senadores", "senadores-csv-votos": "npm run csv-votos --app:folder=data/senadores" }, diff --git a/src/index.js b/src/index.js index 910df29..9eb9b37 100644 --- a/src/index.js +++ b/src/index.js @@ -8,25 +8,33 @@ const defaultYear = now.getFullYear(); yargs .command({ - command: "votaciones ", + command: "votaciones [anioMax]", desc: "Descarga el listado de votaciones de del indicado", builder: yargs => yargs.default("anio", defaultYear), - handler: argv => parseVotingsFromYear(argv.provider, argv.anio) + handler: argv => + parseVotingsFromYear(argv.provider, argv.anio, argv.anioMax) }) .command({ - command: "votos ", + command: "votos [anioMax] [soloEstasVotaciones..]", desc: "Descarga los votos cada votación de realizada durante el indicado", builder: yargs => yargs.default("anio", defaultYear), - handler: argv => parseVotingsDetailsFromYear(argv.provider, argv.anio) + handler: argv => + parseVotingsDetailsFromYear( + argv.provider, + argv.anio, + argv.anioMax, + argv.soloEstasVotaciones + ) }) .command({ - command: "importar [soloEstasVotaciones..]", + command: "importar [anioMax] [soloEstasVotaciones..]", desc: "Importa todo lo descargado de para el en el API", builder: yargs => yargs.default("anio", defaultYear), handler: argv => getProvider(argv.provider).api.sendYear( argv.anio, + argv.anioMax, argv.soloEstasVotaciones ) }) @@ -53,66 +61,82 @@ function getProvider(providerType) { return provider.default; } -async function parseVotingsFromYear(providerType, year) { +async function parseVotingsFromYear(providerType, yearMin, yearMax = null) { + if (!yearMax) { + yearMax = yearMin; + } const provider = getProvider(providerType); const scrapper = new provider.scrapper(); - try { - logger.info("INICIO DEL ANALISIS DEL AÑO", year); - await scrapper.start(); + for (let year = yearMin; year <= yearMax; year++) { try { - const votings = await scrapper.parseVotingsFromYear(year); - if (votings.length) { - const path = await persistData(providerType, `${year}.json`, votings); - logger.info(`Votaciones guardadas. Archivo: ${path}`); + logger.info("INICIO DEL ANALISIS DEL AÑO", year); + await scrapper.start(); + try { + await scrapper.parseVotingsFromYear(year); + } catch (error) { + logger.error(`parseVotingsFromYear: ${error.message}`); } - } catch (error) { - logger.error(`parseVotingsFromYear: ${error.message}`); + } catch (err) { + logger.error(`Ocurrió un error general durante el proceso ${err}`); + } finally { + await scrapper.finish(); + logger.info(`FIN DEL ANALISIS DEL AÑO ${year}`); } - } catch (err) { - logger.error(`Ocurrió un error general durante el proceso ${err}`); - } finally { - await scrapper.finish(); - logger.info(`FIN DEL ANALISIS DEL AÑO ${year}`); - process.exit(); } + process.exit(); } -async function parseVotingsDetailsFromYear(providerType, year) { +async function parseVotingsDetailsFromYear( + providerType, + yearMin, + yearMax = null, + onlyTheseVotings = [] +) { + if (!yearMax) { + yearMax = yearMin; + } const provider = getProvider(providerType); const scrapper = new provider.scrapper(); - try { - logger.info(`INICIO ANALISIS DE VOTACIONES DEL AÑO ${year}`); + for (let year = yearMin; year <= yearMax; year++) { try { - await scrapper.start(); - const database = getDataFromFile(`${providerType}/${year}.json`); - const page = await scrapper.createPage(); - const editedVotings = []; - for (let voting of database) { - const editedVoting = await scrapper.parseVotingsDetails( - page, - voting, - `${providerType}/votos/${year}` - ); + logger.info(`INICIO ANALISIS DE VOTACIONES DEL AÑO ${year}`); + try { + await scrapper.start(); + const database = getDataFromFile(`${providerType}/${year}.json`); + const page = await scrapper.createPage(); + const editedVotings = []; + for (let voting of database) { + let editedVoting = voting; + if ( + !onlyTheseVotings.length || + onlyTheseVotings.indexOf(voting.id) > -1 + ) { + editedVoting = await scrapper.parseVotingsDetails( + page, + voting, + `${providerType}/votos/${year}` + ); + } + editedVotings.push(editedVoting); + } - editedVotings.push(editedVoting); + const path = await persistData( + providerType, + `${year}.json`, + editedVotings + ); + logger.info(`Votaciones actualizadas. Archivo: ${path}`); + } catch (err) { + logger.error(err.stack); } - - const path = await persistData( - providerType, - `${year}.json`, - editedVotings - ); - logger.info(`Votaciones actualizadas. Archivo: ${path}`); } catch (err) { logger.error(err.stack); + } finally { + await scrapper.finish(); + logger.info(`FIN ANALISIS DE VOTACIONES DEL AÑO: ${year}`); } - } catch (err) { - logger.error(err.stack); - } finally { - await scrapper.finish(); - logger.info(`FIN ANALISIS DE VOTACIONES DEL AÑO: ${year}`); - process.exit(); } + process.exit(); } // async function fillVotingsDetailsFromYear(providerType, year) { diff --git a/src/providers/ar-diputados/scrapper.js b/src/providers/ar-diputados/scrapper.js index a11cd3c..8828005 100644 --- a/src/providers/ar-diputados/scrapper.js +++ b/src/providers/ar-diputados/scrapper.js @@ -1,25 +1,28 @@ import puppeteer from "puppeteer"; -import logger from "services/logger"; +import logger, { pageConsoleLogger } from "services/logger"; import { DOWNLOAD_PATH, getFilesFromFolder, dirExistsSync, - createDirRecursively + createDirRecursively, + persistData } from "services/fs"; const __DEV__ = process.env.NODE_ENV !== "production"; const VOTINGS_URI = "https://votaciones.hcdn.gob.ar"; -const PAGE_LOG = false; - let puppeteerConfig = {}; if (__DEV__) { puppeteerConfig = { headless: !__DEV__, - devtools: __DEV__, + devtools: !__DEV__, slowMo: 100 // slow down by 250ms, }; } +const pageViewport = { + width: 1200, + height: 900 +}; export default class Scrapper { browser; @@ -49,15 +52,10 @@ export default class Scrapper { try { logger.info(`Abriendo nueva pestaña`); const page = await this.browser.newPage(); - if (__DEV__ && PAGE_LOG) { - page.on("console", msg => { - const text = msg.text(); - if (text.indexOf("Failed to load resource") > -1) { - return; - } - logger.log(`PAGE LOG: ${text}`); - }); + if (__DEV__) { + page.setViewport(pageViewport); } + page.on("console", pageConsoleLogger); return page; } catch (error) { throw `Ocurrió un error al crear una página. Error: ${error}`; @@ -91,54 +89,121 @@ export default class Scrapper { row.removeAttribute("style"); // Date. Format: new Date(numero * 1000) - const url = row + const detailsUrl = row .querySelector("td > center > button:nth-child(2)") .getAttribute("urldetalle"); - const id = url.replace("/votacion/", ""); - const date = row.getAttribute("data-date"); + const id = parseInt(detailsUrl.replace("/votacion/", "")); + const date = new Date(parseInt(row.getAttribute("data-date")) * 1000) + .toISOString() + .slice(0, 19) + .replace("T", " "); const title = row .querySelector("td:nth-child(2)") .textContent.replace("(Ver expedientes)", "") + .replace(/\n/g, " ") + .replace(/\t/g, " ") .trim(); const type = row.querySelector("td:nth-child(3)").textContent.trim(); const result = row.querySelector("td:nth-child(4)").textContent.trim(); + // PDF + // onclick="updatePdf('https://votaciones.hcdn.gob.ar/proxy/pdf/1993/111PO06_31_R31.pdf','111','6','31')" + const recordUrl = row + .querySelector("td > center > button:nth-child(1)") + .getAttribute("onclick") + .replace(/.*'(https:\/\/votaciones.*?)'.*/g, "$1"); + + // Video + // onclick="openVideo('1IIlS4l-xOg', '', '')" + const videoUrlAttribute = row + .querySelector("td > center > button:nth-child(3)") + .getAttribute("onclick"); + let videoUrl = null; + if (videoUrlAttribute != null) { + const videoUrlId = videoUrlAttribute.replace( + /openVideo\('(.*?)'.*\)/g, + "$1" + ); + + videoUrl = `https://www.youtube.com/watch?v=${videoUrlId}`; + } + const voting = { id, date, title, type, result, - url + recordUrl, + detailsUrl, + videoUrl }; return voting; }); }); + + try { + const path = await persistData("diputados", `${year}.json`, votings); + logger.info(`Votaciones guardadas. Archivo: ${path}`); + } catch (error) { + logger.info( + `No se pudo guardar el archivo de votaciones. Error: ${error.stack}` + ); + } logger.info( `Análisis de votaciones finalizada. Cantidad: ${votings.length}` ); - logger.info(`Analizando registros...`); + logger.info(`Analizando expedientes...`); + + try { + await this.clickAllFilesLink(page, rowsSelector); + } catch (error) { + logger.error( + `No se pudieron abrir los expedientes del año requerido. Error: ${ + error.stack + }` + ); + } + const recordsFromYear = []; for (const index in votings) { let voting = votings[index]; const nth = parseInt(index) + 1; - const linkSelector = `${rowsSelector}:nth-child(${nth}) > td:nth-child(2) a[id]`; const recordsSelector = `${rowsSelector}:nth-child(${nth}) > td:nth-child(2) div[tituloexpediente]`; + // Si el elemento no existe es porque el click masivo no se realizó ok, por lo que + // es necesario realizarlo nuevamente de forma particular (y esperar el feedback) try { - const link = await page.$(linkSelector); - const linkTextProp = await link.getProperty("textContent"); - const linkText = await linkTextProp.jsonValue(); - if (linkText.indexOf("Ver") > -1) { - await link.click(); - await page.waitForSelector(recordsSelector); + const recordsElement = await page.$(recordsSelector); + if (!recordsElement) { + logger.info(`No se cargaron los expedientes para #${voting.id}`); + const linkSelector = `${rowsSelector}:nth-child(${nth}) > td:nth-child(2) a[id]`; + const linkElement = await page.$(linkSelector); + if (linkElement) { + logger.info(`No hay nada que clickear`); + } else { + // Reintento con todos los expedientes de nuevo + await this.clickAllFilesLink(page, rowsSelector); + } } + } catch (error) { + logger.error( + `No se pudo abrir el expediente de la fila ${nth} para la votación ${ + voting.id + }` + ); + } + try { const records = await page.$$eval(recordsSelector, records => records.map(record => { - const id = record.getAttribute("identificador"); - const title = record.getAttribute("tituloexpediente"); + const id = record.getAttribute("identificador").trim(); + const title = record + .getAttribute("tituloexpediente") + .replace(/\n/g, " ") + .replace(/\t/g, " ") + .trim(); return { id, title @@ -146,28 +211,69 @@ export default class Scrapper { }) ); voting.records = records; - logger.warn( - `Registros de la votación #${voting.id}. Cantidad:`, - records.length - ); + if (records.length) { + logger.info( + `Expedientes encontrados para la votación #${voting.id}: ${ + records.length + }` + ); + + records.map(record => { + record.votingId = voting.id; + recordsFromYear.push(record); + }); + } else { + logger.info(`La votación ${voting.id} no tiene expedientes`); + } } catch (error) { - logger.warn(`Votación #${voting.id} no tiene registros`); - voting.records = []; + logger.error( + `No se pudieron leer los expedientes de la votación #${ + voting.id + }. Error: ${error.stack}` + ); } } - logger.info(`Análisis de registros finalizado`); + + try { + const path = await persistData( + "diputados/expedientes", + `${year}-records.json`, + recordsFromYear + ); + logger.info(`Expedientes guardados. Archivo: ${path}`); + } catch (error) { + logger.info( + `No se pudo guardar el archivo de expedientes. Error: ${error.stack}` + ); + } + logger.info(`Análisis de expedientes finalizado`); await page.close(); return votings; }; + clickAllFilesLink = async (page, rowsSelector) => { + const linkSelector = `${rowsSelector} > td:nth-child(2) a[id]`; + await page.$$eval(linkSelector, async links => { + /* eslint-disable no-console */ + const sleep = ms => new Promise(resolve => setTimeout(resolve, ms)); + for (const link of links) { + if (link.textContent.indexOf("Ver") > -1) { + await sleep(500); // minimo 500ms + link.click(); + } + } + /* eslint-enable no-console */ + }); + }; + /** * Analiza y descarga los votos de les legisladores * para la votación dada */ - parseVotingsDetails = async (page, voting, downloadRelativePath) => { + parseVotingsDetails = async (page, voting, relativePath) => { try { - const pageUrl = `${VOTINGS_URI}${voting.url}`; + const pageUrl = `${VOTINGS_URI}${voting.detailsUrl}`; logger.info(`\nINICIO VOTACION #${voting.id}`); logger.info(pageUrl); @@ -196,51 +302,40 @@ export default class Scrapper { voting.president = await presidentProp.jsonValue(); logger.info(`Presidente\t\t ${voting.president}`); - try { - const documentUrl = await page.$(`.white-box div:nth-child(3) h5 a`); - const documentUrlProp = await documentUrl.getProperty("href"); - voting.documentUrl = await documentUrlProp.jsonValue(); - logger.info(`URL del documento\t ${voting.documentUrl}`); - } catch (err) { - logger.info("No se pudo obtener la URL del documento"); - } - - const affirmativeCount = await page.$( - `.white-box div:nth-child(3) > div.row > div:nth-child(1) > ul > h3` - ); - const affirmativeCountProp = await affirmativeCount.getProperty( - "textContent" - ); - voting.affirmativeCount = await affirmativeCountProp.jsonValue(); + voting.affirmativeCount = await this.getCountAsync(page, 1); logger.info(`Votos afirmativos\t${voting.affirmativeCount}`); - const negativeCount = await page.$( - `.white-box div:nth-child(3) > div.row > div:nth-child(2) > ul > h3` - ); - const negativeCountProp = await negativeCount.getProperty("textContent"); - voting.negativeCount = await negativeCountProp.jsonValue(); + voting.negativeCount = await this.getCountAsync(page, 2); logger.info(`Votos negativos\t\t${voting.negativeCount}`); - const abstentionCount = await page.$( - `.white-box div:nth-child(3) > div.row > div:nth-child(3) > ul > h3` - ); - const abstentionCountProp = await abstentionCount.getProperty( - "textContent" - ); - voting.abstentionCount = await abstentionCountProp.jsonValue(); + voting.abstentionCount = await this.getCountAsync(page, 3); logger.info(`Abstenciones\t\t${voting.abstentionCount}`); - const absentCount = await page.$( - `.white-box div:nth-child(3) > div.row > div:nth-child(4) > ul > h3` - ); - const absentCountProp = await absentCount.getProperty("textContent"); - voting.absentCount = await absentCountProp.jsonValue(); + voting.absentCount = await this.getCountAsync(page, 4); logger.info(`Ausentes\t\t${voting.absentCount}`); - await this.downloadVotesCsvFromPage( - page, - `${downloadRelativePath}/${voting.id}` - ); + const rowsSelector = "#myTable > tbody > tr"; + + try { + await page.evaluate(this.showAllRows); + const votes = await page.$$eval( + rowsSelector, + this.parsePageVotingVotesRows + ); + + for (const i in votes) { + votes[i] = { + date: voting.date, + votingId: voting.id, + ...votes[i] + }; + } + await persistData(relativePath, `${voting.id}.json`, votes); + } catch (error) { + logger.error( + `No se pudieron tomar ni guardar los votos. Error: ${error.stack}` + ); + } } catch (err) { logger.info(err); } finally { @@ -250,6 +345,88 @@ export default class Scrapper { return voting; }; + /** + * Obtiene el conteo de afirmativos/negativos/abstenciones/ausentes + */ + getCountAsync = async (page, nthChild) => { + const countSelector = `.white-box div:nth-child(3) > div.row > div:nth-child(${nthChild}) > ul > h3`; + const countElement = await page.$(countSelector); + const countProp = await countElement.getProperty("textContent"); + return parseInt(await countProp.jsonValue()); + }; + + showAllRows = () => { + // Ejecuto jQuery DataTables para que muestre todos los registros de una + /* eslint-disable no-console */ + // eslint-disable-next-line + jQuery("#myTable") + .DataTable() + .page.len(300) + .draw(); + }; + + /** + * Lee la tabla de votos y devuelve el arreglo + */ + parsePageVotingVotesRows = rows => + rows.map(row => { + /* eslint-disable no-console */ + try { + // Columnas: + // 1. Foto del diputado + const photoUrl = row + .querySelector("td:nth-child(1) > div > a") + .getAttribute("href"); + + // const legislatorId = parseInt(profileUrl.replace(/.*\/([0-9]+)/, "$1")); + console.log(photoUrl); + + // 2. Diputado + const legislator = row + .querySelector("td:nth-child(2)") + .textContent.trim(); + console.log(legislator); + + // 3. Bloque + const party = row.querySelector("td:nth-child(3)").textContent.trim(); + console.log(party); + + // 4. Provincia + const region = row.querySelector("td:nth-child(4)").textContent.trim(); + console.log(region); + + // 5. Cómo votó + const vote = row.querySelector("td:nth-child(5)").textContent.trim(); + console.log(vote); + + // 6. Qué dijo + const videoUrlElement = row.querySelector( + "td:nth-child(6) button[onclick]" + ); + let videoUrl = null; + if (videoUrlElement) { + const videoUrlId = videoUrlElement + .getAttribute("onclick") + .replace(/openVideo\('(.*?)'.*\)/g, "$1"); + videoUrl = `https://www.youtube.com/watch?v=${videoUrlId}`; + } + console.log(videoUrl); + + const data = { + legislator, + party, + region, + vote, + photoUrl, + videoUrl + }; + return data; + } catch (error) { + console.error(error); + } + /* eslint-enable no-console */ + }); + /** * Descarga el CSV con los votos */ diff --git a/src/providers/ar-diputados/votingsCsv.json b/src/providers/ar-diputados/votingsCsv.json new file mode 100644 index 0000000..0c4e4c8 --- /dev/null +++ b/src/providers/ar-diputados/votingsCsv.json @@ -0,0 +1,18 @@ +[ + "id", + "date", + "period", + "meeting", + "record", + "title", + "type", + "president", + "affirmativeCount", + "negativeCount", + "abstentionCount", + "absentCount", + "result", + "recordUrl", + "detailsUrl", + "videoUrl" +]