-
Notifications
You must be signed in to change notification settings - Fork 0
/
recorridos-get-invalid-rows.js
122 lines (101 loc) · 3.87 KB
/
recorridos-get-invalid-rows.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
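/**
* Script that reads the yearly `recorridos_realizados_<year>.csv` files and
* generates a summary CSV with one row per (year, column) pair:
* year | column_name | null_count | empty_string_count | row_count | null_rate
*/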
const fs = require('fs');
const path = require('path');
const PapaParse = require('papaparse');
const years = [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024];
// In this array, each element will be an object with the following structure
// {
// year: 2010,
// column: 'column_name',
// null_count: 0,
// empty_string_count: 0,
// }
const csvJson = []
/**
 * Streams every yearly `recorridos_realizados_<year>.csv` file through PapaParse
 * and accumulates per-column statistics into the module-level `csvJson` array.
 *
 * Files are processed one at a time, in the order given by `years`. For each
 * file, one stats object per column is appended to `csvJson` when the first row
 * arrives; those same objects are then updated in place for every parsed row.
 * A file that cannot be read or parsed is logged and skipped, and the remaining
 * years are still processed.
 *
 * @returns {Promise<void>} Resolves once every year has been attempted.
 */
async function processFiles() {
  for (const year of years) {
    const relativePath = `data/original/recorridos_realizados_${year}.csv`;
    const filePath = path.resolve(__dirname, relativePath);

    try {
      // Used to report how long the file took to process.
      const startWatch = process.hrtime();
      console.log(`Processing ${relativePath}`);

      // PapaParse is used to read the file so that each row arrives as an
      // object keyed by column name, which is a more reliable way to get the
      // value per column than splitting lines by hand.
      let columnNames = [];
      // Stats objects for this file, aligned by index with `columnNames`.
      // Stays null until the first row reveals the header.
      let columnStats = null;

      await new Promise((resolve, reject) => {
        const config = {
          header: true,
          dynamicTyping: true,
          // `worker: true` is intentionally not set: Papa Parse documents the
          // option as unavailable when parsing a Node.js readable stream.
          step: (results) => {
            if (columnStats === null) {
              // First row of the file: register one stats entry per column.
              columnNames = results.meta.fields;
              columnStats = columnNames.map((column) => ({
                year: year,
                column_name: column,
                null_count: 0,
                empty_string_count: 0,
                row_count: 0,
                null_rate: 0,
              }));
              // The objects pushed here are the same ones updated below, so
              // no lookup into the (growing) csvJson array is needed per cell.
              csvJson.push(...columnStats);
            }

            for (let idx = 0; idx < columnNames.length; idx++) {
              const stats = columnStats[idx];
              const value = results.data[columnNames[idx]];

              if (value === null) {
                stats.null_count++;
              }
              // NOTE(review): with dynamicTyping enabled PapaParse appears to
              // turn empty fields into null, so this branch may never fire —
              // confirm against the PapaParse version in use.
              if (value === '') {
                stats.empty_string_count++;
              }
              stats.row_count++;
            }
          },
          complete: () => {
            const endWatch = process.hrtime(startWatch);
            console.log(`-- Completed ${relativePath} (took ${endWatch[0]}s)`);
            resolve();
          },
          error: (error) => {
            console.warn(`Error parsing ${relativePath}: ${error.message}`);
            reject(error);
          },
        };

        // Stream the file instead of loading it fully into memory.
        const read = fs.createReadStream(filePath);
        PapaParse.parse(read, config);
      });
    } catch (err) {
      // Best effort: report the failure and continue with the next year.
      console.warn(`Error reading ${relativePath}: ${err.message}`);
      console.error(err);
    }
  }
}
/**
 * Entry point: runs `processFiles()` to fill `csvJson`, derives the null
 * percentage of each entry and writes the result as a CSV file.
 *
 * The output is written to `data/recorridos_get_invalid_rows.csv` resolved
 * against this script's directory, so it does not depend on the current
 * working directory (the input files are resolved the same way).
 *
 * @returns {Promise<number>} Resolves to 0 after the CSV has been written.
 */
async function main() {
  await processFiles();

  // null_rate is the percentage of rows whose value was null for that column.
  // Guard against a zero row count so the report never contains NaN.
  for (const row of csvJson) {
    row.null_rate = row.row_count > 0 ? (row.null_count / row.row_count) * 100 : 0;
  }

  console.log('All files processed. Generating CSV file...');

  const csv = PapaParse.unparse(csvJson, {});
  const outputPath = path.resolve(__dirname, 'data/recorridos_get_invalid_rows.csv');
  await fs.promises.writeFile(outputPath, csv);

  console.log('Done. CSV file saved as recorridos_get_invalid_rows.csv');
  return 0;
}
// Run the script. On failure, log the error and mark the process as failed so
// shells and CI jobs can tell that the report was not generated.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});