// scraper.js (490 lines, 60.5 KB)
//
// Note: the repository this file comes from was archived by its owner on May 13, 2021 and is
// now read-only.
// Parses the development applications at the South Australian Copper Coast Council web site and
// places them in a database.
//
// Michael Bone
// 18th October 2018
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const cheerio = require("cheerio");
const request = require("request-promise-native");
const sqlite3 = require("sqlite3");
const urlparser = require("url");
const moment = require("moment");
const pdfjs = require("pdfjs-dist");
const didyoumean = require("didyoumean2");
const fs = require("fs");
sqlite3.verbose();
// The council web page that main() downloads and searches for links to PDF documents.
const DevelopmentApplicationsUrl = "https://www.coppercoast.sa.gov.au/services/planning-and-development/development-register";
// The comment URL that is recorded against every parsed development application.
const CommentUrl = "mailto:info@coppercoast.sa.gov.au";
// All valid street and suburb names. Both are null until main() populates them from the files
// "suburbnames.txt" and "streetnames.txt": SuburbNames maps the first comma separated field of
// each line to the second field, and StreetNames maps each street name to an array of the
// suburb names that were listed against it.
let SuburbNames = null;
let StreetNames = null;
// Sets up an sqlite database: opens "data.sqlite" and ensures that the [data] table exists.
//
// Returns a promise that resolves to the open database once the "create table" statement has
// completed. The promise is rejected with the sqlite3 error if that statement fails (so that a
// failure to create the table is reported to the caller rather than being silently dropped).
async function initializeDatabase() {
    return new Promise((resolve, reject) => {
        const database = new sqlite3.Database("data.sqlite");
        database.serialize(() => {
            database.run("create table if not exists [data] ([council_reference] text primary key, [address] text, [description] text, [info_url] text, [comment_url] text, [date_scraped] text, [date_received] text)", error => {
                // Only hand the database to the caller once the table is known to exist.
                if (error)
                    reject(error);
                else
                    resolve(database);
            });
        });
    });
}
// Inserts a row in the database, replacing any existing row that has the same council reference
// (the statement is an "insert or replace").
//
// Parameters:
//   database               - an object with a prepare(sql) method that returns a statement
//                            supporting run(parameters, callback) and finalize().
//   developmentApplication - the application to save (applicationNumber, address, description,
//                            informationUrl, commentUrl, scrapeDate and receivedDate).
//
// Returns a promise that resolves once the row has been written, or that is rejected with the
// error reported by the statement.
async function insertRow(database, developmentApplication) {
    return new Promise((resolve, reject) => {
        const sqlStatement = database.prepare("insert or replace into [data] values (?, ?, ?, ?, ?, ?, ?)");
        sqlStatement.run([
            developmentApplication.applicationNumber,
            developmentApplication.address,
            developmentApplication.description,
            developmentApplication.informationUrl,
            developmentApplication.commentUrl,
            developmentApplication.scrapeDate,
            developmentApplication.receivedDate
        ], function (error, row) {
            // Finalize on the failure path as well as the success path, otherwise a failed
            // insert leaves the prepared statement (and any locks it holds) open.
            sqlStatement.finalize();
            if (error) {
                console.error(error);
                reject(error);
                return;
            }
            console.log(` Saved application \"${developmentApplication.applicationNumber}\" with address \"${developmentApplication.address}\", description \"${developmentApplication.description}\" and received date \"${developmentApplication.receivedDate}\" to the database.`);
            resolve(row);
        });
    });
}
// Finds the top of the row that startElement belongs to, that is, the smallest Y co-ordinate of
// any element considered to be in the same row (or the Y co-ordinate of startElement itself if
// no element sits higher). An element only counts as being in the row when more than 50% of its
// height overlaps startElement; this keeps extremely tall elements, which would otherwise
// overlap every row, from dragging the result to the same value for every startElement.
function getRowTop(elements, startElement) {
    const rowBottom = startElement.y + startElement.height;
    return elements
        .filter(candidate => candidate.y < rowBottom && candidate.y + candidate.height > startElement.y) // some overlap
        .filter(candidate => getVerticalOverlapPercentage(startElement, candidate) > 50) // mostly within the row
        .reduce((top, candidate) => (candidate.y < top) ? candidate.y : top, startElement.y);
}
// Computes the rectangle shared by the two specified rectangles (each having x, y, width and
// height). When the rectangles do not meet, a rectangle with every field set to zero is
// returned. Rectangles that merely touch produce a rectangle of zero width or zero height at
// the shared edge.
function intersect(rectangle1, rectangle2) {
    const left = Math.max(rectangle1.x, rectangle2.x);
    const top = Math.max(rectangle1.y, rectangle2.y);
    const right = Math.min(rectangle1.x + rectangle1.width, rectangle2.x + rectangle2.width);
    const bottom = Math.min(rectangle1.y + rectangle1.height, rectangle2.y + rectangle2.height);
    const isIntersecting = right >= left && bottom >= top;
    if (!isIntersecting)
        return { x: 0, y: 0, width: 0, height: 0 };
    return { x: left, y: top, width: right - left, height: bottom - top };
}
// Measures how far element2 is from element1 as the squared Euclidean distance between the
// middle of the right hand edge of element1 and the middle of the left hand edge of element2.
// Number.MAX_VALUE is returned when element2 starts more than 20% of the width of element1 to
// the left of the right hand edge of element1 (ie. when the elements overlap too much in the
// horizontal direction; the 20% factor is arbitrary).
function calculateDistance(element1, element2) {
    const fromX = element1.x + element1.width;
    const fromY = element1.y + element1.height / 2;
    const toX = element2.x;
    const toY = element2.y + element2.height / 2;
    const overlapsTooMuch = toX < fromX - element1.width / 5;
    if (overlapsTooMuch)
        return Number.MAX_VALUE;
    const deltaX = toX - fromX;
    const deltaY = toY - fromY;
    return deltaX * deltaX + deltaY * deltaY;
}
// Tests whether the vertical extents of the two elements overlap. Elements that only touch
// (the bottom of one exactly on the top of the other) are not considered to overlap.
function isVerticalOverlap(element1, element2) {
    const startsBeforeBottom = element2.y < element1.y + element1.height;
    const endsAfterTop = element2.y + element2.height > element1.y;
    return startsBeforeBottom && endsAfterTop;
}
// Gets the percentage of vertical overlap between two elements (0 means no overlap and 100 means
// 100% overlap; and, for example, 20 means that 20% of the second element overlaps somewhere
// with the first element).
//
// The percentage is relative to the height of the second element. A second element that has no
// height cannot overlap anything, so 0 is returned for it (rather than the NaN that dividing
// zero by zero would otherwise produce).
function getVerticalOverlapPercentage(element1, element2) {
    const overlapTop = Math.max(element1.y, element2.y);
    const overlapBottom = Math.min(element1.y + element1.height, element2.y + element2.height);
    if (overlapBottom < overlapTop || element2.height === 0)
        return 0;
    return ((overlapBottom - overlapTop) * 100) / element2.height;
}
// Finds the element that sits immediately to the right of the specified element, or returns
// undefined if there is no such element. A candidate is only considered when it:
//
//   - vertically overlaps the element, with more than 50% of its own height inside the element
//     (which rules out extremely tall elements);
//   - starts to the right of the right hand edge of the element; and
//   - starts less than 30 pixels (an arbitrary limit) beyond that edge.
//
// Of the candidates that qualify, the closest one (as measured by calculateDistance) wins.
function getRightElement(elements, element) {
    const rightEdge = element.x + element.width;
    // A placeholder that is further away than any real element can be.
    let nearest = { text: undefined, confidence: 0, x: Number.MAX_VALUE, y: Number.MAX_VALUE, width: 0, height: 0 };
    for (const candidate of elements) {
        if (!isVerticalOverlap(element, candidate))
            continue;
        if (!(getVerticalOverlapPercentage(element, candidate) > 50))
            continue;
        if (!(candidate.x > rightEdge))
            continue;
        if (!(candidate.x - rightEdge < 30))
            continue;
        if (calculateDistance(element, candidate) < calculateDistance(element, nearest))
            nearest = candidate;
    }
    if (nearest.text === undefined)
        return undefined;
    return nearest;
}
// Gets the text that appears to the right of a label. The text is gathered from a rectangle
// that is delineated by the positions at which three strings of (case sensitive) text are found:
//
//   topLeftText - the label; the rectangle starts at its right hand edge and at its top.
//   rightText   - text that marks the right hand side of the rectangle (unbounded if undefined
//                 or not found).
//   bottomText  - text that marks the bottom of the rectangle (unbounded if undefined or not
//                 found).
//
// Returns the text of all elements that lie more than 50% (by area) within the rectangle,
// ordered by Y co-ordinate and then X co-ordinate, joined by single spaces. Elements consisting
// solely of ":" are skipped. Returns undefined if the label is not found or if no elements lie
// within the rectangle.
function getRightText(elements, topLeftText, rightText, bottomText) {
    const hasText = text => element => element.text.trim() == text;

    // Locate the label; without it there is nothing to anchor the rectangle to.
    const labelElement = elements.find(hasText(topLeftText));
    if (labelElement === undefined)
        return undefined;
    const rightBoundary = (rightText === undefined) ? undefined : elements.find(hasText(rightText));
    const bottomBoundary = (bottomText === undefined) ? undefined : elements.find(hasText(bottomText));

    // Construct the bounding rectangle in which the expected text should appear.
    const left = labelElement.x + labelElement.width;
    const top = labelElement.y;
    const bounds = {
        x: left,
        y: top,
        width: (rightBoundary === undefined) ? Number.MAX_VALUE : (rightBoundary.x - left),
        height: (bottomBoundary === undefined) ? Number.MAX_VALUE : (bottomBoundary.y - top)
    };

    // Keep the elements that have more than half of their area inside the rectangle.
    const enclosedElements = elements.filter(element => {
        const overlap = intersect(element, bounds);
        const overlapArea = overlap.width * overlap.height;
        const elementArea = element.width * element.height;
        return elementArea > 0 && overlapArea * 2 > elementArea && element.text !== ":";
    });
    if (enclosedElements.length === 0)
        return undefined;

    // Read the elements top to bottom and then left to right.
    enclosedElements.sort((a, b) => {
        if (a.y > b.y)
            return 1;
        if (a.y < b.y)
            return -1;
        if (a.x > b.x)
            return 1;
        if (a.x < b.x)
            return -1;
        return 0;
    });
    const joinedText = enclosedElements.map(element => element.text).join(" ");
    return joinedText.trim().replace(/\s\s+/g, " ");
}
// Gets the text that appears underneath a heading. The text is gathered from a rectangle that
// is delineated by the positions at which three strings of (case sensitive) text are found:
//
//   topText    - the heading; the rectangle starts at its left hand edge and at its bottom.
//   rightText  - text that marks the right hand side of the rectangle (unbounded if undefined
//                or not found).
//   bottomText - text that marks the bottom of the rectangle (unbounded if undefined or not
//                found).
//
// Returns the text of all elements that lie more than 50% (by area) within the rectangle,
// ordered by Y co-ordinate and then X co-ordinate, joined by single spaces. Elements consisting
// solely of ":" are skipped. Returns undefined if the heading is not found or if no elements
// lie within the rectangle.
function getDownText(elements, topText, rightText, bottomText) {
    const hasText = text => element => element.text.trim() == text;

    // Locate the heading; without it there is nothing to anchor the rectangle to.
    const headingElement = elements.find(hasText(topText));
    if (headingElement === undefined)
        return undefined;
    const rightBoundary = (rightText === undefined) ? undefined : elements.find(hasText(rightText));
    const bottomBoundary = (bottomText === undefined) ? undefined : elements.find(hasText(bottomText));

    // Construct the bounding rectangle in which the expected text should appear.
    const left = headingElement.x;
    const top = headingElement.y + headingElement.height;
    const bounds = {
        x: left,
        y: top,
        width: (rightBoundary === undefined) ? Number.MAX_VALUE : (rightBoundary.x - left),
        height: (bottomBoundary === undefined) ? Number.MAX_VALUE : (bottomBoundary.y - top)
    };

    // Keep the elements that have more than half of their area inside the rectangle.
    const enclosedElements = elements.filter(element => {
        const overlap = intersect(element, bounds);
        const overlapArea = overlap.width * overlap.height;
        const elementArea = element.width * element.height;
        return elementArea > 0 && overlapArea * 2 > elementArea && element.text !== ":";
    });
    if (enclosedElements.length === 0)
        return undefined;

    // Read the elements top to bottom and then left to right.
    enclosedElements.sort((a, b) => {
        if (a.y > b.y)
            return 1;
        if (a.y < b.y)
            return -1;
        if (a.x > b.x)
            return 1;
        if (a.x < b.x)
            return -1;
        return 0;
    });
    const joinedText = enclosedElements.map(element => element.text).join(" ");
    return joinedText.trim().replace(/\s\s+/g, " ");
}
// Parses the details from the elements associated with a single development application.
//
// Parameters:
//   elements       - the text elements (each with text, x, y, width and height) that make up
//                    the section of the PDF page for one development application.
//   startElement   - the element at which the section starts (not used by the parsing; retained
//                    so that existing callers continue to work).
//   informationUrl - recorded as the information URL of the development application.
//
// Returns an object with the properties applicationNumber, address, description,
// informationUrl, commentUrl, scrapeDate and receivedDate (receivedDate is "" when no valid
// date was found), or undefined if the application number, street name or suburb name could
// not be found (in which case the reason is logged).
function parseApplicationElements(elements, startElement, informationUrl) {
    // Summarises every element as "[text][text]..." for use in diagnostic messages.
    const summarizeElements = () => elements.map(element => `[${element.text}]`).join("");

    // Get the application number. Note that getRightText returns undefined (not an empty
    // string) when the label or its value is missing, so both cases have to be rejected.
    const applicationNumber = getRightText(elements, "Application No", "Application Date", "Applicants Name");
    if (applicationNumber === undefined || applicationNumber === "") {
        console.log(`Could not find the application number on the PDF page for the current development application. The development application will be ignored. Elements: ${summarizeElements()}`);
        return undefined;
    }
    console.log(` Found \"${applicationNumber}\".`);

    // Get the received date. The label is looked for with two different capitalisations, each
    // of which is paired with its own label for the bottom of the bounding rectangle. If there
    // is no text beside the label then the text beside "Application Date" is used instead.
    const receivedLabels = [
        { label: "Application Received", bottomText: "Land Division Approval" },
        { label: "Application received", bottomText: "Building Application" }
    ];
    const receivedLabel = receivedLabels.find(candidate => elements.some(element => element.text.trim() === candidate.label));
    let receivedDateText = "";
    if (receivedLabel !== undefined) {
        receivedDateText = getRightText(elements, receivedLabel.label, "Planning Approval", receivedLabel.bottomText);
        if (receivedDateText === undefined)
            receivedDateText = getRightText(elements, "Application Date", "Planning Approval", receivedLabel.label);
    }
    let receivedDate = undefined;
    if (receivedDateText !== undefined)
        receivedDate = moment(receivedDateText.trim(), "D/MM/YYYY", true);

    // Get the house number, street and suburb of the address.
    let houseNumber = getRightText(elements, "Property House No", "Planning Conditions", "Lot");
    if (houseNumber === undefined || houseNumber === "0")
        houseNumber = "";
    const streetName = getRightText(elements, "Property Street", "Planning Conditions", "Property Suburb");
    if (streetName === undefined || streetName === "" || streetName === "0") {
        console.log(`Application number ${applicationNumber} will be ignored because an address was not found or parsed (there is no street name). Elements: ${summarizeElements()}`);
        return undefined;
    }
    let suburbName = getRightText(elements, "Property Suburb", "Planning Conditions", "Title");
    if (suburbName === undefined || suburbName === "" || suburbName === "0") {
        console.log(`Application number ${applicationNumber} will be ignored because an address was not found or parsed (there is no suburb name for street \"${streetName}\"). Elements: ${summarizeElements()}`);
        return undefined;
    }

    // Formats an address, substituting the full suburb name from SuburbNames where one is known.
    const formatAddress = (number, street, suburb) => `${number} ${street}, ${SuburbNames[suburb.toUpperCase()] || suburb}`;

    // Two addresses are sometimes recorded in the same field. This is done in a way which is
    // ambiguous (ie. it is not possible to reconstruct the original addresses perfectly).
    //
    // For example, the following address:
    //
    //     House Number: ü35
    //           Street: RAILWAYüSCHOOL TCE SOUTHüTERRA
    //           Suburb: PASKEVILLEüPASKEVILLE
    //
    // should be interpreted as the following two addresses:
    //
    //     RAILWAY TCE SOUTH, PASKEVILLE
    //     35 SCHOOL TERRA(CE), PASKEVILLE
    //
    // whereas the following address:
    //
    //     House Number: 79ü4
    //           Street: ROSSLYNüSWIFT WINGS ROADüROAD
    //           Suburb: WALLAROOüWALLAROO
    //
    // should be interpreted as the following two addresses:
    //
    //     79 ROSSLYN ROAD, WALLAROO
    //     4 SWIFT WINGS ROAD, WALLAROO
    //
    // And so notice that in the first case above the "TCE" text of the Street belonged to the
    // first address. Whereas in the second case above the "WINGS" text of the Street belonged
    // to the second address (this was deduced by examining actual existing street names).
    let address = "";
    if (houseNumber.includes("ü")) {
        // The house number always has at most one "ü" character. So split on this character.
        const [houseNumber1, houseNumber2] = houseNumber.split("ü");

        // The street name may have one or two "ü" characters.
        let streetName1 = undefined;
        let streetName2 = undefined;
        const streetNameTokens = streetName.split("ü");
        if (streetNameTokens.length === 2) {
            // If there is one "ü" character in the street then simply split on this to determine
            // the street names. For example, "SMITH STREETüRAILWAY TERRACE" is simply split into
            // "SMITH STREET" and "RAILWAY TERRACE".
            streetName1 = streetNameTokens[0];
            streetName2 = streetNameTokens[1];
        }
        else if (streetNameTokens.length === 3) {
            // If there are two "ü" characters then the splitting is more complicated if the
            // middle token contains more than one space.
            const streetName1Prefix = streetNameTokens[0];
            const streetName2Suffix = streetNameTokens[2];
            const middleTokens = streetNameTokens[1].split(" ");
            if (middleTokens.length === 2) {
                // This is a simple case because the middle token contains only one space. For
                // example, "OLIVEüTUCKER PARADEüROAD" becomes "OLIVE PARADE" and "TUCKER ROAD".
                streetName1 = `${streetName1Prefix} ${middleTokens[1]}`;
                streetName2 = `${middleTokens[0]} ${streetName2Suffix}`;
            }
            else if (middleTokens.length === 3) {
                // This is a more complicated case because the middle token contains two spaces.
                // For example, in "ROSSLYNüSWIFT WINGS ROADüSTREET" does "WINGS" belong with the
                // first street or the second? Either "ROSSLYN WINGS ROAD"/"SWIFT STREET" would
                // result or "ROSSLYN ROAD"/"SWIFT WINGS STREET" would result. Determining which
                // is the reason for the "didyoumean" calls: the second interpretation is only
                // used when neither street of the first interpretation is a known street name.
                streetName1 = `${streetName1Prefix} ${middleTokens[1]} ${middleTokens[2]}`;
                streetName2 = `${middleTokens[0]} ${streetName2Suffix}`;
                const matchOptions = { caseSensitive: false, returnType: "first-closest-match", thresholdType: "edit-distance", threshold: 2, trimSpace: true };
                const streetNameMatch1 = didyoumean(streetName1, Object.keys(StreetNames), matchOptions);
                const streetNameMatch2 = didyoumean(streetName2, Object.keys(StreetNames), matchOptions);
                if (streetNameMatch1 === null && streetNameMatch2 === null) {
                    streetName1 = `${streetName1Prefix} ${middleTokens[2]}`;
                    streetName2 = `${middleTokens[0]} ${middleTokens[1]} ${streetName2Suffix}`;
                }
            }
        }

        // Any other layout (for example, a street with no "ü" character at all) cannot be split
        // reliably. Fall back to the street text as it was found, with any separators replaced
        // by spaces, instead of writing the text "undefined" into the address.
        const fallbackStreetName = streetName.replace(/ü/g, " ");
        if (streetName1 === undefined)
            streetName1 = fallbackStreetName;
        if (streetName2 === undefined)
            streetName2 = fallbackStreetName;

        // Remove the "hundred" prefix that appears in some addresses. If only one suburb was
        // recorded then it is used for both addresses.
        const suburbNameTokens = suburbName.split("ü");
        const suburbName1 = suburbNameTokens[0].replace(/^HD /, "");
        const suburbName2 = ((suburbNameTokens.length > 1) ? suburbNameTokens[1] : suburbNameTokens[0]).replace(/^HD /, "");

        // There are typically two addresses (for example, delineating a corner block). Prefer
        // the address that has a house number (and prefer the first address if both or neither
        // have a house number).
        const houseNumberPattern = /[0-9]+[a-zA-Z]?/;
        const isSecondPreferred = !houseNumberPattern.test(houseNumber1) && houseNumberPattern.test(houseNumber2);
        address = isSecondPreferred
            ? formatAddress(houseNumber2, streetName2, suburbName2)
            : formatAddress(houseNumber1, streetName1, suburbName1);
    }
    else {
        suburbName = suburbName.replace(/^HD /, "");
        address = formatAddress(houseNumber, streetName, suburbName);
    }
    address = address.trim().replace(/\s\s+/g, " ");

    // Get the description.
    const description = getDownText(elements, "Development Description", "Relevant Authority", "Private Certifier Name");

    // Construct the resulting application information.
    return {
        applicationNumber: applicationNumber,
        address: address,
        description: ((description !== undefined && description.trim() !== "") ? description : "NO DESCRIPTION PROVIDED"),
        informationUrl: informationUrl,
        commentUrl: CommentUrl,
        scrapeDate: moment().format("YYYY-MM-DD"),
        receivedDate: (receivedDate !== undefined && receivedDate.isValid()) ? receivedDate.format("YYYY-MM-DD") : ""
    };
}
// Finds the start element of each development application on the current PDF page (there are
// typically two development applications on a single page and each development application
// typically begins with the text "Application No"). The text may be split across several
// elements, and may be slightly misspelt, so consecutive elements are joined together and
// compared (ignoring case and white space) with "ApplicationNo".
//
// Returns the matching elements sorted by Y co-ordinate. Where the text is split across
// several elements, it is the last of the joined elements that is returned.
function findStartElements(elements) {
    const target = "ApplicationNo";
    const createOptions = threshold => ({ caseSensitive: false, returnType: "first-closest-match", thresholdType: "edit-distance", threshold: threshold, trimSpace: true });
    const getLengthDifference = match => Math.abs(match.text.trim().length - target.length);

    // Examine all the elements on the page that begin with "A" or "a".
    const startElements = [];
    const candidates = elements.filter(element => element.text.trim().toLowerCase().startsWith("a"));
    for (const candidate of candidates) {
        // Walk up to 15 elements to the right, starting at the element that has text starting
        // with the letter "a" (and so may be the start of the "Application No" text). Join
        // together the elements walked so far in an attempt to find the best match to
        // "Application No".
        const joinedElements = [];
        const matches = [];
        let current = candidate;
        while (current !== undefined && joinedElements.length < 15) {
            joinedElements.push(current);
            const text = joinedElements.map(joinedElement => joinedElement.text).join("").replace(/\s/g, "").toLowerCase();
            if (text.length >= 15) // stop once the text is too long
                break;
            if (text.length >= 10) { // ignore until the text is close to long enough
                // Record the smallest edit distance (0, 1 or 2) at which the text matches.
                const threshold = (text === "applicationno")
                    ? 0
                    : [1, 2].find(allowedDistance => didyoumean(text, [target], createOptions(allowedDistance)) !== null);
                if (threshold !== undefined)
                    matches.push({ element: current, threshold: threshold, text: text });
            }
            current = getRightElement(elements, current);
        }

        // Choose the best match (if any matches were found): the lowest threshold wins and,
        // for equal thresholds, the text closest in length to the target wins (with the later
        // match being preferred when the lengths are equally close).
        let bestMatch = undefined;
        for (const match of matches) {
            const isBetter = bestMatch === undefined ||
                match.threshold < bestMatch.threshold ||
                (match.threshold === bestMatch.threshold && getLengthDifference(match) <= getLengthDifference(bestMatch));
            if (isBetter)
                bestMatch = match;
        }
        if (bestMatch !== undefined)
            startElements.push(bestMatch.element);
    }

    // Ensure the start elements are sorted in the order that they appear on the page.
    startElements.sort((a, b) => {
        if (a.y > b.y)
            return 1;
        if (a.y < b.y)
            return -1;
        return 0;
    });
    return startElements;
}
// Parses a PDF document.
//
// Downloads the PDF at the specified URL, converts the text of each page into positioned
// elements ({ text, x, y, width, height }), splits each page into one group of elements per
// development application and parses each group using parseApplicationElements.
//
// Returns (asynchronously) an array of the development applications that were parsed; groups
// for which parseApplicationElements returns undefined are omitted.
async function parsePdf(url) {
let developmentApplications = [];
// Read the PDF.
// NOTE(review): "encoding: null" is presumably used so that the response body is returned as
// raw bytes rather than as a decoded string - confirm against the request documentation.
let buffer = await request({ url: url, encoding: null, proxy: process.env.MORPH_PROXY });
// Pause for between two and six seconds after the download.
await sleep(2000 + getRandom(0, 5) * 1000);
// Parse the PDF. Each page has the details of multiple applications.
let pdf = await pdfjs.getDocument({ data: buffer, disableFontFace: true, ignoreErrors: true });
// Pages are requested from pdfjs using one-based page numbers (hence "pageIndex + 1").
for (let pageIndex = 0; pageIndex < pdf.numPages; pageIndex++) {
console.log(`Reading and parsing applications from page ${pageIndex + 1} of ${pdf.numPages}.`);
let page = await pdf.getPage(pageIndex + 1);
// Construct a text element for each item from the parsed PDF information.
let textContent = await page.getTextContent();
let viewport = await page.getViewport(1.0);
let elements = textContent.items.map(item => {
// Combine the viewport transform with the transform of the item; the X and Y co-ordinates of
// the element are then taken from the fifth and sixth values of the combined transform.
let transform = pdfjs.Util.transform(viewport.transform, item.transform);
// Work around the issue https://github.com/mozilla/pdf.js/issues/8276 (heights are
// exaggerated). The problem seems to be that the height value is too large in some
// PDFs. Provide an alternative, more accurate height value by using a calculation
// based on the transform matrix.
let workaroundHeight = Math.sqrt(transform[2] * transform[2] + transform[3] * transform[3]);
return { text: item.str, x: transform[4], y: transform[5], width: item.width, height: workaroundHeight };
});
// Sort the elements by Y co-ordinate and then by X co-ordinate.
let elementComparer = (a, b) => (a.y > b.y) ? 1 : ((a.y < b.y) ? -1 : ((a.x > b.x) ? 1 : ((a.x < b.x) ? -1 : 0)));
elements.sort(elementComparer);
// Group the elements into sections based on where the "Application No" text starts (and
// any other element the "Application No" elements line up with horizontally with a margin
// of error equal to about half the height of the "Application No" text).
let applicationElementGroups = [];
let startElements = findStartElements(elements);
for (let index = 0; index < startElements.length; index++) {
// Determine the highest Y co-ordinate of this row and the next row (or the bottom of
// the current page). Allow some leeway vertically (add some extra height).
let startElement = startElements[index];
// A copy of the start element that has been moved up by half of its own height.
let raisedStartElement = {
text: startElement.text,
confidence: startElement.confidence,
x: startElement.x,
y: startElement.y - startElement.height / 2,
width: startElement.width,
height: startElement.height
};
let rowTop = getRowTop(elements, raisedStartElement);
// NOTE(review): the top of this row is measured from the raised start element, whereas the
// top of the next row is measured from the next start element without raising it - confirm
// that this difference is intended.
let nextRowTop = (index + 1 < startElements.length) ? getRowTop(elements, startElements[index + 1]) : Number.MAX_VALUE;
// Extract all elements between the two rows.
applicationElementGroups.push({ startElement: startElements[index], elements: elements.filter(element => element.y >= rowTop && element.y + element.height < nextRowTop) });
}
// Parse the development application from each group of elements (ie. a section of the
// current page of the PDF document). If the same application number is encountered a
// second time add a suffix to the application number so it is unique (and so will be
// inserted into the database later instead of being ignored).
for (let applicationElementGroup of applicationElementGroups) {
let developmentApplication = parseApplicationElements(applicationElementGroup.elements, applicationElementGroup.startElement, url);
if (developmentApplication !== undefined) {
let suffix = 0;
let applicationNumber = developmentApplication.applicationNumber;
// Keep increasing the suffix while an earlier application has the same number but a different
// address, description or received date. An earlier application that is identical in all of
// these respects does not cause a suffix to be added.
while (developmentApplications
.some(otherDevelopmentApplication => otherDevelopmentApplication.applicationNumber === developmentApplication.applicationNumber &&
(otherDevelopmentApplication.address !== developmentApplication.address ||
otherDevelopmentApplication.description !== developmentApplication.description ||
otherDevelopmentApplication.receivedDate !== developmentApplication.receivedDate)))
developmentApplication.applicationNumber = `${applicationNumber} (${++suffix})`; // add a unique suffix
developmentApplications.push(developmentApplication);
}
}
}
return developmentApplications;
}
// Picks a random integer from the range [minimum, maximum), ie. including the minimum but
// excluding the maximum. The minimum is first rounded up, and the maximum rounded down, to the
// nearest integer.
function getRandom(minimum, maximum) {
    const lowest = Math.ceil(minimum);
    const range = Math.floor(maximum) - lowest;
    const offset = Math.floor(Math.random() * range);
    return offset + lowest;
}
// Returns a promise that is resolved (with no value) by a timer set for the specified number
// of milliseconds; awaiting the promise therefore pauses the caller for that long.
function sleep(milliseconds) {
    return new Promise(function (wake) {
        setTimeout(wake, milliseconds);
    });
}
// Parses the development applications: loads the street and suburb names, finds the PDFs that
// are linked to from the council web page, then parses a selection of those PDFs and saves the
// development applications that they contain to the database.
async function main() {
    // Ensure that the database exists.
    const database = await initializeDatabase();

    // Read the file containing all possible suburb names (each line is a pair of comma
    // separated values; the first is used to look up the second).
    SuburbNames = {};
    const suburbLines = fs.readFileSync("suburbnames.txt").toString().replace(/\r/g, "").trim().split("\n");
    for (const suburbLine of suburbLines) {
        const suburbFields = suburbLine.split(",");
        SuburbNames[suburbFields[0]] = suburbFields[1];
    }

    // Read the file containing all possible street names (each line is a street name followed
    // by a suburb name; several suburbs may exist for the same street name).
    StreetNames = {};
    const streetLines = fs.readFileSync("streetnames.txt").toString().replace(/\r/g, "").trim().split("\n");
    for (const streetLine of streetLines) {
        const [streetName, suburbName] = streetLine.split(",");
        if (StreetNames[streetName] === undefined)
            StreetNames[streetName] = [];
        StreetNames[streetName].push(suburbName);
    }

    // Retrieve the page that contains the links to the PDFs.
    console.log(`Retrieving page: ${DevelopmentApplicationsUrl}`);
    const body = await request({ url: DevelopmentApplicationsUrl, proxy: process.env.MORPH_PROXY });
    await sleep(2000 + getRandom(0, 5) * 1000);

    // Collect the distinct PDF links (resolved relative to the page) in the order in which they
    // appear on the page.
    const $ = cheerio.load(body);
    const pdfUrls = [];
    $("p a").get().forEach(anchor => {
        const href = new urlparser.URL(anchor.attribs.href, DevelopmentApplicationsUrl).href;
        const isPdf = href.toLowerCase().includes(".pdf");
        if (isPdf && !pdfUrls.includes(href)) // avoid duplicates
            pdfUrls.push(href);
    });
    if (pdfUrls.length === 0) {
        console.log("No PDF URLs were found on the page.");
        return;
    }

    // Select the most recent PDF (the first one on the page). And randomly select one other PDF
    // (avoid processing all PDFs at once because this may use too much memory, resulting in
    // morph.io terminating the current process). The two are then processed in a random order.
    const [mostRecentPdfUrl, ...otherPdfUrls] = pdfUrls;
    const selectedPdfUrls = [mostRecentPdfUrl];
    if (otherPdfUrls.length > 0)
        selectedPdfUrls.push(otherPdfUrls[getRandom(0, otherPdfUrls.length)]);
    if (getRandom(0, 2) === 0)
        selectedPdfUrls.reverse();

    // Parse each selected PDF and save its development applications one at a time.
    for (const selectedPdfUrl of selectedPdfUrls) {
        console.log(`Parsing document: ${selectedPdfUrl}`);
        const parsedApplications = await parsePdf(selectedPdfUrl);
        console.log(`Parsed ${parsedApplications.length} development application(s) from document: ${selectedPdfUrl}`);
        console.log(`Inserting development applications into the database.`);
        for (const parsedApplication of parsedApplications)
            await insertRow(database, parsedApplication);
    }
}
// Run the scraper. "Complete." is logged once main() has finished; if main() fails then the
// error is written to the console (it is not re-thrown).
main().then(() => console.log("Complete.")).catch(error => console.error(error));
//# sourceMappingURL=data:application/json;base64,