-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
84 lines (70 loc) · 2.23 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import puppeteer from "puppeteer";
import slugify from "slugify";
import chalk from "chalk";
import { chunk, createProgressBar, fetchJson, writeFile } from "./utils.js";
import { getRepoIssueUrls, scrapIssueUsers } from "./issue-scraper.js";
// Repository to scrape, read from the first user-supplied CLI argument
// (the usage message in this file gives 'openai/dall-e' as an example).
const [, , githubRepo] = process.argv;

// Passed as the second argument to chunk() and shown in the "parallel x…" log line.
const browserPages = 15;

// Forwarded to scrapIssueUsers() as its `delayMs` option.
const delayMs = 1_000;
/**
 * Verifies that a repository can be fetched from the GitHub REST API before
 * scraping starts. Terminates the process with exit code 1 when it cannot.
 *
 * @param {string} githubRepo - Repository path interpolated into
 *   `https://api.github.com/repos/${githubRepo}`.
 * @returns {Promise<void>} Resolves once the repository has been confirmed.
 */
const lookingForGithubRepo = async (githubRepo) => {
  console.log(
    chalk.cyan(`Looking for repository "${githubRepo}" in GitHub...`)
  );

  const repoJson = await fetchJson(
    `https://api.github.com/repos/${githubRepo}`
  );

  if (repoJson?.message === "Not Found") {
    console.error(` -> ❌ Repository "${githubRepo}" not found.`);
    process.exit(1);
  }

  // Any other error payload (rate limiting, bad credentials, ...) also carries
  // a top-level `message`, which a repository object from the GitHub REST API
  // does not have. Treat those, and an empty response, as failures instead of
  // reporting the repository as found.
  if (repoJson == null || typeof repoJson.message === "string") {
    console.error(
      ` -> ❌ GitHub API error: ${repoJson?.message ?? "empty response"}`
    );
    process.exit(1);
  }

  console.log(` -> Repository found.\n`);
};
/**
 * Scrapes the users of every issue URL of a GitHub repository and writes the
 * unique, sorted results (one per line) to `users/<slug>.csv`.
 *
 * Steps: confirm the repository via lookingForGithubRepo, collect issue URLs
 * with getRepoIssueUrls, split them with chunk(), run scrapIssueUsers on every
 * chunk concurrently against one shared puppeteer browser, then de-duplicate,
 * sort and save. A chunk that fails is recorded in the logger and contributes
 * no users; the remaining chunks still complete.
 *
 * @param {string} [repo=githubRepo] - Repository to scrape; defaults to the
 *   module-level value read from the command line.
 * @returns {Promise<void>} Resolves after the results file has been written.
 */
const scrapGithubRepo = async (repo = githubRepo) => {
  const filepath = `users/${slugify(repo.replaceAll("/", "_"))}.csv`;
  // Collects error messages so they can be printed after the progress bar output.
  const logger = [];

  await lookingForGithubRepo(repo);
  const urls = await getRepoIssueUrls(repo);

  console.log(
    chalk.cyan(`Start scrapping users (parallel x${browserPages})...`)
  );
  const progressBar = createProgressBar(urls.length);

  const browser = await puppeteer.launch({ headless: true });
  let chunkUsers;
  try {
    // NOTE(review): chunk() lives in ./utils.js. If it produces chunks of
    // `browserPages` items (rather than `browserPages` chunks), the number of
    // concurrent scrapIssueUsers calls grows with the issue count — confirm.
    const urlChunks = chunk(urls, browserPages);
    chunkUsers = await Promise.all(
      urlChunks.map(async (urlChunk) => {
        try {
          return await scrapIssueUsers({
            browser,
            urls: urlChunk,
            delayMs,
            progressBar,
            logger,
          });
        } catch (error) {
          logger.push(` -> Error scraping issue users: ${error}`);
          return [];
        }
      })
    );
  } finally {
    // Always release the browser, even when chunking or scraping throws.
    await browser.close();
  }

  const results = [...new Set(chunkUsers.flat())].sort();

  console.log("\n");
  console.log(chalk.green("Done!"), `Found ${results.length} unique users`);
  await writeFile(filepath, results.join("\n"));
  console.log(`Results saved into "${filepath}"\n`);

  if (logger.length > 0) {
    console.log(`\n\nLogger:`);
    console.log(logger.join("\n"));
  }
};
// Entry point: require a repository argument, then run the scraper.
if (!githubRepo) {
  console.error(
    chalk.red(
      "❌ Please provide a GitHub repository as an argument. (ex. 'openai/dall-e')"
    )
  );
  process.exit(1);
}

// NOTE(review): presumably raised to silence MaxListenersExceededWarning caused
// by the concurrent puppeteer work — confirm the value is still appropriate.
process.setMaxListeners(50);

scrapGithubRepo(githubRepo).catch((error) => {
  // Report failures on stderr and signal them through the exit code so that
  // shells and CI do not mistake a failed run for a successful one.
  console.error(error);
  process.exitCode = 1;
});