-
Notifications
You must be signed in to change notification settings - Fork 1
/
crawler-proxy.js
142 lines (108 loc) · 4.17 KB
/
crawler-proxy.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
const puppeteer = require("puppeteer");
const createCsvWriter = require("csv-writer").createObjectCsvWriter;
const csvParse = require("csv-parse");
const fs = require("fs");
// ScrapeOps API key, read synchronously from config.json when this module loads.
// The relative path is resolved against the process working directory; a missing
// file or malformed JSON throws here, before any scraping starts.
const API_KEY = JSON.parse(fs.readFileSync("config.json")).api_key;
/**
 * Writes an array of flat record objects to a CSV file.
 *
 * Column ids/titles are taken from the keys of the first record. The writer is
 * created with `append: true` when the output file already exists, so repeated
 * calls keep adding rows to the same file.
 *
 * @param {Object[]} data - Non-empty array of records; columns come from data[0].
 * @param {string} outputFile - Path of the CSV file to create or append to.
 * @returns {Promise<void>} Resolves once the records have been written.
 * @throws {Error} "No data to write!" when data is missing or empty.
 * @throws {Error} "Failed to write to csv" when the writer rejects; the
 *   underlying error is preserved on the `cause` property.
 */
async function writeToCsv(data, outputFile) {
  if (!data || data.length === 0) {
    throw new Error("No data to write!");
  }

  // Checked up front so the writer knows whether it is extending an existing file.
  const fileExists = fs.existsSync(outputFile);
  const headers = Object.keys(data[0]).map((key) => ({ id: key, title: key }));
  const csvWriter = createCsvWriter({
    path: outputFile,
    header: headers,
    append: fileExists,
  });

  try {
    await csvWriter.writeRecords(data);
  } catch (e) {
    // Keep the stable message callers may match on, but do not lose the root cause.
    throw new Error("Failed to write to csv", { cause: e });
  }
}
/**
 * Builds the half-open sequence start, start + 1, ... up to (but excluding) end.
 *
 * @param {number} start - First value of the sequence.
 * @param {number} end - Exclusive upper bound.
 * @returns {number[]} The generated values; empty when start is not below end.
 */
function range(start, end) {
  const values = [];
  let current = start;
  while (current < end) {
    values.push(current);
    current++;
  }
  return values;
}
/**
 * Wraps a target URL in a request URL for the proxy.scrapeops.io endpoint.
 *
 * The query string carries the module-level API key, the form-encoded target
 * URL and the requested country code, in that order.
 *
 * @param {string} url - The page that should be fetched through the proxy.
 * @param {string} [location="us"] - Value sent as the `country` parameter.
 * @returns {string} The fully assembled proxy URL.
 */
function getScrapeOpsUrl(url, location = "us") {
  const query = new URLSearchParams();
  query.append("api_key", API_KEY);
  query.append("url", url);
  query.append("country", location);
  return "https://proxy.scrapeops.io/v1/?" + query.toString();
}
/**
 * Converts one business entry from the Trustpilot search payload into a flat CSV row.
 *
 * Optional fields (category, country, website, display name) fall back to "n/a"
 * instead of throwing, so a single sparse entry cannot fail the whole page.
 *
 * @param {Object} business - One element of `props.pageProps.businessUnits`.
 * @returns {Object} Row with name, stars, rating, num_reviews, website,
 *   trustpilot_url, location and category.
 */
function toBusinessRecord(business) {
  const firstCategory = Array.isArray(business.categories) ? business.categories[0] : undefined;
  const category = firstCategory && firstCategory.categoryId != null ? firstCategory.categoryId : "n/a";

  const country = business.location ? business.location.country : undefined;
  const businessLocation = country != null ? country : "n/a";

  const website = business.contact && business.contact.website ? business.contact.website : "n/a";
  let trustpilotUrl = "n/a";
  if (website !== "n/a") {
    // The review URL is built from the website with its scheme removed.
    const schemeEnd = website.indexOf("://");
    const withoutScheme = schemeEnd === -1 ? website : website.slice(schemeEnd + 3);
    trustpilotUrl = `https://www.trustpilot.com/review/${withoutScheme}`;
  }

  const displayName = business.displayName || "n/a";

  return {
    // Strip every space and apostrophe, not just the first occurrence.
    name: displayName.toLowerCase().replace(/[ ']/g, ""),
    stars: business.stars,
    rating: business.trustScore,
    num_reviews: business.numberOfReviews,
    website: website,
    trustpilot_url: trustpilotUrl,
    location: businessLocation,
    category: category,
  };
}

/**
 * Fetches one Trustpilot search-results page through the proxy and appends the
 * businesses found on it to `<keyword-with-dashes>.csv`.
 *
 * The page is attempted up to `retries + 1` times. All rows are parsed before
 * anything is written, so a failed attempt never leaves partial rows behind
 * that a retry would then duplicate. Failures are logged, not thrown.
 *
 * @param {Object} browser - Puppeteer browser used to open a new page per attempt.
 * @param {string} keyword - Search phrase; also used to derive the CSV filename.
 * @param {number} pageNumber - Zero-based page index (the site is queried with pageNumber + 1).
 * @param {string} [location="us"] - Country passed to the proxy.
 * @param {number} [retries=3] - Number of additional attempts after the first failure.
 * @returns {Promise<void>} Resolves when the page succeeded or attempts ran out.
 */
async function scrapeSearchResults(browser, keyword, pageNumber, location = "us", retries = 3) {
  // Encode the whole keyword, then use "+" for spaces as the search URL expects.
  const searchQuery = encodeURIComponent(keyword).replace(/%20/g, "+");
  const url = `https://www.trustpilot.com/search?query=${searchQuery}&page=${pageNumber + 1}`;
  const outputFile = `${keyword.replace(/ /g, "-")}.csv`;

  let tries = 0;
  let success = false;
  while (tries <= retries && !success) {
    const page = await browser.newPage();
    try {
      const proxyUrl = getScrapeOpsUrl(url, location);
      await page.goto(proxyUrl);
      console.log(`Successfully fetched: ${url}`);

      // The search data is read from the JSON embedded in the __NEXT_DATA__ script tag.
      const script = await page.$("script[id='__NEXT_DATA__']");
      const innerHTML = await page.evaluate((element) => element.innerHTML, script);
      const jsonData = JSON.parse(innerHTML);
      const businessUnits = jsonData.props.pageProps.businessUnits;

      const records = businessUnits.map(toBusinessRecord);
      // writeToCsv rejects empty input; an empty page is still a successful fetch.
      if (records.length > 0) {
        await writeToCsv(records, outputFile);
      }
      success = true;
    } catch (err) {
      console.log(`Error: ${err}, tries left ${retries - tries}`);
      tries++;
    } finally {
      await page.close();
    }
  }
}
/**
 * Scrapes `pages` search-result pages for a keyword, running at most
 * `concurrencyLimit` pages at the same time on a single browser instance.
 *
 * Each batch is fully settled before the next one starts, a failing page is
 * logged without aborting its siblings, and the browser is always closed.
 *
 * @param {string} keyword - Search phrase to scrape.
 * @param {number} pages - Number of result pages to fetch (indices 0..pages-1).
 * @param {string} location - Country passed through to the proxy.
 * @param {number} concurrencyLimit - Maximum pages in flight; values that are
 *   not a number >= 1 fall back to 1 so the batching loop always makes progress.
 * @param {number} retries - Retry count forwarded to scrapeSearchResults.
 * @returns {Promise<void>} Resolves after all batches ran and the browser closed.
 */
async function startScrape(keyword, pages, location, concurrencyLimit, retries) {
  // splice(0, n) removes nothing for n <= 0 / NaN / undefined, which would loop forever.
  const requested = Math.floor(Number(concurrencyLimit));
  const batchSize = requested >= 1 ? requested : 1;

  const pageList = range(0, pages);
  const browser = await puppeteer.launch();
  try {
    while (pageList.length > 0) {
      const currentBatch = pageList.splice(0, batchSize);
      // Catch per task so Promise.all only resolves once every page in the batch
      // has finished; otherwise the next batch would overlap with stragglers.
      const tasks = currentBatch.map(async (page) => {
        try {
          await scrapeSearchResults(browser, keyword, page, location, retries);
        } catch (err) {
          console.log(`Failed to process batch: ${err}`);
        }
      });
      await Promise.all(tasks);
    }
  } finally {
    await browser.close();
  }
}
/**
 * Script entry point: runs a scrape for every configured keyword, one keyword
 * at a time, using the fixed settings declared below.
 *
 * @returns {Promise<void>} Resolves once every keyword has been processed.
 */
async function main() {
  const searchTerms = ["online bank"];
  const settings = {
    pages: 1,
    location: "us",
    concurrencyLimit: 5,
    retries: 3,
  };

  for (let index = 0; index < searchTerms.length; index++) {
    await startScrape(
      searchTerms[index],
      settings.pages,
      settings.location,
      settings.concurrencyLimit,
      settings.retries
    );
  }
}
// Start the scraper. The returned promise is handled explicitly so a failure
// (for example the browser failing to launch) is reported and reflected in the
// process exit code instead of being left as a floating rejection.
main().catch((err) => {
  console.error(`Scrape failed: ${err}`);
  process.exitCode = 1;
});