-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrapeData.js
116 lines (96 loc) · 4.35 KB
/
scrapeData.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
const got = require('got');
const fs = require('fs').promises;
const _ = require('lodash');
const excel = require('excel4node');
const progress = require('cli-progress');
const _colors = require('colors');
const randomUseragent = require('random-useragent');
/**
 * Pause asynchronously.
 *
 * @param {number} ms - Delay in milliseconds, handed straight to `setTimeout`.
 * @returns {Promise<void>} Promise that fulfils (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
// Single-line console progress bar; started and incremented by the scraping
// loop below, one tick per ASIN processed.
const bar = new progress.SingleBar({
  hideCursor: true,
  barCompleteChar: '\u2588',
  barIncompleteChar: '\u2591',
  // Layout: label, red bar, percentage, ETA, processed/total counter.
  format: [
    'Currently scraping Amazon... |',
    _colors.red('{bar}'),
    '| {percentage}% | ETA: {eta_formatted} | {value}/{total} Chunks',
  ].join(''),
});
/**
 * Entry point of the script.
 *
 * Reads ASINs from `ASIN_data.txt` (one per line), requests each product page
 * from amazon.com, extracts a fixed set of fields from the returned HTML with
 * regular expressions and stores them in `Amazon Report.xlsx`, one row per
 * ASIN. The report file is rewritten after every ASIN so partial results
 * survive an interrupted run.
 *
 * A field that cannot be found is written as "N/A" (the "MBA?" column holds
 * "Yes"/"No"). A page that cannot be fetched does not abort the run: its row
 * is filled with "N/A" and the failure is listed once the progress bar has
 * finished. Any other error stops the bar, is printed, and ends the process
 * with exit code 1.
 */
(async () => {
  const NOT_FOUND = "N/A";
  const REPORT_FILE = 'Amazon Report.xlsx';
  const MBA_MARKER = "Lightweight, Classic fit, Double-needle sleeve and bottom hem";

  // [spreadsheet column, pattern]; capture group 1 of the first match becomes the cell text.
  const fieldPatterns = [
    // title
    [2, /<span id="productTitle" class="a-size-large product-title-word-break">[\r\n\s]+([a-zA-Z0-9].+)/m],
    // rating
    [3, /<span id="acrPopover" class="reviewCountTextLinkedHistogram noUnderline" title="(.+)"/],
    // reviews
    [4, /<span id="acrCustomerReviewText" class="a-size-base">([0-9, ]+)/],
    // price
    [5, /priceBlockBuyingPriceString">([\$0-9 \-\.]+)/],
    // rank
    [6, /#([0-9,]+) in/],
    // fulfiller
    [7, /<span class="tabular-buybox-text">([\w.]+)<\/span>/],
    // availability
    // NOTE(review): this pattern expects a literal " }" inside the class attribute — confirm against live markup.
    [8, /<div id="availability" class="a-section a-spacing-base }">[\r\n\s]+<span.+>[\r\n\s]+([\w ]+)/],
  ];

  // Returns capture group 1 of the first match, or the placeholder when the pattern is absent.
  const extract = (html, pattern) => {
    const found = html.match(pattern);
    return found ? found[1] : NOT_FOUND;
  };

  // Read ASINs file. Accept both LF and CRLF line endings and drop blank or
  // whitespace-only lines so a stray "\r" or space never ends up in a request URL.
  const rawList = (await fs.readFile('ASIN_data.txt')).toString();
  const ASINs = _.compact(rawList.split(/\r?\n/).map((line) => line.trim()));
  bar.start(ASINs.length, 0);

  // Configure spreadsheet:
  const wb = new excel.Workbook();
  const ws = wb.addWorksheet('Amazon Report');

  // Serialises the workbook and waits until it is on disk, so write errors
  // surface here and the process never exits with a write still in flight.
  const saveReport = async () => {
    const buffer = await wb.writeToBuffer();
    await fs.writeFile(REPORT_FILE, buffer);
  };

  // Set spreadsheet headers:
  const headerStyle = wb.createStyle({ font: { bold: true } });
  const headers = ["ASIN", "Title", "Rating", "Reviews", "Price", "Rank", "Fulfilled By", "Available?", "MBA?"];
  headers.forEach((label, index) => {
    ws.cell(1, index + 1).string(label).style(headerStyle);
  });

  // Set column sizes:
  const widths = [20, 70, 20, 10, 20, 10, 20, 30, 10];
  widths.forEach((width, index) => {
    ws.column(index + 1).setWidth(width);
  });

  // Request failures are collected and reported after the bar stops, because
  // logging while the bar is active would garble its output.
  const failures = [];

  // Scrape each Amazon page:
  for (const [index, asin] of ASINs.entries()) {
    const row = index + 2; // row 1 holds the headers
    ws.cell(row, 1).string(asin);

    let html = null;
    try {
      const res = await got.get(`https://amazon.com/dp/${encodeURIComponent(asin)}?th=1&psc=1`, {
        headers: { 'User-Agent': randomUseragent.getRandom() },
        retry: { limit: 10 },
      });
      html = res.body;
    } catch (err) {
      failures.push(`${asin}: ${err.message}`);
    }

    if (html === null) {
      // Page could not be fetched: mark every data column as unavailable.
      for (let column = 2; column <= headers.length; column += 1) {
        ws.cell(row, column).string(NOT_FOUND);
      }
    } else {
      for (const [column, pattern] of fieldPatterns) {
        ws.cell(row, column).string(extract(html, pattern));
      }
      // Merch by Amazon: "Yes" when the marker text appears anywhere in the page.
      ws.cell(row, 9).string(html.includes(MBA_MARKER) ? "Yes" : "No");
    }

    await saveReport(); // persist the report each time a row is completed
    bar.increment();

    // Wait anywhere between 2-5s before the next request; no delay is needed after the last one.
    if (index < ASINs.length - 1) {
      await sleep(2000 + Math.floor(Math.random() * 3000));
    }
  }

  // End of program. Stopping the bar finishes its line and undoes hideCursor.
  bar.stop();
  for (const failure of failures) {
    console.error(`Request failed for ${failure}`);
  }
  process.exit(0);
})().catch((err) => {
  // Unexpected failure (unreadable input file, report write error, ...).
  bar.stop();
  console.error(err);
  process.exit(1);
});