-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
155 lines (121 loc) · 4.2 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
// This script automates the process of
// scraping specific information from a list of URLs stored in a CSV file
// from the website www.copyrightable.com
import { parse } from 'csv-parse';
import { chromium } from 'playwright';
import fs from 'fs'
// ---------------------------------------------------------------------------
// Field labels expected on a record page. Used both to slice the page's inner
// text into fields and (reordered) as the CSV column layout.
// ---------------------------------------------------------------------------
const LABELS = [
  'Copyright Claimant',
  'Registration Number',
  'Registration Date',
  'Year of Creation',
  'Record Status',
  'Physical Description',
  'Personal Authors',
  'Corporate Authors',
  'Rights Note',
  'Application Title Statement',
  'Author Statement',
  'Authorship',
];

// Column order for the output CSV (adds the derived 'Email' column).
const HEADERS_ORDER = [
  'Copyright Claimant',
  'Application Title Statement',
  'Email',
  'Registration Number',
  'Registration Date',
  'Year of Creation',
  'Record Status',
  'Physical Description',
  'Personal Authors',
  'Corporate Authors',
  'Rights Note',
  'Author Statement',
  'Authorship',
];

// Precompiled per-label extractors (hoisted out of the URL loop).
// Each regex captures everything after "<label>:" up to the next known label
// or the end of the text. NOTE: the alternation MUST be wrapped in (?:...) —
// without it, `\n\s*` binds only to the first label and every other label
// matches in the middle of a line, truncating values.
const LABEL_REGEXES = LABELS.map((label) => ({
  label,
  regex: new RegExp(
    `${label}:?\\s*([\\s\\S]*?)(?=\\n\\s*(?:${LABELS.join('|')})|$)`,
    'i',
  ),
}));

/**
 * Read the URL list from the first column of a CSV file.
 * @param {string} csvPath - Path to the input CSV (row 1 is a header, skipped).
 * @returns {Promise<string[]>} URLs in file order.
 */
function loadUrls(csvPath) {
  return new Promise((resolve, reject) => {
    const urls = [];
    fs.createReadStream(csvPath)
      .pipe(parse({ delimiter: ',', from_line: 2 })) // skip header row
      .on('data', (row) => urls.push(row[0])) // URLs are in the first column
      .on('end', () => resolve(urls))
      .on('error', reject);
  });
}

/**
 * Slice the page's inner text into label -> value pairs.
 * Only the first line of each matched value is kept.
 * @param {string} content - innerText of the record container.
 * @returns {Object<string, string>} Extracted fields (missing labels omitted).
 */
function extractFields(content) {
  const fields = {};
  for (const { label, regex } of LABEL_REGEXES) {
    const match = content.match(regex);
    if (match) {
      fields[label] = match[1].split('\n')[0].trim();
    }
  }
  return fields;
}

/**
 * Post-process special fields and normalise for CSV output:
 *  - 'Registration Number' is truncated at its first comma;
 *  - an email is split out of 'Rights Note' into a new 'Email' field;
 *  - commas are stripped from every value (the CSV is written unquoted);
 *  - empty values become 'N/A'.
 * @param {Object<string, string>} raw - Output of extractFields().
 * @returns {Object<string, string>} Normalised copy (input not mutated).
 */
function normaliseFields(raw) {
  const fields = { ...raw };

  if (fields['Registration Number'] !== undefined) {
    // Keep only the text before the first comma.
    fields['Registration Number'] = fields['Registration Number']
      .split(',')[0]
      .trim();
  }

  if (fields['Rights Note'] !== undefined) {
    // The email, when present, is the last comma-separated token.
    const emailMatch = fields['Rights Note'].match(/[^,]*@[^,]*$/);
    fields['Email'] = emailMatch ? emailMatch[0].trim() : 'N/A';
    if (emailMatch) {
      // Remove the email (and any trailing comma) from the note itself.
      fields['Rights Note'] = fields['Rights Note']
        .replace(emailMatch[0], '')
        .trim()
        .replace(/,\s*$/, '');
    }
  }

  for (const [key, value] of Object.entries(fields)) {
    // Strip commas so an unquoted CSV row cannot be split mid-value, THEN
    // apply the N/A fallback — the original did these in the wrong order,
    // so empty values were written as '' instead of 'N/A'.
    const cleaned = value.replace(/,/g, '');
    fields[key] = cleaned === '' ? 'N/A' : cleaned;
  }
  return fields;
}

// Scrape every record page listed in mylinks1.csv from www.copyrightable.com
// and append one CSV row per page to updated_data1.csv.
(async () => {
  const browser = await chromium.launch();
  try {
    const page = await browser.newPage();
    const urls = await loadUrls('mylinks1.csv');
    console.log("my urls: ", urls);

    let count = 0;
    for (const link of urls) {
      // One bad page should not abort the whole run.
      try {
        await page.goto(link, { waitUntil: 'domcontentloaded' });
        const content = await page.$eval(
          'body > main > div:nth-child(2) > div > div.flex.gap-8.mt-9.flex-col-reverse.md\\:flex-row',
          (el) => el.innerText,
        );

        const fields = normaliseFields(extractFields(content));
        const row = HEADERS_ORDER.map((header) => fields[header] || '').join(',');
        fs.appendFileSync('updated_data1.csv', row + '\n');
        console.log('record no- ', ++count);
      } catch (err) {
        console.error(`failed to scrape ${link}:`, err);
      }
    }
  } finally {
    // Always release the browser, even if scraping throws.
    await browser.close();
  }
})().catch((err) => {
  console.error('fatal:', err);
  process.exitCode = 1;
});