add error handling to puppeteer
chadlangston committed May 14, 2024
1 parent 395b6b5 commit 1c7cdd3
Showing 3 changed files with 1,327 additions and 125 deletions.
184 changes: 100 additions & 84 deletions ContentExtractorPuppeteer.js
@@ -22,34 +22,43 @@ class PuppeteerContentExtractor extends ContentExtractor {
     this.visitedUrls.add(path);
 
     const browser = await puppeteer.launch({ headless: 'new' });
-    const page = await browser.newPage();
-    await page.goto(path, { waitUntil: 'networkidle2' });
+    try {
+      const page = await browser.newPage();
+      page.setDefaultNavigationTimeout(60000); // 60 seconds
+      await page.goto(path, { waitUntil: 'load' });
 
-    let textContent = await page.evaluate(() => {
-      return Array.from(document.querySelectorAll('p, h1, h2, h3, h4, h5, h6, span')).map(element => element.innerText).join('\n');
-    });
+      let textContent = await page.evaluate(() => {
+        return Array.from(document.querySelectorAll('p, h1, h2, h3, h4, h5, h6, span')).map(element => element.innerText).join('\n');
+      });
 
-    this.chatSession.appendMessageToFile('\n- ' + path + '\n');
-    this.addMessageToTempMessageLog(path);
-    this.addMessageToTempMessageLog(textContent);
+      this.chatSession.appendMessageToFile('\n- ' + path + '\n');
+      this.addMessageToTempMessageLog(path);
+      this.addMessageToTempMessageLog(textContent);
 
-    if (depth > 0) {
-      const links = await page.evaluate(() =>
-        Array.from(document.querySelectorAll('a')).map(a => a.href)
-      );
+      if (depth > 0) {
+        const links = await page.evaluate(() =>
+          Array.from(document.querySelectorAll('a')).map(a => a.href)
+        );
 
-      // only scrape links from the same domain
-      const baseHostname = url.parse(path).hostname;
-      const filteredLinks = links.filter(link => {
-        const absoluteUrl = url.resolve(path, link);
-        return url.parse(absoluteUrl).hostname === baseHostname;
-      });
+        // only scrape links from the same domain
+        const baseHostname = url.parse(path).hostname;
+        const filteredLinks = links.filter(link => {
+          const absoluteUrl = url.resolve(path, link);
+          return url.parse(absoluteUrl).hostname === baseHostname;
+        });
 
-      // Process links concurrently
-      const promises = filteredLinks.map(async link => await this.extractText(link, depth - 1));
-      await Promise.all(promises);
-    }
-    await browser.close();
+        // Process links concurrently
+        const promises = filteredLinks.map(async link => await this.extractText(link, depth - 1));
+        await Promise.all(promises);
+      }
+      await browser.close();
+    }
+    catch (error) {
+      console.error(`Error processing URL ${path}: ${error}`);
+    }
+    finally {
+      await browser.close();
+    }
   }
 
   async extractImages(path, depth) {
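
This hunk (and the extractImages hunk below) applies the same error-handling shape. A minimal standalone sketch, with a hypothetical withPage helper and work callback that are illustrative rather than code from this repo: the browser is launched before the try, so a failed launch leaves nothing to close, and browser.close() in finally guarantees cleanup whether the page loads, times out, or throws. Note the extra await browser.close() at the end of each committed try block is redundant, since the finally clause closes the browser again on the success path.

const puppeteer = require('puppeteer');

// Hypothetical helper (not from the repo) showing the launch/try/catch/finally shape.
async function withPage(targetUrl, work) {
  const browser = await puppeteer.launch({ headless: 'new' });
  try {
    const page = await browser.newPage();
    page.setDefaultNavigationTimeout(60000); // fail fast instead of hanging on slow pages
    await page.goto(targetUrl, { waitUntil: 'load' });
    return await work(page); // e.g. await withPage('https://example.com', page => page.title());
  } catch (error) {
    console.error(`Error processing URL ${targetUrl}: ${error}`);
  } finally {
    await browser.close(); // runs on success, timeout, or any thrown error
  }
}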
@@ -59,72 +59,79 @@ class PuppeteerContentExtractor extends ContentExtractor {
     }
     this.visitedUrls.add(path);
 
-    const browser = await puppeteer.launch({ headless: 'new' });
-    const page = await browser.newPage();
-    await page.goto(path, { waitUntil: 'networkidle2' });
+    const browser = await puppeteer.launch({headless: 'new'});
+    try {
+      const page = await browser.newPage();
+      page.setDefaultNavigationTimeout(60000); // 60 seconds
+      await page.goto(path, { waitUntil: 'load' });
 
-    let imageUrls = await page.evaluate(() =>
-      Array.from(document.querySelectorAll('img')).map(img => img.src)
-    );
+      let imageUrls = await page.evaluate(() =>
+        Array.from(document.querySelectorAll('img')).map(img => img.src)
+      );
 
-    // Resolve relative URLs and filter out duplicates and google maps
-    imageUrls = imageUrls
-      .map(src => url.resolve(path, src))
-      .filter((src, index, self) => self.indexOf(src) === index
-        && !src.includes('maps.googleapis.com'));
+      // Resolve relative URLs and filter out duplicates and google maps
+      imageUrls = imageUrls
+        .map(src => url.resolve(path, src))
+        .filter((src, index, self) => self.indexOf(src) === index
+          && !src.includes('maps.googleapis.com'));
 
-    for (const imageUrl of imageUrls) {
-      if (this.visitedUrls.has(imageUrl)) continue;
-      this.visitedUrls.add(imageUrl);
+      for (const imageUrl of imageUrls) {
+        if (this.visitedUrls.has(imageUrl)) continue;
+        this.visitedUrls.add(imageUrl);
 
-      try {
-        const imageBuffer = await page.evaluate(async (src) => {
-          const response = await fetch(src);
-          const buffer = await response.arrayBuffer();
-          return Array.from(new Uint8Array(buffer));
-        }, imageUrl);
+        try {
+          const imageBuffer = await page.evaluate(async (src) => {
+            const response = await fetch(src);
+            const buffer = await response.arrayBuffer();
+            return Array.from(new Uint8Array(buffer));
+          }, imageUrl);
 
-        const metadata = await sharp(Uint8Array.from(imageBuffer)).metadata();
+          const metadata = await sharp(Uint8Array.from(imageBuffer)).metadata();
 
-        if (metadata.width < 200 || metadata.height < 200) {
-          continue; // Skip small images
-        }
+          if (metadata.width < 200 || metadata.height < 200) {
+            continue; // Skip small images
+          }
 
-        this.chatSession.imageTokenCount += this.countImageTokens(metadata.width, metadata.height);
-        this.chatSession.appendMessageToFile('\n- ' + imageUrl + '\n');
+          this.chatSession.imageTokenCount += this.countImageTokens(metadata.width, metadata.height);
+          this.chatSession.appendMessageToFile('\n- ' + imageUrl + '\n');
 
-        if (metadata.format === 'svg') {
-          const jpegBuffer = await sharp(Uint8Array.from(imageBuffer)).jpeg().toBuffer();
-          const base64Image = `data:image/jpeg;base64,${jpegBuffer.toString('base64')}`;
-          this.addMessageToTempMessageLog(imageUrl);
-          this.addImageToTempMessageLog(base64Image);
-        } else {
-          this.addMessageToTempMessageLog(imageUrl);
-          this.addImageToTempMessageLog(imageUrl);
-        }
-      } catch (error) {
-        console.error(`Error processing image ${imageUrl}: ${error}`);
-      }
-    }
+          if (metadata.format === 'svg') {
+            const jpegBuffer = await sharp(Uint8Array.from(imageBuffer)).jpeg().toBuffer();
+            const base64Image = `data:image/jpeg;base64,${jpegBuffer.toString('base64')}`;
+            this.addMessageToTempMessageLog(imageUrl);
+            this.addImageToTempMessageLog(base64Image);
+          } else {
+            this.addMessageToTempMessageLog(imageUrl);
+            this.addImageToTempMessageLog(imageUrl);
+          }
+        } catch (error) {
+          console.error(`Error processing image ${imageUrl}: ${error}`);
+        }
+      }
 
-    if (depth > 0) {
-      const links = await page.evaluate(() =>
-        Array.from(document.querySelectorAll('a')).map(a => a.href)
-      );
+      if (depth > 0) {
+        const links = await page.evaluate(() =>
+          Array.from(document.querySelectorAll('a')).map(a => a.href)
+        );
 
-      // only scrape links from the same domain
-      const baseHostname = url.parse(path).hostname;
-      const filteredLinks = links.filter(link => {
-        const absoluteUrl = url.resolve(path, link);
-        return url.parse(absoluteUrl).hostname === baseHostname;
-      });
+        // only scrape links from the same domain
+        const baseHostname = url.parse(path).hostname;
+        const filteredLinks = links.filter(link => {
+          const absoluteUrl = url.resolve(path, link);
+          return url.parse(absoluteUrl).hostname === baseHostname;
+        });
 
-      // Process links concurrently
-      const promises = filteredLinks.map(async link => await this.extractImages(link, depth-1));
-      await Promise.all(promises);
-    }
+        // Process links concurrently
+        const promises = filteredLinks.map(async link => await this.extractImages(link, depth - 1));
+        await Promise.all(promises);
+      }
 
-    await browser.close();
+      await browser.close();
+    } catch (error) {
+      console.error(`Error processing URL ${path}: ${error}`);
+    } finally {
+      await browser.close();
+    }
   }
 }
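
One aside on the link filtering both methods keep unchanged: url.parse and url.resolve belong to Node's legacy url API. A minimal sketch of the same same-domain filter using the WHATWG URL class instead (illustrative only; the sameDomainLinks name is hypothetical and the commit itself keeps the legacy API):

// Same-domain filter equivalent to the url.parse/url.resolve version above.
function sameDomainLinks(pagePath, links) {
  const baseHostname = new URL(pagePath).hostname;
  return links.filter(link => {
    try {
      // new URL(link, base) resolves relative hrefs against the page URL
      return new URL(link, pagePath).hostname === baseHostname;
    } catch {
      return false; // skip malformed hrefs rather than throwing
    }
  });
}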
