Skip to content

Commit

Permalink
Feature bernardro#17: It's now possible to scrape video comments
Browse files Browse the repository at this point in the history
  • Loading branch information
X0R0X committed Jun 14, 2021
1 parent 31054d3 commit c9ed5b7
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 2 deletions.
8 changes: 7 additions & 1 deletion src/crawler_utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ exports.handleMaster = async ({ page, requestQueue, searchKeywords, maxResults,
await utils.loadVideosUrls(requestQueue, page, maxRequested, ['MASTER', 'SEARCH'].includes(label), searchOrUrl);
};

exports.handleDetail = async (page, request, extendOutputFunction, subtitlesSettings) => {
exports.handleDetail = async (page, request, extendOutputFunction, subtitlesSettings, scrapeCommentCount= -1) => {
const { titleXp, viewCountXp, uploadDateXp, likesXp, dislikesXp, channelXp, subscribersXp, descriptionXp, durationSlctr } = CONSTS.SELECTORS.VIDEO;

log.info(`handling detail url ${request.url}`);
Expand Down Expand Up @@ -184,6 +184,11 @@ exports.handleDetail = async (page, request, extendOutputFunction, subtitlesSett
}
}

let comments = null;
if (scrapeCommentCount > -1) {
comments = await utils.getVideoComments(page);
}

await extendOutputFunction({
title,
id: videoId,
Expand All @@ -201,5 +206,6 @@ exports.handleDetail = async (page, request, extendOutputFunction, subtitlesSett
subtitles: srt,
subtitlesURL: srtUrl,
subtitlesType: srtType,
comments: comments,
}, { page, request });
};
4 changes: 3 additions & 1 deletion src/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Apify.main(async () => {
downloadSubtitles = false,
saveSubsToKVS: saveSubtitlesToKVS = false,
subtitlesLanguage = 'en',
scrapeCommentCount = -1,
} = input;
if (verboseLog) {
log.setLevel(log.LEVELS.DEBUG);
Expand Down Expand Up @@ -202,7 +203,8 @@ Apify.main(async () => {
saveToKVS: saveSubtitlesToKVS,
language: subtitlesLanguage,
kvs: kvStore,
}
},
scrapeCommentCount
);
break;
}
Expand Down
119 changes: 119 additions & 0 deletions src/utility.js
Original file line number Diff line number Diff line change
Expand Up @@ -511,3 +511,122 @@ module.exports.proxyConfiguration = async ({

return configuration;
};
/**
 * Scrape comments from a video detail page.
 *
 * The page is scrolled until either enough comment threads are present in the DOM or the page stops issuing
 * new network requests, then author and text are read from every loaded comment thread.
 *
 * @param {Object} page Puppeteer page that already has the video detail page open.
 * @param {number} [maxCommentCount=0] Maximum number of comments to return. Any value <= 0 means "no limit":
 *   scrolling continues until no more comments are being loaded (might take a long time).
 * @returns {Promise<Array<{author: string, comment: string}>>} Scraped comments in DOM order.
 */
module.exports.getVideoComments = async (page, maxCommentCount = 0) => {
    // This is copied from SDK - We needed to add stopScrollCallback function parameter to quit scrolling when we have
    // enough comments scraped. This should be replaced by Apify.utils.puppeteer.infiniteScroll when the SDK will be
    // updated by stopScrollCallback feature.
    const infiniteScroll = async (scrollPage, options = {}) => {
        const {
            timeoutSecs = 0,
            waitForSecs = 4,
            scrollDownAndUp = false,
            buttonSelector,
            stopScrollCallback,
        } = options;
        let finished = false;
        const startTime = Date.now();
        const CHECK_INTERVAL_MILLIS = 1000;
        const SCROLL_HEIGHT_IF_ZERO = 10000;
        const maybeResourceTypesInfiniteScroll = ['xhr', 'fetch', 'websocket', 'other'];
        const resourcesStats = {
            newRequested: 0,
            oldRequested: 0,
            matchNumber: 0,
        };

        // Kept as a named function so that the listener can be detached again once scrolling is over.
        const onRequest = (msg) => {
            if (maybeResourceTypesInfiniteScroll.includes(msg.resourceType())) {
                resourcesStats.newRequested++;
            }
        };
        scrollPage.on('request', onRequest);

        // Once per interval: if no new matching request was seen for `waitForSecs` consecutive checks,
        // or the optional timeout elapsed, scrolling is considered finished.
        const checkFinished = setInterval(() => {
            if (resourcesStats.oldRequested === resourcesStats.newRequested) {
                resourcesStats.matchNumber++;
                if (resourcesStats.matchNumber >= waitForSecs) {
                    clearInterval(checkFinished);
                    finished = true;
                    return;
                }
            } else {
                resourcesStats.matchNumber = 0;
                resourcesStats.oldRequested = resourcesStats.newRequested;
            }
            // check if timeout has been reached
            if (timeoutSecs !== 0 && (Date.now() - startTime) / 1000 > timeoutSecs) {
                clearInterval(checkFinished);
                finished = true;
            }
        }, CHECK_INTERVAL_MILLIS);

        const doScroll = async () => {
            /* istanbul ignore next */
            await scrollPage.evaluate(async (scrollHeightIfZero) => {
                const delta = document.body.scrollHeight === 0 ? scrollHeightIfZero : document.body.scrollHeight;
                window.scrollBy(0, delta);
            }, SCROLL_HEIGHT_IF_ZERO);
        };

        const maybeClickButton = async () => {
            const button = await scrollPage.$(buttonSelector);
            // Box model returns null if the button is not visible
            if (button && await button.boxModel()) {
                await button.click({ delay: 10 });
            }
        };

        try {
            while (!finished) {
                await doScroll();
                await scrollPage.waitForTimeout(50);
                if (scrollDownAndUp) {
                    await scrollPage.evaluate(() => {
                        window.scrollBy(0, -1000);
                    });
                }
                if (buttonSelector) {
                    await maybeClickButton();
                }
                if (stopScrollCallback && await stopScrollCallback()) {
                    break;
                }
            }
        } finally {
            // Always release the timer and the listener, including when the loop was left early through
            // stopScrollCallback or an exception; otherwise they pile up on every call.
            clearInterval(checkFinished);
            scrollPage.off('request', onRequest);
        }
    };

    const commentSelector = 'ytd-comment-thread-renderer';
    const hasLimit = maxCommentCount > 0;

    // Scroll first to load at least one comment.
    await page.evaluate(() => {
        window.scrollBy(0, 500);
    });
    await page.waitForSelector(commentSelector);

    await infiniteScroll(page, {
        stopScrollCallback: async () => {
            // The callback runs inside the browser, so the selector has to be handed over as an argument;
            // variables from this Node.js scope are not visible there.
            const commentCount = await page.evaluate(
                (selector) => document.body.querySelectorAll(selector).length,
                commentSelector,
            );
            log.debug(`Got ${commentCount}/${maxCommentCount} comments for ${page.url()}`);
            return hasLimit && commentCount >= maxCommentCount;
        },
    });

    const comments = await page.evaluate((selector, max) => {
        // NOTE(review): /\\n/g matches a literal backslash followed by "n", not a newline character.
        // Kept unchanged here - confirm whether stripping real newlines was the intent.
        const readHtml = (node) => (node ? node.innerHTML.trim().replace(/\\n/g, '') : '');

        const threads = document.body.querySelectorAll(selector);
        const collected = [];
        for (let i = 0; i < threads.length; i++) {
            // Only enforce the cap when a positive limit was requested; <= 0 means "take everything".
            if (max > 0 && collected.length >= max) {
                break;
            }
            const thread = threads[i];
            const author = readHtml(thread.querySelector('#author-text > span'));
            // Threads without a readable author are skipped.
            if (author) {
                collected.push({
                    author,
                    comment: readHtml(thread.querySelector('#content-text')),
                });
            }
        }
        return collected;
    }, commentSelector, maxCommentCount);

    log.info(`Scraped ${comments.length} comments for video ${page.url()}`);

    return comments;
};

0 comments on commit c9ed5b7

Please sign in to comment.