Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue #102 & #120 - Accomplishments & Connections #121

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
72 changes: 62 additions & 10 deletions src/profile/cleanProfileData.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,16 +59,22 @@ module.exports = (profile) => {
}

if(profile.courses){
profile.courses = profile.courses.map(({ name, year }) => {
const coursesObj = {}
if(name) {
coursesObj.name = name.replace('Course name\n', '')
}
if(year) {
coursesObj.year = year.replace('Course number\n', '')
}
return coursesObj
}
profile.courses = profile.courses.map(
({ name, year }) => ({
name: name ? name.replace('Course name\n', '') : undefined,
year: year ? year.replace('Course number\n', '') :undefined
})
);
}

if(profile.honors){
profile.honors = profile.honors.map(
({ name, date, issuer, description }) => ({
name: name ? name.replace('honor title\n', '') : undefined,
date,
issuer: issuer ? issuer.replace('honor issuer\n', '') : undefined,
description: description ? description.replace('honor description\n', '') : undefined
})
);
}

Expand All @@ -90,5 +96,51 @@ module.exports = (profile) => {
);
}

if (profile.organizations){
profile.organizations = profile.organizations.map(
({ name, date, position, description }) => ({
name: name ? name.replace('organization name\n', '') : undefined,
date,
position: position ? position.replace('organization position\n', '') : undefined,
description: description ? description.replace('organization description\n', '') : undefined
})
);
}

if(profile.patents){
profile.patents = profile.patents.map(
({ name, date, issuer, description, link }) => ({
name: name ? name.replace('Patent title\n', '') : undefined,
date,
issuer: issuer ? issuer.replace('Patent issuer and number\n', '') : undefined,
description: description ? description.replace('Patent description\n', '') : undefined,
link
})
);
}

// Clean each scraped publication: remove the leading label text from the text fields,
// pass `date` and `link` through untouched, and map falsy fields to undefined.
if(profile.publications){
profile.publications = profile.publications.map(
({ name, date, publisher, description, link }) => ({
name: name ? name.replace('publication title\n','') : undefined,
date,
// NOTE(review): publisher is stripped of the same 'publication description\n' label that
// is used for description below - confirm this matches the scraped publisher text and
// is not a copy-paste slip.
publisher: publisher ? publisher.replace('publication description\n','') : undefined,
description: description ? description.replace('publication description\n', '') : undefined,
link
})
);
}

if (profile.testScores){
profile.testScores = profile.testScores.map(
({ name, date, score, description }) => ({
name: name ? name.replace('Test name\n', '') : undefined,
date,
score,
description: description ? description.replace('Description\n', '') : undefined
})
)
}

return profile
}
16 changes: 15 additions & 1 deletion src/profile/profile.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
const openPage = require('../openPage')
const scrapSection = require('../scrapSection')
const scrapAccomplishmentPanel = require('./scrapAccomplishmentPanel')
const scrapConnections = require('./scrapConnections')
const scrollToPageBottom = require('./scrollToPageBottom')
const seeMoreButtons = require('./seeMoreButtons')
const contactInfo = require('./contactInfo')
Expand Down Expand Up @@ -36,6 +37,7 @@ module.exports = async (browser, cookies, url, waitTimeToScrapMs = 500, hasToGet
await new Promise((resolve) => { setTimeout(() => { resolve() }, waitTimeToScrapMs / 2)})
}


const [profile] = await scrapSection(page, template.profile)
const [about] = await scrapSection(page, template.about)
const positions = await scrapSection(page, template.positions)
Expand All @@ -46,11 +48,17 @@ module.exports = async (browser, cookies, url, waitTimeToScrapMs = 500, hasToGet
const skills = await scrapSection(page, template.skills)
const accomplishments = await scrapSection(page, template.accomplishments)
const courses = await scrapAccomplishmentPanel(page, 'courses')
const honors = await scrapAccomplishmentPanel(page, 'honors')
const languages = await scrapAccomplishmentPanel(page, 'languages')
const organizations = await scrapAccomplishmentPanel(page, 'organizations')
const patents = await scrapAccomplishmentPanel(page, 'patents')
const projects = await scrapAccomplishmentPanel(page, 'projects')
const publications = await scrapAccomplishmentPanel(page, 'publications')
const testScores = await scrapAccomplishmentPanel(page, 'test-scores');
const volunteerExperience = await scrapSection(page, template.volunteerExperience)
const peopleAlsoViewed = await scrapSection(page, template.peopleAlsoViewed)
const contact = hasToGetContactInfo ? await contactInfo(page) : []
const connections = await scrapConnections(page);

await page.close()
logger.info(`finished scraping url: ${url}`)
Expand All @@ -69,11 +77,17 @@ module.exports = async (browser, cookies, url, waitTimeToScrapMs = 500, hasToGet
},
accomplishments,
courses,
honors,
languages,
organizations,
patents,
projects,
publications,
testScores,
peopleAlsoViewed,
volunteerExperience,
contact
contact,
connections
}

const cleanedProfile = cleanProfileData(rawProfile)
Expand Down
72 changes: 69 additions & 3 deletions src/profile/profileScraperTemplate.js
Original file line number Diff line number Diff line change
Expand Up @@ -159,30 +159,96 @@ const template = {
}
},
courses: {
selector: '.pv-accomplishments-section',
selector: '.pv-accomplishments-block.courses li',
fields: {
name: '.pv-accomplishment-entity__title',
year: '.pv-accomplishment-entity__course-number'
}
},
honors: {
selector: '.pv-accomplishments-block.honors li',
fields: {
name: '.pv-accomplishment-entity__title',
date: '.pv-accomplishment-entity__date',
issuer: '.pv-accomplishment-entity__issuer',
description: '.pv-accomplishment-entity__description'
}
},
languages: {
selector: '.pv-accomplishments-section',
selector: '.pv-accomplishments-block.languages li',
fields: {
name: '.pv-accomplishment-entity__title',
proficiency: '.pv-accomplishment-entity__proficiency',
}
},
organizations: {
selector: '.pv-accomplishments-block.organizations li',
fields: {
name: '.pv-accomplishment-entity__title',
date: '.pv-accomplishment-entity__date',
position: '.pv-accomplishment-entity__position',
description: '.pv-accomplishment-entity__description'
}
},
patents: {
selector: '.pv-accomplishments-block.patents li',
fields: {
name: '.pv-accomplishment-entity__title',
date: '.pv-accomplishment-entity__date',
issuer: '.pv-accomplishment-entity__issuer',
description: '.pv-accomplishment-entity__description',
link: {
selector: '.pv-accomplishment-entity__external-source',
attribute: 'href'
}
}
},
projects: {
selector: '.pv-accomplishments-section',
selector: '.pv-accomplishments-block.projects li',
fields: {
name: '.pv-accomplishment-entity__title',
date: '.pv-accomplishment-entity__date',
description: '.pv-accomplishment-entity__description',
link: {
selector: '.pv-accomplishment-entity__external-source',
attribute: 'href'
}
}
},
publications: {
selector: '.pv-accomplishments-block.publications li',
fields: {
name: '.pv-accomplishment-entity__title',
date: '.pv-accomplishment-entity__date',
publisher: '.pv-accomplishment-entity__publisher',
description: '.pv-accomplishment-entity__description',
link: {
selector: '.pv-accomplishment-entity__external-source',
attribute: 'href'
}
}
},
'test-scores': {
selector: '.pv-accomplishments-block.test-scores li',
fields: {
name: '.pv-accomplishment-entity__title',
date: '.pv-accomplishment-entity__date',
score: '.pv-accomplishment-entity__score',
description: '.pv-accomplishment-entity__description'
}
},
connections: {
selector: 'li.search-result',
fields: {
name: '.name.actor-name',
distance: '.dist-value',
position: 'p.subline-level-1',
location: 'p.subline-level-2',
link: {
selector: 'a.search-result__result-link',
attribute: 'href'
}
}
}
}

Expand Down
36 changes: 36 additions & 0 deletions src/profile/scrapConnections.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
const template = require('./profileScraperTemplate')
const scrapSection = require('../scrapSection')

const logger = require('../logger')(__filename)

const seeConnectionsSelector = 'a[data-control-name*="topcard_view_all_connections"]';

// Page size used to turn the reported total into a page count.
const CONNECTIONS_PER_PAGE = 10;
// Only the first 1000 connections are scraped, i.e. at most 100 result pages.
const MAX_CONNECTION_PAGES = 100;
const MAX_CONNECTIONS = CONNECTIONS_PER_PAGE * MAX_CONNECTION_PAGES;

/**
 * Extracts the connection count from the text of the results counter.
 * Digit-group separators are tolerated ("1,234 results" -> 1234); anything
 * without a number yields 0.
 *
 * @param {string} text - raw text of the results counter element
 * @returns {number} parsed total, or 0 when no number is present
 */
const parseConnectionsTotal = (text) => {
  const match = /\d[\d.,\s]*/.exec(text || '');
  if (!match) {
    return 0;
  }
  const parsed = Number.parseInt(match[0].replace(/\D/g, ''), 10);
  return Number.isNaN(parsed) ? 0 : parsed;
};

/**
 * Builds the URL of one page of the connections search.
 *
 * @param {string} link - URL taken from the "see connections" anchor
 * @param {number} pageNumber - 1-based page index
 * @returns {string} link with the `page` query parameter appended
 */
const connectionsPageUrl = (link, pageNumber) => {
  const separator = link.includes('?') ? '&' : '?';
  return `${link}${separator}page=${pageNumber}`;
};

/**
 * Follows the "see connections" link of the profile currently open in `page`
 * and scrapes the listed connections page by page.
 *
 * Note: this navigates `page` away from the profile.
 *
 * @param {object} page - browser page (must provide `$$eval` and `goto`)
 * @returns {Promise<{total: number, connections: Array}>} the total shown on
 *   the results page and the scraped entries (at most the first 1000);
 *   `{ total: 0, connections: [] }` when the profile has no connections link
 */
const scrapConnections = async (page) => {
  // $$eval hands the callback an empty array when nothing matches, whereas
  // $eval throws - which made the "no link" branch below unreachable.
  const link = await page.$$eval(
    seeConnectionsSelector,
    (anchors) => (anchors.length ? anchors[0].href.trim() : undefined)
  );
  if (!link) {
    logger.warn('no link to connections - most likely not a 1st level connection');
    return { total: 0, connections: [] };
  }

  await page.goto(connectionsPageUrl(link, 1));
  const totalText = await page.$$eval(
    '.search-results__total',
    (nodes) => (nodes.length ? nodes[0].innerText : '')
  );
  const total = parseConnectionsTotal(totalText);

  if (total > MAX_CONNECTIONS) {
    logger.warn(`profile have ${total} connections - only first 1000 will be scraped`);
  }

  // Round up so a trailing partial page is scraped too; always scrape the
  // first page even when the total could not be read.
  const lastPage = Math.min(
    MAX_CONNECTION_PAGES,
    Math.max(1, Math.ceil(total / CONNECTIONS_PER_PAGE))
  );

  const connections = [];
  for (let currentPage = 1; currentPage <= lastPage; currentPage += 1) {
    // Page 1 is already loaded; never navigate past the last page.
    if (currentPage > 1) {
      await page.goto(connectionsPageUrl(link, currentPage));
    }
    // TODO - Last element on page isn't captured
    const connectionsOnPage = await scrapSection(page, template.connections);
    connections.push(...connectionsOnPage);
  }

  logger.info('finished scraping connections');
  return { total, connections };
};

module.exports = scrapConnections;
4 changes: 3 additions & 1 deletion src/profile/seeMoreButtons.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
const logger = require('../logger')(__filename)
const showMoreAccomplishments = require('./showMoreAccomplishments')

const seeMoreButtons = [
{
id: 'SHOW_MORE_ABOUT',
Expand Down Expand Up @@ -36,7 +38,7 @@ const clickAll = async(page) => {
}
}

return
return await showMoreAccomplishments(page);
}

module.exports = { clickAll }
47 changes: 47 additions & 0 deletions src/profile/showMoreAccomplishments.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
const logger = require('../logger')(__filename)

// Accomplishment panels to expand; each name is the CSS class suffix of its block.
const accomplishments = [
  'courses',
  'projects',
  'languages',
  'honors',
  'patents',
  'publications',
  'organizations',
  'test-scores'
];

// Upper bound on "see more" clicks per panel, so a button that never leaves
// the DOM cannot keep the scraper spinning forever.
const MAX_SEE_MORE_CLICKS = 100;

/**
 * Expands every accomplishment panel on the page and then keeps clicking the
 * panel's inline "see more" button until it is gone (or the click limit is hit).
 * Failures are logged and the next panel is processed; nothing is thrown.
 *
 * @param {object} page - browser page (must provide `$$` returning clickable handles)
 * @returns {Promise<undefined>}
 */
const showAccomplishments = async (page) => {
  for (const accomplishment of accomplishments) {
    const selector = `.pv-accomplishments-block.${accomplishment}`;

    // Step 1: open the panel through the first button inside its block.
    try {
      const [expandButton] = await page.$$(`${selector} button`);
      if (!expandButton) {
        continue;
      }
      await expandButton.click();
    } catch (e) {
      logger.warn(`couldn't click on ${accomplishment}, it's probably invisible`);
      continue;
    }

    // Step 2: load the remaining entries, one "see more" click at a time.
    let clicks = 0;
    while (clicks < MAX_SEE_MORE_CLICKS) {
      try {
        const [seeMoreButton] = await page.$$(`${selector} button.pv-profile-section__see-more-inline`);
        if (!seeMoreButton) {
          break;
        }
        await seeMoreButton.click();
        clicks += 1;
      } catch (e) {
        logger.warn(`stopped expanding ${accomplishment}: ${e && e.message}`);
        break;
      }
    }

    if (clicks === MAX_SEE_MORE_CLICKS) {
      logger.warn(`gave up expanding ${accomplishment} after ${MAX_SEE_MORE_CLICKS} clicks`);
    }
  }
};

module.exports = showAccomplishments;
1 change: 1 addition & 0 deletions src/scrapSection.js
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ const scrapSelector = (selector, section) =>
Object.keys(section.fields)
.reduce(scrapSelectorFields(selector, section), Promise.resolve({}))


module.exports = async (page, section) => {
const sectionSelectors = await page.$$(section.selector)

Expand Down