Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

scrape.js: Use GraphQL API #148

Merged
merged 1 commit into from
Oct 30, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions lib/queries/github_search_org.graphql
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Searches GitHub with the free-text $query and returns the login of the hit
# when it is an organization.
# - `type: USER` with `first: 1` requests only the first search result.
# - The inline fragment selects `login` for Organization nodes only; no fields
#   are selected for a node of any other type.
query($query: String!) {
search(type: USER, query: $query, first: 1) {
nodes {
...on Organization {
login
}
}
}
}
6 changes: 6 additions & 0 deletions lib/queries/github_user_info.graphql
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Looks up a single GitHub user by the login passed in $user and selects
# that user's `login` and `updatedAt` fields.
query($user: String!) {
user(login: $user) {
login
updatedAt
}
}
2 changes: 2 additions & 0 deletions lib/queries/index.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
const { loadQuery } = require('../utils')

// One exported constant per GraphQL query, each holding whatever `loadQuery`
// returns for the given query name.
// NOTE(review): the names match the `.graphql` files in this directory, so
// `loadQuery` presumably reads `<name>.graphql` — confirm in ../utils.
module.exports.GITHUB_REPO_INFO_QUERY = loadQuery('github_repo_info')
module.exports.GITHUB_SEARCH_ORG_QUERY = loadQuery('github_search_org')
module.exports.GITHUB_USER_INFO_QUERY = loadQuery('github_user_info')
77 changes: 67 additions & 10 deletions lib/scrape.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@ const validUsername = require('valid-github-username')
const wdk = require('wikidata-sdk')
const cheerio = require('cheerio')

const { GITHUB_REPO_INFO_QUERY } = require('./queries')
const {
GITHUB_REPO_INFO_QUERY,
GITHUB_SEARCH_ORG_QUERY,
GITHUB_USER_INFO_QUERY,
} = require('./queries')
const { getLatestCommitMessage } = require('./utils')

const GH_BASE = 'https://github.com'
Expand Down Expand Up @@ -137,6 +141,7 @@ async function fetchRepositoryInfo(org) {
;({ data, errors } = await client.query(GITHUB_REPO_INFO_QUERY, { org }))
} catch (error) {
console.warn(`GitHub query for org ${org} fails, error: ${error}`)
return []
}

if (data && data.organization) {
Expand Down Expand Up @@ -209,12 +214,34 @@ async function checkGitHubUserExists(user) {
}

async function searchGitHubOrgs(query) {
let results = []
// use REST API to fetch
const res = await fetch(
`${GH_API_BASE}/search/users?q=${query}%20type:org`,
`${GH_API_BASE}/search/users?q=${query}`,
GH_API_OPTIONS
)
console.log('query=', `${GH_API_BASE}/search/users?q=${query}`)
const { items } = await res.json()
return items || []
results = results.concat(items)
// use GraphQL API to fetch
let data, errors
try {
;({ data, errors } = await client.query(GITHUB_SEARCH_ORG_QUERY, { query }))
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting, what is the semicolon for?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure, but eslint will be unhappy with that ;)

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it is because you put parentheses in

;({ data, errors } = await client.query(GITHUB_SEARCH_ORG_QUERY, { query }))

Another question: why are the parentheses used?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

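Yes, the semicolon is there because of the parentheses: in a file written without statement-terminating semicolons, a line that starts with `(` could otherwise be parsed as a call on the previous line's expression.
I have to use the parentheses because a statement that begins with `{` is parsed as a block, so a destructuring assignment without a declaration would be a parsing error.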

} catch (error) {
console.warn(`GitHub query ${query} fails, error: ${error}`)
}

if (data && data.search && data.search.nodes) {
results = results.concat(data.search.nodes)
} else {
const errorMessage =
errors && errors.length ? errors[0].message : 'unknown error'
console.warn(
`Cannot query ${query} from GitHub, error message: ${errorMessage}`
)
}
console.log('results', results)
return unique(results)
}

async function getGitHubUserHistory(user, from, to) {
Expand Down Expand Up @@ -262,12 +289,34 @@ function findMatches(input, pattern) {
}

async function getGitHubUser(user) {
const res = await fetch(`${GH_API_BASE}/users/${user}`, GH_API_OPTIONS)
let response = await res.json()
if (response && response.message) {
response = undefined
let data, errors
try {
;({ data, errors } = await client.query(GITHUB_USER_INFO_QUERY, { user }))
} catch (error) {
console.warn(`GitHub query for user ${user} fails, error: ${error}`)
}

if (data && data.user) {
return data.user
} else {
const errorMessage =
errors && errors.length ? errors[0].message : 'unknown error'
console.warn(
`Cannot fetch user ${user} via GitHub GraphQL,`,
`error message: ${errorMessage}, resorting to GitHub REST API hit`
)
const res = await fetch(`${GH_API_BASE}/users/${user}`, GH_API_OPTIONS)
const response = await res.json()
if (response && response.message) {
console.warn(
`Cannot fetch user ${user} via GitHub REST API,`,
`error message: ${response.message}`
)
return undefined
} else {
return response
}
}
return response
}

async function findOrganization({
Expand Down Expand Up @@ -307,10 +356,14 @@ async function findOrganization({
)

const removePattern = /the|project|\([a-zA-Z]+\)/gi
const searchQuery = name.replace(removePattern, '').trim()
const searchQuery = name.replace(removePattern, '').trim() + ' type:org'
const searchResults = await searchGitHubOrgs(searchQuery)

if (searchResults.length > 0 && searchResults[0].score > MIN_SEARCH_SCORE) {
if (searchResults.length > 0) {
if (searchResults[0].score && searchResults[0].score <= MIN_SEARCH_SCORE) {
// GitHub REST API returns a list of matches with confidence score
return null
}
return searchResults[0].login
}

Expand Down Expand Up @@ -658,6 +711,10 @@ async function fetchDates() {
return res.json()
}

/**
 * Returns the distinct elements of `arr`, keeping the first occurrence of
 * each one and preserving the original order.
 *
 * Elements are compared by the key `keyOf` yields for them, using the same
 * equality as `Map`/`Set`. With the default identity key this is plain `Set`
 * de-duplication, which compares objects by reference: two distinct objects
 * with identical contents are both kept. Pass a selector such as
 * `item => item.login` to de-duplicate records by one of their fields.
 *
 * @param {Iterable<*>|null|undefined} arr - values to de-duplicate; a nullish
 *   value yields an empty array
 * @param {function(*): *} [keyOf] - maps an element to its comparison key;
 *   defaults to the element itself
 * @returns {Array<*>} a new array holding the first element seen for each key
 */
function unique(arr, keyOf = item => item) {
  if (arr == null) {
    return []
  }

  // A Map iterates in insertion order, so the output order matches the
  // order in which each key was first seen.
  const firstByKey = new Map()
  for (const item of arr) {
    const key = keyOf(item)
    if (!firstByKey.has(key)) {
      firstByKey.set(key, item)
    }
  }
  return Array.from(firstByKey.values())
}

;(async () => {
const { competition_open_starts } = await fetchProgram()
COMPETITION_OPEN = new Date(competition_open_starts)
Expand Down