#!/usr/bin/env node
import { render } from 'cheerio-to-text'
import { maxContentLength } from '../../lib/search/config.js'
// This module takes a cheerio page object and divides it into sections,
// using H1 and H2 heading elements as section delimiters. The text
// that follows each heading becomes the content of the search record.
const ignoredHeadingSlugs = ['in-this-article', 'further-reading', 'prerequisites']
export default function parsePageSectionsIntoRecords(page) {
  const { href, $, languageCode } = page
  const title = $('h1').first().text().trim()
  const breadcrumbsArray = $('[data-search=breadcrumbs] nav.breadcrumbs a')
    .map((i, el) => {
      return $(el).text().trim().replace('/', '').replace(/\s+/g, ' ')
    })
    .get()
    .slice(0, -1)
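  // `.slice(0, -1)` drops the last breadcrumb anchor, which points at the
  // current page itself, so only the ancestor pages remain in `breadcrumbsArray`.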
  // As when printing from the DOM, some elements should not be included
  // in the search records. These are typically navigational elements of
  // the page that don't make much sense to surface in a site search.
  $('[data-search=hide]').remove()
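  // For example, an element authored as `<div data-search="hide">...</div>`
  // is removed here, before any of the text extraction further below.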
  const breadcrumbs = breadcrumbsArray.join(' / ') || ''

  const metaKeywords = $('meta[name="keywords"]').attr('content')
  const topics = (metaKeywords ? metaKeywords.split(',') : [])
    .filter(Boolean)
    .map((keyword) => keyword.trim())

  const productName = breadcrumbsArray[0] || ''
  if (productName) topics.push(productName)
  // Remove "GitHub " to make filter queries shorter
  if (productName.includes('GitHub ')) {
    const productNameShort = productName.replace('GitHub ', '').trim()
    if (productNameShort) topics.push(productNameShort)
  }
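  // For example, a productName of "GitHub Actions" pushes both
  // "GitHub Actions" and the shortened "Actions" onto `topics`.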
  const objectID = href

  const rootSelector = '[data-search=article-body]'
  const $root = $(rootSelector)
  if ($root.length === 0) {
    console.warn(`${href} has no '${rootSelector}'`)
  } else if ($root.length > 1) {
    console.warn(`${href} has more than one '${rootSelector}' (${$root.length})`)
  }

  const $sections = $('h2', $root)
    .filter('[id]')
    .filter((i, el) => {
      return !ignoredHeadingSlugs.includes($(el).attr('id'))
    })

  const headings = $sections
    .map((i, el) => $(el).text())
    .get()
    .join(' ')
    .trim()
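  // At this point `headings` is one space-joined string of every H2 that
  // has an `id` and isn't in `ignoredHeadingSlugs`, e.g. "Setup Usage".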
  const intro = $('[data-search=lead] p').text().trim()

  let body = ''
  // Typical example pages with no `$root` are:
  // https://docs.github.com/en/code-security/guides or
  // https://docs.github.com/en/graphql/overview/explorer
  //
  // We need to avoid these because using `getAllText()` on such pages
  // would extract *everything* from the page, including the sidebar
  // and footer.
  // TODO: Come up with a custom solution to extract some text from these
  // pages that yields decent content to search on, because when you view
  // these pages in a browser, there's clearly text there.
  if ($root.length > 0) {
    body = render($root)
  }
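  // `render` (from cheerio-to-text) is assumed here to serialize the
  // article body's nodes to plain text suitable for indexing.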
  if (!body && !intro) {
    console.warn(`${objectID} has no body and no intro.`)
  }

  // The lines below can be deleted (along with the `maxContentLength`
  // config) once we've stopped generating Lunr indexes on disk that
  // we store in Git LFS.
  if (!process.env.ELASTICSEARCH_URL) {
    if (languageCode !== 'en' && body.length > maxContentLength) {
      body = body.slice(0, maxContentLength)
    }
  }

  const content =
    intro && !body.includes(intro.trim()) ? `${intro.trim()}\n${body.trim()}`.trim() : body.trim()
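  // The intro is prepended only when the rendered body doesn't already
  // contain it, which keeps the intro text from being indexed twice.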
  return {
    objectID,
    breadcrumbs,
    title,
    headings,
    content,
    topics,
  }
}
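// A minimal usage sketch, kept as a comment so the module's behavior is
// unchanged. The `href`, HTML snippet, and `languageCode` below are
// hypothetical, chosen only to illustrate the expected shape of the
// `page` argument; `load` comes from the cheerio package this module's
// page objects are built with.
//
//   import { load } from 'cheerio'
//
//   const html =
//     '<h1>Example</h1>' +
//     '<div data-search="article-body"><h2 id="setup">Setup</h2><p>Steps</p></div>'
//   const record = parsePageSectionsIntoRecords({
//     href: '/en/example',
//     $: load(html),
//     languageCode: 'en',
//   })
//   // record.title === 'Example', record.headings === 'Setup'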