analyzer.go
package jsluice
import (
"bytes"
"unicode"
"github.com/PuerkitoBio/goquery"
sitter "github.com/smacker/go-tree-sitter"
"github.com/smacker/go-tree-sitter/javascript"
)
// Analyzer could be considered the core type of jsluice. It wraps
// the parse tree for a JavaScript file and provides mechanisms to
// extract URLs, secrets, etc.
type Analyzer struct {
urlMatchers []URLMatcher
rootNode *Node
userSecretMatchers []SecretMatcher
}
// NewAnalyzer accepts a slice of bytes representing some JavaScript
// source code and returns a pointer to a new Analyzer.
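//
// Example usage (a minimal sketch; reading "app.js" via os.ReadFile and the
// error handling around it are illustrative, not part of this package):
//
//	source, err := os.ReadFile("app.js")
//	if err != nil {
//		log.Fatal(err)
//	}
//	analyzer := jsluice.NewAnalyzer(source)
//	// analyzer is now ready for Query, QueryMulti, GetSecrets, etc.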
func NewAnalyzer(source []byte) *Analyzer {
parser := sitter.NewParser()
parser.SetLanguage(javascript.GetLanguage())
if isProbablyHTML(source) {
source = extractInlineJS(source)
}
tree := parser.Parse(nil, source)
// TODO: Align how URLMatcher and SecretMatcher slices
// are loaded. At the moment we load URLMatchers now,
// and SecretMatchers only when GetSecrets is called.
// This is mostly because URL matching was written first,
// and then secret matching was added later.
return &Analyzer{
urlMatchers: AllURLMatchers(),
rootNode: NewNode(tree.RootNode(), source),
}
}
// Query performs a tree-sitter query on the JavaScript being analyzed.
// The provided function is called once for every node captured by the query.
// See https://tree-sitter.github.io/tree-sitter/using-parsers#query-syntax
// for details on query syntax.
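//
// Example (a sketch; the query below captures every string literal, and it
// assumes Node exposes the matched source text via a Content method):
//
//	analyzer.Query("(string) @str", func(n *jsluice.Node) {
//		fmt.Println(n.Content())
//	})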
func (a *Analyzer) Query(q string, fn func(*Node)) {
a.rootNode.Query(q, fn)
}
// QueryMulti performs a tree-sitter query on the JavaScript being analyzed.
// The provided function is called once for every query match, with captured
// nodes grouped into a QueryResult.
// See https://tree-sitter.github.io/tree-sitter/using-parsers#query-syntax
// for details on query syntax.
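//
// Example (a sketch; the callback body is left empty because QueryResult's
// accessors are not defined in this file):
//
//	query := "(assignment_expression left: (_) @lhs right: (_) @rhs)"
//	analyzer.QueryMulti(query, func(r jsluice.QueryResult) {
//		// r groups the @lhs and @rhs captures for a single match
//	})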
func (a *Analyzer) QueryMulti(q string, fn func(QueryResult)) {
a.rootNode.QueryMulti(q, fn)
}
// RootNode returns the root node of the parsed JavaScript.
func (a *Analyzer) RootNode() *Node {
return a.rootNode
}
// isProbablyHTML returns true for source that is probably HTML.
// False positives are OK as long as the false positives are not
// JavaScript source.
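// For example, "  <html>" is treated as HTML because its first
// non-whitespace byte is '<', while "var x = 1" is not.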
func isProbablyHTML(source []byte) bool {
for _, b := range source {
if unicode.IsSpace(rune(b)) {
continue
}
if b == '<' {
return true
}
break
}
return false
}
// extractInlineJS extracts inline JavaScript from HTML pages using goquery.
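// For example, "<p><script>var a = 1</script><script>var b = 2</script></p>"
// yields "var a = 1\nvar b = 2\n"; if no script elements are found, or the
// source cannot be parsed as HTML, the original source is returned unchanged.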
func extractInlineJS(source []byte) []byte {
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(source))
if err != nil {
// Not a valid HTML document, so just return the source.
return source
}
var inline []byte
doc.Find("script").Each(func(i int, s *goquery.Selection) {
if s.Is("script") {
inline = append(inline, []byte(s.Text()+"\n")...)
}
})
if len(inline) == 0 {
return source
}
return inline
}