// chpt9-web-crawler.js
/* We're designing a web crawler and need to avoid infinite loops */
// Web crawlers go from page to page and index the links between those pages to map the web
// Indexing means there must be a way to store connections
// Connections can go in two directions, and some chains of links end up being circular
// An easy way to avoid infinite crawl loops is to use a hash table to record which pages we've already visited
// We can use a graph structure like in the social network example and perform a breadth-first search; a sketch of that loop follows
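// A minimal sketch of that breadth-first crawl, assuming a hypothetical
// getLinks(url) helper that returns the URLs linked from a given page
function breadthFirstCrawl(startUrl, getLinks) {
  var visited = {}; // hash table of URLs we've already crawled
  var queue = [startUrl];
  while (queue.length > 0) {
    var url = queue.shift(); // dequeue the oldest discovered URL
    if (visited[url]) {
      continue; // already crawled, so circular links can't loop forever
    }
    visited[url] = true;
    var links = getLinks(url);
    for (var i = 0; i < links.length; i++) {
      queue.push(links[i]);
    }
  }
  return visited;
}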
// The data needed to store URLs adds up quickly -- at 4 bytes per character and roughly 100 characters per URL
// We can give our hash map object a trie structure to cut down on repeated characters since, for example, most pages start with www. and end with .com
function createUrlMap(arrayOfUrls) {
  var urlTrie = {};
  // We look at every character of every URL, so the cost is O(total characters across all URLs)
  for (var i = 0; i < arrayOfUrls.length; i++) {
    var pageUrl = arrayOfUrls[i];
    var currentPos = urlTrie;
    for (var j = 0; j < pageUrl.length; j++) {
      var letter = pageUrl.charAt(j);
      // If there is no entry for this character, we branch out by creating a new node
      if (!currentPos[letter]) {
        currentPos[letter] = {};
      }
      // Move the trie position to the node we created or found
      currentPos = currentPos[letter];
    }
    // Mark complete URLs with a terminal key, so a URL that is a prefix of a longer one is still recorded
    currentPos["*"] = true;
  }
  return urlTrie;
}
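// A companion lookup for the trie above (not in the original, but it shows how
// the structure is meant to be read): walk the same character path and check
// for the "*" terminal key that createUrlMap sets on complete URLs
function hasUrl(urlTrie, pageUrl) {
  var currentPos = urlTrie;
  for (var j = 0; j < pageUrl.length; j++) {
    currentPos = currentPos[pageUrl.charAt(j)];
    if (!currentPos) {
      return false; // the path breaks off, so this URL was never inserted
    }
  }
  return currentPos["*"] === true; // only complete URLs carry the terminal marker
}
// Example usage:
// var urlTrie = createUrlMap(["www.joespage.com", "www.joespage.com/js-algorithms"]);
// hasUrl(urlTrie, "www.joespage.com"); // true
// hasUrl(urlTrie, "www.joespage");     // false -- only a prefix of a stored URL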
// What about handling duplicate subpages from the same domain? For example, if www.joespage.com/js-algorithms.com and www.joespage.com/js-questions.com both served the same content
// We could selectively crawl segments of each page and generate a signature that tells us whether that content has already been crawled
function crawlPage() {
  var domElements = document.getElementsByTagName("*");
  var content = [];
  // Hash the inner HTML of each element to build a signature of the page's content
  // We could be more selective, e.g. only elements where Element.tagName === "h1", or only segments of the page, by modifying the loop conditions
  for (var i = 0; i < domElements.length; i++) {
    if (domElements[i].innerHTML) {
      content.push(jsHash(domElements[i].innerHTML));
    }
  }
  return content;
}
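// A sketch of the duplicate check described above, under the assumption that we
// reduce the per-element hashes from crawlPage to a single page signature by
// joining and re-hashing them (one simple choice among many)
var seenSignatures = {};
function isDuplicatePage() {
  var signature = jsHash(crawlPage().join(","));
  if (seenSignatures[signature]) {
    return true; // some earlier URL already produced this content signature
  }
  seenSignatures[signature] = true;
  return false;
}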
// A general-purpose string hashing algorithm (Justin Sobel's JS hash)
function jsHash(string) {
  var hash = 1315423911;
  for (var i = 0; i < string.length; i++) {
    // charCodeAt, not charAt: adding the character itself would coerce the sum
    // to a string, the XOR would then treat it as 0, and the hash would never change
    hash ^= ((hash << 5) + string.charCodeAt(i) + (hash >> 2));
  }
  return hash;
}
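// A quick sanity check of the signature idea: identical content must hash
// identically for the duplicate detection above to work
// var a = jsHash("<p>same content</p>");
// var b = jsHash("<p>same content</p>");
// a === b; // true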