forked from kamyu104/LeetCode-Solutions
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathweb-crawler.py
39 lines (35 loc) · 1013 Bytes
/
web-crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# Time: O(|V| + |E|)
# Space: O(|V|)
# """
# This is HtmlParser's API interface.
# You should not implement it, or speculate about its implementation
# """
class HtmlParser(object):
    """Placeholder for the HtmlParser interface that ``Solution.crawl`` calls.

    Only the method signature is declared here; the method body is empty.
    """

    def getUrls(self, url):
        """
        Placeholder for the page-link lookup; this stub returns None.

        :type url: str
        :rtype: List[str]
        """
        pass
class Solution(object):
    def crawl(self, startUrl, htmlParser):
        """
        Collect every URL reachable from startUrl that shares its host.

        Pages are visited breadth-first starting at startUrl. A link is
        recorded (and later followed) only when its scheme-plus-host prefix
        equals that of startUrl and it has not been recorded before.

        :type startUrl: str
        :type htmlParser: HtmlParser
        :rtype: List[str]
        """
        SCHEME_SEP = "://"

        def hostname(url):
            # Return the prefix of url up to (not including) the first path
            # slash, i.e. "scheme://host"; the whole url if it has no path.
            sep = url.find(SCHEME_SEP)
            # Treat "://" as the scheme separator only when no "/" precedes
            # it, so a "://" embedded later in a path or query is ignored.
            if sep != -1 and url.find('/') > sep:
                start = sep + len(SCHEME_SEP)
            else:
                start = 0
            pos = url.find('/', start)
            if pos == -1:
                return url
            return url[:pos]

        # Every recorded URL shares the start URL's host, so compute it once.
        host = hostname(startUrl)
        result = [startUrl]
        lookup = set(result)
        # result doubles as the BFS queue: cursor walks it while newly
        # discovered URLs are appended behind the cursor.
        cursor = 0
        while cursor < len(result):
            for to_url in htmlParser.getUrls(result[cursor]):
                if to_url not in lookup and hostname(to_url) == host:
                    result.append(to_url)
                    lookup.add(to_url)
            cursor += 1
        return result