-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.py
96 lines (66 loc) · 2.05 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from os import mkdir, remove
import requests
import re
# Seed file: one website URL per line. NOTE(review): opened without a
# context manager; it is read and closed later at module bottom.
sites_file = open('websites.txt', 'r')
error_links = []     # sites whose sitemap request raised (network/HTTP errors)
titles = []          # human-readable titles derived from page URL slugs
links_array = []     # crawl queue: seed sites plus nested sitemap URLs
crawled_links = []   # already-visited links, to avoid re-crawling duplicates
def isXMLLInk(site):
    """Return True when *site* does NOT contain '.xml'.

    NOTE(review): the name is inverted relative to what it checks — it
    is True for non-XML links and False for sitemap/XML URLs. Callers
    throughout this script rely on that semantic, so the name is kept
    for backward compatibility.
    """
    return '.xml' not in site
def crawl(site):
    """Fetch *site*'s sitemap and harvest its ``<loc>`` entries.

    If *site* is not already an XML URL, ``/sitemap.xml`` is appended.
    Each ``<loc>`` value found is routed by type:
    - nested sitemap URLs are appended to the global ``links_array``
      so the driver loop will crawl them too;
    - page URLs get a title derived from the last URL path segment
      (slug with '-' replaced by spaces) appended to global ``titles``.

    Returns:
        0 on success, 1 when the HTTP request failed (the failing URL
        is recorded in ``error_links``).
    """
    if isXMLLInk(site):
        # Seed-file lines carry a trailing newline; normalize and point
        # at the conventional sitemap location.
        site = site.replace("\n", "").strip("/") + '/sitemap.xml'
    try:
        # Timeout added so one unresponsive host cannot hang the whole crawl.
        response = requests.get(url=site, timeout=30).text
    except Exception:
        error_links.append(site)
        return 1
    pattern = re.compile(r'<loc>(.*?)</loc>', re.DOTALL)
    links = pattern.findall(response)
    for link in links:
        if not isXMLLInk(link):
            # Nested sitemap: queue it for a later pass of the driver loop.
            links_array.append(link)
        else:
            # Page URL: '/my-post-title/' -> 'my post title'.
            # NOTE(review): assumes a trailing slash; URLs without one
            # would yield the wrong segment — TODO confirm against data.
            title = (link.split('/')[-2]).replace('-', ' ')
            titles.append(title)
    return 0
# --- Driver: seed the queue, crawl it, persist the harvested titles. ---

# Load seed sites into the crawl queue, then release the file handle.
for link in sites_file:
    links_array.append(link)
sites_file.close()

# crawl() may append nested sitemap URLs to links_array while we iterate;
# Python's list iteration picks up those new entries, giving a simple
# breadth-style crawl. crawled_links guards against duplicates/cycles.
for link in links_array:
    if link in crawled_links:
        continue
    crawled_links.append(link)
    crawl(link)

# Persist titles; append mode preserves results from earlier runs, and the
# context manager guarantees the file is closed even if a write fails.
with open('titles.txt', 'a') as title_file:
    for item in titles:
        title_file.write(f'{item}\n')
#TODO append - make dic and write - loop through xml - take file by argument - check in title before adding