-
Notifications
You must be signed in to change notification settings - Fork 0
/
my_parser.py
100 lines (71 loc) · 3.17 KB
/
my_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# -*- coding: utf-8 -*-
import re
import requests
import uuid
import os
from lxml import html, etree
def __save_photo(url):
    """Download the image at *url* into ./img/ under a random file name.

    Returns the absolute path of the saved .jpg file as a string, or
    None when the download fails.
    """
    # exist_ok avoids the check-then-create race of exists()/mkdir().
    os.makedirs('./img', exist_ok=True)
    file_name = "img/" + str(uuid.uuid4()) + '.jpg'
    # One GET instead of HEAD-then-GET: halves the requests and removes
    # the window where the resource vanishes between the two calls; the
    # timeout prevents hanging forever on a dead server.
    response = requests.get(url, timeout=30)
    if not response.ok:
        return None
    with open(file_name, "wb") as file:
        file.write(response.content)
    return str(os.path.abspath(file_name))
def __get_part_type(page):
    """Return the text of every part-type link on the listing page."""
    document = html.fromstring(page)
    labels = document.xpath("//*[@class='part']/div/a/text()")
    return labels
def __get_mark(page):
    """Return the car make for every listing on the page.

    The make is taken as the leading capitalised word of each auto
    description, optionally followed by a lowercase word (e.g. a
    two-word make such as "Alfa romeo").
    """
    tree = html.fromstring(page)
    autos = tree.xpath("//*[@class='auto']/strong/text()")
    marks = []
    for item in autos:
        match = re.match(r'^[A-Z][a-z]+(\s*[a-z]*)', item)
        # Fall back to the raw stripped text instead of crashing with
        # AttributeError when a description does not fit the pattern.
        marks.append(match.group(0).strip() if match else item.strip())
    return marks
def __get_price(page):
    """Return the price of each listing as a digits-only string.

    Spaces (used as thousands separators) are stripped; entries that
    are not purely numeric after stripping are discarded.
    """
    document = html.fromstring(page)
    raw_prices = document.xpath("//*[@class='price']/strong/text()")
    digits_only = re.compile(r'^[0-9]+$')
    result = []
    for text in raw_prices:
        candidate = text.replace(' ', '')
        if digits_only.match(candidate):
            result.append(candidate)
    return result
def __get_photo(page):
    """Download every listing photo found on the page.

    Serialises each photo anchor element, extracts the https…jpg URL
    from the markup, and saves each image locally via __save_photo.

    Returns a list of saved-file paths (entries may be None when a
    download fails), one per extracted URL.
    """
    tree = html.fromstring(page)
    anchors = tree.xpath("//*[@class='photo']/a")
    markup = [etree.tostring(anchor, encoding='unicode') for anchor in anchors]
    matches = [re.search(r'https:.*jpg', chunk) for chunk in markup]
    # PEP 8: compare against None with identity (`is not None`), not `!=`.
    urls = [match.group(0) for match in matches if match is not None]
    return [__save_photo(url) for url in urls]
def __get_company(page):
    """Return the seller/company name for each listing on the page."""
    document = html.fromstring(page)
    company_names = document.xpath("//*[@class='company']/a[1]/text()")
    return company_names
def __get_model(page):
    """Return the car model for every listing on the page.

    The model is the capitalised (possibly hyphenated) word following
    the make in each auto description, or a trailing capitalised word.
    """
    tree = html.fromstring(page)
    autos = tree.xpath("//*[@class='auto']/strong/text()")
    models = []
    for item in autos:
        match = re.search(r'(.[A-Z][a-z\-]*)|([A-Z][a-z]*$)', item)
        # Fall back to the raw stripped text instead of crashing with
        # AttributeError when no model-like token is found.
        models.append(match.group(0).strip() if match else item.strip())
    return models
def __group_results(items):
    """Map regex match objects to their stripped group(1).

    A None entry (no match) is carried through as None, keeping the
    output aligned with the input.
    """
    return [match.group(1).strip() if match is not None else None
            for match in items]
def __get_frame_engine_year(page):
    """Extract frame, engine and production year for every listing.

    Each '.auto' element is serialised to markup and the field values
    are pulled out with per-field regexes.

    Returns a dict with keys 'frames', 'engines' and 'years', each a
    list aligned with the listings (None where a field is absent).
    """
    tree = html.fromstring(page)
    columns = [etree.tostring(node, encoding='unicode')
               for node in tree.xpath("//*[@class='auto']")]

    def extract(pattern):
        # One pass over the serialised columns per field.
        return __group_results([re.search(pattern, column) for column in columns])

    return {'frames': extract(r'.кузов:\s(.*?)<'),
            'engines': extract(r'.двигатель:\s(.*?)<'),
            'years': extract(r'.год\sвыпуска:\s(.*?)<')}
def get_page_data(pages):
    """Collect one record per listing across all *pages*.

    Each record is a tuple of (part type, make, model, frame, engine,
    year, price, company, photo path), zipped positionally from the
    per-field extractors.

    BUG FIX: the original returned from inside the loop, so only the
    first page was ever processed; now every page contributes records
    (and an empty *pages* yields an empty tuple instead of None).

    Note: zip truncates to the shortest field list, so records can be
    silently dropped when a field is missing from some listings.
    """
    records = []
    for page in pages:
        frame_engine_year = __get_frame_engine_year(page)
        records.extend(zip(__get_part_type(page),
                           __get_mark(page),
                           __get_model(page),
                           frame_engine_year['frames'],
                           frame_engine_year['engines'],
                           frame_engine_year['years'],
                           __get_price(page),
                           __get_company(page),
                           __get_photo(page)))
    return tuple(records)
def have_results(url):
    """Return True when the search page at *url* has results.

    BUG FIX: re.match only matches at the very start of the string, so
    the "no results" marker div — which appears mid-document — was
    never found and the function always returned True.  re.search
    scans the whole page.
    """
    page = requests.get(url, timeout=30).text
    return not re.search(r'<div id=\"fs_not_rezult\"', page)
def get_list_of_models(page):
    """Return the model names listed in the catalog selector block."""
    document = html.fromstring(page)
    model_names = document.xpath("//*[@id='catalog_block_select']/div/span/a/text()")
    return model_names