-
Notifications
You must be signed in to change notification settings - Fork 0
/
content_parse.py
64 lines (49 loc) · 1.97 KB
/
content_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# -*- coding: UTF-8 -*-
import io
import re

from bs4 import BeautifulSoup
def parse_content(name):
    """Parse a saved flight-search page and append its flights to flight.txt.

    The page is read as UTF-8 and parsed with the ``lxml`` parser. Every
    direct child element of the ``<div class="e_fly_lst">`` container is
    passed to ``parse_item`` and the resulting line is appended, UTF-8
    encoded, to ``flight.txt`` in the current working directory.

    Args:
        name: Path of the HTML file to parse.

    Returns:
        None. If the page has no ``e_fly_lst`` container, nothing is written.
    """
    # io.open decodes/encodes at the file boundary and behaves the same on
    # Python 2 and 3; the context manager also guarantees the handle closes.
    with io.open(name, encoding='utf-8') as source:
        soup = BeautifulSoup(source.read(), 'lxml')

    flight_detail = soup.find('div', 'e_fly_lst')
    if flight_detail is None:
        # No flight list on this page, so there is nothing to record.
        return

    # Only direct child *tags*: ``.contents`` would also yield whitespace
    # strings and comments, which parse_item cannot search.
    flights = flight_detail.find_all(True, recursive=False)

    with io.open('flight.txt', 'a', encoding='utf-8') as out:
        for flight in flights:
            out.write(parse_item(flight) + u'\n')
# The code divides an overlay's ``left:-Npx`` offset by this value to get the
# digit position counted from the right (presumably the rendered width of one
# digit, in pixels).
_PRICE_DIGIT_WIDTH_PX = 16
_LEFT_OFFSET_RE = re.compile(r'left:-(\d+)px')

# (css class of the <div>, centred-column format) in output order:
# flight name, departure airport, departure time, duration,
# arrival airport, arrival time.
_FIELD_COLUMNS = (
    ('a-name', u'{:^8}'),
    ('a-dep-airport', u'{:^10}'),
    ('a-dep-time', u'{:^8}'),
    ('a-tm-be', u'{:^10}'),
    ('a-arr-airport', u'{:^10}'),
    ('a-arr-time', u'{:^10}'),
)


def _decode_price(price_tag):
    """Rebuild the real price from the ``<b>`` elements inside price_tag.

    The first ``<b>`` supplies the base characters. Every following ``<b>``
    carries a ``left:-Npx`` offset in its ``style`` attribute and replaces
    the character ``N // 16`` positions from the right end.

    Raises:
        ValueError: If an overlay has no ``left:-Npx`` offset, or the offset
            points outside the base characters.
    """
    bold_tags = price_tag.find_all('b')
    digits = list(bold_tags[0].text)

    for overlay in bold_tags[1:]:
        style = overlay.get('style') or ''
        match = _LEFT_OFFSET_RE.search(style)
        if match is None:
            raise ValueError(
                'price overlay has no left offset: {!r}'.format(style))
        # Floor division keeps the index an int on Python 3 as well.
        position = len(digits) - int(match.group(1)) // _PRICE_DIGIT_WIDTH_PX
        if not 0 <= position < len(digits):
            # A negative index would silently overwrite the wrong digit.
            raise ValueError(
                'price overlay offset out of range: {!r}'.format(style))
        digits[position] = overlay.text

    return u''.join(digits)


def parse_item(item):
    """Format one flight entry as a single line of centred columns.

    Args:
        item: bs4 tag for one flight entry.

    Returns:
        A unicode string made of, in order: flight name, departure airport,
        departure time, duration, arrival airport, arrival time, the text of
        every ``<p class="a-pty-mint">`` (delay probability and delay time,
        according to the original comments) and the decoded price.

    Raises:
        AttributeError: If one of the expected ``<div>`` elements is missing.
        ValueError: If the price markup cannot be decoded.
    """
    columns = [
        fmt.format(item.find('div', class_=css_class).text)
        for css_class, fmt in _FIELD_COLUMNS
    ]

    # Delay probability and delay time.
    columns.extend(
        u'{:^6}'.format(delay.text)
        for delay in item.find_all('p', class_='a-pty-mint')
    )

    # The real price: the page shows a base number with individual digits
    # overlaid, so it has to be reassembled.
    price_tag = item.find('div', class_='a-low-prc')
    columns.append(u'{:^6}'.format(_decode_price(price_tag)))

    return u''.join(columns)
# Test run against the saved Oct 7 Beijing-to-Hangzhou result pages.
if __name__ == '__main__':
    for saved_page in (u'result_1.html', u'result_2.html'):
        parse_content(saved_page)