-
Notifications
You must be signed in to change notification settings - Fork 4
/
crawling.py
103 lines (73 loc) · 2.87 KB
/
crawling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient # pymongo를 임포트 하기(패키지 인스톨 먼저 해야겠죠?)
# Connect to the local MongoDB server (MongoDB runs on port 27017) and select
# the "recipe" database used by the insert step at the end of the script.
client = MongoClient('localhost', 27017)
db = client.recipe

# Browser-like User-Agent header sent with every request made by this script.
headers = {"User-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36"}

# Fetch list pages 1-4, parse each with BeautifulSoup, and collect the href of
# every recipe thumbnail found inside the 'common_sp_list_ul ea4' list.
# `page` ends up holding the site-relative links of the recipe detail pages.
page = []
for page_no in range(1, 5):
    list_url = 'https://www.10000recipe.com/recipe/list.html?order=reco&page=' + str(page_no)
    try:
        # A timeout keeps an unresponsive server from hanging the script.
        data = requests.get(list_url, headers=headers, timeout=10)
    except requests.RequestException as err:
        # Best effort: report the failure and move on to the next list page.
        print('list page {} skipped: {}'.format(page_no, err))
        continue

    soup = BeautifulSoup(data.text, 'html.parser')
    listing = soup.find('ul', {'class': 'common_sp_list_ul ea4'})
    if listing is None:
        # find() returns None when the expected list is not in the response.
        print('list page {} skipped: recipe list not found'.format(page_no))
        continue

    for thumb in listing.find_all('div', {'class': 'common_sp_thumb'}):
        link = thumb.find('a')
        # Skip only the malformed thumbnail instead of dropping the rest of
        # the page's links.
        if link is not None and link.get('href'):
            page.append(link['href'])
# Parallel result lists: index n of every list describes the same recipe.
main_img = []
main_step = []
main_title = []
main_recipe = []
main_recipe2 = []


def _join_ingredients(anchors):
    """Return the '&'-joined first text line of the <li> inside each anchor.

    Raises AttributeError when an anchor has no <li> child, because find()
    returns None in that case.
    """
    return "&".join(
        anchor.find('li').text.split("\n")[0].strip() for anchor in anchors
    )


# Visit every collected detail page and extract title, main image URL, the two
# ingredient groups, and the numbered cooking steps.
for p in page:
    try:
        # A timeout keeps an unresponsive server from hanging the script.
        data2 = requests.get('https://www.10000recipe.com' + p, headers=headers, timeout=10)
    except requests.RequestException as err:
        print('recipe {} skipped: {}'.format(p, err))
        continue

    soup = BeautifulSoup(data2.text, 'html.parser')
    try:
        title = soup.select_one('#contents_area > div.view2_summary > h3')
        img = soup.select_one('#contents_area > div.view2_pic > div.centeredcrop > img')
        recipe = soup.select('#divConfirmedMaterialArea > ul:nth-of-type(1) > a')
        recipe2 = soup.select('#divConfirmedMaterialArea > ul:nth-of-type(2) > a')
        step = soup.find("div", {"class": "view_step"})
        step_bodies = step.find_all("div", {"class": "media-body"})

        # Steps are numbered from 1 and separated by '&': "1<text>&2<text>...".
        steps_text = "&".join(
            str(number) + body.text
            for number, body in enumerate(step_bodies, start=1)
        )
        ingredients1 = _join_ingredients(recipe)
        ingredients2 = _join_ingredients(recipe2)
        img_src = img['src']
        title_text = title.text
    except (AttributeError, KeyError, TypeError) as err:
        # A missing element yields None (AttributeError/TypeError on use) or a
        # missing attribute (KeyError); skip the whole recipe in that case.
        print('recipe {} skipped: unexpected page layout ({!r})'.format(p, err))
        continue

    # Append only after every field was extracted, so a failure half-way
    # through can never leave the five lists with different lengths.
    main_recipe2.append(ingredients2)
    main_img.append(img_src)
    main_step.append(steps_text)
    main_recipe.append(ingredients1)
    main_title.append(title_text)
# Combined ingredient string per recipe: "<group 1>&<group 2>".
bus = [
    '{}&{}'.format(first_group, second_group)
    for first_group, second_group in zip(main_recipe, main_recipe2)
]
print(bus)

# Store one document per successfully parsed recipe. Iterating the result
# lists themselves (rather than range(len(page))) avoids an IndexError when
# some pages were skipped and the lists are shorter than `page`.
for step_text, title_text, group1, group2, combined, img_src in zip(
        main_step, main_title, main_recipe, main_recipe2, bus, main_img):
    doc = {
        'step': step_text,
        'title': title_text,
        'recipe1': group1,
        'recipe2': group2,
        'recipe_main': combined,
        'img': img_src,
    }
    db.recipe.insert_one(doc)