-
Notifications
You must be signed in to change notification settings - Fork 0
/
cssci_info_crawler.py
99 lines (83 loc) · 2.96 KB
/
cssci_info_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import time
import json
import requests
from lxml import etree
# Accumulates one record (a dict of scraped fields) per journal, keyed by the
# same keys that appear at the top level of the input file.
journalInfoDict = {}

# Load the journals to crawl. Only the top-level keys of this JSON object are
# used by the rest of the script.
# The encoding is stated explicitly so the read does not depend on the
# platform's default locale encoding; it matches the UTF-8 encoding this
# script uses when it writes its own output file.
with open("cssci/cssci.json", encoding="utf-8") as f:
    data = json.load(f)

# Snapshot of the keys in file order, and how many there are; the count is
# compared against the number of collected records before saving.
keys = list(data.keys())
length = len(keys)
# Request headers sent with every page fetch. Built once, outside the loop,
# because they are identical for every journal.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}

# Seconds to wait for the server to connect / send data before giving up.
# Without a timeout a stalled connection would block the crawl forever.
_REQUEST_TIMEOUT = 30


def _stripped_texts(html, path):
    """Return every text node matched by *path*, stripped of whitespace.

    Args:
        html: Parsed document (or element) exposing an ``xpath`` method.
        path: XPath expression that selects text nodes.

    Returns:
        A list of stripped strings, in document order.
    """
    return [text.strip() for text in html.xpath(path)]


def _merge_fields(record, labels, values, section, skip_on_mismatch=False):
    """Copy positionally paired ``label -> value`` entries into *record*.

    Labels and values come from two independent XPath queries, so they are
    matched purely by position. When the two lists differ in length a warning
    is printed, because the pairing may be misaligned.

    Args:
        record: Dict that receives the pairs (mutated in place).
        labels: Field names, in document order.
        values: Field values, in document order.
        section: Human-readable description used in the warning message.
        skip_on_mismatch: When true, store nothing if the lengths differ.
            When false, pair up to the shorter list instead of raising
            ``IndexError`` on a missing value.
    """
    if len(labels) != len(values):
        print(
            "warning: {} has {} labels but {} values".format(
                section, len(labels), len(values)
            )
        )
        if skip_on_mismatch:
            return
    # zip() stops at the shorter list; later duplicates overwrite earlier ones,
    # exactly as sequential assignment would.
    record.update(zip(labels, values))


for key in keys:
    record = {}
    journalInfoDict[key] = record

    # url
    url = "https://navi.cnki.net/knavi/journals/{}/detail".format(key)
    record["url"] = url

    # Network errors (including the timeout) propagate and abort the crawl,
    # so an incomplete result is never written.
    response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    html = etree.HTML(response.text)

    # title
    title = html.xpath("string(/html/head/title)").strip()
    record["title"] = title

    # publish types
    publishType = _stripped_texts(
        html,
        '//*[@id="qk"]/div[2]/dl/dd/p[@class="journalType journalType1"]/span/descendant-or-self::*/text()',
    )
    record["type"] = publishType

    # included in databases
    database = _stripped_texts(
        html,
        '//*[@id="qk"]/div[2]/dl/dd/p[@class="journalType journalType2"]/span/descendant-or-self::*/text()',
    )
    record["database"] = database

    # journal base info
    baseInfoKey = html.xpath('//*[@id="JournalBaseInfo"]/li/p/label/text()')
    baseInfoVal = html.xpath('//*[@id="JournalBaseInfo"]/li/p/span/text()')
    _merge_fields(record, baseInfoKey, baseInfoVal, "{} base info".format(key))

    # publish info
    publishInfoKey = html.xpath(
        '//*[@id="publishInfo"]/li/descendant-or-self::*/p/label/text()'
    )
    publishinfoVal = html.xpath(
        '//*[@id="publishInfo"]/li/descendant-or-self::*/p/span/text()'
    )
    _merge_fields(
        record, publishInfoKey, publishinfoVal, "{} publish info".format(key)
    )

    # evaluate info: stored only when labels and values line up one-to-one
    evaluateInfoKey = html.xpath(
        '//*[@id="evaluateInfo"]/li[2]/p[position()<3]/label/text()'
    )
    evaluateInfoVal = html.xpath(
        '//*[@id="evaluateInfo"]/li[2]/p[position()<3]/span/text()'
    )
    _merge_fields(
        record,
        evaluateInfoKey,
        evaluateInfoVal,
        "{} evaluate info".format(key),
        skip_on_mismatch=True,
    )

    # print info
    print("finished {}: {}".format(key, title))

    # sleep 3 seconds between requests
    time.sleep(3)
# Serialise every collected record, then parse the text back to count how
# many top-level entries survive the round trip.
json_object = json.dumps(journalInfoDict, indent=4)
journalCount = len(json.loads(json_object))
print("total number: ", journalCount, sep="")

# Save only when the record count equals the number of input keys;
# otherwise report the discrepancy and write nothing.
if journalCount != length:
    print("something wrong!")
else:
    with open("cssci/cssci_info.json", "w", encoding="utf-8") as f:
        f.write(json_object)