-
Notifications
You must be signed in to change notification settings - Fork 0
/
cssci-expanded_crawler.py
114 lines (100 loc) · 3.39 KB
/
cssci-expanded_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import time
import math
import json
import requests
from lxml import etree
# Search endpoint on navi.cnki.net; every page request in this script is
# POSTed to this address.
url = "https://navi.cnki.net/knavi/journals/searchbaseinfo"

# Browser identification string sent with each request (a desktop Chrome 108
# signature), split across lines only for readability.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/108.0.0.0 Safari/537.36"
)

# HTTP headers attached to every POST: the request is marked as an
# XMLHttpRequest and the body is declared as URL-encoded form data.
headers = {
    "X-Requested-With": "XMLHttpRequest",
    "Content-Type": "application/x-www-form-urlencoded",
    "User-Agent": _USER_AGENT,
}
# The search state is a nested query description that is serialised to JSON
# and sent as one form field. It is assembled inside-out from named pieces so
# each nesting level can be read on its own; key order matches the original
# literal so the serialised JSON text is unchanged.

# Innermost item: a single "datasource" filter.
# NOTE(review): "Name" is "EI" although the request's clickName refers to the
# CSSCI expanded list - confirm that Value "0010?" selects the intended source.
_datasource_filter = {
    "Key": "datasource",
    "Title": "",
    "Logic": 1,
    "Name": "EI",
    "Operate": "",
    "Value": "0010?",
    "ExtendType": 0,
    "ExtendValue": "",
    "Value2": "",
}

# Group keyed "journals" that carries the datasource filter.
_journals_group = {
    "Key": "journals",
    "Logic": 1,
    "Items": [_datasource_filter],
    "ChildItems": [],
}

# Outer group keyed "Navi"; it has no items of its own, only the child group.
_navi_group = {
    "Key": "Navi",
    "Logic": 1,
    "Items": [],
    "ChildItems": [_journals_group],
}

# Full search state. The key spellings "Platfrom" and "Additon" are kept
# exactly as they were - presumably the server expects them; verify before
# "correcting" either one.
searchStateJson = {
    "StateID": "",
    "Platfrom": "",
    "QueryTime": "",
    "Account": "knavi",
    "ClientToken": "",
    "Language": "",
    "CNode": {"PCode": "JOURNAL", "SMode": "", "OperateT": ""},
    "QNode": {
        "SelectT": "",
        "Select_Fields": "",
        "S_DBCodes": "",
        "QGroup": [_navi_group],
        "OrderBy": "OTA|DESC",
        "GroupBy": "",
        "Additon": "",
    },
}
# Form fields sent in the body of every POST. The search state travels as a
# JSON string in a single field. "pageindex" starts at 1 for the first
# request; the crawl code further down uses a different value for each later
# page and reads "pagecount" to work out how many pages to fetch.
payload = dict(
    searchStateJson=json.dumps(searchStateJson),
    displaymode=1,
    pageindex=1,
    pagecount=21,
    index="datasource",
    searchType="刊名(曾用刊名)",
    clickName="CSSCI 中文社会科学引文索引(2021-2022)来源期刊(扩展版)",
    switchdata="leftnavi",
)
# Seconds to wait for the server before a request is abandoned.
REQUEST_TIMEOUT = 30

# Pause, in seconds, between two consecutive page requests.
REQUEST_DELAY = 3


def page_count(total, per_page):
    """Return how many result pages are needed to list *total* entries.

    Args:
        total: Number of entries reported by the site (int or float).
        per_page: Number of entries shown on one page.

    Returns:
        The page count as an int; 0 when *total* is 0.
    """
    # Ceiling division: a partially filled last page still counts, but an
    # exact multiple of the page size must not add an extra, empty page.
    return math.ceil(total / per_page)


def fetch_page(page_index):
    """POST the search form for one result page and return the parsed HTML.

    Args:
        page_index: 1-based number of the result page to request.

    Returns:
        The lxml element tree built from the response body.

    Raises:
        requests.RequestException: on timeout, connection failure or an
            HTTP error status.
    """
    # Build a per-request copy so the shared form template is not mutated.
    form = {**payload, "pageindex": page_index}
    response = requests.post(
        url, data=form, headers=headers, timeout=REQUEST_TIMEOUT
    )
    # Fail loudly on an error page instead of parsing it as an empty result.
    response.raise_for_status()
    return etree.HTML(response.text)


def parse_journals(page):
    """Extract the journals listed on one parsed result page.

    Args:
        page: Parsed HTML of a result page.

    Returns:
        A dict mapping each entry's code (the last four characters of the
        link's href) to its name (the link's title, stripped of surrounding
        whitespace).
    """
    return {
        entry.xpath("string(a/@href)")[-4:]: entry.xpath("string(a/@title)").strip()
        for entry in page.xpath('//ul[@class="list_tup"]/li')
    }


def main():
    """Crawl every result page and write the code-to-name mapping to disk.

    The JSON file is written only when the number of collected journals
    equals the total reported on the first page; otherwise a warning is
    printed and nothing is written.

    Raises:
        ValueError: if the total count cannot be read from the first page.
        requests.RequestException: if any page request fails.
    """
    journalDict = {}
    print("started!")

    # First page: also carries the total number of entries.
    first_page = fetch_page(1)
    totalcount = first_page.xpath('number(//span[@class="totalcount"]/em)')
    # XPath number() yields NaN when the element is missing or not numeric.
    if math.isnan(totalcount):
        raise ValueError("could not read the total journal count from page 1")
    journalDict.update(parse_journals(first_page))
    print("finished page 1")

    # Subsequent pages.
    pageTotal = page_count(totalcount, payload["pagecount"])
    for pageNo in range(2, pageTotal + 1):
        # Wait between requests to avoid hammering the server.
        time.sleep(REQUEST_DELAY)
        journalDict.update(parse_journals(fetch_page(pageNo)))
        print("finished page " + str(pageNo))
    print("finished!")

    journalCount = len(journalDict)
    print("total number: " + str(journalCount))
    # Only persist the result when every journal the site reported was seen.
    if journalCount == totalcount:
        with open("cssci-expanded/cssci-expanded.json", "w", encoding="utf-8") as f:
            f.write(json.dumps(journalDict, indent=4))
    else:
        print("something wrong!")


if __name__ == "__main__":
    main()