-
Notifications
You must be signed in to change notification settings - Fork 1
/
karen-extra-records-parse.py
111 lines (83 loc) · 3.54 KB
/
karen-extra-records-parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os
import yaml
import time
import json
import pymongo
import logging
import argparse
import requests
import datetime
from sys import exit
from pprint import pprint
from urllib.parse import quote
from urllib.parse import urlparse
from dateutil.parser import parse
class KarenExtraRecordParse():
    """Copy rows of the ``KarenExtraRecords`` collection into ``extra_documents``.

    For every source row the pipeline:

    1. splits the ``Dim link`` field into either a Dimensions publication id
       (when the field is an ``https`` URL) or a DOI (anything else);
    2. looks up the organisation in the ``organisations`` collection, first by
       ``id_original`` == row ``Grid`` and then by ``name`` == row
       ``Organisation``; rows with no match are skipped;
    3. sends a DSL query for the publication to the Dimensions API;
    4. inserts ``{link, grid, dimension_id, doi}`` into ``extra_documents``.
    """

    # Dimensions API endpoints used by the pipeline.
    _AUTH_URL = 'https://app.dimensions.ai/api/auth.json'
    _DSL_URL = 'https://app.dimensions.ai/api/dsl.json'
    # Seconds to wait for an HTTP response; without a timeout ``requests``
    # blocks indefinitely on a stalled connection.
    _REQUEST_TIMEOUT = 60
    # Pause (seconds) before each DSL query.
    _QUERY_DELAY = 1

    def __init__(self, config):
        """Load the YAML configuration, connect to MongoDB and authenticate.

        Args:
            config: Path to a YAML file. It must provide ``env`` and, under
                ``auth[env]``, the keys ``mongo-user``, ``mongo-pass``,
                ``mongo-host``, ``mongo-db``, ``dimension-username`` and
                ``dimension-password``.

        Raises:
            requests.HTTPError: If the Dimensions authentication request is
                answered with an error status.
            KeyError: If a configuration key or the ``token`` field of the
                authentication reply is missing.
        """
        name_yml = os.path.abspath(config)
        with open(name_yml, 'r') as ymlfile:
            cfg = yaml.load(ymlfile, Loader=yaml.BaseLoader)
        auth = cfg['auth'][cfg['env']]

        client = pymongo.MongoClient(self._mongo_uri(
            auth['mongo-user'],
            auth['mongo-pass'],
            auth['mongo-host'],
            auth['mongo-db'],
        ))
        self.db = client[auth['mongo-db']]

        logging.basicConfig(
            level=logging.INFO,
            filename='pipeline.log',
            datefmt='%Y-%m-%d %H:%M:%S',
            format='%(asctime)s %(levelname)-8s %(message)s')
        self.logger = logging.getLogger('pipeline')

        auth_response = requests.post(
            self._AUTH_URL,
            json={
                'username': auth['dimension-username'],
                'password': auth['dimension-password'],
            },
            timeout=self._REQUEST_TIMEOUT,
        )
        # Fail with the HTTP status instead of an opaque KeyError/JSON error
        # when the credentials are rejected.
        auth_response.raise_for_status()
        self.token = auth_response.json()['token']

        self.collection = 'KarenExtraRecords'
        self.extra_documents = 'extra_documents'
        self.organisations = 'organisations'
        # NOTE(review): ``default_language`` is documented as a text-index
        # option; confirm it is intended on this ascending index.
        self.db[self.organisations].create_index([('name', pymongo.ASCENDING)], name='name', default_language='english', unique=False)

    @staticmethod
    def _mongo_uri(user, password, host, database):
        """Build the MongoDB connection URI with fully escaped credentials."""
        # safe='' also escapes '/', which quote() leaves untouched by default
        # and which would otherwise cut the URI in the middle of the userinfo.
        return 'mongodb://{0}:{1}@{2}/{3}'.format(
            quote(user, safe=''),
            quote(password, safe=''),
            host,
            database,
        )

    @staticmethod
    def _parse_link(link):
        """Split a ``Dim link`` value into ``(dimension_id, doi)``.

        Exactly one element of the returned pair is not ``None``: an
        ``https`` URL yields its last path segment as the Dimensions id, any
        other value is returned (stripped) as the DOI.
        """
        # Strip first so surrounding whitespace cannot turn a URL into a DOI.
        link = link.strip()
        if link.startswith('https'):
            # Ignore a trailing slash so the id is never the empty string
            # for URLs such as ".../pub.123/".
            path = urlparse(link).path.rstrip('/')
            return path.split("/")[-1].strip(), None
        return None, link

    @staticmethod
    def _build_query(doi, dimension_id):
        """Return the DSL query selecting the publication by DOI or by id."""
        if doi is not None:
            return f'search publications where doi = "{doi}" return publications'
        return f'search publications where id = "{dimension_id}" return publications'

    def _find_organisation(self, row):
        """Return the organisation document matching *row*, or ``None``.

        The lookup tries ``id_original`` == ``row['Grid']`` first and falls
        back to ``name`` == ``row['Organisation']``.
        """
        organisations = self.db[self.organisations]
        match = organisations.find_one({'id_original': row['Grid']})
        if match is None:
            match = organisations.find_one({'name': row['Organisation']})
        return match

    def process(self):
        """Process every row of the source collection (see class docstring)."""
        source = self.db[self.collection]
        total = source.count_documents({})
        for index, row in enumerate(source.find({}), 1):
            dimension_id, doi = self._parse_link(row['Dim link'])

            organisation = self._find_organisation(row)
            if organisation is None:
                # Neither the ID nor the name of the organisation matches.
                # These are commonly journals, e.g.
                # https://app.dimensions.ai/details/publication/pub.1065208274
                self.logger.warning(
                    '%s/%s SKIPPED no organisation match for %s',
                    index, total, row['Dim link'])
                continue
            grid = organisation['id']

            dsl_query = self._build_query(doi, dimension_id)
            time.sleep(self._QUERY_DELAY)
            # The parsed reply is not stored; the call is kept so that a
            # non-JSON answer still surfaces as an exception.
            # The query is sent as UTF-8 bytes because a ``str`` body would be
            # encoded as ISO-8859-1 by the HTTP layer.
            requests.post(
                self._DSL_URL,
                data=dsl_query.encode('utf-8'),
                headers={'Authorization': "JWT " + self.token},
                timeout=self._REQUEST_TIMEOUT,
            ).json()

            self.db[self.extra_documents].insert_one({
                'link': row['Dim link'],
                'grid': grid,
                'dimension_id': dimension_id,
                'doi': doi,
            })
            self.logger.info('%s/%s INSERTED', index, total)
if __name__ == '__main__':
    # Command-line entry point: read the configuration path from
    # -cfg/--config (default ./config.yml) and run the whole pipeline.
    cli = argparse.ArgumentParser()
    cli.add_argument('-cfg', '--config', default='./config.yml')
    options = cli.parse_args()
    pipeline = KarenExtraRecordParse(options.config)
    pipeline.process()