Skip to content

Commit

Permalink
Merge pull request #19 from ehenneken/collection_update
Browse files Browse the repository at this point in the history
Skip and general updates
  • Loading branch information
ehenneken authored Jan 5, 2023
2 parents 953c46d + 4c9eade commit f908747
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 11 deletions.
37 changes: 28 additions & 9 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,22 +10,35 @@
'downloads':'/tmp/downloads.links'
}
ADS_REFERENCE_DATA = "/references/resolved"
# The root of the output location
OUTPUT_DIRECTORY = '/tmp/reports'
# ============================= APPLICATION ==================================== #
#
# Collections we are reporting on
COLLECTIONS = ['AST', 'HP', 'PS', 'HP_AST', 'PS_AST']
COLLECTIONS = ['AST', 'HP', 'PS', 'HP_AST', 'PS_AST', 'CORE']
# Report formats supported
FORMATS = ['NASA', 'CURATORS', 'MISSING']
# Report types supported
SUBJECTS = ['FULLTEXT', 'REFERENCES', 'SUMMARY']
# Which journals are we reporting on per collection
JOURNALS = {
'AST': ["ApJ..","ApJL","ApJS.","AJ...","MNRAS","A&A..","A&AS.","PASP.","AN...","PhRvD","JCAP.","APh..","CQGra"],
'PS': ["AREPS","ASTRA","AdSpR","AnGeo","Ap&SS","AsBio","CeMDA","E&PSL","EM&P.","GeCoA","IJAsB","Icar.","JAtS.","JGRA.","JGRD.",
"JGRE.","M&PS.","M&PSA","Metic","NatGe","P&SS.","PEPI.","RvGeo","SSRv.","SoSyR","SoPh.","SpWea","PSJ..","Moon.","SpPol"],
'HP': ['SoPh.','SpWea'],
'HP_AST': ["ApJ..","ApJL","ApJS.","AJ...","MNRAS","A&A..","A&AS."],
'PS_AST': ["ApJ..","ApJL","ApJS.","AJ...","MNRAS","A&A..","A&AS."]
'AST': ["ApJ","ApJL","ApJS","AJ","MNRAS","A&A","A&AS","PASP","AN","PhRvD","JCAP","APh","CQGra", "ARA&A"],
'PS': ["AREPS","ASTRA","AdSpR","AnGeo","Ap&SS","AsBio","CeMDA","E&PSL","EM&P","GeCoA","IJAsB","Icar","JAtS",
"JGRA","JGRD","JGRE","M&PS","M&PSA","Metic","NatGe","P&SS","PEPI","RvGeo","SSRv","SoSyR","SoPh",
"SpWea","PSJ","Moon","SpPol"],
'HP': ['SoPh','SpWea'],
'CORE': ['A&A', 'A&AS', 'A&C', 'AJ', 'AN', 'APh', 'ARA&A', 'AREPS', 'ARep', 'ASTRA', 'AcA', 'AdSpR', 'AmJPh',
'AnGeo', 'AnPhy', 'Ap&SS', 'ApJ', 'ApJL', 'ApJS', 'AsBio', 'AstBu', 'AstL', 'CQGra', 'CaJPh', 'CeMDA',
'ChJPh', 'ChPhC', 'CoPhC', 'CoSka', 'CoTPh', 'CosRe', 'E&PSL', 'E&SS', 'EL', 'EM&P', 'EPJD', 'EPJP',
'EPJST', 'ExA', 'FoPh', 'FrASS', 'FrP', 'GApFD', 'GReGr', 'Galax', 'GeCoA', 'GrCo', 'IJAsB', 'IJGMM',
'IJMPA', 'IJMPD', 'IJMPE', 'Icar', 'InJPh', 'JAI', 'JApA', 'JAtS', 'JCAP', 'JChPh', 'JETP', 'JFM',
'JGRA', 'JGRD', 'JGRE', 'JHEAp', 'JHEP', 'JKAS', 'JKPS', 'JMoSp', 'JPhG', 'JPlPh', 'JSWSC', 'M&PS',
'M&PSA', 'MNRAS', 'MPLA', 'Metic', 'Moon', 'NIMPA', 'NJPh', 'NatAs', 'NatGe', 'Natur', 'NewA', 'NewAR',
'NuPhA', 'NuPhB', 'P&SS', 'PAN', 'PASA', 'PASJ', 'PASP', 'PCCP', 'PDU', 'PEPI', 'PPCF', 'PSJ', 'PTEP',
'PhFl', 'PhLB', 'PhPl', 'PhR', 'PhRvC', 'PhRvD', 'PhRvE', 'PhRvF', 'PhRvL', 'PhyS', 'Prama', 'RAA', 'RMxAA',
'RScI', 'RaSc', 'RvGeo', 'RvMP', 'SSRv', 'Sci', 'ScPP', 'SoPh', 'SoSyR', 'SpPol', 'SpWea'],
'HP_AST': ["ApJ","ApJL","ApJS","AJ","MNRAS","A&A","A&AS"],
'PS_AST': ["ApJ","ApJL","ApJS","AJ","MNRAS","A&A","A&AS"]
}
# For some collections we define filters (to e.g. get the right content from multidisciplinary journals)
COLLECTION_FILTERS = {
Expand Down Expand Up @@ -79,6 +92,12 @@
# Journals for which the volume number equals the publication year,
# starting from the year given here.
YEAR_IS_VOL = {
    # De-dotted bibstem for consistency with the JOURNALS keys
    # (this commit removed trailing dots from all bibstems)
    'JCAP': 2003
}
# The root of the output location
OUTPUT_DIRECTORY = '/tmp/reports'
# Specification of volume ranges where full text coverage should not be
# calculated. Example: for some volumes there will be no full text (the
# publisher does not have it and the ADS will not digitize it).
# Values are parsed by xreport.utils._string2list: a single volume ('101')
# or an inclusive range ('1-13').
NO_FULLTEXT = {
    'AnGeo': '1-13',
    'SoSyR': '1-36',
    # De-dotted bibstem for consistency with the JOURNALS keys; the
    # skip lookup is keyed on the de-dotted journal name
    'SoPh': '101',
}

52 changes: 50 additions & 2 deletions xreport/reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from xreport.utils import _get_citations
from xreport.utils import _get_usage
from xreport.utils import _get_records
from xreport.utils import _string2list
from datetime import datetime
from datetime import date
from operator import itemgetter
Expand Down Expand Up @@ -94,6 +95,9 @@ def make_report(self, collection, report_type):
self.missing[journal] = []
# Update statistics data structure with general publication information
self._get_publication_data()
# Record all journals/volumes for which full text, references or metadata coverage
# needs to be skipped
self._get_skip_volumes()

def save_report(self, collection, report_type, subject):
"""
Expand Down Expand Up @@ -211,6 +215,36 @@ def _get_publication_data(self):
# for normalization
self.statsdata[journal]['pubdata'] = art_dict
#
def _get_skip_volumes(self):
    """
    For full text, references and metadata, check the config for
    volumes that need to be skipped in coverage reporting.

    Populates three dictionaries on the instance, mapping a journal
    bibstem to the list of volume numbers (ints) to skip:
      self.skip_fulltext   from config entry NO_FULLTEXT
      self.skip_references from config entry NO_REFERENCES
      self.skip_metadata   from config entry NO_METADATA
    A missing config entry simply means there is nothing to skip.
    """
    # BUGFIX: the previous version assigned all three config entries to
    # self.skip_fulltext (copy/paste error), so skip_references and
    # skip_metadata were always empty and the full-text skip list was
    # polluted with reference/metadata entries.
    self.skip_fulltext = {}
    self.skip_references = {}
    self.skip_metadata = {}
    targets = [
        ('NO_FULLTEXT', self.skip_fulltext),
        ('NO_REFERENCES', self.skip_references),
        ('NO_METADATA', self.skip_metadata),
    ]
    for config_key, skip_dict in targets:
        # dict.get with an empty default replaces the earlier bare
        # try/except around a KeyError-prone lookup
        for jrnl, volspec in self.config.get(config_key, {}).items():
            # NOTE(review): _string2list returns ints; callers test
            # `volume in skip` against pubdata volume keys -- confirm
            # those keys are ints as well, otherwise the skip never fires
            skip_dict[jrnl] = _string2list(volspec)

def _highlight_cells(self, val):
"""
Mapping function for use in Pandas to apply conditional cell coloring
Expand Down Expand Up @@ -306,6 +340,7 @@ def _get_fulltext_data_general(self):
For a set of journals, get full text data (the number of records with full text per volume)
"""
# Determine if certain volumes need to be skipped:
for journal in self.journals:
# The ADS query to retrieve all records with full text for a given journal
# Filters:
Expand All @@ -318,7 +353,14 @@ def _get_fulltext_data_general(self):
full_dict = _get_facet_data(self.config, query, 'volume')
# Coverage data is stored in a dictionary
cov_dict = {}
# Collect volumes to be skipped, if any
try:
skip = self.skip_fulltext[journal]
except:
skip = []
for volume in sorted(self.statsdata[journal]['pubdata'].keys()):
if volume in skip:
continue
try:
frac = 100*float(full_dict[volume])/float(self.statsdata[journal]['pubdata'][volume])
except:
Expand All @@ -339,7 +381,14 @@ def _get_fulltext_data_classic(self, ft_source):
for journal in self.journals:
# Coverage data is stored in a dictionary
cov_dict = {}
# Collect volumes to be skipped, if any
try:
skip = self.skip_fulltext[journal]
except:
skip = []
for volume in sorted(self.statsdata[journal]['pubdata'].keys()):
if volume in skip:
continue
# For each volume of the journals in the collection we query the Pandas dataframe to retrieve the sources of full text
if ft_source == 'arxiv':
# How many records are there with full text from arXiv?
Expand All @@ -366,8 +415,7 @@ def _get_missing_publications(self):
"""
for journal in self.journals:
# The ADS query to retrieve all records without full text for a given journal
# Additional filter: records entered up to one month from now
query = 'bibstem:"{0}" -fulltext_mtime:["1000-01-01t00:00:00.000Z" TO *] entdate:[* TO NOW-40DAYS] doctype:article'.format(journal)
query = 'bibstem:"{0}" -fulltext_mtime:["1000-01-01t00:00:00.000Z" TO *] doctype:article'.format(journal)
missing_pubs = _get_records(self.config, query, 'bibcode,doi,title,first_author_norm,volume,issue')
self.missing[journal] = missing_pubs

Expand Down
18 changes: 18 additions & 0 deletions xreport/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,24 @@ def _group(lst, n):
if len(val) == n:
yield tuple(val)

def _string2list(numstr):
"""
Convert a string of numbers into a list/range
Example: '1,2,5-7,10' --> [1, 2, 5, 6, 7, 10]
param: str: the input string
"""
result = []
for part in numstr.split(','):
if '-' in part:
a, b = part.split('-')
a, b = int(a), int(b)
result.extend(range(a, b + 1))
else:
a = int(part)
result.append(a)
return result

def _make_dict(tup, key_is_int=True):
"""
Turn list of tuples into a dictionary
Expand Down

0 comments on commit f908747

Please sign in to comment.