diff --git a/config.py b/config.py
index 3d1af7f..270ddae 100644
--- a/config.py
+++ b/config.py
@@ -10,22 +10,35 @@
     'downloads':'/tmp/downloads.links'
 }
 ADS_REFERENCE_DATA = "/references/resolved"
+# The root of the output location
+OUTPUT_DIRECTORY = '/tmp/reports'
 # ============================= APPLICATION ====================================
 #
 #
 # Collections we are reporting on
-COLLECTIONS = ['AST', 'HP', 'PS', 'HP_AST', 'PS_AST']
+COLLECTIONS = ['AST', 'HP', 'PS', 'HP_AST', 'PS_AST', 'CORE']
 # Report formats supported
 FORMATS = ['NASA', 'CURATORS', 'MISSING']
 # Report types supported
 SUBJECTS = ['FULLTEXT', 'REFERENCES', 'SUMMARY']
 # Which journals are we reporting on per collection
 JOURNALS = {
-    'AST': ["ApJ..","ApJL","ApJS.","AJ...","MNRAS","A&A..","A&AS.","PASP.","AN...","PhRvD","JCAP.","APh..","CQGra"],
-    'PS': ["AREPS","ASTRA","AdSpR","AnGeo","Ap&SS","AsBio","CeMDA","E&PSL","EM&P.","GeCoA","IJAsB","Icar.","JAtS.","JGRA.","JGRD.",
-           "JGRE.","M&PS.","M&PSA","Metic","NatGe","P&SS.","PEPI.","RvGeo","SSRv.","SoSyR","SoPh.","SpWea","PSJ..","Moon.","SpPol"],
-    'HP': ['SoPh.','SpWea'],
-    'HP_AST': ["ApJ..","ApJL","ApJS.","AJ...","MNRAS","A&A..","A&AS."],
-    'PS_AST': ["ApJ..","ApJL","ApJS.","AJ...","MNRAS","A&A..","A&AS."]
+    'AST': ["ApJ","ApJL","ApJS","AJ","MNRAS","A&A","A&AS","PASP","AN","PhRvD","JCAP","APh","CQGra","ARA&A"],
+    'PS': ["AREPS","ASTRA","AdSpR","AnGeo","Ap&SS","AsBio","CeMDA","E&PSL","EM&P","GeCoA","IJAsB","Icar","JAtS",
+           "JGRA","JGRD","JGRE","M&PS","M&PSA","Metic","NatGe","P&SS","PEPI","RvGeo","SSRv","SoSyR","SoPh",
+           "SpWea","PSJ","Moon","SpPol"],
+    'HP': ['SoPh','SpWea'],
+    'CORE': ['A&A', 'A&AS', 'A&C', 'AJ', 'AN', 'APh', 'ARA&A', 'AREPS', 'ARep', 'ASTRA', 'AcA', 'AdSpR', 'AmJPh',
+             'AnGeo', 'AnPhy', 'Ap&SS', 'ApJ', 'ApJL', 'ApJS', 'AsBio', 'AstBu', 'AstL', 'CQGra', 'CaJPh', 'CeMDA',
+             'ChJPh', 'ChPhC', 'CoPhC', 'CoSka', 'CoTPh', 'CosRe', 'E&PSL', 'E&SS', 'EL', 'EM&P', 'EPJD', 'EPJP',
+             'EPJST', 'ExA', 'FoPh', 'FrASS', 'FrP', 'GApFD', 'GReGr', 'Galax', 'GeCoA', 'GrCo', 'IJAsB', 'IJGMM',
+             'IJMPA', 'IJMPD', 'IJMPE', 'Icar', 'InJPh', 'JAI', 'JApA', 'JAtS', 'JCAP', 'JChPh', 'JETP', 'JFM',
+             'JGRA', 'JGRD', 'JGRE', 'JHEAp', 'JHEP', 'JKAS', 'JKPS', 'JMoSp', 'JPhG', 'JPlPh', 'JSWSC', 'M&PS',
+             'M&PSA', 'MNRAS', 'MPLA', 'Metic', 'Moon', 'NIMPA', 'NJPh', 'NatAs', 'NatGe', 'Natur', 'NewA', 'NewAR',
+             'NuPhA', 'NuPhB', 'P&SS', 'PAN', 'PASA', 'PASJ', 'PASP', 'PCCP', 'PDU', 'PEPI', 'PPCF', 'PSJ', 'PTEP',
+             'PhFl', 'PhLB', 'PhPl', 'PhR', 'PhRvC', 'PhRvD', 'PhRvE', 'PhRvF', 'PhRvL', 'PhyS', 'Prama', 'RAA', 'RMxAA',
+             'RScI', 'RaSc', 'RvGeo', 'RvMP', 'SSRv', 'Sci', 'ScPP', 'SoPh', 'SoSyR', 'SpPol', 'SpWea'],
+    'HP_AST': ["ApJ","ApJL","ApJS","AJ","MNRAS","A&A","A&AS"],
+    'PS_AST': ["ApJ","ApJL","ApJS","AJ","MNRAS","A&A","A&AS"]
 }
 # For some collection we define filters (to e.g. get the right content from multidisciplinary journals)
 COLLECTION_FILTERS = {
@@ -79,6 +92,12 @@
 YEAR_IS_VOL = {
     'JCAP.':2003
 }
-# The root of the output location
-OUTPUT_DIRECTORY = '/tmp/reports'
+# Specification of volume ranges where coverage should not be calculated
+# Example: for some volumes there will be no full text (the publisher does not have it
+# and the ADS will not digitize)
+NO_FULLTEXT = {
+    'AnGeo': '1-13',
+    'SoSyR': '1-36',
+    'SoPh.': '101',
+}
 
diff --git a/xreport/reports.py b/xreport/reports.py
index fa9d273..272cc96 100644
--- a/xreport/reports.py
+++ b/xreport/reports.py
@@ -6,6 +6,7 @@
 from xreport.utils import _get_citations
 from xreport.utils import _get_usage
 from xreport.utils import _get_records
+from xreport.utils import _string2list
 from datetime import datetime
 from datetime import date
 from operator import itemgetter
@@ -94,6 +95,9 @@ def make_report(self, collection, report_type):
             self.missing[journal] = []
         # Update statistics data structure with general publication information
         self._get_publication_data()
+        # Record all journals/volumes for which full text, references or metadata coverage
+        # needs to be skipped
+        self._get_skip_volumes()
 
     def save_report(self, collection, report_type, subject):
         """
@@ -211,6 +215,36 @@ def _get_publication_data(self):
             # for normalization
             self.statsdata[journal]['pubdata'] = art_dict
     #
+    def _get_skip_volumes(self):
+        """
+        For full text, references and metadata, check the config for
+        volumes that need to be skipped in coverage reporting
+        """
+        # Determine all volumes for which we need to skip full text coverage reporting
+        self.skip_fulltext = {}
+        try:
+            no_fulltext = self.config['NO_FULLTEXT']
+            for jrnl in no_fulltext.keys():
+                self.skip_fulltext[jrnl] = _string2list(no_fulltext.get(jrnl,'0'))
+        except:
+            pass
+        # Determine all volumes for which we need to skip reference match reporting
+        self.skip_references = {}
+        try:
+            no_references = self.config['NO_REFERENCES']
+            for jrnl in no_references.keys():
+                self.skip_references[jrnl] = _string2list(no_references.get(jrnl,'0'))
+        except:
+            pass
+        # Determine all volumes for which we need to skip metadata coverage reporting
+        self.skip_metadata = {}
+        try:
+            no_metadata = self.config['NO_METADATA']
+            for jrnl in no_metadata.keys():
+                self.skip_metadata[jrnl] = _string2list(no_metadata.get(jrnl,'0'))
+        except:
+            pass
+
     def _highlight_cells(self, val):
         """
         Mapping function for use in Pandas to apply conditional cell coloring
@@ -306,6 +340,7 @@ def _get_fulltext_data_general(self):
         For a set of journals, get full text data (the number of records
         with full text per volume)
         """
+        # Determine if certain volumes need to be skipped:
        for journal in self.journals:
             # The ADS query to retrieve all records with full text for a given journal
             # Filters:
@@ -318,7 +353,14 @@
             full_dict = _get_facet_data(self.config, query, 'volume')
             # Coverage data is stored in a dictionary
             cov_dict = {}
+            # Collect volumes to be skipped, if any
+            try:
+                skip = self.skip_fulltext[journal]
+            except:
+                skip = []
             for volume in sorted(self.statsdata[journal]['pubdata'].keys()):
+                if volume in skip:
+                    continue
                 try:
                     frac = 100*float(full_dict[volume])/float(self.statsdata[journal]['pubdata'][volume])
                 except:
@@ -339,7 +381,14 @@ def _get_fulltext_data_classic(self, ft_source):
         for journal in self.journals:
             # Coverage data is stored in a dictionary
             cov_dict = {}
+            # Collect volumes to be skipped, if any
+            try:
+                skip = self.skip_fulltext[journal]
+            except:
+                skip = []
             for volume in sorted(self.statsdata[journal]['pubdata'].keys()):
+                if volume in skip:
+                    continue
                 # For each volume of the journals in the collection we query the Pandas dataframe to retrieve the sources of full text
                 if ft_source == 'arxiv':
                     # How many records are there with full text from arXiv?
@@ -366,8 +415,7 @@ def _get_missing_publications(self):
         """
         for journal in self.journals:
             # The ADS query to retrieve all records without full text for a given journal
-            # Additional filter: records entered up to one month from now
-            query = 'bibstem:"{0}" -fulltext_mtime:["1000-01-01t00:00:00.000Z" TO *] entdate:[* TO NOW-40DAYS] doctype:article'.format(journal)
+            query = 'bibstem:"{0}" -fulltext_mtime:["1000-01-01t00:00:00.000Z" TO *] doctype:article'.format(journal)
             missing_pubs = _get_records(self.config, query, 'bibcode,doi,title,first_author_norm,volume,issue')
             self.missing[journal] = missing_pubs
 
diff --git a/xreport/utils.py b/xreport/utils.py
index 3dda294..0e64dc9 100644
--- a/xreport/utils.py
+++ b/xreport/utils.py
@@ -27,6 +27,24 @@ def _group(lst, n):
         if len(val) == n:
             yield tuple(val)
 
+def _string2list(numstr):
+    """
+    Convert a string of numbers into a list/range
+    Example: '1,2,5-7,10' --> [1, 2, 5, 6, 7, 10]
+
+    param: numstr: the input string
+    """
+    result = []
+    for part in numstr.split(','):
+        if '-' in part:
+            a, b = part.split('-')
+            a, b = int(a), int(b)
+            result.extend(range(a, b + 1))
+        else:
+            a = int(part)
+            result.append(a)
+    return result
+
 def _make_dict(tup, key_is_int=True):
     """
     Turn list of tuples into a dictionary
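
Note (not part of the patch): a minimal sketch of how the NO_FULLTEXT setting is expected to flow through the new helper and the volume-skip checks above. The dictionary values are copied from config.py; the volume numbers in the loop are illustrative, and the sketch assumes the xreport package is importable.

    from xreport.utils import _string2list

    # Volume ranges copied from the NO_FULLTEXT setting added in config.py
    NO_FULLTEXT = {
        'AnGeo': '1-13',
        'SoSyR': '1-36',
        'SoPh.': '101',
    }

    # Expand the range notation into explicit volume numbers, mirroring what
    # _get_skip_volumes() stores in self.skip_fulltext
    skip_fulltext = {jrnl: _string2list(vols) for jrnl, vols in NO_FULLTEXT.items()}
    print(skip_fulltext['AnGeo'])        # [1, 2, 3, ..., 13]
    print(_string2list('1,2,5-7,10'))    # [1, 2, 5, 6, 7, 10]

    # The coverage loops then skip these volumes (volume numbers are illustrative)
    for volume in [1, 13, 14]:
        if volume in skip_fulltext.get('AnGeo', []):
            continue  # no full text expected, so no coverage is reported
        print('compute full text coverage for AnGeo volume', volume)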