Skip to content

Commit

Permalink
Merge pull request #19 from ehenneken/collection_update
Browse files Browse the repository at this point in the history
Skip and general updates
  • Loading branch information
ehenneken authored Jan 5, 2023
2 parents 953c46d + 4c9eade commit f908747
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 11 deletions.
37 changes: 28 additions & 9 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,22 +10,35 @@
'downloads':'/tmp/downloads.links'
}
ADS_REFERENCE_DATA = "/references/resolved"
# The root of the output location
OUTPUT_DIRECTORY = '/tmp/reports'
# ============================= APPLICATION ==================================== #
#
# Collections we are reporting on
COLLECTIONS = ['AST', 'HP', 'PS', 'HP_AST', 'PS_AST']
COLLECTIONS = ['AST', 'HP', 'PS', 'HP_AST', 'PS_AST', 'CORE']
# Report formats supported
FORMATS = ['NASA', 'CURATORS', 'MISSING']
# Report types supported
SUBJECTS = ['FULLTEXT', 'REFERENCES', 'SUMMARY']
# Which journals are we reporting on per collection
JOURNALS = {
'AST': ["ApJ..","ApJL","ApJS.","AJ...","MNRAS","A&A..","A&AS.","PASP.","AN...","PhRvD","JCAP.","APh..","CQGra"],
'PS': ["AREPS","ASTRA","AdSpR","AnGeo","Ap&SS","AsBio","CeMDA","E&PSL","EM&P.","GeCoA","IJAsB","Icar.","JAtS.","JGRA.","JGRD.",
"JGRE.","M&PS.","M&PSA","Metic","NatGe","P&SS.","PEPI.","RvGeo","SSRv.","SoSyR","SoPh.","SpWea","PSJ..","Moon.","SpPol"],
'HP': ['SoPh.','SpWea'],
'HP_AST': ["ApJ..","ApJL","ApJS.","AJ...","MNRAS","A&A..","A&AS."],
'PS_AST': ["ApJ..","ApJL","ApJS.","AJ...","MNRAS","A&A..","A&AS."]
'AST': ["ApJ","ApJL","ApJS","AJ","MNRAS","A&A","A&AS","PASP","AN","PhRvD","JCAP","APh","CQGra", "ARA&A"],
'PS': ["AREPS","ASTRA","AdSpR","AnGeo","Ap&SS","AsBio","CeMDA","E&PSL","EM&P","GeCoA","IJAsB","Icar","JAtS",
"JGRA","JGRD","JGRE","M&PS","M&PSA","Metic","NatGe","P&SS","PEPI","RvGeo","SSRv","SoSyR","SoPh",
"SpWea","PSJ","Moon","SpPol"],
'HP': ['SoPh','SpWea'],
'CORE': ['A&A', 'A&AS', 'A&C', 'AJ', 'AN', 'APh', 'ARA&A', 'AREPS', 'ARep', 'ASTRA', 'AcA', 'AdSpR', 'AmJPh',
'AnGeo', 'AnPhy', 'Ap&SS', 'ApJ', 'ApJL', 'ApJS', 'AsBio', 'AstBu', 'AstL', 'CQGra', 'CaJPh', 'CeMDA',
'ChJPh', 'ChPhC', 'CoPhC', 'CoSka', 'CoTPh', 'CosRe', 'E&PSL', 'E&SS', 'EL', 'EM&P', 'EPJD', 'EPJP',
'EPJST', 'ExA', 'FoPh', 'FrASS', 'FrP', 'GApFD', 'GReGr', 'Galax', 'GeCoA', 'GrCo', 'IJAsB', 'IJGMM',
'IJMPA', 'IJMPD', 'IJMPE', 'Icar', 'InJPh', 'JAI', 'JApA', 'JAtS', 'JCAP', 'JChPh', 'JETP', 'JFM',
'JGRA', 'JGRD', 'JGRE', 'JHEAp', 'JHEP', 'JKAS', 'JKPS', 'JMoSp', 'JPhG', 'JPlPh', 'JSWSC', 'M&PS',
'M&PSA', 'MNRAS', 'MPLA', 'Metic', 'Moon', 'NIMPA', 'NJPh', 'NatAs', 'NatGe', 'Natur', 'NewA', 'NewAR',
'NuPhA', 'NuPhB', 'P&SS', 'PAN', 'PASA', 'PASJ', 'PASP', 'PCCP', 'PDU', 'PEPI', 'PPCF', 'PSJ', 'PTEP',
'PhFl', 'PhLB', 'PhPl', 'PhR', 'PhRvC', 'PhRvD', 'PhRvE', 'PhRvF', 'PhRvL', 'PhyS', 'Prama', 'RAA', 'RMxAA',
'RScI', 'RaSc', 'RvGeo', 'RvMP', 'SSRv', 'Sci', 'ScPP', 'SoPh', 'SoSyR', 'SpPol', 'SpWea'],
'HP_AST': ["ApJ","ApJL","ApJS","AJ","MNRAS","A&A","A&AS"],
'PS_AST': ["ApJ","ApJL","ApJS","AJ","MNRAS","A&A","A&AS"]
}
# For some collections we define filters (to e.g. get the right content from multidisciplinary journals)
COLLECTION_FILTERS = {
Expand Down Expand Up @@ -79,6 +92,12 @@
# Journals for which the volume number equals the publication year,
# starting from the year given here.
YEAR_IS_VOL = {
    # De-dotted bibstem for consistency with the JOURNALS keys
    # (this commit removed trailing dots from all bibstems)
    'JCAP': 2003
}
# The root of the output location
OUTPUT_DIRECTORY = '/tmp/reports'
# Specification of volume ranges where full text coverage should not be
# calculated. Example: for some volumes there will be no full text (the
# publisher does not have it and the ADS will not digitize it).
# Values are parsed by xreport.utils._string2list: a single volume ('101')
# or an inclusive range ('1-13').
NO_FULLTEXT = {
    'AnGeo': '1-13',
    'SoSyR': '1-36',
    # De-dotted bibstem for consistency with the JOURNALS keys; the
    # skip lookup is keyed on the de-dotted journal name
    'SoPh': '101',
}

52 changes: 50 additions & 2 deletions xreport/reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from xreport.utils import _get_citations
from xreport.utils import _get_usage
from xreport.utils import _get_records
from xreport.utils import _string2list
from datetime import datetime
from datetime import date
from operator import itemgetter
Expand Down Expand Up @@ -94,6 +95,9 @@ def make_report(self, collection, report_type):
self.missing[journal] = []
# Update statistics data structure with general publication information
self._get_publication_data()
# Record all journals/volumes for which full text, references or metadata coverage
# needs to be skipped
self._get_skip_volumes()

def save_report(self, collection, report_type, subject):
"""
Expand Down Expand Up @@ -211,6 +215,36 @@ def _get_publication_data(self):
# for normalization
self.statsdata[journal]['pubdata'] = art_dict
#
def _get_skip_volumes(self):
    """
    For full text, references and metadata, check the config for
    volumes that need to be skipped in coverage reporting.

    Populates three dictionaries on the instance, mapping a journal
    bibstem to the list of volume numbers (ints) to skip:
      self.skip_fulltext   from config entry NO_FULLTEXT
      self.skip_references from config entry NO_REFERENCES
      self.skip_metadata   from config entry NO_METADATA
    A missing config entry simply means there is nothing to skip.
    """
    # BUGFIX: the previous version assigned all three config entries to
    # self.skip_fulltext (copy/paste error), so skip_references and
    # skip_metadata were always empty and the full-text skip list was
    # polluted with reference/metadata entries.
    self.skip_fulltext = {}
    self.skip_references = {}
    self.skip_metadata = {}
    targets = [
        ('NO_FULLTEXT', self.skip_fulltext),
        ('NO_REFERENCES', self.skip_references),
        ('NO_METADATA', self.skip_metadata),
    ]
    for config_key, skip_dict in targets:
        # dict.get with an empty default replaces the earlier bare
        # try/except around a KeyError-prone lookup
        for jrnl, volspec in self.config.get(config_key, {}).items():
            # NOTE(review): _string2list returns ints; callers test
            # `volume in skip` against pubdata volume keys -- confirm
            # those keys are ints as well, otherwise the skip never fires
            skip_dict[jrnl] = _string2list(volspec)

def _highlight_cells(self, val):
"""
Mapping function for use in Pandas to apply conditional cell coloring
Expand Down Expand Up @@ -306,6 +340,7 @@ def _get_fulltext_data_general(self):
For a set of journals, get full text data (the number of records with full text per volume)
"""
# Determine if certain volumes need to be skipped:
for journal in self.journals:
# The ADS query to retrieve all records with full text for a given journal
# Filters:
Expand All @@ -318,7 +353,14 @@ def _get_fulltext_data_general(self):
full_dict = _get_facet_data(self.config, query, 'volume')
# Coverage data is stored in a dictionary
cov_dict = {}
# Collect volumes to be skipped, if any
try:
skip = self.skip_fulltext[journal]
except:
skip = []
for volume in sorted(self.statsdata[journal]['pubdata'].keys()):
if volume in skip:
continue
try:
frac = 100*float(full_dict[volume])/float(self.statsdata[journal]['pubdata'][volume])
except:
Expand All @@ -339,7 +381,14 @@ def _get_fulltext_data_classic(self, ft_source):
for journal in self.journals:
# Coverage data is stored in a dictionary
cov_dict = {}
# Collect volumes to be skipped, if any
try:
skip = self.skip_fulltext[journal]
except:
skip = []
for volume in sorted(self.statsdata[journal]['pubdata'].keys()):
if volume in skip:
continue
# For each volume of the journals in the collection we query the Pandas dataframe to retrieve the sources of full text
if ft_source == 'arxiv':
# How many records are there with full text from arXiv?
Expand All @@ -366,8 +415,7 @@ def _get_missing_publications(self):
"""
for journal in self.journals:
# The ADS query to retrieve all records without full text for a given journal
# Additional filter: records entered up to one month from now
query = 'bibstem:"{0}" -fulltext_mtime:["1000-01-01t00:00:00.000Z" TO *] entdate:[* TO NOW-40DAYS] doctype:article'.format(journal)
query = 'bibstem:"{0}" -fulltext_mtime:["1000-01-01t00:00:00.000Z" TO *] doctype:article'.format(journal)
missing_pubs = _get_records(self.config, query, 'bibcode,doi,title,first_author_norm,volume,issue')
self.missing[journal] = missing_pubs

Expand Down
18 changes: 18 additions & 0 deletions xreport/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,24 @@ def _group(lst, n):
if len(val) == n:
yield tuple(val)

def _string2list(numstr):
"""
Convert a string of numbers into a list/range
Example: '1,2,5-7,10' --> [1, 2, 5, 6, 7, 10]
param: str: the input string
"""
result = []
for part in numstr.split(','):
if '-' in part:
a, b = part.split('-')
a, b = int(a), int(b)
result.extend(range(a, b + 1))
else:
a = int(part)
result.append(a)
return result

def _make_dict(tup, key_is_int=True):
"""
Turn list of tuples into a dictionary
Expand Down

0 comments on commit f908747

Please sign in to comment.