Merge pull request #28 from golnazads/master

arXiv source metadata is switched, and pyingest dropped
adsabs · Dec 18, 2023 · 46d01fa · 46d01fa
2 parents 8c8fb27 + a186396
commit 46d01fa
Show file tree

Hide file tree

Showing 18 changed files with 252 additions and 249 deletions.
diff --git a/adsdocmatch/match_w_metadata.py b/adsdocmatch/match_w_metadata.py
@@ -6,7 +6,6 @@
 from adsdocmatch.pub_parser import get_pub_metadata
 from adsdocmatch.oracle_util import OracleUtil
 from adsdocmatch.matchable_status import matchable_status
-from pyingest.parsers.arxiv import ArxivParser
 from adsputils import setup_logging, load_config
 
 proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), "../"))
@@ -30,7 +29,6 @@ class MatchMetadata():
 
     process_pub_bibstem = {}
 
-    ARXIV_PARSER = ArxivParser()
     ORACLE_UTIL = OracleUtil()
 
     def get_input_filenames(self, filename):
@@ -85,6 +83,45 @@ def process_pub_metadata(self, metadata):
             self.process_pub_bibstem[bibstem] = 1 if status == True else (0 if status == False else -1)
         return self.process_pub_bibstem[bibstem]
 
+    def parse_arXiv_comments(self, metadata):
+        """
+
+        :param metadata:
+        :return:
+        """
+        comments = metadata.get('arXiv_comments', '')
+        if comments:
+            # extract doi out of comments if there are any
+            match = self.re_doi.search(comments)
+            if match:
+                metadata['doi'] = match.group(1)
+            else:
+                doi = metadata.get('properties', {}).get('DOI', None)
+                if doi:
+                    metadata['doi'] = doi.replace('doi:', '')
+            match_doctype = None
+            title = metadata.get('title')
+            # check title for erratum
+            match = self.re_doctype_errata.search(title)
+            if match:
+                match_doctype = ['erratum']
+            else:
+                match = self.re_doctype_bookreview.search(title)
+                if match:
+                    match_doctype = ['bookreview']
+                else:
+                    # check both comments and title for thesis
+                    match = self.re_doctype_thesis.search("%s %s" % (comments, title))
+                    if match:
+                        match_doctype = ['phdthesis', 'mastersthesis']
+            must_match = any(ads_archive_class in arxiv_class for arxiv_class in metadata.get('class', []) for ads_archive_class in self.MUST_MATCH)
+        else:
+            metadata.pop("doi", None)
+            match_doctype = None
+            must_match = False
+            comments = ''
+        return metadata, comments, must_match, match_doctype
+
     def write_results(self, result_filename, matches, metadata_filename, rerun_filename):
         """
 
@@ -202,41 +239,8 @@ def match_to_pub(self, filename):
         """
         try:
             with open(filename, 'rb') as arxiv_fp:
-                journal = filename.strip().split('/')[-5]
-                if journal == 'ArXiv':
-                    metadata = self.ARXIV_PARSER.parse(arxiv_fp)
-                    comments = ' '.join(metadata.get('comments', []))
-                    # extract doi out of comments if there are any
-                    match = self.re_doi.search(comments)
-                    if match:
-                        metadata['doi'] = match.group(1)
-                    else:
-                        doi = metadata.get('properties', {}).get('DOI', None)
-                        if doi:
-                            metadata['doi'] = doi.replace('doi:', '')
-                    match_doctype = None
-                    title = metadata.get('title')
-                    # check title for erratum
-                    match = self.re_doctype_errata.search(title)
-                    if match:
-                        match_doctype = ['erratum']
-                    else:
-                        match = self.re_doctype_bookreview.search(title)
-                        if match:
-                            match_doctype = ['bookreview']
-                        else:
-                            # check both comments and title for thesis
-                            match = self.re_doctype_thesis.search("%s %s" % (comments, title))
-                            if match:
-                                match_doctype = ['phdthesis', 'mastersthesis']
-                    must_match = any(ads_archive_class in arxiv_class for arxiv_class in metadata.get('class', []) for ads_archive_class in self.MUST_MATCH)
-                else:
-                    metadata = get_pub_metadata(arxiv_fp.read())
-                    # remove the doi, since in this case, oracle thinks it is the publication doi
-                    metadata.pop("doi", None)
-                    match_doctype = None
-                    must_match = False
-                    comments = ''
+                metadata = get_pub_metadata(arxiv_fp.read())
+                metadata, comments, must_match, match_doctype = self.parse_arXiv_comments(metadata)
                 oracle_matches = self.ORACLE_UTIL.get_matches(metadata, 'eprint', must_match, match_doctype)
                 # before proceeding see if this arXiv article's class is among the ones that ADS archives the
                 # published version if available
@@ -456,3 +460,13 @@ def process_match_to_pub(self, path):
         combined_output_filename = "%s%s" % (path, config.get('DOCMATCHPIPELINE_EPRINT_COMBINED_FILENAME', 'default'))
         self.merge_classic_docmatch_results(classic_matched_filename, result_filename, combined_output_filename)
         return combined_output_filename
+
+if __name__ == '__main__':
+    print(MatchMetadata().match_to_pub('/proj/ads/abstracts/gen/text/L48/L48-23288.abs'))
+    print(MatchMetadata().match_to_arXiv('/proj/ads/abstracts/gen/text/L48/L48-23288.abs'))
+
+    '''
+/proj/ads/abstracts/gen/text/L52/L52-28159.abs
+/proj/ads/abstracts/gen/text/L48/L48-23288.abs
+/proj/ads/abstracts/sources/ArXiv/oai/arXiv.org/2306/02768
+'''
diff --git a/adsdocmatch/pub_parser/__init__.py b/adsdocmatch/pub_parser/__init__.py
@@ -51,6 +51,7 @@ def as_needed(article):
         ("Publication Date", "pubdate"),
         ("Bibliographic Code", "bibcode"),
         ("DOI", "doi"),
+        ("arXiv_comments", "arXiv_comments")
     ]
     return_record = {}
     for src_key, dest_key in field_mappings:
@@ -131,4 +132,8 @@ def get_pub_metadata(contents):
     switch_date = article['Publication Date'].split('/')
     article['Publication Date'] = switch_date[1] + '/' + switch_date[0]
 
+    if 'Origin' in fields_found_in_file:
+        if article['Origin'] == 'ARXIV':
+            article['arXiv_comments'] = article.get('Comments', '')
+
     return as_needed(article)
diff --git a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/0708/1752 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/0708/1752
diff --git a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1701/00200 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1701/00200
diff --git a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1801/01021 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1801/01021
diff --git a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2106/07251 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2106/07251
diff --git a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2312/08579 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2312/08579
diff --git a/...unittests/stubdata/text/L48/L48-23288.abs → ...ch/tests/unittests/stubdata/L48-23288.abs b/...unittests/stubdata/text/L48/L48-23288.abs → ...ch/tests/unittests/stubdata/L48-23288.abs
diff --git a/...unittests/stubdata/text/L52/L52-28159.abs → ...ch/tests/unittests/stubdata/L52-28159.abs b/...unittests/stubdata/text/L52/L52-28159.abs → ...ch/tests/unittests/stubdata/L52-28159.abs
diff --git a/adsdocmatch/tests/unittests/stubdata/X01-74270.abs b/adsdocmatch/tests/unittests/stubdata/X01-74270.abs
@@ -0,0 +1,32 @@
+Title:              Validation of the new Hipparcos reduction
+Authors:            van Leeuwen, F.
+Journal:            eprint arXiv:0708.1752
+Publication Date:   08/2007
+Comments:           12 pages, 19 figures, accepted for publication by Astronomy and Astrophysics;
+                    Astron.Astrophys.474:653-664,2007;
+                    doi:10.1051/0004-6361:20078357
+Origin:             ARXIV
+Keywords:           Astrophysics
+Bibliographic Code: 2007arXiv0708.1752V
+
+                               Abstract
+Context.A new reduction of the astrometric data as produced by the
+Hipparcos mission has been published, claiming accuracies for nearly all
+stars brighter than magnitude Hp = 8 to be better, by up to a factor 4,
+than in the original catalogue. Aims.The new Hipparcos astrometric
+catalogue is checked for the quality of the data and the consistency of
+the formal errors as well as the possible presence of error
+correlations. The differences with the earlier publication are
+explained. Methods. The internal errors are followed through the
+reduction process, and the external errors are investigated on the basis
+of a comparison with radio observations of a small selection of stars,
+and the distribution of negative parallaxes. Error correlation levels
+are investigated and the reduction by more than a factor 10 as obtained
+in the new catalogue is explained. Results.The formal errors on the
+parallaxes for the new catalogue are confirmed. The presence of a small
+amount of additional noise, though unlikely, cannot be ruled out.
+Conclusions. The new reduction of the Hipparcos astrometric data
+provides an improvement by a factor 2.2 in the total weight compared to
+the catalogue published in 1997, and provides much improved data for a
+wide range of studies on stellar luminosities and local galactic
+kinematics.
diff --git a/adsdocmatch/tests/unittests/stubdata/X10-50737.abs b/adsdocmatch/tests/unittests/stubdata/X10-50737.abs
@@ -0,0 +1,17 @@
+Title:              Post-Lie algebra structures on the Witt algebra
+Authors:            Tang, Xiaomin
+Journal:            eprint arXiv:1701.00200
+Publication Date:   01/2017
+Comments:           24 pages
+Origin:             ARXIV
+Keywords:           Mathematics - Rings and Algebras, 17A30, 17A42,
+                    17B60, 18D50
+Bibliographic Code: 2017arXiv170100200T
+
+                               Abstract
+In this paper, we characterize the graded post-Lie algebra structures
+and a class of shifting post-Lie algebra structures on the Witt algebra.
+We obtain some new Lie algebras and give a class of their modules. As an
+application, the homogeneous Rota-Baxter operators and a class of
+non-homogeneous Rota-Baxter operators of weight $1$ on the Witt algebra
+are studied.