Skip to content

Commit

Permalink
Merge pull request #27 from golnazads/master
Browse files Browse the repository at this point in the history
Support earth science records that can appear both as an eprint and as a publication
  • Loading branch information
golnazads authored Dec 18, 2023
2 parents 232d829 + ce92723 commit 8c8fb27
Show file tree
Hide file tree
Showing 10 changed files with 192 additions and 38 deletions.
56 changes: 32 additions & 24 deletions adsdocmatch/match_w_metadata.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import os
import time
from datetime import date
import re
import csv

Expand Down Expand Up @@ -203,32 +202,41 @@ def match_to_pub(self, filename):
"""
try:
with open(filename, 'rb') as arxiv_fp:
metadata = self.ARXIV_PARSER.parse(arxiv_fp)
comments = ' '.join(metadata.get('comments', []))
# extract doi out of comments if there are any
match = self.re_doi.search(comments)
if match:
metadata['doi'] = match.group(1)
else:
doi = metadata.get('properties', {}).get('DOI', None)
if doi:
metadata['doi'] = doi.replace('doi:', '')
match_doctype = None
title = metadata.get('title')
# check title for erratum
match = self.re_doctype_errata.search(title)
if match:
match_doctype = ['erratum']
else:
match = self.re_doctype_bookreview.search(title)
journal = filename.strip().split('/')[-5]
if journal == 'ArXiv':
metadata = self.ARXIV_PARSER.parse(arxiv_fp)
comments = ' '.join(metadata.get('comments', []))
# extract doi out of comments if there are any
match = self.re_doi.search(comments)
if match:
match_doctype = ['bookreview']
metadata['doi'] = match.group(1)
else:
# check both comments and title for thesis
match = self.re_doctype_thesis.search("%s %s"%(comments, title))
doi = metadata.get('properties', {}).get('DOI', None)
if doi:
metadata['doi'] = doi.replace('doi:', '')
match_doctype = None
title = metadata.get('title')
# check title for erratum
match = self.re_doctype_errata.search(title)
if match:
match_doctype = ['erratum']
else:
match = self.re_doctype_bookreview.search(title)
if match:
match_doctype = ['phdthesis', 'mastersthesis']
must_match = any(ads_archive_class in arxiv_class for arxiv_class in metadata.get('class', []) for ads_archive_class in self.MUST_MATCH)
match_doctype = ['bookreview']
else:
# check both comments and title for thesis
match = self.re_doctype_thesis.search("%s %s" % (comments, title))
if match:
match_doctype = ['phdthesis', 'mastersthesis']
must_match = any(ads_archive_class in arxiv_class for arxiv_class in metadata.get('class', []) for ads_archive_class in self.MUST_MATCH)
else:
metadata = get_pub_metadata(arxiv_fp.read())
# remove the doi, since in this case, oracle thinks it is the publication doi
metadata.pop("doi", None)
match_doctype = None
must_match = False
comments = ''
oracle_matches = self.ORACLE_UTIL.get_matches(metadata, 'eprint', must_match, match_doctype)
# before proceeding see if this arXiv article's class is among the ones that ADS archives the
# published version if available
Expand Down
47 changes: 47 additions & 0 deletions adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2312/08579
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
<record>
<header>
<identifier>oai:arXiv.org:2312.08579</identifier>
<datestamp>2023-12-15</datestamp>
<setSpec>cs</setSpec>
<setSpec>physics:astro-ph</setSpec>
</header>
<metadata>
<oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
<dc:title>Identifying Planetary Names in Astronomy Papers: A Multi-Step Approach</dc:title>
<dc:creator>Shapurian, Golnaz</dc:creator>
<dc:creator>Kurtz, Michael J</dc:creator>
<dc:creator>Accomazzi, Alberto</dc:creator>
<dc:subject>Computer Science - Computation and Language</dc:subject>
<dc:subject>Astrophysics - Instrumentation and Methods for Astrophysics</dc:subject>
<dc:subject>Computer Science - Machine Learning</dc:subject>
<dc:description> The automatic identification of planetary feature names in astronomy
publications presents numerous challenges. These features include craters,
defined as roughly circular depressions resulting from impact or volcanic
activity; dorsas, which are elongate raised structures or wrinkle ridges; and
lacus, small irregular patches of dark, smooth material on the Moon, referred
to as &quot;lake&quot; (Planetary Names Working Group, n.d.). Many feature names overlap
with places or people's names that they are named after, for example, Syria,
Tempe, Einstein, and Sagan, to name a few (U.S. Geological Survey, n.d.). Some
feature names have been used in many contexts, for instance, Apollo, which can
refer to mission, program, sample, astronaut, seismic, seismometers, core, era,
data, collection, instrument, and station, in addition to the crater on the
Moon. Some feature names can appear in the text as adjectives, like the lunar
craters Black, Green, and White. Some feature names in other contexts serve as
directions, like craters West and South on the Moon. Additionally, some
features share identical names across different celestial bodies, requiring
disambiguation, such as the Adams crater, which exists on both the Moon and
Mars. We present a multi-step pipeline combining rule-based filtering,
statistical relevance analysis, part-of-speech (POS) tagging, named entity
recognition (NER) model, hybrid keyword harvesting, knowledge graph (KG)
matching, and inference with a locally installed large language model (LLM) to
reliably identify planetary names despite these challenges. When evaluated on a
dataset of astronomy papers from the Astrophysics Data System (ADS), this
methodology achieves an F1-score over 0.97 in disambiguating planetary feature
names.
</dc:description>
<dc:date>2023-12-13</dc:date>
<dc:type>text</dc:type>
<dc:identifier>http://arxiv.org/abs/2312.08579</dc:identifier>
</oai_dc:dc>
</metadata>
</record>
15 changes: 15 additions & 0 deletions adsdocmatch/tests/unittests/stubdata/text/L48/L48-23288.abs
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
Title: Native H2 exploration in the western Pyrenean
foothills
Authors: Lefeuvre, Nicolas; Truche, Laurent;
Donze, Frederic Victor; Ducoux, Maxime;
Barr&eacute;, Guillaume; Fakoury, Rose-Adeline;
Calassou, Sylvain; Gaucher, Eric
Journal: ESS Open Archive, id. essoar.10507102.1
Publication Date: 05/2021
Category: Earth Science
Origin: ESSOAR
DOI: 10.1002/essoar.10507102.1
Bibliographic Code: 2021esoar.10507102L

Abstract
Not Available
29 changes: 29 additions & 0 deletions adsdocmatch/tests/unittests/stubdata/text/L52/L52-28159.abs
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
Title: Wildfire influence on recent US pollution trends
Authors: Burke, Marshall; Childs, Marissa;
de la Cuesta, Brandon; Qiu, Minghao; Li, Jessica;
Gould, Carlos; Heft-Neal, Sam; Wara, Michael
Journal: EarthArXiv Preprint, id. X58667
Publication Date: 12/2022
Category: Earth Science
Origin: EAARX
Keywords: Environmental Health and Protection
DOI: 10.31223/x58667
Bibliographic Code: 2022EaArX...X58667B

Abstract
Steady improvements in ambient air quality in the US over the past
several decades have led to large public health benefits. However,
recent trends in PM2.5 concentrations, a key pollutant, have stagnated
or begun to reverse throughout much of the US. We quantify the
contribution of wildfire smoke to these trends and find that since 2016,
wildfire smoke has significantly slowed or reversed previous
improvements in average annual PM2.5 concentrations in two-thirds of US
states, eroding 23% of previous gains on average in those states
(equivalent to 3.6 years of air quality progress) and over 50% in
multiple western states. Smoke influence on trends in extreme PM2.5
concentrations is detectable by 2010, but remains concentrated primarily
in western states. Wildfire-driven increases in ambient PM2.5
concentrations are unregulated under current air pollution law, and,
absent additional intervention, wildfire's contribution to regional and
national air quality trends is likely to grow as the climate continues
to warm.
71 changes: 63 additions & 8 deletions adsdocmatch/tests/unittests/test_match_w_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ def test_match_to_pub_1(self):
'comment': 'No matches with Abstract, trying Title. No document was found in solr matching the request.'
}]
with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/1701.00200')
arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/'
matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '1701/00200')
self.assertEqual(len(matches), 1)
fields = matches[0].split('\t')
self.assertEqual(len(fields), 6)
Expand All @@ -112,7 +113,8 @@ def test_match_to_pub_2(self):
'comment': ''
}]
with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/1801.01021')
arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/'
matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '1801/01021')
self.assertEqual(len(matches), 1)
fields = matches[0].split('\t')
self.assertEqual(len(fields), 6)
Expand All @@ -135,7 +137,8 @@ def test_match_to_pub_3(self):
'comment': ''
}]
with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/0708.1752')
arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/'
matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '0708/1752')
self.assertEqual(len(matches), 1)
fields = matches[0].split('\t')
self.assertEqual(len(fields), 6)
Expand Down Expand Up @@ -165,7 +168,8 @@ def test_match_to_pub_4(self):
'comment': 'Matching doctype `phdthesis;mastersthesis`. Multi match: 2 of 2.'
}]
with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/2106.07251')
arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/'
matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '2106/07251')
self.assertEqual(len(matches), 2)
expected_values = [
['2021arXiv210607251P','2020PhDT........36P','Match','0.8989977',"{'abstract': None, 'title': 1.0, 'author': 1, 'year': 1}",'Matching doctype `phdthesis;mastersthesis`. Multi match: 1 of 2.'],
Expand All @@ -177,6 +181,57 @@ def test_match_to_pub_4(self):
for i in range(len(fields)):
self.assertEqual(fields[i], expected_value[i])

def test_match_to_earth_science_1(self):
    """Verify match_to_pub handles an earth-science record that exists both as an eprint and as a publication (ESSOAR fixture)."""
    abs_file = os.path.dirname(__file__) + '/stubdata/text/L48/L48-23288.abs'

    # first pass: the record is treated as an eprint and finds its published counterpart
    eprint_result = [{
        'source_bibcode': '2021esoar.10507102L',
        'matched_bibcode': '2021GGG....2209917L',
        'label': 'Match',
        'confidence': 0.9981487,
        'score': {'abstract': None, 'title': 0.95, 'author': 1, 'year': 1},
        'comment': ''
    }]
    with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=eprint_result):
        self.assertEqual(eprint_result, self.match_metadata.match_to_pub(abs_file))

    # second pass: the same record submitted as a publication yields no match
    pub_result = [{
        'source_bibcode': '2021esoar.10507102L',
        'matched_bibcode': '...................',
        'label': 'Not Match',
        'confidence': 0,
        'score': '',
        'comment': "No result from solr with DOI ['10.1002/essoar.10507102.1'] in pubnote. No matches with Abstract, trying Title. No document was found in solr matching the request."
    }]
    with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=pub_result):
        self.assertEqual(pub_result, self.match_metadata.match_to_pub(abs_file))

def test_match_to_earth_science_2(self):
    """Verify match_to_pub handles an earth-science record that exists both as an eprint and as a publication (EarthArXiv fixture)."""
    abs_file = os.path.dirname(__file__) + '/stubdata/text/L52/L52-28159.abs'

    # first pass: the record is treated as an eprint; oracle returns a low-confidence non-match
    eprint_result = [{
        'source_bibcode': '2022EaArX...X58667B',
        'matched_bibcode': '2023Natur.622..761B',
        'label': 'Not Match',
        'confidence': 0.0110576,
        'score': {'abstract': 0.88, 'title': 0.42, 'author': 1, 'year': 1},
        'comment': ''}]
    with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=eprint_result):
        self.assertEqual(eprint_result, self.match_metadata.match_to_pub(abs_file))

    # second pass: the same record submitted as a publication yields no match at all
    pub_result = [{
        'source_bibcode': '2022EaArX...X58667B',
        'matched_bibcode': '...................',
        'label': 'Not Match',
        'confidence': 0,
        'score': '',
        'comment': "No result from solr with DOI ['10.31223/x58667'] in pubnote. No matches with Abstract, trying Title. No document was found in solr matching the request."
    }]
    with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=pub_result):
        self.assertEqual(pub_result, self.match_metadata.match_to_pub(abs_file))

def test_batch_match_to_pub(self):
""" test batch mode of match_to_pub """

Expand All @@ -187,7 +242,7 @@ def test_batch_match_to_pub(self):
rerun_filename = os.path.abspath(os.path.join(stubdata_dir, config['DOCMATCHPIPELINE_RERUN_FILENAME']))

# create input file with list of eprint filenames
eprint_filenames = ['/2106.07251']
eprint_filenames = ['/ArXiv/oai/eprints/2106/07251']
with open(input_filename, "w") as f:
for filename in eprint_filenames:
f.write("%s\n"%(stubdata_dir+filename))
Expand Down Expand Up @@ -450,7 +505,7 @@ def test_process_match_to_pub_without_classic_output(self):
# create input file with list of eprint filenames
stubdata_dir = os.path.dirname(__file__) + '/stubdata'
input_filename = "%s%s" % (stubdata_dir, config['DOCMATCHPIPELINE_INPUT_FILENAME'])
eprint_filenames = ['/2106.07251']
eprint_filenames = ['/ArXiv/oai/eprints/2106/07251']
with open(input_filename, "w") as f:
for filename in eprint_filenames:
f.write("%s\n"%(stubdata_dir+filename))
Expand Down Expand Up @@ -491,7 +546,7 @@ def test_process_match_to_pub_with_classic_output(self):
# create input file with list of eprint filenames
stubdata_dir = os.path.dirname(__file__) + '/stubdata'
input_filename = "%s%s" % (stubdata_dir, config['DOCMATCHPIPELINE_INPUT_FILENAME'])
eprint_filenames = ['/2106.07251']
eprint_filenames = ['/ArXiv/oai/eprints/2106/07251']
with open(input_filename, "w") as f:
for filename in eprint_filenames:
f.write("%s\n" % (stubdata_dir + filename))
Expand Down Expand Up @@ -567,7 +622,7 @@ def test_write_results(self):
stubdata_dir = os.path.dirname(__file__) + '/stubdata'
result_filename = "%s%s" % (stubdata_dir, config['DOCMATCHPIPELINE_EPRINT_RESULT_FILENAME'])
rerun_filename = os.path.abspath(os.path.join(stubdata_dir, config['DOCMATCHPIPELINE_RERUN_FILENAME']))
eprint_filename = "%s%s"% (stubdata_dir, '/2305/03053')
eprint_filename = "%s%s"% (stubdata_dir, '/ArXiv/oai/eprints/2305/03053')
matches = self.match_metadata.process_results([{
'source_bibcode': '2023arXiv230503053S',
'status_flaw' : "got 502 for the last failed attempt -- shall be added to rerun list."}], '\t')
Expand Down
12 changes: 6 additions & 6 deletions adsdocmatch/tests/unittests/test_oracle_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,10 @@ def create_response(self, text):

def test_normalize_author_list(self):
""" """
eprint_filenames = ['/2106.07251', '/1701.00200', '/1801.01021', '/2106.07251']
stubdata_dir = os.path.dirname(__file__) + '/stubdata'
eprint_filenames = ['/2106/07251', '/1701/00200', '/1801/01021', '/2312/08579']
stubdata_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints'

expected_authors = ['Proxauf, B', 'Tang, X', 'Frey, K; Accomazzi, A', 'Proxauf, B']
expected_authors = ['Proxauf, B', 'Tang, X', 'Frey, K; Accomazzi, A', 'Shapurian, G; Kurtz, M; Accomazzi, A']
for filename, authors in zip(eprint_filenames, expected_authors):
fullpath = stubdata_dir + filename
with open(fullpath, 'rb') as arxiv_fp:
Expand Down Expand Up @@ -70,10 +70,10 @@ def test_normalize_author_list(self):

def test_extract_doi(self):
""" """
eprint_filenames = ['/2106.07251', '/1701.00200', '/1801.01021', '/2106.07251']
stubdata_dir = os.path.dirname(__file__) + '/stubdata'
eprint_filenames = ['/2106/07251', '/1701/00200', '/1801/01021', '/2312/08579']
stubdata_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints'

expected_dois = [['10.53846/goediss-8502'], None, ['10.3847/1538-4365/aab760'], ['10.53846/goediss-8502']]
expected_dois = [['10.53846/goediss-8502'], None, ['10.3847/1538-4365/aab760'], None]
for filename, doi in zip(eprint_filenames, expected_dois):
fullpath = stubdata_dir + filename
with open(fullpath, 'rb') as arxiv_fp:
Expand Down

0 comments on commit 8c8fb27

Please sign in to comment.