From ce92723c0d64ac954cedc798d44ccb8b84363cac Mon Sep 17 00:00:00 2001 From: golnazads <28757512+golnazads@users.noreply.github.com> Date: Sun, 17 Dec 2023 17:35:44 -0500 Subject: [PATCH] support earth science records that can appear both as eprint and pub --- adsdocmatch/match_w_metadata.py | 56 ++++++++------- .../oai/eprints/0708/1752} | 0 .../oai/eprints/1701/00200} | 0 .../oai/eprints/1801/01021} | 0 .../oai/eprints/2106/07251} | 0 .../stubdata/ArXiv/oai/eprints/2312/08579 | 47 ++++++++++++ .../unittests/stubdata/text/L48/L48-23288.abs | 15 ++++ .../unittests/stubdata/text/L52/L52-28159.abs | 29 ++++++++ .../tests/unittests/test_match_w_metadata.py | 71 ++++++++++++++++--- .../tests/unittests/test_oracle_util.py | 12 ++-- 10 files changed, 192 insertions(+), 38 deletions(-) rename adsdocmatch/tests/unittests/stubdata/{0708.1752 => ArXiv/oai/eprints/0708/1752} (100%) rename adsdocmatch/tests/unittests/stubdata/{1701.00200 => ArXiv/oai/eprints/1701/00200} (100%) rename adsdocmatch/tests/unittests/stubdata/{1801.01021 => ArXiv/oai/eprints/1801/01021} (100%) rename adsdocmatch/tests/unittests/stubdata/{2106.07251 => ArXiv/oai/eprints/2106/07251} (100%) create mode 100644 adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2312/08579 create mode 100644 adsdocmatch/tests/unittests/stubdata/text/L48/L48-23288.abs create mode 100644 adsdocmatch/tests/unittests/stubdata/text/L52/L52-28159.abs diff --git a/adsdocmatch/match_w_metadata.py b/adsdocmatch/match_w_metadata.py index e0068fe..f604f65 100644 --- a/adsdocmatch/match_w_metadata.py +++ b/adsdocmatch/match_w_metadata.py @@ -1,6 +1,5 @@ import os import time -from datetime import date import re import csv @@ -203,32 +202,41 @@ def match_to_pub(self, filename): """ try: with open(filename, 'rb') as arxiv_fp: - metadata = self.ARXIV_PARSER.parse(arxiv_fp) - comments = ' '.join(metadata.get('comments', [])) - # extract doi out of comments if there are any - match = self.re_doi.search(comments) - if match: - metadata['doi'] = match.group(1) - else: - doi = metadata.get('properties', {}).get('DOI', None) - if doi: - metadata['doi'] = doi.replace('doi:', '') - match_doctype = None - title = metadata.get('title') - # check title for erratum - match = self.re_doctype_errata.search(title) - if match: - match_doctype = ['erratum'] - else: - match = self.re_doctype_bookreview.search(title) + journal = filename.strip().split('/')[-5] + if journal == 'ArXiv': + metadata = self.ARXIV_PARSER.parse(arxiv_fp) + comments = ' '.join(metadata.get('comments', [])) + # extract doi out of comments if there are any + match = self.re_doi.search(comments) if match: - match_doctype = ['bookreview'] + metadata['doi'] = match.group(1) else: - # check both comments and title for thesis - match = self.re_doctype_thesis.search("%s %s"%(comments, title)) + doi = metadata.get('properties', {}).get('DOI', None) + if doi: + metadata['doi'] = doi.replace('doi:', '') + match_doctype = None + title = metadata.get('title') + # check title for erratum + match = self.re_doctype_errata.search(title) + if match: + match_doctype = ['erratum'] + else: + match = self.re_doctype_bookreview.search(title) if match: - match_doctype = ['phdthesis', 'mastersthesis'] - must_match = any(ads_archive_class in arxiv_class for arxiv_class in metadata.get('class', []) for ads_archive_class in self.MUST_MATCH) + match_doctype = ['bookreview'] + else: + # check both comments and title for thesis + match = self.re_doctype_thesis.search("%s %s" % (comments, title)) + if match: + match_doctype = ['phdthesis', 'mastersthesis'] + must_match = any(ads_archive_class in arxiv_class for arxiv_class in metadata.get('class', []) for ads_archive_class in self.MUST_MATCH) + else: + metadata = get_pub_metadata(arxiv_fp.read()) + # remove the doi, since in this case, oracle thinks it is the publication doi + metadata.pop("doi", None) + match_doctype = None + must_match = False + comments = '' oracle_matches = self.ORACLE_UTIL.get_matches(metadata, 'eprint', must_match, match_doctype) # before proceeding see if this arXiv article's class is among the ones that ADS archives the # published version if available diff --git a/adsdocmatch/tests/unittests/stubdata/0708.1752 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/0708/1752 similarity index 100% rename from adsdocmatch/tests/unittests/stubdata/0708.1752 rename to adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/0708/1752 diff --git a/adsdocmatch/tests/unittests/stubdata/1701.00200 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1701/00200 similarity index 100% rename from adsdocmatch/tests/unittests/stubdata/1701.00200 rename to adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1701/00200 diff --git a/adsdocmatch/tests/unittests/stubdata/1801.01021 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1801/01021 similarity index 100% rename from adsdocmatch/tests/unittests/stubdata/1801.01021 rename to adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1801/01021 diff --git a/adsdocmatch/tests/unittests/stubdata/2106.07251 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2106/07251 similarity index 100% rename from adsdocmatch/tests/unittests/stubdata/2106.07251 rename to adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2106/07251 diff --git a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2312/08579 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2312/08579 new file mode 100644 index 0000000..38fef15 --- /dev/null +++ b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2312/08579 @@ -0,0 +1,47 @@ + +
+ oai:arXiv.org:2312.08579 + 2023-12-15 + cs + physics:astro-ph +
+ + + Identifying Planetary Names in Astronomy Papers: A Multi-Step Approach + Shapurian, Golnaz + Kurtz, Michael J + Accomazzi, Alberto + Computer Science - Computation and Language + Astrophysics - Instrumentation and Methods for Astrophysics + Computer Science - Machine Learning + The automatic identification of planetary feature names in astronomy +publications presents numerous challenges. These features include craters, +defined as roughly circular depressions resulting from impact or volcanic +activity; dorsas, which are elongate raised structures or wrinkle ridges; and +lacus, small irregular patches of dark, smooth material on the Moon, referred +to as "lake" (Planetary Names Working Group, n.d.). Many feature names overlap +with places or people's names that they are named after, for example, Syria, +Tempe, Einstein, and Sagan, to name a few (U.S. Geological Survey, n.d.). Some +feature names have been used in many contexts, for instance, Apollo, which can +refer to mission, program, sample, astronaut, seismic, seismometers, core, era, +data, collection, instrument, and station, in addition to the crater on the +Moon. Some feature names can appear in the text as adjectives, like the lunar +craters Black, Green, and White. Some feature names in other contexts serve as +directions, like craters West and South on the Moon. Additionally, some +features share identical names across different celestial bodies, requiring +disambiguation, such as the Adams crater, which exists on both the Moon and +Mars. We present a multi-step pipeline combining rule-based filtering, +statistical relevance analysis, part-of-speech (POS) tagging, named entity +recognition (NER) model, hybrid keyword harvesting, knowledge graph (KG) +matching, and inference with a locally installed large language model (LLM) to +reliably identify planetary names despite these challenges. When evaluated on a +dataset of astronomy papers from the Astrophysics Data System (ADS), this +methodology achieves an F1-score over 0.97 in disambiguating planetary feature +names. + + 2023-12-13 + text + http://arxiv.org/abs/2312.08579 + + +
diff --git a/adsdocmatch/tests/unittests/stubdata/text/L48/L48-23288.abs b/adsdocmatch/tests/unittests/stubdata/text/L48/L48-23288.abs new file mode 100644 index 0000000..96de5e8 --- /dev/null +++ b/adsdocmatch/tests/unittests/stubdata/text/L48/L48-23288.abs @@ -0,0 +1,15 @@ +Title: Native H2 exploration in the western Pyrenean + foothills +Authors: Lefeuvre, Nicolas; Truche, Laurent; + Donze, Frederic Victor; Ducoux, Maxime; + Barré, Guillaume; Fakoury, Rose-Adeline; + Calassou, Sylvain; Gaucher, Eric +Journal: ESS Open Archive, id. essoar.10507102.1 +Publication Date: 05/2021 +Category: Earth Science +Origin: ESSOAR +DOI: 10.1002/essoar.10507102.1 +Bibliographic Code: 2021esoar.10507102L + + Abstract +Not Available diff --git a/adsdocmatch/tests/unittests/stubdata/text/L52/L52-28159.abs b/adsdocmatch/tests/unittests/stubdata/text/L52/L52-28159.abs new file mode 100644 index 0000000..c400a36 --- /dev/null +++ b/adsdocmatch/tests/unittests/stubdata/text/L52/L52-28159.abs @@ -0,0 +1,29 @@ +Title: Wildfire influence on recent US pollution trends +Authors: Burke, Marshall; Childs, Marissa; + de la Cuesta, Brandon; Qiu, Minghao; Li, Jessica; + Gould, Carlos; Heft-Neal, Sam; Wara, Michael +Journal: EarthArXiv Preprint, id. X58667 +Publication Date: 12/2022 +Category: Earth Science +Origin: EAARX +Keywords: Environmental Health and Protection +DOI: 10.31223/x58667 +Bibliographic Code: 2022EaArX...X58667B + + Abstract +Steady improvements in ambient air quality in the US over the past +several decades have led to large public health benefits. However, +recent trends in PM2.5 concentrations, a key pollutant, have stagnated +or begun to reverse throughout much of the US. We quantify the +contribution of wildfire smoke to these trends and find that since 2016, +wildfire smoke has significantly slowed or reversed previous +improvements in average annual PM2.5 concentrations in two-thirds of US +states, eroding 23% of previous gains on average in those states +(equivalent to 3.6 years of air quality progress) and over 50% in +multiple western states. Smoke influence on trends in extreme PM2.5 +concentrations is detectable by 2010, but remains concentrated primarily +in western states. Wildfire-driven increases in ambient PM2.5 +concentrations are unregulated under current air pollution law, and, +absent additional intervention, wildfire's contribution to regional and +national air quality trends is likely to grow as the climate continues +to warm. diff --git a/adsdocmatch/tests/unittests/test_match_w_metadata.py b/adsdocmatch/tests/unittests/test_match_w_metadata.py index af45610..0fe0070 100644 --- a/adsdocmatch/tests/unittests/test_match_w_metadata.py +++ b/adsdocmatch/tests/unittests/test_match_w_metadata.py @@ -89,7 +89,8 @@ def test_match_to_pub_1(self): 'comment': 'No matches with Abstract, trying Title. No document was found in solr matching the request.' }] with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value): - matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/1701.00200') + arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/' + matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '1701/00200') self.assertEqual(len(matches), 1) fields = matches[0].split('\t') self.assertEqual(len(fields), 6) @@ -112,7 +113,8 @@ def test_match_to_pub_2(self): 'comment': '' }] with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value): - matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/1801.01021') + arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/' + matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '1801/01021') self.assertEqual(len(matches), 1) fields = matches[0].split('\t') self.assertEqual(len(fields), 6) @@ -135,7 +137,8 @@ def test_match_to_pub_3(self): 'comment': '' }] with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value): - matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/0708.1752') + arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/' + matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '0708/1752') self.assertEqual(len(matches), 1) fields = matches[0].split('\t') self.assertEqual(len(fields), 6) @@ -165,7 +168,8 @@ def test_match_to_pub_4(self): 'comment': 'Matching doctype `phdthesis;mastersthesis`. Multi match: 2 of 2.' }] with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value): - matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/2106.07251') + arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/' + matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '2106/07251') self.assertEqual(len(matches), 2) expected_values = [ ['2021arXiv210607251P','2020PhDT........36P','Match','0.8989977',"{'abstract': None, 'title': 1.0, 'author': 1, 'year': 1}",'Matching doctype `phdthesis;mastersthesis`. Multi match: 1 of 2.'], @@ -177,6 +181,57 @@ def test_match_to_pub_4(self): for i in range(len(fields)): self.assertEqual(fields[i], expected_value[i]) + def test_match_to_earth_science_1(self): + """ test match_to_ earth science records that can appear both as eprint and publication """ + return_value = [{ + 'source_bibcode': '2021esoar.10507102L', + 'matched_bibcode': '2021GGG....2209917L', + 'label': 'Match', + 'confidence': 0.9981487, + 'score': {'abstract': None, 'title': 0.95, 'author': 1, 'year': 1}, + 'comment': '' + }] + # treat as eprint + with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value): + self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/text/L48/L48-23288.abs')) + + return_value = [{ + 'source_bibcode': '2021esoar.10507102L', + 'matched_bibcode': '...................', + 'label': 'Not Match', + 'confidence': 0, + 'score': '', + 'comment': "No result from solr with DOI ['10.1002/essoar.10507102.1'] in pubnote. No matches with Abstract, trying Title. No document was found in solr matching the request." + }] + # now send it as publication + with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value): + self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/text/L48/L48-23288.abs')) + + def test_match_to_earth_science_2(self): + """ test match_to_ earth science records that can appear both as eprint and publication """ + return_value = [{ + 'source_bibcode': '2022EaArX...X58667B', + 'matched_bibcode': '2023Natur.622..761B', + 'label': 'Not Match', + 'confidence': 0.0110576, + 'score': {'abstract': 0.88, 'title': 0.42, 'author': 1, 'year': 1}, + 'comment': ''}] + # treat as eprint + with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value): + self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/text/L52/L52-28159.abs')) + + return_value = [{ + 'source_bibcode': '2022EaArX...X58667B', + 'matched_bibcode': '...................', + 'label': 'Not Match', + 'confidence': 0, + 'score': '', + 'comment': "No result from solr with DOI ['10.31223/x58667'] in pubnote. No matches with Abstract, trying Title. No document was found in solr matching the request." + }] + # now send it as publication + with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value): + self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/text/L52/L52-28159.abs')) + def test_batch_match_to_pub(self): """ test batch mode of match_to_pub """ @@ -187,7 +242,7 @@ def test_batch_match_to_pub(self): rerun_filename = os.path.abspath(os.path.join(stubdata_dir, config['DOCMATCHPIPELINE_RERUN_FILENAME'])) # create input file with list of eprint filenames - eprint_filenames = ['/2106.07251'] + eprint_filenames = ['/ArXiv/oai/eprints/2106/07251'] with open(input_filename, "w") as f: for filename in eprint_filenames: f.write("%s\n"%(stubdata_dir+filename)) @@ -450,7 +505,7 @@ def test_process_match_to_pub_without_classic_output(self): # create input file with list of eprint filenames stubdata_dir = os.path.dirname(__file__) + '/stubdata' input_filename = "%s%s" % (stubdata_dir, config['DOCMATCHPIPELINE_INPUT_FILENAME']) - eprint_filenames = ['/2106.07251'] + eprint_filenames = ['/ArXiv/oai/eprints/2106/07251'] with open(input_filename, "w") as f: for filename in eprint_filenames: f.write("%s\n"%(stubdata_dir+filename)) @@ -491,7 +546,7 @@ def test_process_match_to_pub_with_classic_output(self): # create input file with list of eprint filenames stubdata_dir = os.path.dirname(__file__) + '/stubdata' input_filename = "%s%s" % (stubdata_dir, config['DOCMATCHPIPELINE_INPUT_FILENAME']) - eprint_filenames = ['/2106.07251'] + eprint_filenames = ['/ArXiv/oai/eprints/2106/07251'] with open(input_filename, "w") as f: for filename in eprint_filenames: f.write("%s\n" % (stubdata_dir + filename)) @@ -567,7 +622,7 @@ def test_write_results(self): stubdata_dir = os.path.dirname(__file__) + '/stubdata' result_filename = "%s%s" % (stubdata_dir, config['DOCMATCHPIPELINE_EPRINT_RESULT_FILENAME']) rerun_filename = os.path.abspath(os.path.join(stubdata_dir, config['DOCMATCHPIPELINE_RERUN_FILENAME'])) - eprint_filename = "%s%s"% (stubdata_dir, '/2305/03053') + eprint_filename = "%s%s"% (stubdata_dir, '/ArXiv/oai/eprints/2305/03053') matches = self.match_metadata.process_results([{ 'source_bibcode': '2023arXiv230503053S', 'status_flaw' : "got 502 for the last failed attempt -- shall be added to rerun list."}], '\t') diff --git a/adsdocmatch/tests/unittests/test_oracle_util.py b/adsdocmatch/tests/unittests/test_oracle_util.py index 238af84..f326498 100644 --- a/adsdocmatch/tests/unittests/test_oracle_util.py +++ b/adsdocmatch/tests/unittests/test_oracle_util.py @@ -32,10 +32,10 @@ def create_response(self, text): def test_normalize_author_list(self): """ """ - eprint_filenames = ['/2106.07251', '/1701.00200', '/1801.01021', '/2106.07251'] - stubdata_dir = os.path.dirname(__file__) + '/stubdata' + eprint_filenames = ['/2106/07251', '/1701/00200', '/1801/01021', '/2312/08579'] + stubdata_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints' - expected_authors = ['Proxauf, B', 'Tang, X', 'Frey, K; Accomazzi, A', 'Proxauf, B'] + expected_authors = ['Proxauf, B', 'Tang, X', 'Frey, K; Accomazzi, A', 'Shapurian, G; Kurtz, M; Accomazzi, A'] for filename, authors in zip(eprint_filenames, expected_authors): fullpath = stubdata_dir + filename with open(fullpath, 'rb') as arxiv_fp: @@ -70,10 +70,10 @@ def test_normalize_author_list(self): def test_extract_doi(self): """ """ - eprint_filenames = ['/2106.07251', '/1701.00200', '/1801.01021', '/2106.07251'] - stubdata_dir = os.path.dirname(__file__) + '/stubdata' + eprint_filenames = ['/2106/07251', '/1701/00200', '/1801/01021', '/2312/08579'] + stubdata_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints' - expected_dois = [['10.53846/goediss-8502'], None, ['10.3847/1538-4365/aab760'], ['10.53846/goediss-8502']] + expected_dois = [['10.53846/goediss-8502'], None, ['10.3847/1538-4365/aab760'], None] for filename, doi in zip(eprint_filenames, expected_dois): fullpath = stubdata_dir + filename with open(fullpath, 'rb') as arxiv_fp: