support earth science records that can appear both as eprint and pub
golnazads committed Dec 17, 2023
1 parent 232d829 commit ce92723
Showing 10 changed files with 192 additions and 38 deletions.
56 changes: 32 additions & 24 deletions adsdocmatch/match_w_metadata.py
@@ -1,6 +1,5 @@
import os
import time
from datetime import date
import re
import csv

@@ -203,32 +202,41 @@ def match_to_pub(self, filename):
"""
try:
with open(filename, 'rb') as arxiv_fp:
-metadata = self.ARXIV_PARSER.parse(arxiv_fp)
-comments = ' '.join(metadata.get('comments', []))
-# extract doi out of comments if there are any
-match = self.re_doi.search(comments)
-if match:
-    metadata['doi'] = match.group(1)
-else:
-    doi = metadata.get('properties', {}).get('DOI', None)
-    if doi:
-        metadata['doi'] = doi.replace('doi:', '')
-match_doctype = None
-title = metadata.get('title')
-# check title for erratum
-match = self.re_doctype_errata.search(title)
-if match:
-    match_doctype = ['erratum']
-else:
-    match = self.re_doctype_bookreview.search(title)
-    if match:
-        match_doctype = ['bookreview']
-    else:
-        # check both comments and title for thesis
-        match = self.re_doctype_thesis.search("%s %s"%(comments, title))
-        if match:
-            match_doctype = ['phdthesis', 'mastersthesis']
-must_match = any(ads_archive_class in arxiv_class for arxiv_class in metadata.get('class', []) for ads_archive_class in self.MUST_MATCH)
+journal = filename.strip().split('/')[-5]
+if journal == 'ArXiv':
+    metadata = self.ARXIV_PARSER.parse(arxiv_fp)
+    comments = ' '.join(metadata.get('comments', []))
+    # extract doi out of comments if there are any
+    match = self.re_doi.search(comments)
+    if match:
+        metadata['doi'] = match.group(1)
+    else:
+        doi = metadata.get('properties', {}).get('DOI', None)
+        if doi:
+            metadata['doi'] = doi.replace('doi:', '')
+    match_doctype = None
+    title = metadata.get('title')
+    # check title for erratum
+    match = self.re_doctype_errata.search(title)
+    if match:
+        match_doctype = ['erratum']
+    else:
+        match = self.re_doctype_bookreview.search(title)
+        if match:
+            match_doctype = ['bookreview']
+        else:
+            # check both comments and title for thesis
+            match = self.re_doctype_thesis.search("%s %s" % (comments, title))
+            if match:
+                match_doctype = ['phdthesis', 'mastersthesis']
+    must_match = any(ads_archive_class in arxiv_class for arxiv_class in metadata.get('class', []) for ads_archive_class in self.MUST_MATCH)
+else:
+    metadata = get_pub_metadata(arxiv_fp.read())
+    # remove the doi, since in this case, oracle thinks it is the publication doi
+    metadata.pop("doi", None)
+    match_doctype = None
+    must_match = False
+    comments = ''
oracle_matches = self.ORACLE_UTIL.get_matches(metadata, 'eprint', must_match, match_doctype)
# before proceeding see if this arXiv article's class is among the ones that ADS archives the
# published version if available
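
In short, the change keys off the directory layout: the fifth path component from the end of the input filename names the collection, so records under ArXiv/oai/eprints/ keep going through the arXiv OAI parser, while anything else (such as the Earth-science .abs stubs added below) is read as publisher metadata, with its DOI dropped so oracle does not mistake it for the published DOI. A minimal sketch of that dispatch; the absolute path prefix and the helper name source_from_path are illustrative, not part of the codebase:

def source_from_path(filename):
    # the component five places from the end of the path names the collection
    return filename.strip().split('/')[-5]

# hypothetical absolute paths, mirroring the stub data layout used by the tests
eprint_file = '/data/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2312/08579'
pub_file = '/data/adsdocmatch/tests/unittests/stubdata/text/L48/L48-23288.abs'

assert source_from_path(eprint_file) == 'ArXiv'  # goes through ARXIV_PARSER
assert source_from_path(pub_file) != 'ArXiv'     # falls through to get_pub_metadata
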
47 changes: 47 additions & 0 deletions adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2312/08579
@@ -0,0 +1,47 @@
<record>
<header>
<identifier>oai:arXiv.org:2312.08579</identifier>
<datestamp>2023-12-15</datestamp>
<setSpec>cs</setSpec>
<setSpec>physics:astro-ph</setSpec>
</header>
<metadata>
<oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
<dc:title>Identifying Planetary Names in Astronomy Papers: A Multi-Step Approach</dc:title>
<dc:creator>Shapurian, Golnaz</dc:creator>
<dc:creator>Kurtz, Michael J</dc:creator>
<dc:creator>Accomazzi, Alberto</dc:creator>
<dc:subject>Computer Science - Computation and Language</dc:subject>
<dc:subject>Astrophysics - Instrumentation and Methods for Astrophysics</dc:subject>
<dc:subject>Computer Science - Machine Learning</dc:subject>
<dc:description> The automatic identification of planetary feature names in astronomy
publications presents numerous challenges. These features include craters,
defined as roughly circular depressions resulting from impact or volcanic
activity; dorsas, which are elongate raised structures or wrinkle ridges; and
lacus, small irregular patches of dark, smooth material on the Moon, referred
to as &quot;lake&quot; (Planetary Names Working Group, n.d.). Many feature names overlap
with places or people's names that they are named after, for example, Syria,
Tempe, Einstein, and Sagan, to name a few (U.S. Geological Survey, n.d.). Some
feature names have been used in many contexts, for instance, Apollo, which can
refer to mission, program, sample, astronaut, seismic, seismometers, core, era,
data, collection, instrument, and station, in addition to the crater on the
Moon. Some feature names can appear in the text as adjectives, like the lunar
craters Black, Green, and White. Some feature names in other contexts serve as
directions, like craters West and South on the Moon. Additionally, some
features share identical names across different celestial bodies, requiring
disambiguation, such as the Adams crater, which exists on both the Moon and
Mars. We present a multi-step pipeline combining rule-based filtering,
statistical relevance analysis, part-of-speech (POS) tagging, named entity
recognition (NER) model, hybrid keyword harvesting, knowledge graph (KG)
matching, and inference with a locally installed large language model (LLM) to
reliably identify planetary names despite these challenges. When evaluated on a
dataset of astronomy papers from the Astrophysics Data System (ADS), this
methodology achieves an F1-score over 0.97 in disambiguating planetary feature
names.
</dc:description>
<dc:date>2023-12-13</dc:date>
<dc:type>text</dc:type>
<dc:identifier>http://arxiv.org/abs/2312.08579</dc:identifier>
</oai_dc:dc>
</metadata>
</record>
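
The stub above is a plain OAI-PMH Dublin Core record; in the pipeline it is read by ARXIV_PARSER. Purely to illustrate the layout, a standard-library sketch (read_oai_record is a hypothetical helper, not part of the repo) could pull the main fields like this:

import xml.etree.ElementTree as ET

# namespaces declared on the oai_dc:dc element of the stub record
NS = {
    'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
    'dc': 'http://purl.org/dc/elements/1.1/',
}

def read_oai_record(path):
    root = ET.parse(path).getroot()  # <record>; no default namespace in the stub
    dc = root.find('./metadata/oai_dc:dc', NS)
    return {
        'identifier': root.findtext('./header/identifier'),
        'title': dc.findtext('dc:title', namespaces=NS),
        'authors': [c.text for c in dc.findall('dc:creator', NS)],
        'subjects': [s.text for s in dc.findall('dc:subject', NS)],
        'abstract': (dc.findtext('dc:description', namespaces=NS) or '').strip(),
    }
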
15 changes: 15 additions & 0 deletions adsdocmatch/tests/unittests/stubdata/text/L48/L48-23288.abs
@@ -0,0 +1,15 @@
Title: Native H2 exploration in the western Pyrenean
foothills
Authors: Lefeuvre, Nicolas; Truche, Laurent;
Donze, Frederic Victor; Ducoux, Maxime;
Barr&eacute;, Guillaume; Fakoury, Rose-Adeline;
Calassou, Sylvain; Gaucher, Eric
Journal: ESS Open Archive, id. essoar.10507102.1
Publication Date: 05/2021
Category: Earth Science
Origin: ESSOAR
DOI: 10.1002/essoar.10507102.1
Bibliographic Code: 2021esoar.10507102L

Abstract
Not Available
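
These .abs stubs follow the classic ADS abstract layout: a block of "Key: value" header fields whose values may wrap onto the following lines, then a blank line and the abstract body. In the pipeline they go through get_pub_metadata (the else branch in the diff above); the rough sketch below (read_abs is a hypothetical name, not the repo's parser) only illustrates that layout:

def read_abs(path):
    # split the wrapped "Key: value" header block from the abstract on the first blank line
    with open(path, encoding='utf-8') as fp:
        header, _, rest = fp.read().partition('\n\n')
    fields, key = {}, None
    for line in header.splitlines():
        if ':' in line:
            key, _, value = line.partition(':')
            fields[key.strip()] = value.strip()
        elif key:
            # continuation of the previous field, e.g. wrapped author lists
            fields[key] += ' ' + line.strip()
    # the remainder is the "Abstract" heading followed by the abstract text (here "Not Available")
    abstract = rest.partition('Abstract')[2].strip()
    return fields, abstract
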
29 changes: 29 additions & 0 deletions adsdocmatch/tests/unittests/stubdata/text/L52/L52-28159.abs
@@ -0,0 +1,29 @@
Title: Wildfire influence on recent US pollution trends
Authors: Burke, Marshall; Childs, Marissa;
de la Cuesta, Brandon; Qiu, Minghao; Li, Jessica;
Gould, Carlos; Heft-Neal, Sam; Wara, Michael
Journal: EarthArXiv Preprint, id. X58667
Publication Date: 12/2022
Category: Earth Science
Origin: EAARX
Keywords: Environmental Health and Protection
DOI: 10.31223/x58667
Bibliographic Code: 2022EaArX...X58667B

Abstract
Steady improvements in ambient air quality in the US over the past
several decades have led to large public health benefits. However,
recent trends in PM2.5 concentrations, a key pollutant, have stagnated
or begun to reverse throughout much of the US. We quantify the
contribution of wildfire smoke to these trends and find that since 2016,
wildfire smoke has significantly slowed or reversed previous
improvements in average annual PM2.5 concentrations in two-thirds of US
states, eroding 23% of previous gains on average in those states
(equivalent to 3.6 years of air quality progress) and over 50% in
multiple western states. Smoke influence on trends in extreme PM2.5
concentrations is detectable by 2010, but remains concentrated primarily
in western states. Wildfire-driven increases in ambient PM2.5
concentrations are unregulated under current air pollution law, and,
absent additional intervention, wildfire's contribution to regional and
national air quality trends is likely to grow as the climate continues
to warm.
71 changes: 63 additions & 8 deletions adsdocmatch/tests/unittests/test_match_w_metadata.py
@@ -89,7 +89,8 @@ def test_match_to_pub_1(self):
'comment': 'No matches with Abstract, trying Title. No document was found in solr matching the request.'
}]
with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
-matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/1701.00200')
+arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/'
+matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '1701/00200')
self.assertEqual(len(matches), 1)
fields = matches[0].split('\t')
self.assertEqual(len(fields), 6)
@@ -112,7 +113,8 @@ def test_match_to_pub_2(self):
'comment': ''
}]
with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
-matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/1801.01021')
+arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/'
+matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '1801/01021')
self.assertEqual(len(matches), 1)
fields = matches[0].split('\t')
self.assertEqual(len(fields), 6)
@@ -135,7 +137,8 @@ def test_match_to_pub_3(self):
'comment': ''
}]
with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
-matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/0708.1752')
+arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/'
+matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '0708/1752')
self.assertEqual(len(matches), 1)
fields = matches[0].split('\t')
self.assertEqual(len(fields), 6)
@@ -165,7 +168,8 @@ def test_match_to_pub_4(self):
'comment': 'Matching doctype `phdthesis;mastersthesis`. Multi match: 2 of 2.'
}]
with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
-matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/2106.07251')
+arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/'
+matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '2106/07251')
self.assertEqual(len(matches), 2)
expected_values = [
['2021arXiv210607251P','2020PhDT........36P','Match','0.8989977',"{'abstract': None, 'title': 1.0, 'author': 1, 'year': 1}",'Matching doctype `phdthesis;mastersthesis`. Multi match: 1 of 2.'],
@@ -177,6 +181,57 @@ def test_batch_match_to_pub(self):
for i in range(len(fields)):
self.assertEqual(fields[i], expected_value[i])

def test_match_to_earth_science_1(self):
""" test match_to_ earth science records that can appear both as eprint and publication """
return_value = [{
'source_bibcode': '2021esoar.10507102L',
'matched_bibcode': '2021GGG....2209917L',
'label': 'Match',
'confidence': 0.9981487,
'score': {'abstract': None, 'title': 0.95, 'author': 1, 'year': 1},
'comment': ''
}]
# treat as eprint
with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/text/L48/L48-23288.abs'))

return_value = [{
'source_bibcode': '2021esoar.10507102L',
'matched_bibcode': '...................',
'label': 'Not Match',
'confidence': 0,
'score': '',
'comment': "No result from solr with DOI ['10.1002/essoar.10507102.1'] in pubnote. No matches with Abstract, trying Title. No document was found in solr matching the request."
}]
# now send it as publication
with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/text/L48/L48-23288.abs'))

def test_match_to_earth_science_2(self):
""" test match_to_ earth science records that can appear both as eprint and publication """
return_value = [{
'source_bibcode': '2022EaArX...X58667B',
'matched_bibcode': '2023Natur.622..761B',
'label': 'Not Match',
'confidence': 0.0110576,
'score': {'abstract': 0.88, 'title': 0.42, 'author': 1, 'year': 1},
'comment': ''}]
# treat as eprint
with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/text/L52/L52-28159.abs'))

return_value = [{
'source_bibcode': '2022EaArX...X58667B',
'matched_bibcode': '...................',
'label': 'Not Match',
'confidence': 0,
'score': '',
'comment': "No result from solr with DOI ['10.31223/x58667'] in pubnote. No matches with Abstract, trying Title. No document was found in solr matching the request."
}]
# now send it as publication
with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/text/L52/L52-28159.abs'))

def test_batch_match_to_pub(self):
""" test batch mode of match_to_pub """

@@ -187,7 +242,7 @@ def test_batch_match_to_pub(self):
rerun_filename = os.path.abspath(os.path.join(stubdata_dir, config['DOCMATCHPIPELINE_RERUN_FILENAME']))

# create input file with list of eprint filenames
-eprint_filenames = ['/2106.07251']
+eprint_filenames = ['/ArXiv/oai/eprints/2106/07251']
with open(input_filename, "w") as f:
for filename in eprint_filenames:
f.write("%s\n"%(stubdata_dir+filename))
@@ -450,7 +505,7 @@ def test_process_match_to_pub_without_classic_output(self):
# create input file with list of eprint filenames
stubdata_dir = os.path.dirname(__file__) + '/stubdata'
input_filename = "%s%s" % (stubdata_dir, config['DOCMATCHPIPELINE_INPUT_FILENAME'])
-eprint_filenames = ['/2106.07251']
+eprint_filenames = ['/ArXiv/oai/eprints/2106/07251']
with open(input_filename, "w") as f:
for filename in eprint_filenames:
f.write("%s\n"%(stubdata_dir+filename))
@@ -491,7 +546,7 @@ def test_process_match_to_pub_with_classic_output(self):
# create input file with list of eprint filenames
stubdata_dir = os.path.dirname(__file__) + '/stubdata'
input_filename = "%s%s" % (stubdata_dir, config['DOCMATCHPIPELINE_INPUT_FILENAME'])
-eprint_filenames = ['/2106.07251']
+eprint_filenames = ['/ArXiv/oai/eprints/2106/07251']
with open(input_filename, "w") as f:
for filename in eprint_filenames:
f.write("%s\n" % (stubdata_dir + filename))
@@ -567,7 +622,7 @@ def test_write_results(self):
stubdata_dir = os.path.dirname(__file__) + '/stubdata'
result_filename = "%s%s" % (stubdata_dir, config['DOCMATCHPIPELINE_EPRINT_RESULT_FILENAME'])
rerun_filename = os.path.abspath(os.path.join(stubdata_dir, config['DOCMATCHPIPELINE_RERUN_FILENAME']))
-eprint_filename = "%s%s"% (stubdata_dir, '/2305/03053')
+eprint_filename = "%s%s"% (stubdata_dir, '/ArXiv/oai/eprints/2305/03053')
matches = self.match_metadata.process_results([{
'source_bibcode': '2023arXiv230503053S',
'status_flaw' : "got 502 for the last failed attempt -- shall be added to rerun list."}], '\t')
12 changes: 6 additions & 6 deletions adsdocmatch/tests/unittests/test_oracle_util.py
@@ -32,10 +32,10 @@ def create_response(self, text):

def test_normalize_author_list(self):
""" """
-eprint_filenames = ['/2106.07251', '/1701.00200', '/1801.01021', '/2106.07251']
-stubdata_dir = os.path.dirname(__file__) + '/stubdata'
+eprint_filenames = ['/2106/07251', '/1701/00200', '/1801/01021', '/2312/08579']
+stubdata_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints'

-expected_authors = ['Proxauf, B', 'Tang, X', 'Frey, K; Accomazzi, A', 'Proxauf, B']
+expected_authors = ['Proxauf, B', 'Tang, X', 'Frey, K; Accomazzi, A', 'Shapurian, G; Kurtz, M; Accomazzi, A']
for filename, authors in zip(eprint_filenames, expected_authors):
fullpath = stubdata_dir + filename
with open(fullpath, 'rb') as arxiv_fp:
@@ -70,10 +70,10 @@ def test_normalize_author_list(self):

def test_extract_doi(self):
""" """
-eprint_filenames = ['/2106.07251', '/1701.00200', '/1801.01021', '/2106.07251']
-stubdata_dir = os.path.dirname(__file__) + '/stubdata'
+eprint_filenames = ['/2106/07251', '/1701/00200', '/1801/01021', '/2312/08579']
+stubdata_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints'

-expected_dois = [['10.53846/goediss-8502'], None, ['10.3847/1538-4365/aab760'], ['10.53846/goediss-8502']]
+expected_dois = [['10.53846/goediss-8502'], None, ['10.3847/1538-4365/aab760'], None]
for filename, doi in zip(eprint_filenames, expected_dois):
fullpath = stubdata_dir + filename
with open(fullpath, 'rb') as arxiv_fp:
