From a1863961f31105e885d1fea778f52c461932b963 Mon Sep 17 00:00:00 2001
From: golnazads <28757512+golnazads@users.noreply.github.com>
Date: Mon, 18 Dec 2023 17:05:53 -0500
Subject: [PATCH] arXiv source metadata is switched, and pyingest dropped
---
adsdocmatch/match_w_metadata.py | 88 +++++++++++--------
adsdocmatch/pub_parser/__init__.py | 5 ++
.../stubdata/ArXiv/oai/eprints/0708/1752 | 40 ---------
.../stubdata/ArXiv/oai/eprints/1701/00200 | 26 ------
.../stubdata/ArXiv/oai/eprints/1801/01021 | 33 -------
.../stubdata/ArXiv/oai/eprints/2106/07251 | 43 ---------
.../stubdata/ArXiv/oai/eprints/2312/08579 | 47 ----------
.../stubdata/{text/L48 => }/L48-23288.abs | 0
.../stubdata/{text/L52 => }/L52-28159.abs | 0
.../tests/unittests/stubdata/X01-74270.abs | 32 +++++++
.../tests/unittests/stubdata/X10-50737.abs | 17 ++++
.../tests/unittests/stubdata/X11-85081.abs | 22 +++++
.../tests/unittests/stubdata/X18-10145.abs | 35 ++++++++
.../tests/unittests/stubdata/X21-91237.abs | 30 +++++++
.../tests/unittests/stubdata/X23-45511.abs | 39 ++++++++
.../tests/unittests/test_match_w_metadata.py | 29 +++---
.../tests/unittests/test_oracle_util.py | 14 +--
requirements.txt | 1 -
18 files changed, 252 insertions(+), 249 deletions(-)
delete mode 100644 adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/0708/1752
delete mode 100644 adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1701/00200
delete mode 100644 adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1801/01021
delete mode 100644 adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2106/07251
delete mode 100644 adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2312/08579
rename adsdocmatch/tests/unittests/stubdata/{text/L48 => }/L48-23288.abs (100%)
rename adsdocmatch/tests/unittests/stubdata/{text/L52 => }/L52-28159.abs (100%)
create mode 100644 adsdocmatch/tests/unittests/stubdata/X01-74270.abs
create mode 100644 adsdocmatch/tests/unittests/stubdata/X10-50737.abs
create mode 100644 adsdocmatch/tests/unittests/stubdata/X11-85081.abs
create mode 100644 adsdocmatch/tests/unittests/stubdata/X18-10145.abs
create mode 100644 adsdocmatch/tests/unittests/stubdata/X21-91237.abs
create mode 100644 adsdocmatch/tests/unittests/stubdata/X23-45511.abs
diff --git a/adsdocmatch/match_w_metadata.py b/adsdocmatch/match_w_metadata.py
index f604f65..1597a7d 100644
--- a/adsdocmatch/match_w_metadata.py
+++ b/adsdocmatch/match_w_metadata.py
@@ -6,7 +6,6 @@
from adsdocmatch.pub_parser import get_pub_metadata
from adsdocmatch.oracle_util import OracleUtil
from adsdocmatch.matchable_status import matchable_status
-from pyingest.parsers.arxiv import ArxivParser
from adsputils import setup_logging, load_config
proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), "../"))
@@ -30,7 +29,6 @@ class MatchMetadata():
process_pub_bibstem = {}
- ARXIV_PARSER = ArxivParser()
ORACLE_UTIL = OracleUtil()
def get_input_filenames(self, filename):
@@ -85,6 +83,45 @@ def process_pub_metadata(self, metadata):
self.process_pub_bibstem[bibstem] = 1 if status == True else (0 if status == False else -1)
return self.process_pub_bibstem[bibstem]
+ def parse_arXiv_comments(self, metadata):
+ """
+ Work out the doi, the must-match flag and the candidate doctypes that match_to_pub passes on to ORACLE_UTIL.get_matches.
+ :param metadata: dict of parsed record fields; 'arXiv_comments', 'properties', 'title' and 'class' are read, and 'doi' may be set or removed in place
+ :return: tuple (metadata, comments, must_match, match_doctype); match_doctype is None or a list of doctype names, and without arXiv comments the last three are '', False and None
+ """
+ comments = metadata.get('arXiv_comments', '')
+ if comments:  # NOTE(review): when Comments is absent get_pub_metadata stores '' here, so such arXiv records appear to take the final else branch -- confirm intended
+ # extract doi out of comments if there are any
+ match = self.re_doi.search(comments)
+ if match:
+ metadata['doi'] = match.group(1)
+ else:
+ doi = metadata.get('properties', {}).get('DOI', None)
+ if doi:
+ metadata['doi'] = doi.replace('doi:', '')
+ match_doctype = None
+ title = metadata.get('title')
+ # check title for erratum
+ match = self.re_doctype_errata.search(title)
+ if match:
+ match_doctype = ['erratum']
+ else:
+ match = self.re_doctype_bookreview.search(title)
+ if match:
+ match_doctype = ['bookreview']
+ else:
+ # check both comments and title for thesis
+ match = self.re_doctype_thesis.search("%s %s" % (comments, title))
+ if match:
+ match_doctype = ['phdthesis', 'mastersthesis']
+ must_match = any(ads_archive_class in arxiv_class for arxiv_class in metadata.get('class', []) for ads_archive_class in self.MUST_MATCH)  # true when any MUST_MATCH entry is `in` one of the record's 'class' entries
+ else:
+ metadata.pop("doi", None)  # remove the doi, since in this case, oracle thinks it is the publication doi
+ match_doctype = None
+ must_match = False
+ comments = ''
+ return metadata, comments, must_match, match_doctype
+
def write_results(self, result_filename, matches, metadata_filename, rerun_filename):
"""
@@ -202,41 +239,8 @@ def match_to_pub(self, filename):
"""
try:
with open(filename, 'rb') as arxiv_fp:
- journal = filename.strip().split('/')[-5]
- if journal == 'ArXiv':
- metadata = self.ARXIV_PARSER.parse(arxiv_fp)
- comments = ' '.join(metadata.get('comments', []))
- # extract doi out of comments if there are any
- match = self.re_doi.search(comments)
- if match:
- metadata['doi'] = match.group(1)
- else:
- doi = metadata.get('properties', {}).get('DOI', None)
- if doi:
- metadata['doi'] = doi.replace('doi:', '')
- match_doctype = None
- title = metadata.get('title')
- # check title for erratum
- match = self.re_doctype_errata.search(title)
- if match:
- match_doctype = ['erratum']
- else:
- match = self.re_doctype_bookreview.search(title)
- if match:
- match_doctype = ['bookreview']
- else:
- # check both comments and title for thesis
- match = self.re_doctype_thesis.search("%s %s" % (comments, title))
- if match:
- match_doctype = ['phdthesis', 'mastersthesis']
- must_match = any(ads_archive_class in arxiv_class for arxiv_class in metadata.get('class', []) for ads_archive_class in self.MUST_MATCH)
- else:
- metadata = get_pub_metadata(arxiv_fp.read())
- # remove the doi, since in this case, oracle thinks it is the publication doi
- metadata.pop("doi", None)
- match_doctype = None
- must_match = False
- comments = ''
+ metadata = get_pub_metadata(arxiv_fp.read())
+ metadata, comments, must_match, match_doctype = self.parse_arXiv_comments(metadata)
oracle_matches = self.ORACLE_UTIL.get_matches(metadata, 'eprint', must_match, match_doctype)
# before proceeding see if this arXiv article's class is among the ones that ADS archives the
# published version if available
@@ -456,3 +460,13 @@ def process_match_to_pub(self, path):
combined_output_filename = "%s%s" % (path, config.get('DOCMATCHPIPELINE_EPRINT_COMBINED_FILENAME', 'default'))
self.merge_classic_docmatch_results(classic_matched_filename, result_filename, combined_output_filename)
return combined_output_filename
+
+if __name__ == '__main__':  # manual run: prints the results of match_to_pub and match_to_arXiv for one hard-coded absolute /proj/ads path
+ print(MatchMetadata().match_to_pub('/proj/ads/abstracts/gen/text/L48/L48-23288.abs'))
+ print(MatchMetadata().match_to_arXiv('/proj/ads/abstracts/gen/text/L48/L48-23288.abs'))
+
+ '''
+/proj/ads/abstracts/gen/text/L52/L52-28159.abs
+/proj/ads/abstracts/gen/text/L48/L48-23288.abs
+/proj/ads/abstracts/sources/ArXiv/oai/arXiv.org/2306/02768
+'''  # bare string literal listing further input paths; nothing above reads it
\ No newline at end of file
diff --git a/adsdocmatch/pub_parser/__init__.py b/adsdocmatch/pub_parser/__init__.py
index 122eefc..1516ba0 100644
--- a/adsdocmatch/pub_parser/__init__.py
+++ b/adsdocmatch/pub_parser/__init__.py
@@ -51,6 +51,7 @@ def as_needed(article):
("Publication Date", "pubdate"),
("Bibliographic Code", "bibcode"),
("DOI", "doi"),
+ ("arXiv_comments", "arXiv_comments")
]
return_record = {}
for src_key, dest_key in field_mappings:
@@ -131,4 +132,8 @@ def get_pub_metadata(contents):
switch_date = article['Publication Date'].split('/')
article['Publication Date'] = switch_date[1] + '/' + switch_date[0]
+ if 'Origin' in fields_found_in_file:
+ if article['Origin'] == 'ARXIV':
+ article['arXiv_comments'] = article.get('Comments', '')
+
return as_needed(article)
diff --git a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/0708/1752 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/0708/1752
deleted file mode 100644
index 81fa884..0000000
--- a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/0708/1752
+++ /dev/null
@@ -1,40 +0,0 @@
-
-
- oai:arXiv.org:0708.1752
- 2010-04-06
- physics:astro-ph
-
-
-
- Validation of the new Hipparcos reduction
- van Leeuwen, F.
- Astrophysics
- Context.A new reduction of the astrometric data as produced by the Hipparcos
-mission has been published, claiming accuracies for nearly all stars brighter
-than magnitude Hp = 8 to be better, by up to a factor 4, than in the original
-catalogue. Aims.The new Hipparcos astrometric catalogue is checked for the
-quality of the data and the consistency of the formal errors as well as the
-possible presence of error correlations. The differences with the earlier
-publication are explained. Methods. The internal errors are followed through
-the reduction process, and the external errors are investigated on the basis of
-a comparison with radio observations of a small selection of stars, and the
-distribution of negative parallaxes. Error correlation levels are investigated
-and the reduction by more than a factor 10 as obtained in the new catalogue is
-explained. Results.The formal errors on the parallaxes for the new catalogue
-are confirmed. The presence of a small amount of additional noise, though
-unlikely, cannot be ruled out. Conclusions. The new reduction of the Hipparcos
-astrometric data provides an improvement by a factor 2.2 in the total weight
-compared to the catalogue published in 1997, and provides much improved data
-for a wide range of studies on stellar luminosities and local galactic
-kinematics.
-
- Comment: 12 pages, 19 figures, accepted for publication by Astronomy and
- Astrophysics
- 2007-08-13
- text
- http://arxiv.org/abs/0708.1752
- Astron.Astrophys.474:653-664,2007
- doi:10.1051/0004-6361:20078357
-
-
-
diff --git a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1701/00200 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1701/00200
deleted file mode 100644
index ca23e71..0000000
--- a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1701/00200
+++ /dev/null
@@ -1,26 +0,0 @@
-
-
- oai:arXiv.org:1701.00200
- 2017-08-22
- math
-
-
-
- Post-Lie algebra structures on the Witt algebra
- Tang, Xiaomin
- Mathematics - Rings and Algebras
- 17A30, 17A42, 17B60, 18D50
- In this paper, we characterize the graded post-Lie algebra structures and a
-class of shifting post-Lie algebra structures on the Witt algebra. We obtain
-some new Lie algebras and give a class of their modules. As an application, the
-homogeneous Rota-Baxter operators and a class of non-homogeneous Rota-Baxter
-operators of weight $1$ on the Witt algebra are studied.
-
- Comment: 24 pages
- 2017-01-01
- 2017-08-19
- text
- http://arxiv.org/abs/1701.00200
-
-
-
diff --git a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1801/01021 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1801/01021
deleted file mode 100644
index 433aa25..0000000
--- a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1801/01021
+++ /dev/null
@@ -1,33 +0,0 @@
-
-
- oai:arXiv.org:1801.01021
- 2018-05-23
- cs
- physics:astro-ph
-
-
-
- The Unified Astronomy Thesaurus: Semantic Metadata for Astronomy and
- Astrophysics
- Frey, Katie
- Accomazzi, Alberto
- Astrophysics - Instrumentation and Methods for Astrophysics
- Computer Science - Digital Libraries
- Several different controlled vocabularies have been developed and used by the
-astronomical community, each designed to serve a specific need and a specific
-group. The Unified Astronomy Thesaurus (UAT) attempts to provide a highly
-structured controlled vocabulary that will be relevant and useful across the
-entire discipline, regardless of content or platform. As two major use cases
-for the UAT include classifying articles and data, we examine the UAT in
-comparison with the Astronomical Subject Keywords used by major publications
-and the JWST Science Keywords used by STScI's Astronomer's Proposal Tool.
-
- Comment: Submitted to the Astrophysical Journal Supplements, 10 pages, 3
- tables
- 2018-01-03
- text
- http://arxiv.org/abs/1801.01021
- doi:10.3847/1538-4365/aab760
-
-
-
diff --git a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2106/07251 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2106/07251
deleted file mode 100644
index 2544460..0000000
--- a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2106/07251
+++ /dev/null
@@ -1,43 +0,0 @@
-
-
- oai:arXiv.org:2106.07251
- 2023-01-25
- physics:astro-ph
-
-
-
- Observations of large-scale solar flows
- Proxauf, Bastian
- Astrophysics - Solar and Stellar Astrophysics
- In this dissertation, several components of large-scale solar flows are
-studied observationally: solar equatorial Rossby waves (waves of radial
-vorticity), large-scale convection, and surface flows around active regions.
-Maps of horizontal flows are derived from photospheric observations by the
-Helioseismic and Magnetic Imager (HMI) aboard the Solar Dynamics Observatory
-(SDO) using two different techniques: granulation tracking and local
-helioseismology. First, the eigenfunctions of solar Rossby waves are measured
-from helioseismic ring-diagram flow maps with a correlation method and a
-spectral analysis. Down to $9$ Mm below the surface, the dependence of the
-radial vorticity with radius $r$ is consistent with $r^{m-1}$, for a given
-longitudinal wavenumber $m$. At the surface, the eigenfunctions are
-complex-valued. The real part decreases away from the equator and switches sign
-around $\pm 20-30^\circ$. The imaginary part is small, but nonzero, and may be
-due to wave attenuation. This may have implications for the transport of
-angular momentum in the latitudinal direction. Second, we revisit previous
-measurements of power spectra of longitudinal velocities near the solar
-surface, obtained from time-distance and ring-diagram helioseismology. Several
-issues in these past helioseismic analyses are identified and corrected. The
-corrections are not sufficient to remove the discrepancy between the
-measurements. I thus present new velocity power spectra from granulation
-tracking and ring-diagram helioseismology. The two new measurements are close
-to each other near the solar surface, and the corresponding kinetic energy
-decreases with increasing spatial scale.
-
- Comment: PhD thesis, 97 pages
- 2021-06-14
- text
- http://arxiv.org/abs/2106.07251
- doi:10.53846/goediss-8502
-
-
-
diff --git a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2312/08579 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2312/08579
deleted file mode 100644
index 38fef15..0000000
--- a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2312/08579
+++ /dev/null
@@ -1,47 +0,0 @@
-
-
- oai:arXiv.org:2312.08579
- 2023-12-15
- cs
- physics:astro-ph
-
-
-
- Identifying Planetary Names in Astronomy Papers: A Multi-Step Approach
- Shapurian, Golnaz
- Kurtz, Michael J
- Accomazzi, Alberto
- Computer Science - Computation and Language
- Astrophysics - Instrumentation and Methods for Astrophysics
- Computer Science - Machine Learning
- The automatic identification of planetary feature names in astronomy
-publications presents numerous challenges. These features include craters,
-defined as roughly circular depressions resulting from impact or volcanic
-activity; dorsas, which are elongate raised structures or wrinkle ridges; and
-lacus, small irregular patches of dark, smooth material on the Moon, referred
-to as "lake" (Planetary Names Working Group, n.d.). Many feature names overlap
-with places or people's names that they are named after, for example, Syria,
-Tempe, Einstein, and Sagan, to name a few (U.S. Geological Survey, n.d.). Some
-feature names have been used in many contexts, for instance, Apollo, which can
-refer to mission, program, sample, astronaut, seismic, seismometers, core, era,
-data, collection, instrument, and station, in addition to the crater on the
-Moon. Some feature names can appear in the text as adjectives, like the lunar
-craters Black, Green, and White. Some feature names in other contexts serve as
-directions, like craters West and South on the Moon. Additionally, some
-features share identical names across different celestial bodies, requiring
-disambiguation, such as the Adams crater, which exists on both the Moon and
-Mars. We present a multi-step pipeline combining rule-based filtering,
-statistical relevance analysis, part-of-speech (POS) tagging, named entity
-recognition (NER) model, hybrid keyword harvesting, knowledge graph (KG)
-matching, and inference with a locally installed large language model (LLM) to
-reliably identify planetary names despite these challenges. When evaluated on a
-dataset of astronomy papers from the Astrophysics Data System (ADS), this
-methodology achieves an F1-score over 0.97 in disambiguating planetary feature
-names.
-
- 2023-12-13
- text
- http://arxiv.org/abs/2312.08579
-
-
-
diff --git a/adsdocmatch/tests/unittests/stubdata/text/L48/L48-23288.abs b/adsdocmatch/tests/unittests/stubdata/L48-23288.abs
similarity index 100%
rename from adsdocmatch/tests/unittests/stubdata/text/L48/L48-23288.abs
rename to adsdocmatch/tests/unittests/stubdata/L48-23288.abs
diff --git a/adsdocmatch/tests/unittests/stubdata/text/L52/L52-28159.abs b/adsdocmatch/tests/unittests/stubdata/L52-28159.abs
similarity index 100%
rename from adsdocmatch/tests/unittests/stubdata/text/L52/L52-28159.abs
rename to adsdocmatch/tests/unittests/stubdata/L52-28159.abs
diff --git a/adsdocmatch/tests/unittests/stubdata/X01-74270.abs b/adsdocmatch/tests/unittests/stubdata/X01-74270.abs
new file mode 100644
index 0000000..1134f87
--- /dev/null
+++ b/adsdocmatch/tests/unittests/stubdata/X01-74270.abs
@@ -0,0 +1,32 @@
+Title: Validation of the new Hipparcos reduction
+Authors: van Leeuwen, F.
+Journal: eprint arXiv:0708.1752
+Publication Date: 08/2007
+Comments: 12 pages, 19 figures, accepted for publication by Astronomy and Astrophysics;
+ Astron.Astrophys.474:653-664,2007;
+ doi:10.1051/0004-6361:20078357
+Origin: ARXIV
+Keywords: Astrophysics
+Bibliographic Code: 2007arXiv0708.1752V
+
+ Abstract
+Context.A new reduction of the astrometric data as produced by the
+Hipparcos mission has been published, claiming accuracies for nearly all
+stars brighter than magnitude Hp = 8 to be better, by up to a factor 4,
+than in the original catalogue. Aims.The new Hipparcos astrometric
+catalogue is checked for the quality of the data and the consistency of
+the formal errors as well as the possible presence of error
+correlations. The differences with the earlier publication are
+explained. Methods. The internal errors are followed through the
+reduction process, and the external errors are investigated on the basis
+of a comparison with radio observations of a small selection of stars,
+and the distribution of negative parallaxes. Error correlation levels
+are investigated and the reduction by more than a factor 10 as obtained
+in the new catalogue is explained. Results.The formal errors on the
+parallaxes for the new catalogue are confirmed. The presence of a small
+amount of additional noise, though unlikely, cannot be ruled out.
+Conclusions. The new reduction of the Hipparcos astrometric data
+provides an improvement by a factor 2.2 in the total weight compared to
+the catalogue published in 1997, and provides much improved data for a
+wide range of studies on stellar luminosities and local galactic
+kinematics.
diff --git a/adsdocmatch/tests/unittests/stubdata/X10-50737.abs b/adsdocmatch/tests/unittests/stubdata/X10-50737.abs
new file mode 100644
index 0000000..d2f4766
--- /dev/null
+++ b/adsdocmatch/tests/unittests/stubdata/X10-50737.abs
@@ -0,0 +1,17 @@
+Title: Post-Lie algebra structures on the Witt algebra
+Authors: Tang, Xiaomin
+Journal: eprint arXiv:1701.00200
+Publication Date: 01/2017
+Comments: 24 pages
+Origin: ARXIV
+Keywords: Mathematics - Rings and Algebras, 17A30, 17A42,
+ 17B60, 18D50
+Bibliographic Code: 2017arXiv170100200T
+
+ Abstract
+In this paper, we characterize the graded post-Lie algebra structures
+and a class of shifting post-Lie algebra structures on the Witt algebra.
+We obtain some new Lie algebras and give a class of their modules. As an
+application, the homogeneous Rota-Baxter operators and a class of
+non-homogeneous Rota-Baxter operators of weight $1$ on the Witt algebra
+are studied.
diff --git a/adsdocmatch/tests/unittests/stubdata/X11-85081.abs b/adsdocmatch/tests/unittests/stubdata/X11-85081.abs
new file mode 100644
index 0000000..a5835ad
--- /dev/null
+++ b/adsdocmatch/tests/unittests/stubdata/X11-85081.abs
@@ -0,0 +1,22 @@
+Title: The Unified Astronomy Thesaurus: Semantic Metadata
+ for Astronomy and Astrophysics
+Authors: Frey, Katie; Accomazzi, Alberto
+Journal: eprint arXiv:1801.01021
+Publication Date: 01/2018
+Comments: Submitted to the Astrophysical Journal Supplements, 10 pages, 3 tables;
+ doi:10.3847/1538-4365/aab760
+Origin: ARXIV
+Keywords: Astrophysics - Instrumentation and Methods for Astrophysics,
+ Computer Science - Digital Libraries
+Bibliographic Code: 2018arXiv180101021F
+
+ Abstract
+Several different controlled vocabularies have been developed and used
+by the astronomical community, each designed to serve a specific need
+and a specific group. The Unified Astronomy Thesaurus (UAT) attempts to
+provide a highly structured controlled vocabulary that will be relevant
+and useful across the entire discipline, regardless of content or
+platform. As two major use cases for the UAT include classifying
+articles and data, we examine the UAT in comparison with the
+Astronomical Subject Keywords used by major publications and the JWST
+Science Keywords used by STScI's Astronomer's Proposal Tool.
diff --git a/adsdocmatch/tests/unittests/stubdata/X18-10145.abs b/adsdocmatch/tests/unittests/stubdata/X18-10145.abs
new file mode 100644
index 0000000..34a51c2
--- /dev/null
+++ b/adsdocmatch/tests/unittests/stubdata/X18-10145.abs
@@ -0,0 +1,35 @@
+Title: Observations of large-scale solar flows
+Authors: Proxauf, Bastian
+Journal: eprint arXiv:2106.07251
+Publication Date: 06/2021
+Comments: PhD thesis, 97 pages; doi:10.53846/goediss-8502
+Origin: ARXIV
+Keywords: Astrophysics - Solar and Stellar Astrophysics
+Bibliographic Code: 2021arXiv210607251P
+
+ Abstract
+In this dissertation, several components of large-scale solar flows are
+studied observationally: solar equatorial Rossby waves (waves of radial
+vorticity), large-scale convection, and surface flows around active
+regions. Maps of horizontal flows are derived from photospheric
+observations by the Helioseismic and Magnetic Imager (HMI) aboard the
+Solar Dynamics Observatory (SDO) using two different techniques:
+granulation tracking and local helioseismology. First, the
+eigenfunctions of solar Rossby waves are measured from helioseismic
+ring-diagram flow maps with a correlation method and a spectral
+analysis. Down to $9$ Mm below the surface, the dependence of the radial
+vorticity with radius $r$ is consistent with $r^{m-1}$, for a given
+longitudinal wavenumber $m$. At the surface, the eigenfunctions are
+complex-valued. The real part decreases away from the equator and
+switches sign around $\pm 20-30^\circ$. The imaginary part is small, but
+nonzero, and may be due to wave attenuation. This may have implications
+for the transport of angular momentum in the latitudinal direction.
+Second, we revisit previous measurements of power spectra of
+longitudinal velocities near the solar surface, obtained from
+time-distance and ring-diagram helioseismology. Several issues in these
+past helioseismic analyses are identified and corrected. The corrections
+are not sufficient to remove the discrepancy between the measurements. I
+thus present new velocity power spectra from granulation tracking and
+ring-diagram helioseismology. The two new measurements are close to each
+other near the solar surface, and the corresponding kinetic energy
+decreases with increasing spatial scale.
diff --git a/adsdocmatch/tests/unittests/stubdata/X21-91237.abs b/adsdocmatch/tests/unittests/stubdata/X21-91237.abs
new file mode 100644
index 0000000..87e99ef
--- /dev/null
+++ b/adsdocmatch/tests/unittests/stubdata/X21-91237.abs
@@ -0,0 +1,30 @@
+Title: ZipIt! Merging Models from Different Tasks without
+ Training
+Authors: Stoica, George; Bolya, Daniel; Bjorner, Jakob;
+ Hearn, Taylor; Hoffman, Judy
+Journal: eprint arXiv:2305.03053
+Publication Date: 05/2023
+Origin: ARXIV
+Keywords: Computer Science - Computer Vision and Pattern Recognition,
+ Computer Science - Machine Learning
+Bibliographic Code: 2023arXiv230503053S
+
+ Abstract
+Typical deep visual recognition models are capable of performing the one
+task they were trained on. In this paper, we tackle the extremely
+difficult problem of combining completely distinct models with different
+initializations, each solving a separate task, into one multi-task model
+without any additional training. Prior work in model merging permutes
+one model to the space of the other then adds them together. While this
+works for models trained on the same task, we find that this fails to
+account for the differences in models trained on disjoint tasks. Thus,
+we introduce "ZipIt!", a general method for merging two arbitrary models
+of the same architecture that incorporates two simple strategies. First,
+in order to account for features that aren't shared between models, we
+expand the model merging problem to additionally allow for merging
+features within each model by defining a general "zip" operation.
+Second, we add support for partially zipping the models up until a
+specified layer, naturally creating a multi-head model. We find that
+these two changes combined account for a staggering 20-60% improvement
+over prior work, making the merging of models trained on disjoint tasks
+feasible.
diff --git a/adsdocmatch/tests/unittests/stubdata/X23-45511.abs b/adsdocmatch/tests/unittests/stubdata/X23-45511.abs
new file mode 100644
index 0000000..19f4b69
--- /dev/null
+++ b/adsdocmatch/tests/unittests/stubdata/X23-45511.abs
@@ -0,0 +1,39 @@
+Title: Identifying Planetary Names in Astronomy Papers: A
+ Multi-Step Approach
+Authors: Shapurian, Golnaz; Kurtz, Michael J;
+ Accomazzi, Alberto
+Journal: eprint arXiv:2312.08579
+Publication Date: 12/2023
+Origin: ARXIV
+Keywords: Computer Science - Computation and Language,
+ Astrophysics - Instrumentation and Methods for Astrophysics,
+ Computer Science - Machine Learning
+Bibliographic Code: 2023arXiv231208579S
+
+ Abstract
+The automatic identification of planetary feature names in astronomy
+publications presents numerous challenges. These features include
+craters, defined as roughly circular depressions resulting from impact
+or volcanic activity; dorsas, which are elongate raised structures or
+wrinkle ridges; and lacus, small irregular patches of dark, smooth
+material on the Moon, referred to as "lake" (Planetary Names Working
+Group, n.d.). Many feature names overlap with places or people's names
+that they are named after, for example, Syria, Tempe, Einstein, and
+Sagan, to name a few (U.S. Geological Survey, n.d.). Some feature names
+have been used in many contexts, for instance, Apollo, which can refer
+to mission, program, sample, astronaut, seismic, seismometers, core,
+era, data, collection, instrument, and station, in addition to the
+crater on the Moon. Some feature names can appear in the text as
+adjectives, like the lunar craters Black, Green, and White. Some feature
+names in other contexts serve as directions, like craters West and South
+on the Moon. Additionally, some features share identical names across
+different celestial bodies, requiring disambiguation, such as the Adams
+crater, which exists on both the Moon and Mars. We present a multi-step
+pipeline combining rule-based filtering, statistical relevance analysis,
+part-of-speech (POS) tagging, named entity recognition (NER) model,
+hybrid keyword harvesting, knowledge graph (KG) matching, and inference
+with a locally installed large language model (LLM) to reliably identify
+planetary names despite these challenges. When evaluated on a dataset of
+astronomy papers from the Astrophysics Data System (ADS), this
+methodology achieves an F1-score over 0.97 in disambiguating planetary
+feature names.
diff --git a/adsdocmatch/tests/unittests/test_match_w_metadata.py b/adsdocmatch/tests/unittests/test_match_w_metadata.py
index 0fe0070..f80e347 100644
--- a/adsdocmatch/tests/unittests/test_match_w_metadata.py
+++ b/adsdocmatch/tests/unittests/test_match_w_metadata.py
@@ -89,8 +89,7 @@ def test_match_to_pub_1(self):
'comment': 'No matches with Abstract, trying Title. No document was found in solr matching the request.'
}]
with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
- arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/'
- matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '1701/00200')
+ matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/X10-50737.abs')
self.assertEqual(len(matches), 1)
fields = matches[0].split('\t')
self.assertEqual(len(fields), 6)
@@ -113,8 +112,7 @@ def test_match_to_pub_2(self):
'comment': ''
}]
with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
- arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/'
- matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '1801/01021')
+ matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/X11-85081.abs')
self.assertEqual(len(matches), 1)
fields = matches[0].split('\t')
self.assertEqual(len(fields), 6)
@@ -137,8 +135,7 @@ def test_match_to_pub_3(self):
'comment': ''
}]
with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
- arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/'
- matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '0708/1752')
+ matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/X01-74270.abs')
self.assertEqual(len(matches), 1)
fields = matches[0].split('\t')
self.assertEqual(len(fields), 6)
@@ -168,8 +165,7 @@ def test_match_to_pub_4(self):
'comment': 'Matching doctype `phdthesis;mastersthesis`. Multi match: 2 of 2.'
}]
with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
- arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/'
- matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '2106/07251')
+ matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/X21-91237.abs')
self.assertEqual(len(matches), 2)
expected_values = [
['2021arXiv210607251P','2020PhDT........36P','Match','0.8989977',"{'abstract': None, 'title': 1.0, 'author': 1, 'year': 1}",'Matching doctype `phdthesis;mastersthesis`. Multi match: 1 of 2.'],
@@ -193,7 +189,7 @@ def test_match_to_earth_science_1(self):
}]
# treat as eprint
with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
- self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/text/L48/L48-23288.abs'))
+ self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/L48-23288.abs'))
return_value = [{
'source_bibcode': '2021esoar.10507102L',
@@ -205,7 +201,8 @@ def test_match_to_earth_science_1(self):
}]
# now send it as publication
with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
- self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/text/L48/L48-23288.abs'))
+ print(self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/L48-23288.abs'))
+ self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/L48-23288.abs'))
def test_match_to_earth_science_2(self):
""" test match_to_ earth science records that can appear both as eprint and publication """
@@ -218,7 +215,7 @@ def test_match_to_earth_science_2(self):
'comment': ''}]
# treat as eprint
with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
- self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/text/L52/L52-28159.abs'))
+ self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/L52-28159.abs'))
return_value = [{
'source_bibcode': '2022EaArX...X58667B',
@@ -230,7 +227,7 @@ def test_match_to_earth_science_2(self):
}]
# now send it as publication
with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
- self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/text/L52/L52-28159.abs'))
+ self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/L52-28159.abs'))
def test_batch_match_to_pub(self):
""" test batch mode of match_to_pub """
@@ -242,7 +239,7 @@ def test_batch_match_to_pub(self):
rerun_filename = os.path.abspath(os.path.join(stubdata_dir, config['DOCMATCHPIPELINE_RERUN_FILENAME']))
# create input file with list of eprint filenames
- eprint_filenames = ['/ArXiv/oai/eprints/2106/07251']
+ eprint_filenames = ['/X21-91237.abs']
with open(input_filename, "w") as f:
for filename in eprint_filenames:
f.write("%s\n"%(stubdata_dir+filename))
@@ -505,7 +502,7 @@ def test_process_match_to_pub_without_classic_output(self):
# create input file with list of eprint filenames
stubdata_dir = os.path.dirname(__file__) + '/stubdata'
input_filename = "%s%s" % (stubdata_dir, config['DOCMATCHPIPELINE_INPUT_FILENAME'])
- eprint_filenames = ['/ArXiv/oai/eprints/2106/07251']
+ eprint_filenames = ['/X21-91237.abs']
with open(input_filename, "w") as f:
for filename in eprint_filenames:
f.write("%s\n"%(stubdata_dir+filename))
@@ -546,7 +543,7 @@ def test_process_match_to_pub_with_classic_output(self):
# create input file with list of eprint filenames
stubdata_dir = os.path.dirname(__file__) + '/stubdata'
input_filename = "%s%s" % (stubdata_dir, config['DOCMATCHPIPELINE_INPUT_FILENAME'])
- eprint_filenames = ['/ArXiv/oai/eprints/2106/07251']
+ eprint_filenames = ['/X21-91237.abs']
with open(input_filename, "w") as f:
for filename in eprint_filenames:
f.write("%s\n" % (stubdata_dir + filename))
@@ -622,7 +619,7 @@ def test_write_results(self):
stubdata_dir = os.path.dirname(__file__) + '/stubdata'
result_filename = "%s%s" % (stubdata_dir, config['DOCMATCHPIPELINE_EPRINT_RESULT_FILENAME'])
rerun_filename = os.path.abspath(os.path.join(stubdata_dir, config['DOCMATCHPIPELINE_RERUN_FILENAME']))
- eprint_filename = "%s%s"% (stubdata_dir, '/ArXiv/oai/eprints/2305/03053')
+ eprint_filename = "%s%s"% (stubdata_dir, '/X21-91237.abs')
matches = self.match_metadata.process_results([{
'source_bibcode': '2023arXiv230503053S',
'status_flaw' : "got 502 for the last failed attempt -- shall be added to rerun list."}], '\t')
diff --git a/adsdocmatch/tests/unittests/test_oracle_util.py b/adsdocmatch/tests/unittests/test_oracle_util.py
index f326498..f413d9e 100644
--- a/adsdocmatch/tests/unittests/test_oracle_util.py
+++ b/adsdocmatch/tests/unittests/test_oracle_util.py
@@ -10,6 +10,7 @@
from adsputils import load_config
from adsdocmatch.match_w_metadata import MatchMetadata
+from adsdocmatch.pub_parser import get_pub_metadata
config = load_config(proj_home=project_home)
@@ -32,14 +33,14 @@ def create_response(self, text):
def test_normalize_author_list(self):
""" """
- eprint_filenames = ['/2106/07251', '/1701/00200', '/1801/01021', '/2312/08579']
- stubdata_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints'
+ eprint_filenames = ['X18-10145.abs', 'X10-50737.abs', 'X11-85081.abs', 'X23-45511.abs']
+ stubdata_dir = os.path.dirname(__file__) + '/stubdata/'
expected_authors = ['Proxauf, B', 'Tang, X', 'Frey, K; Accomazzi, A', 'Shapurian, G; Kurtz, M; Accomazzi, A']
for filename, authors in zip(eprint_filenames, expected_authors):
fullpath = stubdata_dir + filename
with open(fullpath, 'rb') as arxiv_fp:
- metadata = self.match_metadata.ARXIV_PARSER.parse(arxiv_fp)
+ metadata = get_pub_metadata(arxiv_fp.read())
self.assertEqual(self.match_metadata.ORACLE_UTIL.normalize_author_list(metadata['authors']), authors)
# what if only lastnames are provided
@@ -70,14 +71,15 @@ def test_normalize_author_list(self):
def test_extract_doi(self):
""" """
- eprint_filenames = ['/2106/07251', '/1701/00200', '/1801/01021', '/2312/08579']
- stubdata_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints'
+ eprint_filenames = ['X18-10145.abs', 'X10-50737.abs', 'X11-85081.abs', 'X23-45511.abs']
+ stubdata_dir = os.path.dirname(__file__) + '/stubdata/'
expected_dois = [['10.53846/goediss-8502'], None, ['10.3847/1538-4365/aab760'], None]
for filename, doi in zip(eprint_filenames, expected_dois):
fullpath = stubdata_dir + filename
with open(fullpath, 'rb') as arxiv_fp:
- metadata = self.match_metadata.ARXIV_PARSER.parse(arxiv_fp)
+ metadata = get_pub_metadata(arxiv_fp.read())
+ metadata, _, _, _ = self.match_metadata.parse_arXiv_comments(metadata)
self.assertEqual(self.match_metadata.ORACLE_UTIL.extract_doi(metadata), doi)
def test_read_google_sheet(self):
diff --git a/requirements.txt b/requirements.txt
index 307dcc8..70571ef 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
-git+https://github.com/adsabs/adsabs-pyingest.git@v1.2.2
git+https://github.com/adsabs/ADSGoogleConnector.git@v0.0.3
adsputils==1.4.3
numpy==1.24.2