From a1863961f31105e885d1fea778f52c461932b963 Mon Sep 17 00:00:00 2001
From: golnazads <28757512+golnazads@users.noreply.github.com>
Date: Mon, 18 Dec 2023 17:05:53 -0500
Subject: [PATCH] arXiv source metadata is switched, and pyingest dropped

---
 adsdocmatch/match_w_metadata.py               | 88 +++++++++++--------
 adsdocmatch/pub_parser/__init__.py            |  5 ++
 .../stubdata/ArXiv/oai/eprints/0708/1752      | 40 ---------
 .../stubdata/ArXiv/oai/eprints/1701/00200     | 26 ------
 .../stubdata/ArXiv/oai/eprints/1801/01021     | 33 -------
 .../stubdata/ArXiv/oai/eprints/2106/07251     | 43 ---------
 .../stubdata/ArXiv/oai/eprints/2312/08579     | 47 ----------
 .../stubdata/{text/L48 => }/L48-23288.abs     |  0
 .../stubdata/{text/L52 => }/L52-28159.abs     |  0
 .../tests/unittests/stubdata/X01-74270.abs    | 32 +++++++
 .../tests/unittests/stubdata/X10-50737.abs    | 17 ++++
 .../tests/unittests/stubdata/X11-85081.abs    | 22 +++++
 .../tests/unittests/stubdata/X18-10145.abs    | 35 ++++++++
 .../tests/unittests/stubdata/X21-91237.abs    | 30 +++++++
 .../tests/unittests/stubdata/X23-45511.abs    | 39 ++++++++
 .../tests/unittests/test_match_w_metadata.py  | 29 +++---
 .../tests/unittests/test_oracle_util.py       | 14 +--
 requirements.txt                              |  1 -
 18 files changed, 252 insertions(+), 249 deletions(-)
 delete mode 100644 adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/0708/1752
 delete mode 100644 adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1701/00200
 delete mode 100644 adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1801/01021
 delete mode 100644 adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2106/07251
 delete mode 100644 adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2312/08579
 rename adsdocmatch/tests/unittests/stubdata/{text/L48 => }/L48-23288.abs (100%)
 rename adsdocmatch/tests/unittests/stubdata/{text/L52 => }/L52-28159.abs (100%)
 create mode 100644 adsdocmatch/tests/unittests/stubdata/X01-74270.abs
 create mode 100644 adsdocmatch/tests/unittests/stubdata/X10-50737.abs
 create mode 100644 adsdocmatch/tests/unittests/stubdata/X11-85081.abs
 create mode 100644 adsdocmatch/tests/unittests/stubdata/X18-10145.abs
 create mode 100644 adsdocmatch/tests/unittests/stubdata/X21-91237.abs
 create mode 100644 adsdocmatch/tests/unittests/stubdata/X23-45511.abs

diff --git a/adsdocmatch/match_w_metadata.py b/adsdocmatch/match_w_metadata.py
index f604f65..1597a7d 100644
--- a/adsdocmatch/match_w_metadata.py
+++ b/adsdocmatch/match_w_metadata.py
@@ -6,7 +6,6 @@
 from adsdocmatch.pub_parser import get_pub_metadata
 from adsdocmatch.oracle_util import OracleUtil
 from adsdocmatch.matchable_status import matchable_status
-from pyingest.parsers.arxiv import ArxivParser
 from adsputils import setup_logging, load_config
 
 proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), "../"))
@@ -30,7 +29,6 @@ class MatchMetadata():
 
     process_pub_bibstem = {}
 
-    ARXIV_PARSER = ArxivParser()
     ORACLE_UTIL = OracleUtil()
 
     def get_input_filenames(self, filename):
@@ -85,6 +83,45 @@ def process_pub_metadata(self, metadata):
             self.process_pub_bibstem[bibstem] = 1 if status == True else (0 if status == False else -1)
         return self.process_pub_bibstem[bibstem]
 
+    def parse_arXiv_comments(self, metadata):
+        """
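+        Derive doi, doctype hints and the must-match flag from the record's 'arXiv_comments' field.
+        :param metadata: dict of parsed record fields; its 'doi' key may be set or removed in place
+        :return: tuple of (metadata, comments, must_match, match_doctype)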
+        """
+        comments = metadata.get('arXiv_comments', '')
+        if comments:
+            # extract doi out of comments if there are any
+            match = self.re_doi.search(comments)
+            if match:
+                metadata['doi'] = match.group(1)
+            else:
+                doi = metadata.get('properties', {}).get('DOI', None)
+                if doi:
+                    metadata['doi'] = doi.replace('doi:', '')
+            match_doctype = None
+            title = metadata.get('title')
+            # check title for erratum
+            match = self.re_doctype_errata.search(title)
+            if match:
+                match_doctype = ['erratum']
+            else:
+                match = self.re_doctype_bookreview.search(title)
+                if match:
+                    match_doctype = ['bookreview']
+                else:
+                    # check both comments and title for thesis
+                    match = self.re_doctype_thesis.search("%s %s" % (comments, title))
+                    if match:
+                        match_doctype = ['phdthesis', 'mastersthesis']
+            must_match = any(ads_archive_class in arxiv_class for arxiv_class in metadata.get('class', []) for ads_archive_class in self.MUST_MATCH)
+        else:
+            metadata.pop("doi", None)
+            match_doctype = None
+            must_match = False
+            comments = ''
+        return metadata, comments, must_match, match_doctype
+
     def write_results(self, result_filename, matches, metadata_filename, rerun_filename):
         """
 
@@ -202,41 +239,8 @@ def match_to_pub(self, filename):
         """
         try:
             with open(filename, 'rb') as arxiv_fp:
-                journal = filename.strip().split('/')[-5]
-                if journal == 'ArXiv':
-                    metadata = self.ARXIV_PARSER.parse(arxiv_fp)
-                    comments = ' '.join(metadata.get('comments', []))
-                    # extract doi out of comments if there are any
-                    match = self.re_doi.search(comments)
-                    if match:
-                        metadata['doi'] = match.group(1)
-                    else:
-                        doi = metadata.get('properties', {}).get('DOI', None)
-                        if doi:
-                            metadata['doi'] = doi.replace('doi:', '')
-                    match_doctype = None
-                    title = metadata.get('title')
-                    # check title for erratum
-                    match = self.re_doctype_errata.search(title)
-                    if match:
-                        match_doctype = ['erratum']
-                    else:
-                        match = self.re_doctype_bookreview.search(title)
-                        if match:
-                            match_doctype = ['bookreview']
-                        else:
-                            # check both comments and title for thesis
-                            match = self.re_doctype_thesis.search("%s %s" % (comments, title))
-                            if match:
-                                match_doctype = ['phdthesis', 'mastersthesis']
-                    must_match = any(ads_archive_class in arxiv_class for arxiv_class in metadata.get('class', []) for ads_archive_class in self.MUST_MATCH)
-                else:
-                    metadata = get_pub_metadata(arxiv_fp.read())
-                    # remove the doi, since in this case, oracle thinks it is the publication doi
-                    metadata.pop("doi", None)
-                    match_doctype = None
-                    must_match = False
-                    comments = ''
+                metadata = get_pub_metadata(arxiv_fp.read())
+                metadata, comments, must_match, match_doctype = self.parse_arXiv_comments(metadata)
                 oracle_matches = self.ORACLE_UTIL.get_matches(metadata, 'eprint', must_match, match_doctype)
                 # before proceeding see if this arXiv article's class is among the ones that ADS archives the
                 # published version if available
@@ -456,3 +460,13 @@ def process_match_to_pub(self, path):
         combined_output_filename = "%s%s" % (path, config.get('DOCMATCHPIPELINE_EPRINT_COMBINED_FILENAME', 'default'))
         self.merge_classic_docmatch_results(classic_matched_filename, result_filename, combined_output_filename)
         return combined_output_filename
+
+if __name__ == '__main__':
+    print(MatchMetadata().match_to_pub('/proj/ads/abstracts/gen/text/L48/L48-23288.abs'))
+    print(MatchMetadata().match_to_arXiv('/proj/ads/abstracts/gen/text/L48/L48-23288.abs'))
+
+    '''
+/proj/ads/abstracts/gen/text/L52/L52-28159.abs
+/proj/ads/abstracts/gen/text/L48/L48-23288.abs
+/proj/ads/abstracts/sources/ArXiv/oai/arXiv.org/2306/02768
+'''
\ No newline at end of file
diff --git a/adsdocmatch/pub_parser/__init__.py b/adsdocmatch/pub_parser/__init__.py
index 122eefc..1516ba0 100644
--- a/adsdocmatch/pub_parser/__init__.py
+++ b/adsdocmatch/pub_parser/__init__.py
@@ -51,6 +51,7 @@ def as_needed(article):
         ("Publication Date", "pubdate"),
         ("Bibliographic Code", "bibcode"),
         ("DOI", "doi"),
+        ("arXiv_comments", "arXiv_comments")
     ]
     return_record = {}
     for src_key, dest_key in field_mappings:
@@ -131,4 +132,8 @@ def get_pub_metadata(contents):
     switch_date = article['Publication Date'].split('/')
     article['Publication Date'] = switch_date[1] + '/' + switch_date[0]
 
+    if 'Origin' in fields_found_in_file:
+        if article['Origin'] == 'ARXIV':
+            article['arXiv_comments'] = article.get('Comments', '')
+
     return as_needed(article)
diff --git a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/0708/1752 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/0708/1752
deleted file mode 100644
index 81fa884..0000000
--- a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/0708/1752
+++ /dev/null
@@ -1,40 +0,0 @@
-<record>
-<header>
- <identifier>oai:arXiv.org:0708.1752</identifier>
- <datestamp>2010-04-06</datestamp>
- <setSpec>physics:astro-ph</setSpec>
-</header>
-<metadata>
- <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
- <dc:title>Validation of the new Hipparcos reduction</dc:title>
- <dc:creator>van Leeuwen, F.</dc:creator>
- <dc:subject>Astrophysics</dc:subject>
- <dc:description>  Context.A new reduction of the astrometric data as produced by the Hipparcos
-mission has been published, claiming accuracies for nearly all stars brighter
-than magnitude Hp = 8 to be better, by up to a factor 4, than in the original
-catalogue. Aims.The new Hipparcos astrometric catalogue is checked for the
-quality of the data and the consistency of the formal errors as well as the
-possible presence of error correlations. The differences with the earlier
-publication are explained. Methods. The internal errors are followed through
-the reduction process, and the external errors are investigated on the basis of
-a comparison with radio observations of a small selection of stars, and the
-distribution of negative parallaxes. Error correlation levels are investigated
-and the reduction by more than a factor 10 as obtained in the new catalogue is
-explained. Results.The formal errors on the parallaxes for the new catalogue
-are confirmed. The presence of a small amount of additional noise, though
-unlikely, cannot be ruled out. Conclusions. The new reduction of the Hipparcos
-astrometric data provides an improvement by a factor 2.2 in the total weight
-compared to the catalogue published in 1997, and provides much improved data
-for a wide range of studies on stellar luminosities and local galactic
-kinematics.
-</dc:description>
- <dc:description>Comment: 12 pages, 19 figures, accepted for publication by Astronomy and
-  Astrophysics</dc:description>
- <dc:date>2007-08-13</dc:date>
- <dc:type>text</dc:type>
- <dc:identifier>http://arxiv.org/abs/0708.1752</dc:identifier>
- <dc:identifier>Astron.Astrophys.474:653-664,2007</dc:identifier>
- <dc:identifier>doi:10.1051/0004-6361:20078357</dc:identifier>
- </oai_dc:dc>
-</metadata>
-</record>
diff --git a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1701/00200 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1701/00200
deleted file mode 100644
index ca23e71..0000000
--- a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1701/00200
+++ /dev/null
@@ -1,26 +0,0 @@
-<record>
-<header>
- <identifier>oai:arXiv.org:1701.00200</identifier>
- <datestamp>2017-08-22</datestamp>
- <setSpec>math</setSpec>
-</header>
-<metadata>
- <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
- <dc:title>Post-Lie algebra structures on the Witt algebra</dc:title>
- <dc:creator>Tang, Xiaomin</dc:creator>
- <dc:subject>Mathematics - Rings and Algebras</dc:subject>
- <dc:subject>17A30, 17A42, 17B60, 18D50</dc:subject>
- <dc:description>  In this paper, we characterize the graded post-Lie algebra structures and a
-class of shifting post-Lie algebra structures on the Witt algebra. We obtain
-some new Lie algebras and give a class of their modules. As an application, the
-homogeneous Rota-Baxter operators and a class of non-homogeneous Rota-Baxter
-operators of weight $1$ on the Witt algebra are studied.
-</dc:description>
- <dc:description>Comment: 24 pages</dc:description>
- <dc:date>2017-01-01</dc:date>
- <dc:date>2017-08-19</dc:date>
- <dc:type>text</dc:type>
- <dc:identifier>http://arxiv.org/abs/1701.00200</dc:identifier>
- </oai_dc:dc>
-</metadata>
-</record>
diff --git a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1801/01021 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1801/01021
deleted file mode 100644
index 433aa25..0000000
--- a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1801/01021
+++ /dev/null
@@ -1,33 +0,0 @@
-<record>
-<header>
- <identifier>oai:arXiv.org:1801.01021</identifier>
- <datestamp>2018-05-23</datestamp>
- <setSpec>cs</setSpec>
- <setSpec>physics:astro-ph</setSpec>
-</header>
-<metadata>
- <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
- <dc:title>The Unified Astronomy Thesaurus: Semantic Metadata for Astronomy and
-  Astrophysics</dc:title>
- <dc:creator>Frey, Katie</dc:creator>
- <dc:creator>Accomazzi, Alberto</dc:creator>
- <dc:subject>Astrophysics - Instrumentation and Methods for Astrophysics</dc:subject>
- <dc:subject>Computer Science - Digital Libraries</dc:subject>
- <dc:description>  Several different controlled vocabularies have been developed and used by the
-astronomical community, each designed to serve a specific need and a specific
-group. The Unified Astronomy Thesaurus (UAT) attempts to provide a highly
-structured controlled vocabulary that will be relevant and useful across the
-entire discipline, regardless of content or platform. As two major use cases
-for the UAT include classifying articles and data, we examine the UAT in
-comparison with the Astronomical Subject Keywords used by major publications
-and the JWST Science Keywords used by STScI's Astronomer's Proposal Tool.
-</dc:description>
- <dc:description>Comment: Submitted to the Astrophysical Journal Supplements, 10 pages, 3
-  tables</dc:description>
- <dc:date>2018-01-03</dc:date>
- <dc:type>text</dc:type>
- <dc:identifier>http://arxiv.org/abs/1801.01021</dc:identifier>
- <dc:identifier>doi:10.3847/1538-4365/aab760</dc:identifier>
- </oai_dc:dc>
-</metadata>
-</record>
diff --git a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2106/07251 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2106/07251
deleted file mode 100644
index 2544460..0000000
--- a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2106/07251
+++ /dev/null
@@ -1,43 +0,0 @@
-<record>
-<header>
- <identifier>oai:arXiv.org:2106.07251</identifier>
- <datestamp>2023-01-25</datestamp>
- <setSpec>physics:astro-ph</setSpec>
-</header>
-<metadata>
- <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
- <dc:title>Observations of large-scale solar flows</dc:title>
- <dc:creator>Proxauf, Bastian</dc:creator>
- <dc:subject>Astrophysics - Solar and Stellar Astrophysics</dc:subject>
- <dc:description>  In this dissertation, several components of large-scale solar flows are
-studied observationally: solar equatorial Rossby waves (waves of radial
-vorticity), large-scale convection, and surface flows around active regions.
-Maps of horizontal flows are derived from photospheric observations by the
-Helioseismic and Magnetic Imager (HMI) aboard the Solar Dynamics Observatory
-(SDO) using two different techniques: granulation tracking and local
-helioseismology. First, the eigenfunctions of solar Rossby waves are measured
-from helioseismic ring-diagram flow maps with a correlation method and a
-spectral analysis. Down to $9$ Mm below the surface, the dependence of the
-radial vorticity with radius $r$ is consistent with $r^{m-1}$, for a given
-longitudinal wavenumber $m$. At the surface, the eigenfunctions are
-complex-valued. The real part decreases away from the equator and switches sign
-around $\pm 20-30^\circ$. The imaginary part is small, but nonzero, and may be
-due to wave attenuation. This may have implications for the transport of
-angular momentum in the latitudinal direction. Second, we revisit previous
-measurements of power spectra of longitudinal velocities near the solar
-surface, obtained from time-distance and ring-diagram helioseismology. Several
-issues in these past helioseismic analyses are identified and corrected. The
-corrections are not sufficient to remove the discrepancy between the
-measurements. I thus present new velocity power spectra from granulation
-tracking and ring-diagram helioseismology. The two new measurements are close
-to each other near the solar surface, and the corresponding kinetic energy
-decreases with increasing spatial scale.
-</dc:description>
- <dc:description>Comment: PhD thesis, 97 pages</dc:description>
- <dc:date>2021-06-14</dc:date>
- <dc:type>text</dc:type>
- <dc:identifier>http://arxiv.org/abs/2106.07251</dc:identifier>
- <dc:identifier>doi:10.53846/goediss-8502</dc:identifier>
- </oai_dc:dc>
-</metadata>
-</record>
diff --git a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2312/08579 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2312/08579
deleted file mode 100644
index 38fef15..0000000
--- a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2312/08579
+++ /dev/null
@@ -1,47 +0,0 @@
-<record>
-<header>
- <identifier>oai:arXiv.org:2312.08579</identifier>
- <datestamp>2023-12-15</datestamp>
- <setSpec>cs</setSpec>
- <setSpec>physics:astro-ph</setSpec>
-</header>
-<metadata>
- <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
- <dc:title>Identifying Planetary Names in Astronomy Papers: A Multi-Step Approach</dc:title>
- <dc:creator>Shapurian, Golnaz</dc:creator>
- <dc:creator>Kurtz, Michael J</dc:creator>
- <dc:creator>Accomazzi, Alberto</dc:creator>
- <dc:subject>Computer Science - Computation and Language</dc:subject>
- <dc:subject>Astrophysics - Instrumentation and Methods for Astrophysics</dc:subject>
- <dc:subject>Computer Science - Machine Learning</dc:subject>
- <dc:description>  The automatic identification of planetary feature names in astronomy
-publications presents numerous challenges. These features include craters,
-defined as roughly circular depressions resulting from impact or volcanic
-activity; dorsas, which are elongate raised structures or wrinkle ridges; and
-lacus, small irregular patches of dark, smooth material on the Moon, referred
-to as &quot;lake&quot; (Planetary Names Working Group, n.d.). Many feature names overlap
-with places or people's names that they are named after, for example, Syria,
-Tempe, Einstein, and Sagan, to name a few (U.S. Geological Survey, n.d.). Some
-feature names have been used in many contexts, for instance, Apollo, which can
-refer to mission, program, sample, astronaut, seismic, seismometers, core, era,
-data, collection, instrument, and station, in addition to the crater on the
-Moon. Some feature names can appear in the text as adjectives, like the lunar
-craters Black, Green, and White. Some feature names in other contexts serve as
-directions, like craters West and South on the Moon. Additionally, some
-features share identical names across different celestial bodies, requiring
-disambiguation, such as the Adams crater, which exists on both the Moon and
-Mars. We present a multi-step pipeline combining rule-based filtering,
-statistical relevance analysis, part-of-speech (POS) tagging, named entity
-recognition (NER) model, hybrid keyword harvesting, knowledge graph (KG)
-matching, and inference with a locally installed large language model (LLM) to
-reliably identify planetary names despite these challenges. When evaluated on a
-dataset of astronomy papers from the Astrophysics Data System (ADS), this
-methodology achieves an F1-score over 0.97 in disambiguating planetary feature
-names.
-</dc:description>
- <dc:date>2023-12-13</dc:date>
- <dc:type>text</dc:type>
- <dc:identifier>http://arxiv.org/abs/2312.08579</dc:identifier>
- </oai_dc:dc>
-</metadata>
-</record>
diff --git a/adsdocmatch/tests/unittests/stubdata/text/L48/L48-23288.abs b/adsdocmatch/tests/unittests/stubdata/L48-23288.abs
similarity index 100%
rename from adsdocmatch/tests/unittests/stubdata/text/L48/L48-23288.abs
rename to adsdocmatch/tests/unittests/stubdata/L48-23288.abs
diff --git a/adsdocmatch/tests/unittests/stubdata/text/L52/L52-28159.abs b/adsdocmatch/tests/unittests/stubdata/L52-28159.abs
similarity index 100%
rename from adsdocmatch/tests/unittests/stubdata/text/L52/L52-28159.abs
rename to adsdocmatch/tests/unittests/stubdata/L52-28159.abs
diff --git a/adsdocmatch/tests/unittests/stubdata/X01-74270.abs b/adsdocmatch/tests/unittests/stubdata/X01-74270.abs
new file mode 100644
index 0000000..1134f87
--- /dev/null
+++ b/adsdocmatch/tests/unittests/stubdata/X01-74270.abs
@@ -0,0 +1,32 @@
+Title:              Validation of the new Hipparcos reduction
+Authors:            van Leeuwen, F.
+Journal:            eprint arXiv:0708.1752
+Publication Date:   08/2007
+Comments:           12 pages, 19 figures, accepted for publication by Astronomy and Astrophysics;
+                    Astron.Astrophys.474:653-664,2007;
+                    doi:10.1051/0004-6361:20078357
+Origin:             ARXIV
+Keywords:           Astrophysics
+Bibliographic Code: 2007arXiv0708.1752V
+
+                               Abstract
+Context.A new reduction of the astrometric data as produced by the
+Hipparcos mission has been published, claiming accuracies for nearly all
+stars brighter than magnitude Hp = 8 to be better, by up to a factor 4,
+than in the original catalogue. Aims.The new Hipparcos astrometric
+catalogue is checked for the quality of the data and the consistency of
+the formal errors as well as the possible presence of error
+correlations. The differences with the earlier publication are
+explained. Methods. The internal errors are followed through the
+reduction process, and the external errors are investigated on the basis
+of a comparison with radio observations of a small selection of stars,
+and the distribution of negative parallaxes. Error correlation levels
+are investigated and the reduction by more than a factor 10 as obtained
+in the new catalogue is explained. Results.The formal errors on the
+parallaxes for the new catalogue are confirmed. The presence of a small
+amount of additional noise, though unlikely, cannot be ruled out.
+Conclusions. The new reduction of the Hipparcos astrometric data
+provides an improvement by a factor 2.2 in the total weight compared to
+the catalogue published in 1997, and provides much improved data for a
+wide range of studies on stellar luminosities and local galactic
+kinematics.
diff --git a/adsdocmatch/tests/unittests/stubdata/X10-50737.abs b/adsdocmatch/tests/unittests/stubdata/X10-50737.abs
new file mode 100644
index 0000000..d2f4766
--- /dev/null
+++ b/adsdocmatch/tests/unittests/stubdata/X10-50737.abs
@@ -0,0 +1,17 @@
+Title:              Post-Lie algebra structures on the Witt algebra
+Authors:            Tang, Xiaomin
+Journal:            eprint arXiv:1701.00200
+Publication Date:   01/2017
+Comments:           24 pages
+Origin:             ARXIV
+Keywords:           Mathematics - Rings and Algebras, 17A30, 17A42,
+                    17B60, 18D50
+Bibliographic Code: 2017arXiv170100200T
+
+                               Abstract
+In this paper, we characterize the graded post-Lie algebra structures
+and a class of shifting post-Lie algebra structures on the Witt algebra.
+We obtain some new Lie algebras and give a class of their modules. As an
+application, the homogeneous Rota-Baxter operators and a class of
+non-homogeneous Rota-Baxter operators of weight $1$ on the Witt algebra
+are studied.
diff --git a/adsdocmatch/tests/unittests/stubdata/X11-85081.abs b/adsdocmatch/tests/unittests/stubdata/X11-85081.abs
new file mode 100644
index 0000000..a5835ad
--- /dev/null
+++ b/adsdocmatch/tests/unittests/stubdata/X11-85081.abs
@@ -0,0 +1,22 @@
+Title:              The Unified Astronomy Thesaurus: Semantic Metadata
+                    for Astronomy and Astrophysics
+Authors:            Frey, Katie; Accomazzi, Alberto
+Journal:            eprint arXiv:1801.01021
+Publication Date:   01/2018
+Comments:           Submitted to the Astrophysical Journal Supplements, 10 pages, 3 tables;
+                    doi:10.3847/1538-4365/aab760
+Origin:             ARXIV
+Keywords:           Astrophysics - Instrumentation and Methods for Astrophysics,
+                    Computer Science - Digital Libraries
+Bibliographic Code: 2018arXiv180101021F
+
+                               Abstract
+Several different controlled vocabularies have been developed and used
+by the astronomical community, each designed to serve a specific need
+and a specific group. The Unified Astronomy Thesaurus (UAT) attempts to
+provide a highly structured controlled vocabulary that will be relevant
+and useful across the entire discipline, regardless of content or
+platform. As two major use cases for the UAT include classifying
+articles and data, we examine the UAT in comparison with the
+Astronomical Subject Keywords used by major publications and the JWST
+Science Keywords used by STScI's Astronomer's Proposal Tool.
diff --git a/adsdocmatch/tests/unittests/stubdata/X18-10145.abs b/adsdocmatch/tests/unittests/stubdata/X18-10145.abs
new file mode 100644
index 0000000..34a51c2
--- /dev/null
+++ b/adsdocmatch/tests/unittests/stubdata/X18-10145.abs
@@ -0,0 +1,35 @@
+Title:              Observations of large-scale solar flows
+Authors:            Proxauf, Bastian
+Journal:            eprint arXiv:2106.07251
+Publication Date:   06/2021
+Comments:           PhD thesis, 97 pages; doi:10.53846/goediss-8502
+Origin:             ARXIV
+Keywords:           Astrophysics - Solar and Stellar Astrophysics
+Bibliographic Code: 2021arXiv210607251P
+
+                               Abstract
+In this dissertation, several components of large-scale solar flows are
+studied observationally: solar equatorial Rossby waves (waves of radial
+vorticity), large-scale convection, and surface flows around active
+regions. Maps of horizontal flows are derived from photospheric
+observations by the Helioseismic and Magnetic Imager (HMI) aboard the
+Solar Dynamics Observatory (SDO) using two different techniques:
+granulation tracking and local helioseismology. First, the
+eigenfunctions of solar Rossby waves are measured from helioseismic
+ring-diagram flow maps with a correlation method and a spectral
+analysis. Down to $9$ Mm below the surface, the dependence of the radial
+vorticity with radius $r$ is consistent with $r^{m-1}$, for a given
+longitudinal wavenumber $m$. At the surface, the eigenfunctions are
+complex-valued. The real part decreases away from the equator and
+switches sign around $\pm 20-30^\circ$. The imaginary part is small, but
+nonzero, and may be due to wave attenuation. This may have implications
+for the transport of angular momentum in the latitudinal direction.
+Second, we revisit previous measurements of power spectra of
+longitudinal velocities near the solar surface, obtained from
+time-distance and ring-diagram helioseismology. Several issues in these
+past helioseismic analyses are identified and corrected. The corrections
+are not sufficient to remove the discrepancy between the measurements. I
+thus present new velocity power spectra from granulation tracking and
+ring-diagram helioseismology. The two new measurements are close to each
+other near the solar surface, and the corresponding kinetic energy
+decreases with increasing spatial scale.
diff --git a/adsdocmatch/tests/unittests/stubdata/X21-91237.abs b/adsdocmatch/tests/unittests/stubdata/X21-91237.abs
new file mode 100644
index 0000000..87e99ef
--- /dev/null
+++ b/adsdocmatch/tests/unittests/stubdata/X21-91237.abs
@@ -0,0 +1,30 @@
+Title:              ZipIt! Merging Models from Different Tasks without
+                    Training
+Authors:            Stoica, George; Bolya, Daniel; Bjorner, Jakob;
+                    Hearn, Taylor; Hoffman, Judy
+Journal:            eprint arXiv:2305.03053
+Publication Date:   05/2023
+Origin:             ARXIV
+Keywords:           Computer Science - Computer Vision and Pattern Recognition,
+                    Computer Science - Machine Learning
+Bibliographic Code: 2023arXiv230503053S
+
+                               Abstract
+Typical deep visual recognition models are capable of performing the one
+task they were trained on. In this paper, we tackle the extremely
+difficult problem of combining completely distinct models with different
+initializations, each solving a separate task, into one multi-task model
+without any additional training. Prior work in model merging permutes
+one model to the space of the other then adds them together. While this
+works for models trained on the same task, we find that this fails to
+account for the differences in models trained on disjoint tasks. Thus,
+we introduce "ZipIt!", a general method for merging two arbitrary models
+of the same architecture that incorporates two simple strategies. First,
+in order to account for features that aren't shared between models, we
+expand the model merging problem to additionally allow for merging
+features within each model by defining a general "zip" operation.
+Second, we add support for partially zipping the models up until a
+specified layer, naturally creating a multi-head model. We find that
+these two changes combined account for a staggering 20-60% improvement
+over prior work, making the merging of models trained on disjoint tasks
+feasible.
diff --git a/adsdocmatch/tests/unittests/stubdata/X23-45511.abs b/adsdocmatch/tests/unittests/stubdata/X23-45511.abs
new file mode 100644
index 0000000..19f4b69
--- /dev/null
+++ b/adsdocmatch/tests/unittests/stubdata/X23-45511.abs
@@ -0,0 +1,39 @@
+Title:              Identifying Planetary Names in Astronomy Papers: A
+                    Multi-Step Approach
+Authors:            Shapurian, Golnaz; Kurtz, Michael J;
+                    Accomazzi, Alberto
+Journal:            eprint arXiv:2312.08579
+Publication Date:   12/2023
+Origin:             ARXIV
+Keywords:           Computer Science - Computation and Language,
+                    Astrophysics - Instrumentation and Methods for Astrophysics,
+                    Computer Science - Machine Learning
+Bibliographic Code: 2023arXiv231208579S
+
+                               Abstract
+The automatic identification of planetary feature names in astronomy
+publications presents numerous challenges. These features include
+craters, defined as roughly circular depressions resulting from impact
+or volcanic activity; dorsas, which are elongate raised structures or
+wrinkle ridges; and lacus, small irregular patches of dark, smooth
+material on the Moon, referred to as "lake" (Planetary Names Working
+Group, n.d.). Many feature names overlap with places or people's names
+that they are named after, for example, Syria, Tempe, Einstein, and
+Sagan, to name a few (U.S. Geological Survey, n.d.). Some feature names
+have been used in many contexts, for instance, Apollo, which can refer
+to mission, program, sample, astronaut, seismic, seismometers, core,
+era, data, collection, instrument, and station, in addition to the
+crater on the Moon. Some feature names can appear in the text as
+adjectives, like the lunar craters Black, Green, and White. Some feature
+names in other contexts serve as directions, like craters West and South
+on the Moon. Additionally, some features share identical names across
+different celestial bodies, requiring disambiguation, such as the Adams
+crater, which exists on both the Moon and Mars. We present a multi-step
+pipeline combining rule-based filtering, statistical relevance analysis,
+part-of-speech (POS) tagging, named entity recognition (NER) model,
+hybrid keyword harvesting, knowledge graph (KG) matching, and inference
+with a locally installed large language model (LLM) to reliably identify
+planetary names despite these challenges. When evaluated on a dataset of
+astronomy papers from the Astrophysics Data System (ADS), this
+methodology achieves an F1-score over 0.97 in disambiguating planetary
+feature names.
diff --git a/adsdocmatch/tests/unittests/test_match_w_metadata.py b/adsdocmatch/tests/unittests/test_match_w_metadata.py
index 0fe0070..f80e347 100644
--- a/adsdocmatch/tests/unittests/test_match_w_metadata.py
+++ b/adsdocmatch/tests/unittests/test_match_w_metadata.py
@@ -89,8 +89,7 @@ def test_match_to_pub_1(self):
             'comment': 'No matches with Abstract, trying Title. No document was found in solr matching the request.'
         }]
         with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
-            arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/'
-            matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '1701/00200')
+            matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/X10-50737.abs')
             self.assertEqual(len(matches), 1)
             fields = matches[0].split('\t')
             self.assertEqual(len(fields), 6)
@@ -113,8 +112,7 @@ def test_match_to_pub_2(self):
             'comment': ''
         }]
         with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
-            arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/'
-            matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '1801/01021')
+            matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/X11-85081.abs')
             self.assertEqual(len(matches), 1)
             fields = matches[0].split('\t')
             self.assertEqual(len(fields), 6)
@@ -137,8 +135,7 @@ def test_match_to_pub_3(self):
             'comment': ''
         }]
         with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
-            arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/'
-            matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '0708/1752')
+            matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/X01-74270.abs')
             self.assertEqual(len(matches), 1)
             fields = matches[0].split('\t')
             self.assertEqual(len(fields), 6)
@@ -168,8 +165,7 @@ def test_match_to_pub_4(self):
             'comment': 'Matching doctype `phdthesis;mastersthesis`. Multi match: 2 of 2.'
         }]
         with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
-            arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/'
-            matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '2106/07251')
+            matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/X21-91237.abs')
             self.assertEqual(len(matches), 2)
             expected_values = [
                 ['2021arXiv210607251P','2020PhDT........36P','Match','0.8989977',"{'abstract': None, 'title': 1.0, 'author': 1, 'year': 1}",'Matching doctype `phdthesis;mastersthesis`. Multi match: 1 of 2.'],
@@ -193,7 +189,7 @@ def test_match_to_earth_science_1(self):
         }]
         # treat as eprint
         with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
-            self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/text/L48/L48-23288.abs'))
+            self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/L48-23288.abs'))
 
         return_value = [{
             'source_bibcode': '2021esoar.10507102L',
@@ -205,7 +201,8 @@ def test_match_to_earth_science_1(self):
         }]
         # now send it as publication
         with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
-            self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/text/L48/L48-23288.abs'))
+            # get_matches is mocked, so match_to_pub is expected to hand back return_value unchanged
+            self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/L48-23288.abs'))
 
     def test_match_to_earth_science_2(self):
         """ test match_to_ earth science records that can appear both as eprint and publication """
@@ -218,7 +215,7 @@ def test_match_to_earth_science_2(self):
             'comment': ''}]
         # treat as eprint
         with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
-            self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/text/L52/L52-28159.abs'))
+            self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/L52-28159.abs'))
 
         return_value = [{
             'source_bibcode': '2022EaArX...X58667B',
@@ -230,7 +227,7 @@ def test_match_to_earth_science_2(self):
         }]
         # now send it as publication
         with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value):
-            self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/text/L52/L52-28159.abs'))
+            self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/L52-28159.abs'))
 
     def test_batch_match_to_pub(self):
         """ test batch mode of match_to_pub """
@@ -242,7 +239,7 @@ def test_batch_match_to_pub(self):
         rerun_filename = os.path.abspath(os.path.join(stubdata_dir, config['DOCMATCHPIPELINE_RERUN_FILENAME']))
 
         # create input file with list of eprint filenames
-        eprint_filenames = ['/ArXiv/oai/eprints/2106/07251']
+        eprint_filenames = ['/X21-91237.abs']
         with open(input_filename, "w") as f:
             for filename in eprint_filenames:
                 f.write("%s\n"%(stubdata_dir+filename))
@@ -505,7 +502,7 @@ def test_process_match_to_pub_without_classic_output(self):
         # create input file with list of eprint filenames
         stubdata_dir = os.path.dirname(__file__) + '/stubdata'
         input_filename = "%s%s" % (stubdata_dir, config['DOCMATCHPIPELINE_INPUT_FILENAME'])
-        eprint_filenames = ['/ArXiv/oai/eprints/2106/07251']
+        eprint_filenames = ['/X21-91237.abs']
         with open(input_filename, "w") as f:
             for filename in eprint_filenames:
                 f.write("%s\n"%(stubdata_dir+filename))
@@ -546,7 +543,7 @@ def test_process_match_to_pub_with_classic_output(self):
         # create input file with list of eprint filenames
         stubdata_dir = os.path.dirname(__file__) + '/stubdata'
         input_filename = "%s%s" % (stubdata_dir, config['DOCMATCHPIPELINE_INPUT_FILENAME'])
-        eprint_filenames = ['/ArXiv/oai/eprints/2106/07251']
+        eprint_filenames = ['/X21-91237.abs']
         with open(input_filename, "w") as f:
             for filename in eprint_filenames:
                 f.write("%s\n" % (stubdata_dir + filename))
@@ -622,7 +619,7 @@ def test_write_results(self):
         stubdata_dir = os.path.dirname(__file__) + '/stubdata'
         result_filename = "%s%s" % (stubdata_dir, config['DOCMATCHPIPELINE_EPRINT_RESULT_FILENAME'])
         rerun_filename = os.path.abspath(os.path.join(stubdata_dir, config['DOCMATCHPIPELINE_RERUN_FILENAME']))
-        eprint_filename = "%s%s"% (stubdata_dir, '/ArXiv/oai/eprints/2305/03053')
+        eprint_filename = "%s%s"% (stubdata_dir, '/X21-91237.abs')
         matches = self.match_metadata.process_results([{
             'source_bibcode': '2023arXiv230503053S',
             'status_flaw' : "got 502 for the last failed attempt -- shall be added to rerun list."}], '\t')
diff --git a/adsdocmatch/tests/unittests/test_oracle_util.py b/adsdocmatch/tests/unittests/test_oracle_util.py
index f326498..f413d9e 100644
--- a/adsdocmatch/tests/unittests/test_oracle_util.py
+++ b/adsdocmatch/tests/unittests/test_oracle_util.py
@@ -10,6 +10,7 @@
 
 from adsputils import load_config
 from adsdocmatch.match_w_metadata import MatchMetadata
+from adsdocmatch.pub_parser import get_pub_metadata
 
 config = load_config(proj_home=project_home)
 
@@ -32,14 +33,14 @@ def create_response(self, text):
 
     def test_normalize_author_list(self):
         """ """
-        eprint_filenames = ['/2106/07251', '/1701/00200', '/1801/01021', '/2312/08579']
-        stubdata_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints'
+        eprint_filenames = ['X18-10145.abs', 'X10-50737.abs', 'X11-85081.abs', 'X23-45511.abs']
+        stubdata_dir = os.path.dirname(__file__) + '/stubdata/'
 
         expected_authors = ['Proxauf, B', 'Tang, X', 'Frey, K; Accomazzi, A', 'Shapurian, G; Kurtz, M; Accomazzi, A']
         for filename, authors in zip(eprint_filenames, expected_authors):
             fullpath = stubdata_dir + filename
             with open(fullpath, 'rb') as arxiv_fp:
-                metadata = self.match_metadata.ARXIV_PARSER.parse(arxiv_fp)
+                metadata = get_pub_metadata(arxiv_fp.read())
                 self.assertEqual(self.match_metadata.ORACLE_UTIL.normalize_author_list(metadata['authors']), authors)
 
         # what if only lastnames are provided
@@ -70,14 +71,15 @@ def test_normalize_author_list(self):
 
     def test_extract_doi(self):
         """ """
-        eprint_filenames = ['/2106/07251', '/1701/00200', '/1801/01021', '/2312/08579']
-        stubdata_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints'
+        eprint_filenames = ['X18-10145.abs', 'X10-50737.abs', 'X11-85081.abs', 'X23-45511.abs']
+        stubdata_dir = os.path.dirname(__file__) + '/stubdata/'
 
         expected_dois = [['10.53846/goediss-8502'], None, ['10.3847/1538-4365/aab760'], None]
         for filename, doi in zip(eprint_filenames, expected_dois):
             fullpath = stubdata_dir + filename
             with open(fullpath, 'rb') as arxiv_fp:
-                metadata = self.match_metadata.ARXIV_PARSER.parse(arxiv_fp)
+                metadata = get_pub_metadata(arxiv_fp.read())
+                metadata, _, _, _ = self.match_metadata.parse_arXiv_comments(metadata)
                 self.assertEqual(self.match_metadata.ORACLE_UTIL.extract_doi(metadata), doi)
 
     def test_read_google_sheet(self):
diff --git a/requirements.txt b/requirements.txt
index 307dcc8..70571ef 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
-git+https://github.com/adsabs/adsabs-pyingest.git@v1.2.2
 git+https://github.com/adsabs/ADSGoogleConnector.git@v0.0.3
 adsputils==1.4.3
 numpy==1.24.2