From a1863961f31105e885d1fea778f52c461932b963 Mon Sep 17 00:00:00 2001 From: golnazads <28757512+golnazads@users.noreply.github.com> Date: Mon, 18 Dec 2023 17:05:53 -0500 Subject: [PATCH] arXiv source metadata is switched, and pyingest dropped --- adsdocmatch/match_w_metadata.py | 88 +++++++++++-------- adsdocmatch/pub_parser/__init__.py | 5 ++ .../stubdata/ArXiv/oai/eprints/0708/1752 | 40 --------- .../stubdata/ArXiv/oai/eprints/1701/00200 | 26 ------ .../stubdata/ArXiv/oai/eprints/1801/01021 | 33 ------- .../stubdata/ArXiv/oai/eprints/2106/07251 | 43 --------- .../stubdata/ArXiv/oai/eprints/2312/08579 | 47 ---------- .../stubdata/{text/L48 => }/L48-23288.abs | 0 .../stubdata/{text/L52 => }/L52-28159.abs | 0 .../tests/unittests/stubdata/X01-74270.abs | 32 +++++++ .../tests/unittests/stubdata/X10-50737.abs | 17 ++++ .../tests/unittests/stubdata/X11-85081.abs | 22 +++++ .../tests/unittests/stubdata/X18-10145.abs | 35 ++++++++ .../tests/unittests/stubdata/X21-91237.abs | 30 +++++++ .../tests/unittests/stubdata/X23-45511.abs | 39 ++++++++ .../tests/unittests/test_match_w_metadata.py | 29 +++--- .../tests/unittests/test_oracle_util.py | 14 +-- requirements.txt | 1 - 18 files changed, 252 insertions(+), 249 deletions(-) delete mode 100644 adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/0708/1752 delete mode 100644 adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1701/00200 delete mode 100644 adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1801/01021 delete mode 100644 adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2106/07251 delete mode 100644 adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2312/08579 rename adsdocmatch/tests/unittests/stubdata/{text/L48 => }/L48-23288.abs (100%) rename adsdocmatch/tests/unittests/stubdata/{text/L52 => }/L52-28159.abs (100%) create mode 100644 adsdocmatch/tests/unittests/stubdata/X01-74270.abs create mode 100644 adsdocmatch/tests/unittests/stubdata/X10-50737.abs create mode 100644 
adsdocmatch/tests/unittests/stubdata/X11-85081.abs create mode 100644 adsdocmatch/tests/unittests/stubdata/X18-10145.abs create mode 100644 adsdocmatch/tests/unittests/stubdata/X21-91237.abs create mode 100644 adsdocmatch/tests/unittests/stubdata/X23-45511.abs diff --git a/adsdocmatch/match_w_metadata.py b/adsdocmatch/match_w_metadata.py index f604f65..1597a7d 100644 --- a/adsdocmatch/match_w_metadata.py +++ b/adsdocmatch/match_w_metadata.py @@ -6,7 +6,6 @@ from adsdocmatch.pub_parser import get_pub_metadata from adsdocmatch.oracle_util import OracleUtil from adsdocmatch.matchable_status import matchable_status -from pyingest.parsers.arxiv import ArxivParser from adsputils import setup_logging, load_config proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), "../")) @@ -30,7 +29,6 @@ class MatchMetadata(): process_pub_bibstem = {} - ARXIV_PARSER = ArxivParser() ORACLE_UTIL = OracleUtil() def get_input_filenames(self, filename): @@ -85,6 +83,45 @@ def process_pub_metadata(self, metadata): self.process_pub_bibstem[bibstem] = 1 if status == True else (0 if status == False else -1) return self.process_pub_bibstem[bibstem] + def parse_arXiv_comments(self, metadata): + """ + + :param metadata: + :return: + """ + comments = metadata.get('arXiv_comments', '') + if comments: + # extract doi out of comments if there are any + match = self.re_doi.search(comments) + if match: + metadata['doi'] = match.group(1) + else: + doi = metadata.get('properties', {}).get('DOI', None) + if doi: + metadata['doi'] = doi.replace('doi:', '') + match_doctype = None + title = metadata.get('title') + # check title for erratum + match = self.re_doctype_errata.search(title) + if match: + match_doctype = ['erratum'] + else: + match = self.re_doctype_bookreview.search(title) + if match: + match_doctype = ['bookreview'] + else: + # check both comments and title for thesis + match = self.re_doctype_thesis.search("%s %s" % (comments, title)) + if match: + match_doctype = ['phdthesis', 
'mastersthesis'] + must_match = any(ads_archive_class in arxiv_class for arxiv_class in metadata.get('class', []) for ads_archive_class in self.MUST_MATCH) + else: + metadata.pop("doi", None) + match_doctype = None + must_match = False + comments = '' + return metadata, comments, must_match, match_doctype + def write_results(self, result_filename, matches, metadata_filename, rerun_filename): """ @@ -202,41 +239,8 @@ def match_to_pub(self, filename): """ try: with open(filename, 'rb') as arxiv_fp: - journal = filename.strip().split('/')[-5] - if journal == 'ArXiv': - metadata = self.ARXIV_PARSER.parse(arxiv_fp) - comments = ' '.join(metadata.get('comments', [])) - # extract doi out of comments if there are any - match = self.re_doi.search(comments) - if match: - metadata['doi'] = match.group(1) - else: - doi = metadata.get('properties', {}).get('DOI', None) - if doi: - metadata['doi'] = doi.replace('doi:', '') - match_doctype = None - title = metadata.get('title') - # check title for erratum - match = self.re_doctype_errata.search(title) - if match: - match_doctype = ['erratum'] - else: - match = self.re_doctype_bookreview.search(title) - if match: - match_doctype = ['bookreview'] - else: - # check both comments and title for thesis - match = self.re_doctype_thesis.search("%s %s" % (comments, title)) - if match: - match_doctype = ['phdthesis', 'mastersthesis'] - must_match = any(ads_archive_class in arxiv_class for arxiv_class in metadata.get('class', []) for ads_archive_class in self.MUST_MATCH) - else: - metadata = get_pub_metadata(arxiv_fp.read()) - # remove the doi, since in this case, oracle thinks it is the publication doi - metadata.pop("doi", None) - match_doctype = None - must_match = False - comments = '' + metadata = get_pub_metadata(arxiv_fp.read()) + metadata, comments, must_match, match_doctype = self.parse_arXiv_comments(metadata) oracle_matches = self.ORACLE_UTIL.get_matches(metadata, 'eprint', must_match, match_doctype) # before proceeding see if 
this arXiv article's class is among the ones that ADS archives the # published version if available @@ -456,3 +460,13 @@ def process_match_to_pub(self, path): combined_output_filename = "%s%s" % (path, config.get('DOCMATCHPIPELINE_EPRINT_COMBINED_FILENAME', 'default')) self.merge_classic_docmatch_results(classic_matched_filename, result_filename, combined_output_filename) return combined_output_filename + +if __name__ == '__main__': + print(MatchMetadata().match_to_pub('/proj/ads/abstracts/gen/text/L48/L48-23288.abs')) + print(MatchMetadata().match_to_arXiv('/proj/ads/abstracts/gen/text/L48/L48-23288.abs')) + + ''' +/proj/ads/abstracts/gen/text/L52/L52-28159.abs +/proj/ads/abstracts/gen/text/L48/L48-23288.abs +/proj/ads/abstracts/sources/ArXiv/oai/arXiv.org/2306/02768 +''' \ No newline at end of file diff --git a/adsdocmatch/pub_parser/__init__.py b/adsdocmatch/pub_parser/__init__.py index 122eefc..1516ba0 100644 --- a/adsdocmatch/pub_parser/__init__.py +++ b/adsdocmatch/pub_parser/__init__.py @@ -51,6 +51,7 @@ def as_needed(article): ("Publication Date", "pubdate"), ("Bibliographic Code", "bibcode"), ("DOI", "doi"), + ("arXiv_comments", "arXiv_comments") ] return_record = {} for src_key, dest_key in field_mappings: @@ -131,4 +132,8 @@ def get_pub_metadata(contents): switch_date = article['Publication Date'].split('/') article['Publication Date'] = switch_date[1] + '/' + switch_date[0] + if 'Origin' in fields_found_in_file: + if article['Origin'] == 'ARXIV': + article['arXiv_comments'] = article.get('Comments', '') + return as_needed(article) diff --git a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/0708/1752 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/0708/1752 deleted file mode 100644 index 81fa884..0000000 --- a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/0708/1752 +++ /dev/null @@ -1,40 +0,0 @@ - -
- oai:arXiv.org:0708.1752 - 2010-04-06 - physics:astro-ph -
- - - Validation of the new Hipparcos reduction - van Leeuwen, F. - Astrophysics - Context.A new reduction of the astrometric data as produced by the Hipparcos -mission has been published, claiming accuracies for nearly all stars brighter -than magnitude Hp = 8 to be better, by up to a factor 4, than in the original -catalogue. Aims.The new Hipparcos astrometric catalogue is checked for the -quality of the data and the consistency of the formal errors as well as the -possible presence of error correlations. The differences with the earlier -publication are explained. Methods. The internal errors are followed through -the reduction process, and the external errors are investigated on the basis of -a comparison with radio observations of a small selection of stars, and the -distribution of negative parallaxes. Error correlation levels are investigated -and the reduction by more than a factor 10 as obtained in the new catalogue is -explained. Results.The formal errors on the parallaxes for the new catalogue -are confirmed. The presence of a small amount of additional noise, though -unlikely, cannot be ruled out. Conclusions. The new reduction of the Hipparcos -astrometric data provides an improvement by a factor 2.2 in the total weight -compared to the catalogue published in 1997, and provides much improved data -for a wide range of studies on stellar luminosities and local galactic -kinematics. - - Comment: 12 pages, 19 figures, accepted for publication by Astronomy and - Astrophysics - 2007-08-13 - text - http://arxiv.org/abs/0708.1752 - Astron.Astrophys.474:653-664,2007 - doi:10.1051/0004-6361:20078357 - - -
diff --git a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1701/00200 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1701/00200 deleted file mode 100644 index ca23e71..0000000 --- a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1701/00200 +++ /dev/null @@ -1,26 +0,0 @@ - -
- oai:arXiv.org:1701.00200 - 2017-08-22 - math -
- - - Post-Lie algebra structures on the Witt algebra - Tang, Xiaomin - Mathematics - Rings and Algebras - 17A30, 17A42, 17B60, 18D50 - In this paper, we characterize the graded post-Lie algebra structures and a -class of shifting post-Lie algebra structures on the Witt algebra. We obtain -some new Lie algebras and give a class of their modules. As an application, the -homogeneous Rota-Baxter operators and a class of non-homogeneous Rota-Baxter -operators of weight $1$ on the Witt algebra are studied. - - Comment: 24 pages - 2017-01-01 - 2017-08-19 - text - http://arxiv.org/abs/1701.00200 - - -
diff --git a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1801/01021 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1801/01021 deleted file mode 100644 index 433aa25..0000000 --- a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1801/01021 +++ /dev/null @@ -1,33 +0,0 @@ - -
- oai:arXiv.org:1801.01021 - 2018-05-23 - cs - physics:astro-ph -
- - - The Unified Astronomy Thesaurus: Semantic Metadata for Astronomy and - Astrophysics - Frey, Katie - Accomazzi, Alberto - Astrophysics - Instrumentation and Methods for Astrophysics - Computer Science - Digital Libraries - Several different controlled vocabularies have been developed and used by the -astronomical community, each designed to serve a specific need and a specific -group. The Unified Astronomy Thesaurus (UAT) attempts to provide a highly -structured controlled vocabulary that will be relevant and useful across the -entire discipline, regardless of content or platform. As two major use cases -for the UAT include classifying articles and data, we examine the UAT in -comparison with the Astronomical Subject Keywords used by major publications -and the JWST Science Keywords used by STScI's Astronomer's Proposal Tool. - - Comment: Submitted to the Astrophysical Journal Supplements, 10 pages, 3 - tables - 2018-01-03 - text - http://arxiv.org/abs/1801.01021 - doi:10.3847/1538-4365/aab760 - - -
diff --git a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2106/07251 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2106/07251 deleted file mode 100644 index 2544460..0000000 --- a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2106/07251 +++ /dev/null @@ -1,43 +0,0 @@ - -
- oai:arXiv.org:2106.07251 - 2023-01-25 - physics:astro-ph -
- - - Observations of large-scale solar flows - Proxauf, Bastian - Astrophysics - Solar and Stellar Astrophysics - In this dissertation, several components of large-scale solar flows are -studied observationally: solar equatorial Rossby waves (waves of radial -vorticity), large-scale convection, and surface flows around active regions. -Maps of horizontal flows are derived from photospheric observations by the -Helioseismic and Magnetic Imager (HMI) aboard the Solar Dynamics Observatory -(SDO) using two different techniques: granulation tracking and local -helioseismology. First, the eigenfunctions of solar Rossby waves are measured -from helioseismic ring-diagram flow maps with a correlation method and a -spectral analysis. Down to $9$ Mm below the surface, the dependence of the -radial vorticity with radius $r$ is consistent with $r^{m-1}$, for a given -longitudinal wavenumber $m$. At the surface, the eigenfunctions are -complex-valued. The real part decreases away from the equator and switches sign -around $\pm 20-30^\circ$. The imaginary part is small, but nonzero, and may be -due to wave attenuation. This may have implications for the transport of -angular momentum in the latitudinal direction. Second, we revisit previous -measurements of power spectra of longitudinal velocities near the solar -surface, obtained from time-distance and ring-diagram helioseismology. Several -issues in these past helioseismic analyses are identified and corrected. The -corrections are not sufficient to remove the discrepancy between the -measurements. I thus present new velocity power spectra from granulation -tracking and ring-diagram helioseismology. The two new measurements are close -to each other near the solar surface, and the corresponding kinetic energy -decreases with increasing spatial scale. - - Comment: PhD thesis, 97 pages - 2021-06-14 - text - http://arxiv.org/abs/2106.07251 - doi:10.53846/goediss-8502 - - -
diff --git a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2312/08579 b/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2312/08579 deleted file mode 100644 index 38fef15..0000000 --- a/adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2312/08579 +++ /dev/null @@ -1,47 +0,0 @@ - -
- oai:arXiv.org:2312.08579 - 2023-12-15 - cs - physics:astro-ph -
- - - Identifying Planetary Names in Astronomy Papers: A Multi-Step Approach - Shapurian, Golnaz - Kurtz, Michael J - Accomazzi, Alberto - Computer Science - Computation and Language - Astrophysics - Instrumentation and Methods for Astrophysics - Computer Science - Machine Learning - The automatic identification of planetary feature names in astronomy -publications presents numerous challenges. These features include craters, -defined as roughly circular depressions resulting from impact or volcanic -activity; dorsas, which are elongate raised structures or wrinkle ridges; and -lacus, small irregular patches of dark, smooth material on the Moon, referred -to as "lake" (Planetary Names Working Group, n.d.). Many feature names overlap -with places or people's names that they are named after, for example, Syria, -Tempe, Einstein, and Sagan, to name a few (U.S. Geological Survey, n.d.). Some -feature names have been used in many contexts, for instance, Apollo, which can -refer to mission, program, sample, astronaut, seismic, seismometers, core, era, -data, collection, instrument, and station, in addition to the crater on the -Moon. Some feature names can appear in the text as adjectives, like the lunar -craters Black, Green, and White. Some feature names in other contexts serve as -directions, like craters West and South on the Moon. Additionally, some -features share identical names across different celestial bodies, requiring -disambiguation, such as the Adams crater, which exists on both the Moon and -Mars. We present a multi-step pipeline combining rule-based filtering, -statistical relevance analysis, part-of-speech (POS) tagging, named entity -recognition (NER) model, hybrid keyword harvesting, knowledge graph (KG) -matching, and inference with a locally installed large language model (LLM) to -reliably identify planetary names despite these challenges. 
When evaluated on a -dataset of astronomy papers from the Astrophysics Data System (ADS), this -methodology achieves an F1-score over 0.97 in disambiguating planetary feature -names. - - 2023-12-13 - text - http://arxiv.org/abs/2312.08579 - - -
diff --git a/adsdocmatch/tests/unittests/stubdata/text/L48/L48-23288.abs b/adsdocmatch/tests/unittests/stubdata/L48-23288.abs similarity index 100% rename from adsdocmatch/tests/unittests/stubdata/text/L48/L48-23288.abs rename to adsdocmatch/tests/unittests/stubdata/L48-23288.abs diff --git a/adsdocmatch/tests/unittests/stubdata/text/L52/L52-28159.abs b/adsdocmatch/tests/unittests/stubdata/L52-28159.abs similarity index 100% rename from adsdocmatch/tests/unittests/stubdata/text/L52/L52-28159.abs rename to adsdocmatch/tests/unittests/stubdata/L52-28159.abs diff --git a/adsdocmatch/tests/unittests/stubdata/X01-74270.abs b/adsdocmatch/tests/unittests/stubdata/X01-74270.abs new file mode 100644 index 0000000..1134f87 --- /dev/null +++ b/adsdocmatch/tests/unittests/stubdata/X01-74270.abs @@ -0,0 +1,32 @@ +Title: Validation of the new Hipparcos reduction +Authors: van Leeuwen, F. +Journal: eprint arXiv:0708.1752 +Publication Date: 08/2007 +Comments: 12 pages, 19 figures, accepted for publication by Astronomy and Astrophysics; + Astron.Astrophys.474:653-664,2007; + doi:10.1051/0004-6361:20078357 +Origin: ARXIV +Keywords: Astrophysics +Bibliographic Code: 2007arXiv0708.1752V + + Abstract +Context.A new reduction of the astrometric data as produced by the +Hipparcos mission has been published, claiming accuracies for nearly all +stars brighter than magnitude Hp = 8 to be better, by up to a factor 4, +than in the original catalogue. Aims.The new Hipparcos astrometric +catalogue is checked for the quality of the data and the consistency of +the formal errors as well as the possible presence of error +correlations. The differences with the earlier publication are +explained. Methods. The internal errors are followed through the +reduction process, and the external errors are investigated on the basis +of a comparison with radio observations of a small selection of stars, +and the distribution of negative parallaxes. 
Error correlation levels +are investigated and the reduction by more than a factor 10 as obtained +in the new catalogue is explained. Results.The formal errors on the +parallaxes for the new catalogue are confirmed. The presence of a small +amount of additional noise, though unlikely, cannot be ruled out. +Conclusions. The new reduction of the Hipparcos astrometric data +provides an improvement by a factor 2.2 in the total weight compared to +the catalogue published in 1997, and provides much improved data for a +wide range of studies on stellar luminosities and local galactic +kinematics. diff --git a/adsdocmatch/tests/unittests/stubdata/X10-50737.abs b/adsdocmatch/tests/unittests/stubdata/X10-50737.abs new file mode 100644 index 0000000..d2f4766 --- /dev/null +++ b/adsdocmatch/tests/unittests/stubdata/X10-50737.abs @@ -0,0 +1,17 @@ +Title: Post-Lie algebra structures on the Witt algebra +Authors: Tang, Xiaomin +Journal: eprint arXiv:1701.00200 +Publication Date: 01/2017 +Comments: 24 pages +Origin: ARXIV +Keywords: Mathematics - Rings and Algebras, 17A30, 17A42, + 17B60, 18D50 +Bibliographic Code: 2017arXiv170100200T + + Abstract +In this paper, we characterize the graded post-Lie algebra structures +and a class of shifting post-Lie algebra structures on the Witt algebra. +We obtain some new Lie algebras and give a class of their modules. As an +application, the homogeneous Rota-Baxter operators and a class of +non-homogeneous Rota-Baxter operators of weight $1$ on the Witt algebra +are studied. 
diff --git a/adsdocmatch/tests/unittests/stubdata/X11-85081.abs b/adsdocmatch/tests/unittests/stubdata/X11-85081.abs new file mode 100644 index 0000000..a5835ad --- /dev/null +++ b/adsdocmatch/tests/unittests/stubdata/X11-85081.abs @@ -0,0 +1,22 @@ +Title: The Unified Astronomy Thesaurus: Semantic Metadata + for Astronomy and Astrophysics +Authors: Frey, Katie; Accomazzi, Alberto +Journal: eprint arXiv:1801.01021 +Publication Date: 01/2018 +Comments: Submitted to the Astrophysical Journal Supplements, 10 pages, 3 tables; + doi:10.3847/1538-4365/aab760 +Origin: ARXIV +Keywords: Astrophysics - Instrumentation and Methods for Astrophysics, + Computer Science - Digital Libraries +Bibliographic Code: 2018arXiv180101021F + + Abstract +Several different controlled vocabularies have been developed and used +by the astronomical community, each designed to serve a specific need +and a specific group. The Unified Astronomy Thesaurus (UAT) attempts to +provide a highly structured controlled vocabulary that will be relevant +and useful across the entire discipline, regardless of content or +platform. As two major use cases for the UAT include classifying +articles and data, we examine the UAT in comparison with the +Astronomical Subject Keywords used by major publications and the JWST +Science Keywords used by STScI's Astronomer's Proposal Tool. 
diff --git a/adsdocmatch/tests/unittests/stubdata/X18-10145.abs b/adsdocmatch/tests/unittests/stubdata/X18-10145.abs new file mode 100644 index 0000000..34a51c2 --- /dev/null +++ b/adsdocmatch/tests/unittests/stubdata/X18-10145.abs @@ -0,0 +1,35 @@ +Title: Observations of large-scale solar flows +Authors: Proxauf, Bastian +Journal: eprint arXiv:2106.07251 +Publication Date: 06/2021 +Comments: PhD thesis, 97 pages; doi:10.53846/goediss-8502 +Origin: ARXIV +Keywords: Astrophysics - Solar and Stellar Astrophysics +Bibliographic Code: 2021arXiv210607251P + + Abstract +In this dissertation, several components of large-scale solar flows are +studied observationally: solar equatorial Rossby waves (waves of radial +vorticity), large-scale convection, and surface flows around active +regions. Maps of horizontal flows are derived from photospheric +observations by the Helioseismic and Magnetic Imager (HMI) aboard the +Solar Dynamics Observatory (SDO) using two different techniques: +granulation tracking and local helioseismology. First, the +eigenfunctions of solar Rossby waves are measured from helioseismic +ring-diagram flow maps with a correlation method and a spectral +analysis. Down to $9$ Mm below the surface, the dependence of the radial +vorticity with radius $r$ is consistent with $r^{m-1}$, for a given +longitudinal wavenumber $m$. At the surface, the eigenfunctions are +complex-valued. The real part decreases away from the equator and +switches sign around $\pm 20-30^\circ$. The imaginary part is small, but +nonzero, and may be due to wave attenuation. This may have implications +for the transport of angular momentum in the latitudinal direction. +Second, we revisit previous measurements of power spectra of +longitudinal velocities near the solar surface, obtained from +time-distance and ring-diagram helioseismology. Several issues in these +past helioseismic analyses are identified and corrected. 
The corrections +are not sufficient to remove the discrepancy between the measurements. I +thus present new velocity power spectra from granulation tracking and +ring-diagram helioseismology. The two new measurements are close to each +other near the solar surface, and the corresponding kinetic energy +decreases with increasing spatial scale. diff --git a/adsdocmatch/tests/unittests/stubdata/X21-91237.abs b/adsdocmatch/tests/unittests/stubdata/X21-91237.abs new file mode 100644 index 0000000..87e99ef --- /dev/null +++ b/adsdocmatch/tests/unittests/stubdata/X21-91237.abs @@ -0,0 +1,30 @@ +Title: ZipIt! Merging Models from Different Tasks without + Training +Authors: Stoica, George; Bolya, Daniel; Bjorner, Jakob; + Hearn, Taylor; Hoffman, Judy +Journal: eprint arXiv:2305.03053 +Publication Date: 05/2023 +Origin: ARXIV +Keywords: Computer Science - Computer Vision and Pattern Recognition, + Computer Science - Machine Learning +Bibliographic Code: 2023arXiv230503053S + + Abstract +Typical deep visual recognition models are capable of performing the one +task they were trained on. In this paper, we tackle the extremely +difficult problem of combining completely distinct models with different +initializations, each solving a separate task, into one multi-task model +without any additional training. Prior work in model merging permutes +one model to the space of the other then adds them together. While this +works for models trained on the same task, we find that this fails to +account for the differences in models trained on disjoint tasks. Thus, +we introduce "ZipIt!", a general method for merging two arbitrary models +of the same architecture that incorporates two simple strategies. First, +in order to account for features that aren't shared between models, we +expand the model merging problem to additionally allow for merging +features within each model by defining a general "zip" operation. 
+Second, we add support for partially zipping the models up until a +specified layer, naturally creating a multi-head model. We find that +these two changes combined account for a staggering 20-60% improvement +over prior work, making the merging of models trained on disjoint tasks +feasible. diff --git a/adsdocmatch/tests/unittests/stubdata/X23-45511.abs b/adsdocmatch/tests/unittests/stubdata/X23-45511.abs new file mode 100644 index 0000000..19f4b69 --- /dev/null +++ b/adsdocmatch/tests/unittests/stubdata/X23-45511.abs @@ -0,0 +1,39 @@ +Title: Identifying Planetary Names in Astronomy Papers: A + Multi-Step Approach +Authors: Shapurian, Golnaz; Kurtz, Michael J; + Accomazzi, Alberto +Journal: eprint arXiv:2312.08579 +Publication Date: 12/2023 +Origin: ARXIV +Keywords: Computer Science - Computation and Language, + Astrophysics - Instrumentation and Methods for Astrophysics, + Computer Science - Machine Learning +Bibliographic Code: 2023arXiv231208579S + + Abstract +The automatic identification of planetary feature names in astronomy +publications presents numerous challenges. These features include +craters, defined as roughly circular depressions resulting from impact +or volcanic activity; dorsas, which are elongate raised structures or +wrinkle ridges; and lacus, small irregular patches of dark, smooth +material on the Moon, referred to as "lake" (Planetary Names Working +Group, n.d.). Many feature names overlap with places or people's names +that they are named after, for example, Syria, Tempe, Einstein, and +Sagan, to name a few (U.S. Geological Survey, n.d.). Some feature names +have been used in many contexts, for instance, Apollo, which can refer +to mission, program, sample, astronaut, seismic, seismometers, core, +era, data, collection, instrument, and station, in addition to the +crater on the Moon. Some feature names can appear in the text as +adjectives, like the lunar craters Black, Green, and White. 
Some feature +names in other contexts serve as directions, like craters West and South +on the Moon. Additionally, some features share identical names across +different celestial bodies, requiring disambiguation, such as the Adams +crater, which exists on both the Moon and Mars. We present a multi-step +pipeline combining rule-based filtering, statistical relevance analysis, +part-of-speech (POS) tagging, named entity recognition (NER) model, +hybrid keyword harvesting, knowledge graph (KG) matching, and inference +with a locally installed large language model (LLM) to reliably identify +planetary names despite these challenges. When evaluated on a dataset of +astronomy papers from the Astrophysics Data System (ADS), this +methodology achieves an F1-score over 0.97 in disambiguating planetary +feature names. diff --git a/adsdocmatch/tests/unittests/test_match_w_metadata.py b/adsdocmatch/tests/unittests/test_match_w_metadata.py index 0fe0070..f80e347 100644 --- a/adsdocmatch/tests/unittests/test_match_w_metadata.py +++ b/adsdocmatch/tests/unittests/test_match_w_metadata.py @@ -89,8 +89,7 @@ def test_match_to_pub_1(self): 'comment': 'No matches with Abstract, trying Title. No document was found in solr matching the request.' 
}] with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value): - arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/' - matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '1701/00200') + matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/X10-50737.abs') self.assertEqual(len(matches), 1) fields = matches[0].split('\t') self.assertEqual(len(fields), 6) @@ -113,8 +112,7 @@ def test_match_to_pub_2(self): 'comment': '' }] with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value): - arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/' - matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '1801/01021') + matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/X11-85081.abs') self.assertEqual(len(matches), 1) fields = matches[0].split('\t') self.assertEqual(len(fields), 6) @@ -137,8 +135,7 @@ def test_match_to_pub_3(self): 'comment': '' }] with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value): - arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/' - matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '0708/1752') + matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/X01-74270.abs') self.assertEqual(len(matches), 1) fields = matches[0].split('\t') self.assertEqual(len(fields), 6) @@ -168,8 +165,7 @@ def test_match_to_pub_4(self): 'comment': 'Matching doctype `phdthesis;mastersthesis`. Multi match: 2 of 2.' 
}] with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value): - arXiv_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints/' - matches = self.match_metadata.single_match_to_pub(filename=arXiv_dir + '2106/07251') + matches = self.match_metadata.single_match_to_pub(filename=os.path.dirname(__file__) + '/stubdata/X21-91237.abs') self.assertEqual(len(matches), 2) expected_values = [ ['2021arXiv210607251P','2020PhDT........36P','Match','0.8989977',"{'abstract': None, 'title': 1.0, 'author': 1, 'year': 1}",'Matching doctype `phdthesis;mastersthesis`. Multi match: 1 of 2.'], @@ -193,7 +189,7 @@ def test_match_to_earth_science_1(self): }] # treat as eprint with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value): - self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/text/L48/L48-23288.abs')) + self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/L48-23288.abs')) return_value = [{ 'source_bibcode': '2021esoar.10507102L', @@ -205,7 +201,8 @@ def test_match_to_earth_science_1(self): }] # now send it as publication with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value): - self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/text/L48/L48-23288.abs')) + print(self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/L48-23288.abs')) + self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/L48-23288.abs')) def test_match_to_earth_science_2(self): """ test match_to_ earth science records that can appear both as eprint and publication """ @@ -218,7 +215,7 @@ def test_match_to_earth_science_2(self): 'comment': ''}] # treat as eprint with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value): - 
self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/text/L52/L52-28159.abs')) + self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/L52-28159.abs')) return_value = [{ 'source_bibcode': '2022EaArX...X58667B', @@ -230,7 +227,7 @@ def test_match_to_earth_science_2(self): }] # now send it as publication with mock.patch.object(self.match_metadata.ORACLE_UTIL, 'get_matches', return_value=return_value): - self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/text/L52/L52-28159.abs')) + self.assertEqual(return_value, self.match_metadata.match_to_pub(os.path.dirname(__file__) + '/stubdata/L52-28159.abs')) def test_batch_match_to_pub(self): """ test batch mode of match_to_pub """ @@ -242,7 +239,7 @@ def test_batch_match_to_pub(self): rerun_filename = os.path.abspath(os.path.join(stubdata_dir, config['DOCMATCHPIPELINE_RERUN_FILENAME'])) # create input file with list of eprint filenames - eprint_filenames = ['/ArXiv/oai/eprints/2106/07251'] + eprint_filenames = ['/X21-91237.abs'] with open(input_filename, "w") as f: for filename in eprint_filenames: f.write("%s\n"%(stubdata_dir+filename)) @@ -505,7 +502,7 @@ def test_process_match_to_pub_without_classic_output(self): # create input file with list of eprint filenames stubdata_dir = os.path.dirname(__file__) + '/stubdata' input_filename = "%s%s" % (stubdata_dir, config['DOCMATCHPIPELINE_INPUT_FILENAME']) - eprint_filenames = ['/ArXiv/oai/eprints/2106/07251'] + eprint_filenames = ['/X21-91237.abs'] with open(input_filename, "w") as f: for filename in eprint_filenames: f.write("%s\n"%(stubdata_dir+filename)) @@ -546,7 +543,7 @@ def test_process_match_to_pub_with_classic_output(self): # create input file with list of eprint filenames stubdata_dir = os.path.dirname(__file__) + '/stubdata' input_filename = "%s%s" % (stubdata_dir, config['DOCMATCHPIPELINE_INPUT_FILENAME']) - 
eprint_filenames = ['/ArXiv/oai/eprints/2106/07251'] + eprint_filenames = ['/X21-91237.abs'] with open(input_filename, "w") as f: for filename in eprint_filenames: f.write("%s\n" % (stubdata_dir + filename)) @@ -622,7 +619,7 @@ def test_write_results(self): stubdata_dir = os.path.dirname(__file__) + '/stubdata' result_filename = "%s%s" % (stubdata_dir, config['DOCMATCHPIPELINE_EPRINT_RESULT_FILENAME']) rerun_filename = os.path.abspath(os.path.join(stubdata_dir, config['DOCMATCHPIPELINE_RERUN_FILENAME'])) - eprint_filename = "%s%s"% (stubdata_dir, '/ArXiv/oai/eprints/2305/03053') + eprint_filename = "%s%s"% (stubdata_dir, '/X21-91237.abs') matches = self.match_metadata.process_results([{ 'source_bibcode': '2023arXiv230503053S', 'status_flaw' : "got 502 for the last failed attempt -- shall be added to rerun list."}], '\t') diff --git a/adsdocmatch/tests/unittests/test_oracle_util.py b/adsdocmatch/tests/unittests/test_oracle_util.py index f326498..f413d9e 100644 --- a/adsdocmatch/tests/unittests/test_oracle_util.py +++ b/adsdocmatch/tests/unittests/test_oracle_util.py @@ -10,6 +10,7 @@ from adsputils import load_config from adsdocmatch.match_w_metadata import MatchMetadata +from adsdocmatch.pub_parser import get_pub_metadata config = load_config(proj_home=project_home) @@ -32,14 +33,14 @@ def create_response(self, text): def test_normalize_author_list(self): """ """ - eprint_filenames = ['/2106/07251', '/1701/00200', '/1801/01021', '/2312/08579'] - stubdata_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints' + eprint_filenames = ['X18-10145.abs', 'X10-50737.abs', 'X11-85081.abs', 'X23-45511.abs'] + stubdata_dir = os.path.dirname(__file__) + '/stubdata/' expected_authors = ['Proxauf, B', 'Tang, X', 'Frey, K; Accomazzi, A', 'Shapurian, G; Kurtz, M; Accomazzi, A'] for filename, authors in zip(eprint_filenames, expected_authors): fullpath = stubdata_dir + filename with open(fullpath, 'rb') as arxiv_fp: - metadata = 
self.match_metadata.ARXIV_PARSER.parse(arxiv_fp) + metadata = get_pub_metadata(arxiv_fp.read()) self.assertEqual(self.match_metadata.ORACLE_UTIL.normalize_author_list(metadata['authors']), authors) # what if only lastnames are provided @@ -70,14 +71,15 @@ def test_normalize_author_list(self): def test_extract_doi(self): """ """ - eprint_filenames = ['/2106/07251', '/1701/00200', '/1801/01021', '/2312/08579'] - stubdata_dir = os.path.dirname(__file__) + '/stubdata/ArXiv/oai/eprints' + eprint_filenames = ['X18-10145.abs', 'X10-50737.abs', 'X11-85081.abs', 'X23-45511.abs'] + stubdata_dir = os.path.dirname(__file__) + '/stubdata/' expected_dois = [['10.53846/goediss-8502'], None, ['10.3847/1538-4365/aab760'], None] for filename, doi in zip(eprint_filenames, expected_dois): fullpath = stubdata_dir + filename with open(fullpath, 'rb') as arxiv_fp: - metadata = self.match_metadata.ARXIV_PARSER.parse(arxiv_fp) + metadata = get_pub_metadata(arxiv_fp.read()) + metadata, _, _, _ = self.match_metadata.parse_arXiv_comments(metadata) self.assertEqual(self.match_metadata.ORACLE_UTIL.extract_doi(metadata), doi) def test_read_google_sheet(self): diff --git a/requirements.txt b/requirements.txt index 307dcc8..70571ef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -git+https://github.com/adsabs/adsabs-pyingest.git@v1.2.2 git+https://github.com/adsabs/ADSGoogleConnector.git@v0.0.3 adsputils==1.4.3 numpy==1.24.2