Skip to content

Commit

Permalink
Merge pull request #28 from golnazads/master
Browse files Browse the repository at this point in the history
arXiv source metadata is switched, and pyingest dropped
  • Loading branch information
golnazads authored Dec 18, 2023
2 parents 8c8fb27 + a186396 commit 46d01fa
Show file tree
Hide file tree
Showing 18 changed files with 252 additions and 249 deletions.
88 changes: 51 additions & 37 deletions adsdocmatch/match_w_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from adsdocmatch.pub_parser import get_pub_metadata
from adsdocmatch.oracle_util import OracleUtil
from adsdocmatch.matchable_status import matchable_status
from pyingest.parsers.arxiv import ArxivParser
from adsputils import setup_logging, load_config

proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), "../"))
Expand All @@ -30,7 +29,6 @@ class MatchMetadata():

process_pub_bibstem = {}

ARXIV_PARSER = ArxivParser()
ORACLE_UTIL = OracleUtil()

def get_input_filenames(self, filename):
Expand Down Expand Up @@ -85,6 +83,45 @@ def process_pub_metadata(self, metadata):
self.process_pub_bibstem[bibstem] = 1 if status == True else (0 if status == False else -1)
return self.process_pub_bibstem[bibstem]

def parse_arXiv_comments(self, metadata):
    """
    Derive the oracle matching hints from a record's arXiv comments.

    When the record carries a non-empty ``arXiv_comments`` value, a doi is
    looked for in the comments first (``self.re_doi``), falling back to
    ``metadata['properties']['DOI']``; the title (and, for theses, the
    comments too) is checked against the erratum / book review / thesis
    patterns; and the record's ``class`` entries are compared with
    ``self.MUST_MATCH``. When there are no comments, any ``doi`` is removed
    from the record and no hints are produced.

    :param metadata: dict of parsed record fields; it is updated in place
        (the ``doi`` key may be set or removed)
    :return: tuple ``(metadata, comments, must_match, match_doctype)`` where
        ``comments`` is the comments string ('' when absent), ``must_match``
        is a bool, and ``match_doctype`` is a list of doctypes or None
    """
    comments = metadata.get('arXiv_comments', '')
    if not comments:
        # without arXiv comments the doi is dropped; the earlier inline
        # version of this code noted that oracle would otherwise treat it
        # as the publication doi
        metadata.pop("doi", None)
        return metadata, '', False, None

    # extract doi out of comments if there are any
    doi_match = self.re_doi.search(comments)
    if doi_match:
        metadata['doi'] = doi_match.group(1)
    else:
        doi = metadata.get('properties', {}).get('DOI', None)
        if doi:
            metadata['doi'] = doi.replace('doi:', '')

    # a record without a title must not crash the regex searches below
    title = metadata.get('title') or ''

    # erratum and book review are decided on the title alone; thesis is
    # checked against both comments and title, and only if neither of the
    # first two matched
    if self.re_doctype_errata.search(title):
        match_doctype = ['erratum']
    elif self.re_doctype_bookreview.search(title):
        match_doctype = ['bookreview']
    elif self.re_doctype_thesis.search("%s %s" % (comments, title)):
        match_doctype = ['phdthesis', 'mastersthesis']
    else:
        match_doctype = None

    # true when any MUST_MATCH entry is a substring of any of the record's classes
    must_match = any(
        ads_archive_class in arxiv_class
        for arxiv_class in metadata.get('class', [])
        for ads_archive_class in self.MUST_MATCH
    )
    return metadata, comments, must_match, match_doctype

def write_results(self, result_filename, matches, metadata_filename, rerun_filename):
"""
Expand Down Expand Up @@ -202,41 +239,8 @@ def match_to_pub(self, filename):
"""
try:
with open(filename, 'rb') as arxiv_fp:
journal = filename.strip().split('/')[-5]
if journal == 'ArXiv':
metadata = self.ARXIV_PARSER.parse(arxiv_fp)
comments = ' '.join(metadata.get('comments', []))
# extract doi out of comments if there are any
match = self.re_doi.search(comments)
if match:
metadata['doi'] = match.group(1)
else:
doi = metadata.get('properties', {}).get('DOI', None)
if doi:
metadata['doi'] = doi.replace('doi:', '')
match_doctype = None
title = metadata.get('title')
# check title for erratum
match = self.re_doctype_errata.search(title)
if match:
match_doctype = ['erratum']
else:
match = self.re_doctype_bookreview.search(title)
if match:
match_doctype = ['bookreview']
else:
# check both comments and title for thesis
match = self.re_doctype_thesis.search("%s %s" % (comments, title))
if match:
match_doctype = ['phdthesis', 'mastersthesis']
must_match = any(ads_archive_class in arxiv_class for arxiv_class in metadata.get('class', []) for ads_archive_class in self.MUST_MATCH)
else:
metadata = get_pub_metadata(arxiv_fp.read())
# remove the doi, since in this case, oracle thinks it is the publication doi
metadata.pop("doi", None)
match_doctype = None
must_match = False
comments = ''
metadata = get_pub_metadata(arxiv_fp.read())
metadata, comments, must_match, match_doctype = self.parse_arXiv_comments(metadata)
oracle_matches = self.ORACLE_UTIL.get_matches(metadata, 'eprint', must_match, match_doctype)
# before proceeding see if this arXiv article's class is among the ones that ADS archives the
# published version if available
Expand Down Expand Up @@ -456,3 +460,13 @@ def process_match_to_pub(self, path):
combined_output_filename = "%s%s" % (path, config.get('DOCMATCHPIPELINE_EPRINT_COMBINED_FILENAME', 'default'))
self.merge_classic_docmatch_results(classic_matched_filename, result_filename, combined_output_filename)
return combined_output_filename

if __name__ == '__main__':
    # manual run: match one sample record in both directions and print the results
    sample_record = '/proj/ads/abstracts/gen/text/L48/L48-23288.abs'
    pub_result = MatchMetadata().match_to_pub(sample_record)
    print(pub_result)
    arxiv_result = MatchMetadata().match_to_arXiv(sample_record)
    print(arxiv_result)

'''
/proj/ads/abstracts/gen/text/L52/L52-28159.abs
/proj/ads/abstracts/gen/text/L48/L48-23288.abs
/proj/ads/abstracts/sources/ArXiv/oai/arXiv.org/2306/02768
'''
5 changes: 5 additions & 0 deletions adsdocmatch/pub_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def as_needed(article):
("Publication Date", "pubdate"),
("Bibliographic Code", "bibcode"),
("DOI", "doi"),
("arXiv_comments", "arXiv_comments")
]
return_record = {}
for src_key, dest_key in field_mappings:
Expand Down Expand Up @@ -131,4 +132,8 @@ def get_pub_metadata(contents):
switch_date = article['Publication Date'].split('/')
article['Publication Date'] = switch_date[1] + '/' + switch_date[0]

if 'Origin' in fields_found_in_file:
if article['Origin'] == 'ARXIV':
article['arXiv_comments'] = article.get('Comments', '')

return as_needed(article)
40 changes: 0 additions & 40 deletions adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/0708/1752

This file was deleted.

26 changes: 0 additions & 26 deletions adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1701/00200

This file was deleted.

33 changes: 0 additions & 33 deletions adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/1801/01021

This file was deleted.

43 changes: 0 additions & 43 deletions adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2106/07251

This file was deleted.

47 changes: 0 additions & 47 deletions adsdocmatch/tests/unittests/stubdata/ArXiv/oai/eprints/2312/08579

This file was deleted.

32 changes: 32 additions & 0 deletions adsdocmatch/tests/unittests/stubdata/X01-74270.abs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
Title: Validation of the new Hipparcos reduction
Authors: van Leeuwen, F.
Journal: eprint arXiv:0708.1752
Publication Date: 08/2007
Comments: 12 pages, 19 figures, accepted for publication by Astronomy and Astrophysics;
Astron.Astrophys.474:653-664,2007;
doi:10.1051/0004-6361:20078357
Origin: ARXIV
Keywords: Astrophysics
Bibliographic Code: 2007arXiv0708.1752V

Abstract
Context.A new reduction of the astrometric data as produced by the
Hipparcos mission has been published, claiming accuracies for nearly all
stars brighter than magnitude Hp = 8 to be better, by up to a factor 4,
than in the original catalogue. Aims.The new Hipparcos astrometric
catalogue is checked for the quality of the data and the consistency of
the formal errors as well as the possible presence of error
correlations. The differences with the earlier publication are
explained. Methods. The internal errors are followed through the
reduction process, and the external errors are investigated on the basis
of a comparison with radio observations of a small selection of stars,
and the distribution of negative parallaxes. Error correlation levels
are investigated and the reduction by more than a factor 10 as obtained
in the new catalogue is explained. Results.The formal errors on the
parallaxes for the new catalogue are confirmed. The presence of a small
amount of additional noise, though unlikely, cannot be ruled out.
Conclusions. The new reduction of the Hipparcos astrometric data
provides an improvement by a factor 2.2 in the total weight compared to
the catalogue published in 1997, and provides much improved data for a
wide range of studies on stellar luminosities and local galactic
kinematics.
17 changes: 17 additions & 0 deletions adsdocmatch/tests/unittests/stubdata/X10-50737.abs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
Title: Post-Lie algebra structures on the Witt algebra
Authors: Tang, Xiaomin
Journal: eprint arXiv:1701.00200
Publication Date: 01/2017
Comments: 24 pages
Origin: ARXIV
Keywords: Mathematics - Rings and Algebras, 17A30, 17A42,
17B60, 18D50
Bibliographic Code: 2017arXiv170100200T

Abstract
In this paper, we characterize the graded post-Lie algebra structures
and a class of shifting post-Lie algebra structures on the Witt algebra.
We obtain some new Lie algebras and give a class of their modules. As an
application, the homogeneous Rota-Baxter operators and a class of
non-homogeneous Rota-Baxter operators of weight $1$ on the Witt algebra
are studied.
Loading

0 comments on commit 46d01fa

Please sign in to comment.