From 104adb2d7d214c8bce3b1a465cb30c8e0f70495c Mon Sep 17 00:00:00 2001 From: Steve Majewski Date: Fri, 8 Feb 2019 13:09:02 -0500 Subject: [PATCH 1/2] add recover option to be more lenient with bad metadata payloads --- src/oaipmh/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/oaipmh/client.py b/src/oaipmh/client.py index 2b0c611..daffa79 100644 --- a/src/oaipmh/client.py +++ b/src/oaipmh/client.py @@ -107,7 +107,7 @@ def ignoreBadCharacters(self, true_or_false): """ self._ignore_bad_character_hack = true_or_false - def parse(self, xml): + def parse(self, xml, recover=False): """Parse the XML to a lxml tree. """ # XXX this is only safe for UTF-8 encoded content, @@ -122,7 +122,7 @@ def parse(self, xml): if hasattr(xml, "encode"): xml = xml.encode("utf-8") # xml = xml.encode("utf-8") - return etree.XML(xml) + return etree.XML(xml, etree.XMLParser(recover=recover)) # implementation of the various methods, delegated here by # handleVerb method From bf1b7212bef2b510088905e89d09eceef7bd6453 Mon Sep 17 00:00:00 2001 From: Steve Majewski Date: Fri, 5 Apr 2019 17:22:36 -0400 Subject: [PATCH 2/2] on SyntaxError, log error and retry with parser option recover=True --- src/oaipmh/client.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/oaipmh/client.py b/src/oaipmh/client.py index daffa79..b342b76 100644 --- a/src/oaipmh/client.py +++ b/src/oaipmh/client.py @@ -17,6 +17,8 @@ from lxml import etree import time import codecs +import logging +logger = logging.getLogger( __name__ ) from oaipmh import common, metadata, validation, error from oaipmh.datestamp import datestamp_to_datetime, datetime_to_datestamp @@ -300,8 +302,20 @@ def makeRequestErrorHandling(self, **kw): xml = self.makeRequest(**kw) try: tree = self.parse(xml) - except SyntaxError: - raise error.XMLSyntaxError(kw) + except SyntaxError as e: + try: # try again + logger.warning( e ) + logger.warning( 'attempting to recover...') + tree = self.parse(xml, recover=True ) + ids = tree.xpath( '/oai:OAI-PMH/oai:ListRecords/oai:record/oai:header/oai:identifier', namespaces=self.getNamespaces() ) + if len(ids) == 1: + logger.warning("Recoverable parse error on: {0}".format( ids[0].text )) + else: + logger.warning( + "Recoverable parse error on one or more of:\n {0}".format( + "\n".join([ id.text for id in ids ]))) + except SyntaxError: # can't recover + raise error.XMLSyntaxError(kw) # check whether there are errors first e_errors = tree.xpath('/oai:OAI-PMH/oai:error', namespaces=self.getNamespaces()) @@ -320,6 +334,7 @@ def makeRequestErrorHandling(self, **kw): code, msg)) # find exception in error module and raise with msg raise getattr(error, code[0].upper() + code[1:] + 'Error')(msg) + return tree def makeRequest(self, **kw):