From 8e1741735b297f80b1e16b0642311861864680ea Mon Sep 17 00:00:00 2001 From: Gabe Fierro Date: Mon, 8 Jul 2013 12:43:14 -0700 Subject: [PATCH] include year with parsed xml string, fix unittests --- parse.py | 37 ++++++++++++++++++++++++++----------- process.cfg | 6 +++--- test/test_parse_file.py | 37 ++++++++++++++++++++++--------------- 3 files changed, 51 insertions(+), 29 deletions(-) diff --git a/parse.py b/parse.py index 761531f0..7cca46f7 100755 --- a/parse.py +++ b/parse.py @@ -11,12 +11,14 @@ import lib.handlers.grant_handler as grant_handler import lib.patSQL as patSQL import lib.argconfig_parse as argconfig_parse +from lib.config_parser import get_xml_handlers xmlclasses = [patSQL.AssigneeXML, patSQL.CitationXML, patSQL.ClassXML, \ patSQL.InventorXML, patSQL.PatentXML, patSQL.PatdescXML, \ patSQL.LawyerXML, patSQL.ScirefXML, patSQL.UsreldocXML] regex = re.compile(r"""([<][?]xml version.*?[>]\s*[<][!]DOCTYPE\s+([A-Za-z-]+)\s+.*?/\2[>])""", re.S+re.I) +xmlhandlers = get_xml_handlers('process.cfg') def list_files(patentroot, xmlregex): """ @@ -30,10 +32,21 @@ def list_files(patentroot, xmlregex): sys.exit(1) return files +def _get_year(filename, dateformat='ipg%y%m%d.xml'): + """ + Given a [filename], returns the expanded year. + The optional [dateformat] argument allows for different file formats + """ + filename = re.match(r'ipg\d{6}',filename) + if not filename: return 'default' + filename = filename.group() + '.xml' + return datetime.datetime.strptime(filename, dateformat).year + def extract_xml_strings(filename): """ - Given a [filename], opens the file using mmap and returns a list of all XML strings - contained in the file. + Given a [filename], opens the file using mmap and returns a list of tuples. + Each tuple is of format (year, xml doc string). 
A tuple is returned for + every valid XML doc in [filename] """ if not filename: return parsed_xmls = [] @@ -41,7 +54,7 @@ def extract_xml_strings(filename): logging.debug("Parsing file: {0}".format(filename)) with open(filename,'r') as f: with contextlib.closing(mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)) as m: - res = [x[0] for x in regex.findall(m)] + res = [(_get_year(filename), x[0]) for x in regex.findall(m)] parsed_xmls.extend(res) return parsed_xmls @@ -54,27 +67,29 @@ def parse_files(filelist): parsed = itertools.imap(extract_xml_strings, filelist) return itertools.chain.from_iterable(parsed) -def apply_xmlclass(xmlstring): +def apply_xmlclass(xmltuple): """ - Parses an xml string given as [xmlstring] with the appropriate parser - and returns the patSQL.*XML formulations of it. Expect this to change + Parses an xml string given as [xmltuple] with the appropriate parser (given + by the first part of the tuple) and returns the patSQL.*XML formulations of + it. Expect this to change when we integrate Ron's SQLAlchemy stuff """ parsed_grants = [] try: - patobj = grant_handler.PatentGrant(xmlstring, True) + year, xml = xmltuple # extract out the parts of the tuple + patobj = xmlhandlers[year].PatentGrant(xml, True) for xmlclass in xmlclasses: parsed_grants.append(xmlclass(patobj)) except Exception as inst: logging.error(type(inst)) - logging.error(" - Error parsing patent: %s" % (xmlstring[:400])) + logging.error(" - Error parsing patent: %s" % (xml[0][:400])) return parsed_grants -def parse_patents(xmlstrings): +def parse_patents(xmltuples): """ - Given a list of xml strings as [xmlstrings], parses them + Given a list of (year, xml string) tuples as [xmltuples], parses them all and returns a flat iterator of patSQL.*XML instances """ - parsed_grants = itertools.imap(apply_xmlclass, xmlstrings) + parsed_grants = itertools.imap(apply_xmlclass, xmltuples) # errored patents return None; we want to get rid of these parsed_grants = itertools.ifilter(lambda x: x, 
parsed_grants) return itertools.chain.from_iterable(parsed_grants) diff --git a/process.cfg b/process.cfg index 4884cf7e..7e2667fa 100644 --- a/process.cfg +++ b/process.cfg @@ -69,6 +69,6 @@ datadir=/Users/gabe # introduced. In the case where a year cannot be parsed from the filename (the # format `ipgYYMMDD` is assumed), then the default parser is used. [xml-handlers] -2005-2012=lib.handlers.grant_handler_v4.2 -2013=lib.handlers.grant_handler_v4.3 -default=lib.handlers.grant_handler_v4.2 +2005-2012=lib.handlers.grant_handler_v42 +#2013=lib.handlers.grant_handler_v43 +default=lib.handlers.grant_handler_v42 diff --git a/test/test_parse_file.py b/test/test_parse_file.py index 50279387..1d8e839e 100755 --- a/test/test_parse_file.py +++ b/test/test_parse_file.py @@ -31,8 +31,9 @@ def test_extract_xml_strings_one(self): parsed_output = parse.extract_xml_strings(testdir+testfileone) self.assertTrue(isinstance(parsed_output, list)) self.assertTrue(len(parsed_output) == 1) - self.assertTrue(isinstance(parsed_output[0], str)) - self.assertTrue(regex.match(parsed_output[0])) + self.assertTrue(isinstance(parsed_output[0], tuple)) + self.assertTrue(isinstance(parsed_output[0][1], str)) + self.assertTrue(regex.match(parsed_output[0][1])) def test_parse_files_one(self): filelist = [testdir+testfileone] @@ -40,18 +41,21 @@ def test_parse_files_one(self): self.assertTrue(isinstance(parsed_output,Iterable)) parsed_output = list(parsed_output) self.assertTrue(len(parsed_output) == 1) - self.assertTrue(isinstance(parsed_output[0], str)) - self.assertTrue(regex.match(parsed_output[0])) + self.assertTrue(isinstance(parsed_output[0], tuple)) + self.assertTrue(isinstance(parsed_output[0][1], str)) + self.assertTrue(regex.match(parsed_output[0][1])) def test_extract_xml_strings_two(self): parsed_output = parse.extract_xml_strings(testdir+testfiletwo) self.assertTrue(isinstance(parsed_output, Iterable)) parsed_output = list(parsed_output) self.assertTrue(len(parsed_output) == 2) - 
self.assertTrue(isinstance(parsed_output[0], str)) - self.assertTrue(isinstance(parsed_output[1], str)) - self.assertTrue(regex.match(parsed_output[0])) - self.assertTrue(regex.match(parsed_output[1])) + self.assertTrue(isinstance(parsed_output[0], tuple)) + self.assertTrue(isinstance(parsed_output[0][1], str)) + self.assertTrue(isinstance(parsed_output[1], tuple)) + self.assertTrue(isinstance(parsed_output[1][1], str)) + self.assertTrue(regex.match(parsed_output[0][1])) + self.assertTrue(regex.match(parsed_output[1][1])) def test_parse_files_two(self): filelist = [testdir+testfiletwo] @@ -59,15 +63,17 @@ def test_parse_files_two(self): self.assertTrue(isinstance(parsed_output,Iterable)) parsed_output = list(parsed_output) self.assertTrue(len(parsed_output) == 2) - self.assertTrue(isinstance(parsed_output[0], str)) - self.assertTrue(isinstance(parsed_output[1], str)) - self.assertTrue(regex.match(parsed_output[0])) - self.assertTrue(regex.match(parsed_output[1])) + self.assertTrue(isinstance(parsed_output[0], tuple)) + self.assertTrue(isinstance(parsed_output[0][1], str)) + self.assertTrue(isinstance(parsed_output[1], tuple)) + self.assertTrue(isinstance(parsed_output[1][1], str)) + self.assertTrue(regex.match(parsed_output[0][1])) + self.assertTrue(regex.match(parsed_output[1][1])) def test_use_parse_files_one(self): filelist = [testdir+testfileone] parsed_output = list(parse.parse_files(filelist)) - patobj = PatentGrant(parsed_output[0], True) + patobj = PatentGrant(parsed_output[0][1], True) parsed_xml = [xmlclass(patobj) for xmlclass in xmlclasses] self.assertTrue(len(parsed_xml) == len(xmlclasses)) self.assertTrue(all(parsed_xml)) @@ -77,8 +83,9 @@ def test_use_parse_files_two(self): parsed_output = parse.parse_files(filelist) parsed_xml = [] for us_patent_grant in parsed_output: - self.assertTrue(isinstance(us_patent_grant, str)) - patobj = PatentGrant(us_patent_grant, True) + self.assertTrue(isinstance(us_patent_grant, tuple)) + 
self.assertTrue(isinstance(us_patent_grant[1], str)) + patobj = PatentGrant(us_patent_grant[1], True) for xmlclass in xmlclasses: parsed_xml.append(xmlclass(patobj)) self.assertTrue(len(parsed_xml) == 2 * len(xmlclasses))