From 8e1741735b297f80b1e16b0642311861864680ea Mon Sep 17 00:00:00 2001
From: Gabe Fierro <gtfierro225@gmail.com>
Date: Mon, 8 Jul 2013 12:43:14 -0700
Subject: [PATCH] include year with parsed xml string, fix unittests

---
 parse.py                | 37 ++++++++++++++++++++++++++-----------
 process.cfg             |  6 +++---
 test/test_parse_file.py | 37 ++++++++++++++++++++++---------------
 3 files changed, 51 insertions(+), 29 deletions(-)

diff --git a/parse.py b/parse.py
index 761531f0..7cca46f7 100755
--- a/parse.py
+++ b/parse.py
@@ -11,12 +11,14 @@
 import lib.handlers.grant_handler as grant_handler
 import lib.patSQL as patSQL
 import lib.argconfig_parse as argconfig_parse
+from lib.config_parser import get_xml_handlers
 
 xmlclasses = [patSQL.AssigneeXML, patSQL.CitationXML, patSQL.ClassXML, \
               patSQL.InventorXML, patSQL.PatentXML, patSQL.PatdescXML, \
               patSQL.LawyerXML, patSQL.ScirefXML, patSQL.UsreldocXML]
 
 regex = re.compile(r"""([<][?]xml version.*?[>]\s*[<][!]DOCTYPE\s+([A-Za-z-]+)\s+.*?/\2[>])""", re.S+re.I)
+xmlhandlers = get_xml_handlers('process.cfg')
 
 def list_files(patentroot, xmlregex):
     """
@@ -30,10 +32,21 @@ def list_files(patentroot, xmlregex):
         sys.exit(1)
     return files
 
+def _get_year(filename, dateformat='ipg%y%m%d.xml'):
+    """
+    Given a [filename] (bare name or path), returns the year as an int, or 'default'
+    if no valid `ipgYYMMDD` date is found. [dateformat] is the strptime() format.
+    """
+    stem = re.search(r'ipg\d{6}(?=[^/\\]*$)', filename) # last path component only
+    if not stem: return 'default'
+    try: return datetime.datetime.strptime(stem.group() + '.xml', dateformat).year
+    except ValueError: return 'default' # e.g. month 13: fall back to default parser
+
 def extract_xml_strings(filename):
     """
-    Given a [filename], opens the file using mmap and returns a list of all XML strings
-    contained in the file.
+    Given a [filename], opens the file using mmap and returns a list of tuples.
+    Each tuple is of format (year, xml doc string). A tuple is returned for
+    every valid XML doc in [filename]
     """
     if not filename: return
     parsed_xmls = []
@@ -41,7 +54,7 @@ def extract_xml_strings(filename):
     logging.debug("Parsing file: {0}".format(filename))
     with open(filename,'r') as f:
         with contextlib.closing(mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)) as m:
-            res = [x[0] for x in regex.findall(m)]
+            res = [(_get_year(filename), x[0]) for x in regex.findall(m)]
             parsed_xmls.extend(res)
     return parsed_xmls
 
@@ -54,27 +67,29 @@ def parse_files(filelist):
     parsed = itertools.imap(extract_xml_strings, filelist)
     return itertools.chain.from_iterable(parsed)
 
-def apply_xmlclass(xmlstring):
+def apply_xmlclass(xmltuple):
     """
-    Parses an xml string given as [xmlstring] with the appropriate parser
-    and returns the patSQL.*XML formulations of it. Expect this to change
+    Parses the (year, xml string) pair given as [xmltuple] with the parser
+    registered for that year and returns the patSQL.*XML formulations of it.
+    Expect this to change when we integrate Ron's SQLAlchemy stuff
     """
     parsed_grants = []
     try:
-        patobj = grant_handler.PatentGrant(xmlstring, True)
+        year, xml = xmltuple # extract out the parts of the tuple
+        patobj = xmlhandlers[year].PatentGrant(xml, True)
         for xmlclass in xmlclasses:
             parsed_grants.append(xmlclass(patobj))
     except Exception as inst:
         logging.error(type(inst))
-        logging.error("  - Error parsing patent: %s" % (xmlstring[:400]))
+        logging.error("  - Error parsing patent: %s" % (xml[:400]))
     return parsed_grants
 
-def parse_patents(xmlstrings):
+def parse_patents(xmltuples):
     """
-    Given a list of xml strings as [xmlstrings], parses them
+    Given a list of (year, xml string) tuples as [xmltuples], parses them
     all and returns a flat iterator of patSQL.*XML instances
     """
-    parsed_grants = itertools.imap(apply_xmlclass, xmlstrings)
+    parsed_grants = itertools.imap(apply_xmlclass, xmltuples)
     # errored patents return None; we want to get rid of these
     parsed_grants = itertools.ifilter(lambda x: x, parsed_grants)
     return itertools.chain.from_iterable(parsed_grants)
diff --git a/process.cfg b/process.cfg
index 4884cf7e..7e2667fa 100644
--- a/process.cfg
+++ b/process.cfg
@@ -69,6 +69,6 @@ datadir=/Users/gabe
 # introduced. In the case where a year cannot be parsed from the filename (the
 # format `ipgYYMMDD` is assumed), then the default parser is used.
 [xml-handlers]
-2005-2012=lib.handlers.grant_handler_v4.2
-2013=lib.handlers.grant_handler_v4.3
-default=lib.handlers.grant_handler_v4.2
+2005-2012=lib.handlers.grant_handler_v42
+#2013=lib.handlers.grant_handler_v43
+default=lib.handlers.grant_handler_v42
diff --git a/test/test_parse_file.py b/test/test_parse_file.py
index 50279387..1d8e839e 100755
--- a/test/test_parse_file.py
+++ b/test/test_parse_file.py
@@ -31,8 +31,9 @@ def test_extract_xml_strings_one(self):
         parsed_output = parse.extract_xml_strings(testdir+testfileone)
         self.assertTrue(isinstance(parsed_output, list))
         self.assertTrue(len(parsed_output) == 1)
-        self.assertTrue(isinstance(parsed_output[0], str))
-        self.assertTrue(regex.match(parsed_output[0]))
+        self.assertTrue(isinstance(parsed_output[0], tuple))
+        self.assertTrue(isinstance(parsed_output[0][1], str))
+        self.assertTrue(regex.match(parsed_output[0][1]))
 
     def test_parse_files_one(self):
         filelist = [testdir+testfileone]
@@ -40,18 +41,21 @@ def test_parse_files_one(self):
         self.assertTrue(isinstance(parsed_output,Iterable))
         parsed_output = list(parsed_output)
         self.assertTrue(len(parsed_output) == 1)
-        self.assertTrue(isinstance(parsed_output[0], str))
-        self.assertTrue(regex.match(parsed_output[0]))
+        self.assertTrue(isinstance(parsed_output[0], tuple))
+        self.assertTrue(isinstance(parsed_output[0][1], str))
+        self.assertTrue(regex.match(parsed_output[0][1]))
 
     def test_extract_xml_strings_two(self):
         parsed_output = parse.extract_xml_strings(testdir+testfiletwo)
         self.assertTrue(isinstance(parsed_output, Iterable))
         parsed_output = list(parsed_output)
         self.assertTrue(len(parsed_output) == 2)
-        self.assertTrue(isinstance(parsed_output[0], str))
-        self.assertTrue(isinstance(parsed_output[1], str))
-        self.assertTrue(regex.match(parsed_output[0]))
-        self.assertTrue(regex.match(parsed_output[1]))
+        self.assertTrue(isinstance(parsed_output[0], tuple))
+        self.assertTrue(isinstance(parsed_output[0][1], str))
+        self.assertTrue(isinstance(parsed_output[1], tuple))
+        self.assertTrue(isinstance(parsed_output[1][1], str))
+        self.assertTrue(regex.match(parsed_output[0][1]))
+        self.assertTrue(regex.match(parsed_output[1][1]))
 
     def test_parse_files_two(self):
         filelist = [testdir+testfiletwo]
@@ -59,15 +63,17 @@ def test_parse_files_two(self):
         self.assertTrue(isinstance(parsed_output,Iterable))
         parsed_output = list(parsed_output)
         self.assertTrue(len(parsed_output) == 2)
-        self.assertTrue(isinstance(parsed_output[0], str))
-        self.assertTrue(isinstance(parsed_output[1], str))
-        self.assertTrue(regex.match(parsed_output[0]))
-        self.assertTrue(regex.match(parsed_output[1]))
+        self.assertTrue(isinstance(parsed_output[0], tuple))
+        self.assertTrue(isinstance(parsed_output[0][1], str))
+        self.assertTrue(isinstance(parsed_output[1], tuple))
+        self.assertTrue(isinstance(parsed_output[1][1], str))
+        self.assertTrue(regex.match(parsed_output[0][1]))
+        self.assertTrue(regex.match(parsed_output[1][1]))
     
     def test_use_parse_files_one(self):
         filelist = [testdir+testfileone]
         parsed_output = list(parse.parse_files(filelist))
-        patobj = PatentGrant(parsed_output[0], True)
+        patobj = PatentGrant(parsed_output[0][1], True)
         parsed_xml = [xmlclass(patobj) for xmlclass in xmlclasses]
         self.assertTrue(len(parsed_xml) == len(xmlclasses))
         self.assertTrue(all(parsed_xml))
@@ -77,8 +83,9 @@ def test_use_parse_files_two(self):
         parsed_output = parse.parse_files(filelist)
         parsed_xml = []
         for us_patent_grant in parsed_output:
-            self.assertTrue(isinstance(us_patent_grant, str))
-            patobj = PatentGrant(us_patent_grant, True)
+            self.assertTrue(isinstance(us_patent_grant, tuple))
+            self.assertTrue(isinstance(us_patent_grant[1], str))
+            patobj = PatentGrant(us_patent_grant[1], True)
             for xmlclass in xmlclasses:
                 parsed_xml.append(xmlclass(patobj))
         self.assertTrue(len(parsed_xml) == 2 * len(xmlclasses))