Skip to content

Commit

Permalink
include year with parsed xml string, fix unittests
Browse files Browse the repository at this point in the history
  • Loading branch information
gtfierro committed Jul 8, 2013
1 parent 9d90caf commit 8e17417
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 29 deletions.
37 changes: 26 additions & 11 deletions parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,14 @@
import lib.handlers.grant_handler as grant_handler
import lib.patSQL as patSQL
import lib.argconfig_parse as argconfig_parse
from lib.config_parser import get_xml_handlers

xmlclasses = [patSQL.AssigneeXML, patSQL.CitationXML, patSQL.ClassXML, \
patSQL.InventorXML, patSQL.PatentXML, patSQL.PatdescXML, \
patSQL.LawyerXML, patSQL.ScirefXML, patSQL.UsreldocXML]

regex = re.compile(r"""([<][?]xml version.*?[>]\s*[<][!]DOCTYPE\s+([A-Za-z-]+)\s+.*?/\2[>])""", re.S+re.I)
xmlhandlers = get_xml_handlers('process.cfg')

def list_files(patentroot, xmlregex):
"""
Expand All @@ -30,18 +32,29 @@ def list_files(patentroot, xmlregex):
sys.exit(1)
return files

def _get_year(filename, dateformat='ipg%y%m%d.xml'):
"""
Given a [filename], returns the expanded year.
The optional [dateformat] argument allows for different file formats
"""
filename = re.match(r'ipg\d{6}',filename)
if not filename: return 'default'
filename = filename.group() + '.xml'
return datetime.datetime.strptime(filename, dateformat).year

def extract_xml_strings(filename):
"""
Given a [filename], opens the file using mmap and returns a list of all XML strings
contained in the file.
Given a [filename], opens the file using mmap and returns a list of tuples.
Each tuple is of format (year, xml doc string). A tuple is returned for
every valid XML doc in [filename]
"""
if not filename: return
parsed_xmls = []
size = os.stat(filename).st_size
logging.debug("Parsing file: {0}".format(filename))
with open(filename,'r') as f:
with contextlib.closing(mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)) as m:
res = [x[0] for x in regex.findall(m)]
res = [(_get_year(filename), x[0]) for x in regex.findall(m)]
parsed_xmls.extend(res)
return parsed_xmls

Expand All @@ -54,27 +67,29 @@ def parse_files(filelist):
parsed = itertools.imap(extract_xml_strings, filelist)
return itertools.chain.from_iterable(parsed)

def apply_xmlclass(xmlstring):
def apply_xmlclass(xmltuple):
"""
Parses an xml string given as [xmlstring] with the appropriate parser
and returns the patSQL.*XML formulations of it. Expect this to change
Parses the xml string in [xmltuple], a (year, xml string) tuple, with the
parser registered for that year and returns the patSQL.*XML formulations of
it. Expect this to change when we integrate Ron's SQLAlchemy stuff
"""
parsed_grants = []
try:
patobj = grant_handler.PatentGrant(xmlstring, True)
year, xml = xmltuple # extract out the parts of the tuple
patobj = xmlhandlers[year].PatentGrant(xml, True)
for xmlclass in xmlclasses:
parsed_grants.append(xmlclass(patobj))
except Exception as inst:
logging.error(type(inst))
logging.error(" - Error parsing patent: %s" % (xmlstring[:400]))
logging.error(" - Error parsing patent: %s" % (xml[0][:400]))
return parsed_grants

def parse_patents(xmlstrings):
def parse_patents(xmltuples):
"""
Given a list of xml strings as [xmlstrings], parses them
Given a list of (year, xml string) tuples as [xmltuples], parses them
all and returns a flat iterator of patSQL.*XML instances
"""
parsed_grants = itertools.imap(apply_xmlclass, xmlstrings)
parsed_grants = itertools.imap(apply_xmlclass, xmltuples)
# errored patents yield an empty list (not None); we want to get rid of these
parsed_grants = itertools.ifilter(lambda x: x, parsed_grants)
return itertools.chain.from_iterable(parsed_grants)
Expand Down
6 changes: 3 additions & 3 deletions process.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,6 @@ datadir=/Users/gabe
# introduced. If a year cannot be parsed from the filename (the format
# `ipgYYMMDD` is assumed), the default parser is used.
[xml-handlers]
2005-2012=lib.handlers.grant_handler_v4.2
2013=lib.handlers.grant_handler_v4.3
default=lib.handlers.grant_handler_v4.2
2005-2012=lib.handlers.grant_handler_v42
#2013=lib.handlers.grant_handler_v43
default=lib.handlers.grant_handler_v42
37 changes: 22 additions & 15 deletions test/test_parse_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,43 +31,49 @@ def test_extract_xml_strings_one(self):
parsed_output = parse.extract_xml_strings(testdir+testfileone)
self.assertTrue(isinstance(parsed_output, list))
self.assertTrue(len(parsed_output) == 1)
self.assertTrue(isinstance(parsed_output[0], str))
self.assertTrue(regex.match(parsed_output[0]))
self.assertTrue(isinstance(parsed_output[0], tuple))
self.assertTrue(isinstance(parsed_output[0][1], str))
self.assertTrue(regex.match(parsed_output[0][1]))

def test_parse_files_one(self):
filelist = [testdir+testfileone]
parsed_output = parse.parse_files(filelist)
self.assertTrue(isinstance(parsed_output,Iterable))
parsed_output = list(parsed_output)
self.assertTrue(len(parsed_output) == 1)
self.assertTrue(isinstance(parsed_output[0], str))
self.assertTrue(regex.match(parsed_output[0]))
self.assertTrue(isinstance(parsed_output[0], tuple))
self.assertTrue(isinstance(parsed_output[0][1], str))
self.assertTrue(regex.match(parsed_output[0][1]))

def test_extract_xml_strings_two(self):
parsed_output = parse.extract_xml_strings(testdir+testfiletwo)
self.assertTrue(isinstance(parsed_output, Iterable))
parsed_output = list(parsed_output)
self.assertTrue(len(parsed_output) == 2)
self.assertTrue(isinstance(parsed_output[0], str))
self.assertTrue(isinstance(parsed_output[1], str))
self.assertTrue(regex.match(parsed_output[0]))
self.assertTrue(regex.match(parsed_output[1]))
self.assertTrue(isinstance(parsed_output[0], tuple))
self.assertTrue(isinstance(parsed_output[0][1], str))
self.assertTrue(isinstance(parsed_output[1], tuple))
self.assertTrue(isinstance(parsed_output[1][1], str))
self.assertTrue(regex.match(parsed_output[0][1]))
self.assertTrue(regex.match(parsed_output[1][1]))

def test_parse_files_two(self):
filelist = [testdir+testfiletwo]
parsed_output = parse.parse_files(filelist)
self.assertTrue(isinstance(parsed_output,Iterable))
parsed_output = list(parsed_output)
self.assertTrue(len(parsed_output) == 2)
self.assertTrue(isinstance(parsed_output[0], str))
self.assertTrue(isinstance(parsed_output[1], str))
self.assertTrue(regex.match(parsed_output[0]))
self.assertTrue(regex.match(parsed_output[1]))
self.assertTrue(isinstance(parsed_output[0], tuple))
self.assertTrue(isinstance(parsed_output[0][1], str))
self.assertTrue(isinstance(parsed_output[1], tuple))
self.assertTrue(isinstance(parsed_output[1][1], str))
self.assertTrue(regex.match(parsed_output[0][1]))
self.assertTrue(regex.match(parsed_output[1][1]))

def test_use_parse_files_one(self):
filelist = [testdir+testfileone]
parsed_output = list(parse.parse_files(filelist))
patobj = PatentGrant(parsed_output[0], True)
patobj = PatentGrant(parsed_output[0][1], True)
parsed_xml = [xmlclass(patobj) for xmlclass in xmlclasses]
self.assertTrue(len(parsed_xml) == len(xmlclasses))
self.assertTrue(all(parsed_xml))
Expand All @@ -77,8 +83,9 @@ def test_use_parse_files_two(self):
parsed_output = parse.parse_files(filelist)
parsed_xml = []
for us_patent_grant in parsed_output:
self.assertTrue(isinstance(us_patent_grant, str))
patobj = PatentGrant(us_patent_grant, True)
self.assertTrue(isinstance(us_patent_grant, tuple))
self.assertTrue(isinstance(us_patent_grant[1], str))
patobj = PatentGrant(us_patent_grant[1], True)
for xmlclass in xmlclasses:
parsed_xml.append(xmlclass(patobj))
self.assertTrue(len(parsed_xml) == 2 * len(xmlclasses))
Expand Down

0 comments on commit 8e17417

Please sign in to comment.