Skip to content

Commit

Permalink
Merge pull request nltk#596 from dimazest/develop
Browse files (browse the repository at this point in the history)
pep8 fixes in the BNC reader.
  • Loading branch information
stevenbird committed Feb 6, 2014
2 parents 50a5d96 + 49a1fe6 commit 912f8e6
Showing 1 changed file with 49 additions and 35 deletions.
84 changes: 49 additions & 35 deletions nltk/corpus/reader/bnc.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,11 @@
"""
__docformat__ = 'epytext en'

import re

import xml.etree.ElementTree as ET

from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.xmldocs import *


class BNCCorpusReader(XMLCorpusReader):
"""
Corpus reader for the XML version of the British National Corpus.
Expand Down Expand Up @@ -61,8 +58,10 @@ def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False):
word tokens. Otherwise, leave the spaces on the tokens.
:param stem: If true, then use word stems instead of word strings.
"""
if c5: tag = 'c5'
else: tag = 'pos'
if c5:
tag = 'c5'
else:
tag = 'pos'
if self._lazy:
return concat([BNCWordView(fileid, False, tag, strip_space, stem)
for fileid in self.abspaths(fileids)])
Expand Down Expand Up @@ -101,8 +100,10 @@ def tagged_sents(self, fileids=None, c5=False, strip_space=True,
word tokens. Otherwise, leave the spaces on the tokens.
:param stem: If true, then use word stems instead of word strings.
"""
if c5: tag = 'c5'
else: tag = 'pos'
if c5:
tag = 'c5'
else:
tag = 'pos'
if self._lazy:
return concat([BNCWordView(fileid, True, tag, strip_space, stem)
for fileid in self.abspaths(fileids)])
Expand All @@ -129,9 +130,11 @@ def _words(self, fileid, bracket_sent, tag, strip_space, stem):
for xmlword in _all_xmlwords_in(xmlsent):
word = xmlword.text
if not word:
word = "" # fixes issue 337?
if strip_space or stem: word = word.strip()
if stem: word = xmlword.get('hw', word)
word = "" # fixes issue 337?
if strip_space or stem:
word = word.strip()
if stem:
word = xmlword.get('hw', word)
if tag == 'c5':
word = (word, xmlword.get('c5'))
elif tag == 'pos':
Expand All @@ -145,13 +148,18 @@ def _words(self, fileid, bracket_sent, tag, strip_space, stem):
assert None not in result
return result


def _all_xmlwords_in(elt, result=None):
if result is None: result = []
if result is None:
result = []
for child in elt:
if child.tag in ('c', 'w'): result.append(child)
else: _all_xmlwords_in(child, result)
if child.tag in ('c', 'w'):
result.append(child)
else:
_all_xmlwords_in(child, result)
return result


class BNCSentence(list):
"""
A list of words, augmented by an attribute ``num`` used to record
Expand All @@ -161,6 +169,7 @@ def __init__(self, num, items):
self.num = num
list.__init__(self, items)


class BNCWordView(XMLCorpusView):
"""
A stream backed corpus view specialized for use with the BNC corpus.
Expand All @@ -173,8 +182,10 @@ def __init__(self, fileid, sent, tag, strip_space, stem):
:param strip_space: If true, strip spaces from word tokens.
:param stem: If true, then substitute stems for words.
"""
if sent: tagspec = '.*/s'
else: tagspec = '.*/s/(.*/)?(c|w)'
if sent:
tagspec = '.*/s'
else:
tagspec = '.*/s/(.*/)?(c|w)'
self._sent = sent
self._tag = tag
self._strip_space = strip_space
Expand All @@ -190,39 +201,43 @@ def __init__(self, fileid, sent, tag, strip_space, stem):
# Reset tag context.
self._tag_context = {0: ()}


title = None #: Title of the document.
author = None #: Author of the document.
editor = None #: Editor
resps = None #: Statement of responsibility
title = None #: Title of the document.
author = None #: Author of the document.
editor = None #: Editor
resps = None #: Statement of responsibility

def handle_header(self, elt, context):
# Set up some metadata!
titles = elt.findall('titleStmt/title')
if titles: self.title = '\n'.join(
[title.text.strip() for title in titles])
if titles:
self.title = '\n'.join(title.text.strip() for title in titles)

authors = elt.findall('titleStmt/author')
if authors: self.author = '\n'.join(
[author.text.strip() for author in authors])
if authors:
self.author = '\n'.join(author.text.strip() for author in authors)

editors = elt.findall('titleStmt/editor')
if editors: self.editor = '\n'.join(
[editor.text.strip() for editor in editors])
if editors:
self.editor = '\n'.join(editor.text.strip() for editor in editors)

resps = elt.findall('titleStmt/respStmt')
if resps: self.resps = '\n\n'.join(
'\n'.join(resp_elt.text.strip() for resp_elt in resp)
for resp in resps)
if resps:
self.resps = '\n\n'.join(
'\n'.join(
resp_elt.text.strip() for resp_elt in resp
) for resp in resps
)

def handle_elt(self, elt, context):
if self._sent: return self.handle_sent(elt)
else: return self.handle_word(elt)
if self._sent:
return self.handle_sent(elt)
else:
return self.handle_word(elt)

def handle_word(self, elt):
word = elt.text
if not word:
word = "" # fixes issue 337?
word = "" # fixes issue 337?
if self._strip_space or self._stem:
word = word.strip()
if self._stem:
Expand All @@ -238,9 +253,8 @@ def handle_sent(self, elt):
for child in elt:
if child.tag == 'mw':
sent += [self.handle_word(w) for w in child]
elif child.tag in ('w','c'):
elif child.tag in ('w', 'c'):
sent.append(self.handle_word(child))
else:
raise ValueError('Unexpected element %s' % child.tag)
return BNCSentence(elt.attrib['n'], sent)

0 comments on commit 912f8e6

Please sign in to comment.