Skip to content

Commit

Permalink
Merge pull request nltk#1719 from alvations/develop
Browse files Browse the repository at this point in the history
LazyCorpusLoader entries for corpora that are not distributed with NLTK should be commented out.
  • Loading branch information
stevenbird authored May 12, 2017
2 parents 0aeaa58 + 00da7c9 commit 837ec2a
Showing 1 changed file with 25 additions and 16 deletions.
41 changes: 25 additions & 16 deletions nltk/corpus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,6 @@
('.*', 'utf_8')])
gutenberg = LazyCorpusLoader(
'gutenberg', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin1')
# corpus not available with NLTK; these lines caused help(nltk.corpus) to break
#hebrew_treebank = LazyCorpusLoader(
# 'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')
ieer = LazyCorpusLoader(
'ieer', IEERCorpusReader, r'(?!README|\.).*')
inaugural = LazyCorpusLoader(
Expand All @@ -131,8 +128,7 @@
indian = LazyCorpusLoader(
'indian', IndianCorpusReader, r'(?!\.).*\.pos',
tagset='unknown', encoding='utf8')
# ipipan = LazyCorpusLoader(
# 'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')

jeita = LazyCorpusLoader(
'jeita', ChasenCorpusReader, r'.*\.chasen', encoding='utf-8')
knbc = LazyCorpusLoader(
Expand All @@ -156,19 +152,11 @@
'mte_teip5', MTECorpusReader, r'(oana).*\.xml', encoding="utf-8")
names = LazyCorpusLoader(
'names', WordListCorpusReader, r'(?!\.).*\.txt', encoding='ascii')
# nkjp = LazyCorpusLoader(
# 'nkjp', NKJPCorpusReader, r'', encoding='utf8')
nps_chat = LazyCorpusLoader(
'nps_chat', NPSChatCorpusReader, r'(?!README|\.).*\.xml', tagset='wsj')
opinion_lexicon = LazyCorpusLoader(
'opinion_lexicon', OpinionLexiconCorpusReader, r'(\w+)\-words\.txt',
encoding='ISO-8859-2')
panlex_lite = LazyCorpusLoader(
'panlex_lite', PanLexLiteCorpusReader)
# [SB] this breaks in python35 https://github.com/nltk/nltk/issues/1579
# pl196x = LazyCorpusLoader(
# 'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
# cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
ppattach = LazyCorpusLoader(
'ppattach', PPAttachmentCorpusReader, ['training', 'test', 'devset'])
product_reviews_1 = LazyCorpusLoader(
Expand Down Expand Up @@ -253,8 +241,7 @@
'wordnet_ic', WordNetICCorpusReader, '.*\.dat')
words = LazyCorpusLoader(
'words', WordListCorpusReader, r'(?!README|\.).*', encoding='ascii')
# ycoe = LazyCorpusLoader(
# 'ycoe', YCOECorpusReader)

# defined after treebank
propbank = LazyCorpusLoader(
'propbank', PropbankCorpusReader,
Expand Down Expand Up @@ -282,13 +269,35 @@

nonbreaking_prefixes = LazyCorpusLoader(
'nonbreaking_prefixes', NonbreakingPrefixesCorpusReader, r'(?!README|\.).*', encoding='utf8')

perluniprops = LazyCorpusLoader(
'perluniprops', UnicharsCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')

# mwa_ppdb = LazyCorpusLoader(
# 'mwa_ppdb', MWAPPDBCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')

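# The corpus loaders below are commented out because their corpora are not
# distributed with NLTK.
# See https://github.com/nltk/nltk/issues/1579
# and https://github.com/nltk/nltk/issues/1716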
#
# pl196x = LazyCorpusLoader(
# 'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
# cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
#
# ipipan = LazyCorpusLoader(
# 'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')
#
# nkjp = LazyCorpusLoader(
# 'nkjp', NKJPCorpusReader, r'', encoding='utf8')
#
# panlex_lite = LazyCorpusLoader(
# 'panlex_lite', PanLexLiteCorpusReader)
#
# ycoe = LazyCorpusLoader(
# 'ycoe', YCOECorpusReader)
#
# corpus not available with NLTK; these lines caused help(nltk.corpus) to break
# hebrew_treebank = LazyCorpusLoader(
# 'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')


def demo():
# This is out-of-date:
Expand Down

0 comments on commit 837ec2a

Please sign in to comment.