From 00da7c9d69f83ad07882b63168439c1147d6e9bf Mon Sep 17 00:00:00 2001
From: alvations
Date: Fri, 12 May 2017 18:58:12 +0800
Subject: [PATCH] commented out LazyCorpusLoader for non-nltk distributed corpora

---
NOTE(review): the line structure of this patch was restored from a copy in
which every newline had been collapsed to a space. The number of context,
added and removed lines in each hunk was re-checked against its hunk header
and against the diffstat below. Indentation inside the Python lines and the
position of blank context lines in the last hunk are reconstructed, not
verified; confirm against the target file before applying.

 nltk/corpus/__init__.py | 41 +++++++++++++++++++++++++----------------
 1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/nltk/corpus/__init__.py b/nltk/corpus/__init__.py
index d9a57b24a0..d9ccb542ee 100644
--- a/nltk/corpus/__init__.py
+++ b/nltk/corpus/__init__.py
@@ -120,9 +120,6 @@
         ('.*', 'utf_8')])
 gutenberg = LazyCorpusLoader(
     'gutenberg', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin1')
-# corpus not available with NLTK; these lines caused help(nltk.corpus) to break
-#hebrew_treebank = LazyCorpusLoader(
-#    'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')
 ieer = LazyCorpusLoader(
     'ieer', IEERCorpusReader, r'(?!README|\.).*')
 inaugural = LazyCorpusLoader(
@@ -131,8 +128,7 @@
 indian = LazyCorpusLoader(
     'indian', IndianCorpusReader, r'(?!\.).*\.pos',
     tagset='unknown', encoding='utf8')
-# ipipan = LazyCorpusLoader(
-#     'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')
+
 jeita = LazyCorpusLoader(
     'jeita', ChasenCorpusReader, r'.*\.chasen', encoding='utf-8')
 knbc = LazyCorpusLoader(
@@ -156,19 +152,11 @@
     'mte_teip5', MTECorpusReader, r'(oana).*\.xml', encoding="utf-8")
 names = LazyCorpusLoader(
     'names', WordListCorpusReader, r'(?!\.).*\.txt', encoding='ascii')
-# nkjp = LazyCorpusLoader(
-#     'nkjp', NKJPCorpusReader, r'', encoding='utf8')
 nps_chat = LazyCorpusLoader(
     'nps_chat', NPSChatCorpusReader, r'(?!README|\.).*\.xml', tagset='wsj')
 opinion_lexicon = LazyCorpusLoader(
     'opinion_lexicon', OpinionLexiconCorpusReader, r'(\w+)\-words\.txt',
     encoding='ISO-8859-2')
-panlex_lite = LazyCorpusLoader(
-    'panlex_lite', PanLexLiteCorpusReader)
-# [SB] this breaks in python35 https://github.com/nltk/nltk/issues/1579
-# pl196x = LazyCorpusLoader(
-#     'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
-#     cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
 ppattach = LazyCorpusLoader(
     'ppattach', PPAttachmentCorpusReader, ['training', 'test', 'devset'])
 product_reviews_1 = LazyCorpusLoader(
@@ -253,8 +241,7 @@
     'wordnet_ic', WordNetICCorpusReader, '.*\.dat')
 words = LazyCorpusLoader(
     'words', WordListCorpusReader, r'(?!README|\.).*', encoding='ascii')
-# ycoe = LazyCorpusLoader(
-#     'ycoe', YCOECorpusReader)
+
 # defined after treebank
 propbank = LazyCorpusLoader(
     'propbank', PropbankCorpusReader,
@@ -282,13 +269,35 @@
 
 nonbreaking_prefixes = LazyCorpusLoader(
     'nonbreaking_prefixes', NonbreakingPrefixesCorpusReader, r'(?!README|\.).*', encoding='utf8')
-
 perluniprops = LazyCorpusLoader(
     'perluniprops', UnicharsCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')
 
 # mwa_ppdb = LazyCorpusLoader(
 #     'mwa_ppdb', MWAPPDBCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')
 
+# See https://github.com/nltk/nltk/issues/1579
+# and https://github.com/nltk/nltk/issues/1716
+#
+# pl196x = LazyCorpusLoader(
+#     'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
+#     cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
+#
+# ipipan = LazyCorpusLoader(
+#     'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')
+#
+# nkjp = LazyCorpusLoader(
+#     'nkjp', NKJPCorpusReader, r'', encoding='utf8')
+#
+#panlex_lite = LazyCorpusLoader(
+#    'panlex_lite', PanLexLiteCorpusReader)
+#
+# ycoe = LazyCorpusLoader(
+#     'ycoe', YCOECorpusReader)
+#
+# corpus not available with NLTK; these lines caused help(nltk.corpus) to break
+#hebrew_treebank = LazyCorpusLoader(
+#    'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')
+
 
 def demo():
     # This is out-of-date: