Commented out LazyCorpusLoader entries for corpora not distributed with NLTK

pblh123 · May 12, 2017 · 00da7c9 · 00da7c9
1 parent 659600f
commit 00da7c9
Showing 1 changed file with 25 additions and 16 deletions.
diff --git a/nltk/corpus/__init__.py b/nltk/corpus/__init__.py
@@ -120,9 +120,6 @@
         ('.*', 'utf_8')])
 gutenberg = LazyCorpusLoader(
     'gutenberg', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin1')
-# corpus not available with NLTK; these lines caused help(nltk.corpus) to break
-#hebrew_treebank = LazyCorpusLoader(
-#    'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')
 ieer = LazyCorpusLoader(
     'ieer', IEERCorpusReader, r'(?!README|\.).*')
 inaugural = LazyCorpusLoader(
@@ -131,8 +128,7 @@
 indian = LazyCorpusLoader(
     'indian', IndianCorpusReader, r'(?!\.).*\.pos',
     tagset='unknown', encoding='utf8')
-# ipipan = LazyCorpusLoader(
-#     'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')
+
 jeita = LazyCorpusLoader(
     'jeita', ChasenCorpusReader, r'.*\.chasen', encoding='utf-8')
 knbc = LazyCorpusLoader(
@@ -156,19 +152,11 @@
     'mte_teip5', MTECorpusReader, r'(oana).*\.xml', encoding="utf-8")
 names = LazyCorpusLoader(
     'names', WordListCorpusReader, r'(?!\.).*\.txt', encoding='ascii')
-# nkjp = LazyCorpusLoader(
-#     'nkjp', NKJPCorpusReader, r'', encoding='utf8')
 nps_chat = LazyCorpusLoader(
     'nps_chat', NPSChatCorpusReader, r'(?!README|\.).*\.xml', tagset='wsj')
 opinion_lexicon = LazyCorpusLoader(
     'opinion_lexicon', OpinionLexiconCorpusReader, r'(\w+)\-words\.txt',
     encoding='ISO-8859-2')
-panlex_lite = LazyCorpusLoader(
-    'panlex_lite', PanLexLiteCorpusReader)
-# [SB] this breaks in python35 https://github.com/nltk/nltk/issues/1579
-# pl196x = LazyCorpusLoader(
-#     'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
-#     cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
 ppattach = LazyCorpusLoader(
     'ppattach', PPAttachmentCorpusReader, ['training', 'test', 'devset'])
 product_reviews_1 = LazyCorpusLoader(
@@ -253,8 +241,7 @@
     'wordnet_ic', WordNetICCorpusReader, '.*\.dat')
 words = LazyCorpusLoader(
     'words', WordListCorpusReader, r'(?!README|\.).*', encoding='ascii')
-# ycoe = LazyCorpusLoader(
-#     'ycoe', YCOECorpusReader)
+
 # defined after treebank
 propbank = LazyCorpusLoader(
     'propbank', PropbankCorpusReader,
@@ -282,13 +269,35 @@
 
 nonbreaking_prefixes = LazyCorpusLoader(
     'nonbreaking_prefixes', NonbreakingPrefixesCorpusReader, r'(?!README|\.).*', encoding='utf8')
-
 perluniprops = LazyCorpusLoader(
     'perluniprops', UnicharsCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')
 
 # mwa_ppdb = LazyCorpusLoader(
 #     'mwa_ppdb', MWAPPDBCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')
 
+# The corpus loaders below are commented out; for background see
+# https://github.com/nltk/nltk/issues/1579 and https://github.com/nltk/nltk/issues/1716
+#
+# pl196x = LazyCorpusLoader(
+#     'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
+#     cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
+#
+# ipipan = LazyCorpusLoader(
+#     'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')
+#
+# nkjp = LazyCorpusLoader(
+#     'nkjp', NKJPCorpusReader, r'', encoding='utf8')
+#
+#panlex_lite = LazyCorpusLoader(
+#    'panlex_lite', PanLexLiteCorpusReader)
+#
+# ycoe = LazyCorpusLoader(
+#     'ycoe', YCOECorpusReader)
+#
+# hebrew_treebank is not available with NLTK; these lines caused help(nltk.corpus) to break
+#hebrew_treebank = LazyCorpusLoader(
+#    'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')
+
 
 def demo():
     # This is out-of-date: