Skip to content

Commit

Permalink
Merge pull request nltk#599 from jskda/pull_request.594
Browse files: browse the repository at this point in the history
Pull request.594
  • Loading branch information
stevenbird committed Feb 7, 2014
2 parents 912f8e6 + 78c1311 commit 0c74bf4
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 64 deletions.
118 changes: 61 additions & 57 deletions nltk/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@
import io
import os
import textwrap
import weakref
import re
import zipfile
import codecs
Expand Down Expand Up @@ -109,49 +108,55 @@ def split_resource_url(resource_url):
Splits a resource url into "<protocol>:<path>".
>>> windows = sys.platform.startswith('win')
>>> windows or split_resource_url('file:///home/nltk') == ('file', '/home/nltk')
True
>>> not windows or split_resource_url('file:///C:/home/nltk') == ('file', 'C:/home/nltk')
True
>>> split_resource_url('nltk:home/nltk')
('nltk', 'home/nltk')
>>> split_resource_url('nltk:/home/nltk')
('nltk', '/home/nltk')
>>> split_resource_url('file:/home/nltk')
('file', '/home/nltk')
>>> split_resource_url('file:///home/nltk')
('file', '/home/nltk')
>>> split_resource_url('file:///C:/home/nltk')
('file', '/C:/home/nltk')
"""
protocol, path = resource_url.split(':', 1)
protocol, path_ = resource_url.split(':', 1)
if protocol == 'nltk':
pass
elif protocol == 'file':
path = path.lstrip('/')
if not sys.platform.startswith('win'):
path = '/' + path
if path_.startswith('/'):
path_ = '/' + path_.lstrip('/')
else:
path = re.sub(r'^/{0,2}', '', path)
return protocol, path
path_ = re.sub(r'^/{0,2}', '', path_)
return protocol, path_

def normalize_resource_url(resource_url):
"""
r"""
Normalizes a resource url
>>> windows = sys.platform.startswith('win')
>>> normalize_resource_url('file:C:/dir/file')
'file:///C:/dir/file'
>>> normalize_resource_url('file:C:\\\\dir\\\\file')
'file:///C:/dir/file'
>>> normalize_resource_url('file:C:\\\\dir/file')
'file:///C:/dir/file'
>>> normalize_resource_url('file://C:/dir/file')
'file:///C:/dir/file'
>>> normalize_resource_url('file:////C:/dir/file')
'file:///C:/dir/file'
>>> os.path.normpath(split_resource_url(normalize_resource_url('file:grammar.fcfg'))[1]) == \
... ('\\' if windows else '') + os.path.abspath(os.path.join(os.curdir, 'grammar.fcfg'))
True
>>> not windows or normalize_resource_url('file:C:/dir/file') == 'file:///C:/dir/file'
True
>>> not windows or normalize_resource_url('file:C:\\dir\\file') == 'file:///C:/dir/file'
True
>>> not windows or normalize_resource_url('file:C:\\dir/file') == 'file:///C:/dir/file'
True
>>> not windows or normalize_resource_url('file://C:/dir/file') == 'file:///C:/dir/file'
True
>>> not windows or normalize_resource_url('file:////C:/dir/file') == 'file:///C:/dir/file'
True
>>> not windows or normalize_resource_url('nltk:C:/dir/file') == 'file:///C:/dir/file'
True
>>> not windows or normalize_resource_url('nltk:C:\\\\dir\\\\file') == 'file:///C:/dir/file'
>>> not windows or normalize_resource_url('nltk:C:\\dir\\file') == 'file:///C:/dir/file'
True
>>> windows or normalize_resource_url('file:/dir/file/toy.cfg') == 'file:///dir/file/toy.cfg'
True
>>> normalize_resource_url('nltk:home/nltk')
'nltk:home/nltk'
>>> normalize_resource_url('nltk:/home/nltk')
'file:///home/nltk'
>>> windows or normalize_resource_url('nltk:/home/nltk') == 'file:///home/nltk'
True
>>> normalize_resource_url('http://example.com/dir/file')
'http://example.com/dir/file'
>>> normalize_resource_url('dir/file')
Expand All @@ -165,21 +170,21 @@ def normalize_resource_url(resource_url):
name = resource_url
# use file protocol if the path is an absolute path
if protocol == 'nltk' and os.path.isabs(name):
protocol = 'file'
if protocol == 'file':
protocol = 'file:///'
protocol = 'file://'
name = normalize_resource_name(name, False, None)
elif protocol == 'file':
protocol = 'file://'
# name is absolute
name = normalize_resource_name(name, False).lstrip('/')
name = normalize_resource_name(name, False, None)
elif protocol == 'nltk':
protocol = 'nltk:'
name = normalize_resource_name(name, False).lstrip('/')
name = normalize_resource_name(name, True)
else:
# handled by urllib
protocol += '://'
name = name.lstrip('/')
return ''.join([protocol,name])
return ''.join([protocol, name])

def normalize_resource_name(resource_name, allow_relative = True):
def normalize_resource_name(resource_name, allow_relative=True, relative_path=None):
"""
:type resource_name: str or unicode
:param resource_name: The name of the resource to search for.
Expand All @@ -193,30 +198,29 @@ def normalize_resource_name(resource_name, allow_relative = True):
'./'
>>> normalize_resource_name('./', True)
'./'
>>> windows or normalize_resource_name('dir/file', False) == '/dir/file'
True
>>> not windows or normalize_resource_name('C:/file', False) == 'C:/file'
>>> windows or normalize_resource_name('dir/file', False, '/') == '/dir/file'
True
>>> windows or normalize_resource_name('/dir/file', False) == '/dir/file'
>>> not windows or normalize_resource_name('C:/file', False, '/') == '/C:/file'
True
>>> windows or normalize_resource_name('../dir/file', False) == '/dir/file'
>>> windows or normalize_resource_name('/dir/file', False, '/') == '/dir/file'
True
>>> not windows or normalize_resource_name('/C:/file', False) == 'C:/file'
True
>>> not windows or normalize_resource_name('../C:/file', False) == 'C:/file'
>>> windows or normalize_resource_name('../dir/file', False, '/') == '/dir/file'
True
"""
is_dir = bool(re.search(r'[\\/]$',resource_name)) or resource_name.endswith(os.path.sep)
resource_name = os.path.normpath(resource_name).replace('\\','/').replace(os.path.sep,'/')
is_dir = bool(re.search(r'[\\/.]$', resource_name)) or resource_name.endswith(os.path.sep)
if sys.platform.startswith('win'):
resource_name = resource_name.lstrip('/')
else:
resource_name = re.sub(r'^/+', '/', resource_name)
if allow_relative:
if resource_name == '.':
is_dir = True
resource_name = os.path.normpath(resource_name)
else:
if resource_name[0] == '.':
resource_name = re.sub('^[./]+', '', resource_name)
resource_name = resource_name.lstrip('/')
if not sys.platform.startswith('win'):
resource_name = '/' + resource_name
if relative_path is None:
relative_path = os.curdir
resource_name = os.path.abspath(os.path.join(relative_path, resource_name))
resource_name = resource_name.replace('\\', '/').replace(os.path.sep, '/')
if sys.platform.startswith('win') and os.path.isabs(resource_name):
resource_name = '/' + resource_name
if is_dir and not resource_name.endswith('/'):
resource_name += '/'
return resource_name
Expand Down Expand Up @@ -446,7 +450,7 @@ def __init__(self, zipfile, entry=''):
zipfile = OpenOnDemandZipFile(os.path.abspath(zipfile))

# Normalize the entry string, it should be absolute:
entry = normalize_resource_name(entry, False).lstrip('/')
entry = normalize_resource_name(entry, False, '/').lstrip('/')

# Check that the entry exists:
if entry:
Expand Down Expand Up @@ -564,26 +568,26 @@ def find(resource_name, paths=None):
zipfile, zipentry = m.groups()

# Check each item in our path
for _path in paths:
for path_ in paths:
# Is the path item a zipfile?
if _path and (os.path.isfile(_path) and _path.endswith('.zip')):
if path_ and (os.path.isfile(path_) and path_.endswith('.zip')):
try:
return ZipFilePathPointer(_path, resource_name)
return ZipFilePathPointer(path_, resource_name)
except IOError:
# resource not in zipfile
continue

# Is the path item a directory or is resource_name an absolute path?
elif not _path or os.path.isdir(_path):
elif not path_ or os.path.isdir(path_):
if zipfile is None:
p = os.path.join(_path, resource_name)
p = os.path.join(path_, resource_name)
if os.path.exists(p):
if p.endswith('.gz'):
return GzipFileSystemPathPointer(p)
else:
return FileSystemPathPointer(p)
else:
p = os.path.join(_path, zipfile)
p = os.path.join(path_, zipfile)
if os.path.exists(p):
try:
return ZipFilePathPointer(p, zipentry)
Expand Down
10 changes: 3 additions & 7 deletions nltk/test/runtests.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,16 @@
import sys
import os
import nose
from nose.plugins.manager import PluginManager
from nose.plugins.doctests import Doctest
from nose.plugins import builtin

NLTK_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
sys.path.insert(0, NLTK_ROOT)

NLTK_TEST_DIR = os.path.join(NLTK_ROOT, 'nltk')


if __name__ == '__main__':
# XXX: imports can't be moved to the top of the file
# because nose loader raises an exception then. Why?
from nose.plugins.manager import PluginManager
from nose.plugins.doctests import Doctest
from nose.plugins import builtin

# there must not be any imports from NLTK here, so that coverage works properly
from doctest_nose_plugin import DoctestFix

Expand Down

0 comments on commit 0c74bf4

Please sign in to comment.