From 09fa3fc4e8aded574fcb5299d4c08abd5eca83d6 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 28 Aug 2015 18:00:56 -0400 Subject: [PATCH 1/6] Remove unused imports --- datashape/util.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/datashape/util.py b/datashape/util.py index 4fa9ef5..9040bbe 100644 --- a/datashape/util.py +++ b/datashape/util.py @@ -2,8 +2,6 @@ from __future__ import print_function, division, absolute_import import operator -import ctypes -import sys from . import py2help from . import parser From 1345590c69b2ad74555cd3febcc3a93a3dd85c09 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 28 Aug 2015 18:01:05 -0400 Subject: [PATCH 2/6] Test register_codec --- datashape/tests/test_util.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/datashape/tests/test_util.py b/datashape/tests/test_util.py index 4c75a20..d987ceb 100644 --- a/datashape/tests/test_util.py +++ b/datashape/tests/test_util.py @@ -2,6 +2,7 @@ import datashape from datashape import dshape, has_var_dim, has_ellipsis +from datashape import register_encoding def test_cat_dshapes(): @@ -70,3 +71,14 @@ def test_has_ellipsis(ds): (dshape("M * int32"),)]) def test_not_has_ellipsis(ds): assert not has_ellipsis(ds) + + +def test_register_codec(): + with pytest.raises(ValueError): + assert dshape("string['utf8mb4']").measure.encoding == 'utf8mb4' + + register_encoding('utf8mb4') + assert dshape("string['utf8mb4']").measure.encoding == 'utf8mb4' + + with pytest.raises(ValueError): + register_encoding('utf8mb4') From 0821e0ed70867e208a29b66862053dd1a4c305d1 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 28 Aug 2015 18:01:30 -0400 Subject: [PATCH 3/6] Implement simple encoding registry --- datashape/internal_utils.py | 42 +++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/datashape/internal_utils.py b/datashape/internal_utils.py index 8513ced..e400839 100644 --- a/datashape/internal_utils.py +++ b/datashape/internal_utils.py @@ -8,6 +8,7 @@ import keyword import re +import codecs class IndexCallable(object): @@ -121,3 +122,44 @@ def groupby(func, seq): def isidentifier(s): return (keyword.iskeyword(s) or re.match(r'^[_a-zA-Z][_a-zA-Z0-9]*$', s) is not None) + + +_canonical_string_encodings = {} + + +def register_encoding(encoding, canonical_name=None): + """Register an encoding with datashape. + + Parameters + ---------- + encoding : str + The name of the encoding + canonical_name : str, optional + The canonical name of the encoding. Defaults to `name`. + """ + try: + canonical_name = codecs.lookup(encoding).name + except LookupError: + pass + else: + raise ValueError('encoding %r already registered and maps to %r' % + (encoding, canonical_name)) + if encoding in _canonical_string_encodings: + raise ValueError('encoding %r already registered and maps to %r' % + (encoding, _canonical_string_encodings[encoding])) + return _canonical_string_encodings.setdefault(encoding, + canonical_name or encoding) + + +def canonical_name(encoding): + try: + return codecs.lookup(encoding).name + except LookupError: + if encoding not in _canonical_string_encodings: + raise ValueError('Invalid encoding %r. You can register the ' + 'encoding with datashape.register_encoding(%r)' % + (encoding, encoding)) + return _canonical_string_encodings[encoding] + + +register_encoding('A', 'ascii') From fbb20e4c50c0952f3cc0b22ea4ca86c18798aab4 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 28 Aug 2015 18:01:40 -0400 Subject: [PATCH 4/6] Use the registry in the String type --- datashape/coretypes.py | 55 ++++++++++++------------------------------ 1 file changed, 15 insertions(+), 40 deletions(-) diff --git a/datashape/coretypes.py b/datashape/coretypes.py index 80feae1..5ce346d 100644 --- a/datashape/coretypes.py +++ b/datashape/coretypes.py @@ -9,12 +9,13 @@ import ctypes import operator + from math import ceil import numpy as np from .py2help import _inttypes, _strtypes, unicode, OrderedDict -from .internal_utils import IndexCallable, isidentifier +from .internal_utils import IndexCallable, isidentifier, canonical_name # Classes of unit types. @@ -311,31 +312,13 @@ def __str__(self): return 'bytes' -_canonical_string_encodings = { - u'A' : u'A', - u'ascii' : u'A', - u'U8' : u'U8', - u'utf-8' : u'U8', - u'utf_8' : u'U8', - u'utf8' : u'U8', - u'U16' : u'U16', - u'utf-16' : u'U16', - u'utf_16' : u'U16', - u'utf16' : u'U16', - u'U32' : u'U32', - u'utf-32' : u'U32', - u'utf_32' : u'U32', - u'utf32' : u'U32' -} - - class String(Unit): """ String container >>> String() ctype("string") >>> String(10, 'ascii') - ctype("string[10, 'A']") + ctype("string[10, 'ascii']") """ cls = MEASURE __slots__ = 'fixlen', 'encoding' @@ -351,30 +334,24 @@ def __init__(self, *args): if len(args) == 2: fixlen, encoding = args - encoding = encoding or 'U8' + encoding = encoding or u'utf8' if isinstance(encoding, str): encoding = unicode(encoding) - try: - encoding = _canonical_string_encodings[encoding] - except KeyError: - raise ValueError('Unsupported string encoding %s' % - repr(encoding)) - self.encoding = encoding + self.encoding = canonical_name(encoding) self.fixlen = fixlen - # Put it in a canonical form - def __str__(self): - if self.fixlen is None and self.encoding == 'U8': + utf8 = canonical_name('utf8') + if self.fixlen is None and self.encoding == utf8: return 'string' - elif self.fixlen is not None and self.encoding == 'U8': + elif self.fixlen is not None and self.encoding == utf8: return 'string[%i]' % self.fixlen - elif self.fixlen is None and self.encoding != 'U8': - return 'string[%s]' % repr(self.encoding).strip('u') + elif self.fixlen is None and self.encoding != utf8: + return 'string[%s]' % repr(self.encoding).lstrip('u') else: return 'string[%i, %s]' % (self.fixlen, - repr(self.encoding).strip('u')) + repr(self.encoding).lstrip('u')) def __repr__(self): s = str(self) @@ -391,10 +368,8 @@ def to_numpy_dtype(self): dtype('S30') """ if self.fixlen: - if self.encoding == 'A': - return np.dtype('S%d' % self.fixlen) - else: - return np.dtype('U%d' % self.fixlen) + prefix = 'S' if self.encoding == 'ascii' else 'U' + return np.dtype('%s%d' % (prefix, self.fixlen)) from .py2help import unicode # Create a dtype with metadata indicating it's @@ -669,7 +644,7 @@ def from_numpy_dtype(self, dt): >>> CType.from_numpy_dtype(dtype('M8')) DateTime(None) >>> CType.from_numpy_dtype(dtype('U30')) - ctype("string[30, 'U32']") + ctype("string[30, 'utf-32']") """ try: return Type.lookup_type(dt.name) @@ -1116,7 +1091,7 @@ def from_numpy(shape, dt): dshape("5 * 5 * int32") >>> from_numpy((10,), dtype('S10')) - dshape("10 * string[10, 'A']") + dshape("10 * string[10, 'ascii']") """ dtype = np.dtype(dt) From 9df17f4ad04c676a55b137ab99fdf465520f8063 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 28 Aug 2015 18:01:50 -0400 Subject: [PATCH 5/6] Export the function --- datashape/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/datashape/__init__.py b/datashape/__init__.py index e78a0ef..69aa440 100644 --- a/datashape/__init__.py +++ b/datashape/__init__.py @@ -8,6 +8,7 @@ from .type_symbol_table import * from .discovery import discover from .util import * +from .internal_utils import register_encoding from .promote import promote, optionify from .error import DataShapeSyntaxError From b5c8788035e8d96ee32ab8067cb75df4f0e60ced Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 28 Aug 2015 18:02:14 -0400 Subject: [PATCH 6/6] Update existing tests --- datashape/tests/test_coretypes.py | 2 +- datashape/tests/test_creation.py | 18 +++++++++--------- datashape/tests/test_str.py | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/datashape/tests/test_coretypes.py b/datashape/tests/test_coretypes.py index 67dd083..77f4a3c 100644 --- a/datashape/tests/test_coretypes.py +++ b/datashape/tests/test_coretypes.py @@ -123,7 +123,7 @@ def test_ascii_string(self): def test_string(self): assert (from_numpy((2,), np.dtype('U7')) == - dshape('2 * string[7, "U32"]')) + dshape('2 * string[7, "utf32"]')) def test_string_from_CType_classmethod(self): assert CType.from_numpy_dtype(np.dtype('S7')) == String(7, 'A') diff --git a/datashape/tests/test_creation.py b/datashape/tests/test_creation.py index ea2ae2c..ab06e60 100644 --- a/datashape/tests/test_creation.py +++ b/datashape/tests/test_creation.py @@ -6,7 +6,7 @@ import pytest import datashape -from datashape import dshape, error, DataShape, Record +from datashape import dshape, error, DataShape, Record, register_encoding class TestDataShapeCreation(unittest.TestCase): @@ -70,14 +70,14 @@ def test_type_decl_concrete(self): def test_string_atom(self): self.assertEqual(dshape('string'), dshape("string['U8']")) - self.assertEqual(dshape("string['ascii']")[0].encoding, 'A') - self.assertEqual(dshape("string['A']")[0].encoding, 'A') - self.assertEqual(dshape("string['utf-8']")[0].encoding, 'U8') - self.assertEqual(dshape("string['U8']")[0].encoding, 'U8') - self.assertEqual(dshape("string['utf-16']")[0].encoding, 'U16') - self.assertEqual(dshape("string['U16']")[0].encoding, 'U16') - self.assertEqual(dshape("string['utf-32']")[0].encoding, 'U32') - self.assertEqual(dshape("string['U32']")[0].encoding, 'U32') + self.assertEqual(dshape("string['ascii']")[0].encoding, 'ascii') + self.assertEqual(dshape("string['A']")[0].encoding, 'ascii') + self.assertEqual(dshape("string['utf-8']")[0].encoding, 'utf-8') + self.assertEqual(dshape("string['U8']")[0].encoding, 'utf-8') + self.assertEqual(dshape("string['utf-16']")[0].encoding, 'utf-16') + self.assertEqual(dshape("string['U16']")[0].encoding, 'utf-16') + self.assertEqual(dshape("string['utf-32']")[0].encoding, 'utf-32') + self.assertEqual(dshape("string['U32']")[0].encoding, 'utf-32') def test_time(self): self.assertEqual(dshape('time')[0].tz, None) diff --git a/datashape/tests/test_str.py b/datashape/tests/test_str.py index 79613fd..bd17615 100644 --- a/datashape/tests/test_str.py +++ b/datashape/tests/test_str.py @@ -19,7 +19,7 @@ def test_primitive_measure_str(self): self.assertEqual(str(datashape.float64), 'float64') self.assertEqual(str(datashape.string), 'string') self.assertEqual(str(datashape.String(3)), 'string[3]') - self.assertEqual(str(datashape.String('A')), "string['A']") + self.assertEqual(str(datashape.String('A')), "string['ascii']") def test_structure_str(self): self.assertEqual(str(dshape('{x:int32, y:int64}')), @@ -43,7 +43,7 @@ def test_primitive_measure_repr(self): self.assertEqual(repr(datashape.string), 'ctype("string")') self.assertEqual(repr(datashape.String(3)), 'ctype("string[3]")') self.assertEqual(repr(datashape.String('A')), - """ctype("string['A']")""") + """ctype("string['ascii']")""") def test_structure_repr(self): self.assertEqual(repr(dshape('{x:int32, y:int64}')),