Finish implementation of Table.from_json. Closes #344.

wireservice · Oct 31, 2015 · 708d0c8 · 708d0c8
2 parents acb7406 + b8720fe
commit 708d0c8
Show file tree

Hide file tree

Showing 8 changed files with 251 additions and 30 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -3,6 +3,7 @@
 
 Version 1.1.0 introduces one major breaking API change. The Table, Table.from_csv and TableSet.from_csv methods now all take "column_names" and "column_types" as separate arguments instead of as a list of tuples. This was done to facilitate type inference and to streamline the API in various other places.
 
+* Table.from_json is implemented. (#344, #347)
 * Date and DateTime type testing now takes specified format into account. (#361)
 * Number column now takes a float_precision argument.
 * Number columns can now be populated with float values. (#370)

diff --git a/agate/table.py b/agate/table.py
@@ -25,6 +25,11 @@
 from itertools import chain
 import sys
 
+if sys.version_info < (2, 7):
+    import simplejson as json
+else:
+    import json
+
 try:
     from collections import OrderedDict
 except ImportError: # pragma: no cover
@@ -47,7 +52,7 @@
 from agate.mapped_sequence import MappedSequence
 from agate.preview import print_table, print_bars
 from agate.rows import Row
-from agate.utils import NullOrder, Patchable, max_precision, make_number_formatter, round_limits, letter_name
+from agate.utils import NullOrder, Patchable, max_precision, make_number_formatter, round_limits, letter_name, parse_object
 
 if six.PY2:
     from agate import csv_py2 as csv
@@ -179,6 +184,44 @@ def __init__(self, rows, column_names=None, column_types=None, row_names=None, _
 
         self._columns = MappedSequence(new_columns, self._column_names)
 
+
+    @property
+    def column_types(self):
+        """
+        Get an ordered sequence of this table's column types.
+
+        Exposed as a property, so it is read as ``table.column_types``
+        without call parentheses.
+
+        :returns: A sequence of :class:`.DataType` instances.
+        """
+        return self._column_types
+
+    @property
+    def column_names(self):
+        """
+        Get an ordered sequence of this table's column names.
+
+        :returns: The value stored in ``self._column_names``, which
+            ``__init__`` also passes to the :class:`.MappedSequence` of
+            columns.
+        """
+        return self._column_names
+
+    @property
+    def row_names(self):
+        """
+        Get an ordered sequence of this table's row names.
+
+        :returns: The value stored in ``self._row_names``.
+        """
+        # NOTE(review): presumably ``None`` when the table was created without
+        # row names -- confirm against ``__init__``.
+        return self._row_names
+
+    @property
+    def columns(self):
+        """
+        Get this table's :class:`.MappedSequence` of columns.
+
+        The sequence is built in ``__init__`` from the column objects and the
+        table's column names.
+        """
+        return self._columns
+
+    @property
+    def rows(self):
+        """
+        Get this table's :class:`.MappedSequence` of rows.
+        """
+        return self._rows
+
     def _fork(self, rows, column_names=None, column_types=None, row_names=None):
         """
         Create a new table using the metadata from this one.
@@ -262,42 +305,91 @@ def to_csv(self, path, **kwargs):
             if close:
                 f.close()
 
-    @property
-    def column_types(self):
+    @classmethod
+    def from_json(cls, path, row_names=None, key=None, **kwargs):
         """
-        Get an ordered sequence of this table's column types.
+        Create a new table from a JSON file. Contents should be an array
+        containing a dictionary for each "row". Nested objects or lists will
+        also be parsed. For example, this object:
 
-        :returns: A sequence of :class:`.DataType` instances.
-        """
-        return self._column_types
+        .. code-block:: javascript
 
-    @property
-    def column_names(self):
-        """
-        Get an ordered sequence of this table's column names.
-        """
-        return self._column_names
+            {
+                'one': {
+                    'a': 1,
+                    'b': 2,
+                    'c': 3
+                },
+                'two': [4, 5, 6],
+                'three': 'd'
+            }
 
-    @property
-    def row_names(self):
-        """
-        Get an ordered sequence of this table's row names.
-        """
-        return self._row_names
+        Would generate these columns and values:
 
-    @property
-    def columns(self):
-        """
-        Get this tables' :class:`.MappedSequence` of columns.
-        """
-        return self._columns
+        .. code-block:: python
 
-    @property
-    def rows(self):
-        """
-        Get this tables' :class:`.MappedSequence` of rows.
+            {
+                'one/a': 1,
+                'one/b': 2,
+                'one/c': 3,
+                'two/0': 4,
+                'two/1': 5,
+                'two/2': 6,
+                'three': 'd'
+            }
+
+        Column names and types will be inferred from the data. Not all rows are
+        required to have the same keys. Missing elements will be filled in with
+        null. Columns appear in the order in which their names are first
+        encountered.
+
+        If the file contains a top-level dictionary you may specify what
+        property contains the row list using the ``key`` parameter.
+
+        ``kwargs`` will be passed through to :func:`json.load`.
+
+        :param path:
+            Filepath or file-like object from which to read JSON data.
+        :param row_names:
+            See :meth:`Table.__init__`.
+        :param key:
+            The key of the top-level dictionary that contains the list of row
+            objects. Ignored when the top-level element is not a dictionary.
+        :returns:
+            A new instance of the class this method was called on.
+        :raises TypeError:
+            If the top-level element is a dictionary and no ``key`` was given.
         """
-        return self._rows
+        if hasattr(path, 'read'):
+            js = json.load(path, object_pairs_hook=OrderedDict, **kwargs)
+        else:
+            with open(path, 'r') as f:
+                js = json.load(f, object_pairs_hook=OrderedDict, **kwargs)
+
+        if isinstance(js, dict):
+            # Compare against None so that a falsy but valid key such as ''
+            # can still be used to select the row list.
+            if key is None:
+                raise TypeError('When converting a JSON document with a top-level dictionary element, a key must be specified.')
+
+            js = js[key]
+
+        # Flatten every row first: the full set of columns is only known once
+        # all rows have been seen, because rows may have differing keys.
+        row_objects = [parse_object(obj) for obj in js]
+
+        # An OrderedDict doubles as an ordered set here. It keeps first-seen
+        # column order while making the duplicate check O(1) instead of
+        # scanning a list for every key of every row.
+        seen_names = OrderedDict()
+
+        for parsed in row_objects:
+            for name in parsed:
+                seen_names.setdefault(name, None)
+
+        column_names = list(seen_names)
+
+        # Keys a row does not have become None (null) in that row.
+        rows = [
+            [parsed.get(name) for name in column_names]
+            for parsed in row_objects
+        ]
+
+        # Use cls rather than Table so subclasses get instances of themselves.
+        return cls(rows, column_names, row_names=row_names)
 
     @allow_tableset_proxy
     def select(self, selected_names):

diff --git a/agate/utils.py b/agate/utils.py
@@ -9,11 +9,18 @@
 from functools import wraps
 import string
 
+try:
+    from collections import OrderedDict
+except ImportError: # pragma: no cover
+    from ordereddict import OrderedDict
+
 try:
     from cdecimal import Decimal, ROUND_FLOOR, ROUND_CEILING
 except ImportError: #pragma: no cover
     from decimal import Decimal, ROUND_FLOOR, ROUND_CEILING
 
+import six
+
 def memoize(func):
     """
     Dead-simple memoize decorator for instance methods that take no arguments.
@@ -205,3 +212,23 @@ def letter_name(index):
     count = len(letters)
 
     return letters[index % count] * ((index // count) + 1)
+
+def parse_object(obj, path=''):
+    """
+    Recursively flatten a JSON-like object into a dictionary of paths and
+    values. Inspired by JSONPipe (https://github.com/dvxhouse/jsonpipe).
+
+    Dictionaries contribute their keys and lists/tuples their indices as path
+    components, joined with ``/``. Any other value is treated as a leaf.
+    Empty containers contribute no entries.
+
+    :param obj:
+        The value to flatten.
+    :param path:
+        Path prefix accumulated so far. Recursive calls pass it with a
+        trailing ``/``; the default is the empty prefix.
+    :returns:
+        For a container, an :class:`OrderedDict` mapping each leaf's path to
+        its value in traversal order. For a leaf, a one-item dictionary
+        mapping the path to ``obj``.
+    """
+    if isinstance(obj, dict):
+        children = obj.items()
+    elif isinstance(obj, (list, tuple)):
+        children = enumerate(obj)
+    else:
+        # Drop only the single separator appended by the recursive call below.
+        # str.strip('/') would also remove slashes that are part of a key,
+        # letting keys such as '/a' and 'a' collapse into the same path.
+        if path.endswith('/'):
+            path = path[:-1]
+
+        return {path: obj}
+
+    flattened = OrderedDict()
+
+    for key, value in children:
+        # List indices are integers; normalise every component to text.
+        key = six.text_type(key)
+        flattened.update(parse_object(value, path + key + '/'))
+
+    return flattened
diff --git a/examples/test.json b/examples/test.json
@@ -0,0 +1,17 @@
+[
+    {
+        "one": 1,
+        "two": 4,
+        "three": "a"
+    },
+    {
+        "one": 2,
+        "two": 3,
+        "three": "b"
+    },
+    {
+        "one": null,
+        "two": 2,
+        "three": "👍"
+    }
+]
diff --git a/requirements-py2.txt b/requirements-py2.txt
@@ -13,3 +13,4 @@ Babel>=2.0
 parsedatetime>=1.5
 pytz>=2015.4
 mock>=1.3.0
+simplejson>=2.1
diff --git a/setup.py b/setup.py
@@ -12,6 +12,7 @@
 
 if sys.version_info == (2, 6):
     install_requires.append('ordereddict>=1.1')
+    install_requires.append('simplejson>=2.1')
 
 setup(
     name='agate',

diff --git a/tests/test_table.py b/tests/test_table.py
@@ -175,6 +175,87 @@ def test_from_csv(self):
     def test_from_csv_columns_and_header(self):
         column_names = ['a', 'b', 'c']
 
+    def test_from_json(self):
+        # Load a flat JSON array by file path and check the table's shape,
+        # column names, inferred types and row values.
+        table = Table.from_json('examples/test.json')
+
+        expected_types = (Number, Number, Text)
+        expected_rows = (
+            [1, 4, 'a'],
+            [2, 3, 'b'],
+            [None, 2, u'👍'],
+        )
+
+        self.assertEqual(len(table.columns), len(expected_types))
+        self.assertEqual(len(table.rows), len(expected_rows))
+
+        self.assertSequenceEqual(table.column_names, ['one', 'two', 'three'])
+
+        for index, expected_type in enumerate(expected_types):
+            self.assertIsInstance(table.columns[index].data_type, expected_type)
+
+        for index, expected_row in enumerate(expected_rows):
+            self.assertSequenceEqual(table.rows[index], expected_row)
+
+    def test_from_json_file_like_object(self):
+        # Same expectations as loading by path, but from_json is handed an
+        # already-open file object instead of a filename.
+        with open('examples/test.json') as handle:
+            table = Table.from_json(handle)
+
+        expected_types = (Number, Number, Text)
+        expected_rows = (
+            [1, 4, 'a'],
+            [2, 3, 'b'],
+            [None, 2, u'👍'],
+        )
+
+        self.assertEqual(len(table.columns), len(expected_types))
+        self.assertEqual(len(table.rows), len(expected_rows))
+
+        self.assertSequenceEqual(table.column_names, ['one', 'two', 'three'])
+
+        for index, expected_type in enumerate(expected_types):
+            self.assertIsInstance(table.columns[index].data_type, expected_type)
+
+        for index, expected_row in enumerate(expected_rows):
+            self.assertSequenceEqual(table.rows[index], expected_row)
+
+    def test_from_json_with_key(self):
+        # The row list is selected from the document with key='data'.
+        table = Table.from_json('examples/test_key.json', key='data')
+
+        expected_types = (Number, Number, Text)
+        expected_rows = (
+            [1, 4, 'a'],
+            [2, 3, 'b'],
+            [None, 2, u'👍'],
+        )
+
+        self.assertEqual(len(table.columns), len(expected_types))
+        self.assertEqual(len(table.rows), len(expected_rows))
+
+        self.assertSequenceEqual(table.column_names, ['one', 'two', 'three'])
+
+        for index, expected_type in enumerate(expected_types):
+            self.assertIsInstance(table.columns[index].data_type, expected_type)
+
+        for index, expected_row in enumerate(expected_rows):
+            self.assertSequenceEqual(table.rows[index], expected_row)
+
+    def test_from_json_mixed_keys(self):
+        # Rows do not all share the same keys; values absent from a row are
+        # expected to come back as None.
+        table = Table.from_json('examples/test_mixed.json')
+
+        expected_types = (Number, Number, Text, Text, Number)
+        expected_rows = (
+            [1, 4, 'a', None, None],
+            [2, 3, 'b', 'd', None],
+            [None, 2, u'👍', None, 5],
+        )
+
+        self.assertEqual(len(table.columns), len(expected_types))
+        self.assertEqual(len(table.rows), len(expected_rows))
+
+        self.assertSequenceEqual(table.column_names, ['one', 'two', 'three', 'four', 'five'])
+
+        for index, expected_type in enumerate(expected_types):
+            self.assertIsInstance(table.columns[index].data_type, expected_type)
+
+        for index, expected_row in enumerate(expected_rows):
+            self.assertSequenceEqual(table.rows[index], expected_row)
+
+    def test_from_json_nested(self):
+        # Nested objects and lists are flattened into '/'-separated column
+        # names such as 'two/two_a' and 'three/0'.
+        table = Table.from_json('examples/test_nested.json')
+
+        expected_types = (Number, Text, Text, Text, Number, Text)
+        expected_rows = (
+            [1, 'a', 'b', 'a', 2, 'c'],
+            [2, 'c', 'd', 'd', 2, 'f'],
+        )
+
+        self.assertEqual(len(table.columns), len(expected_types))
+        self.assertEqual(len(table.rows), len(expected_rows))
+
+        self.assertSequenceEqual(table.column_names, ['one', 'two/two_a', 'two/two_b', 'three/0', 'three/1', 'three/2'])
+
+        for index, expected_type in enumerate(expected_types):
+            self.assertIsInstance(table.columns[index].data_type, expected_type)
+
+        for index, expected_row in enumerate(expected_rows):
+            self.assertSequenceEqual(table.rows[index], expected_row)
+
+    def test_from_csv_file_like_object(self):
         table = Table.from_csv('examples/test.csv', column_names, self.column_types)
 
         self.assertEqual(len(table.columns), 3)

diff --git a/tox.ini b/tox.ini
@@ -18,6 +18,7 @@ deps=
     {[testenv:py27]deps}
     unittest2==0.5.1
     ordereddict>=1.1
+    simplejson>=2.1
 
 [testenv:py33]
 deps=