Skip to content

Commit

Permalink
Merge branch 'main' into fix-timezone
Browse files Browse the repository at this point in the history
  • Loading branch information
martindurant committed Dec 12, 2023
2 parents 85199bc + 9c71c10 commit 6d4d15f
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 47 deletions.
4 changes: 3 additions & 1 deletion fastparquet/parquet_thrift/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@
def __getattr__(name):
    """Resolve thrift struct names requested as module attributes.

    Kept for compatibility with code that calls, e.g.,
    ``parquet_thrift.RowGroup(...)``.

    Parameters
    ----------
    name : str
        Attribute name that was not found on the module by normal lookup.

    Returns
    -------
    functools.partial
        ``ThriftObject.from_fields`` with ``thrift_name`` bound to ``name``.

    Raises
    ------
    AttributeError
        If ``name`` does not start with an uppercase character.
    """
    # Only capitalised names are treated as thrift struct names. Anything
    # else (dunder probes such as ``__path__``, lowercase helpers, the empty
    # string) must fail like a regular missing attribute so that
    # ``hasattr``/``getattr(..., default)`` keep working. Slicing instead of
    # indexing avoids an IndexError on an empty name.
    if not name[:1].isupper():
        raise AttributeError(name)

    # Imported lazily, and only once the name has been accepted.
    from ..cencoding import ThriftObject

    return partial(ThriftObject.from_fields, thrift_name=name)
2 changes: 1 addition & 1 deletion fastparquet/test/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import fsspec
import numpy as np
import pandas as pd
from pandas._testing import makeMixedDataFrame
from .util import makeMixedDataFrame
try:
from pandas.tslib import Timestamp
except ImportError:
Expand Down
2 changes: 1 addition & 1 deletion fastparquet/test/test_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from fastparquet import ParquetFile
from fastparquet import write, parquet_thrift, update_file_custom_metadata
from fastparquet import writer, encoding
from pandas._testing import makeMixedDataFrame
from .util import makeMixedDataFrame
from pandas.testing import assert_frame_equal
from pandas.api.types import CategoricalDtype
import pytest
Expand Down
80 changes: 36 additions & 44 deletions fastparquet/test/test_pd_optional_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,68 +3,59 @@
import numpy as np
import pandas as pd
from pandas.testing import assert_frame_equal
from pandas.core.arrays import IntegerArray
import fastparquet as fp
from .util import tempdir
from fastparquet import write, parquet_thrift
from fastparquet.parquet_thrift.parquet import ttypes as tt
import numpy.random as random


EXPECTED_SERIES_INT8 = pd.Series(random.uniform(low=-128, high=127,size=100)).round()
EXPECTED_SERIES_INT16 = pd.Series(random.uniform(low=-32768, high=32767,size=100)).round()
EXPECTED_SERIES_INT32 = pd.Series(random.uniform(low=-2147483648, high=2147483647,size=100)).round()
EXPECTED_SERIES_INT64 = pd.Series(random.uniform(low=-9223372036854775808, high=9223372036854775807,size=100)).round()
EXPECTED_SERIES_UINT8 = pd.Series(random.uniform(low=0, high=255,size=100)).round()
EXPECTED_SERIES_UINT16 = pd.Series(random.uniform(low=0, high=65535,size=100)).round()
EXPECTED_SERIES_UINT32 = pd.Series(random.uniform(low=0, high=4294967295,size=100)).round()
EXPECTED_SERIES_UINT64 = pd.Series(random.uniform(low=0, high=18446744073709551615,size=100)).round()
EXPECTED_SERIES_BOOL = pd.Series(random.choice([False, True], 100))
EXPECTED_SERIES_STRING = pd.Series(random.choice([
EXPECTED_SERIES_INT8 = random.uniform(low=-128, high=127, size=100).round()
EXPECTED_SERIES_INT16 = random.uniform(low=-32768, high=32767, size=100).round()
EXPECTED_SERIES_INT32 = random.uniform(low=-2147483648, high=2147483647, size=100).round()
EXPECTED_SERIES_INT64 = random.uniform(low=-9223372036854775808, high=9223372036854775807, size=100).round()
EXPECTED_SERIES_UINT8 = random.uniform(low=0, high=255, size=100).round()
EXPECTED_SERIES_UINT16 = random.uniform(low=0, high=65535, size=100).round()
EXPECTED_SERIES_UINT32 = random.uniform(low=0, high=4294967295, size=100).round()
EXPECTED_SERIES_UINT64 = random.uniform(low=0, high=18446744073709551615, size=100).round()
EXPECTED_SERIES_BOOL = random.choice([False, True], 100)
EXPECTED_SERIES_STRING = random.choice([
'You', 'are', 'my', 'fire',
'The', 'one', 'desire',
'Believe', 'when', 'I', 'say',
'I', 'want', 'it', 'that', 'way'
], 100))
], 100)


EXPECTED_SERIES_INT8.loc[20:30] = np.nan
EXPECTED_SERIES_INT16.loc[20:30] = np.nan
EXPECTED_SERIES_INT32.loc[20:30] = np.nan
EXPECTED_SERIES_INT64.loc[20:30] = np.nan
EXPECTED_SERIES_UINT8.loc[20:30] = np.nan
EXPECTED_SERIES_UINT16.loc[20:30] = np.nan
EXPECTED_SERIES_UINT32.loc[20:30] = np.nan
EXPECTED_SERIES_UINT64.loc[20:30] = np.nan
EXPECTED_SERIES_BOOL.loc[20:30] = np.nan
EXPECTED_SERIES_STRING.loc[20:30] = np.nan
EXPECTED_SERIES_INT8[20:30] = np.nan
EXPECTED_SERIES_INT16[20:30] = np.nan
EXPECTED_SERIES_INT32[20:30] = np.nan
EXPECTED_SERIES_INT64[20:30] = np.nan
EXPECTED_SERIES_UINT8[20:30] = np.nan
EXPECTED_SERIES_UINT16[20:30] = np.nan
EXPECTED_SERIES_UINT32[20:30] = np.nan
EXPECTED_SERIES_UINT64[20:30] = np.nan
EXPECTED_SERIES_BOOL[20:30] = np.nan
EXPECTED_SERIES_STRING[20:30] = np.nan
mask = EXPECTED_SERIES_UINT64 > -1


TEST = pd.DataFrame({
'int8': EXPECTED_SERIES_INT8.astype('Int8'),
'int16': EXPECTED_SERIES_INT16.astype('Int16'),
'int32': EXPECTED_SERIES_INT32.astype('Int32'),
'int64': EXPECTED_SERIES_INT64.astype('Int64'),
'uint8': EXPECTED_SERIES_UINT8.astype('UInt8'),
'uint16': EXPECTED_SERIES_UINT16.astype('UInt16'),
'uint32': EXPECTED_SERIES_UINT32.astype('UInt32'),
'uint64': EXPECTED_SERIES_UINT64.astype('UInt64'),
'bool': EXPECTED_SERIES_BOOL.astype('boolean'),
'string': EXPECTED_SERIES_STRING.astype('string')
'int8': pd.Series(pd.array(EXPECTED_SERIES_INT8, dtype='Int8')),
'int16': pd.Series(pd.array(EXPECTED_SERIES_INT16, dtype='Int16')),
'int32': pd.Series(pd.array(EXPECTED_SERIES_INT32, dtype='Int32')),
'int64': pd.Series(pd.array(EXPECTED_SERIES_INT64, dtype='Int64')),
'uint8': pd.Series(pd.array(EXPECTED_SERIES_UINT8, dtype='UInt8')),
'uint16': pd.Series(pd.array(EXPECTED_SERIES_UINT16, dtype='UInt16')),
'uint32': pd.Series(pd.array(EXPECTED_SERIES_UINT32, dtype='UInt32')),
'uint64': pd.Series(pd.array(EXPECTED_SERIES_UINT64, dtype='UInt64')),
'bool': pd.Series(pd.array(EXPECTED_SERIES_BOOL, dtype='boolean')),
'string': pd.Series(EXPECTED_SERIES_STRING, dtype='string')
})


EXPECTED = pd.DataFrame({
'int8': EXPECTED_SERIES_INT8.astype('float16'),
'int16': EXPECTED_SERIES_INT16.astype('float32'),
'int32': EXPECTED_SERIES_INT32.astype('float64'),
'int64': EXPECTED_SERIES_INT64.astype('float64'),
'uint8': EXPECTED_SERIES_UINT8.astype('float16'),
'uint16': EXPECTED_SERIES_UINT16.astype('float32'),
'uint32': EXPECTED_SERIES_UINT32.astype('float64'),
'uint64': EXPECTED_SERIES_UINT64.astype('float64'),
'bool': EXPECTED_SERIES_BOOL.astype('float16'),
'string': EXPECTED_SERIES_STRING
})
EXPECTED = TEST


EXPECTED_PARQUET_TYPES = {
Expand All @@ -80,7 +71,8 @@
'string': 'BYTE_ARRAY'
}

@pytest.mark.parametrize('comp', (None,'snappy', 'gzip'))

@pytest.mark.parametrize('comp', (None, 'snappy', 'gzip'))
@pytest.mark.parametrize('scheme', ('simple', 'hive'))
def test_write_nullable_columns(tempdir, scheme, comp):
fname = os.path.join(tempdir, 'test_write_nullable_columns.parquet')
Expand Down
17 changes: 17 additions & 0 deletions fastparquet/test/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import tempfile
import shutil

import pandas as pd

TEST_DATA = "test-data"

port = 5555
Expand Down Expand Up @@ -76,3 +78,18 @@ def tempdir():
yield d
if os.path.exists(d):
shutil.rmtree(d, ignore_errors=True)



def makeMixedDataFrame():
    """Build a small five-row DataFrame with mixed column types for tests.

    Local helper used by the test modules in place of the function of the
    same name that they previously imported from ``pandas._testing``.

    Returns
    -------
    pandas.DataFrame
        Frame with the default integer index and four columns:

        - ``"A"``: float64 values 0.0 through 4.0
        - ``"B"``: float64 values alternating 0.0 and 1.0
        - ``"C"``: object-dtype strings ``"foo1"`` through ``"foo5"``
        - ``"D"``: five consecutive business days starting 2009-01-01
    """
    data = {
        "A": pd.Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float64"),
        "B": pd.Series([0.0, 1.0, 0.0, 1.0, 0.0], dtype="float64"),
        "C": pd.Series(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype='object'),
        "D": pd.bdate_range("1/1/2009", periods=5),
    }
    return pd.DataFrame(data=data)


0 comments on commit 6d4d15f

Please sign in to comment.