diff --git a/README.rst b/README.rst index b686f58..ab04888 100644 --- a/README.rst +++ b/README.rst @@ -113,3 +113,19 @@ python types, such as with ``as_bytes()`` File "<stdin>", line 1, in <module> simd.SimdError: start: '40', is out of bounds for vector of size 16 +The data inside a vector can also be retrieved as a collection type, such as a ``tuple``: + +.. code:: py + + >>> a = simd.Vec(size=32, repeat_value=5, repeat_size=4) + >>> a + [5,0,0,0,5,0,0,0,5,0,0,0,5,0,0,0,5,0,0,0,5,0,0,0,5,0,0,0,5,0,0,0] + >>> a.as_tuple(type=int, width=4) + (5, 5, 5, 5, 5, 5, 5, 5) + >>> a.as_tuple(type=int, width=1) + (5, 0, 0, 0, 5, 0, 0, 0, 5, 0, 0, 0, 5, 0, 0, 0, 5, 0, 0, 0, 5, 0, 0, 0, 5, 0, 0, 0, 5, 0, 0, 0) + >>> a.as_tuple(type=int, width=8) + (21474836485, 21474836485, 21474836485, 21474836485) + +The example above shows that the plain ``__repr__`` of ``Vec`` only depicts a hexadecimal, byte-level representation of the vector data, while a method like ``as_tuple`` allows the same data to be viewed as different types and widths. One unique aspect of the ``simd`` module is that it treats data and memory much like C does: a chunk of 16 bytes could be two 64-bit integers, four 32-bit integers, and so on. 
+ diff --git a/include/simd_vec_filter.h b/include/simd_vec_filter.h new file mode 100644 index 0000000..ed03e37 --- /dev/null +++ b/include/simd_vec_filter.h @@ -0,0 +1,121 @@ +#ifndef PYSIMD_VEC_FILTER_H +#define PYSIMD_VEC_FILTER_H + +#include "simd_vec_type.h" +#include "vec_macros.h" + /* pysimd_vec_filter_32: filters the data of vec, viewed as 32-bit lanes, with up to three optional signed comparisons: lane greater than *gt, lane less than *lt and lane equal to *eq. A NULL pointer skips that comparison; the enabled ones are ANDed together. Always returns 1. NOTE(review): only the SSE2 branch consults gt/lt/eq, the fallback branch drops zero bytes instead; confirm which behaviour is intended. */ +static int pysimd_vec_filter_32(struct pysimd_vec_t* vec, int* gt, + int* lt, + int* eq) { +#if defined(PYSIMD_X86_SSE2) + // pass + unsigned char* ptr = vec->data; + const unsigned char* ptr_end = ptr + vec->size; + while (ptr < ptr_end) { + __m128i loaded = _mm_load_si128((__m128i const*)ptr); + __m128i mask = _mm_set1_epi8(0xff); /* start with every lane accepted; each enabled comparison narrows the mask */ + if (gt != NULL) { + __m128i gtnum = _mm_set1_epi32(*gt); + __m128i gtres = _mm_cmpgt_epi32 (loaded, gtnum); + mask = _mm_and_si128(mask, gtres); + } + if (lt != NULL) { + __m128i ltnum = _mm_set1_epi32(*lt); + __m128i ltres = _mm_cmplt_epi32 (loaded, ltnum); + mask = _mm_and_si128(mask, ltres); + } + if (eq != NULL) { + __m128i eqnum = _mm_set1_epi32(*eq); + __m128i eqres = _mm_cmpeq_epi32 (loaded, eqnum); + mask = _mm_and_si128(mask, eqres); + } + __m128i final_result = _mm_and_si128(mask, loaded); /* lanes rejected by the mask become zero */ + int mask_result = _mm_movemask_epi8 (final_result); /* NOTE(review): _mm_movemask_epi8 gathers the top bit of every byte of its argument; the argument here is the masked data, not the comparison mask, while the case labels below are whole-lane 0xF patterns. Looks like `mask` was intended; confirm. */ + size_t to_advance = 0; + switch (mask_result) { + case 0xFFFF: + case 0x0FFF: + case 0x00FF: + case 0x000F: + case 0x0: + // no filtering needed + to_advance = 16; + break; + case 0xFF0F: + final_result = _mm_shuffle_epi32(final_result, 0x78); + to_advance = 12; + break; + case 0xF00F: + // shuffle, reverse order of 0b10101100 + final_result = _mm_shuffle_epi32(final_result, 0xac); + to_advance = 8; + break; + case 0x0FF0: + // shuffle, reverse order of 0b11001001 + final_result = _mm_shuffle_epi32(final_result, 0xc9); + to_advance = 8; + break; + case 0xF0F0: + // shuffle, reverse order of 0b10001101 + final_result = _mm_shuffle_epi32(final_result, 0x8d); + to_advance = 8; + break; + case 0x0F0F: + // shuffle, reverse order of 0b11011000 + final_result = _mm_shuffle_epi32(final_result, 0xd8); + 
to_advance = 8; + break; + case 0xF0FF: + final_result = _mm_shuffle_epi32(final_result, 0xb4); + to_advance = 12; + break; + case 0xFFF0: + // shuffle, reversed order of 00011011 + final_result = _mm_shuffle_epi32(final_result, 0x1b); + to_advance = 12; + break; + case 0xFF00: + // shuffle, reversed order of 00001110 + final_result = _mm_shuffle_epi32(final_result, 0xe); + to_advance = 8; + break; + case 0xF000: + // shuffle, reversed order of 00000011 + final_result = _mm_shuffle_epi32(final_result, 0x3); + to_advance = 4; + break; + case 0x0F00: + // shuffle, reversed order of 0b00000010 + final_result = _mm_shuffle_epi32(final_result, 0x2); + to_advance = 4; + break; + case 0x00F0: + // shuffle, reversed order of 0b00000001 + final_result = _mm_shuffle_epi32(final_result, 0x1); + to_advance = 4; + break; + default: + fprintf(stderr, "Got impossible mask value: 0x%x, aborting ...\n", mask_result); + abort(); + } + /* NOTE(review): _mm_load_si128 and _mm_store_si128 require 16-byte aligned addresses; once to_advance is 4, 8 or 12 the next iteration uses a ptr that is no longer a multiple of 16 from the start of the buffer, and a 16-byte access that starts less than 16 bytes before ptr_end reaches past it. Confirm the intended read/write scheme. */ _mm_store_si128 ((__m128i*)ptr, final_result); + ptr += to_advance; + } +#else + /* Fallback without SSE2: gt, lt and eq are not consulted. Every non-zero byte of the old buffer is copied, in order, into a zero-filled buffer of the same size, which then replaces vec->data. NOTE(review): the calloc result is written through without a NULL check. */ const unsigned char* reader = vec->data; + const unsigned char* read_end = reader + vec->size; + void* new_buf = calloc(1, vec->size); + unsigned char* writer = new_buf; + while (reader < read_end) { + if (*reader) { + *writer++ = *reader; + } + ++reader; + } + free(vec->data); + vec->data = new_buf; +#endif + return 1; +} + +#endif // PYSIMD_VEC_FILTER_H diff --git a/setup.py b/setup.py index a50ad65..0954326 100644 --- a/setup.py +++ b/setup.py @@ -5,6 +5,11 @@ DEFAULT_COMPILER = get_default_compiler() +# This attribute determines the minimum alignment required by sizes of a simd.Vec object +# The intention is that, the minimum allows any simd instruction available to be executed +# on vector object without needing to check the length/size of it +pysimd_minimum_align = 8 + pysimd_patch_version = 4 pysimd_minor_version = 0 pysimd_major_version = 0 @@ -62,6 +67,7 @@ """) as sse2_test: if sse2_test.works: macro_defs.append(('PYSIMD_X86_SSE2', '1')) + pysimd_minimum_align = 
16 with CheckCCompiles("sse3", x86_header_string + """ int main(void) { @@ -108,6 +114,7 @@ """) as avx_test: if avx_test.works: macro_defs.append(('PYSIMD_X86_AVX', '1')) + pysimd_minimum_align = 32 if DEFAULT_COMPILER == 'unix': compiler_flags.append('-mavx') @@ -123,6 +130,7 @@ """) as avx2_test: if avx2_test.works: macro_defs.append(('PYSIMD_X86_AVX2', '1')) + pysimd_minimum_align = 32 if DEFAULT_COMPILER == 'unix': compiler_flags.append('-mavx2') @@ -142,9 +150,12 @@ """) as avx512f_test: if avx512f_test.works: macro_defs.append(('PYSIMD_X86_AVX512F', '1')) + pysimd_minimum_align = 64 if DEFAULT_COMPILER == 'unix': compiler_flags.append('-mavx512f') +macro_defs.append(('PYSIMD_MIN_ALIGN', str(pysimd_minimum_align))) + if os.name == 'nt': macro_defs.append(('_CRT_SECURE_NO_WARNINGS', '1')) diff --git a/src/pymain.c b/src/pymain.c index 7356acb..4a7df2c 100644 --- a/src/pymain.c +++ b/src/pymain.c @@ -1,6 +1,7 @@ #include "core_simd_info.h" #include "simd_vec.h" #include "simd_vec_arith.h" +#include "simd_vec_filter.h" #define PY_SSIZE_T_CLEAN #include <Python.h> #include "structmember.h" @@ -329,6 +330,131 @@ SimdObject_as_bytes(SimdObject *self, PyObject *args, PyObject *kwargs) } +/* + * Vec.as_tuple(type=int, width=4) -> tuple + * + * Reinterprets the raw bytes of the vector as consecutive members of 'width' + * bytes and returns them in a new tuple. 'type' must be the int or float type + * object: int accepts widths 1, 2, 4 and 8 (read as signed integers), float + * accepts widths 4 and 8. The tuple holds vec.size / width members, so trailing + * bytes that do not fill a whole member are left out. + * Raises SimdError for an unsupported type or width; on any failure NULL is + * returned with an exception set. + */ +static PyObject* +SimdObject_as_tuple(SimdObject *self, PyObject *args, PyObject *kwargs) +{ + static char *kwlist[] = {"type", "width", NULL}; + PyObject* tuple_to_give = NULL; + PyObject* param_type = (PyObject*)&PyLong_Type; /* borrowed reference; default is int */ + Py_ssize_t param_width = 4; /* default member width: 32 bit */ + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|On", kwlist, + &param_type, &param_width)) { + return NULL; + } + + /* Validate the signed value so that a negative width is reported as given */ + if (param_width != 1 && param_width != 2 && param_width != 4 && param_width != 8) { + PyErr_Format(SimdError, "The width '%zd' is not supported for method 'as_tuple'", param_width); + return NULL; + } + size_t actual_width = (size_t)param_width; + size_t n_members = self->vec.size / actual_width; + tuple_to_give = PyTuple_New(n_members); + if (tuple_to_give == NULL) { + return NULL; + } + + if ((PyTypeObject*)param_type == &PyLong_Type) { + if (actual_width == 1) { + signed char* reader = 
(signed char*)(self->vec.data); /* explicitly signed, like the wider integer readers */ + for (size_t i = 0; i < n_members; ++i) { + PyObject* to_put = PyLong_FromLong(reader[i]); + if (to_put == NULL) { + Py_DECREF(tuple_to_give); + PyErr_Format(PyExc_SystemError, "Internal object failure line: %u", __LINE__); + return NULL; + } + PyTuple_SET_ITEM(tuple_to_give, i, to_put); + } + } else if (actual_width == 2) { + short* reader = (short*)(self->vec.data); + for (size_t i = 0; i < n_members; ++i) { + PyObject* to_put = PyLong_FromLong(reader[i]); + if (to_put == NULL) { + Py_DECREF(tuple_to_give); + PyErr_Format(PyExc_SystemError, "Internal object failure line: %u", __LINE__); + return NULL; + } + PyTuple_SET_ITEM(tuple_to_give, i, to_put); + } + } else if (actual_width == 4) { + int* reader = (int*)(self->vec.data); + for (size_t i = 0; i < n_members; ++i) { + PyObject* to_put = PyLong_FromLong(reader[i]); + if (to_put == NULL) { + Py_DECREF(tuple_to_give); + PyErr_Format(PyExc_SystemError, "Internal object failure line: %u", __LINE__); + return NULL; + } + PyTuple_SET_ITEM(tuple_to_give, i, to_put); + } + } else if (actual_width == 8) { + long long* reader = (long long*)(self->vec.data); + for (size_t i = 0; i < n_members; ++i) { + PyObject* to_put = PyLong_FromLongLong(reader[i]); + if (to_put == NULL) { + Py_DECREF(tuple_to_give); + PyErr_Format(PyExc_SystemError, "Internal object failure line: %u", __LINE__); + return NULL; + } + PyTuple_SET_ITEM(tuple_to_give, i, to_put); + } + } else { + Py_FatalError("Should not reach this point in 'as_tuple', width error"); + } + } else if ((PyTypeObject*)param_type == &PyFloat_Type) { + if (actual_width == 4) { + float* reader = (float*)(self->vec.data); + for (size_t i = 0; i < n_members; ++i) { + PyObject* to_put = PyFloat_FromDouble((double)reader[i]); + if (to_put == NULL) { + Py_DECREF(tuple_to_give); + PyErr_Format(PyExc_SystemError, "Internal object failure line: %u", __LINE__); + return NULL; + } + PyTuple_SET_ITEM(tuple_to_give, i, to_put); + } + } else if (actual_width == 8) 
{ + double* reader = (double*)(self->vec.data); + for (size_t i = 0; i < n_members; ++i) { + PyObject* to_put = PyFloat_FromDouble(reader[i]); + if (to_put == NULL) { + Py_DECREF(tuple_to_give); + PyErr_Format(PyExc_SystemError, "Internal object failure line: %u", __LINE__); + return NULL; + } + PyTuple_SET_ITEM(tuple_to_give, i, to_put); + } + } else { + if (actual_width == 1 || actual_width == 2) { + PyErr_Format(SimdError, "The width '%zu' is not supported for floats for 'as_tuple'", actual_width); + Py_DECREF(tuple_to_give); + return NULL; + } else { + Py_FatalError("Should not reach invalid state for float in 'as_tuple"); + } + } + } else { + /* Report the rejected object by its own type name (e.g. 'str') instead of its metatype */ + const char* type_name = PyType_Check(param_type) ? ((PyTypeObject*)param_type)->tp_name : Py_TYPE(param_type)->tp_name; + Py_DECREF(tuple_to_give); + PyErr_Format(SimdError, "The type '%s' is not supported for method 'as_tuple'", type_name); + return NULL; + } + return tuple_to_give; +} + static PyObject * SimdObject_clear(SimdObject *self, PyObject *Py_UNUSED(ignored)) { @@ -362,6 +488,9 @@ static PyMethodDef SimdObject_methods[] = { {"as_bytes", (PyCFunction) SimdObject_as_bytes, METH_VARARGS | METH_KEYWORDS, "Returns a bytes object representing the internal bytes of the vector" }, + {"as_tuple", (PyCFunction) SimdObject_as_tuple, METH_VARARGS | METH_KEYWORDS, + "Returns a tuple populated with members of the vector, defaults to 32 bit integers" + }, {"copy", (PyCFunction) SimdObject_copy, METH_VARARGS | METH_KEYWORDS, "Returns a copy of the vector" },