diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py index 39f9ddb690..53e141726d 100644 --- a/dev/generate-kernel-signatures.py +++ b/dev/generate-kernel-signatures.py @@ -12,12 +12,16 @@ cuda_kernels_impl = [ + "awkward_Index_nones_as_index", "awkward_ListArray_min_range", "awkward_ListArray_validity", "awkward_BitMaskedArray_to_ByteMaskedArray", "awkward_ListArray_compact_offsets", "awkward_ListOffsetArray_flatten_offsets", "awkward_IndexedArray_overlay_mask", + "awkward_ByteMaskedArray_numnull", + "awkward_IndexedArray_numnull", + "awkward_IndexedArray_numnull_parents", "awkward_IndexedArray_numnull_unique_64", "awkward_NumpyArray_fill", "awkward_ListArray_fill", @@ -43,12 +47,19 @@ "awkward_RegularArray_getitem_next_range", "awkward_RegularArray_getitem_next_range_spreadadvanced", "awkward_RegularArray_getitem_next_array", + "awkward_RegularArray_getitem_next_array_regularize", + "awkward_RegularArray_reduce_local_nextparents", + "awkward_RegularArray_reduce_nonlocal_preparenext", "awkward_missing_repeat", "awkward_RegularArray_getitem_jagged_expand", "awkward_ListArray_getitem_jagged_expand", + "awkward_ListArray_getitem_jagged_carrylen", "awkward_ListArray_getitem_next_array_advanced", "awkward_ListArray_getitem_next_array", "awkward_ListArray_getitem_next_at", + "awkward_ListArray_getitem_next_range_counts", + "awkward_ListArray_rpad_and_clip_length_axis1", + "awkward_ListOffsetArray_reduce_nonlocal_nextstarts_64", "awkward_NumpyArray_reduce_adjust_starts_64", "awkward_NumpyArray_reduce_adjust_starts_shifts_64", "awkward_RegularArray_getitem_next_at", @@ -86,6 +97,7 @@ "awkward_reduce_sum_bool", "awkward_reduce_prod_bool", "awkward_reduce_countnonzero", + "awkward_sorting_ranges_length", ] diff --git a/dev/generate-tests.py b/dev/generate-tests.py index fae659da4e..0360b31f48 100644 --- a/dev/generate-tests.py +++ b/dev/generate-tests.py @@ -35,6 +35,13 @@ def __init__(self, name, typename, direction, role="default"): self.role = role +no_role_kernels = [ + "awkward_NumpyArray_sort_asstrings_uint8", + "awkward_argsort", + "awkward_sort", +] + + class Specification: def __init__(self, templatized_kernel_name, spec, testdata, blacklisted): self.templatized_kernel_name = templatized_kernel_name @@ -51,6 +58,8 @@ def __init__(self, templatized_kernel_name, spec, testdata, blacklisted): ) if blacklisted: self.tests = [] + elif templatized_kernel_name in no_role_kernels: + self.tests = [] else: self.tests = self.gettests(testdata) @@ -185,6 +194,7 @@ def gettests(self, testdata): def readspec(): specdict = {} + specdict_unit = {} with open(os.path.join(CURRENT_DIR, "..", "kernel-specification.yml")) as f: loadfile = yaml.load(f, Loader=yaml.CSafeLoader) @@ -193,6 +203,13 @@ def readspec(): data = json.load(f)["tests"] for spec in indspec: + for childfunc in spec["specializations"]: + specdict_unit[childfunc["name"]] = Specification( + spec["name"], + childfunc, + data, + not spec["automatic-tests"], + ) if "def " in spec["definition"]: for childfunc in spec["specializations"]: specdict[childfunc["name"]] = Specification( @@ -201,7 +218,7 @@ def readspec(): data, not spec["automatic-tests"], ) - return specdict + return specdict, specdict_unit def getdtypes(args): @@ -215,6 +232,8 @@ def getdtypes(args): typename = typename + "_" if count == 1: dtypes.append("cupy." + typename) + elif count == 2: + dtypes.append("cupy." + typename) return dtypes @@ -239,7 +258,12 @@ def checkintrange(test_args, error, args): if "int" in typename or "uint" in typename: dtype = gettypename(typename) min_val, max_val = np.iinfo(dtype).min, np.iinfo(dtype).max - if "List" in typename: + if "List[List" in typename: + for row in val: + for data in row: + if not (min_val <= data <= max_val): + flag = False + elif "List" in typename: for data in val: if not (min_val <= data <= max_val): flag = False @@ -652,12 +676,16 @@ def gencpuunittests(specdict): cuda_kernels_tests = [ + "awkward_Index_nones_as_index", "awkward_ListArray_min_range", "awkward_ListArray_validity", "awkward_BitMaskedArray_to_ByteMaskedArray", "awkward_ListArray_compact_offsets", "awkward_ListOffsetArray_flatten_offsets", "awkward_IndexedArray_overlay_mask", + "awkward_ByteMaskedArray_numnull", + "awkward_IndexedArray_numnull", + "awkward_IndexedArray_numnull_parents", "awkward_IndexedArray_numnull_unique_64", "awkward_NumpyArray_fill", "awkward_ListArray_fill", @@ -683,12 +711,19 @@ def gencpuunittests(specdict): "awkward_RegularArray_getitem_next_range", "awkward_RegularArray_getitem_next_range_spreadadvanced", "awkward_RegularArray_getitem_next_array", + "awkward_RegularArray_getitem_next_array_regularize", + "awkward_RegularArray_reduce_local_nextparents", + "awkward_RegularArray_reduce_nonlocal_preparenext", "awkward_missing_repeat", "awkward_RegularArray_getitem_jagged_expand", "awkward_ListArray_getitem_jagged_expand", + "awkward_ListArray_getitem_jagged_carrylen", "awkward_ListArray_getitem_next_array_advanced", "awkward_ListArray_getitem_next_array", "awkward_ListArray_getitem_next_at", + "awkward_ListArray_getitem_next_range_counts", + "awkward_ListArray_rpad_and_clip_length_axis1", + "awkward_ListOffsetArray_reduce_nonlocal_nextstarts_64", "awkward_NumpyArray_reduce_adjust_starts_64", "awkward_NumpyArray_reduce_adjust_starts_shifts_64", "awkward_RegularArray_getitem_next_at", @@ -726,6 +761,7 @@ def gencpuunittests(specdict): "awkward_reduce_sum_bool", "awkward_reduce_prod_bool", "awkward_reduce_countnonzero", + "awkward_sorting_ranges_length", ] @@ -966,8 +1002,12 @@ def gencudaunittests(specdict): ) ) elif count == 2: - raise NotImplementedError - + f.write( + " " * 4 + + "{} = cupy.array({}, dtype=cupy.{})\n".format( + arg, val, typename + ) + ) cuda_string = ( "funcC = cupy_backend['" + spec.templatized_kernel_name @@ -1068,10 +1108,10 @@ def evalkernels(): if __name__ == "__main__": genpykernels() evalkernels() - specdict = readspec() + specdict, specdict_unit = readspec() genspectests(specdict) gencpukerneltests(specdict) - gencpuunittests(specdict) + gencpuunittests(specdict_unit) genunittests() gencudakerneltests(specdict) - gencudaunittests(specdict) + gencudaunittests(specdict_unit) diff --git a/kernel-specification.yml b/kernel-specification.yml index 852db8c235..4fbc49edb8 100644 --- a/kernel-specification.yml +++ b/kernel-specification.yml @@ -1960,11 +1960,12 @@ kernels: description: null definition: | def awkward_ListArray_min_range(tomin, fromstarts, fromstops, lenstarts): - shorter = fromstops[0] - fromstarts[0] - for i in range(1, lenstarts): - rangeval = fromstops[i] - fromstarts[i] - shorter = shorter if shorter < rangeval else rangeval - tomin[0] = shorter + if lenstarts > 0: + shorter = fromstops[0] - fromstarts[0] + for i in range(1, lenstarts): + rangeval = fromstops[i] - fromstarts[i] + shorter = shorter if shorter < rangeval else rangeval + tomin[0] = shorter automatic-tests: true - name: awkward_ListArray_rpad_and_clip_length_axis1 @@ -5917,5 +5918,9 @@ kernels: - {name: parentslength, type: "int64_t", dir: in, role: default} description: null definition: | - Insert Python definition here + def awkward_sorting_ranges_length(tolength, parents, parentslength): + tolength[0] = 2 + for i in range(1, parentslength): + if parents[i - 1] != parents[i]: + tolength[0] = tolength[0] + 1 automatic-tests: false diff --git a/kernel-test-data.json b/kernel-test-data.json index 9c7dc92172..052edda1d4 100644 --- a/kernel-test-data.json +++ b/kernel-test-data.json @@ -895,6 +895,17 @@ "name": "awkward_index_rpad_and_clip_axis0", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "length": 0, + "target": 0 + }, + "outputs": { + "toindex": [] + } + }, { "error": false, "message": "", @@ -1033,6 +1044,19 @@ "name": "awkward_BitMaskedArray_to_ByteMaskedArray", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "bitmasklength": 0, + "frombitmask": [], + "lsb_order": false, + "validwhen": false + }, + "outputs": { + "tobytemask": [] + } + }, { "error": false, "message": "", @@ -1078,6 +1102,18 @@ "name": "awkward_ByteMaskedArray_getitem_nextcarry", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "length": 0, + "mask": [], + "validwhen": false + }, + "outputs": { + "tocarry": [] + } + }, { "error": false, "message": "", @@ -1108,6 +1144,19 @@ "name": "awkward_ByteMaskedArray_getitem_nextcarry_outindex", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "length": 0, + "mask": [], + "validwhen": true + }, + "outputs": { + "outindex": [], + "tocarry": [] + } + }, { "error": false, "message": "", @@ -1138,8 +1187,20 @@ }, { "name": "awkward_ByteMaskedArray_numnull", - "status": false, + "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "length": 0, + "mask": [], + "validwhen": false + }, + "outputs": { + "numnull": [0] + } + }, { "error": false, "message": "", @@ -1350,6 +1411,18 @@ "name": "awkward_ByteMaskedArray_toIndexedOptionArray", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "length": 0, + "mask": [], + "validwhen": false + }, + "outputs": { + "toindex": [] + } + }, { "error": false, "message": "", @@ -1380,6 +1453,30 @@ "name": "awkward_IndexedArray_flatten_nextcarry", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromindex": [], + "lencontent": 0, + "lenindex": 0 + }, + "outputs": { + "tocarry": [] + } + }, + { + "error": true, + "message": "index out of range", + "inputs": { + "fromindex": [0, 1], + "lencontent": 0, + "lenindex": 2 + }, + "outputs": { + "tocarry": [] + } + }, { "error": true, "message": "index out of range", @@ -1408,8 +1505,32 @@ }, { "name": "awkward_IndexedArray_getitem_nextcarry", - "status": false, + "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromindex": [], + "lencontent": 0, + "lenindex": 0 + }, + "outputs": { + "tocarry": [] + } + }, + { + "error": true, + "message": "index out of range", + "inputs": { + "fromindex": [0, 1], + "lencontent": 0, + "lenindex": 2 + }, + "outputs": { + "tocarry": [] + } + }, { "error": true, "message": "index out of range", @@ -1750,8 +1871,34 @@ }, { "name": "awkward_IndexedArray_getitem_nextcarry_outindex", - "status": false, + "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromindex": [], + "lencontent": 0, + "lenindex": 0 + }, + "outputs": { + "tocarry": [], + "toindex": [] + } + }, + { + "error": true, + "message": "index out of range", + "inputs": { + "fromindex": [0, 1, 2, 4], + "lencontent": 0, + "lenindex": 4 + }, + "outputs": { + "tocarry": [], + "toindex": [] + } + }, { "error": true, "message": "index out of range", @@ -1795,11 +1942,62 @@ }, { "name": "awkward_IndexedArray_numnull", - "status": false, + "status": true, "tests": [ { "error": false, "message": "", + "inputs": { + "fromindex": [], + "lenindex": 0 + }, + "outputs": { + "numnull": [0] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromindex": [1], + "lenindex": 1 + }, + "outputs": { + "numnull": [0] + } + }, + { + "error": false, + "inputs": { + "fromindex": [-1], + "lenindex": 1 + }, + "outputs": { + "numnull": [1] + } + }, + { + "error": false, + "inputs": { + "fromindex": [-1, -1, -1, -1], + "lenindex": 4 + }, + "outputs": { + "numnull": [4] + } + }, + { + "error": false, + "inputs": { + "fromindex": [0, -1, 2, -1, -1, -1, -1], + "lenindex": 7 + }, + "outputs": { + "numnull": [5] + } + }, + { + "error": false, "inputs": { "fromindex": [0, 1], "lenindex": 2 @@ -1943,109 +2141,215 @@ ] }, { - "name": "awkward_IndexedArray_numnull_unique_64", + "name": "awkward_IndexedArray_numnull_parents", "status": true, "tests": [ { "error": false, - "message": "", "inputs": { - "lenindex": 4 + "fromindex": [], + "lenindex": 0 }, "outputs": { - "toindex": [0, 1, 2, 3, -1] + "numnull": [], + "tolength": [0] } }, { "error": false, - "message": "", "inputs": { - "lenindex": 2 + "fromindex": [1], + "lenindex": 1 }, "outputs": { - "toindex": [0, 1, -1] + "numnull": [0], + "tolength": [0] } }, { "error": false, - "message": "", "inputs": { - "lenindex": 0 + "fromindex": [-1], + "lenindex": 1 }, "outputs": { - "toindex": [-1] + "numnull": [1], + "tolength": [1] } }, { "error": false, - "message": "", "inputs": { - "lenindex": 3 + "fromindex": [-1, -1, -1, -1], + "lenindex": 4 }, "outputs": { - "toindex": [0, 1, 2, -1] + "numnull": [1, 1, 1, 1], + "tolength": [4] } }, { "error": false, - "message": "", "inputs": { - "lenindex": 1 + "fromindex": [0, -1, 2, -1, -1, -1, -1], + "lenindex": 7 }, "outputs": { - "toindex": [0, -1] + "numnull": [0, 1, 0, 1, 1, 1, 1], + "tolength": [5] } - } - ] - }, - { - "name": "awkward_IndexedArray_overlay_mask", - "status": true, - "tests": [ + }, { "error": false, - "message": "", "inputs": { - "fromindex": [5, 4, 3, 2, 1, 0], - "length": 6, - "mask": [0, 0, 0, 0, 0, 0] + "fromindex": [0, 1], + "lenindex": 2 }, "outputs": { - "toindex": [5, 4, 3, 2, 1, 0] + "numnull": [0, 0], + "tolength": [0] } - } - ] - }, - { - "name": "awkward_ListArray_broadcast_tooffsets", - "status": false, - "tests": [ + }, { "error": false, - "message": "", "inputs": { - "fromoffsets": [0, 2, 4, 6, 8, 10, 12, 13, 14, 15, 16], - "fromstarts": [0, 0, 0, 0, 0, 0, 4, 4, 4, 4], - "fromstops": [2, 2, 2, 2, 2, 2, 5, 5, 5, 5], - "lencontent": 5, - "offsetslength": 11 + "fromindex": [0, 1, 2, 3], + "lenindex": 4 }, "outputs": { - "tocarry": [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 4, 4, 4, 4] + "numnull": [0, 0, 0, 0], + "tolength": [0] } }, { "error": false, - "message": "", "inputs": { - "fromoffsets": [0, 2, 4, 6, 8, 10], - "fromstarts": [0, 0, 0, 3, 3], - "fromstops": [2, 2, 2, 5, 5], - "lencontent": 5, - "offsetslength": 6 + "fromindex": [0, 1, -2, 3, -4, 5, -6], + "lenindex": 7 }, "outputs": { - "tocarry": [0, 1, 0, 1, 0, 1, 3, 4, 3, 4] + "numnull": [0, 0, 1, 0, 1, 0, 1], + "tolength": [3] + } + } + ] + }, + { + "name": "awkward_IndexedArray_numnull_unique_64", + "status": true, + "tests": [ + { + "error": false, + "message": "", + "inputs": { + "lenindex": 4 + }, + "outputs": { + "toindex": [0, 1, 2, 3, -1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "lenindex": 2 + }, + "outputs": { + "toindex": [0, 1, -1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "lenindex": 0 + }, + "outputs": { + "toindex": [-1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "lenindex": 3 + }, + "outputs": { + "toindex": [0, 1, 2, -1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "lenindex": 1 + }, + "outputs": { + "toindex": [0, -1] + } + } + ] + }, + { + "name": "awkward_IndexedArray_overlay_mask", + "status": true, + "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromindex": [], + "length": 0, + "mask": [] + }, + "outputs": { + "toindex": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromindex": [5, 4, 3, 2, 1, 0], + "length": 6, + "mask": [0, 0, 0, 0, 0, 0] + }, + "outputs": { + "toindex": [5, 4, 3, 2, 1, 0] + } + } + ] + }, + { + "name": "awkward_ListArray_broadcast_tooffsets", + "status": false, + "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromoffsets": [0, 2, 4, 6, 8, 10, 12, 13, 14, 15, 16], + "fromstarts": [0, 0, 0, 0, 0, 0, 4, 4, 4, 4], + "fromstops": [2, 2, 2, 2, 2, 2, 5, 5, 5, 5], + "lencontent": 5, + "offsetslength": 11 + }, + "outputs": { + "tocarry": [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 4, 4, 4, 4] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromoffsets": [0, 2, 4, 6, 8, 10], + "fromstarts": [0, 0, 0, 3, 3], + "fromstops": [2, 2, 2, 5, 5], + "lencontent": 5, + "offsetslength": 6 + }, + "outputs": { + "tocarry": [0, 1, 0, 1, 0, 1, 3, 4, 3, 4] } }, { @@ -4305,229 +4609,491 @@ ] }, { - "name": "awkward_RegularArray_localindex", + "name": "awkward_RegularArray_reduce_local_nextparents", "status": true, "tests": [ { "error": false, "message": "", "inputs": { - "length": 2, - "size": 3 + "size": 3, + "length": 0 }, "outputs": { - "toindex": [0, 1, 2, 0, 1, 2] + "nextparents": [] } }, { "error": false, "message": "", "inputs": { - "length": 6, - "size": 5 + "size": 0, + "length": 0 }, "outputs": { - "toindex": [0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4] + "nextparents": [] } - } - ] - }, - { - "name": "awkward_RegularArray_rpad_and_clip_axis1", - "status": true, - "tests": [ + }, { "error": false, "message": "", "inputs": { - "length": 3, - "size": 2, - "target": 2 + "size": 0, + "length": 2 }, "outputs": { - "toindex": [0, 1, 2, 3, 4, 5] + "nextparents": [] } }, { "error": false, "message": "", "inputs": { - "length": 2, "size": 3, - "target": 3 + "length": 2 }, "outputs": { - "toindex": [0, 1, 2, 3, 4, 5] + "nextparents": [0, 0, 0, 1, 1, 1] } }, { "error": false, "message": "", "inputs": { - "length": 3, - "size": 3, - "target": 3 + "size": 1, + "length": 1 }, "outputs": { - "toindex": [0, 1, 2, 3, 4, 5, 6, 7, 8] + "nextparents": [0] } - }, + } + ] + }, + { + "name": "awkward_RegularArray_reduce_nonlocal_preparenext", + "status": true, + "tests": [ { "error": false, "message": "", "inputs": { - "length": 3, + "parents": [], "size": 3, - "target": 2 + "length": 0 }, "outputs": { - "toindex": [0, 1, 3, 4, 6, 7] + "nextcarry": [], + "nextparents": [] } }, { "error": false, "message": "", "inputs": { - "length": 6, - "size": 5, - "target": 2 + "parents": [], + "size": 0, + "length": 0 }, "outputs": { - "toindex": [0, 1, 5, 6, 10, 11, 15, 16, 20, 21, 25, 26] + "nextcarry": [], + "nextparents": [] } }, { "error": false, "message": "", "inputs": { - "length": 3, - "size": 2, - "target": 1 + "parents": [0, 1], + "size": 0, + "length": 2 }, "outputs": { - "toindex": [0, 2, 4] + "nextcarry": [], + "nextparents": [] } }, { "error": false, "message": "", "inputs": { - "length": 3, + "parents": [0, 1], "size": 3, - "target": 1 + "length": 2 }, "outputs": { - "toindex": [0, 3, 6] + "nextcarry": [0, 3, 1, 4, 2, 5], + "nextparents": [0, 3, 1, 4, 2, 5] } - } - ] - }, - { - "name": "awkward_RegularArray_getitem_carry", - "status": true, - "tests": [ + }, { "error": false, "message": "", "inputs": { - "fromcarry": [0, 0], - "lencarry": 2, - "size": 1 + "parents": [2, 4, 6], + "size": 3, + "length": 3 }, "outputs": { - "tocarry": [0, 0] + "nextcarry": [0, 3, 6, 1, 4, 7, 2, 5, 8], + "nextparents": [6, 12, 18, 7, 13, 19, 8, 14, 20] } }, { "error": false, "message": "", "inputs": { - "fromcarry": [0, 0, 0, 1, 1, 1], - "lencarry": 6, - "size": 1 + "parents": [0], + "size": 1, + "length": 1 }, "outputs": { - "tocarry": [0, 0, 0, 1, 1, 1] + "nextcarry": [0], + "nextparents": [0] } - }, + } + ] + }, + { + "name": "awkward_RegularArray_localindex", + "status": true, + "tests": [ { "error": false, "message": "", "inputs": { - "fromcarry": [0, 0, 0, 2, 2], - "lencarry": 5, - "size": 1 + "length": 0, + "size": 3 }, "outputs": { - "tocarry": [0, 0, 0, 2, 2] + "toindex": [] } }, { "error": false, "message": "", "inputs": { - "fromcarry": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4], - "lencarry": 10, - "size": 1 + "length": 0, + "size": 0 }, "outputs": { - "tocarry": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4] + "toindex": [] } }, { "error": false, "message": "", "inputs": { - "fromcarry": [0, 1, 0, 1, 0, 1, 3, 4, 3, 4], - "lencarry": 10, - "size": 1 + "length": 2, + "size": 0 }, "outputs": { - "tocarry": [0, 1, 0, 1, 0, 1, 3, 4, 3, 4] + "toindex": [] } }, { "error": false, "message": "", "inputs": { - "fromcarry": [0, 0, 1, 1], - "lencarry": 4, - "size": 2 + "length": 2, + "size": 3 }, "outputs": { - "tocarry": [0, 1, 0, 1, 2, 3, 2, 3] + "toindex": [0, 1, 2, 0, 1, 2] } }, { "error": false, "message": "", "inputs": { - "fromcarry": [0, 0, 0, 0], - "lencarry": 4, - "size": 3 + "length": 6, + "size": 5 }, "outputs": { - "tocarry": [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2] + "toindex": [0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4] } - }, + } + ] + }, + { + "name": "awkward_RegularArray_rpad_and_clip_axis1", + "status": true, + "tests": [ { "error": false, "message": "", "inputs": { - "fromcarry": [0, 1, 1, 2], - "lencarry": 4, - "size": 2 + "length": 3, + "size": 0, + "target": 2 }, "outputs": { - "tocarry": [0, 1, 2, 3, 2, 3, 4, 5] + "toindex": [-1, -1, -1, -1, -1, -1] } }, { "error": false, "message": "", "inputs": { - "fromcarry": [0, 0], - "lencarry": 2, + "length": 0, + "size": 0, + "target": 0 + }, + "outputs": { + "toindex": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 0, + "size": 0, + "target": 2 + }, + "outputs": { + "toindex": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 0, + "size": 2, + "target": 2 + }, + "outputs": { + "toindex": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 3, + "size": 2, + "target": 2 + }, + "outputs": { + "toindex": [0, 1, 2, 3, 4, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 2, + "size": 3, + "target": 3 + }, + "outputs": { + "toindex": [0, 1, 2, 3, 4, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 3, + "size": 3, + "target": 3 + }, + "outputs": { + "toindex": [0, 1, 2, 3, 4, 5, 6, 7, 8] + } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 3, + "size": 3, + "target": 2 + }, + "outputs": { + "toindex": [0, 1, 3, 4, 6, 7] + } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 6, + "size": 5, + "target": 2 + }, + "outputs": { + "toindex": [0, 1, 5, 6, 10, 11, 15, 16, 20, 21, 25, 26] + } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 3, + "size": 2, + "target": 1 + }, + "outputs": { + "toindex": [0, 2, 4] + } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 3, + "size": 3, + "target": 1 + }, + "outputs": { + "toindex": [0, 3, 6] + } + } + ] + }, + { + "name": "awkward_RegularArray_getitem_carry", + "status": true, + "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromcarry": [0, 0], + "lencarry": 2, + "size": 0 + }, + "outputs": { + "tocarry": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromcarry": [], + "lencarry": 0, + "size": 1 + }, + "outputs": { + "tocarry": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromcarry": [], + "lencarry": 0, + "size": 0 + }, + "outputs": { + "tocarry": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromcarry": [0, 0], + "lencarry": 2, + "size": 1 + }, + "outputs": { + "tocarry": [0, 0] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromcarry": [0, 0, 0, 1, 1, 1], + "lencarry": 6, + "size": 1 + }, + "outputs": { + "tocarry": [0, 0, 0, 1, 1, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromcarry": [0, 0, 0, 2, 2], + "lencarry": 5, + "size": 1 + }, + "outputs": { + "tocarry": [0, 0, 0, 2, 2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromcarry": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4], + "lencarry": 10, + "size": 1 + }, + "outputs": { + "tocarry": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromcarry": [0, 1, 0, 1, 0, 1, 3, 4, 3, 4], + "lencarry": 10, + "size": 1 + }, + "outputs": { + "tocarry": [0, 1, 0, 1, 0, 1, 3, 4, 3, 4] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromcarry": [0, 0, 1, 1], + "lencarry": 4, + "size": 2 + }, + "outputs": { + "tocarry": [0, 1, 0, 1, 2, 3, 2, 3] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromcarry": [0, 0, 0, 0], + "lencarry": 4, + "size": 3 + }, + "outputs": { + "tocarry": [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromcarry": [0, 1, 1, 2], + "lencarry": 4, + "size": 2 + }, + "outputs": { + "tocarry": [0, 1, 2, 3, 2, 3, 4, 5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromcarry": [0, 0], + "lencarry": 2, "size": 5 }, "outputs": { @@ -4960,6 +5526,71 @@ "name": "awkward_RegularArray_getitem_jagged_expand", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "regularlength": 0, + "regularsize": 0, + "singleoffsets": [0] + }, + "outputs": { + "multistarts": [], + "multistops": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "regularlength": 1, + "regularsize": 0, + "singleoffsets": [1] + }, + "outputs": { + "multistarts": [], + "multistops": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "regularlength": 0, + "regularsize": 0, + "singleoffsets": [0] + }, + "outputs": { + "multistarts": [], + "multistops": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "regularlength": 1, + "regularsize": 1, + "singleoffsets": [0, 2] + }, + "outputs": { + "multistarts": [0], + "multistops": [2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "regularlength": 2, + "regularsize": 1, + "singleoffsets": [0, 2] + }, + "outputs": { + "multistarts": [0, 0], + "multistops": [2, 2] + } + }, { "error": false, "message": "", @@ -6767,8 +7398,20 @@ }, { "name": "awkward_RegularArray_getitem_next_array_regularize", - "status": false, + "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromarray": [], + "lenarray": 0, + "size": 2 + }, + "outputs": { + "toarray": [] + } + }, { "error": false, "message": "", @@ -8725,6 +9368,18 @@ "name": "awkward_ListArray_getitem_next_range_spreadadvanced", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromadvanced": [], + "fromoffsets": [0], + "lenstarts": 0 + }, + "outputs": { + "toadvanced": [] + } + }, { "error": false, "message": "", @@ -8767,6 +9422,42 @@ "name": "awkward_RegularArray_getitem_next_range_spreadadvanced", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromadvanced": [], + "length": 0, + "nextsize": 2 + }, + "outputs": { + "toadvanced": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromadvanced": [], + "length": 1, + "nextsize": 0 + }, + "outputs": { + "toadvanced": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromadvanced": [], + "length": 0, + "nextsize": 0 + }, + "outputs": { + "toadvanced": [] + } + }, { "error": false, "message": "", @@ -8803,11 +9494,11 @@ "inputs": { "inneroffsets": [0], "inneroffsetslen": 1, - "outeroffsets": [0, 0, 0, 0, 0], - "outeroffsetslen": 5 + "outeroffsets": [], + "outeroffsetslen": 0 }, "outputs": { - "tooffsets": [0, 0, 0, 0, 0] + "tooffsets": [] } }, { @@ -9228,6 +9919,20 @@ "name": "awkward_MaskedArray_getitem_next_jagged_project", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "index": [], + "length": 0, + "starts_in": [], + "stops_in": [] + }, + "outputs": { + "starts_out": [], + "stops_out": [] + } + }, { "error": false, "message": "", @@ -9692,6 +10397,19 @@ "name": "awkward_ByteMaskedArray_overlay_mask", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "length": 0, + "mymask": [], + "theirmask": [], + "validwhen": false + }, + "outputs": { + "tomask": [] + } + }, { "error": false, "message": "", @@ -9704,6 +10422,58 @@ "outputs": { "tomask": [0, 0] } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 2, + "mymask": [0, 0], + "theirmask": [0, 0], + "validwhen": true + }, + "outputs": { + "tomask": [1, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 2, + "mymask": [1, 0], + "theirmask": [0, 1], + "validwhen": false + }, + "outputs": { + "tomask": [1, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 2, + "mymask": [0, 0], + "theirmask": [0, 1], + "validwhen": false + }, + "outputs": { + "tomask": [0, 1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 2, + "mymask": [1, 0], + "theirmask": [0, 0], + "validwhen": false + }, + "outputs": { + "tomask": [1, 0] + } } ] }, @@ -9806,8 +10576,22 @@ }, { "name": "awkward_IndexedArray_reduce_next_64", - "status": false, + "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "index": [], + "length": 0, + "parents": [] + }, + "outputs": { + "nextcarry": [], + "nextparents": [], + "outindex": [] + } + }, { "error": false, "message": "", @@ -9968,6 +10752,45 @@ "name": "awkward_IndexedArray_simplify", "status": true, "tests": [ + { + "error": true, + "message": "index out of range", + "inputs": { + "innerindex": [], + "innerlength": 0, + "outerindex": [0, 3], + "outerlength": 2 + }, + "outputs": { + "toindex": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "innerindex": [], + "innerlength": 0, + "outerindex": [], + "outerlength": 0 + }, + "outputs": { + "toindex": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "innerindex": [0, 1], + "innerlength": 2, + "outerindex": [], + "outerlength": 0 + }, + "outputs": { + "toindex": [123, 123] + } + }, { "error": true, "message": "index out of range", @@ -10429,6 +11252,39 @@ "name": "awkward_IndexedArray_validity", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "index": [], + "isoption": true, + "lencontent": 3, + "length": 0 + }, + "outputs": {} + }, + { + "error": false, + "message": "", + "inputs": { + "index": [], + "isoption": true, + "lencontent": 0, + "length": 0 + }, + "outputs": {} + }, + { + "error": true, + "message": "index[i] >= len(content)", + "inputs": { + "index": [0, 1, 1, 1, 1, 3], + "isoption": true, + "lencontent": 0, + "length": 6 + }, + "outputs": {} + }, { "error": true, "message": "index[i] >= len(content)", @@ -10875,6 +11731,17 @@ "name": "awkward_IndexedOptionArray_rpad_and_clip_mask_axis1", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "frommask": [], + "length": 0 + }, + "outputs": { + "toindex": [] + } + }, { "error": false, "message": "", @@ -10892,6 +11759,18 @@ "name": "awkward_index_rpad_and_clip_axis1", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "length": 0, + "target": 1 + }, + "outputs": { + "tostarts": [], + "tostops": [] + } + }, { "error": false, "message": "", @@ -11163,8 +12042,56 @@ }, { "name": "awkward_ListArray_getitem_jagged_carrylen", - "status": false, + "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "slicestarts": [], + "slicestops": [], + "sliceouterlen": 0 + }, + "outputs": { + "carrylen": [0] + } + }, + { + "error": false, + "message": "", + "inputs": { + "slicestarts": [0, 2], + "slicestops": [0, 2], + "sliceouterlen": 2 + }, + "outputs": { + "carrylen": [0] + } + }, + { + "error": false, + "message": "", + "inputs": { + "slicestarts": [2], + "slicestops": [4], + "sliceouterlen": 1 + }, + "outputs": { + "carrylen": [2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "slicestarts": [1], + "slicestops": [1], + "sliceouterlen": 1 + }, + "outputs": { + "carrylen": [0] + } + }, { "error": false, "message": "", @@ -11651,7 +12578,7 @@ }, { "name": "awkward_ListArray_getitem_jagged_expand", - "status": false, + "status": true, "tests": [ { "error": true, @@ -11857,6 +12784,18 @@ "name": "awkward_ListArray_min_range", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromstarts": [], + "fromstops": [], + "lenstarts": 0 + }, + "outputs": { + "tomin": [] + } + }, { "error": false, "message": "", @@ -11911,6 +12850,19 @@ "name": "awkward_ListArray_rpad_and_clip_length_axis1", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromstarts": [], + "fromstops": [], + "lenstarts": 0, + "target": 1 + }, + "outputs": { + "tomin": [0] + } + }, { "error": false, "message": "", @@ -12060,6 +13012,39 @@ "name": "awkward_ListArray_validity", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "lencontent": 0, + "length": 0, + "starts": [], + "stops": [] + }, + "outputs": {} + }, + { + "error": false, + "message": "", + "inputs": { + "lencontent": 2, + "length": 0, + "starts": [], + "stops": [] + }, + "outputs": {} + }, + { + "error": true, + "message": "stop[i] > len(content)", + "inputs": { + "lencontent": 0, + "length": 3, + "starts": [0, 0, 1], + "stops": [0, 1, 5] + }, + "outputs": {} + }, { "error": true, "message": "start[i] > stop[i]", @@ -13294,15 +14279,28 @@ "length": 8 }, "outputs": { - "size": [2] + "size": [2] + } + } + ] + }, + { + "name": "awkward_IndexedArray_fill", + "status": true, + "tests": [ + { + "error": false, + "message": "", + "inputs": { + "base": 0, + "fromindex": [], + "length": 0, + "toindexoffset": 0 + }, + "outputs": { + "toindex": [] } - } - ] - }, - { - "name": "awkward_IndexedArray_fill", - "status": true, - "tests": [ + }, { "error": false, "message": "", @@ -13374,6 +14372,18 @@ "name": "awkward_IndexedArray_fill_count", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "base": 0, + "length": 0, + "toindexoffset": 0 + }, + "outputs": { + "toindex": [] + } + }, { "error": false, "message": "", @@ -13428,6 +14438,22 @@ "name": "awkward_ListArray_fill", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "base": 0, + "fromstarts": [], + "fromstops": [], + "length": 0, + "tostartsoffset": 0, + "tostopsoffset": 0 + }, + "outputs": { + "tostarts": [], + "tostops": [] + } + }, { "error": false, "message": "", @@ -13606,6 +14632,18 @@ "toindex": [0, 0, 1, 1] } }, + { + "error": false, + "message": "", + "inputs": { + "fromindex": [], + "length": 0, + "toindexoffset": 0 + }, + "outputs": { + "toindex": [] + } + }, { "error": false, "message": "", @@ -13620,10 +14658,41 @@ } ] }, + { + "name": "awkward_UnionArray_flatten_length", + "status": false, + "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromtags": [0, 0, 0, 0], + "fromindex": [0, 1, 2, 3], + "length": 4, + "offsetsraws": [[0, 1, 3, 5, 7], [1, 3, 5, 7, 9]] + }, + "outputs": { + "total_length": [7] + } + } + ] + }, { "name": "awkward_UnionArray_validity", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "index": [], + "lencontents": [], + "length": 0, + "numcontents": 2, + "tags": [] + }, + "outputs": {} + }, { "error": true, "message": "tags[i] < 0", @@ -13746,10 +14815,119 @@ } ] }, + { + "name": "awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_fromshifts_64", + "status": true, + "tests": [ + { + "error": false, + "message": "", + "inputs": { + "length": 0, + "mask": [], + "valid_when": false, + "shifts": [] + }, + "outputs": { + "nextshifts": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 7, + "mask": [0, 0, 0, 1, 1, 0, 0], + "valid_when": false, + "shifts": [0, 1, 1, 0, 1, 1, 0] + }, + "outputs": { + "nextshifts": [0, 1, 1, 3, 2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 1, + "mask": [0], + "valid_when": false, + "shifts": [0] + }, + "outputs": { + "nextshifts": [0] + } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 1, + "mask": [0], + "valid_when": false, + "shifts": [1] + }, + "outputs": { + "nextshifts": [1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 1, + "mask": [0], + "valid_when": true, + "shifts": [1] + }, + "outputs": { + "nextshifts": [123] + } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 7, + "mask": [0, 0, 0, 1, 1, 0, 0], + "valid_when": true, + "shifts": [0, 1, 1, 0, 1, 1, 0] + }, + "outputs": { + "nextshifts": [3, 4] + } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 5, + "mask": [0, 1, 0, 1, 1], + "valid_when": true, + "shifts": [0, 0, 1, 0, 0] + }, + "outputs": { + "nextshifts": [1, 2, 2] + } + } + ] + }, { "name": "awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_64", - "status": false, + "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "length": 0, + "mask": [], + "valid_when": false + }, + "outputs": { + "nextshifts": [] + } + }, { "error": false, "message": "", @@ -13759,7 +14937,136 @@ "valid_when": false }, "outputs": { - "nextshifts": [0, 0, 0, 2, 2] + "nextshifts": [0, 0, 0, 2, 2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 1, + "mask": [0], + "valid_when": false + }, + "outputs": { + "nextshifts": [0] + } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 1, + "mask": [0], + "valid_when": true + }, + "outputs": { + "nextshifts": [123] + } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 7, + "mask": [0, 0, 0, 1, 1, 0, 0], + "valid_when": true + }, + "outputs": { + "nextshifts": [3, 3] + } + }, + { + "error": false, + "message": "", + "inputs": { + "length": 5, + "mask": [0, 1, 0, 1, 1], + "valid_when": true + }, + "outputs": { + "nextshifts": [1, 2, 2] + } + } + ] + }, + { + "name": "awkward_ByteMaskedArray_reduce_next_64", + "status": true, + "tests": [ + { + "error": false, + "message": "", + "inputs": { + "mask": [], + "parents": [], + "length": 0, + "validwhen": false + }, + "outputs": { + "nextcarry": [], + "nextparents": [], + "outindex": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "mask": [0, 0, 0, 1, 1, 0, 0], + "parents": [0, 0, 1, 1, 2, 2, 2], + "length": 7, + "validwhen": false + }, + "outputs": { + "nextcarry": [0, 1, 2, 5, 6], + "nextparents": [0, 0, 1, 2, 2], + "outindex": [0, 1, 2, -1, -1, 3, 4] + } + }, + { + "error": false, + "message": "", + "inputs": { + "mask": [0], + "parents": [2], + "length": 1, + "validwhen": false + }, + "outputs": { + "nextcarry": [0], + "nextparents": [2], + "outindex": [0] + } + }, + { + "error": false, + "message": "", + "inputs": { + "mask": [1], + "parents": [1], + "length": 1, + "validwhen": false + }, + "outputs": { + "nextcarry": [123], + "nextparents": [123], + "outindex": [-1] + } + }, + { + "error": false, + "message": "", + "inputs": { + "mask": [0, 1, 0, 1, 1], + "parents": [0, 0, 1, 1, 1], + "length": 5, + "validwhen": true + }, + "outputs": { + "nextcarry": [1, 3, 4], + "nextparents": [0, 1, 1], + "outindex": [-1, 0, -1, 1, 2] } } ] @@ -13768,6 +15075,19 @@ "name": "awkward_IndexedArray_index_of_nulls", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromindex": [], + "lenindex": 0, + "parents": [], + "starts": [] + }, + "outputs": { + "toindex": [] + } + }, { "error": false, "message": "", @@ -13992,8 +15312,19 @@ }, { "name": "awkward_IndexedArray_reduce_next_nonlocal_nextshifts_64", - "status": false, + "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "index": [], + "length": 0 + }, + "outputs": { + "nextshifts": [] + } + }, { "error": false, "message": "", @@ -14108,8 +15439,20 @@ }, { "name": "awkward_IndexedArray_reduce_next_nonlocal_nextshifts_fromshifts_64", - "status": false, + "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "index": [], + "length": 0, + "shifts": [] + }, + "outputs": { + "nextshifts": [] + } + }, { "error": false, "message": "", @@ -14998,6 +16341,19 @@ "name": "awkward_ListArray_getitem_next_at", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "at": -2, + "fromstarts": [], + "fromstops": [], + "lenstarts": 0 + }, + "outputs": { + "tocarry": [] + } + }, { "error": true, "message": "index out of range", @@ -15691,8 +17047,19 @@ }, { "name": "awkward_ListArray_getitem_next_range_counts", - "status": false, + "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromoffsets": [], + "lenstarts": 0 + }, + "outputs": { + "total": [0] + } + }, { "error": false, "message": "", @@ -15714,6 +17081,28 @@ "outputs": { "total": [9] } + }, + { + "error": false, + "message": "", + "inputs": { + "fromoffsets": [0, 0, 0, 0], + "lenstarts": 3 + }, + "outputs": { + "total": [0] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromoffsets": [0, 3], + "lenstarts": 1 + }, + "outputs": { + "total": [3] + } } ] }, @@ -15848,6 +17237,51 @@ "name": "awkward_ListArray_rpad_axis1", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromstarts": [], + "fromstops": [], + "length": 0, + "target": 4 + }, + "outputs": { + "toindex": [], + "tostarts": [], + "tostops": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromstarts": [], + "fromstops": [], + "length": 0, + "target": 0 + }, + "outputs": { + "toindex": [], + "tostarts": [], + "tostops": [] + } + }, + { + "error": false, + "message": "", + "inputs": { + "fromstarts": [0, 3, 4, 5, 8], + "fromstops": [3, 3, 6, 8, 9], + "length": 5, + "target": 0 + }, + "outputs": { + "toindex": [0, 1, 2, 4, 5, 5, 6, 7, 8, 123, 123], + "tostarts": [0, 3, 3, 5, 8], + "tostops": [3, 3, 5, 8, 9] + } + }, { "error": false, "message": "", @@ -16839,8 +18273,19 @@ }, { "name": "awkward_ListOffsetArray_reduce_nonlocal_nextstarts_64", - "status": false, + "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "nextlen": 0, + "nextparents": [] + }, + "outputs": { + "nextstarts": [] + } + }, { "error": false, "message": "", @@ -17527,6 +18972,18 @@ "toptr": [0, 1, 3] } }, + { + "error": false, + "message": "", + "inputs": { + "tooffset": 0, + "fromptr": [], + "length": 0 + }, + "outputs": { + "toptr": [] + } + }, { "error": false, "message": "", @@ -17995,6 +19452,17 @@ "name": "awkward_UnionArray_fillna", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "fromindex": [], + "length": 0 + }, + "outputs": { + "toindex": [] + } + }, { "error": false, "message": "", @@ -18144,6 +19612,19 @@ "name": "awkward_UnionArray_filltags", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "base": 0, + "fromtags": [], + "length": 0, + "totagsoffset": 0 + }, + "outputs": { + "totags": [] + } + }, { "error": false, "message": "", @@ -18163,6 +19644,18 @@ "name": "awkward_UnionArray_filltags_const", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "base": 0, + "length": 0, + "totagsoffset": 0 + }, + "outputs": { + "totags": [] + } + }, { "error": false, "message": "", @@ -18215,7 +19708,7 @@ }, { "name": "awkward_UnionArray_project", - "status": false, + "status": true, "tests": [ { "error": false, @@ -18568,6 +20061,19 @@ "toindex": [0, 1, -1, -1, -1, 5, -1, 7, 8, 9, -1, -1, -1, 13, -1, -1] } }, + { + "error": false, + "message": "", + "inputs": { + "bitmasklength": 0, + "frombitmask": [], + "lsb_order": false, + "validwhen": false + }, + "outputs": { + "toindex": [] + } + }, { "error": false, "message": "", @@ -18666,7 +20172,7 @@ }, { "name": "awkward_Content_getitem_next_missing_jagged_getmaskstartstop", - "status": false, + "status": true, "tests": [ { "error": false, @@ -18682,6 +20188,20 @@ "stops_out": [1, 1] } }, + { + "error": false, + "message": "", + "inputs": { + "index_in": [], + "length": 0, + "offsets_in": [] + }, + "outputs": { + "mask_out": [], + "starts_out": [], + "stops_out": [] + } + }, { "error": false, "message": "", @@ -21902,6 +23422,25 @@ "name": "awkward_UnionArray_simplify", "status": true, "tests": [ + { + "error": false, + "message": "", + "inputs": { + "base": 0, + "innerindex": [], + "innertags": [], + "innerwhich": 0, + "length": 0, + "outerindex": [], + "outertags": [], + "outerwhich": 1, + "towhich": 1 + }, + "outputs": { + "toindex": [], + "totags": [] + } + }, { "error": false, "message": "", @@ -21941,6 +23480,67 @@ } } ] + }, + { + "name": "awkward_sorting_ranges_length", + "status": true, + "tests": [ + { + "error": false, + "message": "", + "inputs": { + "parents": [], + "parentslength": 0 + }, + "outputs": { + "tolength": [2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "parents": [0, 1], + "parentslength": 2 + }, + "outputs": { + "tolength": [3] + } + }, + { + "error": false, + "message": "", + "inputs": { + "parents": [0, 3, 6, 9], + "parentslength": 4 + }, + "outputs": { + "tolength": [5] + } + }, + { + "error": false, + "message": "", + "inputs": { + "parents": [3, 3, 3, 3], + "parentslength": 4 + }, + "outputs": { + "tolength": [2] + } + }, + { + "error": false, + "message": "", + "inputs": { + "parents": [2, 4, 4], + "parentslength": 3 + }, + "outputs": { + "tolength": [3] + } + } + ] } ] } diff --git a/src/awkward/_connect/cuda/__init__.py b/src/awkward/_connect/cuda/__init__.py index b3e3857d9c..df44325b91 100644 --- a/src/awkward/_connect/cuda/__init__.py +++ b/src/awkward/_connect/cuda/__init__.py @@ -71,7 +71,11 @@ def fetch_template_specializations(kernel_dict): # These cuda kernels consist of multiple kernels that don't have templated # specializations of the same name (e.g. '_a', '_b'). kernel_exclusions = [ + "awkward_Index_nones_as_index", "awkward_ByteMaskedArray_getitem_nextcarry", + "awkward_ByteMaskedArray_numnull", + "awkward_IndexedArray_numnull", + "awkward_IndexedArray_numnull_parents", "awkward_ByteMaskedArray_getitem_nextcarry_outindex", "awkward_ByteMaskedArray_reduce_next_64", "awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_64", @@ -80,13 +84,21 @@ def fetch_template_specializations(kernel_dict): "awkward_IndexedArray_flatten_nextcarry", "awkward_IndexedArray_getitem_nextcarry", "awkward_IndexedArray_getitem_nextcarry_outindex", + "awkward_ListArray_getitem_next_range_counts", "awkward_IndexedArray_index_of_nulls", "awkward_IndexedArray_reduce_next_64", "awkward_IndexedArray_reduce_next_nonlocal_nextshifts_64", "awkward_IndexedArray_reduce_next_nonlocal_nextshifts_fromshifts_64", "awkward_IndexedOptionArray_rpad_and_clip_mask_axis1", "awkward_ListArray_compact_offsets", + "awkward_ListArray_getitem_jagged_carrylen", + "awkward_ListArray_min_range", + "awkward_ListArray_rpad_and_clip_length_axis1", + "awkward_ListOffsetArray_reduce_nonlocal_nextstarts_64", "awkward_MaskedArray_getitem_next_jagged_project", + "awkward_RegularArray_getitem_next_array_regularize", + "awkward_RegularArray_reduce_local_nextparents", + "awkward_RegularArray_reduce_nonlocal_preparenext", "awkward_UnionArray_project", "awkward_reduce_count_64", "awkward_reduce_sum", @@ -99,6 +111,7 @@ def fetch_template_specializations(kernel_dict): "awkward_reduce_countnonzero", "awkward_reduce_max", "awkward_reduce_min", + "awkward_sorting_ranges_length", ] template_specializations = [] import re diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_BitMaskedArray_to_ByteMaskedArray.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_BitMaskedArray_to_ByteMaskedArray.cu index 6599717f47..77525faa9d 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_BitMaskedArray_to_ByteMaskedArray.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_BitMaskedArray_to_ByteMaskedArray.cu @@ -2,15 +2,16 @@ template __global__ void -awkward_BitMaskedArray_to_ByteMaskedArray(T* tobytemask, - const C* frombitmask, - int64_t bitmasklength, - bool validwhen, - bool lsb_order, - uint64_t invocation_index, - uint64_t* err_code) { - uint64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; +awkward_BitMaskedArray_to_ByteMaskedArray( + T* tobytemask, + const C* frombitmask, + int64_t bitmasklength, + bool validwhen, + bool lsb_order, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < bitmasklength) { if (lsb_order) { uint8_t byte = frombitmask[thread_id]; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_BitMaskedArray_to_IndexedOptionArray.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_BitMaskedArray_to_IndexedOptionArray.cu index 5566c6a8a5..4d35527ab0 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_BitMaskedArray_to_IndexedOptionArray.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_BitMaskedArray_to_IndexedOptionArray.cu @@ -1,14 +1,15 @@ // BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE -template +template __global__ void -awkward_BitMaskedArray_to_IndexedOptionArray(C* toindex, - const T* frombitmask, - int64_t bitmasklength, - bool validwhen, - bool lsb_order, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_BitMaskedArray_to_IndexedOptionArray( + T* toindex, + const C* frombitmask, + int64_t bitmasklength, + bool validwhen, + bool lsb_order, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (lsb_order) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_getitem_nextcarry.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_getitem_nextcarry.cu index e43bbc3997..d97cfe9de5 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_getitem_nextcarry.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_getitem_nextcarry.cu @@ -5,7 +5,7 @@ // (tocarry, mask, length, validwhen, invocation_index, err_code) = args // scan_in_array = cupy.empty(length, dtype=cupy.int64) // cuda_kernel_templates.get_function(fetch_specialization(['awkward_ByteMaskedArray_getitem_nextcarry_a', tocarry.dtype, mask.dtype]))(grid, block, (tocarry, mask, length, validwhen, scan_in_array, invocation_index, err_code)) -// scan_in_array = inclusive_scan(grid, block, (scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) // cuda_kernel_templates.get_function(fetch_specialization(['awkward_ByteMaskedArray_getitem_nextcarry_b', tocarry.dtype, mask.dtype]))(grid, block, (tocarry, mask, length, validwhen, scan_in_array, invocation_index, err_code)) // out["awkward_ByteMaskedArray_getitem_nextcarry_a", {dtype_specializations}] = None // out["awkward_ByteMaskedArray_getitem_nextcarry_b", {dtype_specializations}] = None @@ -13,13 +13,14 @@ template __global__ void -awkward_ByteMaskedArray_getitem_nextcarry_a(T* tocarry, - const C* mask, - int64_t length, - bool validwhen, - int64_t* scan_in_array, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_ByteMaskedArray_getitem_nextcarry_a( + T* tocarry, + const C* mask, + int64_t length, + bool validwhen, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -35,13 +36,14 @@ awkward_ByteMaskedArray_getitem_nextcarry_a(T* tocarry, template __global__ void -awkward_ByteMaskedArray_getitem_nextcarry_b(T* tocarry, - const C* mask, - int64_t length, - bool validwhen, - int64_t* scan_in_array, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_ByteMaskedArray_getitem_nextcarry_b( + T* tocarry, + const C* mask, + int64_t length, + bool validwhen, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_getitem_nextcarry_outindex.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_getitem_nextcarry_outindex.cu index a661fb3aa7..8f416fcc69 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_getitem_nextcarry_outindex.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_getitem_nextcarry_outindex.cu @@ -5,7 +5,7 @@ // (tocarry, outindex, mask, length, validwhen, invocation_index, err_code) = args // scan_in_array = cupy.empty(length, dtype=cupy.int64) // cuda_kernel_templates.get_function(fetch_specialization(['awkward_ByteMaskedArray_getitem_nextcarry_outindex_a', tocarry.dtype, outindex.dtype, mask.dtype]))(grid, block, (tocarry, outindex, mask, length, validwhen, scan_in_array, invocation_index, err_code)) -// scan_in_array = inclusive_scan(grid, block, (scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) // cuda_kernel_templates.get_function(fetch_specialization(['awkward_ByteMaskedArray_getitem_nextcarry_outindex_b', tocarry.dtype, outindex.dtype, mask.dtype]))(grid, block, (tocarry, outindex, mask, length, validwhen, scan_in_array, invocation_index, err_code)) // out["awkward_ByteMaskedArray_getitem_nextcarry_outindex_a", {dtype_specializations}] = None // out["awkward_ByteMaskedArray_getitem_nextcarry_outindex_b", {dtype_specializations}] = None @@ -13,14 +13,15 @@ template __global__ void -awkward_ByteMaskedArray_getitem_nextcarry_outindex_a(T* tocarry, - C* outindex, - const U* mask, - int64_t length, - bool validwhen, - int64_t* scan_in_array, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_ByteMaskedArray_getitem_nextcarry_outindex_a( + T* tocarry, + C* outindex, + const U* mask, + int64_t length, + bool validwhen, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -36,14 +37,15 @@ awkward_ByteMaskedArray_getitem_nextcarry_outindex_a(T* tocarry, template __global__ void -awkward_ByteMaskedArray_getitem_nextcarry_outindex_b(T* tocarry, - C* outindex, - const U* mask, - int64_t length, - bool validwhen, - int64_t* scan_in_array, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_ByteMaskedArray_getitem_nextcarry_outindex_b( + T* tocarry, + C* outindex, + const U* mask, + int64_t length, + bool validwhen, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_numnull.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_numnull.cu new file mode 100644 index 0000000000..c16cd68d41 --- /dev/null +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_numnull.cu @@ -0,0 +1,51 @@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +// BEGIN PYTHON +// def f(grid, block, args): +// (numnull, mask, length, validwhen, invocation_index, err_code) = args +// scan_in_array = cupy.empty(length, dtype=cupy.int64) +// cuda_kernel_templates.get_function(fetch_specialization(['awkward_ByteMaskedArray_numnull_a', numnull.dtype, mask.dtype]))(grid, block, (numnull, mask, length, validwhen, scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) +// cuda_kernel_templates.get_function(fetch_specialization(['awkward_ByteMaskedArray_numnull_b', numnull.dtype, mask.dtype]))(grid, block, (numnull, mask, length, validwhen, scan_in_array, invocation_index, err_code)) +// out["awkward_ByteMaskedArray_numnull_a", {dtype_specializations}] = None +// out["awkward_ByteMaskedArray_numnull_b", {dtype_specializations}] = None +// END PYTHON + +template +__global__ void +awkward_ByteMaskedArray_numnull_a( + T* numnull, + const C* mask, + int64_t length, + bool validwhen, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < length) { + if ((mask[thread_id] != 0) != validwhen) { + scan_in_array[thread_id] = 1; + } + else { + scan_in_array[thread_id] = 0; + } + } + } +} + +template +__global__ void +awkward_ByteMaskedArray_numnull_b( + T* numnull, + const C* mask, + int64_t length, + bool validwhen, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + *numnull = length > 0 ? scan_in_array[length - 1] : 0; + } +} diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_overlay_mask.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_overlay_mask.cu index d90019a267..e3175808fe 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_overlay_mask.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_overlay_mask.cu @@ -2,13 +2,14 @@ template __global__ void -awkward_ByteMaskedArray_overlay_mask(T* tomask, - const C* theirmask, - const U* mymask, - int64_t length, - bool validwhen, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_ByteMaskedArray_overlay_mask( + T* tomask, + const C* theirmask, + const U* mymask, + int64_t length, + bool validwhen, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_reduce_next_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_reduce_next_64.cu index 4cf1c8b58d..451afd7612 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_reduce_next_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_reduce_next_64.cu @@ -4,31 +4,34 @@ // def f(grid, block, args): // (nextcarry, nextparents, outindex, mask, parents, length, validwhen, invocation_index, err_code) = args // scan_in_array = cupy.empty(length, dtype=cupy.int64) -// cuda_kernel_templates.get_function(fetch_specialization(['awkward_ByteMaskedArray_reduce_next_64_a', nextcarry.dtype, nextparents.dtype, outindex.dtype]))(grid, block, (nextcarry, nextparents, outindex, mask, parents, length, validwhen, scan_in_array, invocation_index, err_code)) -// scan_in_array = inclusive_scan(grid, block, (scan_in_array, length, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(['awkward_ByteMaskedArray_reduce_next_64_b', nextcarry.dtype, nextparents.dtype, outindex.dtype]))(grid, block, (nextcarry, nextparents, outindex, mask, parents, length, validwhen, scan_in_array, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(['awkward_ByteMaskedArray_reduce_next_64_a', nextcarry.dtype, nextparents.dtype, outindex.dtype, mask.dtype, parents.dtype]))(grid, block, (nextcarry, nextparents, outindex, mask, parents, length, validwhen, scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) +// cuda_kernel_templates.get_function(fetch_specialization(['awkward_ByteMaskedArray_reduce_next_64_b', nextcarry.dtype, nextparents.dtype, outindex.dtype, mask.dtype, parents.dtype]))(grid, block, (nextcarry, nextparents, outindex, mask, parents, length, validwhen, scan_in_array, invocation_index, err_code)) // out["awkward_ByteMaskedArray_reduce_next_64_a", {dtype_specializations}] = None // out["awkward_ByteMaskedArray_reduce_next_64_b", {dtype_specializations}] = None // END PYTHON template __global__ void -awkward_ByteMaskedArray_reduce_next_64_a(T* nextcarry, - C* nextparents, - U* outindex, - const V* mask, - const W* parents, - int64_t length, - bool validwhen, - int64_t* scan_in_array, - uint64_t* invocation_index, - uint64_t* err_code) { +awkward_ByteMaskedArray_reduce_next_64_a( + T* nextcarry, + C* nextparents, + U* outindex, + const V* mask, + const W* parents, + int64_t length, + bool validwhen, + int64_t* scan_in_array, + uint64_t* invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { if ((mask[thread_id] != 0) == validwhen) { scan_in_array[thread_id] = 1; + } else { + scan_in_array[thread_id] = 0; } } } @@ -36,16 +39,17 @@ awkward_ByteMaskedArray_reduce_next_64_a(T* nextcarry, template __global__ void -awkward_ByteMaskedArray_reduce_next_64_b(T* nextcarry, - C* nextparents, - U* outindex, - const V* mask, - const W* parents, - int64_t length, - bool validwhen, - int64_t* scan_in_array, - uint64_t* invocation_index, - uint64_t* err_code) { +awkward_ByteMaskedArray_reduce_next_64_b( + T* nextcarry, + C* nextparents, + U* outindex, + const V* mask, + const W* parents, + int64_t length, + bool validwhen, + int64_t* scan_in_array, + uint64_t* invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_64.cu index 867193a2cc..5506d7ef64 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_64.cu @@ -6,9 +6,9 @@ // scan_in_array_k = cupy.empty(length, dtype=cupy.int64) // scan_in_array_nullsum = cupy.empty(length, dtype=cupy.int64) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_64_a", nextshifts.dtype, mask.dtype]))(grid, block, (nextshifts, mask, length, valid_when, scan_in_array_k, scan_in_array_nullsum, invocation_index, err_code)) -// scan_in_array_k = inclusive_scan(grid, block, (scan_in_array_k, invocation_index, err_code)) -// scan_in_array_nullsum = inclusive_scan(grid, block, (scan_in_array_nullsum, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_64_a", nextshifts.dtype, mask.dtype]))(grid, block, (nextshifts, mask, length, valid_when, scan_in_array_k, scan_in_array_nullsum, invocation_index, err_code)) +// scan_in_array_k = cupy.cumsum(scan_in_array_k) +// scan_in_array_nullsum = cupy.cumsum(scan_in_array_nullsum) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_64_b", nextshifts.dtype, mask.dtype]))(grid, block, (nextshifts, mask, length, valid_when, scan_in_array_k, scan_in_array_nullsum, invocation_index, err_code)) // out["awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_64_a", {dtype_specializations}] = None // out["awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_64_b", {dtype_specializations}] = None // END PYTHON @@ -55,8 +55,7 @@ awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_64_b( if (thread_id < length) { if ((mask[thread_id] != 0) == (valid_when != 0)) { - nextshifts[scan_in_array_k[thread_id] - 1] = - scan_in_array_nullsum[thread_id] - 1; + nextshifts[scan_in_array_k[thread_id] - 1] = scan_in_array_nullsum[thread_id]; } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_fromshifts_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_fromshifts_64.cu index 373460d7e3..21fee4ef44 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_fromshifts_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_fromshifts_64.cu @@ -5,10 +5,10 @@ // (nextshifts, mask, length, valid_when, shifts, invocation_index, err_code) = args // scan_in_array_k = cupy.empty(length, dtype=cupy.int64) // scan_in_array_nullsum = cupy.empty(length, dtype=cupy.int64) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_fromshifts_64_a", nextshifts.dtype, mask.dtype]))(grid, block, (nextshifts, mask, length, valid_when, scan_in_array_k, scan_in_array_nullsum, invocation_index, err_code)) -// scan_in_array_k = inclusive_scan(grid, block, (scan_in_array_k, invocation_index, err_code)) -// scan_in_array_nullsum = inclusive_scan(grid, block, (scan_in_array_nullsum, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_fromshifts_64_a", nextshifts.dtype, mask.dtype]))(grid, block, (nextshifts, mask, length, valid_when, scan_in_array_k, scan_in_array_nullsum, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_fromshifts_64_a", nextshifts.dtype, mask.dtype, shifts.dtype]))(grid, block, (nextshifts, mask, length, valid_when, shifts, scan_in_array_k, scan_in_array_nullsum, invocation_index, err_code)) +// scan_in_array_k = cupy.cumsum(scan_in_array_k) +// scan_in_array_nullsum = cupy.cumsum(scan_in_array_nullsum) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_fromshifts_64_b", nextshifts.dtype, mask.dtype, shifts.dtype]))(grid, block, (nextshifts, mask, length, valid_when, shifts, scan_in_array_k, scan_in_array_nullsum, invocation_index, err_code)) // out["awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_fromshifts_64_a", {dtype_specializations}] = None // out["awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_fromshifts_64_b", {dtype_specializations}] = None // END PYTHON @@ -57,8 +57,7 @@ awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_fromshifts_64_b( if (thread_id < length) { if ((mask[thread_id] != 0) == (valid_when != 0)) { - nextshifts[scan_in_array_k[thread_id] - 1] = - shifts[thread_id] + (scan_in_array_nullsum[thread_id] - 1); + nextshifts[scan_in_array_k[thread_id] - 1] = shifts[thread_id] + scan_in_array_nullsum[thread_id]; } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_toIndexedOptionArray.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_toIndexedOptionArray.cu index 1339e73b13..a104214b17 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_toIndexedOptionArray.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ByteMaskedArray_toIndexedOptionArray.cu @@ -2,12 +2,13 @@ template __global__ void -awkward_ByteMaskedArray_toIndexedOptionArray(T* toindex, - const C* mask, - int64_t length, - bool validwhen, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_ByteMaskedArray_toIndexedOptionArray( + T* toindex, + const C* mask, + int64_t length, + bool validwhen, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_Content_getitem_next_missing_jagged_getmaskstartstop.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_Content_getitem_next_missing_jagged_getmaskstartstop.cu index 9f24700d5c..e63598235e 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_Content_getitem_next_missing_jagged_getmaskstartstop.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_Content_getitem_next_missing_jagged_getmaskstartstop.cu @@ -3,9 +3,9 @@ // BEGIN PYTHON // def f(grid, block, args): // (index_in, offsets_in, mask_out, starts_out, stops_out, length, invocation_index, err_code) = args -// scan_in_array = cupy.empty(length, dtype=cupy.int64) +// scan_in_array = cupy.empty(length + 1, dtype=cupy.int64) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_Content_getitem_next_missing_jagged_getmaskstartstop_a", index_in.dtype, offsets_in.dtype, mask_out.dtype, starts_out.dtype, stops_out.dtype]))(grid, block, (index_in, offsets_in, mask_out, starts_out, stops_out, length, scan_in_array, invocation_index, err_code)) -// scan_in_array = inclusive_scan(grid, block, (scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_Content_getitem_next_missing_jagged_getmaskstartstop_b", index_in.dtype, offsets_in.dtype, mask_out.dtype, starts_out.dtype, stops_out.dtype]))(grid, block, (index_in, offsets_in, mask_out, starts_out, stops_out, length, scan_in_array, invocation_index, err_code)) // out["awkward_Content_getitem_next_missing_jagged_getmaskstartstop_a", {dtype_specializations}] = None // out["awkward_Content_getitem_next_missing_jagged_getmaskstartstop_b", {dtype_specializations}] = None @@ -25,12 +25,12 @@ awkward_Content_getitem_next_missing_jagged_getmaskstartstop_a( uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_id < length) { - if (index_in[thread_id] >= 0) { - scan_in_array[thread_id] = 1; + scan_in_array[0] = 0; + if (index_in[thread_id] < 0) { + scan_in_array[thread_id + 1] = 0; } else { - scan_in_array[thread_id] = 0; + scan_in_array[thread_id + 1] = 1; } } } @@ -52,15 +52,13 @@ awkward_Content_getitem_next_missing_jagged_getmaskstartstop_b( int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { - int64_t pre_in = scan_in_array[thread_id] - 1; - starts_out[thread_id] = offsets_in[pre_in]; - + starts_out[thread_id] = offsets_in[scan_in_array[thread_id]]; if (index_in[thread_id] < 0) { mask_out[thread_id] = -1; - stops_out[thread_id] = offsets_in[pre_in]; + stops_out[thread_id] = offsets_in[scan_in_array[thread_id + 1]]; } else { mask_out[thread_id] = thread_id; - stops_out[thread_id] = offsets_in[pre_in + 1]; + stops_out[thread_id] = offsets_in[scan_in_array[thread_id + 1]]; } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_Index_nones_as_index.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_Index_nones_as_index.cu new file mode 100644 index 0000000000..b8c0f6a024 --- /dev/null +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_Index_nones_as_index.cu @@ -0,0 +1,56 @@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +// BEGIN PYTHON +// def f(grid, block, args): +// (toindex, length, invocation_index, err_code) = args +// scan_in_array = cupy.empty(length, dtype=cupy.int64) +// scan_in_array_n_non_null = cupy.empty(length, dtype=cupy.int64) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_Index_nones_as_index_a", toindex.dtype]))(grid, block, (toindex, length, scan_in_array, scan_in_array_n_non_null, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) +// scan_in_array_n_non_null = cupy.cumsum(scan_in_array_n_non_null) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_Index_nones_as_index_b", toindex.dtype]))(grid, block, (toindex, length, scan_in_array, scan_in_array_n_non_null, invocation_index, err_code)) +// out["awkward_Index_nones_as_index_a", {dtype_specializations}] = None +// out["awkward_Index_nones_as_index_b", {dtype_specializations}] = None +// END PYTHON + +template +__global__ void +awkward_Index_nones_as_index_a( + T* toindex, + int64_t length, + int64_t* scan_in_array, + int64_t* scan_in_array_n_non_null, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < length) { + if (toindex[thread_id] != -1) { + scan_in_array[thread_id] = 1; + scan_in_array_n_non_null[thread_id] = 0; + } + else { + scan_in_array_n_non_null[thread_id] = 1; + scan_in_array[thread_id] = 0; + } + } + } +} + +template +__global__ void +awkward_Index_nones_as_index_b( + T* toindex, + int64_t length, + int64_t* scan_in_array, + int64_t* scan_in_array_n_non_null, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t n_non_null = scan_in_array[length - 1]; + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id < length) { + toindex[thread_id] == -1 ? toindex[thread_id] = (n_non_null + scan_in_array_n_non_null[thread_id] - 1): toindex[thread_id]; + } + } +} diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_fill.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_fill.cu index 98cc1ab21a..8608d48930 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_fill.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_fill.cu @@ -2,13 +2,14 @@ template __global__ void -awkward_IndexedArray_fill(T* toindex, - int64_t toindexoffset, - const C* fromindex, - int64_t length, - int64_t base, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_IndexedArray_fill( + T* toindex, + int64_t toindexoffset, + const C* fromindex, + int64_t length, + int64_t base, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_fill_count.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_fill_count.cu index 6583a04fcf..3ab72241fe 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_fill_count.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_fill_count.cu @@ -2,12 +2,13 @@ template __global__ void -awkward_IndexedArray_fill_count(T* toindex, - int64_t toindexoffset, - int64_t length, - int64_t base, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_IndexedArray_fill_count( + T* toindex, + int64_t toindexoffset, + int64_t length, + int64_t base, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_flatten_nextcarry.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_flatten_nextcarry.cu index 2fdb9cd26c..8a0d09deb9 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_flatten_nextcarry.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_flatten_nextcarry.cu @@ -9,7 +9,7 @@ enum class INDEXEDARRAY_FLATTEN_NEXTCARRY_ERRORS { // (tocarry, fromindex, lenindex, lencontent, invocation_index, err_code) = args // scan_in_array = cupy.empty(lenindex, dtype=cupy.int64) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_IndexedArray_flatten_nextcarry_a", tocarry.dtype, fromindex.dtype]))(grid, block, (tocarry, fromindex, lenindex, lencontent, scan_in_array, invocation_index, err_code)) -// scan_in_array = inclusive_scan(grid, block, (scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_IndexedArray_flatten_nextcarry_b", tocarry.dtype, fromindex.dtype]))(grid, block, (tocarry, fromindex, lenindex, lencontent, scan_in_array, invocation_index, err_code)) // out["awkward_IndexedArray_flatten_nextcarry_a", {dtype_specializations}] = None // out["awkward_IndexedArray_flatten_nextcarry_b", {dtype_specializations}] = None @@ -17,13 +17,14 @@ enum class INDEXEDARRAY_FLATTEN_NEXTCARRY_ERRORS { template __global__ void -awkward_IndexedArray_flatten_nextcarry_a(T* tocarry, - const C* fromindex, - int64_t lenindex, - int64_t lencontent, - int64_t* scan_in_array, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_IndexedArray_flatten_nextcarry_a( + T* tocarry, + const C* fromindex, + int64_t lenindex, + int64_t lencontent, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < lenindex) { @@ -41,13 +42,14 @@ awkward_IndexedArray_flatten_nextcarry_a(T* tocarry, template __global__ void -awkward_IndexedArray_flatten_nextcarry_b(T* tocarry, - const C* fromindex, - int64_t lenindex, - int64_t lencontent, - int64_t* scan_in_array, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_IndexedArray_flatten_nextcarry_b( + T* tocarry, + const C* fromindex, + int64_t lenindex, + int64_t lencontent, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_getitem_nextcarry.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_getitem_nextcarry.cu index f530a6bd45..081b597e91 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_getitem_nextcarry.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_getitem_nextcarry.cu @@ -9,7 +9,7 @@ enum class INDEXEDARRAY_GETITEM_NEXTCARRY_ERRORS { // (tocarry, fromindex, lenindex, lencontent, invocation_index, err_code) = args // scan_in_array = cupy.empty(lenindex, dtype=cupy.int64) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_IndexedArray_getitem_nextcarry_a", tocarry.dtype, fromindex.dtype]))(grid, block, (tocarry, fromindex, lenindex, lencontent, scan_in_array, invocation_index, err_code)) -// scan_in_array = inclusive_scan(grid, block, (scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_IndexedArray_getitem_nextcarry_b", tocarry.dtype, fromindex.dtype]))(grid, block, (tocarry, fromindex, lenindex, lencontent, scan_in_array, invocation_index, err_code)) // out["awkward_IndexedArray_getitem_nextcarry_a", {dtype_specializations}] = None // out["awkward_IndexedArray_getitem_nextcarry_b", {dtype_specializations}] = None @@ -17,13 +17,14 @@ enum class INDEXEDARRAY_GETITEM_NEXTCARRY_ERRORS { template __global__ void -awkward_IndexedArray_getitem_nextcarry_a(T* tocarry, - const C* fromindex, - int64_t lenindex, - int64_t lencontent, - int64_t* scan_in_array, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_IndexedArray_getitem_nextcarry_a( + T* tocarry, + const C* fromindex, + int64_t lenindex, + int64_t lencontent, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < lenindex) { @@ -41,13 +42,14 @@ awkward_IndexedArray_getitem_nextcarry_a(T* tocarry, template __global__ void -awkward_IndexedArray_getitem_nextcarry_b(T* tocarry, - const C* fromindex, - int64_t lenindex, - int64_t lencontent, - int64_t* scan_in_array, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_IndexedArray_getitem_nextcarry_b( + T* tocarry, + const C* fromindex, + int64_t lenindex, + int64_t lencontent, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_getitem_nextcarry_outindex.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_getitem_nextcarry_outindex.cu index 237790a617..7ef0bf9cdd 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_getitem_nextcarry_outindex.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_getitem_nextcarry_outindex.cu @@ -9,7 +9,7 @@ enum class INDEXEDARRAY_GETITEM_NEXTCARRY_OUTINDEX_ERRORS { // (tocarry, toindex, fromindex, lenindex, lencontent, invocation_index, err_code) = args // scan_in_array = cupy.empty(lenindex, dtype=cupy.int64) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_IndexedArray_getitem_nextcarry_outindex_a", tocarry.dtype, toindex.dtype, fromindex.dtype]))(grid, block, (tocarry, toindex, fromindex, lenindex, lencontent, scan_in_array, invocation_index, err_code)) -// scan_in_array = inclusive_scan(grid, block, (scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_IndexedArray_getitem_nextcarry_outindex_b", tocarry.dtype, toindex.dtype, fromindex.dtype]))(grid, block, (tocarry, toindex, fromindex, lenindex, lencontent, scan_in_array, invocation_index, err_code)) // out["awkward_IndexedArray_getitem_nextcarry_outindex_a", {dtype_specializations}] = None // out["awkward_IndexedArray_getitem_nextcarry_outindex_b", {dtype_specializations}] = None @@ -17,14 +17,15 @@ enum class INDEXEDARRAY_GETITEM_NEXTCARRY_OUTINDEX_ERRORS { template __global__ void -awkward_IndexedArray_getitem_nextcarry_outindex_a(T* tocarry, - C* toindex, - const U* fromindex, - int64_t lenindex, - int64_t lencontent, - int64_t* scan_in_array, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_IndexedArray_getitem_nextcarry_outindex_a( + T* tocarry, + C* toindex, + const U* fromindex, + int64_t lenindex, + int64_t lencontent, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < lenindex) { @@ -43,14 +44,15 @@ awkward_IndexedArray_getitem_nextcarry_outindex_a(T* tocarry, template __global__ void -awkward_IndexedArray_getitem_nextcarry_outindex_b(T* tocarry, - C* toindex, - const U* fromindex, - int64_t lenindex, - int64_t lencontent, - int64_t* scan_in_array, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_IndexedArray_getitem_nextcarry_outindex_b( + T* tocarry, + C* toindex, + const U* fromindex, + int64_t lenindex, + int64_t lencontent, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_index_of_nulls.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_index_of_nulls.cu index a5beadb17f..aaaae57f13 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_index_of_nulls.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_index_of_nulls.cu @@ -5,7 +5,7 @@ // (toindex, fromindex, lenindex, parents, starts, invocation_index, err_code) = args // scan_in_array = cupy.empty(lenindex, dtype=cupy.int64) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_IndexedArray_index_of_nulls_a", toindex.dtype, fromindex.dtype, parents.dtype, starts.dtype]))(grid, block, (toindex, fromindex, lenindex, parents, starts, scan_in_array, invocation_index, err_code)) -// scan_in_array = inclusive_scan(grid, block, (scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_IndexedArray_index_of_nulls_b", toindex.dtype, fromindex.dtype, parents.dtype, starts.dtype]))(grid, block, (toindex, fromindex, lenindex, parents, starts, scan_in_array, invocation_index, err_code)) // out["awkward_IndexedArray_index_of_nulls_a", {dtype_specializations}] = None // out["awkward_IndexedArray_index_of_nulls_b", {dtype_specializations}] = None @@ -13,14 +13,15 @@ template __global__ void -awkward_IndexedArray_index_of_nulls_a(T* toindex, - const C* fromindex, - int64_t lenindex, - const U* parents, - const V* starts, - int64_t* scan_in_array, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_IndexedArray_index_of_nulls_a( + T* toindex, + const C* fromindex, + int64_t lenindex, + const U* parents, + const V* starts, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -36,14 +37,15 @@ awkward_IndexedArray_index_of_nulls_a(T* toindex, template __global__ void -awkward_IndexedArray_index_of_nulls_b(T* toindex, - const C* fromindex, - int64_t lenindex, - const U* parents, - const V* starts, - int64_t* scan_in_array, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_IndexedArray_index_of_nulls_b( + T* toindex, + const C* fromindex, + int64_t lenindex, + const U* parents, + const V* starts, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_numnull.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_numnull.cu new file mode 100644 index 0000000000..b340562e55 --- /dev/null +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_numnull.cu @@ -0,0 +1,49 @@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +// BEGIN PYTHON +// def f(grid, block, args): +// (numnull, fromindex, lenindex, invocation_index, err_code) = args +// scan_in_array = cupy.empty(lenindex, dtype=cupy.int64) +// cuda_kernel_templates.get_function(fetch_specialization(['awkward_IndexedArray_numnull_a', numnull.dtype, fromindex.dtype]))(grid, block, (numnull, fromindex, lenindex, scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) +// cuda_kernel_templates.get_function(fetch_specialization(['awkward_IndexedArray_numnull_b', numnull.dtype, fromindex.dtype]))(grid, block, (numnull, fromindex, lenindex, scan_in_array, invocation_index, err_code)) +// out["awkward_IndexedArray_numnull_a", {dtype_specializations}] = None +// out["awkward_IndexedArray_numnull_b", {dtype_specializations}] = None +// END PYTHON + +template +__global__ void +awkward_IndexedArray_numnull_a( + T* numnull, + const C* fromindex, + int64_t lenindex, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < lenindex) { + if (fromindex[thread_id] < 0) { + scan_in_array[thread_id] = 1; + } + else { + scan_in_array[thread_id] = 0; + } + } + } +} + +template +__global__ void +awkward_IndexedArray_numnull_b( + T* numnull, + const C* fromindex, + int64_t lenindex, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + *numnull = lenindex > 0 ? scan_in_array[lenindex - 1] : 0; + } +} diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_numnull_parents.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_numnull_parents.cu new file mode 100644 index 0000000000..2948351950 --- /dev/null +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_numnull_parents.cu @@ -0,0 +1,53 @@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +// BEGIN PYTHON +// def f(grid, block, args): +// (numnull, tolength, fromindex, lenindex, invocation_index, err_code) = args +// scan_in_array = cupy.empty(lenindex, dtype=cupy.int64) +// cuda_kernel_templates.get_function(fetch_specialization(['awkward_IndexedArray_numnull_parents_a', numnull.dtype, tolength.dtype, fromindex.dtype]))(grid, block, (numnull, tolength, fromindex, lenindex, scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) +// cuda_kernel_templates.get_function(fetch_specialization(['awkward_IndexedArray_numnull_parents_b', numnull.dtype, tolength.dtype, fromindex.dtype]))(grid, block, (numnull, tolength, fromindex, lenindex, scan_in_array, invocation_index, err_code)) +// out["awkward_IndexedArray_numnull_parents_a", {dtype_specializations}] = None +// out["awkward_IndexedArray_numnull_parents_b", {dtype_specializations}] = None +// END PYTHON + +template +__global__ void +awkward_IndexedArray_numnull_parents_a( + T* numnull, + C* tolength, + const U* fromindex, + int64_t lenindex, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < lenindex) { + if (fromindex[thread_id] < 0) { + numnull[thread_id] = 1; + scan_in_array[thread_id] = 1; + } + else { + numnull[thread_id] = 0; + scan_in_array[thread_id] = 0; + } + } + } +} + +template +__global__ void +awkward_IndexedArray_numnull_parents_b( + T* numnull, + C* tolength, + const U* fromindex, + int64_t lenindex, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + *tolength = lenindex > 0 ? scan_in_array[lenindex - 1] : 0; + } +} diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_numnull_unique_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_numnull_unique_64.cu index 85306498d0..cdcafd636f 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_numnull_unique_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_numnull_unique_64.cu @@ -2,10 +2,11 @@ template __global__ void -awkward_IndexedArray_numnull_unique_64(T* toindex, - int64_t lenindex, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_IndexedArray_numnull_unique_64( + T* toindex, + int64_t lenindex, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id <= lenindex) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_overlay_mask.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_overlay_mask.cu index 9d19d37e5e..b9decedf61 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_overlay_mask.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_overlay_mask.cu @@ -2,12 +2,13 @@ template __global__ void -awkward_IndexedArray_overlay_mask(T* toindex, - const C* mask, - const U* fromindex, - int64_t length, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_IndexedArray_overlay_mask( + T* toindex, + const C* mask, + const U* fromindex, + int64_t length, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_reduce_next_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_reduce_next_64.cu index 32aff24eb4..33baa5d585 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_reduce_next_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_reduce_next_64.cu @@ -4,24 +4,25 @@ // def f(grid, block, args): // (nextcarry, nextparents, outindex, index, parents, length, invocation_index, err_code) = args // scan_in_array = cupy.empty(length, dtype=cupy.int64) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_IndexedArray_reduce_next_64_a", nextcarry.dtype, nextparents.dtype, outindex.dtype, index.dtype, parents.dtype]))(grid, block, (nextcarry, nextparents, outindex, index, parents, length, invocation_index, err_code)) -// scan_in_array = inclusive_scan(grid, block, (scan_in_array, invocation_index, err_code)) -// cuda_kernel_templates.get_function(fetch_specialization(["awkward_IndexedArray_reduce_next_64_b", nextcarry.dtype, nextparents.dtype, outindex.dtype, index.dtype, parents.dtype]))(grid, block, (nextcarry, nextparents, outindex, index, parents, length, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_IndexedArray_reduce_next_64_a", nextcarry.dtype, nextparents.dtype, outindex.dtype, index.dtype, parents.dtype]))(grid, block, (nextcarry, nextparents, outindex, index, parents, length, scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_IndexedArray_reduce_next_64_b", nextcarry.dtype, nextparents.dtype, outindex.dtype, index.dtype, parents.dtype]))(grid, block, (nextcarry, nextparents, outindex, index, parents, length, scan_in_array, invocation_index, err_code)) // out["awkward_IndexedArray_reduce_next_64_a", {dtype_specializations}] = None // out["awkward_IndexedArray_reduce_next_64_b", {dtype_specializations}] = None // END PYTHON template __global__ void -awkward_IndexedArray_reduce_next_64_a(T* nextcarry, - C* nextparents, - U* outindex, - const V* index, - const W* parents, - int64_t length, - int64_t* scan_in_array, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_IndexedArray_reduce_next_64_a( + T* nextcarry, + C* nextparents, + U* outindex, + const V* index, + const W* parents, + int64_t length, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -37,15 +38,16 @@ awkward_IndexedArray_reduce_next_64_a(T* nextcarry, template __global__ void -awkward_IndexedArray_reduce_next_64_b(T* nextcarry, - C* nextparents, - U* outindex, - const V* index, - const W* parents, - int64_t length, - int64_t* scan_in_array, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_IndexedArray_reduce_next_64_b( + T* nextcarry, + C* nextparents, + U* outindex, + const V* index, + const W* parents, + int64_t length, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_reduce_next_fix_offsets_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_reduce_next_fix_offsets_64.cu index 0bb661fc73..5fce2fa869 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_reduce_next_fix_offsets_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_reduce_next_fix_offsets_64.cu @@ -2,12 +2,13 @@ template __global__ void -awkward_IndexedArray_reduce_next_fix_offsets_64(T* outoffsets, - const C* starts, - int64_t startslength, - int64_t outindexlength, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_IndexedArray_reduce_next_fix_offsets_64( + T* outoffsets, + const C* starts, + int64_t startslength, + int64_t outindexlength, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < startslength) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_reduce_next_nonlocal_nextshifts_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_reduce_next_nonlocal_nextshifts_64.cu index 7ad21c6b06..c04c239b61 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_reduce_next_nonlocal_nextshifts_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_reduce_next_nonlocal_nextshifts_64.cu @@ -6,8 +6,8 @@ // scan_in_array_k = cupy.empty(length, dtype=cupy.int64) // scan_in_array_nullsum = cupy.empty(length, dtype=cupy.int64) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_IndexedArray_reduce_next_nonlocal_nextshifts_64_a", nextshifts.dtype, index.dtype]))(grid, block, (nextshifts, index, length, scan_in_array_k, scan_in_array_nullsum, invocation_index, err_code)) -// scan_in_array_k = inclusive_scan(grid, block, (scan_in_array_k, invocation_index, err_code)) -// scan_in_array_nullsum = inclusive_scan(grid, block, (scan_in_array_nullsum, invocation_index, err_code)) +// scan_in_array_k = cupy.cumsum(scan_in_array_k) +// scan_in_array_nullsum = cupy.cumsum(scan_in_array_nullsum) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_IndexedArray_reduce_next_nonlocal_nextshifts_64_b", nextshifts.dtype, index.dtype]))(grid, block, (nextshifts, index, length, scan_in_array_k, scan_in_array_nullsum, invocation_index, err_code)) // out["awkward_IndexedArray_reduce_next_nonlocal_nextshifts_64_a", {dtype_specializations}] = None // out["awkward_IndexedArray_reduce_next_nonlocal_nextshifts_64_b", {dtype_specializations}] = None @@ -53,8 +53,7 @@ awkward_IndexedArray_reduce_next_nonlocal_nextshifts_64_b( if (thread_id < length) { if (index[thread_id] >= 0) { - nextshifts[scan_in_array_k[thread_id] - 1] = - scan_in_array_nullsum[thread_id] - 1; + nextshifts[scan_in_array_k[thread_id] - 1] = scan_in_array_nullsum[thread_id]; } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_reduce_next_nonlocal_nextshifts_fromshifts_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_reduce_next_nonlocal_nextshifts_fromshifts_64.cu index a75a32d0cf..a61abd0e7d 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_reduce_next_nonlocal_nextshifts_fromshifts_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_reduce_next_nonlocal_nextshifts_fromshifts_64.cu @@ -6,8 +6,8 @@ // scan_in_array_k = cupy.empty(length, dtype=cupy.int64) // scan_in_array_nullsum = cupy.empty(length, dtype=cupy.int64) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_IndexedArray_reduce_next_nonlocal_nextshifts_fromshifts_64_a", nextshifts.dtype, index.dtype, shifts.dtype]))(grid, block, (nextshifts, index, length, shifts, scan_in_array_k, scan_in_array_nullsum, invocation_index, err_code)) -// scan_in_array_k = inclusive_scan(grid, block, (scan_in_array_k, invocation_index, err_code)) -// scan_in_array_nullsum = inclusive_scan(grid, block, (scan_in_array_nullsum, invocation_index, err_code)) +// scan_in_array_k = cupy.cumsum(scan_in_array_k) +// scan_in_array_nullsum = cupy.cumsum(scan_in_array_nullsum) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_IndexedArray_reduce_next_nonlocal_nextshifts_fromshifts_64_b", nextshifts.dtype, index.dtype, shifts.dtype]))(grid, block, (nextshifts, index, length, shifts, scan_in_array_k, scan_in_array_nullsum, invocation_index, err_code)) // out["awkward_IndexedArray_reduce_next_nonlocal_nextshifts_fromshifts_64_a", {dtype_specializations}] = None // out["awkward_IndexedArray_reduce_next_nonlocal_nextshifts_fromshifts_64_b", {dtype_specializations}] = None @@ -55,8 +55,7 @@ awkward_IndexedArray_reduce_next_nonlocal_nextshifts_fromshifts_64_b( if (thread_id < length) { if (index[thread_id] >= 0) { - nextshifts[scan_in_array_k[thread_id] - 1] = - shifts[thread_id] + (scan_in_array_nullsum[thread_id] - 1); + nextshifts[scan_in_array_k[thread_id] - 1] = shifts[thread_id] + scan_in_array_nullsum[thread_id]; } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_simplify.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_simplify.cu index 92382a33cc..8191338414 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_simplify.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_simplify.cu @@ -6,13 +6,14 @@ enum class INDEXEDARRAY_SIMPLIFY_ERRORS { template __global__ void -awkward_IndexedArray_simplify(T* toindex, - const C* outerindex, - int64_t outerlength, - const U* innerindex, - int64_t innerlength, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_IndexedArray_simplify( + T* toindex, + const C* outerindex, + int64_t outerlength, + const U* innerindex, + int64_t innerlength, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outerlength) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_validity.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_validity.cu index e0f33e9a46..5506fbd498 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_validity.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedArray_validity.cu @@ -7,12 +7,13 @@ enum class INDEXEDARRAY_VALIDITY_ERRORS { template __global__ void -awkward_IndexedArray_validity(const T* index, - int64_t length, - int64_t lencontent, - bool isoption, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_IndexedArray_validity( + const T* index, + int64_t length, + int64_t lencontent, + bool isoption, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedOptionArray_rpad_and_clip_mask_axis1.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedOptionArray_rpad_and_clip_mask_axis1.cu index 5bcfd361c4..95fe4a8936 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedOptionArray_rpad_and_clip_mask_axis1.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_IndexedOptionArray_rpad_and_clip_mask_axis1.cu @@ -5,7 +5,7 @@ // (toindex, frommask, length, invocation_index, err_code) = args // scan_in_array = cupy.empty(length, dtype=cupy.int64) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_IndexedOptionArray_rpad_and_clip_mask_axis1_a", toindex.dtype, frommask.dtype]))(grid, block, (toindex, frommask, length, scan_in_array, invocation_index, err_code)) -// scan_in_array = inclusive_scan(grid, block, (scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_IndexedOptionArray_rpad_and_clip_mask_axis1_b", toindex.dtype, frommask.dtype]))(grid, block, (toindex, frommask, length, scan_in_array, invocation_index, err_code)) // out["awkward_IndexedOptionArray_rpad_and_clip_mask_axis1_a", {dtype_specializations}] = None // out["awkward_IndexedOptionArray_rpad_and_clip_mask_axis1_b", {dtype_specializations}] = None @@ -13,12 +13,13 @@ template __global__ void -awkward_IndexedOptionArray_rpad_and_clip_mask_axis1_a(T* toindex, - const C* frommask, - int64_t length, - int64_t* scan_in_array, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_IndexedOptionArray_rpad_and_clip_mask_axis1_a( + T* toindex, + const C* frommask, + int64_t length, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -34,12 +35,13 @@ awkward_IndexedOptionArray_rpad_and_clip_mask_axis1_a(T* toindex, template __global__ void -awkward_IndexedOptionArray_rpad_and_clip_mask_axis1_b(T* toindex, - const C* frommask, - int64_t length, - int64_t* scan_in_array, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_IndexedOptionArray_rpad_and_clip_mask_axis1_b( + T* toindex, + const C* frommask, + int64_t length, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_compact_offsets.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_compact_offsets.cu index d2f89e63eb..feec5ecd4c 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_compact_offsets.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_compact_offsets.cu @@ -14,12 +14,13 @@ enum class LISTARRAY_COMPACT_OFFSETS_ERRORS { template __global__ void -awkward_ListArray_compact_offsets_a(T* tooffsets, - const C* fromstarts, - const U* fromstops, - int64_t length, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_ListArray_compact_offsets_a( + T* tooffsets, + const C* fromstarts, + const U* fromstops, + int64_t length, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; tooffsets[0] = 0; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_fill.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_fill.cu index 80e110fee0..849b766054 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_fill.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_fill.cu @@ -2,16 +2,17 @@ template __global__ void -awkward_ListArray_fill(T* tostarts, - int64_t tostartsoffset, - C* tostops, - int64_t tostopsoffset, - const U* fromstarts, - const V* fromstops, - int64_t length, - int64_t base, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_ListArray_fill( + T* tostarts, + int64_t tostartsoffset, + C* tostops, + int64_t tostopsoffset, + const U* fromstarts, + const V* fromstops, + int64_t length, + int64_t base, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_jagged_carrylen.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_jagged_carrylen.cu new file mode 100644 index 0000000000..a6e01f3f82 --- /dev/null +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_jagged_carrylen.cu @@ -0,0 +1,46 @@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +// BEGIN PYTHON +// def f(grid, block, args): +// (carrylen, slicestarts, slicestops, sliceouterlen, invocation_index, err_code) = args +// scan_in_array = cupy.empty(sliceouterlen, dtype=cupy.int64) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_getitem_jagged_carrylen_a", carrylen.dtype, slicestarts.dtype, slicestops.dtype]))(grid, block, (carrylen, slicestarts, slicestops, sliceouterlen, scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_getitem_jagged_carrylen_b", carrylen.dtype, slicestarts.dtype, slicestops.dtype]))(grid, block, (carrylen, slicestarts, slicestops, sliceouterlen, scan_in_array, invocation_index, err_code)) +// out["awkward_ListArray_getitem_jagged_carrylen_a", {dtype_specializations}] = None +// out["awkward_ListArray_getitem_jagged_carrylen_b", {dtype_specializations}] = None +// END PYTHON + +template +__global__ void +awkward_ListArray_getitem_jagged_carrylen_a( + T* carrylen, + const C* slicestarts, + const U* slicestops, + int64_t sliceouterlen, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < sliceouterlen) { + scan_in_array[thread_id] = (int64_t)(slicestops[thread_id] - slicestarts[thread_id]); + } + } +} + +template +__global__ void +awkward_ListArray_getitem_jagged_carrylen_b( + T* carrylen, + const C* slicestarts, + const U* slicestops, + int64_t sliceouterlen, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + *carrylen = sliceouterlen > 0 ? scan_in_array[sliceouterlen - 1] : 0; + } +} diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_jagged_expand.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_jagged_expand.cu index 030a04b73e..cf60c17dc1 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_jagged_expand.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_jagged_expand.cu @@ -7,31 +7,34 @@ enum class LISTARRAY_GETITEM_JAGGED_EXPAND_ERRORS { template __global__ void -awkward_ListArray_getitem_jagged_expand(T* multistarts, - C* multistops, - const U* singleoffsets, - V* tocarry, - const W* fromstarts, - const X* fromstops, - int64_t jaggedsize, - int64_t length, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_ListArray_getitem_jagged_expand( + T* multistarts, + C* multistops, + const U* singleoffsets, + V* tocarry, + const W* fromstarts, + const X* fromstops, + int64_t jaggedsize, + int64_t length, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - int64_t thread_id = (blockIdx.x * blockDim.x + threadIdx.x) % length; + int64_t thread_id = (blockIdx.x * blockDim.x + threadIdx.x) / jaggedsize; int64_t thready_id = (blockIdx.x * blockDim.x + threadIdx.x) % jaggedsize; - W start = fromstarts[thread_id]; - X stop = fromstops[thread_id]; - if (stop < start) { - RAISE_ERROR(LISTARRAY_GETITEM_JAGGED_EXPAND_ERRORS::STOPS_LT_START) + if (thread_id < length && thready_id < jaggedsize) { + W start = fromstarts[thread_id]; + X stop = fromstops[thread_id]; + if (stop < start) { + RAISE_ERROR(LISTARRAY_GETITEM_JAGGED_EXPAND_ERRORS::STOPS_LT_START) + } + if ((stop - start) != jaggedsize) { + RAISE_ERROR(LISTARRAY_GETITEM_JAGGED_EXPAND_ERRORS::FIT_ERR) + } + multistarts[(thread_id * jaggedsize) + thready_id] = + singleoffsets[thready_id]; + multistops[(thread_id * jaggedsize) + thready_id] = + singleoffsets[(thready_id + 1)]; + tocarry[(thread_id * jaggedsize) + thready_id] = (start + thready_id); } - if ((stop - start) != jaggedsize) { - RAISE_ERROR(LISTARRAY_GETITEM_JAGGED_EXPAND_ERRORS::FIT_ERR) - } - multistarts[(thread_id * jaggedsize) + thready_id] = - singleoffsets[thready_id]; - multistops[(thread_id * jaggedsize) + thready_id] = - singleoffsets[(thready_id + 1)]; - tocarry[(thread_id * jaggedsize) + thready_id] = (start + thready_id); } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_array.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_array.cu index 0a94139d32..42c3573983 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_array.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_array.cu @@ -8,16 +8,17 @@ enum class LISTARRAY_GETITEM_NEXT_ARRAY_ERRORS { template __global__ void -awkward_ListArray_getitem_next_array(T* tocarry, - C* toadvanced, - const U* fromstarts, - const V* fromstops, - const W* fromarray, - int64_t lenstarts, - int64_t lenarray, - int64_t lencontent, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_ListArray_getitem_next_array( + T* tocarry, + C* toadvanced, + const U* fromstarts, + const V* fromstops, + const W* fromarray, + int64_t lenstarts, + int64_t lenarray, + int64_t lencontent, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = (blockIdx.x * blockDim.x + threadIdx.x) / lenarray; int64_t thready_id = (blockIdx.x * blockDim.x + threadIdx.x) % lenarray; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_array_advanced.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_array_advanced.cu index a88e0d7df4..349a63c0f4 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_array_advanced.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_array_advanced.cu @@ -8,17 +8,18 @@ enum class LISTARRAY_GETITEM_NEXT_ARRAY_ADVANCED_ERRORS { template __global__ void -awkward_ListArray_getitem_next_array_advanced(T* tocarry, - C* toadvanced, - const U* fromstarts, - const V* fromstops, - const W* fromarray, - const X* fromadvanced, - int64_t lenstarts, - int64_t lenarray, - int64_t lencontent, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_ListArray_getitem_next_array_advanced( + T* tocarry, + C* toadvanced, + const U* fromstarts, + const V* fromstops, + const W* fromarray, + const X* fromadvanced, + int64_t lenstarts, + int64_t lenarray, + int64_t lencontent, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_at.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_at.cu index a04dd50aed..421f0d15c1 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_at.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_at.cu @@ -6,13 +6,14 @@ enum class LISTARRAY_GETITEM_NEXT_AT_ERRORS { template __global__ void -awkward_ListArray_getitem_next_at(T* tocarry, - const C* fromstarts, - const U* fromstops, - int64_t lenstarts, - int64_t at, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_ListArray_getitem_next_at( + T* tocarry, + const C* fromstarts, + const U* fromstops, + int64_t lenstarts, + int64_t at, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_range_counts.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_range_counts.cu new file mode 100644 index 0000000000..f05144decf --- /dev/null +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_getitem_next_range_counts.cu @@ -0,0 +1,44 @@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +// BEGIN PYTHON +// def f(grid, block, args): +// (total, fromoffsets, lenstarts, invocation_total, err_code) = args +// scan_in_array = cupy.empty(lenstarts, dtype=cupy.int64) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_getitem_next_range_counts_a", total.dtype, fromoffsets.dtype]))(grid, block, (total, fromoffsets, lenstarts, scan_in_array, invocation_total, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_getitem_next_range_counts_b", total.dtype, fromoffsets.dtype]))(grid, block, (total, fromoffsets, lenstarts, scan_in_array, invocation_total, err_code)) +// out["awkward_ListArray_getitem_next_range_counts_a", {dtype_specializations}] = None +// out["awkward_ListArray_getitem_next_range_counts_b", {dtype_specializations}] = None +// END PYTHON + +template +__global__ void +awkward_ListArray_getitem_next_range_counts_a( + T* total, + const C* fromoffsets, + int64_t lenstarts, + int64_t* scan_in_array, + uint64_t invocation_total, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < lenstarts) { + scan_in_array[thread_id] = (int64_t)fromoffsets[thread_id + 1] - fromoffsets[thread_id]; + } + } +} + +template +__global__ void +awkward_ListArray_getitem_next_range_counts_b( + T* total, + const C* fromoffsets, + int64_t lenstarts, + int64_t* scan_in_array, + uint64_t invocation_total, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + *total = lenstarts > 0 ? scan_in_array[lenstarts - 1] : 0; + } +} diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_min_range.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_min_range.cu index a49f52173c..f4fb8268b0 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_min_range.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_min_range.cu @@ -1,21 +1,55 @@ // BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE +// BEGIN PYTHON +// def f(grid, block, args): +// (tomin, fromstarts, fromstops, lenstarts, invocation_index, err_code) = args +// scan_in_array = cupy.empty(lenstarts, dtype=cupy.int64) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_min_range_a", tomin.dtype, fromstarts.dtype, fromstops.dtype]))(grid, block, (tomin, fromstarts, fromstops, lenstarts, scan_in_array, invocation_index, err_code)) +// if lenstarts > 0: +// scan_in_array[0] = cupy.min(scan_in_array) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_min_range_b", tomin.dtype, fromstarts.dtype, fromstops.dtype]))(grid, block, (tomin, fromstarts, fromstops, lenstarts, scan_in_array, invocation_index, err_code)) +// out["awkward_ListArray_min_range_a", {dtype_specializations}] = None +// out["awkward_ListArray_min_range_b", {dtype_specializations}] = None +// END PYTHON + template __global__ void -awkward_ListArray_min_range(T* tomin, - const C* fromstarts, - const U* fromstops, - int64_t lenstarts, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_ListArray_min_range_a( + T* tomin, + const C* fromstarts, + const U* fromstops, + int64_t lenstarts, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int64_t shorter = fromstops[0] - fromstarts[0]; - if (thread_id >=1 && thread_id < lenstarts) { - int64_t rangeval = fromstops[thread_id] - fromstarts[thread_id]; - shorter = (shorter < rangeval) ? shorter : rangeval; - atomicMin(tomin, shorter); + if (thread_id < lenstarts) { + if (thread_id == 0) { + scan_in_array[thread_id] = fromstops[0] - fromstarts[0]; + } + else { + int64_t rangeval = fromstops[thread_id] - fromstarts[thread_id]; + scan_in_array[thread_id] = rangeval; + } + } + } +} + +template +__global__ void +awkward_ListArray_min_range_b( + T* tomin, + const C* fromstarts, + const U* fromstops, + int64_t lenstarts, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + if (lenstarts > 0) { + *tomin = scan_in_array[0]; } } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_rpad_and_clip_length_axis1.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_rpad_and_clip_length_axis1.cu new file mode 100644 index 0000000000..37be0aa11b --- /dev/null +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_rpad_and_clip_length_axis1.cu @@ -0,0 +1,49 @@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +// BEGIN PYTHON +// def f(grid, block, args): +// (tomin, fromstarts, fromstops, target, lenstarts, invocation_index, err_code) = args +// scan_in_array = cupy.empty(lenstarts, dtype=cupy.int64) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_rpad_and_clip_length_axis1_a", tomin.dtype, fromstarts.dtype, fromstops.dtype]))(grid, block, (tomin, fromstarts, fromstops, target, lenstarts, scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_rpad_and_clip_length_axis1_b", tomin.dtype, fromstarts.dtype, fromstops.dtype]))(grid, block, (tomin, fromstarts, fromstops, target, lenstarts, scan_in_array, invocation_index, err_code)) +// out["awkward_ListArray_rpad_and_clip_length_axis1_a", {dtype_specializations}] = None +// out["awkward_ListArray_rpad_and_clip_length_axis1_b", {dtype_specializations}] = None +// END PYTHON + +template +__global__ void +awkward_ListArray_rpad_and_clip_length_axis1_a( + T* tomin, + const C* fromstarts, + const U* fromstops, + int64_t target, + int64_t lenstarts, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < lenstarts) { + int64_t rangeval = fromstops[thread_id] - fromstarts[thread_id]; + scan_in_array[thread_id] = (target > rangeval) ? target : rangeval; + } + } +} + +template +__global__ void +awkward_ListArray_rpad_and_clip_length_axis1_b( + T* tomin, + const C* fromstarts, + const U* fromstops, + int64_t target, + int64_t lenstarts, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + *tomin = lenstarts > 0 ? scan_in_array[lenstarts - 1] : 0; + } +} diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_validity.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_validity.cu index befc592c13..c37de8100b 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_validity.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_validity.cu @@ -6,19 +6,20 @@ enum class LISTARRAY_VALIDITY_ERRORS { ERROR_STOP_CONTENT // message: "stop[i] > len(content)" }; -template +template __global__ void -awkward_ListArray_validity(const C* starts, - const T* stops, - int64_t length, - int64_t lencontent, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_ListArray_validity( + const T* starts, + const C* stops, + int64_t length, + int64_t lencontent, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { - C start = starts[thread_id]; - T stop = stops[thread_id]; + T start = starts[thread_id]; + C stop = stops[thread_id]; if (start != stop) { if (start > stop) { RAISE_ERROR(LISTARRAY_VALIDITY_ERRORS::ERROR_START_STOP) diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_flatten_offsets.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_flatten_offsets.cu index 87e09eb853..5bf7bab47e 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_flatten_offsets.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_flatten_offsets.cu @@ -2,13 +2,14 @@ template __global__ void -awkward_ListOffsetArray_flatten_offsets(T* tooffsets, - const C* outeroffsets, - int64_t outeroffsetslen, - const U* inneroffsets, - int64_t inneroffsetslen, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_ListOffsetArray_flatten_offsets( + T* tooffsets, + const C* outeroffsets, + int64_t outeroffsetslen, + const U* inneroffsets, + int64_t inneroffsetslen, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outeroffsetslen) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_nonlocal_nextstarts_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_nonlocal_nextstarts_64.cu new file mode 100644 index 0000000000..d7aa492421 --- /dev/null +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_nonlocal_nextstarts_64.cu @@ -0,0 +1,52 @@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +// BEGIN PYTHON +// def f(grid, block, args): +// (nextstarts, nextparents, nextlen, invocation_index, err_code) = args +// scan_in_array = cupy.empty(nextlen, dtype=cupy.int64) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_nonlocal_nextstarts_64_a", nextstarts.dtype, nextparents.dtype]))(grid, block, (nextstarts, nextparents, nextlen, scan_in_array, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_nonlocal_nextstarts_64_b", nextstarts.dtype, nextparents.dtype]))(grid, block, (nextstarts, nextparents, nextlen, scan_in_array, invocation_index, err_code)) +// out["awkward_ListOffsetArray_reduce_nonlocal_nextstarts_64_a", {dtype_specializations}] = None +// out["awkward_ListOffsetArray_reduce_nonlocal_nextstarts_64_b", {dtype_specializations}] = None +// END PYTHON + +template +__global__ void +awkward_ListOffsetArray_reduce_nonlocal_nextstarts_64_a( + T* nextstarts, + const C* nextparents, + int64_t nextlen, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < nextlen) { + if (thread_id == 0) { + scan_in_array[0] = -1; + } + scan_in_array[thread_id + 1] = nextparents[thread_id]; + } + } +} + +template +__global__ void +awkward_ListOffsetArray_reduce_nonlocal_nextstarts_64_b( + T* nextstarts, + const C* nextparents, + int64_t nextlen, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < nextlen) { + if (nextparents[thread_id] != scan_in_array[thread_id]) { + nextstarts[nextparents[thread_id]] = thread_id; + } + } + } +} diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_rpad_and_clip_axis1.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_rpad_and_clip_axis1.cu index c3032e77d8..44249232e9 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_rpad_and_clip_axis1.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_rpad_and_clip_axis1.cu @@ -2,12 +2,13 @@ template __global__ void -awkward_ListOffsetArray_rpad_and_clip_axis1(T* toindex, - const C* fromoffsets, - int64_t length, - int64_t target, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_ListOffsetArray_rpad_and_clip_axis1( + T* toindex, + const C* fromoffsets, + int64_t length, + int64_t target, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = (blockIdx.x * blockDim.x + threadIdx.x) / target; int64_t thready_id = (blockIdx.x * blockDim.x + threadIdx.x) % target; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_rpad_axis1.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_rpad_axis1.cu index 449f6b84ea..a21f746f5c 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_rpad_axis1.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_rpad_axis1.cu @@ -2,12 +2,13 @@ template __global__ void -awkward_ListOffsetArray_rpad_axis1(T* toindex, - const C* fromoffsets, - int64_t fromlength, - int64_t target, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_ListOffsetArray_rpad_axis1( + T* toindex, + const C* fromoffsets, + int64_t fromlength, + int64_t target, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = (blockIdx.x * blockDim.x + threadIdx.x) / target; int64_t thready_id = (blockIdx.x * blockDim.x + threadIdx.x) % target; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_MaskedArray_getitem_next_jagged_project.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_MaskedArray_getitem_next_jagged_project.cu index 26372ebe46..af0ab2faea 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_MaskedArray_getitem_next_jagged_project.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_MaskedArray_getitem_next_jagged_project.cu @@ -5,7 +5,7 @@ // (index, starts_in, stops_in, starts_out, stops_out, length, invocation_index, err_code) = args // scan_in_array = cupy.empty(length, dtype=cupy.int64) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_MaskedArray_getitem_next_jagged_project_a", index.dtype, starts_in.dtype, stops_in.dtype, starts_out.dtype, stops_out.dtype]))(grid, block, (index, starts_in, stops_in, starts_out, stops_out, length, scan_in_array, invocation_index, err_code)) -// scan_in_array = inclusive_scan(grid, block, (scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_MaskedArray_getitem_next_jagged_project_b", index.dtype, starts_in.dtype, stops_in.dtype, starts_out.dtype, stops_out.dtype]))(grid, block, (index, starts_in, stops_in, starts_out, stops_out, length, scan_in_array, invocation_index, err_code)) // out["awkward_MaskedArray_getitem_next_jagged_project_a", {dtype_specializations}] = None // out["awkward_MaskedArray_getitem_next_jagged_project_b", {dtype_specializations}] = None @@ -13,15 +13,16 @@ template __global__ void -awkward_MaskedArray_getitem_next_jagged_project_a(T* index, - C* starts_in, - U* stops_in, - V* starts_out, - W* stops_out, - int64_t length, - int64_t* scan_in_array, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_MaskedArray_getitem_next_jagged_project_a( + T* index, + C* starts_in, + U* stops_in, + V* starts_out, + W* stops_out, + int64_t length, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -37,15 +38,16 @@ awkward_MaskedArray_getitem_next_jagged_project_a(T* index, template __global__ void -awkward_MaskedArray_getitem_next_jagged_project_b(T* index, - C* starts_in, - U* stops_in, - V* starts_out, - W* stops_out, - int64_t length, - int64_t* scan_in_array, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_MaskedArray_getitem_next_jagged_project_b( + T* index, + C* starts_in, + U* stops_in, + V* starts_out, + W* stops_out, + int64_t length, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_fill.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_fill.cu index ca5a6e6f18..e0ef17c78f 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_fill.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_fill.cu @@ -2,12 +2,13 @@ template __global__ void -awkward_NumpyArray_fill(T* toptr, - int64_t tooffset, - const C* fromptr, - int64_t length, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_NumpyArray_fill( + T* toptr, + int64_t tooffset, + const C* fromptr, + int64_t length, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_reduce_adjust_starts_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_reduce_adjust_starts_64.cu index 59ac8caf21..75595ff90f 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_reduce_adjust_starts_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_reduce_adjust_starts_64.cu @@ -2,12 +2,13 @@ template __global__ void -awkward_NumpyArray_reduce_adjust_starts_64(T* toptr, - int64_t outlength, - const C* parents, - const U* starts, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_NumpyArray_reduce_adjust_starts_64( + T* toptr, + int64_t outlength, + const C* parents, + const U* starts, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_reduce_adjust_starts_shifts_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_reduce_adjust_starts_shifts_64.cu index 526dbde17c..eead767e76 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_reduce_adjust_starts_shifts_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_reduce_adjust_starts_shifts_64.cu @@ -2,13 +2,14 @@ template __global__ void -awkward_NumpyArray_reduce_adjust_starts_shifts_64(T* toptr, - int64_t outlength, - const C* parents, - const U* starts, - const V* shifts, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_NumpyArray_reduce_adjust_starts_shifts_64( + T* toptr, + int64_t outlength, + const C* parents, + const U* starts, + const V* shifts, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_reduce_mask_ByteMaskedArray_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_reduce_mask_ByteMaskedArray_64.cu index 1d8400ea67..17282b7176 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_reduce_mask_ByteMaskedArray_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_NumpyArray_reduce_mask_ByteMaskedArray_64.cu @@ -2,12 +2,13 @@ template __global__ void -awkward_NumpyArray_reduce_mask_ByteMaskedArray_64(T* toptr, - const C* parents, - int64_t lenparents, - int64_t outlength, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_NumpyArray_reduce_mask_ByteMaskedArray_64( + T* toptr, + const C* parents, + int64_t lenparents, + int64_t outlength, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_carry.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_carry.cu index 4a1c41ad4f..c5f993afd1 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_carry.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_carry.cu @@ -2,12 +2,13 @@ template __global__ void -awkward_RegularArray_getitem_carry(T* tocarry, - const C* fromcarry, - int64_t lencarry, - int64_t size, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_RegularArray_getitem_carry( + T* tocarry, + const C* fromcarry, + int64_t lencarry, + int64_t size, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = (blockIdx.x * blockDim.x + threadIdx.x) / size; int64_t thready_id = (blockIdx.x * blockDim.x + threadIdx.x) % size; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_jagged_expand.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_jagged_expand.cu index 71a0c1f1c2..8c232aca7f 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_jagged_expand.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_jagged_expand.cu @@ -2,13 +2,14 @@ template __global__ void -awkward_RegularArray_getitem_jagged_expand(T* multistarts, - C* multistops, - const U* singleoffsets, - int64_t regularsize, - int64_t regularlength, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_RegularArray_getitem_jagged_expand( + T* multistarts, + C* multistops, + const U* singleoffsets, + int64_t regularsize, + int64_t regularlength, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = (blockIdx.x * blockDim.x + threadIdx.x) / regularsize; int64_t thready_id = (blockIdx.x * blockDim.x + threadIdx.x) % regularsize; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_array.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_array.cu index d7d188d8c4..ba4b229917 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_array.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_array.cu @@ -2,14 +2,15 @@ template __global__ void -awkward_RegularArray_getitem_next_array(T* tocarry, - C* toadvanced, - const U* fromarray, - int64_t length, - int64_t lenarray, - int64_t size, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_RegularArray_getitem_next_array( + T* tocarry, + C* toadvanced, + const U* fromarray, + int64_t length, + int64_t lenarray, + int64_t size, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = (blockIdx.x * blockDim.x + threadIdx.x) / lenarray; int64_t thready_id = (blockIdx.x * blockDim.x + threadIdx.x) % lenarray; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_array_advanced.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_array_advanced.cu index fdbe2c8206..79382736d2 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_array_advanced.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_array_advanced.cu @@ -2,15 +2,16 @@ template __global__ void -awkward_RegularArray_getitem_next_array_advanced(T* tocarry, - C* toadvanced, - const U* fromadvanced, - const V* fromarray, - int64_t length, - int64_t lenarray, - int64_t size, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_RegularArray_getitem_next_array_advanced( + T* tocarry, + C* toadvanced, + const U* fromadvanced, + const V* fromarray, + int64_t length, + int64_t lenarray, + int64_t size, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_array_regularize.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_array_regularize.cu new file mode 100644 index 0000000000..c5b9d50cc1 --- /dev/null +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_array_regularize.cu @@ -0,0 +1,62 @@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +// BEGIN PYTHON +// def f(grid, block, args): +// (toarray, fromarray, lenarray, size, invocation_index, err_code) = args +// scan_in_array = cupy.empty(lenarray, dtype=cupy.int64) +// cuda_kernel_templates.get_function(fetch_specialization(['awkward_RegularArray_getitem_next_array_regularize_a', toarray.dtype, fromarray.dtype]))(grid, block, (toarray, fromarray, lenarray, size, scan_in_array, invocation_index, err_code)) +// cuda_kernel_templates.get_function(fetch_specialization(['awkward_RegularArray_getitem_next_array_regularize_b', toarray.dtype, fromarray.dtype]))(grid, block, (toarray, fromarray, lenarray, size, scan_in_array, invocation_index, err_code)) +// out["awkward_RegularArray_getitem_next_array_regularize_a", {dtype_specializations}] = None +// out["awkward_RegularArray_getitem_next_array_regularize_b", {dtype_specializations}] = None +// END PYTHON + +enum class REGULARARRAY_GETITEM_NEXT_ARRAY_REGULARIZE_ERRORS { + IND_OUT_OF_RANGE // message: "index out of range" +}; + +template +__global__ void +awkward_RegularArray_getitem_next_array_regularize_a( + T* toarray, + const C* fromarray, + int64_t lenarray, + int64_t size, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < lenarray) { + scan_in_array[thread_id] = fromarray[thread_id]; + if (scan_in_array[thread_id] < 0) { + scan_in_array[thread_id] = fromarray[thread_id] + size; + } + if (!(0 <= scan_in_array[thread_id] && scan_in_array[thread_id] < size)) { + RAISE_ERROR(REGULARARRAY_GETITEM_NEXT_ARRAY_REGULARIZE_ERRORS::IND_OUT_OF_RANGE) + } + } + } +} + +template +__global__ void +awkward_RegularArray_getitem_next_array_regularize_b( + T* toarray, + const C* fromarray, + int64_t lenarray, + int64_t size, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + if (thread_id < lenarray) { + toarray[thread_id] = scan_in_array[thread_id]; + if (!(0 <= toarray[thread_id] && toarray[thread_id] < size)) { + RAISE_ERROR(REGULARARRAY_GETITEM_NEXT_ARRAY_REGULARIZE_ERRORS::IND_OUT_OF_RANGE) + } + } + } +} diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_at.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_at.cu index be74e01221..8f1282974d 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_at.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_at.cu @@ -6,12 +6,13 @@ enum class REGULARARRAY_GETITEM_NEXT_AT_ERRORS { template __global__ void -awkward_RegularArray_getitem_next_at(T* tocarry, - int64_t at, - int64_t length, - int64_t size, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_RegularArray_getitem_next_at( + T* tocarry, + int64_t at, + int64_t length, + int64_t size, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; int64_t regular_at = at; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_range.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_range.cu index 7af734b0a1..ff25c86a11 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_range.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_range.cu @@ -2,14 +2,15 @@ template __global__ void -awkward_RegularArray_getitem_next_range(T* tocarry, - int64_t regular_start, - int64_t step, - int64_t length, - int64_t size, - int64_t nextsize, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_RegularArray_getitem_next_range( + T* tocarry, + int64_t regular_start, + int64_t step, + int64_t length, + int64_t size, + int64_t nextsize, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = (blockIdx.x * blockDim.x + threadIdx.x) / nextsize; int64_t thready_id = (blockIdx.x * blockDim.x + threadIdx.x) % nextsize; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_range_spreadadvanced.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_range_spreadadvanced.cu index 9984fc787e..814b31aa9a 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_range_spreadadvanced.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_getitem_next_range_spreadadvanced.cu @@ -2,12 +2,13 @@ template __global__ void -awkward_RegularArray_getitem_next_range_spreadadvanced(T* toadvanced, - const C* fromadvanced, - int64_t length, - int64_t nextsize, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_RegularArray_getitem_next_range_spreadadvanced( + T* toadvanced, + const C* fromadvanced, + int64_t length, + int64_t nextsize, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = (blockIdx.x * blockDim.x + threadIdx.x) / nextsize; int64_t thready_id = (blockIdx.x * blockDim.x + threadIdx.x) % nextsize; @@ -15,6 +16,5 @@ awkward_RegularArray_getitem_next_range_spreadadvanced(T* toadvanced, if (thread_id < length) { toadvanced[(thread_id * nextsize) + thready_id] = fromadvanced[thread_id]; } - toadvanced[(thread_id * nextsize) + thready_id] = fromadvanced[thread_id]; } } diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_localindex.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_localindex.cu index dce60cd4ee..597098e2a2 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_localindex.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_localindex.cu @@ -2,11 +2,12 @@ template __global__ void -awkward_RegularArray_localindex(T* toindex, - int64_t size, - int64_t length, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_RegularArray_localindex( + T* toindex, + int64_t size, + int64_t length, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = (blockIdx.x * blockDim.x + threadIdx.x) / size; int64_t thready_id = (blockIdx.x * blockDim.x + threadIdx.x) % size; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_reduce_local_nextparents.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_reduce_local_nextparents.cu new file mode 100644 index 0000000000..b55ee1d7f6 --- /dev/null +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_reduce_local_nextparents.cu @@ -0,0 +1,48 @@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +// BEGIN PYTHON +// def f(grid, block, args): +// (nextparents, size, length, invocation_index, err_code) = args +// scan_in_array = cupy.empty(length * size, dtype=cupy.int64) +// cuda_kernel_templates.get_function(fetch_specialization(['awkward_RegularArray_reduce_local_nextparents_a', nextparents.dtype]))(grid, block, (nextparents, size, length, scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) +// cuda_kernel_templates.get_function(fetch_specialization(['awkward_RegularArray_reduce_local_nextparents_b', nextparents.dtype]))(grid, block, (nextparents, size, length, scan_in_array, invocation_index, err_code)) +// out["awkward_RegularArray_reduce_local_nextparents_a", {dtype_specializations}] = None +// out["awkward_RegularArray_reduce_local_nextparents_b", {dtype_specializations}] = None +// END PYTHON + +template +__global__ void +awkward_RegularArray_reduce_local_nextparents_a( + T* nextparents, + int64_t size, + int64_t length, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int64_t len = length * size; + if (thread_id < len) { + scan_in_array[thread_id] = 1; + } + } +} + +template +__global__ void +awkward_RegularArray_reduce_local_nextparents_b( + T* nextparents, + int64_t size, + int64_t length, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = (blockIdx.x * blockDim.x + threadIdx.x) / size; + int64_t thready_id = (blockIdx.x * blockDim.x + threadIdx.x) % size; + if (thread_id < length && thready_id < size) { + nextparents[scan_in_array[thread_id * size + thready_id] - 1] = thread_id; + } + } +} diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_reduce_nonlocal_preparenext.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_reduce_nonlocal_preparenext.cu new file mode 100644 index 0000000000..e8d063fae7 --- /dev/null +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_reduce_nonlocal_preparenext.cu @@ -0,0 +1,53 @@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +// BEGIN PYTHON +// def f(grid, block, args): +// (nextcarry, nextparents, parents, size, length, invocation_index, err_code) = args +// scan_in_array = cupy.empty(length * size, dtype=cupy.int64) +// cuda_kernel_templates.get_function(fetch_specialization(['awkward_RegularArray_reduce_nonlocal_preparenext_a', nextcarry.dtype, nextparents.dtype, parents.dtype]))(grid, block, (nextcarry, nextparents, parents, size, length, scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) +// cuda_kernel_templates.get_function(fetch_specialization(['awkward_RegularArray_reduce_nonlocal_preparenext_b', nextcarry.dtype, nextparents.dtype, parents.dtype]))(grid, block, (nextcarry, nextparents, parents, size, length, scan_in_array, invocation_index, err_code)) +// out["awkward_RegularArray_reduce_nonlocal_preparenext_a", {dtype_specializations}] = None +// out["awkward_RegularArray_reduce_nonlocal_preparenext_b", {dtype_specializations}] = None +// END PYTHON + +template +__global__ void +awkward_RegularArray_reduce_nonlocal_preparenext_a( + T* nextcarry, + C* nextparents, + const U* parents, + int64_t size, + int64_t length, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { +if (err_code[0] == NO_ERROR) { + int64_t thready_id = blockIdx.x * blockDim.x + threadIdx.x; + int64_t len = length * size; + if (thready_id < len) { + scan_in_array[thready_id] = 1; + } + } +} + +template +__global__ void +awkward_RegularArray_reduce_nonlocal_preparenext_b( + T* nextcarry, + C* nextparents, + const U* parents, + int64_t size, + int64_t length, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thready_id = (blockIdx.x * blockDim.x + threadIdx.x) / length; + int64_t thread_id = (blockIdx.x * blockDim.x + threadIdx.x) % length; + if (thread_id < length && thready_id < size) { + nextcarry[scan_in_array[thready_id * length + thread_id] - 1] = thread_id * size + thready_id; + nextparents[scan_in_array[thready_id * length + thread_id] - 1] = parents[thread_id] * size + thready_id; + } + } +} diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_rpad_and_clip_axis1.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_rpad_and_clip_axis1.cu index ada6cfd004..fd9d23ab41 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_rpad_and_clip_axis1.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_RegularArray_rpad_and_clip_axis1.cu @@ -2,12 +2,13 @@ template __global__ void -awkward_RegularArray_rpad_and_clip_axis1(T* toindex, - int64_t target, - int64_t size, - int64_t length, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_RegularArray_rpad_and_clip_axis1( + T* toindex, + int64_t target, + int64_t size, + int64_t length, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = (blockIdx.x * blockDim.x + threadIdx.x) / target; int64_t thready_id = (blockIdx.x * blockDim.x + threadIdx.x) % target; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_fillindex.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_fillindex.cu index 1b3e2d0444..6f76000d3d 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_fillindex.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_fillindex.cu @@ -2,12 +2,13 @@ template __global__ void -awkward_UnionArray_fillindex(T* toindex, - int64_t toindexoffset, - const C* fromindex, - int64_t length, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_UnionArray_fillindex( + T* toindex, + int64_t toindexoffset, + const C* fromindex, + int64_t length, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_fillindex_count.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_fillindex_count.cu index ff9676b49b..88d9559305 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_fillindex_count.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_fillindex_count.cu @@ -2,11 +2,12 @@ template __global__ void -awkward_UnionArray_fillindex_count(T* toindex, - int64_t toindexoffset, - int64_t length, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_UnionArray_fillindex_count( + T* toindex, + int64_t toindexoffset, + int64_t length, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_fillna.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_fillna.cu index fe8b2643ea..d44e103852 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_fillna.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_fillna.cu @@ -2,11 +2,12 @@ template __global__ void -awkward_UnionArray_fillna(T* toindex, - const C* fromindex, - int64_t length, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_UnionArray_fillna( + T* toindex, + const C* fromindex, + int64_t length, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_filltags.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_filltags.cu index 314c2dc96a..c826e3dcb3 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_filltags.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_filltags.cu @@ -2,13 +2,14 @@ template __global__ void -awkward_UnionArray_filltags(T* totags, - int64_t totagsoffset, - const C* fromtags, - int64_t length, - int64_t base, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_UnionArray_filltags( + T* totags, + int64_t totagsoffset, + const C* fromtags, + int64_t length, + int64_t base, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_filltags_const.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_filltags_const.cu index a4f62bda6b..bf8935abef 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_filltags_const.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_filltags_const.cu @@ -2,12 +2,13 @@ template __global__ void -awkward_UnionArray_filltags_const(T* totags, - int64_t totagsoffset, - int64_t length, - int64_t base, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_UnionArray_filltags_const( + T* totags, + int64_t totagsoffset, + int64_t length, + int64_t base, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_project.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_project.cu index 24ed394fbd..b758010d14 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_project.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_project.cu @@ -3,7 +3,7 @@ // (lenout, tocarry, fromtags, fromindex, length, which, invocation_index, err_code) = args // scan_in_array = cupy.empty(length, dtype=cupy.int64) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_UnionArray_project_a", lenout.dtype, tocarry.dtype, fromtags.dtype, fromindex.dtype]))(grid, block, (lenout, tocarry, fromtags, fromindex, length, which, scan_in_array, invocation_index, err_code)) -// scan_in_array = inclusive_scan(grid, block, (scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) // cuda_kernel_templates.get_function(fetch_specialization(["awkward_UnionArray_project_b", lenout.dtype, tocarry.dtype, fromtags.dtype, fromindex.dtype]))(grid, block, (lenout, tocarry, fromtags, fromindex, length, which, scan_in_array, invocation_index, err_code)) // out["awkward_UnionArray_project_a", {dtype_specializations}] = None // out["awkward_UnionArray_project_b", {dtype_specializations}] = None @@ -11,15 +11,16 @@ template __global__ void -awkward_UnionArray_project_a(T* lenout, - C* tocarry, - const U* fromtags, - const V* fromindex, - int64_t length, - int64_t which, - int64_t* scan_in_array, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_UnionArray_project_a( + T* lenout, + C* tocarry, + const U* fromtags, + const V* fromindex, + int64_t length, + int64_t which, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { @@ -34,19 +35,19 @@ awkward_UnionArray_project_a(T* lenout, template __global__ void -awkward_UnionArray_project_b(T* lenout, - C* tocarry, - const U* fromtags, - const V* fromindex, - int64_t length, - int64_t which, - int64_t* scan_in_array, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_UnionArray_project_b( + T* lenout, + C* tocarry, + const U* fromtags, + const V* fromindex, + int64_t length, + int64_t which, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { - lenout[0] = scan_in_array[length - 1]; + *lenout = length > 0 ? scan_in_array[length - 1] : 0; int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; - if (thread_id < length) { if (fromtags[thread_id] == which) { tocarry[scan_in_array[thread_id] - 1] = fromindex[thread_id]; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_simplify.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_simplify.cu index bcba812aba..a1bf597f06 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_simplify.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_simplify.cu @@ -2,19 +2,20 @@ template __global__ void -awkward_UnionArray_simplify(T* totags, - C* toindex, - const U* outertags, - const V* outerindex, - const W* innertags, - const X* innerindex, - int64_t towhich, - int64_t innerwhich, - int64_t outerwhich, - int64_t length, - int64_t base, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_UnionArray_simplify( + T* totags, + C* toindex, + const U* outertags, + const V* outerindex, + const W* innertags, + const X* innerindex, + int64_t towhich, + int64_t innerwhich, + int64_t outerwhich, + int64_t length, + int64_t base, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_simplify_one.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_simplify_one.cu index 7d78b98c4c..936975a787 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_simplify_one.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_simplify_one.cu @@ -2,16 +2,17 @@ template __global__ void -awkward_UnionArray_simplify_one(T* totags, - C* toindex, - const U* fromtags, - const V* fromindex, - int64_t towhich, - int64_t fromwhich, - int64_t length, - int64_t base, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_UnionArray_simplify_one( + T* totags, + C* toindex, + const U* fromtags, + const V* fromindex, + int64_t towhich, + int64_t fromwhich, + int64_t length, + int64_t base, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_validity.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_validity.cu index d0a3899109..e24e5a3cf9 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_validity.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_UnionArray_validity.cu @@ -9,13 +9,14 @@ enum class UNIONARRAY_VALIDITY_ERRORS { template __global__ void -awkward_UnionArray_validity(const T* tags, - const C* index, - int64_t length, - int64_t numcontents, - const U* lencontents, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_UnionArray_validity( + const T* tags, + const C* index, + int64_t length, + int64_t numcontents, + const U* lencontents, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_index_rpad_and_clip_axis0.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_index_rpad_and_clip_axis0.cu index bb2eb7db61..b7bdacfc99 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_index_rpad_and_clip_axis0.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_index_rpad_and_clip_axis0.cu @@ -2,11 +2,12 @@ template __global__ void -awkward_index_rpad_and_clip_axis0(T* toindex, - int64_t target, - int64_t length, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_index_rpad_and_clip_axis0( + T* toindex, + int64_t target, + int64_t length, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_index_rpad_and_clip_axis1.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_index_rpad_and_clip_axis1.cu index 5237e63204..cd5d1f6f5b 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_index_rpad_and_clip_axis1.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_index_rpad_and_clip_axis1.cu @@ -2,12 +2,13 @@ template __global__ void -awkward_index_rpad_and_clip_axis1(T* tostarts, - C* tostops, - int64_t target, - int64_t length, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_index_rpad_and_clip_axis1( + T* tostarts, + C* tostops, + int64_t target, + int64_t length, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_localindex.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_localindex.cu index 5e13416426..afdec75eff 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_localindex.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_localindex.cu @@ -2,10 +2,11 @@ template __global__ void -awkward_localindex(T* toindex, - int64_t length, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_localindex( + T* toindex, + int64_t length, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < length) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_missing_repeat.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_missing_repeat.cu index e16b0d700f..efd631085a 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_missing_repeat.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_missing_repeat.cu @@ -2,13 +2,14 @@ template __global__ void -awkward_missing_repeat(T* outindex, - const C* index, - int64_t indexlength, - int64_t repetitions, - int64_t regularsize, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_missing_repeat( + T* outindex, + const C* index, + int64_t indexlength, + int64_t repetitions, + int64_t regularsize, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = (blockIdx.x * blockDim.x + threadIdx.x) / indexlength; int64_t thready_id = (blockIdx.x * blockDim.x + threadIdx.x) % indexlength; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu index ca88685e0e..7cd5da0f33 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu @@ -11,13 +11,14 @@ template __global__ void -awkward_reduce_argmax_a(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_argmax_a( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { @@ -28,13 +29,14 @@ awkward_reduce_argmax_a(T* toptr, template __global__ void -awkward_reduce_argmax_b(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_argmax_b( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu index 7ef169e498..282ebd11cc 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu @@ -11,13 +11,14 @@ template __global__ void -awkward_reduce_argmin_a(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_argmin_a( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { @@ -28,13 +29,14 @@ awkward_reduce_argmin_a(T* toptr, template __global__ void -awkward_reduce_argmin_b(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_argmin_b( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu index afab5efeaa..0870da2ff7 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu @@ -14,14 +14,15 @@ template __global__ void -awkward_reduce_count_64_a(T* toptr, - const bool* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_count_64_a( + T* toptr, + const bool* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + uint64_t* atomicAdd_toptr, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu index b6fdd6f92d..6b07dfa208 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu @@ -14,14 +14,15 @@ template __global__ void -awkward_reduce_countnonzero_a(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_countnonzero_a( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + uint64_t* atomicAdd_toptr, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -33,14 +34,15 @@ awkward_reduce_countnonzero_a(T* toptr, template __global__ void -awkward_reduce_countnonzero_b(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_countnonzero_b( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + uint64_t* atomicAdd_toptr, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu index 14b2fd1351..3c20b653ac 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu @@ -11,14 +11,15 @@ template __global__ void -awkward_reduce_max_a(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - T identity, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_max_a( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + T identity, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { @@ -29,14 +30,15 @@ awkward_reduce_max_a(T* toptr, template __global__ void -awkward_reduce_max_b(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - T identity, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_max_b( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + T identity, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu index fc08a13175..ae0e2dcb61 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu @@ -11,14 +11,15 @@ template __global__ void -awkward_reduce_min_a(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - T identity, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_min_a( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + T identity, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; if (thread_id < outlength) { @@ -29,14 +30,15 @@ awkward_reduce_min_a(T* toptr, template __global__ void -awkward_reduce_min_b(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - T identity, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_min_b( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + T identity, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu index d688b424e4..74843af6c0 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu @@ -14,14 +14,15 @@ template __global__ void -awkward_reduce_prod_bool_a(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_prod_bool_a( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + uint64_t* atomicAdd_toptr, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -33,14 +34,15 @@ awkward_reduce_prod_bool_a(T* toptr, template __global__ void -awkward_reduce_prod_bool_b(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_prod_bool_b( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + uint64_t* atomicAdd_toptr, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu index abbee36a18..13c5a31dbf 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu @@ -14,14 +14,15 @@ template __global__ void -awkward_reduce_sum_a(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_sum_a( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + uint64_t* atomicAdd_toptr, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -33,14 +34,15 @@ awkward_reduce_sum_a(T* toptr, template __global__ void -awkward_reduce_sum_b(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_sum_b( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + uint64_t* atomicAdd_toptr, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -53,14 +55,15 @@ awkward_reduce_sum_b(T* toptr, template __global__ void -awkward_reduce_sum_c(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_sum_c( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + uint64_t* atomicAdd_toptr, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu index 86fdc77fb0..0e062a6c78 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu @@ -14,14 +14,15 @@ template __global__ void -awkward_reduce_sum_bool_a(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_sum_bool_a( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + uint64_t* atomicAdd_toptr, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -33,14 +34,15 @@ awkward_reduce_sum_bool_a(T* toptr, template __global__ void -awkward_reduce_sum_bool_b(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_sum_bool_b( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + uint64_t* atomicAdd_toptr, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu index dd10241e90..8bdb3fccc2 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu @@ -14,14 +14,15 @@ template __global__ void -awkward_reduce_sum_int32_bool_64_a(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_sum_int32_bool_64_a( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + uint64_t* atomicAdd_toptr, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -33,14 +34,15 @@ awkward_reduce_sum_int32_bool_64_a(T* toptr, template __global__ void -awkward_reduce_sum_int32_bool_64_b(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_sum_int32_bool_64_b( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + uint64_t* atomicAdd_toptr, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu index 5a515169e0..041558a663 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu @@ -14,14 +14,15 @@ template __global__ void -awkward_reduce_sum_int64_bool_64_a(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_sum_int64_bool_64_a( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + uint64_t* atomicAdd_toptr, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; @@ -33,14 +34,15 @@ awkward_reduce_sum_int64_bool_64_a(T* toptr, template __global__ void -awkward_reduce_sum_int64_bool_64_b(T* toptr, - const C* fromptr, - const U* parents, - int64_t lenparents, - int64_t outlength, - uint64_t* atomicAdd_toptr, - uint64_t invocation_index, - uint64_t* err_code) { +awkward_reduce_sum_int64_bool_64_b( + T* toptr, + const C* fromptr, + const U* parents, + int64_t lenparents, + int64_t outlength, + uint64_t* atomicAdd_toptr, + uint64_t invocation_index, + uint64_t* err_code) { if (err_code[0] == NO_ERROR) { int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_sorting_ranges_length.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_sorting_ranges_length.cu new file mode 100644 index 0000000000..d0fdb4ddbc --- /dev/null +++ b/src/awkward/_connect/cuda/cuda_kernels/awkward_sorting_ranges_length.cu @@ -0,0 +1,51 @@ +// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + +// BEGIN PYTHON +// def f(grid, block, args): +// (tolength, parents, parentslength, invocation_index, err_code) = args +// scan_in_array = cupy.empty(parentslength + 1, dtype=cupy.int64) +// cuda_kernel_templates.get_function(fetch_specialization(['awkward_sorting_ranges_length_a', tolength.dtype, parents.dtype]))(grid, block, (tolength, parents, parentslength, scan_in_array, invocation_index, err_code)) +// scan_in_array = cupy.cumsum(scan_in_array) +// cuda_kernel_templates.get_function(fetch_specialization(['awkward_sorting_ranges_length_b', tolength.dtype, parents.dtype]))(grid, block, (tolength, parents, parentslength, scan_in_array, invocation_index, err_code)) +// out["awkward_sorting_ranges_length_a", {dtype_specializations}] = None +// out["awkward_sorting_ranges_length_b", {dtype_specializations}] = None +// END PYTHON + +template +__global__ void +awkward_sorting_ranges_length_a( + T* tolength, + const C* parents, + int64_t parentslength, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_id == 0 ) { + scan_in_array[thread_id] = 2; + } + if (thread_id > 0 && thread_id < parentslength) { + if (parents[thread_id - 1] != parents[thread_id]) { + scan_in_array[thread_id] = 1; + } + else { + scan_in_array[thread_id] = 0; + } + } + } +} + +template +__global__ void +awkward_sorting_ranges_length_b( + T* tolength, + const C* parents, + int64_t parentslength, + int64_t* scan_in_array, + uint64_t invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + *tolength = parentslength > 0 ? scan_in_array[parentslength - 1] : scan_in_array[0]; + } +} diff --git a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu index 3c5cdf37f9..8a02094f34 100644 --- a/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu +++ b/src/awkward/_connect/cuda/cuda_kernels/cuda_common.cu @@ -46,6 +46,17 @@ typedef unsigned long long uintmax_t; // stride = stride * 2 // return d_final // out['inclusive_scan_kernel', cupy.int64] = inclusive_scan + +// def exclusive_scan(grid, block, args): +// print(args) +// (d_in, invocation_index, err_code) = args +// import math +// d_out = cupy.empty(len(d_in), dtype=cupy.int64) +// cuda_kernel_templates.get_function(fetch_specialization(['exclusive_scan_kernel', cupy.int64]))(grid, block, (d_in, d_out, len(d_in), invocation_index, err_code)) +// print(d_out) +// print("\n") +// return d_out +// out['exclusive_scan_kernel', cupy.int64] = exclusive_scan // END PYTHON template @@ -87,3 +98,49 @@ inclusive_scan_kernel(T* d_in, } } } + + +template +__global__ void +exclusive_scan_kernel(T* input, + T* output, + int64_t n, + uint64_t* invocation_index, + uint64_t* err_code) { + if (err_code[0] == NO_ERROR) { + extern __shared__ int temp[1024*2]; + int tid = threadIdx.x; + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + temp[tid] = input[idx]; + } else { + temp[tid] = 0; + } + __syncthreads(); + + for (int stride = 1; stride <= 1024; stride *= 2) { + int index = (tid + 1) * stride * 2 - 1; + if (index < 2 * 1024) { + temp[index] += temp[index - stride]; + } + __syncthreads(); + } + + if (tid == 0) { + temp[2 * 1024 - 1] = 0; + } + __syncthreads(); + + for (int stride = 1024; stride > 0; stride /= 2) { + int index = (tid + 1) * stride * 2 - 1; + if (index + stride < 2 * 1024) { + temp[index + stride] += temp[index]; + } + __syncthreads(); + } + + if (idx < n) { + output[idx] = temp[tid]; + } + } +} diff --git a/tests-cuda/test_1276_cuda_num.py b/tests-cuda/test_1276_cuda_num.py index 2ce8b86b70..b283a4a835 100644 --- a/tests-cuda/test_1276_cuda_num.py +++ b/tests-cuda/test_1276_cuda_num.py @@ -14,7 +14,6 @@ pytest.skip(reason="too old Numba version", allow_module_level=True) -@pytest.mark.xfail(reason="unimplemented CUDA Kernels (awkward_ByteMaskedArray_numnull") def test_num_1(): content = ak.Array( ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine"] @@ -25,11 +24,8 @@ def test_num_1(): ) cuda_array = ak.to_backend(array, "cuda") assert ak.num(cuda_array, 0) == ak.num(array, 0) - with pytest.raises(NotImplementedError): - ak.num(cuda_array, 1) -@pytest.mark.xfail(reason="unimplemented CUDA Kernels (awkward_ByteMaskedArray_numnull") def test_num_2(): content = ak.Array( ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine"] @@ -40,8 +36,6 @@ def test_num_2(): ) cuda_array = ak.to_backend(array, "cuda") assert ak.num(cuda_array, 0) == ak.num(array, 0) - with pytest.raises(NotImplementedError): - ak.num(cuda_array, 1) def test_num_3(): diff --git a/tests-cuda/test_1276_cuda_transfers.py b/tests-cuda/test_1276_cuda_transfers.py index 75627b15b5..8886fd7b70 100644 --- a/tests-cuda/test_1276_cuda_transfers.py +++ b/tests-cuda/test_1276_cuda_transfers.py @@ -284,7 +284,6 @@ def test_tocuda_unimplementedkernels14(): assert ak.to_list(copyback_bytemaskedarray) == ak.to_list(bytemaskedarray) -@pytest.mark.xfail(reason="awkward_ListArray_broadcast_tooffsets is not implemented") def test_tocuda_unimplementedkernels15(): ioa = ak.contents.IndexedOptionArray( ak.index.Index32([-30, 19, 6, 7, -3, 21, 13, 22, 17, 9, -12, 16]),