-
Notifications
You must be signed in to change notification settings - Fork 0
/
shared_memory_total_warp_reduce_op.hpp
executable file
·136 lines (98 loc) · 6.46 KB
/
shared_memory_total_warp_reduce_op.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
// 一个带warp reduce模板,其中warp块是按照行对齐的,一行包含多个warp,一个warp不能包含多行。
// 需要满足每一行的块为32的倍数,每个warp将自己的内容放到block中,然后在block中做一次归约。
// 因为每个warp负责的内容都在一行内,所以行列分块没有意义,这里本质上要一个线程一个非零元就好了,所以无视thread层级是怎么分块的
// 都让一个线程负责一个非零元即可
// 在树状归约问题上,先可以让每次归约的结果放在共享内存中
// 这个版本的warp结果可以在sharedmemory中规约,还有一个原子加的版本,和thread的版本一样,都有两个版本
#ifndef SHARED_MEMORY_TOTAL_WARP_REDUCE_TEMPLATE_H
#define SHARED_MEMORY_TOTAL_WARP_REDUCE_TEMPLATE_H
#include "struct.hpp"
#include "config.hpp"
#include "arr_optimization.hpp"
#include "code_builder.hpp"
/**
 * Data for one "shared-memory total warp reduce" kernel template, attached to
 * a single dense sub-block of a matrix.
 *
 * The struct groups, for each auxiliary array, a type-erased pointer, the
 * element type (`data_type_of_*`) and the element count (`size_of_*`),
 * followed by the compression state of the index arrays and the launch
 * configuration.
 *
 * Every field has a default member initializer so that a freshly created
 * object never contains indeterminate values, regardless of how it is
 * constructed (`new T`, `new T()`, or a local variable).
 *
 * NOTE(review): ownership of the `void *` arrays and of `matrix` is not
 * visible in this header — confirm who allocates and frees them.
 */
typedef struct shared_memory_total_warp_reduce_template
{
    // Index of the dense block this template describes.
    unsigned long dense_block_index = 0;
    // The matrix that the dense block belongs to.
    sparse_struct_t *matrix = NULL;

    // Row / column index of the first row / column of the current dense sub-block.
    unsigned long kernal_first_row_index = 0;
    unsigned long kernal_first_col_index = 0;

    // Compressed-row padding may exist: only the effective (non-padding)
    // relative row indices need to be mapped back to their original index
    // and have their result written back.
    unsigned long effective_row_num = 0;

    // Whether the reduction is performed with atomic add.
    bool is_atom_add = false;

    // Rebuilt array: offset of each row inside the warp-level intermediate results.
    void *row_offset_in_warp_tmp_result = NULL;
    data_type data_type_of_row_offset_in_warp_tmp_result = data_type();
    unsigned long size_of_row_offset_in_warp_tmp_result = 0;

    // First row index of each block, used for the reduction.
    void *block_first_row_index = NULL;
    data_type data_type_of_block_first_row_index = data_type();
    unsigned long size_of_block_first_row_index = 0;

    // Index of the first warp-granularity block of each block.
    void *block_begin_warp_index_offset = NULL;
    data_type data_type_of_block_begin_warp_index_offset = data_type();
    unsigned long size_of_block_begin_warp_index_offset = 0;

    // Sorting: an optional array holds the row indices from before the sort.
    // The sort may be global or local; the two flags record which one applies.
    bool global_sort_index = false;
    bool local_sort_index = false;
    void *row_index_before_sort = NULL;
    data_type data_type_of_row_index_before_sort = data_type();
    unsigned long size_of_row_index_before_sort = 0;

    // Index of the first non-zero of each warp-granularity block.
    // No block-level index is needed.
    void *global_warp_block_first_nz = NULL;
    data_type data_type_of_global_warp_block_first_nz = data_type();
    unsigned long size_of_global_warp_block_first_nz = 0;

    // All values of the current dense-view sub-block.
    void *val_arr = NULL;
    data_type data_type_of_val_arr = data_type();
    unsigned long size_of_val_arr = 0;

    // All column indices of the current dense-view sub-block.
    void *col_index_arr = NULL;
    data_type data_type_of_col_index_arr = data_type();
    unsigned long size_of_col_index_arr = 0;

    // Compression of the row offsets in the warp results (can be linearly compressed).
    arr_compress_type row_offset_in_warp_tmp_result_compress = NONE_COMPRESS;
    void *row_offset_in_warp_tmp_result_compress_meta = NULL;

    // Compression of each block's first row index (can be linearly compressed).
    arr_compress_type block_first_row_index_compress = NONE_COMPRESS;
    void *block_first_row_index_compress_meta = NULL;

    // Compression of the index of the first warp of each block.
    arr_compress_type block_begin_warp_index_offset_compress = NONE_COMPRESS;
    void *block_begin_warp_index_offset_compress_meta = NULL;

    // Compression of the first non-zero of each warp block.
    arr_compress_type global_warp_block_first_nz_compress = NONE_COMPRESS;
    void *global_warp_block_first_nz_compress_meta = NULL;

    // Compression of the pre-sort row indices.
    arr_compress_type row_index_before_sort_compress = NONE_COMPRESS;
    void *row_index_before_sort_compress_meta = NULL;

    // Number of thread blocks used by this kernel and number of threads per
    // thread block; both defaults are read from the configuration when the
    // object is constructed.
    unsigned long tblock_num = get_config()["DEFAULT_THREAD_BLOCK_NUM"].as_integer();
    unsigned long thread_num_in_block = get_config()["DEFAULT_THREAD_NUM_IN_BLOCK"].as_integer();

    // Parallelism of the per-row tree reduction in this template.
    unsigned long thread_num_of_row_reduce = 1;

    // Hash of this template's id.
    unsigned long hash_of_this_template = 0;
} shared_memory_total_warp_reduce_template_t;
// Creates the template object for the dense block `dense_block_id` of `builder`.
// Returns a pointer to the new template.
// NOTE(review): only the declaration is visible here — confirm how the result
// is allocated and who is responsible for releasing it.
shared_memory_total_warp_reduce_template_t *init_shared_memory_total_warp_reduce_template(code_builder_t *builder, unsigned long dense_block_id);
// Reports whether the dense block `dense_block_id` can be handled by this
// template. Two overloads: one takes the code builder, the other takes the
// matrix directly.
// NOTE(review): the exact support criteria live in the implementation; the
// file header mentions row-aligned warp blocks with row blocks that are a
// multiple of 32 — confirm that this is what is checked.
bool is_supported_by_shared_memory_total_warp_reduce_template(code_builder_t *builder, unsigned long dense_block_id);
bool is_supported_by_shared_memory_total_warp_reduce_template(sparse_struct_t *matrix, unsigned long dense_block_id);
// Dumps all data of the template; `output_dir` names the destination.
// `force_not_share_global_sort_index` defaults to false.
// NOTE(review): presumably it disables sharing of a global sort index between
// templates — confirm against the implementation, as well as the on-disk layout.
void store_template_data(shared_memory_total_warp_reduce_template_t *output_template, string output_dir, bool force_not_share_global_sort_index = false);
// The code_of_* functions below each return a string for the given template
// and dense block id.
// NOTE(review): judging by their names they emit generated source text (data
// struct, file-loading function, kernel, kernel call, host-to-GPU copy) —
// confirm against the implementation.
string code_of_template_data_struct(shared_memory_total_warp_reduce_template_t *output_template, unsigned long dense_block_id);
string code_of_read_template_data_from_file_func_define(shared_memory_total_warp_reduce_template_t *output_template, unsigned long dense_block_id, bool force_not_share_global_sort_index = false);
string code_of_template_kernal(shared_memory_total_warp_reduce_template_t *output_template, unsigned long dense_block_id);
string code_of_kernal_function_call(shared_memory_total_warp_reduce_template_t *output_template, unsigned long dense_block_id);
string code_of_write_template_data_to_gpu(shared_memory_total_warp_reduce_template_t *output_template, unsigned long dense_block_id, bool force_not_share_global_sort_index = false);
// Per-array compression entry points. Each one takes the template, a
// `need_check` flag and the requested compression `type`, and returns a bool.
// NOTE(review): presumably the return value reports whether the compression
// was applied and `need_check` requests validation first — confirm against
// the implementation.
bool compress_block_begin_warp_index_offset(shared_memory_total_warp_reduce_template_t *output_template, bool need_check, arr_compress_type type);
bool compress_row_offset_in_warp_tmp_result(shared_memory_total_warp_reduce_template_t *output_template, bool need_check, arr_compress_type type);
bool compress_block_first_row_index(shared_memory_total_warp_reduce_template_t *output_template, bool need_check, arr_compress_type type);
bool compress_global_warp_block_first_nz(shared_memory_total_warp_reduce_template_t *output_template, bool need_check, arr_compress_type type);
// Tries all of the compressions on the given template.
// NOTE(review): which arrays and which compression types are attempted is
// decided in the implementation — confirm there.
void try_all_compress(shared_memory_total_warp_reduce_template_t *output_template);
// Sets the number of threads that reduce the result of each row.
// Returns a bool.
// NOTE(review): presumably false means `row_reduce_thread_num` was rejected —
// confirm the accepted values against the implementation.
bool set_row_reduce_thread_num(shared_memory_total_warp_reduce_template_t *output_template, unsigned long row_reduce_thread_num);
#endif