Skip to content

Commit

Permalink
#4836: Add support for blocking conv activation in 2d systolic conv v…
Browse files Browse the repository at this point in the history
…ariant
  • Loading branch information
tt-nshanker committed Jan 25, 2024
1 parent cfe78bb commit 38659f6
Show file tree
Hide file tree
Showing 4 changed files with 287 additions and 158 deletions.
286 changes: 204 additions & 82 deletions tests/ttnn/unit_tests/operations/test_conv.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,66 +11,7 @@
import ttnn


@skip_for_wormhole_b0()
@pytest.mark.parametrize(
"batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array",
(
# unique convs in rn50 (complete list)
# first conv post folding and input_channels padding to tile width
(8, 64, 16, 115, 115, 4, 4, 1, 1, 0, 0, True),
# rn50 layer1
(8, 64, 64, 56, 56, 3, 3, 1, 1, 1, 1, True),
# rn50 layer2
(8, 128, 128, 56, 56, 3, 3, 2, 2, 1, 1, True),
(20, 128, 128, 56, 56, 3, 3, 2, 2, 1, 1, True),
(8, 128, 128, 28, 28, 3, 3, 1, 1, 1, 1, True),
# rn50 layer3
(8, 256, 256, 28, 28, 3, 3, 2, 2, 1, 1, False),
(8, 256, 256, 14, 14, 3, 3, 1, 1, 1, 1, False),
# rn50 layer4
(8, 512, 512, 14, 14, 3, 3, 2, 2, 1, 1, False),
(8, 512, 512, 7, 7, 3, 3, 1, 1, 1, 1, False),
# sd convs with HxW=32x32
# (1, 320, 320, 32, 32, 3, 3, 1, 1, 1, 1, False),
# (1, 320, 320, 32, 32, 3, 3, 2, 2, 1, 1, False),
# (1, 640, 640, 16, 16, 3, 3, 1, 1, 1, 1, False),
# (1, 640, 640, 16, 16, 3, 3, 2, 2, 1, 1, False),
# (1, 640, 640, 16, 16, 3, 3, 2, 2, 1, 1, False), # bfloat16 activations doesnt fit
# (1, 1280, 1280, 8, 8, 3, 3, 1, 1, 1, 1, False), # slighlty low pcc with 0.99689. bfloat16 weights doesnt fit
# (1, 1280, 1280, 8, 8, 3, 3, 2, 2, 1, 1, False), #fails to parallelize with sharding
# (1, 1280, 1280, 4, 4, 3, 3, 1, 1, 1, 1, False), #fails to parallelize with sharding
# (1, 1280, 1280, 16, 16, 3, 3, 1, 1, 1, 1, False), # slightly low pcc with 0.99698. bfloat16 weights doesnt fit
# (1, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False), # doesnt fit at all.. for all data types
# sd conv with HxW=512x512
# (1, 320, 320, 512, 512, 3, 3, 1, 1, 1, 1, False), # doesnt fit at all.. for all data types
# sd conv with HxW=256x256
# (1, 320, 320, 256, 256, 3, 3, 1, 1, 1, 1, False), # doesnt fit at all.. for all data types
# sd convs with HxW=64x64
# (1, 320, 320, 64, 64, 3, 3, 1, 1, 1, 1, False), # bfloat16 weights or activations doesnt fit
(1, 320, 320, 64, 64, 3, 3, 2, 2, 1, 1, False),
# (1, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False), # doesnt fit at all.. for all datatypes
# (1, 640, 640, 32, 32, 3, 3, 2, 2, 1, 1, False), # bfloat16 weights or activations doesnt fit
# (1, 640, 640, 32, 32, 3, 3, 2, 2, 1, 1, False), # bfloat16 activations doesnt fit
# (1, 1280, 1280, 16, 16, 3, 3, 1, 1, 1, 1, False), # slighlty low pcc with 0.99689. bfloat16 weights doesnt fit
# (1, 1280, 1280, 16, 16, 3, 3, 2, 2, 1, 1, False), #slightly low pcc 0.99697. bfloat16 doesnt fit.
# (1, 1280, 1280, 8, 8, 3, 3, 1, 1, 1, 1, False), # slighlty low pcc with 0.99689. bfloat16 weights doesnt fit
# (1, 1280, 1280, 32, 32, 3, 3, 1, 1, 1, 1, False), # not tested yet
# (1, 640, 640, 64, 64, 3, 3, 1, 1, 1, 1, False), # not tested yet
),
)
@pytest.mark.parametrize(
"weights_dtype",
[ttnn.bfloat16, ttnn.bfloat8_b],
ids=["weights_BFLOAT16", "weights_BFLOAT8_B"],
)
@pytest.mark.parametrize(
"activations_dtype",
[ttnn.bfloat16, ttnn.bfloat8_b],
ids=["activations_BFLOAT16", "activations_BFLOAT8_B"],
)
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.HiFi4, ttnn.MathFidelity.LoFi], ids=["HiFi4", "LoFi"])
def test_conv(
use_program_cache,
def run_conv(
device,
math_fidelity,
activations_dtype,
Expand All @@ -87,28 +28,8 @@ def test_conv(
pad_h,
pad_w,
use_1d_systolic_array,
config_override,
):
if input_channels == 16:
pytest.skip("These tests are hanging in interleaved_to_sharded after rebase. Issue: #4336")

if math_fidelity != ttnn.MathFidelity.LoFi:
pytest.skip(
"By default, only run tests with LoFi math for pipelines. For local unit testing, enable the other variants by uncommenting the skip here!"
)

if (
activations_dtype == ttnn.bfloat16
and batch_size == 20
and (
output_channels == 64
or (
stride_h == 2
and (output_channels == 256 or (output_channels == 128 and weights_dtype == ttnn.bfloat16))
)
)
):
pytest.skip("Skipping test because it won't fit in L1!")

torch.manual_seed(0)
conv_input_shape = [batch_size, input_channels, input_height, input_width]
conv_weight_shape = [output_channels, input_channels, filter_height, filter_width]
Expand Down Expand Up @@ -150,6 +71,7 @@ def test_conv(
bias=tt_bias_tensor,
math_fidelity=math_fidelity,
weights_dtype=weights_dtype,
conv_blocking_and_parallelization_config_override=config_override,
)

assert "conv" in reader_patterns_cache and "halo" in reader_patterns_cache
Expand All @@ -169,7 +91,207 @@ def test_conv(
torch_output_tensor = torch.permute(torch_output_tensor, (0, 3, 1, 2))

if math_fidelity == ttnn.MathFidelity.LoFi and activations_dtype == ttnn.bfloat8_b:
pcc = 0.998
pcc = 0.9969
else:
pcc = 0.998
assert_with_pcc(torch_output_tensor, torch_out_golden_tensor, pcc=pcc)


@skip_for_wormhole_b0()
@pytest.mark.parametrize(
"output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array",
(
# unique convs in rn50 (complete list)
# first conv post folding and input_channels padding to tile width
(64, 16, 115, 115, 4, 4, 1, 1, 0, 0, True),
# rn50 layer1
(64, 64, 56, 56, 3, 3, 1, 1, 1, 1, True),
# rn50 layer2
(128, 128, 56, 56, 3, 3, 2, 2, 1, 1, True),
(128, 128, 56, 56, 3, 3, 2, 2, 1, 1, True),
(128, 128, 28, 28, 3, 3, 1, 1, 1, 1, True),
# rn50 layer3
(256, 256, 28, 28, 3, 3, 2, 2, 1, 1, False),
(256, 256, 14, 14, 3, 3, 1, 1, 1, 1, False),
# rn50 layer4
(512, 512, 14, 14, 3, 3, 2, 2, 1, 1, False),
(512, 512, 7, 7, 3, 3, 1, 1, 1, 1, False),
),
)
@pytest.mark.parametrize(
"batch_size",
[8, 16, 20],
ids=["batch_size_8", "batch_size_16", "batch_size_20"],
)
@pytest.mark.parametrize(
"weights_dtype",
[ttnn.bfloat16, ttnn.bfloat8_b],
ids=["weights_BFLOAT16", "weights_BFLOAT8_B"],
)
@pytest.mark.parametrize(
"activations_dtype",
[ttnn.bfloat16, ttnn.bfloat8_b],
ids=["activations_BFLOAT16", "activations_BFLOAT8_B"],
)
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.HiFi4, ttnn.MathFidelity.LoFi], ids=["HiFi4", "LoFi"])
def test_resnet50_conv(
use_program_cache,
device,
math_fidelity,
activations_dtype,
weights_dtype,
batch_size,
output_channels,
input_channels,
input_height,
input_width,
filter_height,
filter_width,
stride_h,
stride_w,
pad_h,
pad_w,
use_1d_systolic_array,
):
if input_channels == 16:
pytest.skip("These tests are hanging in interleaved_to_sharded after rebase. Issue: #4336")

if math_fidelity != ttnn.MathFidelity.LoFi:
pytest.skip(
"By default, only run tests with LoFi math for pipelines. For local unit testing, enable the other variants by uncommenting the skip here!"
)

if (
activations_dtype == ttnn.bfloat16
and batch_size == 20
and (
output_channels == 64
or (
stride_h == 2
and (output_channels == 256 or (output_channels == 128 and weights_dtype == ttnn.bfloat16))
)
)
):
pytest.skip("Skipping test because it won't fit in L1!")

if (
input_channels >= 320
and (not input_channels == 512)
and (activations_dtype == ttnn.bfloat16 or weights_dtype == ttnn.bfloat16)
):
pytest.skip("Skipping tests with bfloat16 for sd convs")

run_conv(
device,
math_fidelity,
activations_dtype,
weights_dtype,
batch_size,
output_channels,
input_channels,
input_height,
input_width,
filter_height,
filter_width,
stride_h,
stride_w,
pad_h,
pad_w,
use_1d_systolic_array,
config_override=None,
)


@skip_for_wormhole_b0()
@pytest.mark.parametrize(
"batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override",
(
# sd convs with HxW=32x32
# (1, 320, 320, 32, 32, 3, 3, 1, 1, 1, 1, False, None),
# (1, 320, 320, 32, 32, 3, 3, 2, 2, 1, 1, False, None),
# (1, 640, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
# (1, 640, 640, 16, 16, 3, 3, 2, 2, 1, 1, False, None),
# (1, 640, 640, 16, 16, 3, 3, 2, 2, 1, 1, False, None), # bfloat16 activations doesnt fit
# (1, 1280, 1280, 8, 8, 3, 3, 1, 1, 1, 1, False, None), # slighlty low pcc with 0.99689. bfloat16 weights doesnt fit
# (1, 1280, 1280, 8, 8, 3, 3, 2, 2, 1, 1, False, None), #fails to parallelize with sharding
# (1, 1280, 1280, 4, 4, 3, 3, 1, 1, 1, 1, False, None), #fails to parallelize with sharding
# (1, 1280, 1280, 16, 16, 3, 3, 1, 1, 1, 1, False, None), # slightly low pcc with 0.99698. bfloat16 weights doesnt fit
# (1, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False, None), # doesnt fit at all.. for all data types
# sd convs with HxW=64x64 with batch size = 1
# (2, 32, 16, 64, 64, 3, 3, 1, 1, 1, 1, True, None), # not supported
(1, 320, 320, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}), # bfloat16 doesnt fit
(1, 320, 320, 64, 64, 3, 3, 2, 2, 1, 1, False, None),
(1, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}), #
(1, 640, 640, 32, 32, 3, 3, 2, 2, 1, 1, False, None), # bfloat16 doesnt fit
(1, 1280, 1280, 16, 16, 3, 3, 1, 1, 1, 1, False, None), # bfloat16 weights doesnt fit
(1, 1280, 1280, 16, 16, 3, 3, 2, 2, 1, 1, False, None), # bfloat16 doesnt fit.
(1, 1280, 1280, 8, 8, 3, 3, 1, 1, 1, 1, False, None), # bfloat16 weights doesnt fit
(1, 1280, 1280, 32, 32, 3, 3, 1, 1, 1, 1, False, None),
(1, 640, 640, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}),
# sd convs with HxW=64x64 with batch size=2
(2, 320, 320, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}),
(2, 320, 320, 64, 64, 3, 3, 2, 2, 1, 1, False, None), # fits with bfloat8_b
(2, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}),
(2, 640, 640, 32, 32, 3, 3, 2, 2, 1, 1, False, None), # bfloat16 doesnt fit
(2, 1280, 1280, 16, 16, 3, 3, 1, 1, 1, 1, False, None), # bfloat16 doesnt fit
(2, 1280, 1280, 16, 16, 3, 3, 2, 2, 1, 1, False, {"act_block_h": 32}), # bfloat16 doesnt fit
(2, 1280, 1280, 8, 8, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}),
(2, 1280, 1280, 32, 32, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}), # bfloat16 doesnt fit
(2, 640, 640, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}),
),
)
@pytest.mark.parametrize(
"weights_dtype",
[ttnn.bfloat8_b],
ids=["weights_BFLOAT8_B"],
)
@pytest.mark.parametrize(
"activations_dtype",
[ttnn.bfloat8_b],
ids=["activations_BFLOAT8_B"],
)
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.HiFi4, ttnn.MathFidelity.LoFi], ids=["HiFi4", "LoFi"])
def test_sd_conv(
use_program_cache,
device,
math_fidelity,
activations_dtype,
weights_dtype,
batch_size,
output_channels,
input_channels,
input_height,
input_width,
filter_height,
filter_width,
stride_h,
stride_w,
pad_h,
pad_w,
use_1d_systolic_array,
config_override,
):
if math_fidelity != ttnn.MathFidelity.LoFi:
pytest.skip(
"By default, only run tests with LoFi math for pipelines. For local unit testing, enable the other variants by uncommenting the skip here!"
)

run_conv(
device,
math_fidelity,
activations_dtype,
weights_dtype,
batch_size,
output_channels,
input_channels,
input_height,
input_width,
filter_height,
filter_width,
stride_h,
stride_w,
pad_h,
pad_w,
use_1d_systolic_array,
config_override,
)
Original file line number Diff line number Diff line change
Expand Up @@ -173,22 +173,23 @@ void MAIN {
#ifdef SFPU_OP_INIT_ACTIVATION
SFPU_OP_INIT_ACTIVATION
#endif
// in1 num blocks w is the outer loop. Output blocks are computed in col major order.
for(uint32_t in1_block_w_i = 0; in1_block_w_i < in1_num_blocks_w; ++in1_block_w_i) {

#ifdef PRE_TILIZE
unpack_reconfig_data_format_srca(in1_cb_id, in0_pretilize_cb_id);
for(uint32_t in0_block_h_i = 0; in0_block_h_i < in0_num_blocks_h; ++in0_block_h_i) {

col_major_to_row_major_init();
tilize_in(in0_pretilize_cb_id, in0_subblock_h, in0_block_w, in0_num_subblocks, tilized_in0_cb_id);
row_major_to_col_major_init();
#ifdef PRE_TILIZE
unpack_reconfig_data_format_srca(in1_cb_id, in0_pretilize_cb_id);

// TODO: unpack_reconfig_data_format_srca(in0_pretilize_cb_id, in1_cb_id) doesn't work if in0 is BFLOATB_B and in1 is BFLOAT16
mm_block_init_short();
unpack_reconfig_data_format_srca(in1_cb_id);
#endif
col_major_to_row_major_init();
tilize_in(in0_pretilize_cb_id, in0_subblock_h, in0_block_w, in0_num_subblocks, tilized_in0_cb_id);
row_major_to_col_major_init();

// TODO: unpack_reconfig_data_format_srca(in0_pretilize_cb_id, in1_cb_id) doesn't work if in0 is BFLOATB_B and in1 is BFLOAT16
mm_block_init_short();
unpack_reconfig_data_format_srca(in1_cb_id);
#endif

// in1 num blocks w is the outer loop. Output blocks are computed in col major order.
for(uint32_t in1_block_w_i = 0; in1_block_w_i < in1_num_blocks_w; ++in1_block_w_i) {
for(uint32_t in0_block_h_i = 0; in0_block_h_i < in0_num_blocks_h; ++in0_block_h_i) {
bool enable_reload = false;

#ifdef PACK_RELU
Expand Down Expand Up @@ -296,6 +297,7 @@ void MAIN {
PACK( cb_interface[matmul_partials_cb].fifo_wr_ptr = partials_cb_write_ptr );
}
}

cb_pop_front(mm_in0_cb_id, in0_block_num_tiles);
cb_pop_front(in1_cb_id, in1_block_num_tiles);
} // for in0_num_blocks_w
Expand Down Expand Up @@ -380,6 +382,7 @@ void MAIN {
}
}
#endif

} // for in0_num_blocks_h
#ifdef FUSE_BIAS
bias_block_offset += in1_block_w;
Expand Down
Loading

0 comments on commit 38659f6

Please sign in to comment.