Commit ff68aed

#0: squash later
tt-nshanker committed Jan 25, 2024
1 parent 33878e5 commit ff68aed
Showing 3 changed files with 205 additions and 126 deletions.
321 changes: 201 additions & 120 deletions tests/ttnn/unit_tests/test_conv.py
@@ -11,98 +11,7 @@
import ttnn


@skip_for_wormhole_b0()
@pytest.mark.parametrize(
"batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override",
(
# unique convs in rn50 (complete list)
# first conv post folding and input_channels padding to tile width
(8, 64, 16, 115, 115, 4, 4, 1, 1, 0, 0, True, None),
# rn50 layer1
(8, 64, 64, 56, 56, 3, 3, 1, 1, 1, 1, True, None),
# rn50 layer2
(8, 128, 128, 56, 56, 3, 3, 2, 2, 1, 1, True, None),
(20, 128, 128, 56, 56, 3, 3, 2, 2, 1, 1, True, None),
(8, 128, 128, 28, 28, 3, 3, 1, 1, 1, 1, True, None),
# rn50 layer3
(8, 256, 256, 28, 28, 3, 3, 2, 2, 1, 1, False, None),
(8, 256, 256, 14, 14, 3, 3, 1, 1, 1, 1, False, None),
# rn50 layer4
(8, 512, 512, 14, 14, 3, 3, 2, 2, 1, 1, False, None),
(8, 512, 512, 7, 7, 3, 3, 1, 1, 1, 1, False, None),
# rn50 with batch size = 16
(16, 64, 16, 115, 115, 4, 4, 1, 1, 0, 0, True, None),
# rn50 layer1
(16, 64, 64, 56, 56, 3, 3, 1, 1, 1, 1, True, None),
# rn50 layer2
(16, 128, 128, 56, 56, 3, 3, 2, 2, 1, 1, True, None),
(16, 128, 128, 28, 28, 3, 3, 1, 1, 1, 1, True, None),
# rn50 layer3
(16, 256, 256, 28, 28, 3, 3, 2, 2, 1, 1, False, None),
(16, 256, 256, 14, 14, 3, 3, 1, 1, 1, 1, False, None),
# rn50 layer4
(16, 512, 512, 14, 14, 3, 3, 2, 2, 1, 1, False, None),
(16, 512, 512, 7, 7, 3, 3, 1, 1, 1, 1, False, None),
# rn50 with batch size = 20
(20, 64, 16, 115, 115, 4, 4, 1, 1, 0, 0, True, None),
# rn50 layer1
(20, 64, 64, 56, 56, 3, 3, 1, 1, 1, 1, True, None),
# rn50 layer2
(20, 128, 128, 56, 56, 3, 3, 2, 2, 1, 1, True, None),
(20, 128, 128, 28, 28, 3, 3, 1, 1, 1, 1, True, None),
# rn50 layer3
(20, 256, 256, 28, 28, 3, 3, 2, 2, 1, 1, False, None),
(20, 256, 256, 14, 14, 3, 3, 1, 1, 1, 1, False, None),
# rn50 layer4
(20, 512, 512, 14, 14, 3, 3, 2, 2, 1, 1, False, None),
(20, 512, 512, 7, 7, 3, 3, 1, 1, 1, 1, False, None),
# sd convs with HxW=32x32
# (1, 320, 320, 32, 32, 3, 3, 1, 1, 1, 1, False, None),
# (1, 320, 320, 32, 32, 3, 3, 2, 2, 1, 1, False, None),
# (1, 640, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
# (1, 640, 640, 16, 16, 3, 3, 2, 2, 1, 1, False, None),
# (1, 640, 640, 16, 16, 3, 3, 2, 2, 1, 1, False, None), # bfloat16 activations don't fit
# (1, 1280, 1280, 8, 8, 3, 3, 1, 1, 1, 1, False, None), # slightly low pcc with 0.99689. bfloat16 weights don't fit
# (1, 1280, 1280, 8, 8, 3, 3, 2, 2, 1, 1, False, None), # fails to parallelize with sharding
# (1, 1280, 1280, 4, 4, 3, 3, 1, 1, 1, 1, False, None), # fails to parallelize with sharding
# (1, 1280, 1280, 16, 16, 3, 3, 1, 1, 1, 1, False, None), # slightly low pcc with 0.99698. bfloat16 weights don't fit
# (1, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False, None), # doesn't fit at all for any data type
# sd convs with HxW=64x64 with batch size = 1
# (2, 32, 16, 64, 64, 3, 3, 1, 1, 1, 1, True, None), # not supported
(1, 320, 320, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}), # bfloat16 doesn't fit
(1, 320, 320, 64, 64, 3, 3, 2, 2, 1, 1, False, None),
(1, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}),
(1, 640, 640, 32, 32, 3, 3, 2, 2, 1, 1, False, None), # bfloat16 doesn't fit
(1, 1280, 1280, 16, 16, 3, 3, 1, 1, 1, 1, False, None), # bfloat16 weights don't fit
(1, 1280, 1280, 16, 16, 3, 3, 2, 2, 1, 1, False, None), # bfloat16 doesn't fit.
(1, 1280, 1280, 8, 8, 3, 3, 1, 1, 1, 1, False, None), # bfloat16 weights don't fit
(1, 1280, 1280, 32, 32, 3, 3, 1, 1, 1, 1, False, None),
(1, 640, 640, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}),
# sd convs with HxW=64x64 with batch size=2
(2, 320, 320, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}),
(2, 320, 320, 64, 64, 3, 3, 2, 2, 1, 1, False, None), # fits with bfloat8_b
(2, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}),
(2, 640, 640, 32, 32, 3, 3, 2, 2, 1, 1, False, None), # bfloat16 doesn't fit
(2, 1280, 1280, 16, 16, 3, 3, 1, 1, 1, 1, False, None), # bfloat16 doesn't fit
(2, 1280, 1280, 16, 16, 3, 3, 2, 2, 1, 1, False, {"act_block_h": 32}), # bfloat16 doesn't fit
(2, 1280, 1280, 8, 8, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}),
(2, 1280, 1280, 32, 32, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}), # bfloat16 doesn't fit
(2, 640, 640, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}),
),
)
@pytest.mark.parametrize(
"weights_dtype",
[ttnn.bfloat16, ttnn.bfloat8_b],
ids=["weights_BFLOAT16", "weights_BFLOAT8_B"],
)
@pytest.mark.parametrize(
"activations_dtype",
[ttnn.bfloat16, ttnn.bfloat8_b],
ids=["activations_BFLOAT16", "activations_BFLOAT8_B"],
)
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.HiFi4, ttnn.MathFidelity.LoFi], ids=["HiFi4", "LoFi"])
def test_conv(
use_program_cache,
def run_conv(
device,
math_fidelity,
activations_dtype,
@@ -121,34 +30,6 @@ def test_conv(
use_1d_systolic_array,
config_override,
):
if input_channels == 16:
pytest.skip("These tests are hanging in interleaved_to_sharded after rebase. Issue: #4336")

if math_fidelity != ttnn.MathFidelity.LoFi:
pytest.skip(
"By default, only run tests with LoFi math for pipelines. For local unit testing, enable the other variants by uncommenting the skip here!"
)

if (
activations_dtype == ttnn.bfloat16
and batch_size == 20
and (
output_channels == 64
or (
stride_h == 2
and (output_channels == 256 or (output_channels == 128 and weights_dtype == ttnn.bfloat16))
)
)
):
pytest.skip("Skipping test because it won't fit in L1!")

if (
input_channels >= 320
and (not input_channels == 512)
and (activations_dtype == ttnn.bfloat16 or weights_dtype == ttnn.bfloat16)
):
pytest.skip("Skipping tests with bfloat16 for sd convs")

torch.manual_seed(0)
conv_input_shape = [batch_size, input_channels, input_height, input_width]
conv_weight_shape = [output_channels, input_channels, filter_height, filter_width]
@@ -214,3 +95,203 @@ def test_conv(
else:
pcc = 0.998
assert_with_pcc(torch_output_tensor, torch_out_golden_tensor, pcc=pcc)


@skip_for_wormhole_b0()
@pytest.mark.parametrize(
"output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array",
(
# unique convs in rn50 (complete list)
# first conv post folding and input_channels padding to tile width
(64, 16, 115, 115, 4, 4, 1, 1, 0, 0, True),
# rn50 layer1
(64, 64, 56, 56, 3, 3, 1, 1, 1, 1, True),
# rn50 layer2
(128, 128, 56, 56, 3, 3, 2, 2, 1, 1, True),
(128, 128, 56, 56, 3, 3, 2, 2, 1, 1, True),
(128, 128, 28, 28, 3, 3, 1, 1, 1, 1, True),
# rn50 layer3
(256, 256, 28, 28, 3, 3, 2, 2, 1, 1, False),
(256, 256, 14, 14, 3, 3, 1, 1, 1, 1, False),
# rn50 layer4
(512, 512, 14, 14, 3, 3, 2, 2, 1, 1, False),
(512, 512, 7, 7, 3, 3, 1, 1, 1, 1, False),
),
)
@pytest.mark.parametrize(
"batch_size",
[8, 16, 20],
ids=["batch_size_8", "batch_size_16", "batch_size_20"],
)
@pytest.mark.parametrize(
"weights_dtype",
[ttnn.bfloat16, ttnn.bfloat8_b],
ids=["weights_BFLOAT16", "weights_BFLOAT8_B"],
)
@pytest.mark.parametrize(
"activations_dtype",
[ttnn.bfloat16, ttnn.bfloat8_b],
ids=["activations_BFLOAT16", "activations_BFLOAT8_B"],
)
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.HiFi4, ttnn.MathFidelity.LoFi], ids=["HiFi4", "LoFi"])
def test_resnet50_conv(
use_program_cache,
device,
math_fidelity,
activations_dtype,
weights_dtype,
batch_size,
output_channels,
input_channels,
input_height,
input_width,
filter_height,
filter_width,
stride_h,
stride_w,
pad_h,
pad_w,
use_1d_systolic_array,
):
if input_channels == 16:
pytest.skip("These tests are hanging in interleaved_to_sharded after rebase. Issue: #4336")

if math_fidelity != ttnn.MathFidelity.LoFi:
pytest.skip(
"By default, only run tests with LoFi math for pipelines. For local unit testing, enable the other variants by uncommenting the skip here!"
)

if (
activations_dtype == ttnn.bfloat16
and batch_size == 20
and (
output_channels == 64
or (
stride_h == 2
and (output_channels == 256 or (output_channels == 128 and weights_dtype == ttnn.bfloat16))
)
)
):
pytest.skip("Skipping test because it won't fit in L1!")

if (
input_channels >= 320
and (not input_channels == 512)
and (activations_dtype == ttnn.bfloat16 or weights_dtype == ttnn.bfloat16)
):
pytest.skip("Skipping tests with bfloat16 for sd convs")

run_conv(
device,
math_fidelity,
activations_dtype,
weights_dtype,
batch_size,
output_channels,
input_channels,
input_height,
input_width,
filter_height,
filter_width,
stride_h,
stride_w,
pad_h,
pad_w,
use_1d_systolic_array,
config_override=None,
)


@skip_for_wormhole_b0()
@pytest.mark.parametrize(
"batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override",
(
# sd convs with HxW=32x32
# (1, 320, 320, 32, 32, 3, 3, 1, 1, 1, 1, False, None),
# (1, 320, 320, 32, 32, 3, 3, 2, 2, 1, 1, False, None),
# (1, 640, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
# (1, 640, 640, 16, 16, 3, 3, 2, 2, 1, 1, False, None),
# (1, 640, 640, 16, 16, 3, 3, 2, 2, 1, 1, False, None), # bfloat16 activations don't fit
# (1, 1280, 1280, 8, 8, 3, 3, 1, 1, 1, 1, False, None), # slightly low pcc with 0.99689. bfloat16 weights don't fit
# (1, 1280, 1280, 8, 8, 3, 3, 2, 2, 1, 1, False, None), # fails to parallelize with sharding
# (1, 1280, 1280, 4, 4, 3, 3, 1, 1, 1, 1, False, None), # fails to parallelize with sharding
# (1, 1280, 1280, 16, 16, 3, 3, 1, 1, 1, 1, False, None), # slightly low pcc with 0.99698. bfloat16 weights don't fit
# (1, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False, None), # doesn't fit at all for any data type
# sd convs with HxW=64x64 with batch size = 1
# (2, 32, 16, 64, 64, 3, 3, 1, 1, 1, 1, True, None), # not supported
(1, 320, 320, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}), # bfloat16 doesn't fit
(1, 320, 320, 64, 64, 3, 3, 2, 2, 1, 1, False, None),
(1, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}),
(1, 640, 640, 32, 32, 3, 3, 2, 2, 1, 1, False, None), # bfloat16 doesn't fit
(1, 1280, 1280, 16, 16, 3, 3, 1, 1, 1, 1, False, None), # bfloat16 weights don't fit
(1, 1280, 1280, 16, 16, 3, 3, 2, 2, 1, 1, False, None), # bfloat16 doesn't fit.
(1, 1280, 1280, 8, 8, 3, 3, 1, 1, 1, 1, False, None), # bfloat16 weights don't fit
(1, 1280, 1280, 32, 32, 3, 3, 1, 1, 1, 1, False, None),
(1, 640, 640, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}),
# sd convs with HxW=64x64 with batch size=2
(2, 320, 320, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}),
(2, 320, 320, 64, 64, 3, 3, 2, 2, 1, 1, False, None), # fits with bfloat8_b
(2, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}),
(2, 640, 640, 32, 32, 3, 3, 2, 2, 1, 1, False, None), # bfloat16 doesn't fit
(2, 1280, 1280, 16, 16, 3, 3, 1, 1, 1, 1, False, None), # bfloat16 doesn't fit
(2, 1280, 1280, 16, 16, 3, 3, 2, 2, 1, 1, False, {"act_block_h": 32}), # bfloat16 doesn't fit
(2, 1280, 1280, 8, 8, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 32}),
(2, 1280, 1280, 32, 32, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}), # bfloat16 doesn't fit
(2, 640, 640, 64, 64, 3, 3, 1, 1, 1, 1, False, {"act_block_h": 64}),
),
)
@pytest.mark.parametrize(
"weights_dtype",
[ttnn.bfloat8_b],
ids=["weights_BFLOAT8_B"],
)
@pytest.mark.parametrize(
"activations_dtype",
[ttnn.bfloat8_b],
ids=["activations_BFLOAT8_B"],
)
@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.HiFi4, ttnn.MathFidelity.LoFi], ids=["HiFi4", "LoFi"])
def test_sd_conv(
use_program_cache,
device,
math_fidelity,
activations_dtype,
weights_dtype,
batch_size,
output_channels,
input_channels,
input_height,
input_width,
filter_height,
filter_width,
stride_h,
stride_w,
pad_h,
pad_w,
use_1d_systolic_array,
config_override,
):
if math_fidelity != ttnn.MathFidelity.LoFi:
pytest.skip(
"By default, only run tests with LoFi math for pipelines. For local unit testing, enable the other variants by uncommenting the skip here!"
)

run_conv(
device,
math_fidelity,
activations_dtype,
weights_dtype,
batch_size,
output_channels,
input_channels,
input_height,
input_width,
filter_height,
filter_width,
stride_h,
stride_w,
pad_h,
pad_w,
use_1d_systolic_array,
config_override,
)
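The refactor above turns the old monolithic test_conv into a shared run_conv helper plus the thin parametrized wrappers test_resnet50_conv and test_sd_conv. A convenient side effect is that a single shape can be exercised outside the full sweeps, for example while bisecting a regression. The sketch below is illustrative only and not part of this commit: it assumes it is added to the same test_conv.py (so run_conv, ttnn, and the device fixture are already in scope), the test name is made up, and the shape is the rn50 layer1 case from the batch-size-8 sweep; the argument order mirrors the run_conv calls in the diff.

def test_single_rn50_layer1_conv(device):
    # Hypothetical one-off test (not in this commit): drive the shared run_conv
    # helper directly with one shape from the rn50 sweep above.
    run_conv(
        device,
        ttnn.MathFidelity.LoFi,  # math_fidelity
        ttnn.bfloat8_b,  # activations_dtype
        ttnn.bfloat8_b,  # weights_dtype
        8,  # batch_size
        64,  # output_channels
        64,  # input_channels
        56,  # input_height
        56,  # input_width
        3,  # filter_height
        3,  # filter_width
        1,  # stride_h
        1,  # stride_w
        1,  # pad_h
        1,  # pad_w
        True,  # use_1d_systolic_array
        config_override=None,
    )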
@@ -551,13 +551,14 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_(const Tens
 
     if (fully_buffer_weights) {
         num_weight_cb_tiles *= window_outer;
-    } else if (per_core_weight_matrix_width_ntiles < 8) {
+    } else if (per_core_weight_matrix_width_ntiles < 5 && per_core_out_matrix_height_ntiles < 22) {
         num_weight_cb_tiles = num_weight_cb_tiles * 2;
     }
-    if (conv_act_size_c / conv_act_c_blocks < 256) {
+    if (conv_act_size_c / conv_act_c_blocks < 160 && per_core_out_matrix_height_ntiles < 22) {
         num_act_cb_tiles = num_act_cb_tiles * 2; // double buffered
     }
+    cout << "here" << endl;
 
     uint32_t writer_output_block_num_tiles = out_block_h_ntiles * weight_block_w_ntiles;
 
     // TODO: Moving this function call to after kernel logic causes pcc fails
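The hunk above tightens when the sharded conv op double-buffers its weight and activation circular buffers: weights are now doubled only for per-core weight widths under 5 tiles (previously 8) and per-core output heights under 22 tiles, and activations only when the per-block channel depth (conv_act_size_c / conv_act_c_blocks) is under 160 (previously 256) with the same output-height bound. The restatement below is purely illustrative Python, not code from the repository; the helper name cb_tile_counts is made up, the variable names mirror the C++ diff, and the thresholds are copied verbatim from the changed lines.

def cb_tile_counts(
    num_weight_cb_tiles,
    num_act_cb_tiles,
    fully_buffer_weights,
    window_outer,
    per_core_weight_matrix_width_ntiles,
    per_core_out_matrix_height_ntiles,
    conv_act_size_c,
    conv_act_c_blocks,
):
    # Illustrative restatement of the C++ hunk above (names mirror the diff).
    if fully_buffer_weights:
        # Hold a copy of the weights for every outer window iteration.
        num_weight_cb_tiles *= window_outer
    elif per_core_weight_matrix_width_ntiles < 5 and per_core_out_matrix_height_ntiles < 22:
        # Double-buffer weights only for narrow per-core weight slices on short output blocks.
        num_weight_cb_tiles *= 2
    if conv_act_size_c // conv_act_c_blocks < 160 and per_core_out_matrix_height_ntiles < 22:
        # Double-buffer activations only when the per-block channel depth is small.
        num_act_cb_tiles *= 2
    return num_weight_cb_tiles, num_act_cb_tiles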
@@ -181,9 +181,6 @@ def determine_parallel_config(
         per_core_out_matrix_height_ntiles=per_core_out_matrix_height_ntiles,
         per_core_weight_matrix_width_ntiles=per_core_out_matrix_width_ntiles,
     )
-    print("grid_size=", grid_size)
-    print("per_core_out_matrix_height=", per_core_out_matrix_height_ntiles)
-    print("per_core_out_matrix_width=", per_core_out_matrix_width_ntiles)
     return conv_parallelization_config, num_cores_nhw

