Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add 2-CQ implementation for ResNet #9057

Merged
merged 6 commits into from
Jun 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
308 changes: 180 additions & 128 deletions models/demos/resnet/tests/test_metal_resnet50.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pytest
import tt_lib

from models.utility_functions import is_e75, skip_for_wormhole_b0
from models.utility_functions import is_e75, skip_for_wormhole_b0, divup

from models.demos.resnet.tt.metalResnetBlock50 import ResNet, Bottleneck
from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import (
Expand Down Expand Up @@ -117,26 +117,107 @@
}


@skip_for_wormhole_b0("This test is not supported on WHB0, please use the TTNN version.")
@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True)
@pytest.mark.parametrize("batch_size", [1, 2, 16, 20], ids=["batch_1", "batch_2", "batch_16", "batch_20"])
@pytest.mark.parametrize(
"weights_dtype",
[tt_lib.tensor.DataType.BFLOAT16, tt_lib.tensor.DataType.BFLOAT8_B],
ids=["weights_BFLOAT16", "weights_BFLOAT8_B"],
)
@pytest.mark.parametrize(
"activations_dtype",
[tt_lib.tensor.DataType.BFLOAT16, tt_lib.tensor.DataType.BFLOAT8_B],
ids=["activations_BFLOAT16", "activations_BFLOAT8_B"],
)
@pytest.mark.parametrize(
"math_fidelity",
[tt_lib.tensor.MathFidelity.HiFi4, tt_lib.tensor.MathFidelity.HiFi2, tt_lib.tensor.MathFidelity.LoFi],
ids=["HiFi4", "HiFi2", "LoFi"],
)
def test_run_resnet50_inference(
device, use_program_cache, batch_size, weights_dtype, activations_dtype, math_fidelity, imagenet_sample_input
def run_model(device, tt_image, tt_resnet50):
    """Run a single inference pass and synchronously read the result back to host.

    `device` is unused here; it is accepted so all run_* helpers share one signature.
    """
    return tt_resnet50(tt_image).cpu(blocking=True)


def run_2cq_model(device, tt_image, tt_resnet50):
    """Run inference using two hardware command queues: CQ1 writes inputs, CQ0 computes.

    The host image is copied into a persistent height-sharded DRAM tensor on
    command queue 1 while command queue 0 runs the model; an op/write event
    pair enforces the producer/consumer handshake so the input buffer is never
    overwritten while the model is still reading it.

    NOTE(review): assumes tt_resnet50 waits on ``write_event`` before reading
    its input and records ``op_event`` once the input is consumed — inferred
    from the handshake below; confirm against the model wrapper.

    Returns the host copy of the output from the last overlapped iteration.
    """
    input_shape = tt_image.get_legacy_shape()
    # Height-shard the flattened (volume / W) rows across the 8 cores of row 0;
    # each shard is [ceil(rows / 8), W].
    shard_spec = tt_lib.tensor.ShardSpec(
        tt_lib.tensor.CoreRangeSet(
            {
                tt_lib.tensor.CoreRange(
                    tt_lib.tensor.CoreCoord(0, 0),
                    tt_lib.tensor.CoreCoord(7, 0),
                )
            }
        ),
        [
            divup(tt_image.volume() // input_shape[3], 8),
            input_shape[3],
        ],
        tt_lib.tensor.ShardOrientation.ROW_MAJOR,
        False,
    )
    sharded_mem_config_DRAM = tt_lib.tensor.MemoryConfig(
        tt_lib.tensor.TensorMemoryLayout.HEIGHT_SHARDED, tt_lib.tensor.BufferType.DRAM, shard_spec
    )
    # Persistent device-side input buffer, reused by every iteration.
    tt_image_res = tt_lib.tensor.allocate_tensor_on_device(
        tt_image.shape, tt_image.dtype, tt_image.layout, device, sharded_mem_config_DRAM
    )
    op_event = tt_lib.device.CreateEvent()
    write_event = tt_lib.device.CreateEvent()
    # Initialize the op event so we can write
    tt_lib.device.RecordEvent(device, 0, op_event)

    # Warm-up iteration (compiles ops); blocking readback so it fully completes.
    tt_lib.device.WaitForEvent(device, 1, op_event)
    tt_lib.tensor.write_tensor(tt_image, tt_image_res, 1)
    tt_lib.device.RecordEvent(device, 1, write_event)
    _ = tt_resnet50(tt_image_res, write_event, op_event).cpu(blocking=True)

    # Test overlapping write
    outputs = []
    # was `for iter in range(0, 2)`: renamed to avoid shadowing builtin `iter`
    for _ in range(2):
        tt_lib.device.WaitForEvent(device, 1, op_event)
        tt_lib.tensor.write_tensor(tt_image, tt_image_res, 1)
        tt_lib.device.RecordEvent(device, 1, write_event)
        outputs.append(tt_resnet50(tt_image_res, write_event, op_event).cpu(blocking=False))
    tt_lib.device.Synchronize(device)
    return outputs[1]


def run_trace_model(device, tt_image, tt_resnet50):
    """Capture one model invocation as a device trace and replay it.

    Allocates a persistent height-sharded DRAM input buffer, runs once to
    compile the ops, records a trace of a second run, then replays that trace
    against a freshly written input and returns the traced output on host.
    """
    image_shape = tt_image.get_legacy_shape()
    # 8 cores in row 0, each holding [ceil(rows / 8), W] of the flattened image.
    core_grid = tt_lib.tensor.CoreRangeSet(
        {
            tt_lib.tensor.CoreRange(
                tt_lib.tensor.CoreCoord(0, 0),
                tt_lib.tensor.CoreCoord(7, 0),
            )
        }
    )
    shard_shape = [
        divup(tt_image.volume() // image_shape[3], 8),
        image_shape[3],
    ]
    spec = tt_lib.tensor.ShardSpec(core_grid, shard_shape, tt_lib.tensor.ShardOrientation.ROW_MAJOR, False)
    # NOTE(review): reviewer suggested using num_cores_to_core_range_set (tt_lib/ttnn)
    # for HEIGHT/WIDTH sharded tensors so the grid always matches the actual shard grid.
    dram_sharded_config = tt_lib.tensor.MemoryConfig(
        tt_lib.tensor.TensorMemoryLayout.HEIGHT_SHARDED, tt_lib.tensor.BufferType.DRAM, spec
    )
    device_input = tt_lib.tensor.allocate_tensor_on_device(
        tt_image.shape, tt_image.dtype, tt_image.layout, device, dram_sharded_config
    )
    tt_lib.tensor.write_tensor(tt_image, device_input)

    # Compile
    tt_resnet50(device_input)
    # Trace
    trace_id = tt_lib.device.BeginTraceCapture(device, 0, 1500000)
    traced_output = tt_resnet50(device_input)
    tt_lib.device.EndTraceCapture(device, 0, trace_id)

    tt_lib.tensor.write_tensor(tt_image, device_input)
    tt_lib.device.ReplayTrace(device, 0, trace_id, True)

    # Done with the trace, can deallocate the buffers now.
    tt_lib.device.ReleaseTrace(device, trace_id)

    return traced_output.cpu(blocking=True)


def run_resnet50_inference(
device,
use_program_cache,
batch_size,
weights_dtype,
activations_dtype,
math_fidelity,
imagenet_sample_input,
run_fn,
):
if is_e75(device):
pytest.skip("Resnet50 is not supported on E75")
Expand All @@ -159,8 +240,6 @@ def test_run_resnet50_inference(
with torch.no_grad():
torch.manual_seed(1234)

tt_lib.device.EnableMemoryReports()

torch_resnet50 = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
torch_resnet50.eval()

Expand All @@ -185,17 +264,8 @@ def test_run_resnet50_inference(

torch_output = torch_resnet50(image).unsqueeze(1).unsqueeze(1)
tt_image = tt_resnet50.preprocessing(image)
tt_output = tt_resnet50(tt_image)
tt_output = tt_output.cpu().to_torch().to(torch.float)

# # run again to measure end to end perf
# start_time = datetime.now()
# tt_output = tt_resnet50(image)
# end_time = datetime.now()
# diff = end_time - start_time
# logger.info("End to end time (microseconds))", diff.microseconds)
# throughput_fps = (float) (1000000 / diff.microseconds)
# logger.info("Throughput (fps)", throughput_fps)
tt_output = run_fn(device, tt_image, tt_resnet50)
tt_output = tt_output.to_torch().to(torch.float)

_, _, _, info = get_atol_rtol_pcc(torch_output, tt_output)
logger.info(info)
Expand Down Expand Up @@ -239,6 +309,72 @@ def test_run_resnet50_inference(
[tt_lib.tensor.MathFidelity.HiFi4, tt_lib.tensor.MathFidelity.HiFi2, tt_lib.tensor.MathFidelity.LoFi],
ids=["HiFi4", "HiFi2", "LoFi"],
)
def test_run_resnet50_inference(
    device, use_program_cache, batch_size, weights_dtype, activations_dtype, math_fidelity, imagenet_sample_input
):
    """Single-CQ ResNet50 inference sweep over batch size, dtypes and math fidelity."""
    args = (device, use_program_cache, batch_size, weights_dtype, activations_dtype, math_fidelity)
    run_resnet50_inference(*args, imagenet_sample_input, run_model)


@skip_for_wormhole_b0("This test is not supported on WHB0, please use the TTNN version.")
@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "num_hw_cqs": 2}], indirect=True)
@pytest.mark.parametrize("batch_size", [20], ids=["batch_20"])
@pytest.mark.parametrize(
    "weights_dtype", [tt_lib.tensor.DataType.BFLOAT8_B], ids=["weights_BFLOAT8_B"]
)
@pytest.mark.parametrize(
    "activations_dtype", [tt_lib.tensor.DataType.BFLOAT8_B], ids=["activations_BFLOAT8_B"]
)
@pytest.mark.parametrize(
    "math_fidelity", [tt_lib.tensor.MathFidelity.LoFi], ids=["LoFi"]
)
def test_run_resnet50_2cqs_inference(
    device, use_program_cache, batch_size, weights_dtype, activations_dtype, math_fidelity, imagenet_sample_input
):
    """ResNet50 inference driven through two hardware command queues (BFP8 / LoFi only)."""
    args = (device, use_program_cache, batch_size, weights_dtype, activations_dtype, math_fidelity)
    run_resnet50_inference(*args, imagenet_sample_input, run_2cq_model)


@skip_for_wormhole_b0("This test is not supported on WHB0, please use the TTNN version.")
@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "num_hw_cqs": 2}], indirect=True)
@pytest.mark.parametrize("batch_size", [20], ids=["batch_20"])
@pytest.mark.parametrize(
"weights_dtype",
[tt_lib.tensor.DataType.BFLOAT8_B],
ids=["weights_BFLOAT8_B"],
)
@pytest.mark.parametrize(
"activations_dtype",
[tt_lib.tensor.DataType.BFLOAT8_B],
ids=["activations_BFLOAT8_B"],
)
@pytest.mark.parametrize(
"math_fidelity",
[tt_lib.tensor.MathFidelity.LoFi],
ids=["LoFi"],
)
@pytest.mark.parametrize("enable_async", [True, False])
def test_run_resnet50_trace_inference(
device,
Expand All @@ -250,101 +386,17 @@ def test_run_resnet50_trace_inference(
imagenet_sample_input,
enable_async,
):
if is_e75(device):
pytest.skip("Resnet50 is not supported on E75")
device.enable_async(enable_async)
if batch_size > 8 and (
activations_dtype != tt_lib.tensor.DataType.BFLOAT8_B or weights_dtype != tt_lib.tensor.DataType.BFLOAT8_B
):
pytest.skip("Batch > 8 must be run fully bfp8")
if batch_size <= 2:
pytest.skip("batch 1 and 2 are not supported with sharded data")
image1 = imagenet_sample_input
image = image1
model_config = {
"MATH_FIDELITY": math_fidelity,
"WEIGHTS_DTYPE": weights_dtype,
"ACTIVATIONS_DTYPE": activations_dtype,
}
for i in range(batch_size - 1):
image = torch.cat((image, image1), dim=0)
with torch.no_grad():
torch.manual_seed(1234)

tt_lib.device.EnableMemoryReports()

torch_resnet50 = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
torch_resnet50.eval()

state_dict = torch_resnet50.state_dict()
storage_in_dram = False
sharded = False
if batch_size >= 8:
sharded = True
# run once to compile ops
tt_resnet50 = ResNet(
Bottleneck,
[3, 4, 6, 3],
device=device,
state_dict=state_dict,
base_address="",
fold_batchnorm=True,
storage_in_dram=storage_in_dram,
batch_size=batch_size,
model_config=model_config,
sharded=sharded,
)

torch_output = torch_resnet50(image).unsqueeze(1).unsqueeze(1)
interleaved_mem_config_DRAM = tt_lib.tensor.MemoryConfig(
memory_layout=tt_lib.tensor.TensorMemoryLayout.INTERLEAVED,
buffer_type=tt_lib.tensor.BufferType.DRAM,
)

tt_image_res = tt_resnet50.preprocessing(image).to(device, interleaved_mem_config_DRAM)

# Compile
tt_resnet50(tt_image_res)
# Trace
tid = tt_lib.device.BeginTraceCapture(device, 0, 1334880)
tt_output_res = tt_resnet50(tt_image_res)
tt_lib.device.EndTraceCapture(device, 0, tid)
run_resnet50_inference(
device,
use_program_cache,
batch_size,
weights_dtype,
activations_dtype,
math_fidelity,
imagenet_sample_input,
run_trace_model,
)

tt_lib.device.ReplayTrace(device, 0, tid, True)

tt_output = tt_output_res.cpu().to_torch().to(torch.float)

# # run again to measure end to end perf
# start_time = datetime.now()
# tt_output = tt_resnet50(image)
# end_time = datetime.now()
# diff = end_time - start_time
# logger.info("End to end time (microseconds))", diff.microseconds)
# throughput_fps = (float) (1000000 / diff.microseconds)
# logger.info("Throughput (fps)", throughput_fps)

_, _, _, info = get_atol_rtol_pcc(torch_output, tt_output)
logger.info(info)

valid_pcc = 1.0
if batch_size >= 8:
valid_pcc = golden_pcc[batch_size][
(model_config["MATH_FIDELITY"], model_config["WEIGHTS_DTYPE"], model_config["ACTIVATIONS_DTYPE"])
]
else:
if model_config["ACTIVATIONS_DTYPE"] == tt_lib.tensor.DataType.BFLOAT8_B:
if model_config["MATH_FIDELITY"] == tt_lib.tensor.MathFidelity.LoFi:
valid_pcc = 0.87
else:
valid_pcc = 0.94
else:
if model_config["MATH_FIDELITY"] == tt_lib.tensor.MathFidelity.LoFi:
valid_pcc = 0.93
else:
valid_pcc = 0.982
passing_pcc, _ = comp_pcc(torch_output, tt_output, pcc=valid_pcc)
assert passing_pcc
# assert passing # fails because of torch.allclose
# Done with the trace, can deallocate the buffers now.
tt_lib.device.ReleaseTrace(device, tid)
device.enable_async(False)
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

import pytest
import tt_lib

from models.demos.resnet.tests.test_metal_resnet50 import run_resnet50_inference, run_2cq_model
from models.utility_functions import skip_for_wormhole_b0


@skip_for_wormhole_b0("This test is not supported on WHB0, please use the TTNN version.")
@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576, "num_hw_cqs": 2}], indirect=True)
@pytest.mark.parametrize("batch_size", [20], ids=["batch_20"])
@pytest.mark.parametrize(
    "weights_dtype", [tt_lib.tensor.DataType.BFLOAT8_B], ids=["weights_BFLOAT8_B"]
)
@pytest.mark.parametrize(
    "activations_dtype", [tt_lib.tensor.DataType.BFLOAT8_B], ids=["activations_BFLOAT8_B"]
)
@pytest.mark.parametrize(
    "math_fidelity", [tt_lib.tensor.MathFidelity.LoFi], ids=["LoFi"]
)
def test_run_resnet50_2cqs_inference(
    device, use_program_cache, batch_size, weights_dtype, activations_dtype, math_fidelity, imagenet_sample_input
):
    """Standalone 2-CQ ResNet50 inference test reusing the shared runner from test_metal_resnet50."""
    args = (device, use_program_cache, batch_size, weights_dtype, activations_dtype, math_fidelity)
    run_resnet50_inference(*args, imagenet_sample_input, run_2cq_model)
Loading
Loading