[trivial] bug fix on recorder_v2.py script #2800

Status: Open. Wants to merge 2 commits into base: main.
2 changes: 2 additions & 0 deletions test/input_gen/genModelTests_v2.py
@@ -801,6 +801,7 @@ def forward(self, inputs, labels):
         label_dims=[(1, 10)],
         name="fc_mixed_training",
         optimizer=fc_mixed_training.getOptimizer(),
+        type="mixed",
     )

     inspect_file("fc_mixed_training.nnmodelgolden")
@@ -846,6 +847,7 @@ def forward(self, inputs, labels):
         label_dims=[(1, 1)],
         name="fc_mixed_training_nan_sgd",
         optimizer=fc_mixed_training_nan_sgd.getOptimizer(),
+        type="mixed",
     )

     # Function to check the created golden test file
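Note on these two call sites: the new type="mixed" argument routes record_v2 through the mixed-precision recording path added below in recorder_v2.py. A minimal sketch of such a call, assuming a torch.nn.Module subclass with a getOptimizer() helper as used in genModelTests_v2.py (the model class, iteration count, and input shape here are illustrative placeholders, not taken from this diff):

    # Hypothetical caller; record_v2 and inspect_file come from recorder_v2.py.
    fc_mixed_training = FCMixedTraining()  # assumed nn.Module with getOptimizer()
    record_v2(
        fc_mixed_training,
        iteration=2,            # placeholder iteration count
        input_dims=[(1, 3)],    # placeholder input shape
        label_dims=[(1, 10)],   # shape taken from the hunk above
        name="fc_mixed_training",
        optimizer=fc_mixed_training.getOptimizer(),
        type="mixed",           # new argument: select the AMP recording branch
    )
    inspect_file("fc_mixed_training.nnmodelgolden")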
139 changes: 84 additions & 55 deletions test/input_gen/recorder_v2.py
@@ -17,9 +17,9 @@

 from transLayer_v2 import params_translated

-if torch.__version__ != "1.9.1":
+if torch.__version__ != "2.4":
     print(
-        "the script was tested at version 1.9.1 it might not work if torch version is different"
+        "the script was tested at version 2.4 it might not work if torch version is different"
     )

 SEED = 1234
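A note on the version guard: torch.__version__ carries the full version string (for example "2.4.0" or "2.4.1+cu121"), so the exact comparison against "2.4" prints the warning even on a matching 2.4.x install. A more tolerant variant, sketched here as a suggestion rather than as part of this PR:

    import torch

    # Warn only when the installed torch is outside the tested 2.4 line.
    if not torch.__version__.startswith("2.4"):
        print(
            "the script was tested at version 2.4; it might not work "
            "if the torch version is different"
        )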
@@ -31,30 +31,31 @@


 def _get_writer(file):
-    def write_fn(items, type = 'float32'):
+    def write_fn(items, type="float32"):
         if not isinstance(items, (list, tuple)):
             items = [items]

         for item in items:
             print(item.numel(), " -0-----")
             print(item)
-            np.array([item.numel()], dtype='int32').tofile(file)
-            a=np.array(item.detach().cpu(), dtype=type)
+            np.array([item.numel()], dtype="int32").tofile(file)
+            a = np.array(item.detach().cpu(), dtype=type)
             a.tofile(file)
             print(a.dtype)

         return items

     return write_fn

+
 def _get_writer_mixed(file):
-    def write_fn(items, num_type = 'int32', type = 'float32'):
+    def write_fn(items, num_type="int32", type="float32"):
         if not isinstance(items, (list, tuple)):
             items = [items]

         for item in items:
             np.array([item.numel()], dtype=num_type).tofile(file)
-            a=np.array(item.detach().cpu(), dtype=type)
+            a = np.array(item.detach().cpu(), dtype=type)
             a.tofile(file)

         return items
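Both writers emit the same size-prefixed record layout: one integer element count, then the flattened tensor data; _get_writer_mixed merely makes the count's dtype and the payload dtype per-call parameters. Reading a single record back is then straightforward, and mirrors what inspect_file does further down. A sketch, with the file name and dtypes as assumptions:

    import numpy as np

    # Sketch: read one size-prefixed record produced by write_fn.
    with open("fc_mixed_training.nnmodelgolden", "rb") as f:
        count = int(np.fromfile(f, dtype="int32", count=1)[0])  # element count
        data = np.fromfile(f, dtype="float32", count=count)     # tensor payload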
@@ -71,7 +72,7 @@ def shape_to_np(shape, dtype=int):

     if not isinstance(dtype, list):
         dtype = [dtype] * len(shapes)
-    np_array = list([shape_to_np(s,t) for s,t in zip(shapes, dtype)])
+    np_array = list([shape_to_np(s, t) for s, t in zip(shapes, dtype)])
     return list([torch.tensor(t * scale) for t in np_array])


@@ -81,8 +82,18 @@ def shape_to_np(shape, dtype=int):
 # @param input_dims dimensions to record including batch (list of tuple)
 # @param label_dims dimensions to record including batch (list of tuple)
 # @param name golden name
-def record_v2(model, iteration, input_dims, label_dims, name, clip=False,
-              input_dtype=None, input_label_reader=None, optimizer=None):
+def record_v2(
+    model,
+    iteration,
+    input_dims,
+    label_dims,
+    name,
+    clip=False,
+    input_dtype=None,
+    input_label_reader=None,
+    optimizer=None,
+    type="default",
+):
     ## file format is as below
     # [<number of iteration(int)> <Iteration> <Iteration>...<Iteration>]
     # Each iteration contains
@@ -101,7 +112,9 @@ def record_iteration(write_fn):
         if input_label_reader != None:
             inputs, labels = input_label_reader(input_dims, label_dims, input_dtype)
         else:
-            inputs = _rand_like(input_dims, dtype=input_dtype if input_dtype is not None else float)
+            inputs = _rand_like(
+                input_dims, dtype=input_dtype if input_dtype is not None else float
+            )
             labels = _rand_like(label_dims, dtype=float)
         write_fn(inputs)
         write_fn(labels)
@@ -117,64 +130,81 @@ def record_iteration(write_fn):
             optimizer.step()

     def record_iteration_with_amp(write_fn, inputs, labels, is_nan, scaler):
-        model_= model.cuda()
+        model_ = model.cuda()

         print(inputs[0], " inputs inside")
-        output = model_(inputs[0], labels[0])
+        output = model_(inputs[0].cuda(), labels[0].cuda())

-        print("model output type: ",output.dtype)
+        print("model output type: ", output.dtype)

-        with autocast(device_type='cuda', dtype=torch.float16):
-            l=model_.loss(output, labels[0].to('cuda'))
+        with autocast(device_type="cuda", dtype=torch.float16):
+            l = model_.loss(output, labels[0].to("cuda"))

         optimizer.zero_grad()

         scaler.scale(l).backward()
         print("Gradient ---------------")
         for param in model_.parameters():
-            print (param.grad)
-            mask = torch.isnan(param.grad) or torch.isinf(param.grad)
-            check_nan = mask.int()
-            if check_nan.sum().item():
-                is_nan = True
-            else:
-                is_nan = False
-
+            print(param.grad)
+            is_nan = torch.any(torch.isnan(param.grad) | torch.isinf(param.grad))
             if is_nan:
                 print("nan or inf detected in gradient")
                 break

         if not is_nan:
             print("------------------------------- not nan")
-            write_fn(output,'int32','float32')
+            write_fn(output, "int32", "float32")
         return output, is_nan

-    with open(file_name, "wb") as f:
-        # write number of iterations
-        print("iteration : ", iteration)
-        np.array([iteration], dtype="int32").tofile(f)
-
-        write_fn = _get_writer_mixed(f)
-        for i in range(iteration):
-            if input_label_reader != None:
-                inputs, labels = input_label_reader(input_dims, label_dims, input_dtype)
-            else:
-                inputs = _rand_like(input_dims, dtype=input_dtype if input_dtype is not None else float)
-                labels = _rand_like(label_dims, dtype=float)
-            print("inputs ==============")
-            write_fn(inputs,'int32', 'float32')
-            print("labels ==============")
-            write_fn(labels, 'int32', 'float32')
-            is_nan = True;
-            print("=========================== ", i)
-            scaler = amp.GradScaler()
-            print("weights ==============")
-            write_fn(list(t for _, t in params_translated(model)),'int16','float16')
-            print("\n\n")
-            while(is_nan):
-                print( "before is_nan_", is_nan)
-                output,is_nan_ = record_iteration_with_amp(write_fn, inputs, labels, is_nan, scaler)
-                is_nan = is_nan_
-                print( "after is_nan_", is_nan)
-            scaler.step(optimizer)
-            scaler.update()
+    if type == "default":
+        with open(file_name, "wb") as f:
+            # write number of iterations
+            np.array([iteration], dtype="int32").tofile(f)
+
+            write_fn = _get_writer(f)
+            for _ in range(iteration):
+                record_iteration(write_fn)
+
+    elif type == "mixed":
+        with open(file_name, "wb") as f:
+            # write number of iterations
+            print("iteration : ", iteration)
+            np.array([iteration], dtype="int32").tofile(f)
+
+            write_fn = _get_writer_mixed(f)
+            for i in range(iteration):
+                if input_label_reader != None:
+                    inputs, labels = input_label_reader(
+                        input_dims, label_dims, input_dtype
+                    )
+                else:
+                    inputs = _rand_like(
+                        input_dims,
+                        dtype=input_dtype if input_dtype is not None else float,
+                    )
+                    labels = _rand_like(label_dims, dtype=float)
+                print("inputs ==============")
+                write_fn(inputs, "int32", "float32")
+                print("labels ==============")
+                write_fn(labels, "int32", "float32")
+                is_nan = True
+                print("=========================== ", i)
+                scaler = amp.GradScaler()
+                print("weights ==============")
+                write_fn(
+                    list(t for _, t in params_translated(model)), "int16", "float16"
+                )
+                print("\n\n")
+                while is_nan:
+                    print("before is_nan_", is_nan)
+                    output, is_nan_ = record_iteration_with_amp(
+                        write_fn, inputs, labels, is_nan, scaler
+                    )
+                    is_nan = is_nan_
+                    print("after is_nan_", is_nan)
+                scaler.step(optimizer)
+                scaler.update()


 ##
 # @brief inpsect if file is created correctly
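The mixed branch above keeps replaying an iteration until the scaled backward pass yields finite gradients, with amp.GradScaler managing the loss scale across attempts. For reference, the standard GradScaler idiom this recorder builds on, as a self-contained sketch (model, loss_fn, optimizer, and data are placeholders):

    import torch
    from torch import amp

    def amp_step(model, loss_fn, optimizer, scaler, x, y):
        # One mixed-precision step: forward/backward under float16 autocast.
        optimizer.zero_grad()
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            loss = loss_fn(model(x), y)
        scaler.scale(loss).backward()
        scaler.step(optimizer)  # skipped internally if grads hold inf/NaN
        scaler.update()         # shrinks the scale after a skipped step

    scaler = amp.GradScaler()   # create once, reuse across iterations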
@@ -194,4 +224,3 @@ def inspect_file(file_name, show_content=True):
             t = np.fromfile(f, dtype="float32", count=sz)
             if show_content:
                 print(t)
-
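One fix in record_iteration_with_amp deserves a closing note: the old check mask = torch.isnan(param.grad) or torch.isinf(param.grad) applies Python's `or` to tensors, which calls __bool__ and raises for any gradient with more than one element; the elementwise `|` combined with torch.any reduces correctly. A standalone demonstration:

    import torch

    g = torch.tensor([1.0, float("nan")])

    # Old pattern: `or` forces a single boolean, so this raises
    # "Boolean value of Tensor with more than one element is ambiguous".
    try:
        bad = torch.isnan(g) or torch.isinf(g)
    except RuntimeError as err:
        print(err)

    # Pattern from this PR: elementwise OR, then reduce to one flag.
    print(torch.any(torch.isnan(g) | torch.isinf(g)))  # tensor(True)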