diff --git a/test/input_gen/genModelTests_v2.py b/test/input_gen/genModelTests_v2.py
index b9b03cebe..86d2c808b 100644
--- a/test/input_gen/genModelTests_v2.py
+++ b/test/input_gen/genModelTests_v2.py
@@ -801,6 +801,7 @@ def forward(self, inputs, labels):
         label_dims=[(1, 10)],
         name="fc_mixed_training",
         optimizer=fc_mixed_training.getOptimizer(),
+        type="mixed",
     )
     inspect_file("fc_mixed_training.nnmodelgolden")
 
@@ -846,6 +847,7 @@ def forward(self, inputs, labels):
         label_dims=[(1, 1)],
         name="fc_mixed_training_nan_sgd",
         optimizer=fc_mixed_training_nan_sgd.getOptimizer(),
+        type="mixed",
     )
 
 # Function to check the created golden test file
diff --git a/test/input_gen/recorder_v2.py b/test/input_gen/recorder_v2.py
index 6b8f42ff8..868b55d90 100644
--- a/test/input_gen/recorder_v2.py
+++ b/test/input_gen/recorder_v2.py
@@ -17,9 +17,9 @@
 
 from transLayer_v2 import params_translated
 
-if torch.__version__ != "1.9.1":
+if torch.__version__ != "2.4":
     print(
-        "the script was tested at version 1.9.1 it might not work if torch version is different"
+        "the script was tested at version 2.4 it might not work if torch version is different"
     )
 
 SEED = 1234
@@ -31,15 +31,15 @@
 
 
 def _get_writer(file):
-    def write_fn(items, type = 'float32'):
+    def write_fn(items, type="float32"):
         if not isinstance(items, (list, tuple)):
             items = [items]
 
         for item in items:
             print(item.numel(), " -0-----")
             print(item)
-            np.array([item.numel()], dtype='int32').tofile(file)
-            a=np.array(item.detach().cpu(), dtype=type)
+            np.array([item.numel()], dtype="int32").tofile(file)
+            a = np.array(item.detach().cpu(), dtype=type)
             a.tofile(file)
             print(a.dtype)
 
@@ -47,14 +47,15 @@ def write_fn(items, type = 'float32'):
 
     return write_fn
 
+
 def _get_writer_mixed(file):
-    def write_fn(items, num_type = 'int32', type = 'float32'):
+    def write_fn(items, num_type="int32", type="float32"):
         if not isinstance(items, (list, tuple)):
             items = [items]
 
         for item in items:
             np.array([item.numel()], dtype=num_type).tofile(file)
-            a=np.array(item.detach().cpu(), dtype=type)
+            a = np.array(item.detach().cpu(), dtype=type)
             a.tofile(file)
 
         return items
@@ -71,7 +72,7 @@ def shape_to_np(shape, dtype=int):
     if not isinstance(dtype, list):
         dtype = [dtype] * len(shapes)
 
-    np_array = list([shape_to_np(s,t) for s,t in zip(shapes, dtype)])
+    np_array = list([shape_to_np(s, t) for s, t in zip(shapes, dtype)])
     return list([torch.tensor(t * scale) for t in np_array])
 
 
@@ -81,8 +82,18 @@ def shape_to_np(shape, dtype=int):
 # @param input_dims dimensions to record including batch (list of tuple)
 # @param label_dims dimensions to record including batch (list of tuple)
 # @param name golden name
-def record_v2(model, iteration, input_dims, label_dims, name, clip=False,
-              input_dtype=None, input_label_reader=None, optimizer=None):
+def record_v2(
+    model,
+    iteration,
+    input_dims,
+    label_dims,
+    name,
+    clip=False,
+    input_dtype=None,
+    input_label_reader=None,
+    optimizer=None,
+    type="default",
+):
     ## file format is as below
     # [ ...]
     # Each iteration contains
@@ -101,7 +112,9 @@ def record_iteration(write_fn):
         if input_label_reader != None:
             inputs, labels = input_label_reader(input_dims, label_dims, input_dtype)
         else:
-            inputs = _rand_like(input_dims, dtype=input_dtype if input_dtype is not None else float)
+            inputs = _rand_like(
+                input_dims, dtype=input_dtype if input_dtype is not None else float
+            )
             labels = _rand_like(label_dims, dtype=float)
         write_fn(inputs)
         write_fn(labels)
@@ -117,64 +130,81 @@ def record_iteration(write_fn):
             optimizer.step()
 
     def record_iteration_with_amp(write_fn, inputs, labels, is_nan, scaler):
-        model_= model.cuda()
+        model_ = model.cuda()
         print(inputs[0], " inputs inside")
-        output = model_(inputs[0], labels[0])
+        output = model_(inputs[0].cuda(), labels[0].cuda())
 
-        print("model output type: ",output.dtype)
+        print("model output type: ", output.dtype)
 
-        with autocast(device_type='cuda', dtype=torch.float16):
-            l=model_.loss(output, labels[0].to('cuda'))
+        with autocast(device_type="cuda", dtype=torch.float16):
+            l = model_.loss(output, labels[0].to("cuda"))
 
         optimizer.zero_grad()
         scaler.scale(l).backward()
 
         print("Gradient ---------------")
         for param in model_.parameters():
-            print (param.grad)
-            mask = torch.isnan(param.grad) or torch.isinf(param.grad)
-            check_nan = mask.int()
-            if check_nan.sum().item():
-                is_nan = True
-            else:
-                is_nan = False
-
+            print(param.grad)
+            is_nan = torch.any(torch.isnan(param.grad) | torch.isinf(param.grad))
+            if is_nan:
+                print("nan or inf detected in gradient")
+                break
 
         if not is_nan:
             print("------------------------------- not nan")
-            write_fn(output,'int32','float32')
+            write_fn(output, "int32", "float32")
 
         return output, is_nan
 
-    with open(file_name, "wb") as f:
-        # write number of iterations
-        print("iteration : ", iteration)
-        np.array([iteration], dtype="int32").tofile(f)
-
-        write_fn = _get_writer_mixed(f)
-        for i in range(iteration):
-            if input_label_reader != None:
-                inputs, labels = input_label_reader(input_dims, label_dims, input_dtype)
-            else:
-                inputs = _rand_like(input_dims, dtype=input_dtype if input_dtype is not None else float)
-                labels = _rand_like(label_dims, dtype=float)
-            print("inputs ==============")
-            write_fn(inputs,'int32', 'float32')
-            print("labels ==============")
-            write_fn(labels, 'int32', 'float32')
-            is_nan = True;
-            print("=========================== ", i)
-            scaler = amp.GradScaler()
-            print("weights ==============")
-            write_fn(list(t for _, t in params_translated(model)),'int16','float16')
-            print("\n\n")
-            while(is_nan):
-                print( "before is_nan_", is_nan)
-                output,is_nan_ = record_iteration_with_amp(write_fn, inputs, labels, is_nan, scaler)
-                is_nan = is_nan_
-                print( "after is_nan_", is_nan)
-            scaler.step(optimizer)
-            scaler.update()
+    if type == "default":
+        with open(file_name, "wb") as f:
+            # write number of iterations
+            np.array([iteration], dtype="int32").tofile(f)
+
+            write_fn = _get_writer(f)
+            for _ in range(iteration):
+                record_iteration(write_fn)
+
+    elif type == "mixed":
+        with open(file_name, "wb") as f:
+            # write number of iterations
+            print("iteration : ", iteration)
+            np.array([iteration], dtype="int32").tofile(f)
+
+            write_fn = _get_writer_mixed(f)
+            for i in range(iteration):
+                if input_label_reader != None:
+                    inputs, labels = input_label_reader(
+                        input_dims, label_dims, input_dtype
+                    )
+                else:
+                    inputs = _rand_like(
+                        input_dims,
+                        dtype=input_dtype if input_dtype is not None else float,
+                    )
+                    labels = _rand_like(label_dims, dtype=float)
+                print("inputs ==============")
+                write_fn(inputs, "int32", "float32")
+                print("labels ==============")
+                write_fn(labels, "int32", "float32")
+                is_nan = True
+                print("=========================== ", i)
+                scaler = amp.GradScaler()
+                print("weights ==============")
+                write_fn(
+                    list(t for _, t in params_translated(model)), "int16", "float16"
+                )
+                print("\n\n")
+                while is_nan:
+                    print("before is_nan_", is_nan)
+                    output, is_nan_ = record_iteration_with_amp(
+                        write_fn, inputs, labels, is_nan, scaler
+                    )
+                    is_nan = is_nan_
+                    print("after is_nan_", is_nan)
+                scaler.step(optimizer)
+                scaler.update()
+
 
 ##
 # @brief inpsect if file is created correctly
@@ -194,4 +224,3 @@ def inspect_file(file_name, show_content=True):
             t = np.fromfile(f, dtype="float32", count=sz)
             if show_content:
                 print(t)
-