remove eval
priyakasimbeg committed Apr 24, 2024
1 parent 7477578 commit 516fb7c
Showing 1 changed file with 87 additions and 87 deletions.
174 changes: 87 additions & 87 deletions submission_runner.py
@@ -377,93 +377,93 @@ def train_once(
    train_state['is_time_remaining'] = (
        train_state['accumulated_submission_time'] < max_allowed_runtime_sec)
    # Check if submission is eligible for an untimed eval.
    if ((train_step_end_time - train_state['last_eval_time']) >=
        workload.eval_period_time_sec or train_state['training_complete']):
      with profiler.profile('Evaluation'):
        del batch
        _reset_cuda_mem()

        try:
          eval_start_time = get_time()
          latest_eval_result = workload.eval_model(global_eval_batch_size,
                                                   model_params,
                                                   model_state,
                                                   eval_rng,
                                                   data_dir,
                                                   imagenet_v2_data_dir,
                                                   global_step)
          # Check if targets reached.
          # Note that this is one of the stopping conditions for the length of
          # a training run. To score the run we only consider the time
          # to validation target retrospectively.
          train_state['validation_goal_reached'] = (
              workload.has_reached_validation_target(latest_eval_result) or
              train_state['validation_goal_reached'])
          train_state['test_goal_reached'] = (
              workload.has_reached_test_target(latest_eval_result) or
              train_state['test_goal_reached'])
          goals_reached = (
              train_state['validation_goal_reached'] and
              train_state['test_goal_reached'])
          # Save last eval time.
          eval_end_time = get_time()
          train_state['last_eval_time'] = eval_end_time

          # Accumulate eval time.
          train_state[
              'accumulated_eval_time'] += eval_end_time - eval_start_time

          # Add times to eval results for logging.
          latest_eval_result['score'] = (
              train_state['accumulated_submission_time'])
          latest_eval_result[
              'total_duration'] = eval_end_time - global_start_time
          latest_eval_result['accumulated_submission_time'] = train_state[
              'accumulated_submission_time']
          latest_eval_result['accumulated_eval_time'] = train_state[
              'accumulated_eval_time']
          latest_eval_result['accumulated_logging_time'] = train_state[
              'accumulated_logging_time']
          time_since_start = latest_eval_result['total_duration']
          logging.info(f'Time since start: {time_since_start:.2f}s, '
                       f'\tStep: {global_step}, \t{latest_eval_result}')
          eval_results.append((global_step, latest_eval_result))

          logging_start_time = get_time()

          if log_dir is not None and RANK == 0:
            metrics_logger.append_scalar_metrics(
                latest_eval_result,
                global_step=global_step,
                preemption_count=preemption_count,
                is_eval=True,
            )
            if save_checkpoints:
              checkpoint_utils.save_checkpoint(
                  framework=FLAGS.framework,
                  optimizer_state=optimizer_state,
                  model_params=model_params,
                  model_state=model_state,
                  train_state=train_state,
                  eval_results=eval_results,
                  global_step=global_step,
                  preemption_count=preemption_count,
                  checkpoint_dir=log_dir,
                  save_intermediate_checkpoints=FLAGS
                  .save_intermediate_checkpoints)

          logging_end_time = get_time()
          train_state['accumulated_logging_time'] += (
              logging_end_time - logging_start_time)

          _reset_cuda_mem()

        except RuntimeError as e:
          logging.exception(f'Eval step {global_step} error.\n')
          if 'out of memory' in str(e):
            logging.warning('Error: GPU out of memory during eval during step '
                            f'{global_step}, error : {str(e)}.')
            _reset_cuda_mem()
    # if ((train_step_end_time - train_state['last_eval_time']) >=
    #     workload.eval_period_time_sec or train_state['training_complete']):
    #   with profiler.profile('Evaluation'):
    #     del batch
    #     _reset_cuda_mem()

    #     try:
    #       eval_start_time = get_time()
    #       latest_eval_result = workload.eval_model(global_eval_batch_size,
    #                                                model_params,
    #                                                model_state,
    #                                                eval_rng,
    #                                                data_dir,
    #                                                imagenet_v2_data_dir,
    #                                                global_step)
    #       # Check if targets reached.
    #       # Note that this is one of the stopping conditions for the length of
    #       # a training run. To score the run we only consider the time
    #       # to validation target retrospectively.
    #       train_state['validation_goal_reached'] = (
    #           workload.has_reached_validation_target(latest_eval_result) or
    #           train_state['validation_goal_reached'])
    #       train_state['test_goal_reached'] = (
    #           workload.has_reached_test_target(latest_eval_result) or
    #           train_state['test_goal_reached'])
    #       goals_reached = (
    #           train_state['validation_goal_reached'] and
    #           train_state['test_goal_reached'])
    #       # Save last eval time.
    #       eval_end_time = get_time()
    #       train_state['last_eval_time'] = eval_end_time

    #       # Accumulate eval time.
    #       train_state[
    #           'accumulated_eval_time'] += eval_end_time - eval_start_time

    #       # Add times to eval results for logging.
    #       latest_eval_result['score'] = (
    #           train_state['accumulated_submission_time'])
    #       latest_eval_result[
    #           'total_duration'] = eval_end_time - global_start_time
    #       latest_eval_result['accumulated_submission_time'] = train_state[
    #           'accumulated_submission_time']
    #       latest_eval_result['accumulated_eval_time'] = train_state[
    #           'accumulated_eval_time']
    #       latest_eval_result['accumulated_logging_time'] = train_state[
    #           'accumulated_logging_time']
    #       time_since_start = latest_eval_result['total_duration']
    #       logging.info(f'Time since start: {time_since_start:.2f}s, '
    #                    f'\tStep: {global_step}, \t{latest_eval_result}')
    #       eval_results.append((global_step, latest_eval_result))

    #       logging_start_time = get_time()

    #       if log_dir is not None and RANK == 0:
    #         metrics_logger.append_scalar_metrics(
    #             latest_eval_result,
    #             global_step=global_step,
    #             preemption_count=preemption_count,
    #             is_eval=True,
    #         )
    #         if save_checkpoints:
    #           checkpoint_utils.save_checkpoint(
    #               framework=FLAGS.framework,
    #               optimizer_state=optimizer_state,
    #               model_params=model_params,
    #               model_state=model_state,
    #               train_state=train_state,
    #               eval_results=eval_results,
    #               global_step=global_step,
    #               preemption_count=preemption_count,
    #               checkpoint_dir=log_dir,
    #               save_intermediate_checkpoints=FLAGS
    #               .save_intermediate_checkpoints)

    #       logging_end_time = get_time()
    #       train_state['accumulated_logging_time'] += (
    #           logging_end_time - logging_start_time)

    #       _reset_cuda_mem()

    #     except RuntimeError as e:
    #       logging.exception(f'Eval step {global_step} error.\n')
    #       if 'out of memory' in str(e):
    #         logging.warning('Error: GPU out of memory during eval during step '
    #                         f'{global_step}, error : {str(e)}.')
    #         _reset_cuda_mem()

    train_state['last_step_end_time'] = get_time()
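
For reference, the block this commit comments out implements the untimed-eval gate in train_once: once eval_period_time_sec has elapsed since the last eval (or training is complete), workload.eval_model runs, and its wall-clock cost is added to accumulated_eval_time rather than to the scored accumulated_submission_time. Below is a minimal, self-contained sketch of that timing pattern; the helper names (now, run_one_eval, maybe_run_untimed_eval) and the sample values are hypothetical stand-ins, not code from this repository.

import time


def now() -> float:
  return time.monotonic()


def maybe_run_untimed_eval(train_state, eval_period_time_sec, run_one_eval,
                           training_complete=False):
  """Runs an eval if the eval period has elapsed; keeps its cost off the clock."""
  step_end_time = now()
  eval_due = (step_end_time - train_state['last_eval_time'] >=
              eval_period_time_sec)
  if not (eval_due or training_complete):
    return None
  eval_start_time = now()
  result = run_one_eval()  # hypothetical callback standing in for workload.eval_model
  eval_end_time = now()
  # Eval time is accumulated separately so it never counts toward the
  # submission's timed budget (accumulated_submission_time).
  train_state['last_eval_time'] = eval_end_time
  train_state['accumulated_eval_time'] += eval_end_time - eval_start_time
  result['score'] = train_state['accumulated_submission_time']
  return result


# Example usage with made-up numbers.
state = {
    'last_eval_time': now() - 120.0,  # pretend the last eval was 2 minutes ago
    'accumulated_eval_time': 0.0,
    'accumulated_submission_time': 345.6,
}
print(maybe_run_untimed_eval(state, 60.0, lambda: {'validation/accuracy': 0.7}))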

