[WIP] New Limb Masking + Nonlinear DL + F10.7 Experiment #25

Merged · 12 commits · Oct 2, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -2,6 +2,7 @@ wandb
output
outputs
*.tar
+ notebooks/imgs_for_google_emb/*
notebooks/camera_ready/*/lightning_logs

# aux directories
163 changes: 163 additions & 0 deletions experiments/for_google_emb.yaml
@@ -0,0 +1,163 @@
# default.yaml

# MODEL SUMMARY
# | Name | Type | Params
# -------------------------------------------------------
# 0 | autoencoder | MaskedAutoencoderViT3D | 333 M
# -------------------------------------------------------
# 329 M Trainable params
# 4.7 M Non-trainable params
# 333 M Total params
# 1,335.838 Total estimated model params size (MB)

# general
log_level: 'DEBUG'
experiment:
name: "mae-helioprojected-2011" # generate random name in wandb
project: "sdofm"
task: "pretrain" # options: train, evaluate (not implemented)
model: "samae"
backbone_checkpoint: null
seed: 0
disable_cuda: false
resuming: false
wandb:
enable: true
entity: "fdlx"
group: "sdofm-phase1"
job_type: "pretrain"
tags: []
notes: ""
output_directory: "wandb_output"
log_model: "all" # can be True (final checkpoint), False (no checkpointing), or "all" (for all epochs)
gcp_storage: # this will checkpoint all epochs; perhaps clean up this config
enabled: true
bucket: "sdofm-checkpoints"
fold: null
evaluate: false # skip training and only evaluate (requires checkpoint to be set)
checkpoint: null # this is the wandb run_id of the checkpoint to load
device: null # this is set automatically using the disable_cuda flag and torch.cuda.is_available()
precision: '16' #-true' # (32, 64) for cuda, ('32-true', '16-true', 'bf16-true') for tpu
log_n_batches: 1000 # log every n training batches
save_results: true # save full results to file and wandb
accelerator: "auto" # options are "auto", "gpu", "tpu", "ipu", or "cpu"
profiler: null #'XLAProfiler' # 'XLAProfiler' # options are XLAProfiler/PyTorchProfiler Warning: XLA for TPUs only works on single world size
distributed:
enabled: true
world_size: "auto" # The "auto" option recognizes the machine you are on, and selects the appropriate number of accelerators.
log_every_n_steps: 50

# dataset configuration
data:
min_date: '2015-02-01 00:00:00.00' # NOT IMPLEMENTED # minimum is '2010-09-09 00:00:11.08'
max_date: '2015-05-31 23:59:59.99' # NOT IMPLEMENTED # maximum is '2023-05-26 06:36:08.072'
month_splits: # non selected months will form training set
train: [1] #,2,3,4,5,6,7,8,9,10]
val: [2]
test: [3,4]
holdout: []
num_workers: 16 # set appropriately for your machine
prefetch_factor: 3
num_frames: 1 # WARNING: This is only read for FINETUNING, model num_frames overrides in BACKBONE
drop_frame_dim: false
# output_directory: "wandb_output"
sdoml:
base_directory: "/mnt/sdoml"
sub_directory:
hmi: "HMI.zarr"
aia: "AIA.zarr"
eve: "EVE_legacy.zarr"
cache: "cache"
components: null # null to select all magnetic components ["Bx", "By", "Bz"]
wavelengths: null # null to select all wavelength channels ["131A","1600A","1700A","171A","193A","211A","304A","335A","94A"]
ions: null # null to select all ion channels ["C III", "Fe IX", "Fe VIII", "Fe X", "Fe XI", "Fe XII", "Fe XIII", "Fe XIV", "Fe XIX", "Fe XV", "Fe XVI", "Fe XVIII", "Fe XVI_2", "Fe XX", "Fe XX_2", "Fe XX_3", "H I", "H I_2", "H I_3", "He I", "He II", "He II_2", "He I_2", "Mg IX", "Mg X", "Mg X_2", "Ne VII", "Ne VIII", "O II", "O III", "O III_2", "O II_2", "O IV", "O IV_2", "O V", "O VI", "S XIV", "Si XII", "Si XII_2"]
frequency: '12min' # smallest is 12min
mask_with_hmi_threshold: null # None/null for no mask, float for threshold
feature_engineering:
enabled: true
dclass: 'HelioProjected'

# model configurations
model:
# PRETRAINERS
mae:
img_size: 512
patch_size: 16
num_frames: 1
tubelet_size: 1
in_chans: 9
embed_dim: 128
depth: 24
num_heads: 16
decoder_embed_dim: 512
decoder_depth: 8
decoder_num_heads: 16
mlp_ratio: 4.0
norm_layer: 'LayerNorm'
norm_pix_loss: False
masking_ratio: 0.75
samae:
# uses all parameters as in mae plus these
masking_type: "solar_aware" # 'random' or 'solar_aware'
active_region_mu_degs: 15.73
active_region_std_degs: 6.14
active_region_scale: 1.0
active_region_abs_lon_max_degs: 60
active_region_abs_lat_max_degs: 60
nvae:
use_se: true
res_dist: true
num_x_bits: 8
num_latent_scales: 3 # 5
num_groups_per_scale: 1 # 16
num_latent_per_group: 1 # 10
ada_groups: true
min_groups_per_scale: 1
num_channels_enc: 30
num_channels_dec: 30
num_preprocess_blocks: 2 # 1
num_preprocess_cells: 2
num_cell_per_cond_enc: 2
num_postprocess_blocks: 2 # 1
num_postprocess_cells: 2
num_cell_per_cond_dec: 2
num_mixture_dec: 1
num_nf: 2
kl_anneal_portion: 0.3
kl_const_portion: 0.0001
kl_const_coeff: 0.0001
# learning_rate: 1e-2
# weight_decay: 3e-4
weight_decay_norm_anneal: true
weight_decay_norm_init: 1.
weight_decay_norm: 1e-2

# FINE-TUNERS
degragation:
num_neck_filters: 32
output_dim: 1 # not sure why this is implemented for autocorrelation, should be a scalar
loss: "mse" # options: "mse", "heteroscedastic"
freeze_encoder: true

# ML optimization arguments:
opt:
loss: "mse" # options: "mae", "mse", "mape"
scheduler: "constant" #other options: "cosine", "plateau", "exp"
scheduler_warmup: 0
batch_size: 1
learning_rate: 0.0001
weight_decay: 3e-4 # 0.0
optimiser: "adam"
epochs: 2
patience: 2

# hydra configuration
# hydra:
# sweeper:
# params:
# model.mae.embed_dim: 256, 512
# model.mae.masking_ratio: 0.5, 0.75
# model.samae.masking_type: "random", "solar_aware"

hydra:
mode: RUN
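For reference, this file follows the repo's Hydra-style layout, so it can be composed and inspected with OmegaConf. A minimal sketch, assuming the `omegaconf` package and the file path added above; the override example is illustrative and not necessarily a flag the training script exposes:

```python
from omegaconf import OmegaConf

# Load the experiment config added in this PR and read a few nested keys.
cfg = OmegaConf.load("experiments/for_google_emb.yaml")
print(cfg.experiment.name)            # "mae-helioprojected-2011"
print(cfg.model.mae.embed_dim)        # 128
print(cfg.data.sdoml.base_directory)  # "/mnt/sdoml"

# Hydra-style dotlist overrides can be merged in the same spirit as the
# commented-out sweeper block above (illustrative only).
override = OmegaConf.from_dotlist(["model.mae.masking_ratio=0.5"])
cfg = OmegaConf.merge(cfg, override)
```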
8 changes: 6 additions & 2 deletions experiments/pretrain_32.2M_mae_HP_r512_e128_p16.yaml
@@ -37,7 +37,7 @@ experiment:
evaluate: false # skip training and only evaluate (requires checkpoint to be set)
checkpoint: null # this is the wandb run_id of the checkpoint to load
device: null # this is set automatically using the disable_cuda flag and torch.cuda.is_available()
- precision: '16' #-true' # (32, 64) for cuda, ('32-true', '16-true', 'bf16-true') for tpu
+ precision: '16-mixed' #-true' # (32, 64) for cuda, ('32-true', '16-true', 'bf16-true') for tpu
log_n_batches: 1000 # log every n training batches
save_results: true # save full results to file and wandb
accelerator: "auto" # options are "auto", "gpu", "tpu", "ipu", or "cpu"
@@ -73,6 +73,7 @@ data:
ions: null # null to select all ion channels ["C III", "Fe IX", "Fe VIII", "Fe X", "Fe XI", "Fe XII", "Fe XIII", "Fe XIV", "Fe XIX", "Fe XV", "Fe XVI", "Fe XVIII", "Fe XVI_2", "Fe XX", "Fe XX_2", "Fe XX_3", "H I", "H I_2", "H I_3", "He I", "He II", "He II_2", "He I_2", "Mg IX", "Mg X", "Mg X_2", "Ne VII", "Ne VIII", "O II", "O III", "O III_2", "O II_2", "O IV", "O IV_2", "O V", "O VI", "S XIV", "Si XII", "Si XII_2"]
frequency: '12min' # smallest is 12min
mask_with_hmi_threshold: null # None/null for no mask, float for threshold
+ apply_mask: false
feature_engineering:
enabled: true
dclass: 'HelioProjected'
@@ -148,9 +149,12 @@ model:
learning_rate: 0.0001
weight_decay: 3e-4 # 0.0
optimiser: "adam"
- epochs: 2
+ epochs: 100
patience: 2

+ misc:
+   limb_mask: false

# hydra configuration
# hydra:
# sweeper:
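The substantive changes in this file are `precision: '16-mixed'`, the new `apply_mask` flag, `epochs: 100`, and the new `misc.limb_mask` switch. As a rough sketch of how the trainer-facing fields typically map onto PyTorch Lightning (argument names are Lightning 2.x; the exact wiring in the training script may differ):

```python
import lightning.pytorch as pl

# Hypothetical mapping of the config fields changed in this file onto a
# Lightning Trainer; values mirror the YAML above.
trainer = pl.Trainer(
    precision="16-mixed",   # experiment.precision ('16' -> '16-mixed' here)
    accelerator="auto",     # experiment.accelerator
    devices="auto",         # experiment.distributed.world_size
    max_epochs=100,         # opt.epochs (2 -> 100 here)
    log_every_n_steps=50,   # experiment.log_every_n_steps
)
```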
167 changes: 167 additions & 0 deletions experiments/pretrain_32.2M_mae_log_r512_e128_p16.yaml
@@ -0,0 +1,167 @@
# default.yaml

# MODEL SUMMARY
# | Name | Type | Params
# -------------------------------------------------------
# 0 | autoencoder | MaskedAutoencoderViT3D | 333 M
# -------------------------------------------------------
# 329 M Trainable params
# 4.7 M Non-trainable params
# 333 M Total params
# 1,335.838 Total estimated model params size (MB)

# general
log_level: 'DEBUG'
experiment:
name: "mae-log-helioprojected-limbmasked-2011subset-r512-e128-p16" # generate random name in wandb
project: "sdofm"
task: "pretrain" # options: train, evaluate (not implemented)
model: "mae"
backbone_checkpoint: null
seed: 0
disable_cuda: false
resuming: false
wandb:
enable: true
entity: "fdlx"
group: "sdofm-phase1"
job_type: "pretrain"
tags: []
notes: ""
output_directory: "wandb_output"
log_model: "all" # can be True (final checkpoint), False (no checkpointing), or "all" (for all epochs)
gcp_storage: # this will checkpoint all epochs; perhaps clean up this config
enabled: true
bucket: "sdofm-checkpoints"
fold: null
evaluate: false # skip training and only evaluate (requires checkpoint to be set)
checkpoint: null # this is the wandb run_id of the checkpoint to load
device: null # this is set automatically using the disable_cuda flag and torch.cuda.is_available()
precision: '16-mixed' #-true' # (32, 64) for cuda, ('32-true', '16-true', 'bf16-true') for tpu
log_n_batches: 1000 # log every n training batches
save_results: true # save full results to file and wandb
accelerator: "auto" # options are "auto", "gpu", "tpu", "ipu", or "cpu"
profiler: null #'XLAProfiler' # 'XLAProfiler' # options are XLAProfiler/PyTorchProfiler Warning: XLA for TPUs only works on single world size
distributed:
enabled: true
world_size: "auto" # The "auto" option recognizes the machine you are on, and selects the appropriate number of accelerators.
log_every_n_steps: 50

# dataset configuration
data:
min_date: '2011-01-01 00:00:00.00' # NOT IMPLEMENTED # minimum is '2010-09-09 00:00:11.08'
max_date: '2011-03-31 23:59:59.99' # NOT IMPLEMENTED # maximum is '2023-05-26 06:36:08.072'
month_splits: # non selected months will form training set
train: [1] #,2,3,4,5,6,7,8,9,10]
val: [2]
test: [3]
holdout: []
num_workers: 16 # set appropriately for your machine
prefetch_factor: 3
num_frames: 1 # WARNING: This is only read for FINETUNING, model num_frames overrides in BACKBONE
drop_frame_dim: false
# output_directory: "wandb_output"
sdoml:
base_directory: "/mnt/sdoml"
sub_directory:
hmi: "HMI.zarr"
aia: "AIA.zarr"
eve: "EVE_legacy.zarr"
cache: "cache"
components: null # null to select all magnetic components ["Bx", "By", "Bz"]
wavelengths: null # null to select all wavelength channels ["131A","1600A","1700A","171A","193A","211A","304A","335A","94A"]
ions: null # null to select all ion channels ["C III", "Fe IX", "Fe VIII", "Fe X", "Fe XI", "Fe XII", "Fe XIII", "Fe XIV", "Fe XIX", "Fe XV", "Fe XVI", "Fe XVIII", "Fe XVI_2", "Fe XX", "Fe XX_2", "Fe XX_3", "H I", "H I_2", "H I_3", "He I", "He II", "He II_2", "He I_2", "Mg IX", "Mg X", "Mg X_2", "Ne VII", "Ne VIII", "O II", "O III", "O III_2", "O II_2", "O IV", "O IV_2", "O V", "O VI", "S XIV", "Si XII", "Si XII_2"]
frequency: '12min' # smallest is 12min
mask_with_hmi_threshold: null # None/null for no mask, float for threshold
apply_mask: true
feature_engineering:
enabled: true
dclass: 'Log'

# model configurations
model:
# PRETRAINERS
mae:
img_size: 512
patch_size: 16
num_frames: 1
tubelet_size: 1
in_chans: 9
embed_dim: 128
depth: 24
num_heads: 16
decoder_embed_dim: 512
decoder_depth: 8
decoder_num_heads: 16
mlp_ratio: 4.0
norm_layer: 'LayerNorm'
norm_pix_loss: False
masking_ratio: 0.75
samae:
# uses all parameters as in mae plus these
masking_type: "solar_aware" # 'random' or 'solar_aware'
active_region_mu_degs: 15.73
active_region_std_degs: 6.14
active_region_scale: 1.0
active_region_abs_lon_max_degs: 60
active_region_abs_lat_max_degs: 60
nvae:
use_se: true
res_dist: true
num_x_bits: 8
num_latent_scales: 3 # 5
num_groups_per_scale: 1 # 16
num_latent_per_group: 1 # 10
ada_groups: true
min_groups_per_scale: 1
num_channels_enc: 30
num_channels_dec: 30
num_preprocess_blocks: 2 # 1
num_preprocess_cells: 2
num_cell_per_cond_enc: 2
num_postprocess_blocks: 2 # 1
num_postprocess_cells: 2
num_cell_per_cond_dec: 2
num_mixture_dec: 1
num_nf: 2
kl_anneal_portion: 0.3
kl_const_portion: 0.0001
kl_const_coeff: 0.0001
# learning_rate: 1e-2
# weight_decay: 3e-4
weight_decay_norm_anneal: true
weight_decay_norm_init: 1.
weight_decay_norm: 1e-2

# FINE-TUNERS
degragation:
num_neck_filters: 32
output_dim: 1 # not sure why this is implemented for autocorrelation, should be a scalar
loss: "mse" # options: "mse", "heteroscedastic"
freeze_encoder: true

# ML optimization arguments:
opt:
loss: "mse" # options: "mae", "mse", "mape"
scheduler: "constant" #other options: "cosine", "plateau", "exp"
scheduler_warmup: 0
batch_size: 1
learning_rate: 0.0001
weight_decay: 3e-4 # 0.0
optimiser: "adam"
epochs: 2
patience: 2

misc:
limb_mask: true

# hydra configuration
# hydra:
# sweeper:
# params:
# model.mae.embed_dim: 256, 512
# model.mae.masking_ratio: 0.5, 0.75
# model.samae.masking_type: "random", "solar_aware"

hydra:
mode: RUN
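This variant enables `apply_mask: true` and `misc.limb_mask: true` and switches feature engineering to `dclass: 'Log'`. The repo's own masking and log-scaling code is not part of this diff; the sketch below is only a hypothetical illustration of the two operations (zeroing pixels beyond the solar limb, then log-compressing the dynamic range), assuming a `(channels, H, W)` layout:

```python
import numpy as np

def limb_mask_and_log(frame: np.ndarray, disk_radius_frac: float = 0.95) -> np.ndarray:
    """Hypothetical sketch: zero pixels outside the solar disk, then log-scale."""
    _, h, w = frame.shape
    yy, xx = np.mgrid[:h, :w]
    r = np.hypot(yy - (h - 1) / 2, xx - (w - 1) / 2)   # radial distance from centre
    on_disk = r <= disk_radius_frac * (w / 2)          # True inside the limb
    masked = frame * on_disk                           # broadcast over channels
    return np.log1p(np.clip(masked, 0, None))          # 'Log' feature engineering

# One 9-channel, 512x512 frame, matching model.mae (in_chans=9, img_size=512).
frame = np.random.rand(9, 512, 512).astype(np.float32)
out = limb_mask_and_log(frame)
```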