Skip to content

Commit

Permalink
add results + configs
Browse files Browse the repository at this point in the history
  • Loading branch information
markus583 committed Jun 17, 2024
1 parent 08b34c0 commit d7af259
Show file tree
Hide file tree
Showing 323 changed files with 117,262 additions and 1,482 deletions.
22 changes: 22 additions & 0 deletions configs/SM/sat_sm_12l.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"output_dir": "sat_sm_12l",
"lim_lookahead": false,
"block_size": 256,
"no_sm_corruption": false,
"overwrite_output_dir": true,
"evaluation_strategy": "steps",
"eval_steps": 250,
"report_to": "wandb",
"learning_rate": 0.00003,
"warmup_steps": 500,
"per_device_train_batch_size": 128,
"per_device_eval_batch_size": 128,
"weight_decay": 0.01,
"push_to_hub": false,
"save_total_limit": 1,
"save_strategy": "steps",
"save_steps": 1000,
"load_best_model_at_end": false,
"max_steps": 20000,
"num_layers": 12
}
22 changes: 22 additions & 0 deletions configs/SM/sat_sm_12l_ll.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
    "output_dir": "sat_sm_12l_ll",
"lim_lookahead": true,
"block_size": 256,
"no_sm_corruption": false,
"overwrite_output_dir": true,
"evaluation_strategy": "steps",
"eval_steps": 250,
"report_to": "wandb",
"learning_rate": 0.00003,
"warmup_steps": 500,
"per_device_train_batch_size": 128,
"per_device_eval_batch_size": 128,
"weight_decay": 0.01,
"push_to_hub": false,
"save_total_limit": 1,
"save_strategy": "steps",
"save_steps": 1000,
"load_best_model_at_end": false,
"max_steps": 20000,
"num_layers": 12
}
23 changes: 23 additions & 0 deletions configs/SM/sat_sm_12l_no-pretraining.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
    "output_dir": "sat_sm_12l_no-pretraining",
"lim_lookahead": true,
"block_size": 256,
"no_sm_corruption": false,
    "without_pretraining": true,
"overwrite_output_dir": true,
"evaluation_strategy": "steps",
"eval_steps": 250,
"report_to": "wandb",
"learning_rate": 0.00003,
"warmup_steps": 500,
"per_device_train_batch_size": 128,
"per_device_eval_batch_size": 128,
"weight_decay": 0.01,
"push_to_hub": false,
"save_total_limit": 1,
"save_strategy": "steps",
"save_steps": 1000,
"load_best_model_at_end": false,
"max_steps": 20000,
"num_layers": 12
}
22 changes: 22 additions & 0 deletions configs/SM/sat_sm_12l_only_clean.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
    "output_dir": "sat_sm_12l_only_clean",
"lim_lookahead": true,
"block_size": 256,
"no_sm_corruption": true,
"overwrite_output_dir": true,
"evaluation_strategy": "steps",
"eval_steps": 250,
"report_to": "wandb",
"learning_rate": 0.00003,
"warmup_steps": 500,
"per_device_train_batch_size": 128,
"per_device_eval_batch_size": 128,
"weight_decay": 0.01,
"push_to_hub": false,
"save_total_limit": 1,
"save_strategy": "steps",
"save_steps": 1000,
"load_best_model_at_end": false,
"max_steps": 20000,
"num_layers": 12
}
22 changes: 22 additions & 0 deletions configs/SM/sat_sm_1l.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"output_dir": "sat_sm_1l",
"lim_lookahead": false,
"block_size": 256,
"no_sm_corruption": false,
"overwrite_output_dir": true,
"evaluation_strategy": "steps",
"eval_steps": 250,
"report_to": "wandb",
"learning_rate": 0.00003,
"warmup_steps": 500,
"per_device_train_batch_size": 128,
"per_device_eval_batch_size": 128,
"weight_decay": 0.01,
"push_to_hub": false,
"save_total_limit": 1,
"save_strategy": "steps",
"save_steps": 1000,
"load_best_model_at_end": false,
"max_steps": 20000,
"num_layers": 1
}
22 changes: 22 additions & 0 deletions configs/SM/sat_sm_3l.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"output_dir": "sat_sm_3l",
"lim_lookahead": false,
"block_size": 256,
"no_sm_corruption": false,
"overwrite_output_dir": true,
"evaluation_strategy": "steps",
"eval_steps": 250,
"report_to": "wandb",
"learning_rate": 0.00003,
"warmup_steps": 500,
"per_device_train_batch_size": 128,
"per_device_eval_batch_size": 128,
"weight_decay": 0.01,
"push_to_hub": false,
"save_total_limit": 1,
"save_strategy": "steps",
"save_steps": 1000,
"load_best_model_at_end": false,
"max_steps": 20000,
"num_layers": 3
}
22 changes: 22 additions & 0 deletions configs/SM/sat_sm_6l.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"output_dir": "sat_sm_6l",
"lim_lookahead": false,
"block_size": 256,
"no_sm_corruption": false,
"overwrite_output_dir": true,
"evaluation_strategy": "steps",
"eval_steps": 250,
"report_to": "wandb",
"learning_rate": 0.00003,
"warmup_steps": 500,
"per_device_train_batch_size": 128,
"per_device_eval_batch_size": 128,
"weight_decay": 0.01,
"push_to_hub": false,
"save_total_limit": 1,
"save_strategy": "steps",
"save_steps": 1000,
"load_best_model_at_end": false,
"max_steps": 20000,
"num_layers": 6
}
22 changes: 22 additions & 0 deletions configs/SM/sat_sm_9l.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
    "output_dir": "sat_sm_9l",
"lim_lookahead": false,
"block_size": 256,
"no_sm_corruption": false,
"overwrite_output_dir": true,
"evaluation_strategy": "steps",
"eval_steps": 250,
"report_to": "wandb",
"learning_rate": 0.00003,
"warmup_steps": 500,
"per_device_train_batch_size": 128,
"per_device_eval_batch_size": 128,
"weight_decay": 0.01,
"push_to_hub": false,
"save_total_limit": 1,
"save_strategy": "steps",
"save_steps": 1000,
"load_best_model_at_end": false,
"max_steps": 20000,
"num_layers": 9
}
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
40 changes: 0 additions & 40 deletions configs/canine_stratify_0.1_3layers_lookahead_128.json

This file was deleted.

6 changes: 3 additions & 3 deletions configs/peft/lora.json → configs/lora/lora_12l.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"model_name_or_path": "xlmr-3l-v3_look48_lc0.1-mix2",
"output_dir": "xlmr-3l-v4_LL_lora-v2_ep30_s10k",
"model_name_or_path": "segment-any-text/sat-12l",
"output_dir": "sat-12l-LL_lora",
"block_size": 256,
"eval_stride": 128,
"do_train": true,
Expand All @@ -16,7 +16,7 @@
"num_train_epochs": 30,
"logging_steps": 50,
"report_to": "wandb",
"wandb_project": "sentence-peft-v2",
"wandb_project": "sentence",
"save_steps": 100000000,
"remove_unused_columns": false,
"one_sample_per_line": false,
Expand Down
13 changes: 7 additions & 6 deletions configs/peft/adapter.json → configs/lora/lora_3l.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"model_name_or_path": "xlmr-normal-p-v3",
"output_dir": "xlmr-3l-v3_adapter_rf32_ep20_v2_100-1k-10k",
"model_name_or_path": "segment-any-text/sat-3l",
"output_dir": "sat-3l-LL_lora",
"block_size": 256,
"eval_stride": 128,
"do_train": true,
Expand All @@ -13,9 +13,10 @@
"preprocessing_num_workers": 1,
"learning_rate": 3e-4,
"fp16": false,
"num_train_epochs": 20,
"num_train_epochs": 30,
"logging_steps": 50,
"report_to": "wandb",
"wandb_project": "sentence",
"save_steps": 100000000,
"remove_unused_columns": false,
"one_sample_per_line": false,
Expand All @@ -29,9 +30,9 @@
"use_subwords": true,
"custom_punctuation_file": "punctuation_xlmr_unk.txt",
"log_level": "warning",
"adapter_config": "seq_bn[reduction_factor=32]",
"adapter_config": "lora[r=16,alpha=32,intermediate_lora=True]",
"weight_decay": 0.0,
"auxiliary_remove_prob": 0.0,
"do_process": false,
"n_train_steps": [100, 1000, 10000]
"train_adapter": true,
"subsample": 10000
}
42 changes: 42 additions & 0 deletions configs/lora/lora_lyrics.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{
"model_name_or_path": "segment-any-text/sat-12l",
"output_dir": "sat-12l-no-LL_lora_lyrics",
"block_size": 512,
"eval_stride": 256,
"do_train": true,
"do_eval": true,
"per_device_train_batch_size": 64,
"per_device_eval_batch_size": 32,
"gradient_accumulation_steps": 1,
"eval_accumulation_steps": 8,
"evaluation_strategy": "epoch",
"dataloader_num_workers": 1,
"preprocessing_num_workers": 1,
"learning_rate": 3e-4,
"fp16": false,
"num_train_epochs": 30,
"logging_steps": 50,
"report_to": "wandb",
"wandb_project": "sentence",
"save_steps": 100000000,
"remove_unused_columns": false,
"one_sample_per_line": true,
"do_sentence_training": true,
"do_auxiliary_training": false,
"warmup_ratio": 0.1,
"non_punctuation_sample_ratio": null,
"prediction_loss_only": true,
"use_auxiliary": true,
"ddp_timeout": 3600,
"use_subwords": true,
"custom_punctuation_file": "punctuation_xlmr_unk.txt",
"log_level": "warning",
"adapter_config": "lora[r=16,alpha=32,intermediate_lora=True]",
"weight_decay": 0.0,
"auxiliary_remove_prob": 0.0,
"text_path": "data/lyrics.pth",
"skip_eval_loss": false,
"shuffle": false,
"train_adapter": true,
"subsample": 10000
}
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
{
"model_name_or_path": "xlm-roberta-base",
"output_dir": "xlm-roberta-base_lora-v2_ep30_mldbW-verses_bs512",
"output_dir": "xlmr-12l_lora_lyrics",
"block_size": 512,
"eval_stride": 256,
"do_train": true,
"do_eval": true,
"per_device_train_batch_size": 32,
"per_device_train_batch_size": 64,
"per_device_eval_batch_size": 32,
"gradient_accumulation_steps": 2,
"gradient_accumulation_steps": 1,
"eval_accumulation_steps": 8,
"evaluation_strategy": "epoch",
"dataloader_num_workers": 1,
Expand All @@ -16,7 +17,7 @@
"num_train_epochs": 30,
"logging_steps": 50,
"report_to": "wandb",
"wandb_project": "lyrics-peft",
"wandb_project": "sentence",
"save_steps": 100000000,
"remove_unused_columns": false,
"one_sample_per_line": true,
Expand All @@ -25,17 +26,17 @@
"warmup_ratio": 0.1,
"non_punctuation_sample_ratio": null,
"prediction_loss_only": true,
"use_auxiliary": false,
"use_auxiliary": true,
"ddp_timeout": 3600,
"use_subwords": true,
"custom_punctuation_file": "punctuation_xlmr_unk.txt",
"log_level": "warning",
"adapter_config": "lora[r=16,alpha=32,intermediate_lora=True]",
"weight_decay": 0.0,
"auxiliary_remove_prob": 0.0,
"text_path": "data/all_data_11_05-lyrics.pth",
"text_path": "data/lyrics.pth",
"skip_eval_loss": false,
"shuffle": false,
"train_adapter": true,
"subsample": null
"subsample": 10000
}
Loading

0 comments on commit d7af259

Please sign in to comment.