Commit 457ba54: resolved #170
hvgazula committed Mar 11, 2024 (1 parent: 9dc12c4)

Showing 7 changed files with 61 additions and 24 deletions.
25 changes: 18 additions & 7 deletions Makefile
@@ -92,15 +92,15 @@ download-247-pickles:
gsutil -m rsync -x "^(?!.*676).*" gs://247-podcast-data/247-pickles/ results/676/

## settings for targets: generate-embeddings, concatenate-embeddings
%-embeddings: PRJCT_ID := tfs
%-embeddings: PRJCT_ID := podcast
# {tfs | podcast}
%-embeddings: SID := 625
%-embeddings: SID := 661
# {625 | 676 | 7170 | 798 | 661}
%-embeddings: CONV_IDS = $(shell seq 1 54)
%-embeddings: CONV_IDS = $(shell seq 1 1)
# {54 for 625 | 78 for 676 | 1 for 661 | 24 for 7170 | 15 for 798}
%-embeddings: PKL_IDENTIFIER := full
# {full | trimmed | binned}
%-embeddings: EMB_TYPE := openai/whisper-tiny.en
%-embeddings: EMB_TYPE := google/gemma-7b
# {"gpt2", "gpt2-large", "gpt2-xl", \
"EleutherAI/gpt-neo-125M", "EleutherAI/gpt-neo-1.3B", "EleutherAI/gpt-neo-2.7B", \
"EleutherAI/gpt-neox-20b", \
@@ -110,7 +110,7 @@ download-247-pickles:
"openai/whisper-tiny.en", "openai/whisper-base.en", "openai/whisper-medium.en", \
"openai/whisper-large", "openai/whisper-large-v2" \
}
%-embeddings: CNXT_LEN := 1
%-embeddings: CNXT_LEN := 8192
%-embeddings: LAYER := all
# {'all' for all layers | 'last' for the last layer | (list of) integer(s) >= 1}

@@ -141,7 +141,7 @@ download-247-pickles:
# Note: embeddings file is the same for all podcast subjects \
and hence only generate once using subject: 661
%-embeddings: JOB_NAME = $(subst /,-,$(EMB_TYPE))
%-embeddings: CMD = sbatch --job-name=$(SID)-$(JOB_NAME)-cnxt-$$cnxt_len submit.sh
%-embeddings: CMD = sbatch --job-name=perplexity-$(SID)-$(JOB_NAME)-cnxt-$(CNXT_LEN) submit.sh
# {echo | python | sbatch --job-name=$(SID)-$(JOB_NAME)-cnxt-$$cnxt_len submit.sh}


@@ -200,7 +200,18 @@ copy-embeddings:

# Download huggingface models to cache (before generating embeddings)
# This target needs to be run on the head node
cache-models: MODEL := openai/whisper-medium.en
cache-models: MODEL := causal
# {causal | seq2seq | mlm | or any model name specified in EMB_TYPE comments}
cache-models:
python -c "from scripts import tfsemb_download; tfsemb_download.download_tokenizers_and_models(\"$(MODEL)\")"
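
For reference, judging by the {causal | seq2seq | mlm} comment above, passing a class keyword downloads every model in the corresponding list. A sketch of the equivalent direct call (run from the repo root):

    from scripts import tfsemb_download

    # "causal" expands to every entry in CAUSAL_MODELS; a concrete name
    # such as "google/gemma-7b" fetches a single tokenizer/model pair.
    tfsemb_download.download_tokenizers_and_models("causal")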

perp-embeddings:
mkdir -p logs
for conv_id in $(CONV_IDS); do \
python scripts/tfsemb_perplexity.py \
--project-id $(PRJCT_ID) \
--pkl-identifier $(PKL_IDENTIFIER) \
--subject $(SID) \
--conversation-id $$conv_id \
--embedding-type $(EMB_TYPE); \
done;
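
With the settings above (PRJCT_ID=podcast, SID=661, CONV_IDS=1, EMB_TYPE=google/gemma-7b), each iteration of the new perp-embeddings loop expands to roughly:

    python scripts/tfsemb_perplexity.py --project-id podcast --pkl-identifier full --subject 661 --conversation-id 1 --embedding-type google/gemma-7b
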
2 changes: 1 addition & 1 deletion scripts/tfsemb_LMBase.py
@@ -61,7 +61,7 @@ def main():
base_df = add_vocab_columns(args, base_df, column="word")
else:
base_df = tokenize_and_explode(args, base_df)
# base_df = add_vocab_columns(args, base_df, column="token2word")
base_df = add_vocab_columns(args, base_df, column="token2word")

svpkl(base_df, args.base_df_file)

27 changes: 24 additions & 3 deletions scripts/tfsemb_download.py
@@ -1,4 +1,7 @@
import os
from transformers import BitsAndBytesConfig
import torch


from transformers import (
AutoConfig,
@@ -12,22 +15,29 @@
)

CAUSAL_MODELS = [
"distilgpt2",
"distilgpt2", # distilbert/distilgpt2
"gpt2",
"gpt2-medium",
"gpt2-large",
"gpt2-xl",
"EleutherAI/gpt-neo-125M",
"EleutherAI/gpt-neo-1.3B",
"EleutherAI/gpt-neo-2.7B",
"EleutherAI/gpt-neox-20b",
"EleutherAI/gpt-neox-20b", # quantized for A100-80GB
"facebook/opt-125m",
"facebook/opt-350m",
"facebook/opt-1.3b",
"facebook/opt-2.7b",
"facebook/opt-6.7b",
"facebook/opt-13b",
"facebook/opt-30b",
"facebook/opt-66b", # quantized for A100-80GB
"bigscience/bloom",
"meta-llama/Llama-2-7b-hf",
"meta-llama/Llama-2-13b-hf", #
"meta-llama/Llama-2-70b-hf", # quantized for A100-80GB
"google/gemma-2b",
"google/gemma-7b"
]

SEQ2SEQ_MODELS = ["facebook/blenderbot_small-90M", "facebook/blenderbot-3B"]
@@ -59,6 +69,15 @@
"mlm": (MLM_MODELS, AutoModelForMaskedLM),
}

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
bnb_4bit_compute_dtype=torch.bfloat16,
)


def clean_lm_model_name(item):
"""Remove unnecessary parts from the language model name.
@@ -145,6 +164,8 @@ def download_hf_model(
output_hidden_states=True,
cache_dir=cache_dir,
local_files_only=local_files_only,
device_map="auto",
quantization_config=bnb_config,
)

return model
@@ -162,7 +183,7 @@ def download_hf_tokenizer(

tokenizer = tokenizer_class.from_pretrained(
model_name,
add_prefix_space=True,
# add_prefix_space=True,
cache_dir=cache_dir,
local_files_only=local_files_only,
)
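
Taken together, the new BitsAndBytesConfig and the device_map/quantization_config arguments load the larger checkpoints as 4-bit NF4 weights. A minimal standalone sketch of that loading path (the model name is illustrative; requires the bitsandbytes and accelerate packages):

    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,                      # store weights in 4 bits
        bnb_4bit_quant_type="nf4",              # NormalFloat4 quantization
        bnb_4bit_use_double_quant=True,         # also quantize the quant constants
        bnb_4bit_compute_dtype=torch.bfloat16,  # dequantize to bf16 for matmuls
    )

    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-70b-hf",     # one of the "quantized for A100-80GB" entries
        output_hidden_states=True,
        device_map="auto",               # shard layers across the visible GPUs
        quantization_config=bnb_config,
    )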
8 changes: 4 additions & 4 deletions scripts/tfsemb_genemb_causal.py
@@ -70,7 +70,7 @@ def model_forward_pass(args, data_dl):
for batch_idx, batch in enumerate(data_dl):
if batch_idx % 10 == 0:
print(f"Batch ID: {batch_idx}")
batch = batch.to(args.device)
batch = batch.to(device)
model_output = model(batch)

logits = model_output.logits.cpu()
@@ -151,7 +151,7 @@ def process_extracted_logits(args, concat_logits, sentence_token_ids):
sti = torch.tensor(sentence_token_ids)
true_y = torch.cat([sti[0, 1:], sti[1:, -1]]).unsqueeze(-1)

prediction_probabilities = F.softmax(prediction_scores, dim=1)
prediction_probabilities = F.softmax(prediction_scores.float(), dim=1)

logp = np.log2(prediction_probabilities)
entropy = [None] + torch.sum(-prediction_probabilities * logp, dim=1).tolist()
@@ -177,12 +177,12 @@ def process_extracted_logits(args, concat_logits, sentence_token_ids):
if args.embedding_type in tfsemb_dwnld.CAUSAL_MODELS:
if k == 1:
predicted_words = [
args.tokenizer.convert_tokens_to_string(token)
args.tokenizer.convert_tokens_to_string([token])
for token in predicted_tokens
]
else:
predicted_words = [
[args.tokenizer.convert_tokens_to_string(token) for token in token_list]
[args.tokenizer.convert_tokens_to_string([token]) for token in token_list]
for token_list in predicted_tokens
]

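
The [token] wrapping in both hunks matters because convert_tokens_to_string expects a list of tokens. A quick illustration (the tokenizer choice is ours, not the commit's):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    tokens = tok.tokenize("hello world")        # ['hello', 'Ġworld']
    tok.convert_tokens_to_string([tokens[1]])   # ' world' -- list in, string out
    # SentencePiece tokenizers (Llama, Gemma) iterate over the argument, so a
    # bare string would be decoded character by character instead.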
3 changes: 2 additions & 1 deletion scripts/tfsemb_main.py
@@ -54,7 +54,7 @@ def check_token_is_root(args, df):
token_is_root_string = args.embedding_type.split("/")[-1] + "_token_is_root"
df[token_is_root_string] = (
df["word"]
== df["token"].apply(args.tokenizer.convert_tokens_to_string).str.strip()
== df["token"].apply(lambda x: [x]).apply(args.tokenizer.convert_tokens_to_string).str.strip()
)

return df
@@ -70,6 +70,7 @@ def convert_token_to_word(args, df):

df["token2word"] = (
df["token"]
.apply(lambda x: [x])
.apply(args.tokenizer.convert_tokens_to_string)
.str.strip()
.str.lower()
7 changes: 4 additions & 3 deletions scripts/tfsemb_perplexity.py
@@ -32,7 +32,8 @@ def main():
# else:
# raise Exception("Base dataframe does not exist")

base_df_path = args.base_df_file.replace("661/embeddings", "777/pickles/embeddings")
# base_df_path = args.base_df_file.replace("661/embeddings", "777/pickles/embeddings")
base_df_path = args.base_df_file
base_df = load_pickle(base_df_path)

utterance_df = select_conversation(args, base_df)
@@ -43,7 +44,7 @@ def main():
except:
max_length = args.model.config.max_position_embeddings

strides = [512, 1024, 2048, 4096]
strides = [512, 1024, 2048, 4096, 8192]
encodings = torch.tensor([tuple(utterance_df.token_id.tolist())])
seq_len = encodings.size(1)

@@ -62,7 +63,7 @@ def main():
target_ids[:, :-trg_len] = -100

with torch.no_grad():
model = model.to(device)
# model = model.to(device)
model.eval()
outputs = model(input_ids, labels=target_ids)

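
For context, the stride loop in this file follows the standard sliding-window perplexity recipe. A condensed sketch of that loop (names illustrative, inputs assumed on the model's device):

    import torch

    def sliding_window_ppl(model, encodings, max_length, stride):
        seq_len = encodings.size(1)
        nlls, prev_end = [], 0
        for begin in range(0, seq_len, stride):
            end = min(begin + max_length, seq_len)
            trg_len = end - prev_end               # only these tokens are scored
            input_ids = encodings[:, begin:end]
            target_ids = input_ids.clone()
            target_ids[:, :-trg_len] = -100        # -100 is ignored by the loss
            with torch.no_grad():
                out = model(input_ids, labels=target_ids)
            nlls.append(out.loss * trg_len)        # un-average the mean NLL
            prev_end = end
            if end == seq_len:
                break
        return torch.exp(torch.stack(nlls).sum() / prev_end)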
13 changes: 8 additions & 5 deletions submit.sh
@@ -1,12 +1,15 @@
#!/bin/bash
#SBATCH --time=02:10:00
#SBATCH --time=00:15:00
#SBATCH --mem=128GB
#SBATCH --gres=gpu:1
#SBATCH --gres=gpu:2
#SBATCH --constraint=gpu80
#SBATCH --nodes=1
##SBATCH --cpus-per-task=4
#SBATCH --open-mode=truncate
#SBATCH -o './logs/%x.out'
#SBATCH -e './logs/%x.err'
#SBATCH -o './perplexity_logs/%x.out'
#SBATCH -e './perplexity_logs/%x.err'
#SBATCH [email protected]
#SBATCH --mail-type=FAIL

if [[ "$HOSTNAME" == *"tiger"* ]]
then
@@ -34,7 +37,7 @@ if [[ -v SLURM_ARRAY_TASK_ID ]]
then
python "$@" --conversation-id $SLURM_ARRAY_TASK_ID
else
python "$@"
python -u "$@"
fi

end=$(date +%s)
