diff --git a/Makefile b/Makefile
index 159b33b..b9821e8 100644
--- a/Makefile
+++ b/Makefile
@@ -92,15 +92,15 @@ download-247-pickles:
 	gsutil -m rsync -x "^(?!.*676).*" gs://247-podcast-data/247-pickles/ results/676/

 ## settings for targets: generate-embeddings, concatenate-embeddings
-%-embeddings: PRJCT_ID := tfs
+%-embeddings: PRJCT_ID := podcast
 # {tfs | podcast}
-%-embeddings: SID := 625
+%-embeddings: SID := 661
 # {625 | 676 | 7170 | 798 | 661}
-%-embeddings: CONV_IDS = $(shell seq 1 54)
+%-embeddings: CONV_IDS = $(shell seq 1 1)
 # {54 for 625 | 78 for 676 | 1 for 661 | 24 for 7170 | 15 for 798}
 %-embeddings: PKL_IDENTIFIER := full
 # {full | trimmed | binned}
-%-embeddings: EMB_TYPE := openai/whisper-tiny.en
+%-embeddings: EMB_TYPE := google/gemma-7b
 # {"gpt2", "gpt2-large", "gpt2-xl", \
 	"EleutherAI/gpt-neo-125M", "EleutherAI/gpt-neo-1.3B", "EleutherAI/gpt-neo-2.7B", \
 	"EleutherAI/gpt-neox-20b", \
@@ -110,7 +110,7 @@ download-247-pickles:
 	"openai/whisper-tiny.en", "openai/whisper-base.en", "openai/whisper-medium.en", \
 	"openai/whisper-large", "openai/whisper-large-v2" \
 	}
-%-embeddings: CNXT_LEN := 1
+%-embeddings: CNXT_LEN := 8192
 %-embeddings: LAYER := all
 # {'all' for all layers | 'last' for the last layer | (list of) integer(s) >= 1}

@@ -141,7 +141,7 @@ download-247-pickles:
 # Note: embeddings file is the same for all podcast subjects \
 	and hence only generate once using subject: 661
 %-embeddings: JOB_NAME = $(subst /,-,$(EMB_TYPE))
-%-embeddings: CMD = sbatch --job-name=$(SID)-$(JOB_NAME)-cnxt-$$cnxt_len submit.sh
+%-embeddings: CMD = sbatch --job-name=perplexity-$(SID)-$(JOB_NAME)-cnxt-$(CNXT_LEN) submit.sh
 # {echo | python | sbatch --job-name=$(SID)-$(JOB_NAME)-cnxt-$$cnxt_len submit.sh}

@@ -200,7 +200,18 @@ copy-embeddings:

 # Download huggingface models to cache (before generating embeddings)
 # This target needs to be run on the head node
-cache-models: MODEL := openai/whisper-medium.en
+cache-models: MODEL := causal
 # {causal | seq2seq | mlm | or any model name specified in EMB_TYPE comments}
 cache-models:
 	python -c "from scripts import tfsemb_download; tfsemb_download.download_tokenizers_and_models(\"$(MODEL)\")"
+
+perp-embeddings:
+	mkdir -p logs
+	for conv_id in $(CONV_IDS); do \
+		python scripts/tfsemb_perplexity.py \
+			--project-id $(PRJCT_ID) \
+			--pkl-identifier $(PKL_IDENTIFIER) \
+			--subject $(SID) \
+			--conversation-id $$conv_id \
+			--embedding-type $(EMB_TYPE); \
+	done;

diff --git a/scripts/tfsemb_LMBase.py b/scripts/tfsemb_LMBase.py
index 0d43cab..d1431f2 100644
--- a/scripts/tfsemb_LMBase.py
+++ b/scripts/tfsemb_LMBase.py
@@ -61,7 +61,7 @@ def main():
         base_df = add_vocab_columns(args, base_df, column="word")
     else:
         base_df = tokenize_and_explode(args, base_df)
-        # base_df = add_vocab_columns(args, base_df, column="token2word")
+        base_df = add_vocab_columns(args, base_df, column="token2word")

     svpkl(base_df, args.base_df_file)

diff --git a/scripts/tfsemb_download.py b/scripts/tfsemb_download.py
index e2a3468..729c961 100644
--- a/scripts/tfsemb_download.py
+++ b/scripts/tfsemb_download.py
@@ -1,4 +1,7 @@
 import os
+from transformers import BitsAndBytesConfig
+import torch
+

 from transformers import (
     AutoConfig,
@@ -12,7 +15,7 @@
 )
 CAUSAL_MODELS = [
-    "distilgpt2",
+    "distilgpt2",  # distilbert/distilgpt2
     "gpt2",
     "gpt2-medium",
     "gpt2-large",
@@ -20,14 +23,21 @@
     "EleutherAI/gpt-neo-125M",
     "EleutherAI/gpt-neo-1.3B",
     "EleutherAI/gpt-neo-2.7B",
-    "EleutherAI/gpt-neox-20b",
+    "EleutherAI/gpt-neox-20b",  # quantized for A100-80GB
     "facebook/opt-125m",
     "facebook/opt-350m",
     "facebook/opt-1.3b",
     "facebook/opt-2.7b",
     "facebook/opt-6.7b",
+    "facebook/opt-13b",
     "facebook/opt-30b",
+    "facebook/opt-66b",  # quantized for A100-80GB
     "bigscience/bloom",
+    "meta-llama/Llama-2-7b-hf",
+    "meta-llama/Llama-2-13b-hf",
+    "meta-llama/Llama-2-70b-hf",  # quantized for A100-80GB
+    "google/gemma-2b",
+    "google/gemma-7b"
 ]

 SEQ2SEQ_MODELS = ["facebook/blenderbot_small-90M", "facebook/blenderbot-3B"]
@@ -59,6 +69,15 @@
     "mlm": (MLM_MODELS, AutoModelForMaskedLM),
 }

+# set quantization configuration to load large model with less GPU memory
+# this requires the `bitsandbytes` library
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,
+)
+

 def clean_lm_model_name(item):
     """Remove unnecessary parts from the language model name.
@@ -145,6 +164,8 @@ def download_hf_model(
         output_hidden_states=True,
         cache_dir=cache_dir,
         local_files_only=local_files_only,
+        device_map="auto",
+        quantization_config=bnb_config,
     )

     return model
@@ -162,7 +183,7 @@ def download_hf_tokenizer(
     tokenizer = tokenizer_class.from_pretrained(
         model_name,
-        add_prefix_space=True,
+        # add_prefix_space=True,
         cache_dir=cache_dir,
         local_files_only=local_files_only,
     )
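The tfsemb_download.py change above is the core of this patch: the largest checkpoints (GPT-NeoX-20B, OPT-66B, Llama-2-70B) are loaded with 4-bit NF4 quantization so they fit on A100-80GB nodes. A minimal sketch of the loading path this enables, assuming `bitsandbytes` and `accelerate` are installed; the model name below is only an illustration, not something the patch pins down:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    # 4-bit NF4 storage with bf16 compute, matching the bnb_config added above
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-70b-hf",      # illustrative; any entry in CAUSAL_MODELS
        output_hidden_states=True,
        quantization_config=bnb_config,
        device_map="auto",                # shard the quantized weights across GPUs
    )
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-70b-hf")

Because `device_map="auto"` already places the weights on the available GPUs, downstream code should not move a model loaded this way with `model.to(device)`, which is presumably why that call is commented out in tfsemb_perplexity.py further down.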
"facebook/opt-1.3b", "facebook/opt-2.7b", "facebook/opt-6.7b", + "facebook/opt-13b", "facebook/opt-30b", + "facebook/opt-66b", # quantized for A100-80GB "bigscience/bloom", + "meta-llama/Llama-2-7b-hf", + "meta-llama/Llama-2-13b-hf", # + "meta-llama/Llama-2-70b-hf", # quantized for A100-80GB + "google/gemma-2b", + "google/gemma-7b" ] SEQ2SEQ_MODELS = ["facebook/blenderbot_small-90M", "facebook/blenderbot-3B"] @@ -59,6 +69,15 @@ "mlm": (MLM_MODELS, AutoModelForMaskedLM), } +# set quantization configuration to load large model with less GPU memory +# this requires the `bitsandbytes` library +bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=torch.bfloat16, +) + def clean_lm_model_name(item): """Remove unnecessary parts from the language model name. @@ -145,6 +164,8 @@ def download_hf_model( output_hidden_states=True, cache_dir=cache_dir, local_files_only=local_files_only, + device_map="auto", + quantization_config=bnb_config, ) return model @@ -162,7 +183,7 @@ def download_hf_tokenizer( tokenizer = tokenizer_class.from_pretrained( model_name, - add_prefix_space=True, + # add_prefix_space=True, cache_dir=cache_dir, local_files_only=local_files_only, ) diff --git a/scripts/tfsemb_genemb_causal.py b/scripts/tfsemb_genemb_causal.py index 89bf41f..3cfdc2d 100644 --- a/scripts/tfsemb_genemb_causal.py +++ b/scripts/tfsemb_genemb_causal.py @@ -70,7 +70,7 @@ def model_forward_pass(args, data_dl): for batch_idx, batch in enumerate(data_dl): if batch_idx % 10 == 0: print(f"Batch ID: {batch_idx}") - batch = batch.to(args.device) + batch = batch.to(device) model_output = model(batch) logits = model_output.logits.cpu() @@ -151,7 +151,7 @@ def process_extracted_logits(args, concat_logits, sentence_token_ids): sti = torch.tensor(sentence_token_ids) true_y = torch.cat([sti[0, 1:], sti[1:, -1]]).unsqueeze(-1) - prediction_probabilities = F.softmax(prediction_scores, dim=1) + prediction_probabilities = F.softmax(prediction_scores.float(), dim=1) logp = np.log2(prediction_probabilities) entropy = [None] + torch.sum(-prediction_probabilities * logp, dim=1).tolist() @@ -177,12 +177,12 @@ def process_extracted_logits(args, concat_logits, sentence_token_ids): if args.embedding_type in tfsemb_dwnld.CAUSAL_MODELS: if k == 1: predicted_words = [ - args.tokenizer.convert_tokens_to_string(token) + args.tokenizer.convert_tokens_to_string([token]) for token in predicted_tokens ] else: predicted_words = [ - [args.tokenizer.convert_tokens_to_string(token) for token in token_list] + [args.tokenizer.convert_tokens_to_string([token]) for token in token_list] for token_list in predicted_tokens ] diff --git a/scripts/tfsemb_main.py b/scripts/tfsemb_main.py index beb0f5f..1240d28 100644 --- a/scripts/tfsemb_main.py +++ b/scripts/tfsemb_main.py @@ -54,7 +54,7 @@ def check_token_is_root(args, df): token_is_root_string = args.embedding_type.split("/")[-1] + "_token_is_root" df[token_is_root_string] = ( df["word"] - == df["token"].apply(args.tokenizer.convert_tokens_to_string).str.strip() + == df["token"].apply(lambda x: [x]).apply(args.tokenizer.convert_tokens_to_string).str.strip() ) return df @@ -70,6 +70,7 @@ def convert_token_to_word(args, df): df["token2word"] = ( df["token"] + .apply(lambda x: [x]) .apply(args.tokenizer.convert_tokens_to_string) .str.strip() .str.lower() diff --git a/scripts/tfsemb_perplexity.py b/scripts/tfsemb_perplexity.py index def60bd..393b688 100644 --- a/scripts/tfsemb_perplexity.py +++ 
diff --git a/scripts/tfsemb_perplexity.py b/scripts/tfsemb_perplexity.py
index def60bd..393b688 100644
--- a/scripts/tfsemb_perplexity.py
+++ b/scripts/tfsemb_perplexity.py
@@ -32,7 +32,8 @@ def main():
     # else:
     #     raise Exception("Base dataframe does not exist")

-    base_df_path = args.base_df_file.replace("661/embeddings", "777/pickles/embeddings")
+    # base_df_path = args.base_df_file.replace("661/embeddings", "777/pickles/embeddings")
+    base_df_path = args.base_df_file
     base_df = load_pickle(base_df_path)

     utterance_df = select_conversation(args, base_df)
@@ -43,7 +44,7 @@ def main():
     except:
         max_length = args.model.config.max_position_embeddings

-    strides = [512, 1024, 2048, 4096]
+    strides = [512, 1024, 2048, 4096, 8192]
     encodings = torch.tensor([tuple(utterance_df.token_id.tolist())])
     seq_len = encodings.size(1)

@@ -62,7 +63,7 @@ def main():
             target_ids[:, :-trg_len] = -100

             with torch.no_grad():
-                model = model.to(device)
+                # model = model.to(device)
                 model.eval()
                 outputs = model(input_ids, labels=target_ids)

diff --git a/submit.sh b/submit.sh
index 7faf88b..2a58c4a 100644
--- a/submit.sh
+++ b/submit.sh
@@ -1,12 +1,15 @@
 #!/bin/bash
-#SBATCH --time=02:10:00
+#SBATCH --time=00:15:00
 #SBATCH --mem=128GB
-#SBATCH --gres=gpu:1
+#SBATCH --gres=gpu:2
+#SBATCH --constraint=gpu80
 #SBATCH --nodes=1
 ##SBATCH --cpus-per-task=4
 #SBATCH --open-mode=truncate
-#SBATCH -o './logs/%x.out'
-#SBATCH -e './logs/%x.err'
+#SBATCH -o './perplexity_logs/%x.out'
+#SBATCH -e './perplexity_logs/%x.err'
+#SBATCH --mail-user=hvgazula@umich.edu
+#SBATCH --mail-type=FAIL

 if [[ "$HOSTNAME" == *"tiger"* ]]
 then
@@ -34,7 +37,7 @@ if [[ -v SLURM_ARRAY_TASK_ID ]]
 then
     python "$@" --conversation-id $SLURM_ARRAY_TASK_ID
 else
-    python "$@"
+    python -u "$@"
 fi

 end=$(date +%s)
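For context, tfsemb_perplexity.py follows the standard strided-window evaluation for fixed-context causal LMs: each window feeds up to `max_length` tokens, but only the `trg_len` tokens not already scored in the previous window contribute to the loss, which is what the `target_ids[:, :-trg_len] = -100` masking does. A condensed sketch of that loop, in the style of the Hugging Face perplexity guide; variable and function names here are illustrative rather than the script's exact ones:

    import torch

    def sliding_window_ppl(model, encodings, max_length, stride, device="cuda"):
        """Perplexity of one long token sequence under a fixed-context causal LM."""
        seq_len = encodings.size(1)
        nlls, prev_end = [], 0
        for begin in range(0, seq_len, stride):
            end = min(begin + max_length, seq_len)
            trg_len = end - prev_end                  # tokens scored in this window
            input_ids = encodings[:, begin:end].to(device)
            target_ids = input_ids.clone()
            target_ids[:, :-trg_len] = -100           # re-used left context is not scored
            with torch.no_grad():
                out = model(input_ids, labels=target_ids)
            nlls.append(out.loss)                     # mean NLL over the scored tokens
            prev_end = end
            if end == seq_len:
                break
        return torch.exp(torch.stack(nlls).mean())

Each entry in `strides` trades compute for context: a stride equal to `max_length` scores every token exactly once with non-overlapping windows, while a smaller stride re-runs the model more often but gives most tokens a longer left context. The new 8192 value lines up with the `CNXT_LEN := 8192` setting in the Makefile.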