Commit 457ba54: resolved #170
hvgazula committed Mar 11, 2024 (1 parent: 9dc12c4)

Showing 7 changed files with 61 additions and 24 deletions.
25 changes: 18 additions & 7 deletions Makefile
@@ -92,15 +92,15 @@ download-247-pickles:
gsutil -m rsync -x "^(?!.*676).*" gs://247-podcast-data/247-pickles/ results/676/

## settings for targets: generate-embeddings, concatenate-embeddings
%-embeddings: PRJCT_ID := tfs
%-embeddings: PRJCT_ID := podcast
# {tfs | podcast}
%-embeddings: SID := 625
%-embeddings: SID := 661
# {625 | 676 | 7170 | 798 | 661}
%-embeddings: CONV_IDS = $(shell seq 1 54)
%-embeddings: CONV_IDS = $(shell seq 1 1)
# {54 for 625 | 78 for 676 | 1 for 661 | 24 for 7170 | 15 for 798}
%-embeddings: PKL_IDENTIFIER := full
# {full | trimmed | binned}
%-embeddings: EMB_TYPE := openai/whisper-tiny.en
%-embeddings: EMB_TYPE := google/gemma-7b
# {"gpt2", "gpt2-large", "gpt2-xl", \
"EleutherAI/gpt-neo-125M", "EleutherAI/gpt-neo-1.3B", "EleutherAI/gpt-neo-2.7B", \
"EleutherAI/gpt-neox-20b", \
@@ -110,7 +110,7 @@ download-247-pickles:
"openai/whisper-tiny.en", "openai/whisper-base.en", "openai/whisper-medium.en", \
"openai/whisper-large", "openai/whisper-large-v2" \
}
%-embeddings: CNXT_LEN := 1
%-embeddings: CNXT_LEN := 8192
%-embeddings: LAYER := all
# {'all' for all layers | 'last' for the last layer | (list of) integer(s) >= 1}

@@ -141,7 +141,7 @@ download-247-pickles:
# Note: embeddings file is the same for all podcast subjects \
and hence only generate once using subject: 661
%-embeddings: JOB_NAME = $(subst /,-,$(EMB_TYPE))
%-embeddings: CMD = sbatch --job-name=$(SID)-$(JOB_NAME)-cnxt-$$cnxt_len submit.sh
%-embeddings: CMD = sbatch --job-name=perplexity-$(SID)-$(JOB_NAME)-cnxt-$(CNXT_LEN) submit.sh
# {echo | python | sbatch --job-name=$(SID)-$(JOB_NAME)-cnxt-$$cnxt_len submit.sh}


@@ -200,7 +200,18 @@ copy-embeddings:

# Download huggingface models to cache (before generating embeddings)
# This target needs to be run on the head node
cache-models: MODEL := openai/whisper-medium.en
cache-models: MODEL := causal
# {causal | seq2seq | mlm | or any model name specified in EMB_TYPE comments}
cache-models:
python -c "from scripts import tfsemb_download; tfsemb_download.download_tokenizers_and_models(\"$(MODEL)\")"
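
For reference, judging by the {causal | seq2seq | mlm} comment above, passing a class keyword downloads every model in the corresponding list. A sketch of the equivalent direct call (run from the repo root):

    from scripts import tfsemb_download

    # "causal" expands to every entry in CAUSAL_MODELS; a concrete name
    # such as "google/gemma-7b" fetches a single tokenizer/model pair.
    tfsemb_download.download_tokenizers_and_models("causal")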

perp-embeddings:
mkdir -p logs
for conv_id in $(CONV_IDS); do \
python scripts/tfsemb_perplexity.py \
--project-id $(PRJCT_ID) \
--pkl-identifier $(PKL_IDENTIFIER) \
--subject $(SID) \
--conversation-id $$conv_id \
--embedding-type $(EMB_TYPE); \
done;
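
With the settings above (PRJCT_ID=podcast, SID=661, CONV_IDS=1, EMB_TYPE=google/gemma-7b), each iteration of the new perp-embeddings loop expands to roughly:

    python scripts/tfsemb_perplexity.py --project-id podcast --pkl-identifier full --subject 661 --conversation-id 1 --embedding-type google/gemma-7b
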
2 changes: 1 addition & 1 deletion scripts/tfsemb_LMBase.py
@@ -61,7 +61,7 @@ def main():
base_df = add_vocab_columns(args, base_df, column="word")
else:
base_df = tokenize_and_explode(args, base_df)
# base_df = add_vocab_columns(args, base_df, column="token2word")
base_df = add_vocab_columns(args, base_df, column="token2word")

svpkl(base_df, args.base_df_file)

27 changes: 24 additions & 3 deletions scripts/tfsemb_download.py
@@ -1,4 +1,7 @@
import os
from transformers import BitsAndBytesConfig
import torch


from transformers import (
AutoConfig,
@@ -12,22 +15,29 @@
)

CAUSAL_MODELS = [
"distilgpt2",
"distilgpt2", # distilbert/distilgpt2
"gpt2",
"gpt2-medium",
"gpt2-large",
"gpt2-xl",
"EleutherAI/gpt-neo-125M",
"EleutherAI/gpt-neo-1.3B",
"EleutherAI/gpt-neo-2.7B",
"EleutherAI/gpt-neox-20b",
"EleutherAI/gpt-neox-20b", # quantized for A100-80GB
"facebook/opt-125m",
"facebook/opt-350m",
"facebook/opt-1.3b",
"facebook/opt-2.7b",
"facebook/opt-6.7b",
"facebook/opt-13b",
"facebook/opt-30b",
"facebook/opt-66b", # quantized for A100-80GB
"bigscience/bloom",
"meta-llama/Llama-2-7b-hf",
"meta-llama/Llama-2-13b-hf", #
"meta-llama/Llama-2-70b-hf", # quantized for A100-80GB
"google/gemma-2b",
"google/gemma-7b"
]

SEQ2SEQ_MODELS = ["facebook/blenderbot_small-90M", "facebook/blenderbot-3B"]
@@ -59,6 +69,15 @@
"mlm": (MLM_MODELS, AutoModelForMaskedLM),
}

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
bnb_4bit_compute_dtype=torch.bfloat16,
)


def clean_lm_model_name(item):
"""Remove unnecessary parts from the language model name.
@@ -145,6 +164,8 @@ def download_hf_model(
output_hidden_states=True,
cache_dir=cache_dir,
local_files_only=local_files_only,
device_map="auto",
quantization_config=bnb_config,
)

return model
@@ -162,7 +183,7 @@ def download_hf_tokenizer(

tokenizer = tokenizer_class.from_pretrained(
model_name,
add_prefix_space=True,
# add_prefix_space=True,
cache_dir=cache_dir,
local_files_only=local_files_only,
)
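
Taken together, the new BitsAndBytesConfig and the device_map/quantization_config arguments load the larger checkpoints as 4-bit NF4 weights. A minimal standalone sketch of that loading path (the model name is illustrative; requires the bitsandbytes and accelerate packages):

    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,                      # store weights in 4 bits
        bnb_4bit_quant_type="nf4",              # NormalFloat4 quantization
        bnb_4bit_use_double_quant=True,         # also quantize the quant constants
        bnb_4bit_compute_dtype=torch.bfloat16,  # dequantize to bf16 for matmuls
    )

    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-70b-hf",     # one of the "quantized for A100-80GB" entries
        output_hidden_states=True,
        device_map="auto",               # shard layers across the visible GPUs
        quantization_config=bnb_config,
    )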
8 changes: 4 additions & 4 deletions scripts/tfsemb_genemb_causal.py
@@ -70,7 +70,7 @@ def model_forward_pass(args, data_dl):
for batch_idx, batch in enumerate(data_dl):
if batch_idx % 10 == 0:
print(f"Batch ID: {batch_idx}")
batch = batch.to(args.device)
batch = batch.to(device)
model_output = model(batch)

logits = model_output.logits.cpu()
@@ -151,7 +151,7 @@ def process_extracted_logits(args, concat_logits, sentence_token_ids):
sti = torch.tensor(sentence_token_ids)
true_y = torch.cat([sti[0, 1:], sti[1:, -1]]).unsqueeze(-1)

prediction_probabilities = F.softmax(prediction_scores, dim=1)
prediction_probabilities = F.softmax(prediction_scores.float(), dim=1)

logp = np.log2(prediction_probabilities)
entropy = [None] + torch.sum(-prediction_probabilities * logp, dim=1).tolist()
@@ -177,12 +177,12 @@ def process_extracted_logits(args, concat_logits, sentence_token_ids):
if args.embedding_type in tfsemb_dwnld.CAUSAL_MODELS:
if k == 1:
predicted_words = [
args.tokenizer.convert_tokens_to_string(token)
args.tokenizer.convert_tokens_to_string([token])
for token in predicted_tokens
]
else:
predicted_words = [
[args.tokenizer.convert_tokens_to_string(token) for token in token_list]
[args.tokenizer.convert_tokens_to_string([token]) for token in token_list]
for token_list in predicted_tokens
]

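
The [token] wrapping in both hunks matters because convert_tokens_to_string expects a list of tokens. A quick illustration (the tokenizer choice is ours, not the commit's):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    tokens = tok.tokenize("hello world")        # ['hello', 'Ġworld']
    tok.convert_tokens_to_string([tokens[1]])   # ' world' -- list in, string out
    # SentencePiece tokenizers (Llama, Gemma) iterate over the argument, so a
    # bare string would be decoded character by character instead.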
3 changes: 2 additions & 1 deletion scripts/tfsemb_main.py
@@ -54,7 +54,7 @@ def check_token_is_root(args, df):
token_is_root_string = args.embedding_type.split("/")[-1] + "_token_is_root"
df[token_is_root_string] = (
df["word"]
== df["token"].apply(args.tokenizer.convert_tokens_to_string).str.strip()
== df["token"].apply(lambda x: [x]).apply(args.tokenizer.convert_tokens_to_string).str.strip()
)

return df
@@ -70,6 +70,7 @@ def convert_token_to_word(args, df):

df["token2word"] = (
df["token"]
.apply(lambda x: [x])
.apply(args.tokenizer.convert_tokens_to_string)
.str.strip()
.str.lower()
7 changes: 4 additions & 3 deletions scripts/tfsemb_perplexity.py
@@ -32,7 +32,8 @@ def main():
# else:
# raise Exception("Base dataframe does not exist")

base_df_path = args.base_df_file.replace("661/embeddings", "777/pickles/embeddings")
# base_df_path = args.base_df_file.replace("661/embeddings", "777/pickles/embeddings")
base_df_path = args.base_df_file
base_df = load_pickle(base_df_path)

utterance_df = select_conversation(args, base_df)
@@ -43,7 +44,7 @@ def main():
except:
max_length = args.model.config.max_position_embeddings

strides = [512, 1024, 2048, 4096]
strides = [512, 1024, 2048, 4096, 8192]
encodings = torch.tensor([tuple(utterance_df.token_id.tolist())])
seq_len = encodings.size(1)

@@ -62,7 +63,7 @@ def main():
target_ids[:, :-trg_len] = -100

with torch.no_grad():
model = model.to(device)
# model = model.to(device)
model.eval()
outputs = model(input_ids, labels=target_ids)

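
For context, the stride loop in this file follows the standard sliding-window perplexity recipe. A condensed sketch of that loop (names illustrative, inputs assumed on the model's device):

    import torch

    def sliding_window_ppl(model, encodings, max_length, stride):
        seq_len = encodings.size(1)
        nlls, prev_end = [], 0
        for begin in range(0, seq_len, stride):
            end = min(begin + max_length, seq_len)
            trg_len = end - prev_end               # only these tokens are scored
            input_ids = encodings[:, begin:end]
            target_ids = input_ids.clone()
            target_ids[:, :-trg_len] = -100        # -100 is ignored by the loss
            with torch.no_grad():
                out = model(input_ids, labels=target_ids)
            nlls.append(out.loss * trg_len)        # un-average the mean NLL
            prev_end = end
            if end == seq_len:
                break
        return torch.exp(torch.stack(nlls).sum() / prev_end)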
13 changes: 8 additions & 5 deletions submit.sh
@@ -1,12 +1,15 @@
#!/bin/bash
#SBATCH --time=02:10:00
#SBATCH --time=00:15:00
#SBATCH --mem=128GB
#SBATCH --gres=gpu:1
#SBATCH --gres=gpu:2
#SBATCH --constraint=gpu80
#SBATCH --nodes=1
##SBATCH --cpus-per-task=4
#SBATCH --open-mode=truncate
#SBATCH -o './logs/%x.out'
#SBATCH -e './logs/%x.err'
#SBATCH -o './perplexity_logs/%x.out'
#SBATCH -e './perplexity_logs/%x.err'
#SBATCH [email protected]
#SBATCH --mail-type=FAIL

if [[ "$HOSTNAME" == *"tiger"* ]]
then
@@ -34,7 +37,7 @@ if [[ -v SLURM_ARRAY_TASK_ID ]]
then
python "$@" --conversation-id $SLURM_ARRAY_TASK_ID
else
python "$@"
python -u "$@"
fi

end=$(date +%s)
