From 1328a34c497f8443a8aefa2e5e565d4982c17900 Mon Sep 17 00:00:00 2001 From: Michael O'Brien Date: Wed, 4 Dec 2024 22:32:49 -0500 Subject: [PATCH] #28 - Create gemma2-pipeline.gpu --- .../src/google-gemma/gemma2-pipeline.gpu | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 environments/windows/src/google-gemma/gemma2-pipeline.gpu diff --git a/environments/windows/src/google-gemma/gemma2-pipeline.gpu b/environments/windows/src/google-gemma/gemma2-pipeline.gpu new file mode 100644 index 0000000..640232d --- /dev/null +++ b/environments/windows/src/google-gemma/gemma2-pipeline.gpu @@ -0,0 +1,52 @@ +# Michael O'Brien 20241204 +import os, torch +# default dual GPU - either PCIe bus or NVidia bus - slowdowns +#os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" +# specific GPU - model must fit entierely in memory RTX-3500 ada = 12G, A4000=16G, A4500=20, A6000=48, 4000 ada = 20, 5000 ada = 32, 6000 ada = 48 +#os.environ["CUDA_VISIBLE_DEVICES"] = "0" + +#from transformers import AutoTokenizer, AutoModelForCausalLM +from transformers import pipeline +from datetime import datetime + +#access_token='hf....CQqH' + +#https://huggingface.co/google/gemma-2-2b +#11g +amodel = "google/gemma-2-2b"#7b" +#https://huggingface.co/google/gemma-2-9b +#36g +#amodel = "google/gemma-2-9b"#7b" +#https://huggingface.co/google/gemma-2-27b +#amodel = "google/gemma-2-27b"#7b" + +#tokenizer = AutoTokenizer.from_pretrained(model, token=access_token) +# GPU +pipe = pipeline( + "text-generation", + model=amodel, + device="cuda", # replace with "mps" to run on a Mac device +) +#model = AutoModelForCausalLM.from_pretrained(amodel, device_map="auto", +# torch_dtype=torch.float16, +# token=access_token) +# CPU +#model = AutoModelForCausalLM.from_pretrained(amodel,token=access_token) + +input_text = "how is gold made in collapsing neutron stars - specifically what is the ratio created during the beta and r process." +time_start = datetime.now().strftime("%H:%M:%S") +print("genarate start: ", datetime.now().strftime("%H:%M:%S")) + +# GPU +#input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") +# CPU +#input_ids = tokenizer(input_text, return_tensors="pt") +#outputs = model.generate(**input_ids, max_new_tokens=10000) +#print(tokenizer.decode(outputs[0])) + +outputs = pipe(input_text, max_new_tokens=256) +response = outputs[0]["generated_text"] +print(response) + +print("end", datetime.now().strftime("%H:%M:%S")) +time_end = datetime.now().strftime("%H:%M:%S")