#28 - Create gemma2-pipeline.gpu
obriensystems authored Dec 5, 2024
1 parent 3fb7172 commit 1328a34
Showing 1 changed file with 52 additions and 0 deletions.
52 changes: 52 additions & 0 deletions environments/windows/src/google-gemma/gemma2-pipeline.gpu
@@ -0,0 +1,52 @@
# Michael O'Brien 20241204
import os, torch
# default: both GPUs visible - cross-GPU traffic over the PCIe or NVLink bus can cause slowdowns
#os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
# pin a specific GPU - the model must fit entirely in memory:
# RTX 3500 Ada = 12G, A4000 = 16G, A4500 = 20G, A6000 = 48G, RTX 4000 Ada = 20G, RTX 5000 Ada = 32G, RTX 6000 Ada = 48G
#os.environ["CUDA_VISIBLE_DEVICES"] = "0"

#from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline
from datetime import datetime

#access_token='hf....CQqH'
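# note: the gemma-2 checkpoints are gated on Hugging Face - accept the license on
# the model page, then authenticate (e.g. `huggingface-cli login`, or export the
# HF_TOKEN environment variable) before the pipeline can download the weights.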

#https://huggingface.co/google/gemma-2-2b
#~11G
amodel = "google/gemma-2-2b"
#https://huggingface.co/google/gemma-2-9b
#~36G
#amodel = "google/gemma-2-9b"
#https://huggingface.co/google/gemma-2-27b
#amodel = "google/gemma-2-27b"

#tokenizer = AutoTokenizer.from_pretrained(model, token=access_token)
# GPU
pipe = pipeline(
"text-generation",
model=amodel,
device="cuda", # replace with "mps" to run on a Mac device
)
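# optional sketch: load in bfloat16 to roughly halve GPU memory - torch_dtype is
# forwarded to the underlying model (assumes an Ampere-or-newer GPU)
#pipe = pipeline(
#    "text-generation",
#    model=amodel,
#    device="cuda",
#    torch_dtype=torch.bfloat16,
#)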
#model = AutoModelForCausalLM.from_pretrained(amodel, device_map="auto",
# torch_dtype=torch.float16,
# token=access_token)
# CPU
#model = AutoModelForCausalLM.from_pretrained(amodel,token=access_token)

input_text = "how is gold made in collapsing neutron stars - specifically what is the ratio created during the beta and r process."
time_start = datetime.now()
print("generate start:", time_start.strftime("%H:%M:%S"))

# GPU
#input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
# CPU
#input_ids = tokenizer(input_text, return_tensors="pt")
#outputs = model.generate(**input_ids, max_new_tokens=10000)
#print(tokenizer.decode(outputs[0]))

outputs = pipe(input_text, max_new_tokens=256)
response = outputs[0]["generated_text"]
print(response)
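# note: generated_text echoes the prompt followed by the completion;
# pass return_full_text=False to the pipe call to get only the new tokens.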

print("end", datetime.now().strftime("%H:%M:%S"))
time_end = datetime.now().strftime("%H:%M:%S")
