#28 - Create gemma2-pipeline.gpu
obriensystems authored Dec 5, 2024
1 parent 3fb7172 commit 1328a34
Showing 1 changed file with 52 additions and 0 deletions.
52 changes: 52 additions & 0 deletions environments/windows/src/google-gemma/gemma2-pipeline.gpu
@@ -0,0 +1,52 @@
# Michael O'Brien 20241204
import os, torch
# default: both GPUs visible - cross-GPU traffic over the PCIe or NVLink bus can cause slowdowns
#os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
# pin a specific GPU - the model must fit entirely in memory:
# RTX 3500 Ada = 12G, A4000 = 16G, A4500 = 20G, A6000 = 48G, RTX 4000 Ada = 20G, RTX 5000 Ada = 32G, RTX 6000 Ada = 48G
#os.environ["CUDA_VISIBLE_DEVICES"] = "0"

#from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline
from datetime import datetime

#access_token='hf....CQqH'
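# note: the gemma-2 checkpoints are gated on Hugging Face - accept the license on
# the model page, then authenticate (e.g. `huggingface-cli login`, or export the
# HF_TOKEN environment variable) before the pipeline can download the weights.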

#https://huggingface.co/google/gemma-2-2b
#~11G
amodel = "google/gemma-2-2b"
#https://huggingface.co/google/gemma-2-9b
#~36G
#amodel = "google/gemma-2-9b"
#https://huggingface.co/google/gemma-2-27b
#amodel = "google/gemma-2-27b"

#tokenizer = AutoTokenizer.from_pretrained(model, token=access_token)
# GPU
pipe = pipeline(
"text-generation",
model=amodel,
device="cuda", # replace with "mps" to run on a Mac device
)
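# optional sketch: load in bfloat16 to roughly halve GPU memory - torch_dtype is
# forwarded to the underlying model (assumes an Ampere-or-newer GPU)
#pipe = pipeline(
#    "text-generation",
#    model=amodel,
#    device="cuda",
#    torch_dtype=torch.bfloat16,
#)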
#model = AutoModelForCausalLM.from_pretrained(amodel, device_map="auto",
# torch_dtype=torch.float16,
# token=access_token)
# CPU
#model = AutoModelForCausalLM.from_pretrained(amodel,token=access_token)

input_text = "how is gold made in collapsing neutron stars - specifically what is the ratio created during the beta and r process."
time_start = datetime.now()
print("generate start:", time_start.strftime("%H:%M:%S"))

# GPU
#input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
# CPU
#input_ids = tokenizer(input_text, return_tensors="pt")
#outputs = model.generate(**input_ids, max_new_tokens=10000)
#print(tokenizer.decode(outputs[0]))

outputs = pipe(input_text, max_new_tokens=256)
response = outputs[0]["generated_text"]
print(response)
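# note: generated_text echoes the prompt followed by the completion;
# pass return_full_text=False to the pipe call to get only the new tokens.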

print("end", datetime.now().strftime("%H:%M:%S"))
time_end = datetime.now().strftime("%H:%M:%S")
