Skip to content

Commit

Permalink
initial POC
Browse files Browse the repository at this point in the history
  • Loading branch information
Bobbins228 committed Oct 21, 2024
1 parent 2e28f8a commit 3e3ca8c
Show file tree
Hide file tree
Showing 7 changed files with 612 additions and 6 deletions.
280 changes: 280 additions & 0 deletions demo-notebooks/guided-demos/4_training.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,280 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "69f9bce8-b833-4b1e-af1b-a946f41d072f",
"metadata": {},
"outputs": [],
"source": [
"# Import pieces from codeflare-sdk\n",
"from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication\n",
"from codeflare_sdk.kubeflow.client.training_client import TrainingClient\n",
"from kubeflow.training.constants import constants"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bdea2430-8e17-48d9-9e48-42830af4fa5c",
"metadata": {},
"outputs": [],
"source": [
"# Create authentication object for user permissions\n",
"# If unused, the SDK will automatically check for a default kubeconfig, then in-cluster config\n",
"# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually\n",
"auth = TokenAuthentication(\n",
" token = \"XXXXX\",\n",
" server = \"XXXXX\",\n",
" skip_tls=False\n",
")\n",
"auth.login()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "daa18c62-8078-497d-9d78-7bc139f40571",
"metadata": {},
"outputs": [],
"source": [
"tc = TrainingClient(job_kind=constants.TFJOB_KIND)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e46effb4-96a5-466f-9dfa-5a95ed6cabfe",
"metadata": {},
"outputs": [],
"source": [
"from kubernetes.client import V1Container\n",
"from kubernetes.client import V1PodTemplateSpec\n",
"from kubernetes.client import V1ObjectMeta\n",
"from kubernetes.client import V1PodSpec\n",
"from kubernetes.client import V1Container\n",
"from kubeflow.training.api.training_client import TrainingClient\n",
"from kubeflow.training import KubeflowOrgV1ReplicaSpec, KubeflowOrgV1TFJobSpec,KubeflowOrgV1TFJob, KubeflowOrgV1RunPolicy\n",
"\n",
"container = V1Container(\n",
"\tname=\"tensorflow2\",\n",
"\timage=\"gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0\",\n",
"\tcommand=[\n",
" \t\"python\",\n",
" \t\"/var/tf_mnist/mnist_with_summaries.py\",\n",
" \t\"--learning_rate=0.01\",\n",
" \t\"--batch_size=150\"\n",
" \t]\n",
")\n",
"\n",
"worker = KubeflowOrgV1ReplicaSpec(\n",
"\treplicas=1,\n",
"\trestart_policy=\"Never\",\n",
"\ttemplate=V1PodTemplateSpec(\n",
" \tspec=V1PodSpec(\n",
" \tcontainers=[container]\n",
" \t)\n",
"\t)\n",
")\n",
"run_policy=KubeflowOrgV1RunPolicy(\n",
"\tactive_deadline_seconds = None,\n",
"\tbackoff_limit = None,\n",
"\tclean_pod_policy = None,\n",
"\tscheduling_policy = None,\n",
"\tsuspend = False,\n",
"\tttl_seconds_after_finished = None\n",
")\n",
"tfjob = KubeflowOrgV1TFJob(\n",
"\tapi_version=\"kubeflow.org/v1\",\n",
"\tkind=\"TFJob\",\n",
"\tmetadata=V1ObjectMeta(name=\"mnist-examples\",namespace=\"mark-dsp\"),\n",
"\tspec=KubeflowOrgV1TFJobSpec(\n",
" \trun_policy=run_policy,\n",
" \ttf_replica_specs={\"Worker\": worker}\n",
"\t)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "12649abc-1169-42aa-af1c-cafeff5e10f8",
"metadata": {},
"outputs": [],
"source": [
"tc.create_job(namespace=\"mark-dsp\", job=tfjob)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0ea416bc-4907-41ee-8d4c-6b63fd07d68a",
"metadata": {},
"outputs": [],
"source": [
"print(tc.list_jobs(namespace=\"mark-dsp\", job_kind=constants.TFJOB_KIND))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "68b4ae57-c6ce-482a-8c43-5c3a4bc59268",
"metadata": {},
"outputs": [],
"source": [
"print(tc.get_job_conditions(name=\"mnist-examples\", namespace=\"mark-dsp\", job_kind=constants.TFJOB_KIND))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "78d89180-9b10-4af7-9a02-1214c51568df",
"metadata": {},
"outputs": [],
"source": [
"tc.is_job_created(name=\"mnist-examples\", namespace=\"mark-dsp\", job_kind=constants.TFJOB_KIND)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0606647e-6b15-4108-8684-9e525ca92a88",
"metadata": {},
"outputs": [],
"source": [
"tc.is_job_succeeded(name=\"mnist-examples\", namespace=\"mark-dsp\", job_kind=constants.TFJOB_KIND)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "27800f25-6345-4d4a-bf6c-a4a5a33bb7cd",
"metadata": {},
"outputs": [],
"source": [
"tc.is_job_restarting(name=\"mnist-examples\", namespace=\"mark-dsp\", job_kind=constants.TFJOB_KIND)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "174f611e-3647-42fe-be22-09e524d32ce6",
"metadata": {},
"outputs": [],
"source": [
"tc.is_job_running(name=\"mnist-examples\", namespace=\"mark-dsp\", job_kind=constants.TFJOB_KIND)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b23339f6-1c43-4d22-9694-b2c2374bf610",
"metadata": {},
"outputs": [],
"source": [
"tc.is_job_failed(name=\"mnist-examples\", namespace=\"mark-dsp\", job_kind=constants.TFJOB_KIND)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "df5b3c0b-b3e8-4d2c-9a8f-51b823345586",
"metadata": {},
"outputs": [],
"source": [
"tfjob = tc.get_job(name=\"mnist-examples\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "48aa7c94-021a-46a5-9567-aa47719d5a73",
"metadata": {},
"outputs": [],
"source": [
"tc.wait_for_job_conditions(name=\"mnist-examples\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f802b544-fdb4-43c4-b856-69754608373e",
"metadata": {},
"outputs": [],
"source": [
"pods = tc.get_job_pods(name=\"mnist-examples\")\n",
"print(pods)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "03824d58-93d1-4043-a711-36aefbce3fa9",
"metadata": {},
"outputs": [],
"source": [
"pod_names = tc.get_job_pod_names(name=\"mnist-examples\")\n",
"print(pod_names)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4b2fd03-6ced-4eba-803a-71c3ab77e04a",
"metadata": {},
"outputs": [],
"source": [
"print(tc.get_job_logs(name=\"mnist-examples\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e591d4b-1ae2-4d48-9412-25a7dcea5a25",
"metadata": {},
"outputs": [],
"source": [
"tc.update_job(tfjob, \"mnist-examples\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2cc73adf-8033-42b7-a776-0e1c27ccfcc0",
"metadata": {},
"outputs": [],
"source": [
"tc.delete_job(name=\"mnist-examples\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3ebf2511-412d-4dba-b2dd-f3f5c3097eb6",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.11",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
59 changes: 54 additions & 5 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,12 @@ python = "^3.9"
openshift-client = "1.0.18"
rich = "^12.5"
ray = {version = "2.35.0", extras = ["data", "default"]}
kubernetes = ">= 25.3.0, < 27"
kubernetes = ">= 27.2.0"
cryptography = "40.0.2"
executing = "1.2.0"
pydantic = "< 2"
ipywidgets = "8.1.2"
kubeflow-training = "1.8.1"

[tool.poetry.group.docs]
optional = true
Expand Down
Loading

0 comments on commit 3e3ca8c

Please sign in to comment.