Merge branch 'dev' into spark-rapids-tools-936
cindyyuanjiang committed Apr 17, 2024
2 parents 8a9fec9 + 1cb24c2 commit e542bf3
Showing 9 changed files with 36 additions and 7 deletions.
user_tools/src/spark_rapids_pytools/cloud_api/dataproc.py: 3 changes (3 additions, 0 deletions)
@@ -165,6 +165,9 @@ def get_matching_executor_instance(self, cores_per_executor):
     def generate_cluster_configuration(self, render_args: dict):
         executor_names = ','.join([f'"test-node-e{i}"' for i in range(render_args['NUM_EXECUTOR_NODES'])])
         render_args['EXECUTOR_NAMES'] = f'[{executor_names}]'
+        image_version = self.configs.get_value('clusterInference', 'defaultImage')
+        render_args['IMAGE'] = f'"{image_version}"'
+        render_args['ZONE'] = f'"{self.cli.get_zone()}"'
         return super().generate_cluster_configuration(render_args)


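For illustration, a rough sketch of what render_args might hold after the three added lines run; the values below are assumptions, not taken from the repository:

# Hypothetical render_args once Dataproc's generate_cluster_configuration
# has added IMAGE and ZONE (example values only).
render_args = {
    'NUM_EXECUTOR_NODES': 2,
    'EXECUTOR_NAMES': '["test-node-e0","test-node-e1"]',
    'IMAGE': '"2.1.41-debian11"',   # from clusterInference.defaultImage
    'ZONE': '"us-central1-a"',      # whatever self.cli.get_zone() returns
}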
user_tools/src/spark_rapids_pytools/cloud_api/emr.py: 17 changes (17 additions, 0 deletions)
@@ -117,6 +117,12 @@ def create_local_submission_job(self, job_prop, ctxt) -> Any:
     def _get_prediction_model_name(self) -> str:
         return CspEnv.pretty_print(CspEnv.get_default())
 
+    def generate_cluster_configuration(self, render_args: dict):
+        image_version = self.configs.get_value('clusterInference', 'defaultImage')
+        render_args['IMAGE'] = f'"{image_version}"'
+        render_args['ZONE'] = f'"{self.cli.get_zone()}"'
+        return super().generate_cluster_configuration(render_args)
+
 
 @dataclass
 class EMRCMDDriver(CMDDriverBase):
@@ -199,6 +205,17 @@ def _build_platform_describe_node_instance(self, node: ClusterNode) -> list:
                        '--instance-types', f'{node.instance_type}']
         return cmd_params
 
+    def get_zone(self) -> str:
+        describe_cmd = ['aws ec2 describe-availability-zones',
+                        '--region', f'{self.get_region()}']
+        selected_zone = ''
+        try:
+            zones_list = json.loads(self.run_sys_cmd(describe_cmd))
+            selected_zone = zones_list['AvailabilityZones'][0]['ZoneName']
+        except Exception:  # pylint: disable=broad-except
+            self.logger.warning('Unable to extract zone from region %s', self.get_region())
+        return selected_zone
+
     def _build_platform_list_cluster(self,
                                      cluster,
                                      query_args: dict = None) -> list:
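For context, a minimal self-contained sketch of the parsing the new get_zone performs, assuming the usual shape of aws ec2 describe-availability-zones output; the sample JSON is illustrative, and on any failure the real method just logs a warning and returns an empty string:

import json

# Illustrative response; the real command output carries more fields per zone.
sample_output = '''
{
  "AvailabilityZones": [
    {"ZoneName": "us-west-2a", "State": "available"},
    {"ZoneName": "us-west-2b", "State": "available"}
  ]
}
'''

zones = json.loads(sample_output)
print(zones['AvailabilityZones'][0]['ZoneName'])  # us-west-2a, the zone get_zone would select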
user_tools/src/spark_rapids_pytools/cloud_api/sp_types.py: 3 changes (3 additions, 0 deletions)
@@ -299,6 +299,9 @@ def get_env_var(self, key: str):
     def get_region(self) -> str:
         return self.env_vars.get('region')
 
+    def get_zone(self) -> str:
+        return self.env_vars.get('zone')
+
     def get_cmd_run_configs(self) -> dict:
         return self.env_vars.get('cmdRunnerProperties')

@@ -902,7 +902,7 @@ def __infer_cluster_and_update_savings(self, cluster_info_df: pd.DataFrame):
         Update savings if CPU cluster can be inferred and corresponding GPU cluster can be defined.
         :param cluster_info_df: Parsed cluster information.
         """
-        if self.ctxt.get_ctxt('cpuClusterProxy') is not None:
+        if self.ctxt.get_ctxt('cpuClusterProxy') is not None or not self.ctxt.platform.cluster_inference_supported:
             return
 
         # Infer the CPU cluster from the cluster information
@@ -267,7 +267,8 @@
       "executor": {
         "n1-standard": {"vCPUs": [1, 2, 4, 8, 16, 32, 64, 96]}
       }
-    }
+    },
+    "defaultImage": "2.1.41-debian11"
   },
   "clusterSpecs": {
     "minWorkerNodes": 2,
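This new defaultImage entry is what the Dataproc change above reads through self.configs.get_value('clusterInference', 'defaultImage'). A minimal sketch of that kind of nested lookup, assuming the config is loaded into a plain dict; the real helper lives in the tools' configuration layer and may behave differently:

# Assumed in-memory shape of the relevant slice of the platform config.
platform_configs = {
    'clusterInference': {
        'defaultImage': '2.1.41-debian11',
    },
}

def get_value(*keys):
    # Walk the nested keys, mirroring what a get_value-style helper would do.
    node = platform_configs
    for key in keys:
        node = node[key]
    return node

print(get_value('clusterInference', 'defaultImage'))  # 2.1.41-debian11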
@@ -391,7 +391,8 @@
         {"name": "m5d.12xlarge", "vCPUs": 48},
         {"name": "m5d.16xlarge", "vCPUs": 64}
       ]
-    }
+    },
+    "defaultImage": "emr-6.10.0"
   },
   "clusterSpecs": {
     "minWorkerNodes": 2
@@ -3,7 +3,7 @@
   "clusterUuid": "1234-5678-1234567",
   "config": {
     "gceClusterConfig": {
-      "zoneUri": "us-central1-a"
+      "zoneUri": {{{ ZONE }}}
     },
     "masterConfig": {
       "instanceNames": [
@@ -16,6 +16,9 @@
       "instanceNames": {{{ EXECUTOR_NAMES }}},
       "machineTypeUri": {{{ EXECUTOR_INSTANCE }}},
       "numInstances": {{ NUM_EXECUTOR_NODES }}
+    },
+    "softwareConfig": {
+      "imageVersion": {{{ IMAGE }}}
     }
   },
   "status": {
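The triple-brace placeholders above are mustache-style unescaped tags, which is presumably why the Python changes pre-wrap their values in double quotes, for example IMAGE = '"2.1.41-debian11"'. A minimal rendering sketch under that assumption, using plain string replacement rather than the tool's actual template engine:

# Simplified stand-in for the real template rendering; values are examples.
template = '"zoneUri": {{{ ZONE }}}, "imageVersion": {{{ IMAGE }}}'
render_args = {'ZONE': '"us-central1-a"', 'IMAGE': '"2.1.41-debian11"'}

rendered = template
for key, value in render_args.items():
    rendered = rendered.replace('{{{ ' + key + ' }}}', value)

print(rendered)  # "zoneUri": "us-central1-a", "imageVersion": "2.1.41-debian11"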
@@ -6,7 +6,7 @@
       "State": "TERMINATED"
     },
     "Ec2InstanceAttributes": {
-      "Ec2AvailabilityZone": "us-west-2a"
+      "Ec2AvailabilityZone": {{{ ZONE }}}
     },
     "InstanceGroups": [
       {
@@ -25,6 +25,7 @@
         "InstanceType": {{{ DRIVER_INSTANCE }}},
         "RequestedInstanceCount": {{ NUM_DRIVER_NODES }}
       }
-    ]
+    ],
+    "ReleaseLabel": {{{ IMAGE }}}
   }
 }
@@ -5,7 +5,7 @@ export CLUSTER_NAME="{{ CLUSTER_NAME }}"
 gcloud dataproc clusters create $CLUSTER_NAME \
     --image-version={{ IMAGE }} \
     --region {{ REGION }} \
-    --zone {{ ZONE }}-a \
+    --zone {{ ZONE }} \
     --master-machine-type {{ MASTER_MACHINE }} \
     --num-workers {{ WORKERS_COUNT }} \
     --worker-machine-type {{ WORKERS_MACHINE }} \
