Skip to content

Commit 9b3312a

Browse files
[k8s][GKE] distinguish between h100 and h100-mega on gke (#3891)
* distinguish between h100 and h100-mega on gke
* Update sky/provision/kubernetes/utils.py

  Co-authored-by: Romil Bhardwaj <romil.bhardwaj@gmail.com>
* lint

---------

Co-authored-by: Romil Bhardwaj <romil.bhardwaj@gmail.com>
1 parent 95b52c0 commit 9b3312a

File tree

1 file changed

+7
-9
lines changed

1 file changed

+7
-9
lines changed

sky/provision/kubernetes/utils.py

+7 -9
Original file line numberDiff line numberDiff line change
@@ -110,8 +110,9 @@ def get_gke_accelerator_name(accelerator: str) -> str:
110110
if accelerator == 'H100':
111111
# H100 is named as H100-80GB in GKE.
112112
accelerator = 'H100-80GB'
113-
if accelerator in ('A100-80GB', 'L4', 'H100-80GB'):
114-
# A100-80GB, L4 and H100-80GB have a different name pattern.
113+
if accelerator in ('A100-80GB', 'L4', 'H100-80GB', 'H100-MEGA-80GB'):
114+
# A100-80GB, L4, H100-80GB and H100-MEGA-80GB
115+
# have a different name pattern.
115116
return 'nvidia-{}'.format(accelerator.lower())
116117
else:
117118
return 'nvidia-tesla-{}'.format(accelerator.lower())
@@ -194,13 +195,10 @@ def get_accelerator_from_label_value(cls, value: str) -> str:
194195
return value.replace('nvidia-tesla-', '').upper()
195196
elif value.startswith('nvidia-'):
196197
acc = value.replace('nvidia-', '').upper()
197-
if acc in ['H100-80GB', 'H100-MEGA-80GB']:
198-
# H100 is named H100-80GB or H100-MEGA-80GB in GKE,
199-
# where the latter has improved bandwidth.
200-
# See a3-mega instances on GCP.
201-
# TODO: we do not distinguish the two GPUs for simplicity,
202-
# but we can evaluate whether we should distinguish
203-
# them based on users' requests.
198+
if acc == 'H100-80GB':
199+
# H100 can be either H100-80GB or H100-MEGA-80GB in GKE
200+
# we map H100 ---> H100-80GB and keep H100-MEGA-80GB
201+
# to distinguish between a3-high and a3-mega instances
204202
return 'H100'
205203
return acc
206204
else:

0 commit comments

Comments
 (0)