Skip to content

Commit 61d6894

Browse files
[Core] Add SKYPILOT_NUM_NODES env var (#3656)
* Add SKYPILOT_NUM_NODES env var * Update docs/source/running-jobs/environment-variables.rst Co-authored-by: Zongheng Yang <zongheng.y@gmail.com> * Update docs/source/running-jobs/environment-variables.rst Co-authored-by: Zongheng Yang <zongheng.y@gmail.com> * Update docs/source/running-jobs/environment-variables.rst Co-authored-by: Zongheng Yang <zongheng.y@gmail.com> * format * add remove version * add smoke test for num nodes * fix test --------- Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>
1 parent 3cd768d commit 61d6894

File tree

4 files changed

+46
-23
lines changed

4 files changed

+46
-23
lines changed

docs/source/running-jobs/environment-variables.rst

+8-1
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,12 @@ Environment variables for ``setup``
120120
- Rank (an integer ID from 0 to :code:`num_nodes-1`) of the node being set up.
121121
- 0
122122
* - ``SKYPILOT_SETUP_NODE_IPS``
123-
- A string of IP addresses of the nodes in the cluster with the same order as the node ranks, where each line contains one IP address.
123+
- A string of IP addresses of the nodes in the cluster, in the same order as the node ranks, where each line contains one IP address. Note that this is not necessarily the same as the set of nodes in the ``run`` stage: the ``setup`` stage runs on all nodes of the cluster, while the ``run`` stage can run on a subset of nodes.
124124
- 1.2.3.4
125+
3.4.5.6
126+
* - ``SKYPILOT_NUM_NODES``
127+
- Number of nodes in the cluster. Same value as ``$(echo "$SKYPILOT_SETUP_NODE_IPS" | wc -l)``.
128+
- 2
125129
* - ``SKYPILOT_TASK_ID``
126130
- A unique ID assigned to each task.
127131

@@ -159,6 +163,9 @@ Environment variables for ``run``
159163
* - ``SKYPILOT_NODE_IPS``
160164
- A string of IP addresses of the nodes reserved to execute the task, where each line contains one IP address. Read more :ref:`here <dist-jobs>`.
161165
- 1.2.3.4
166+
* - ``SKYPILOT_NUM_NODES``
167+
- Number of nodes assigned to execute the current task. Same value as ``$(echo "$SKYPILOT_NODE_IPS" | wc -l)``. Read more :ref:`here <dist-jobs>`.
168+
- 1
162169
* - ``SKYPILOT_NUM_GPUS_PER_NODE``
163170
- Number of GPUs reserved on each node to execute the task; the same as the
164171
count in ``accelerators: <name>:<count>`` (rounded up if a fraction). Read

sky/backends/cloud_vm_ray_backend.py

+27-17
Original file line numberDiff line numberDiff line change
@@ -269,8 +269,9 @@ def add_prologue(self, job_id: int) -> None:
269269
SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}
270270
271271
kwargs = dict()
272-
# Only set the `_temp_dir` to SkyPilot's ray cluster directory when the directory
273-
# exists for backward compatibility for the VM launched before #1790.
272+
# Only set the `_temp_dir` to SkyPilot's ray cluster directory when
273+
# the directory exists for backward compatibility for the VM
274+
# launched before #1790.
274275
if os.path.exists({constants.SKY_REMOTE_RAY_TEMPDIR!r}):
275276
kwargs['_temp_dir'] = {constants.SKY_REMOTE_RAY_TEMPDIR!r}
276277
ray.init(
@@ -308,8 +309,9 @@ def get_or_fail(futures, pg) -> List[int]:
308309
ready, unready = ray.wait(unready)
309310
idx = futures.index(ready[0])
310311
returncodes[idx] = ray.get(ready[0])
311-
# Remove the placement group after all tasks are done, so that the
312-
# next job can be scheduled on the released resources immediately.
312+
# Remove the placement group after all tasks are done, so that
313+
# the next job can be scheduled on the released resources
314+
# immediately.
313315
ray_util.remove_placement_group(pg)
314316
sys.stdout.flush()
315317
return returncodes
@@ -348,9 +350,9 @@ def add_gang_scheduling_placement_group_and_setup(
348350
num_nodes: int,
349351
resources_dict: Dict[str, float],
350352
stable_cluster_internal_ips: List[str],
353+
env_vars: Dict[str, str],
351354
setup_cmd: Optional[str] = None,
352355
setup_log_path: Optional[str] = None,
353-
env_vars: Optional[Dict[str, str]] = None,
354356
) -> None:
355357
"""Create the gang scheduling placement group for a Task.
356358
@@ -410,6 +412,8 @@ def add_gang_scheduling_placement_group_and_setup(
410412

411413
job_id = self.job_id
412414
if setup_cmd is not None:
415+
setup_envs = env_vars.copy()
416+
setup_envs[constants.SKYPILOT_NUM_NODES] = str(num_nodes)
413417
self._code += [
414418
textwrap.dedent(f"""\
415419
setup_cmd = {setup_cmd!r}
@@ -439,7 +443,7 @@ def add_gang_scheduling_placement_group_and_setup(
439443
.remote(
440444
setup_cmd,
441445
os.path.expanduser({setup_log_path!r}),
442-
env_vars={env_vars!r},
446+
env_vars={setup_envs!r},
443447
stream_logs=True,
444448
with_ray=True,
445449
) for i in range(total_num_nodes)]
@@ -550,11 +554,13 @@ def add_ray_task(self,
550554
f'placement_group_bundle_index={gang_scheduling_id})')
551555

552556
sky_env_vars_dict_str = [
553-
textwrap.dedent("""\
554-
sky_env_vars_dict = {}
555-
sky_env_vars_dict['SKYPILOT_NODE_IPS'] = job_ip_list_str
556-
# Environment starting with `SKY_` is deprecated.
557+
textwrap.dedent(f"""\
558+
sky_env_vars_dict = {{}}
559+
sky_env_vars_dict['{constants.SKYPILOT_NODE_IPS}'] = job_ip_list_str
560+
# Backward compatibility: Environment starting with `SKY_` is
561+
# deprecated. Remove it in v0.9.0.
557562
sky_env_vars_dict['SKY_NODE_IPS'] = job_ip_list_str
563+
sky_env_vars_dict['{constants.SKYPILOT_NUM_NODES}'] = len(job_ip_rank_list)
558564
""")
559565
]
560566

@@ -575,8 +581,9 @@ def add_ray_task(self,
575581
576582
577583
if script is not None:
578-
sky_env_vars_dict['SKYPILOT_NUM_GPUS_PER_NODE'] = {int(math.ceil(num_gpus))!r}
579-
# Environment starting with `SKY_` is deprecated.
584+
sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}
585+
# Backward compatibility: Environment starting with `SKY_` is
586+
# deprecated. Remove it in v0.9.0.
580587
sky_env_vars_dict['SKY_NUM_GPUS_PER_NODE'] = {int(math.ceil(num_gpus))!r}
581588
582589
ip = gang_scheduling_id_to_ip[{gang_scheduling_id!r}]
@@ -593,12 +600,14 @@ def add_ray_task(self,
593600
node_name = f'worker{{idx_in_cluster}}'
594601
name_str = f'{{node_name}}, rank={{rank}},'
595602
log_path = os.path.expanduser(os.path.join({log_dir!r}, f'{{rank}}-{{node_name}}.log'))
596-
sky_env_vars_dict['SKYPILOT_NODE_RANK'] = rank
597-
# Environment starting with `SKY_` is deprecated.
603+
sky_env_vars_dict['{constants.SKYPILOT_NODE_RANK}'] = rank
604+
# Backward compatibility: Environment starting with `SKY_` is
605+
# deprecated. Remove it in v0.9.0.
598606
sky_env_vars_dict['SKY_NODE_RANK'] = rank
599607
600608
sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
601-
# Environment starting with `SKY_` is deprecated.
609+
# Backward compatibility: Environment starting with `SKY_` is
610+
# deprecated. Remove it in v0.9.0.
602611
sky_env_vars_dict['SKY_INTERNAL_JOB_ID'] = {self.job_id}
603612
604613
futures.append(run_bash_command_with_log \\
@@ -4751,9 +4760,9 @@ def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
47514760
1,
47524761
resources_dict,
47534762
stable_cluster_internal_ips=internal_ips,
4763+
env_vars=task_env_vars,
47544764
setup_cmd=self._setup_cmd,
47554765
setup_log_path=os.path.join(log_dir, 'setup.log'),
4756-
env_vars=task_env_vars,
47574766
)
47584767

47594768
if callable(task.run):
@@ -4800,9 +4809,10 @@ def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle,
48004809
num_actual_nodes,
48014810
resources_dict,
48024811
stable_cluster_internal_ips=internal_ips,
4812+
env_vars=task_env_vars,
48034813
setup_cmd=self._setup_cmd,
48044814
setup_log_path=os.path.join(log_dir, 'setup.log'),
4805-
env_vars=task_env_vars)
4815+
)
48064816

48074817
if callable(task.run):
48084818
run_fn_code = textwrap.dedent(inspect.getsource(task.run))

sky/skylet/constants.py

+6
Original file line numberDiff line numberDiff line change
@@ -237,3 +237,9 @@
237237
# The name for the environment variable that stores the URL of the SkyPilot
238238
# API server.
239239
SKY_API_SERVER_URL_ENV_VAR = 'SKYPILOT_API_SERVER_URL'
240+
241+
# SkyPilot environment variables
242+
SKYPILOT_NUM_NODES = 'SKYPILOT_NUM_NODES'
243+
SKYPILOT_NODE_IPS = 'SKYPILOT_NODE_IPS'
244+
SKYPILOT_NUM_GPUS_PER_NODE = 'SKYPILOT_NUM_GPUS_PER_NODE'
245+
SKYPILOT_NODE_RANK = 'SKYPILOT_NODE_RANK'

tests/test_smoke.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -2936,7 +2936,7 @@ def test_managed_jobs_inline_env(generic_cloud: str):
29362936
test = Test(
29372937
'test-managed-jobs-inline-env',
29382938
[
2939-
f'sky jobs launch -n {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\$SKYPILOT_NODE_IPS\\" ]] && [[ ! -z \\"\$SKYPILOT_NODE_RANK\\" ]]) || exit 1"',
2939+
f'sky jobs launch -n {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"',
29402940
'sleep 20',
29412941
f'{_JOB_QUEUE_WAIT} | grep {name} | grep SUCCEEDED',
29422942
],
@@ -2954,10 +2954,10 @@ def test_inline_env(generic_cloud: str):
29542954
test = Test(
29552955
'test-inline-env',
29562956
[
2957-
f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\$SKYPILOT_NODE_IPS\\" ]] && [[ ! -z \\"\$SKYPILOT_NODE_RANK\\" ]]) || exit 1"',
2957+
f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"',
29582958
'sleep 20',
29592959
f'sky logs {name} 1 --status',
2960-
f'sky exec {name} --env TEST_ENV2="success" "([[ ! -z \\"\$TEST_ENV2\\" ]] && [[ ! -z \\"\$SKYPILOT_NODE_IPS\\" ]] && [[ ! -z \\"\$SKYPILOT_NODE_RANK\\" ]]) || exit 1"',
2960+
f'sky exec {name} --env TEST_ENV2="success" "([[ ! -z \\"\$TEST_ENV2\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"',
29612961
f'sky logs {name} 2 --status',
29622962
],
29632963
f'sky down -y {name}',
@@ -2973,9 +2973,9 @@ def test_inline_env_file(generic_cloud: str):
29732973
test = Test(
29742974
'test-inline-env-file',
29752975
[
2976-
f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\$SKYPILOT_NODE_IPS\\" ]] && [[ ! -z \\"\$SKYPILOT_NODE_RANK\\" ]]) || exit 1"',
2976+
f'sky launch -c {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"',
29772977
f'sky logs {name} 1 --status',
2978-
f'sky exec {name} --env-file examples/sample_dotenv "([[ ! -z \\"\$TEST_ENV2\\" ]] && [[ ! -z \\"\$SKYPILOT_NODE_IPS\\" ]] && [[ ! -z \\"\$SKYPILOT_NODE_RANK\\" ]]) || exit 1"',
2978+
f'sky exec {name} --env-file examples/sample_dotenv "([[ ! -z \\"\$TEST_ENV2\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"',
29792979
f'sky logs {name} 2 --status',
29802980
],
29812981
f'sky down -y {name}',

0 commit comments

Comments
 (0)