Skip to content

[Azure] Optimize autostopping speed for azure #3519

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion sky/skylet/autostop_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,16 @@ def set_autostopping_started() -> None:
configs.set_config(_AUTOSTOP_INDICATOR, str(psutil.boot_time()))


def get_is_autostopping_payload() -> str:
def get_is_autostopping() -> bool:
    """Tell whether this cluster is currently in the middle of autostopping.

    The value stored under ``_AUTOSTOP_INDICATOR`` is compared with the
    string form of the machine's current boot time; autostopping is
    considered in progress only when the two are equal.

    Returns:
        True if the stored indicator equals ``str(psutil.boot_time())``,
        False otherwise (including when no indicator matches).
    """
    stored_indicator = configs.get_config(_AUTOSTOP_INDICATOR)
    current_boot_marker = str(psutil.boot_time())
    return stored_indicator == current_boot_marker


def get_is_autostopping_payload() -> str:
    """Return the autostopping status wrapped as an encoded payload.

    Returns:
        The result of ``common_utils.encode_payload`` applied to the boolean
        reported by ``get_is_autostopping()``.
    """
    return common_utils.encode_payload(get_is_autostopping())


Expand Down
2 changes: 2 additions & 0 deletions sky/skylet/providers/azure/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ def _configure_resource_group(config):
create_or_update = get_azure_sdk_function(
client=resource_client.deployments, function_name="create_or_update"
)
# TODO (skypilot): this takes a long time (> 40 seconds) for stopping an
# azure VM, and this can be called twice during ray down.
outputs = (
create_or_update(
resource_group_name=resource_group,
Expand Down
28 changes: 18 additions & 10 deletions sky/skylet/providers/azure/node_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
bootstrap_azure,
get_azure_sdk_function,
)
from sky.skylet import autostop_lib
from sky.skylet.providers.command_runner import SkyDockerCommandRunner
from sky.provision import docker_utils

Expand Down Expand Up @@ -61,16 +62,23 @@ class AzureNodeProvider(NodeProvider):

def __init__(self, provider_config, cluster_name):
NodeProvider.__init__(self, provider_config, cluster_name)
# TODO(suquark): This is a temporary patch for resource group.
# By default, Ray autoscaler assumes the resource group is still here even
# after the whole cluster is destroyed. However, now we deletes the resource
# group after tearing down the cluster. To comfort the autoscaler, we need
# to create/update it here, so the resource group always exists.
from sky.skylet.providers.azure.config import _configure_resource_group

_configure_resource_group(
{"cluster_name": cluster_name, "provider": provider_config}
)
if not autostop_lib.get_is_autostopping():
# TODO(suquark): This is a temporary patch for resource group.
# By default, Ray autoscaler assumes the resource group is still
# here even after the whole cluster is destroyed. However, now we
# delete the resource group after tearing down the cluster. To
# comfort the autoscaler, we need to create/update it here, so the
# resource group always exists.
#
# We should not re-configure the resource group again, when it is
# running on the remote VM and the autostopping is in progress,
# because the VM is running which guarantees the resource group
# exists.
from sky.skylet.providers.azure.config import _configure_resource_group

_configure_resource_group(
{"cluster_name": cluster_name, "provider": provider_config}
)
subscription_id = provider_config["subscription_id"]
self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes", True)
# Sky only supports Azure CLI credential for now.
Expand Down
Loading