Skip to content

Commit

Permalink
aws-backend: add support for running instances on multiple subnets
Browse files Browse the repository at this point in the history
  • Loading branch information
timbrown5 committed Jan 16, 2025
1 parent 8969d71 commit 5913e8b
Show file tree
Hide file tree
Showing 3 changed files with 140 additions and 13 deletions.
56 changes: 48 additions & 8 deletions runner_manager/backend/aws.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from typing import List, Literal, Optional
from copy import deepcopy
from typing import List, Literal, Optional, Sequence
from random import shuffle

from boto3 import client
from botocore.exceptions import ClientError
Expand All @@ -13,6 +15,7 @@
from runner_manager.models.backend import (
AWSConfig,
AwsInstance,
AwsSubnetListConfig,
AWSInstanceConfig,
Backends,
)
Expand All @@ -31,15 +34,52 @@ def client(self) -> EC2Client:

def create(self, runner: Runner) -> Runner:
"""Create a runner."""
instance_resource: AwsInstance = self.instance_config.configure_instance(runner)
try:
instance = self.client.run_instances(**instance_resource)
runner.instance_id = instance["Instances"][0]["InstanceId"]
except Exception as e:
log.error(e)
raise e
if self.instance_config.subnet_id and self.instance_config.subnet_configs:
raise Exception("Instance config contains both subnet_id and subnet_configs, only one allowed.")
if len(self.instance_config.subnet_configs) > 0:
runner = self._create_from_subnet_config(runner, self.instance_config.subnet_configs)
log.warn(f"Instance id: {runner.instance_id}")

Check warning on line 41 in runner_manager/backend/aws.py

View check run for this annotation

Codecov / codecov/patch

runner_manager/backend/aws.py#L37-L41

Added lines #L37 - L41 were not covered by tests
else:
instance_resource: AwsInstance = self.instance_config.configure_instance(runner)
try:
runner = self._create(runner, instance_resource)
log.warn(f"Instance id: {runner.instance_id}")
except Exception as e:
log.error(e)
raise e

Check warning on line 49 in runner_manager/backend/aws.py

View check run for this annotation

Codecov / codecov/patch

runner_manager/backend/aws.py#L43-L49

Added lines #L43 - L49 were not covered by tests
return super().create(runner)

def _create_from_subnet_config(self, runner: Runner, subnet_configs: Sequence[AwsSubnetListConfig]) -> Runner:
# Randomize the order of the Subnets - very coarse load balancing.
# TODO: Skip subnets that have failed recently. Maybe with an increasing backoff.
order = list(range(len(subnet_configs)))
shuffle(order)
subnet_config = self.instance_config.subnet_configs
print(f"Order: {order}")
for idx, i in enumerate(order):
subnet_config = subnet_configs[i]
try:

Check warning on line 61 in runner_manager/backend/aws.py

View check run for this annotation

Codecov / codecov/patch

runner_manager/backend/aws.py#L55-L61

Added lines #L55 - L61 were not covered by tests
# Copy the object to avoid modifying the object we were passed.
count = self.instance_config.max_count - self.instance_config.min_count
log.info(f"Trying to launch {count} containers on subnet {subnet_config['subnet_id']}")
concrete_instance_config = deepcopy(self.instance_config)
concrete_instance_config.subnet_id = subnet_config["subnet_id"]
concrete_instance_config.security_group_ids.extend(

Check warning on line 67 in runner_manager/backend/aws.py

View check run for this annotation

Codecov / codecov/patch

runner_manager/backend/aws.py#L63-L67

Added lines #L63 - L67 were not covered by tests
subnet_config.get("security_group_ids", [])
)
instance_resource: AwsInstance = concrete_instance_config.configure_instance(runner)
return self._create(runner, instance_resource)
except Exception as e:
log.warn(f"Creating instance in subnet {subnet_config['subnet_id']} failed with '{e}'. Retrying with another subnet.")
if idx >= len(order) - 1:
raise e
return runner

Check warning on line 76 in runner_manager/backend/aws.py

View check run for this annotation

Codecov / codecov/patch

runner_manager/backend/aws.py#L70-L76

Added lines #L70 - L76 were not covered by tests

def _create(self, runner: Runner, instance_resource: AwsInstance) -> Runner:
instance = self.client.run_instances(**instance_resource)
runner.instance_id = instance["Instances"][0]["InstanceId"]
return runner

Check warning on line 81 in runner_manager/backend/aws.py

View check run for this annotation

Codecov / codecov/patch

runner_manager/backend/aws.py#L79-L81

Added lines #L79 - L81 were not covered by tests

def delete(self, runner: Runner):
"""Delete a runner."""
if runner.instance_id:
Expand Down
13 changes: 11 additions & 2 deletions runner_manager/models/backend.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from enum import Enum
from pathlib import Path
from string import Template
from typing import Dict, List, Literal, Optional, Sequence, TypedDict
from typing import Dict, List, Literal, Optional, Sequence, TypedDict, NotRequired

from mypy_boto3_ec2.literals import (
InstanceMetadataTagsStateType,
Expand Down Expand Up @@ -134,6 +134,14 @@ class AWSConfig(BackendConfig):
region: str = "us-west-2"


AwsSubnetListConfig = TypedDict(
"AwsSubnetListConfig",
{
"subnet_id": str,
"security_group_ids": NotRequired[Sequence[str]],
}
)

AwsInstance = TypedDict(
"AwsInstance",
{
Expand All @@ -157,7 +165,7 @@ class AWSInstanceConfig(InstanceConfig):

image: str = "ami-0735c191cf914754d" # Ubuntu 22.04 for us-west-2
instance_type: InstanceTypeType = "t3.micro"
subnet_id: str
subnet_id: str = ""
security_group_ids: Sequence[str] = []
max_count: int = 1
min_count: int = 1
Expand All @@ -167,6 +175,7 @@ class AWSInstanceConfig(InstanceConfig):
disk_size_gb: int = 20
iam_instance_profile_arn: str = ""
instance_metadata_tags: InstanceMetadataTagsStateType = "disabled"
subnet_configs: Sequence[AwsSubnetListConfig] = []

def configure_instance(self, runner: Runner) -> AwsInstance:
"""Configure instance."""
Expand Down
84 changes: 81 additions & 3 deletions tests/unit/backend/test_aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from mypy_boto3_ec2.type_defs import TagTypeDef
from pytest import fixture, mark, raises
from unittest.mock import patch
from redis_om import NotFoundError

from runner_manager.backend.aws import AWSBackend
Expand Down Expand Up @@ -35,11 +36,67 @@ def aws_group(settings) -> RunnerGroup:
)
return runner_group

@fixture()
def aws_multi_subnet_group(settings) -> RunnerGroup:
config = AWSConfig()
subnet_id = os.getenv("AWS_SUBNET_ID", "")
runner_group: RunnerGroup = RunnerGroup(
id=3,
name="default",
organization="test",
manager=settings.name,
backend=AWSBackend(
name=Backends.aws,
config=config,
instance_config=AWSInstanceConfig(
subnet_configs=[
{
"subnet_id": subnet_id,
"security_group_ids": [],
}
]
),
),
labels=[
"label",
],
)
return runner_group

@fixture()
def aws_multi_subnet_group_invalid_subnets(settings) -> RunnerGroup:
config = AWSConfig()
runner_group: RunnerGroup = RunnerGroup(
id=3,
name="default",
organization="test",
manager=settings.name,
backend=AWSBackend(
name=Backends.aws,
config=config,
instance_config=AWSInstanceConfig(
subnet_configs=[
{
"subnet_id": "does-not-exist",
},
{
"subnet_id": "also-does-not-exist",
}
]
),
),
labels=[
"label",
],
)
return runner_group


@fixture()
def aws_runner(runner: Runner, aws_group: RunnerGroup) -> Runner:
# Cleanup and return a runner for testing
aws_group.backend.delete(runner)
runner.instance_id = None
return runner


Expand Down Expand Up @@ -70,7 +127,7 @@ def test_aws_instance_config(runner: Runner):
assert instance["TagSpecifications"][1]["ResourceType"] == "volume"


@mark.skipif(not os.getenv("AWS_ACCESS_KEY_ID"), reason="AWS credentials not found")
@mark.skipif(not os.getenv("AWS_ACCESS_KEY_ID") and not os.getenv("AWS_PROFILE"), reason="AWS credentials not found")
def test_create_delete(aws_runner, aws_group):
runner = aws_group.backend.create(aws_runner)
assert runner.instance_id is not None
Expand All @@ -81,7 +138,7 @@ def test_create_delete(aws_runner, aws_group):
Runner.find(Runner.instance_id == runner.instance_id).first()


@mark.skipif(not os.getenv("AWS_ACCESS_KEY_ID"), reason="AWS credentials not found")
@mark.skipif(not os.getenv("AWS_ACCESS_KEY_ID") and not os.getenv("AWS_PROFILE"), reason="AWS credentials not found")
def test_list(aws_runner, aws_group):
runner = aws_group.backend.create(aws_runner)
runners = aws_group.backend.list()
Expand All @@ -91,7 +148,7 @@ def test_list(aws_runner, aws_group):
aws_group.backend.get(runner.instance_id)


@mark.skipif(not os.getenv("AWS_ACCESS_KEY_ID"), reason="AWS credentials not found")
@mark.skipif(not os.getenv("AWS_ACCESS_KEY_ID") and not os.getenv("AWS_PROFILE"), reason="AWS credentials not found")
def test_update(aws_runner, aws_group):
runner = aws_group.backend.create(aws_runner)
runner.labels = [RunnerLabel(name="test", type="custom")]
Expand All @@ -100,3 +157,24 @@ def test_update(aws_runner, aws_group):
aws_group.backend.delete(runner)
with raises(NotFoundError):
aws_group.backend.get(runner.instance_id)


@mark.skipif(not os.getenv("AWS_ACCESS_KEY_ID") and not os.getenv("AWS_PROFILE"), reason="AWS credentials not found")
def test_create_delete_multi_subnet(aws_runner, aws_multi_subnet_group):
runner = aws_multi_subnet_group.backend.create(aws_runner)
print(f"{runner.instance_id}")
assert runner.instance_id is not None
assert runner.backend == "aws"
assert Runner.find(Runner.instance_id == runner.instance_id).first() == runner
aws_multi_subnet_group.backend.delete(runner)
with raises(NotFoundError):
Runner.find(Runner.instance_id == runner.instance_id).first()


@mark.skipif(not os.getenv("AWS_ACCESS_KEY_ID") and not os.getenv("AWS_PROFILE"), reason="AWS credentials not found")
def test_create_delete_multi_subnet_invalid_subnets(aws_runner, aws_multi_subnet_group_invalid_subnets):
with patch.object(AWSBackend, '_create', wraps=aws_multi_subnet_group_invalid_subnets.backend._create) as mock:
with raises(Exception):
aws_multi_subnet_group_invalid_subnets.backend.create(aws_runner)
# Check that the code tries once for each subnet.
assert mock.call_count == 2

0 comments on commit 5913e8b

Please sign in to comment.