Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 56 additions & 8 deletions src/kernel_ci_cloud_labs/auth/aws_role_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,41 @@ def check_exists(self, resource_name: str) -> bool:
except self.client.exceptions.NoSuchEntityException:
return False

@staticmethod
def _is_ec2_role(resource_config: Dict[str, Any]) -> bool:
"""Return True if the role's trust policy allows EC2 to assume it."""
trust_policy = resource_config.get("trust_policy", {})
principals = trust_policy.get("Statement", [{}])[0].get("Principal", {})
return isinstance(principals, dict) and "ec2.amazonaws.com" in str(principals)

def _ensure_instance_profile(self, name: str) -> None:
"""Ensure the EC2 instance profile exists AND has the role bound.

The two operations are independently idempotent. A previous partial
setup (or manual cleanup) can leave the profile in place with no
role attached — invisible to RunInstances, but the VM boots without
credentials so the SSM agent can never register, and downstream
SSM-based test execution fails with "SSM agent not ready".
"""
try:
self.client.create_instance_profile(InstanceProfileName=name)
except self.client.exceptions.EntityAlreadyExistsException:
pass
try:
profile = self.client.get_instance_profile(InstanceProfileName=name)
bound = {r["RoleName"] for r in profile["InstanceProfile"]["Roles"]}
except self.client.exceptions.NoSuchEntityException:
bound = set()
if name not in bound:
try:
self.client.add_role_to_instance_profile(
InstanceProfileName=name, RoleName=name,
)
except self.client.exceptions.LimitExceededException:
# A different role is already bound to this profile;
# leave it alone rather than silently rebinding.
pass

def delete_role(self, resource_name: str) -> None:
"""Delete IAM role and detach policies"""
try:
Expand Down Expand Up @@ -69,17 +104,30 @@ def create(self, resource_name: str, resource_config: Dict[str, Any]) -> str:
)

# Create instance profile for EC2 roles
trust_policy = resource_config.get("trust_policy", {})
principals = trust_policy.get("Statement", [{}])[0].get("Principal", {})
if isinstance(principals, dict) and "ec2.amazonaws.com" in str(principals):
try:
self.client.create_instance_profile(InstanceProfileName=resource_name)
self.client.add_role_to_instance_profile(InstanceProfileName=resource_name, RoleName=resource_name)
except self.client.exceptions.EntityAlreadyExistsException:
pass
if self._is_ec2_role(resource_config):
self._ensure_instance_profile(resource_name)

return response["Role"]["Arn"]

def ensure_exists(
self,
resource_name: str,
resource_config: Dict[str, Any] = None,
force_recreate: bool = False,
):
"""Ensure role exists; also (re-)verify EC2 instance-profile binding.

The base implementation short-circuits when the role already exists,
which means a drifted instance profile (profile present but no role
attached) is never repaired on re-run. We always run the binding
check after the base call so re-runs heal that state.
"""
result = super().ensure_exists(resource_name, resource_config, force_recreate)
config = resource_config if resource_config is not None else self.config.get(resource_name, {})
if self._is_ec2_role(config):
self._ensure_instance_profile(resource_name)
return result

def get_identifier(self, resource_name: str) -> str:
"""Get role ARN"""
response = self.client.get_role(RoleName=resource_name)
Expand Down
7 changes: 4 additions & 3 deletions src/kernel_ci_cloud_labs/core/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,9 +414,10 @@ def run_pipeline(
vms_dir = Path(run_dir) / "vms"
vms_dir.mkdir(exist_ok=True)

# Wait for VM logs to appear
max_retries = 6
retry_delay = 5
# Wait for VM logs to appear (CloudWatch agent ships in batches
# after the VM shuts down — give it up to 5 min to surface).
max_retries = 10
retry_delay = 30

for attempt in range(max_retries):
# Get log streams from run-specific log group
Expand Down
124 changes: 124 additions & 0 deletions tests/test_role_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,127 @@ def test_ensure_exists_creates_missing(self, mock_iam_client, roles_config):
assert arn == "arn:aws:iam::123:role/test"
assert created is True
mock_iam_client.create_role.assert_called_once()

# ------------------------------------------------------------------
# Instance-profile binding: regression tests for the "profile exists
# but role is not attached" foot-gun that caused VMs to boot without
# IAM credentials and never register with SSM.
# ------------------------------------------------------------------

@pytest.fixture
def mock_iam_with_profile_exceptions(self):
"""Mock IAM client with all the exception types we use."""
mock = Mock()
mock.exceptions.NoSuchEntityException = type("NoSuchEntityException", (Exception,), {})
mock.exceptions.EntityAlreadyExistsException = type("EntityAlreadyExistsException", (Exception,), {})
mock.exceptions.LimitExceededException = type("LimitExceededException", (Exception,), {})
return mock

@pytest.fixture
def ec2_role_config(self):
"""A role with ec2.amazonaws.com in its trust policy."""
return {
"vm-role": {
"description": "EC2 instance profile role",
"trust_policy": {
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {"Service": ["ec2.amazonaws.com", "ecs-tasks.amazonaws.com"]},
"Action": "sts:AssumeRole",
}
],
},
"policies": ["arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"],
}
}

def test_is_ec2_role_detects_ec2_principal(self, ec2_role_config):
"""Roles whose trust policy names ec2.amazonaws.com are EC2 roles."""
assert AWSRoleManager._is_ec2_role(ec2_role_config["vm-role"]) is True

def test_is_ec2_role_skips_non_ec2(self, roles_config):
"""ECS-only trust policies should not trigger instance-profile setup."""
assert AWSRoleManager._is_ec2_role(roles_config["ecsTaskExecutionRole"]) is False

def test_ensure_instance_profile_creates_and_binds(self, mock_iam_with_profile_exceptions):
"""Fresh profile: create it, then bind the role."""
client = mock_iam_with_profile_exceptions
client.get_instance_profile.return_value = {"InstanceProfile": {"Roles": []}}

manager = AWSRoleManager(client, {})
manager._ensure_instance_profile("vm-role")

client.create_instance_profile.assert_called_once_with(InstanceProfileName="vm-role")
client.add_role_to_instance_profile.assert_called_once_with(
InstanceProfileName="vm-role", RoleName="vm-role",
)

def test_ensure_instance_profile_repairs_empty_profile(self, mock_iam_with_profile_exceptions):
"""Regression: profile already exists but has no role — must bind.

This is the exact failure mode we hit in production: a previous partial
setup left an empty instance profile in place. RunInstances accepts it,
but the VM gets no IAM credentials and SSM never registers it.
"""
client = mock_iam_with_profile_exceptions
client.create_instance_profile.side_effect = client.exceptions.EntityAlreadyExistsException()
client.get_instance_profile.return_value = {"InstanceProfile": {"Roles": []}}

manager = AWSRoleManager(client, {})
manager._ensure_instance_profile("vm-role")

# Even though create raised, the bind must still happen.
client.add_role_to_instance_profile.assert_called_once_with(
InstanceProfileName="vm-role", RoleName="vm-role",
)

def test_ensure_instance_profile_idempotent_when_already_bound(self, mock_iam_with_profile_exceptions):
"""If our role is already bound, don't re-bind."""
client = mock_iam_with_profile_exceptions
client.create_instance_profile.side_effect = client.exceptions.EntityAlreadyExistsException()
client.get_instance_profile.return_value = {
"InstanceProfile": {"Roles": [{"RoleName": "vm-role"}]}
}

manager = AWSRoleManager(client, {})
manager._ensure_instance_profile("vm-role")

client.add_role_to_instance_profile.assert_not_called()

def test_ensure_instance_profile_preserves_foreign_binding(self, mock_iam_with_profile_exceptions):
"""A different role is bound — leave it, don't silently rebind."""
client = mock_iam_with_profile_exceptions
client.create_instance_profile.side_effect = client.exceptions.EntityAlreadyExistsException()
client.get_instance_profile.return_value = {
"InstanceProfile": {"Roles": [{"RoleName": "someone-else"}]}
}
# IAM enforces max 1 role per profile, so a real add would raise.
client.add_role_to_instance_profile.side_effect = client.exceptions.LimitExceededException()

manager = AWSRoleManager(client, {})
manager._ensure_instance_profile("vm-role") # must not raise

def test_ensure_exists_repairs_binding_when_role_already_present(
self, mock_iam_with_profile_exceptions, ec2_role_config,
):
"""The original bug: re-runs against an existing role never repaired
a drifted profile because the base ensure_exists short-circuits.
After the fix, ensure_exists must still run the binding check."""
client = mock_iam_with_profile_exceptions
# Role exists, so base ensure_exists short-circuits without create().
client.get_role.return_value = {"Role": {"Arn": "arn:aws:iam::123:role/vm-role"}}
# Profile exists but is empty (drifted state).
client.create_instance_profile.side_effect = client.exceptions.EntityAlreadyExistsException()
client.get_instance_profile.return_value = {"InstanceProfile": {"Roles": []}}

manager = AWSRoleManager(client, ec2_role_config)
manager.ensure_exists("vm-role", ec2_role_config["vm-role"])

# create_role must NOT be called — role already exists.
client.create_role.assert_not_called()
# But the binding repair MUST run.
client.add_role_to_instance_profile.assert_called_once_with(
InstanceProfileName="vm-role", RoleName="vm-role",
)
Loading