From aa84ba971911399a697f4990c0aa486ceeb0c758 Mon Sep 17 00:00:00 2001 From: Denys Fedoryshchenko Date: Sat, 23 May 2026 00:57:05 +0300 Subject: [PATCH 1/2] Increase log waiting time Signed-off-by: Denys Fedoryshchenko --- src/kernel_ci_cloud_labs/core/pipeline.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/kernel_ci_cloud_labs/core/pipeline.py b/src/kernel_ci_cloud_labs/core/pipeline.py index f956f38..1849e5f 100644 --- a/src/kernel_ci_cloud_labs/core/pipeline.py +++ b/src/kernel_ci_cloud_labs/core/pipeline.py @@ -414,9 +414,10 @@ def run_pipeline( vms_dir = Path(run_dir) / "vms" vms_dir.mkdir(exist_ok=True) - # Wait for VM logs to appear - max_retries = 6 - retry_delay = 5 + # Wait for VM logs to appear (CloudWatch agent ships in batches + # after the VM shuts down — give it up to 5 min to surface). + max_retries = 10 + retry_delay = 30 for attempt in range(max_retries): # Get log streams from run-specific log group From 9ed9f9b2ebebac2834ed3aaafe93b2833295a12d Mon Sep 17 00:00:00 2001 From: Denys Fedoryshchenko Date: Sat, 23 May 2026 01:15:03 +0300 Subject: [PATCH 2/2] Role fix from incomplete setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A previous interrupted setup left the EC2 instance profile in place with no IAM role attached to it, so VMs booted with no credentials and the SSM agent could never register — which looked like "SSM agent not ready after 5 minutes". Fix: setup now repairs that drifted state on every run (binds the role to the profile if it's missing), instead of skipping the bind whenever the profile already existed. Signed-off-by: Denys Fedoryshchenko --- .../auth/aws_role_manager.py | 64 +++++++-- tests/test_role_manager.py | 124 ++++++++++++++++++ 2 files changed, 180 insertions(+), 8 deletions(-) diff --git a/src/kernel_ci_cloud_labs/auth/aws_role_manager.py b/src/kernel_ci_cloud_labs/auth/aws_role_manager.py index 8f52237..6706a39 100644 --- a/src/kernel_ci_cloud_labs/auth/aws_role_manager.py +++ b/src/kernel_ci_cloud_labs/auth/aws_role_manager.py @@ -22,6 +22,41 @@ def check_exists(self, resource_name: str) -> bool: except self.client.exceptions.NoSuchEntityException: return False + @staticmethod + def _is_ec2_role(resource_config: Dict[str, Any]) -> bool: + """Return True if the role's trust policy allows EC2 to assume it.""" + trust_policy = resource_config.get("trust_policy", {}) + principals = trust_policy.get("Statement", [{}])[0].get("Principal", {}) + return isinstance(principals, dict) and "ec2.amazonaws.com" in str(principals) + + def _ensure_instance_profile(self, name: str) -> None: + """Ensure the EC2 instance profile exists AND has the role bound. + + The two operations are independently idempotent. A previous partial + setup (or manual cleanup) can leave the profile in place with no + role attached — invisible to RunInstances, but the VM boots without + credentials so the SSM agent can never register, and downstream + SSM-based test execution fails with "SSM agent not ready". + """ + try: + self.client.create_instance_profile(InstanceProfileName=name) + except self.client.exceptions.EntityAlreadyExistsException: + pass + try: + profile = self.client.get_instance_profile(InstanceProfileName=name) + bound = {r["RoleName"] for r in profile["InstanceProfile"]["Roles"]} + except self.client.exceptions.NoSuchEntityException: + bound = set() + if name not in bound: + try: + self.client.add_role_to_instance_profile( + InstanceProfileName=name, RoleName=name, + ) + except self.client.exceptions.LimitExceededException: + # A different role is already bound to this profile; + # leave it alone rather than silently rebinding. + pass + def delete_role(self, resource_name: str) -> None: """Delete IAM role and detach policies""" try: @@ -69,17 +104,30 @@ def create(self, resource_name: str, resource_config: Dict[str, Any]) -> str: ) # Create instance profile for EC2 roles - trust_policy = resource_config.get("trust_policy", {}) - principals = trust_policy.get("Statement", [{}])[0].get("Principal", {}) - if isinstance(principals, dict) and "ec2.amazonaws.com" in str(principals): - try: - self.client.create_instance_profile(InstanceProfileName=resource_name) - self.client.add_role_to_instance_profile(InstanceProfileName=resource_name, RoleName=resource_name) - except self.client.exceptions.EntityAlreadyExistsException: - pass + if self._is_ec2_role(resource_config): + self._ensure_instance_profile(resource_name) return response["Role"]["Arn"] + def ensure_exists( + self, + resource_name: str, + resource_config: Dict[str, Any] = None, + force_recreate: bool = False, + ): + """Ensure role exists; also (re-)verify EC2 instance-profile binding. + + The base implementation short-circuits when the role already exists, + which means a drifted instance profile (profile present but no role + attached) is never repaired on re-run. We always run the binding + check after the base call so re-runs heal that state. + """ + result = super().ensure_exists(resource_name, resource_config, force_recreate) + config = resource_config if resource_config is not None else self.config.get(resource_name, {}) + if self._is_ec2_role(config): + self._ensure_instance_profile(resource_name) + return result + def get_identifier(self, resource_name: str) -> str: """Get role ARN""" response = self.client.get_role(RoleName=resource_name) diff --git a/tests/test_role_manager.py b/tests/test_role_manager.py index 700ef3c..c3afbfb 100644 --- a/tests/test_role_manager.py +++ b/tests/test_role_manager.py @@ -95,3 +95,127 @@ def test_ensure_exists_creates_missing(self, mock_iam_client, roles_config): assert arn == "arn:aws:iam::123:role/test" assert created is True mock_iam_client.create_role.assert_called_once() + + # ------------------------------------------------------------------ + # Instance-profile binding: regression tests for the "profile exists + # but role is not attached" foot-gun that caused VMs to boot without + # IAM credentials and never register with SSM. + # ------------------------------------------------------------------ + + @pytest.fixture + def mock_iam_with_profile_exceptions(self): + """Mock IAM client with all the exception types we use.""" + mock = Mock() + mock.exceptions.NoSuchEntityException = type("NoSuchEntityException", (Exception,), {}) + mock.exceptions.EntityAlreadyExistsException = type("EntityAlreadyExistsException", (Exception,), {}) + mock.exceptions.LimitExceededException = type("LimitExceededException", (Exception,), {}) + return mock + + @pytest.fixture + def ec2_role_config(self): + """A role with ec2.amazonaws.com in its trust policy.""" + return { + "vm-role": { + "description": "EC2 instance profile role", + "trust_policy": { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": {"Service": ["ec2.amazonaws.com", "ecs-tasks.amazonaws.com"]}, + "Action": "sts:AssumeRole", + } + ], + }, + "policies": ["arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"], + } + } + + def test_is_ec2_role_detects_ec2_principal(self, ec2_role_config): + """Roles whose trust policy names ec2.amazonaws.com are EC2 roles.""" + assert AWSRoleManager._is_ec2_role(ec2_role_config["vm-role"]) is True + + def test_is_ec2_role_skips_non_ec2(self, roles_config): + """ECS-only trust policies should not trigger instance-profile setup.""" + assert AWSRoleManager._is_ec2_role(roles_config["ecsTaskExecutionRole"]) is False + + def test_ensure_instance_profile_creates_and_binds(self, mock_iam_with_profile_exceptions): + """Fresh profile: create it, then bind the role.""" + client = mock_iam_with_profile_exceptions + client.get_instance_profile.return_value = {"InstanceProfile": {"Roles": []}} + + manager = AWSRoleManager(client, {}) + manager._ensure_instance_profile("vm-role") + + client.create_instance_profile.assert_called_once_with(InstanceProfileName="vm-role") + client.add_role_to_instance_profile.assert_called_once_with( + InstanceProfileName="vm-role", RoleName="vm-role", + ) + + def test_ensure_instance_profile_repairs_empty_profile(self, mock_iam_with_profile_exceptions): + """Regression: profile already exists but has no role — must bind. + + This is the exact failure mode we hit in production: a previous partial + setup left an empty instance profile in place. RunInstances accepts it, + but the VM gets no IAM credentials and SSM never registers it. + """ + client = mock_iam_with_profile_exceptions + client.create_instance_profile.side_effect = client.exceptions.EntityAlreadyExistsException() + client.get_instance_profile.return_value = {"InstanceProfile": {"Roles": []}} + + manager = AWSRoleManager(client, {}) + manager._ensure_instance_profile("vm-role") + + # Even though create raised, the bind must still happen. + client.add_role_to_instance_profile.assert_called_once_with( + InstanceProfileName="vm-role", RoleName="vm-role", + ) + + def test_ensure_instance_profile_idempotent_when_already_bound(self, mock_iam_with_profile_exceptions): + """If our role is already bound, don't re-bind.""" + client = mock_iam_with_profile_exceptions + client.create_instance_profile.side_effect = client.exceptions.EntityAlreadyExistsException() + client.get_instance_profile.return_value = { + "InstanceProfile": {"Roles": [{"RoleName": "vm-role"}]} + } + + manager = AWSRoleManager(client, {}) + manager._ensure_instance_profile("vm-role") + + client.add_role_to_instance_profile.assert_not_called() + + def test_ensure_instance_profile_preserves_foreign_binding(self, mock_iam_with_profile_exceptions): + """A different role is bound — leave it, don't silently rebind.""" + client = mock_iam_with_profile_exceptions + client.create_instance_profile.side_effect = client.exceptions.EntityAlreadyExistsException() + client.get_instance_profile.return_value = { + "InstanceProfile": {"Roles": [{"RoleName": "someone-else"}]} + } + # IAM enforces max 1 role per profile, so a real add would raise. + client.add_role_to_instance_profile.side_effect = client.exceptions.LimitExceededException() + + manager = AWSRoleManager(client, {}) + manager._ensure_instance_profile("vm-role") # must not raise + + def test_ensure_exists_repairs_binding_when_role_already_present( + self, mock_iam_with_profile_exceptions, ec2_role_config, + ): + """The original bug: re-runs against an existing role never repaired + a drifted profile because the base ensure_exists short-circuits. + After the fix, ensure_exists must still run the binding check.""" + client = mock_iam_with_profile_exceptions + # Role exists, so base ensure_exists short-circuits without create(). + client.get_role.return_value = {"Role": {"Arn": "arn:aws:iam::123:role/vm-role"}} + # Profile exists but is empty (drifted state). + client.create_instance_profile.side_effect = client.exceptions.EntityAlreadyExistsException() + client.get_instance_profile.return_value = {"InstanceProfile": {"Roles": []}} + + manager = AWSRoleManager(client, ec2_role_config) + manager.ensure_exists("vm-role", ec2_role_config["vm-role"]) + + # create_role must NOT be called — role already exists. + client.create_role.assert_not_called() + # But the binding repair MUST run. + client.add_role_to_instance_profile.assert_called_once_with( + InstanceProfileName="vm-role", RoleName="vm-role", + )