From 0f73e65d067ed2248183e059c8a2aceb7608196d Mon Sep 17 00:00:00 2001 From: Alexander Amiri Date: Tue, 31 Mar 2026 00:44:48 +0200 Subject: [PATCH 1/3] Improve Slack alert quality: show person, tags, reduce noise MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Show GitHub actor name in "By:" line and source line instead of just the CI role name (e.g. "Alexanderamiri via javabin-ci-infra (CI/CD)") - Pass tags to deletion and modification alerts (previously only on creation) - Consolidate related deletion events on same resource (DeleteBucket, DeleteBucketWebsite, DeleteBucketPublicAccessBlock → single alert) - Suppress SecurityHub/GuardDuty findings for resources recently managed by CI (10-minute window) to avoid noise from intentional changes - Unify footer functions: replace context_footer with _build_footer so IAM changes and console logins also show actor info --- terraform/lambda-src/slack_alert/handler.py | 98 +++++++++++++++++---- 1 file changed, 79 insertions(+), 19 deletions(-) diff --git a/terraform/lambda-src/slack_alert/handler.py b/terraform/lambda-src/slack_alert/handler.py index 9cd0e90..d8d9d3e 100644 --- a/terraform/lambda-src/slack_alert/handler.py +++ b/terraform/lambda-src/slack_alert/handler.py @@ -31,6 +31,11 @@ DEDUP_WINDOW = 300 # 5 minutes DEDUP_TTL_DAYS = 30 +# CI activity tracker: resource_name -> timestamp +# Suppresses SecurityHub/GuardDuty findings for resources actively managed by CI +_recent_ci_resources = {} +CI_SUPPRESS_WINDOW = 600 # 10 minutes + # --------------------------------------------------------------------------- # DynamoDB dedup — persistent dedup for Security Hub + compliance alerts @@ -69,6 +74,29 @@ def record_finding_alert(dedup_key): except Exception as e: logger.warning("DynamoDB dedup write failed: %s", e) +# --------------------------------------------------------------------------- +# CI resource suppression — track resources managed by CI, suppress noisy findings +# --------------------------------------------------------------------------- +def _track_ci_resource(resource_name): + """Record a resource being actively managed by CI.""" + if not resource_name: + return + now = time.time() + for k in list(_recent_ci_resources): + if now - _recent_ci_resources[k] > CI_SUPPRESS_WINDOW: + del _recent_ci_resources[k] + _recent_ci_resources[resource_name] = now + + +def _is_ci_managed_resource(resource_name): + """Check if a resource was recently managed by CI (within suppress window).""" + now = time.time() + for k in list(_recent_ci_resources): + if now - _recent_ci_resources[k] > CI_SUPPRESS_WINDOW: + del _recent_ci_resources[k] + return resource_name in _recent_ci_resources if resource_name else False + + # --------------------------------------------------------------------------- # Console URL builder — derives service from eventSource, minimal overrides # --------------------------------------------------------------------------- @@ -205,13 +233,19 @@ def _is_service_linked_role(detail): def is_duplicate(action, resource): key = f"{action}:{resource}" + resource_key = f"__resource__:{resource}" now = time.time() for k in list(_recent_alerts): if now - _recent_alerts[k] > DEDUP_WINDOW: del _recent_alerts[k] if key in _recent_alerts: return True + # For deletions: suppress if any delete on same resource happened recently + if action.startswith(("Delete", "Terminate")) and resource_key in _recent_alerts: + return True _recent_alerts[key] = now + if action.startswith(("Delete", "Terminate")): + _recent_alerts[resource_key] = now return False @@ -335,6 +369,9 @@ def parse_identity(detail): if app_match: ci_ctx["repo"] = f"{GITHUB_ORG_URL.split('/')[-1]}/{app_match.group(1)}" + gh_actor = ci_ctx.get("GitHubActor", "") + if gh_actor: + return f"{gh_actor} via {role_name} (CI/CD)", True, ci_ctx or None return f"{role_name} (CI/CD)", True, ci_ctx or None # Identity Center SSO: AWSReservedSSO_{permission-set}_{hash} @@ -376,7 +413,14 @@ def format_ci_source(ci_context): if run_id: parts.append(f"<{repo_url}/actions/runs/{run_id}|run>") - return f":robot_face: *Source:* {' / '.join(parts)}" if parts else ":robot_face: *Source:* CI/CD pipeline" + if not parts: + return ":robot_face: *Source:* CI/CD pipeline" + + source = f":robot_face: *Source:* {' / '.join(parts)}" + actor = ci_context.get("GitHubActor") + if actor: + source += f" — by {actor}" + return source def extract_resource_name(event_name, detail): @@ -472,19 +516,6 @@ def resource_link(resource_name, url): # --------------------------------------------------------------------------- # Block Kit formatters # --------------------------------------------------------------------------- -def context_footer(parsed): - """Standard context footer block.""" - account = parsed.get("account", "unknown") - region = parsed.get("region", "unknown") - ts = parsed.get("time", datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")) - return { - "type": "context", - "elements": [ - {"type": "mrkdwn", - "text": f"AWS Account {account} | {region} | {ts}"} - ] - } - def _extract_tags(detail): """Extract resource tags from CloudTrail event request/response if available.""" @@ -556,6 +587,7 @@ def format_resource_creation(parsed): cost_estimate = estimate_cost(event_name, detail) if is_ci: + _track_ci_resource(resource_name) header_text = f":package: {service or 'Resource'} Created (CI/CD)" source_line = format_ci_source(ci_ctx) else: @@ -666,10 +698,14 @@ def format_resource_modification(parsed): if is_duplicate(event_name, resource_name or "unknown"): return None + if is_ci: + _track_ci_resource(resource_name) + display_name = re.sub(r"\d{8}$", "", event_name) console_url = build_console_url(event_name, event_source, resource_name, region) res_display = resource_link(resource_name, console_url) if resource_name else "_unknown_" service = service_display_name(event_source) + tags = _extract_tags(detail) fields = [ {"type": "mrkdwn", "text": f"*Resource*\n{res_display}"}, @@ -714,7 +750,7 @@ def format_resource_modification(parsed): }) blocks.append({"type": "divider"}) - blocks.append(_build_footer(parsed, actor=actor)) + blocks.append(_build_footer(parsed, tags=tags, actor=actor)) fallback = f"{service or 'Resource'} modified: {display_name} on {resource_name or 'unknown'}" return {"blocks": blocks, "text": fallback} @@ -732,8 +768,13 @@ def format_resource_deletion(parsed): if is_duplicate(event_name, resource_name or "unknown"): return None + # Track CI-managed resources to suppress noisy SecurityHub/GuardDuty findings + if is_ci: + _track_ci_resource(resource_name) + display_name = re.sub(r"\d{8}$", "", event_name) service = service_display_name(event_source) + tags = _extract_tags(detail) fields = [ {"type": "mrkdwn", "text": f"*Resource*\n`{resource_name or 'unknown'}`"}, @@ -753,7 +794,7 @@ def format_resource_deletion(parsed): blocks.extend([ {"type": "section", "fields": fields}, {"type": "divider"}, - _build_footer(parsed, actor=actor), + _build_footer(parsed, tags=tags, actor=actor), ]) fallback = f"{service or 'Resource'} deleted: {resource_name or 'unknown'}" @@ -796,7 +837,7 @@ def format_compliance_alert(parsed): "text": {"type": "mrkdwn", "text": "_Tags should be applied via CI/CD. Check the deploying workflow._"}}, {"type": "divider"}, - context_footer(parsed), + _build_footer(parsed), ] fallback = f"Untagged resource: {short_type} — {resource_id}" @@ -850,7 +891,7 @@ def format_iam_change(parsed): }) blocks.append({"type": "divider"}) - blocks.append(context_footer(parsed)) + blocks.append(_build_footer(parsed, actor=actor)) fallback = f"IAM change: {event_name} on {target} by {actor}" return {"blocks": blocks, "text": fallback} @@ -874,7 +915,7 @@ def format_console_login(parsed): {"type": "mrkdwn", "text": f"*Region*\n{region}"}, ]}, {"type": "divider"}, - context_footer(parsed), + _build_footer(parsed, actor=actor), ] fallback = f"Console login: {actor} — {result}" @@ -891,6 +932,15 @@ def format_guardduty_finding(parsed): region = parsed.get("region", detail.get("region", "unknown")) account = parsed.get("account", detail.get("accountId", "unknown")) + # Suppress findings for resources recently managed by CI + resource = detail.get("resource", {}) + for s3_detail in resource.get("S3BucketDetails", []): + bucket_name = s3_detail.get("Name", "") + if _is_ci_managed_resource(bucket_name): + logger.info("GuardDuty finding suppressed (CI-managed resource): %s on %s", + finding_type, bucket_name) + return None + if severity >= 7: sev_emoji = ":red_circle:" sev_label = "HIGH" @@ -967,6 +1017,16 @@ def format_securityhub_finding(parsed): account = parsed.get("account", "unknown") finding_id = finding.get("Id", "") + # Suppress findings for resources recently managed by CI + for res in finding.get("Resources", []): + res_id = res.get("Id", "") + # Extract resource name from ARN (e.g. arn:aws:s3:::aws.javabin.no -> aws.javabin.no) + resource_name = res_id.split(":")[-1].split("/")[-1] if res_id else "" + if _is_ci_managed_resource(resource_name): + logger.info("Security Hub finding suppressed (CI-managed resource): %s on %s", + title, resource_name) + return None + # DynamoDB dedup — suppress if already alerted for this resource+finding dedup_key = _finding_dedup_key(finding) if is_finding_already_alerted(dedup_key): From 3258720e23149d333cdbcaa76c2eb6a13cd55d7e Mon Sep 17 00:00:00 2001 From: Alexander Amiri Date: Tue, 31 Mar 2026 01:12:00 +0200 Subject: [PATCH 2/3] Fix state backend tag: team=platform (not javabin) --- terraform/state/providers.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/state/providers.tf b/terraform/state/providers.tf index 08c39e6..33a6954 100644 --- a/terraform/state/providers.tf +++ b/terraform/state/providers.tf @@ -11,7 +11,7 @@ terraform { locals { required_tags = { - team = "javabin" + team = "platform" service = "state" repo = "javaBin/platform" environment = var.environment From a9956c4ea40bf34bab26d99b68444f7ece47b8b7 Mon Sep 17 00:00:00 2001 From: Alexander Amiri Date: Fri, 10 Apr 2026 14:50:57 +0200 Subject: [PATCH 3/3] Raise daily cost spike thresholds: $10 min cost and $10 min delta Filters out noisy alerts for services under $10/day and spikes where the absolute dollar difference is under $10. Matches ai-platform thresholds. --- .../lambda-src/daily_cost_check/handler.py | 18 ++++++++++++------ terraform/platform/lambdas/main.tf | 2 ++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/terraform/lambda-src/daily_cost_check/handler.py b/terraform/lambda-src/daily_cost_check/handler.py index cb2619f..5a9351b 100644 --- a/terraform/lambda-src/daily_cost_check/handler.py +++ b/terraform/lambda-src/daily_cost_check/handler.py @@ -17,7 +17,9 @@ COST_WEBHOOK_PARAM = os.environ["COST_WEBHOOK_PARAM"] SPIKE_THRESHOLD = float(os.environ.get("SPIKE_THRESHOLD", "1.2")) # 20% above average # Minimum daily spend (USD) to qualify as a spike — filters noise on tiny amounts -MIN_SPIKE_AMOUNT = float(os.environ.get("MIN_SPIKE_AMOUNT", "1.00")) +MIN_SPIKE_AMOUNT = float(os.environ.get("MIN_SPIKE_AMOUNT", "10.00")) +# Minimum absolute cost difference (USD) to qualify — ignores small dollar swings +MIN_SPIKE_DELTA = float(os.environ.get("MIN_SPIKE_DELTA", "10.00")) CUR_DATABASE = os.environ.get("CUR_DATABASE", "") CUR_TABLE = os.environ.get("CUR_TABLE", "") ATHENA_WORKGROUP = os.environ.get("ATHENA_WORKGROUP", "") @@ -188,7 +190,8 @@ def detect_spikes(yesterday_costs, avg_costs): """Return list of (service, yesterday, avg, pct_change) for spikes. Filters out services where yesterday's spend is below MIN_SPIKE_AMOUNT - to avoid alerting on insignificant absolute values with high percentage changes. + or where the absolute difference is below MIN_SPIKE_DELTA to avoid + alerting on insignificant cost changes. """ spikes = [] for svc, cost in yesterday_costs.items(): @@ -196,10 +199,13 @@ def detect_spikes(yesterday_costs, avg_costs): continue avg = avg_costs.get(svc, 0) if avg < 0.01: - spikes.append((svc, cost, 0, None)) + if cost >= MIN_SPIKE_DELTA: + spikes.append((svc, cost, 0, None)) elif cost > avg * SPIKE_THRESHOLD: - pct = ((cost - avg) / avg) * 100 - spikes.append((svc, cost, avg, pct)) + delta = cost - avg + if delta >= MIN_SPIKE_DELTA: + pct = (delta / avg) * 100 + spikes.append((svc, cost, avg, pct)) spikes.sort(key=lambda x: x[1] - x[2], reverse=True) return spikes @@ -317,7 +323,7 @@ def build_alert_blocks(spikes, spike_details, yesterday_date): "type": "context", "elements": [{"type": "mrkdwn", "text": f"<{overall_url}|View all in Cost Explorer> | Generated {ts} | " - f"Min spike threshold: ${MIN_SPIKE_AMOUNT:.2f} | NOK rate: ~{USD_TO_NOK}"}] + f"Min cost: ${MIN_SPIKE_AMOUNT:.2f} | Min delta: ${MIN_SPIKE_DELTA:.2f} | NOK rate: ~{USD_TO_NOK}"}] }) return blocks diff --git a/terraform/platform/lambdas/main.tf b/terraform/platform/lambdas/main.tf index 5894fbd..d4723f8 100644 --- a/terraform/platform/lambdas/main.tf +++ b/terraform/platform/lambdas/main.tf @@ -818,6 +818,8 @@ resource "aws_lambda_function" "daily_cost_check" { environment { variables = { COST_WEBHOOK_PARAM = "/javabin/slack/platform-cost-alerts-webhook" + MIN_SPIKE_AMOUNT = "10.00" + MIN_SPIKE_DELTA = "10.00" CUR_DATABASE = var.cur_glue_database CUR_TABLE = var.cur_glue_table ATHENA_WORKGROUP = var.athena_workgroup