sidjha57

AWS Infra

Apr 9th, 2025 (edited)
80
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 36.54 KB | None | 0 0
  1. from aws_cdk import (
  2.     Stack,
  3.     RemovalPolicy,
  4.     Duration,
  5.     CfnOutput,
  6.     aws_ec2 as ec2,
  7.     aws_iam as iam,
  8.     aws_rds as rds,
  9.     aws_s3 as s3,
  10.     aws_ecr as ecr,
  11.     aws_autoscaling as autoscaling,
  12.     aws_elasticloadbalancingv2 as elbv2,
  13.     aws_secretsmanager as secretsmanager,
  14.     aws_logs as logs,
  15.     # aws_grafana as grafana, # Grafana not used in the final setup, commented out
  16.     aws_cloudfront as cloudfront,
  17.     aws_cloudfront_origins as origins,
  18.     aws_ssm as ssm
  19. )
  20. from constructs import Construct
  21.  
  22. class CalorieMitraStack(Stack):
  23.     def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
  24.         super().__init__(scope, construct_id, **kwargs)
  25.  
  26.         # --- Get context variables ---
  27.         app_name = self.node.try_get_context("app_name") or "CalorieMitra"
  28.         app_port = self.node.try_get_context("app_port") or 8000
  29.         container_port = 8000 # Set based on your Dockerfile EXPOSE
  30.         docker_image_tag_context = self.node.try_get_context("docker_image_tag") or "latest"
  31.         app_secret_arn = self.node.try_get_context("app_secret_arn")
  32.         acm_certificate_arn = self.node.try_get_context("acm_certificate_arn")
  33.  
  34.         if not app_secret_arn:
  35.              raise ValueError("Missing required secret ARN in CDK context (app_secret_arn)")
  36.         if not acm_certificate_arn:
  37.              raise ValueError("Missing required ACM Certificate ARN in CDK context (acm_certificate_arn)") # Ensure ACM ARN is provided for HTTPS
  38.  
  39.         ssm_image_tag_parameter_name = f"/app/{app_name.lower()}/image-tag"
  40.  
  41.         # --- Networking ---
  42.         # VPC with Public and Isolated Subnets. Public subnets allow EC2 instances
  43.         # to access the internet (and AWS public endpoints) via the Internet Gateway.
  44.         # Isolated subnets provide a secure location for the RDS database.
  45.         vpc = ec2.Vpc(
  46.             self, f"{app_name}Vpc",
  47.             max_azs=2,
  48.             cidr="10.10.0.0/16",
  49.             nat_gateways=0, # No NAT Gateways needed as EC2 is in Public Subnets
  50.             subnet_configuration=[
  51.                 ec2.SubnetConfiguration(name="public", subnet_type=ec2.SubnetType.PUBLIC, cidr_mask=24),
  52.                 # Private subnet is ISOLATED for RDS
  53.                 ec2.SubnetConfiguration(name="private", subnet_type=ec2.SubnetType.PRIVATE_ISOLATED, cidr_mask=24),
  54.             ]
  55.         )
  56.  
  57.         # --- VPC Endpoints (Cost Optimization Applied) ---
  58.         # Keep ONLY the S3 Gateway Endpoint. It's free and keeps S3 traffic
  59.         # within the AWS network backbone, enhancing security and potentially performance.
  60.         s3_gateway_endpoint = vpc.add_gateway_endpoint(
  61.             "S3GatewayEndpoint",
  62.             service=ec2.GatewayVpcEndpointAwsService.S3
  63.         )
  64.  
  65.         # REMOVED: Interface Endpoint Security Group. Not needed as Interface Endpoints are removed.
  66.         # vpc_endpoint_sg = ec2.SecurityGroup(...)
  67.  
  68.         # REMOVED: All Interface Endpoints (ECR, Secrets Manager, SSM, Logs, EC2/SSM Messages).
  69.         # These were causing high costs. Instances in Public Subnets will now access these
  70.         # services via their public endpoints through the Internet Gateway.
  71.         # vpc.add_interface_endpoint("EcrDockerEndpoint", ...)
  72.         # vpc.add_interface_endpoint("EcrApiEndpoint", ...)
  73.         # vpc.add_interface_endpoint("SecretsManagerEndpoint", ...)
  74.         # vpc.add_interface_endpoint("SsmEndpoint", ...)
  75.         # vpc.add_interface_endpoint("LogsEndpoint", ...)
  76.         # vpc.add_interface_endpoint("Ec2MessagesEndpoint", ...)
  77.         # vpc.add_interface_endpoint("SsmMessagesEndpoint", ...)
  78.  
  79.  
  80.         # --- Security Groups ---
  81.         # ALB Security Group: Allows internet traffic on HTTP/HTTPS.
  82.         alb_sg = ec2.SecurityGroup(
  83.             self, f"{app_name}AlbSg",
  84.             vpc=vpc,
  85.             description="ALB SG allowing HTTP/HTTPS ingress",
  86.             allow_all_outbound=True # Standard practice for ALBs
  87.         )
  88.         alb_sg.add_ingress_rule(ec2.Peer.any_ipv4(), ec2.Port.tcp(80), "Allow HTTP for redirect")
  89.         alb_sg.add_ingress_rule(ec2.Peer.any_ipv4(), ec2.Port.tcp(443), "Allow HTTPS from internet")
  90.  
  91.         # EC2 Instance Security Group: Critical for security.
  92.         ec2_sg = ec2.SecurityGroup(
  93.             self, f"{app_name}Ec2Sg",
  94.             vpc=vpc,
  95.             description="EC2 Instance SG allowing traffic only from ALB and required outbound",
  96.             # IMPORTANT: Allow all outbound is needed for instances in Public Subnets
  97.             # to reach AWS public endpoints (ECR, SSM, etc.) and OS update repos via the IGW.
  98.             # While less strict than fully private, it's necessary for this cost-optimized path.
  99.             allow_all_outbound=True
  100.         )
  101.         # Allow traffic ONLY from the ALB on the application port. Prevents direct access to instances.
  102.         ec2_sg.add_ingress_rule(
  103.             alb_sg, ec2.Port.tcp(app_port),
  104.             f"Allow traffic from ALB on port {app_port}"
  105.         )
  106.         # REMOVED: Ingress rule for SSM via HTTPS. Not needed as SSM agent uses outbound connection.
  107.  
  108.         # RDS Security Group: Isolates the database.
  109.         rds_sg = ec2.SecurityGroup(
  110.             self, f"{app_name}RdsSg",
  111.             vpc=vpc,
  112.             description="RDS SG allowing traffic only from EC2 SG",
  113.             # Allow outbound is less critical here, but True is acceptable default.
  114.             # Could be restricted further if desired, but usually not necessary.
  115.             allow_all_outbound=True
  116.         )
  117.         # Allow traffic only from application EC2 instances on the standard PostgreSQL port.
  118.         rds_sg.add_ingress_rule(
  119.             ec2_sg, ec2.Port.tcp(5432), # Standard PostgreSQL port
  120.             "Allow DB traffic from EC2"
  121.         )
  122.  
  123.         # REMOVED: Rule allowing EC2 to connect to VPC Interface Endpoints SG. No longer needed.
  124.         # vpc_endpoint_sg.add_ingress_rule(ec2_sg, ec2.Port.tcp(443), "Allow EC2 instances to connect to Endpoints")
  125.  
  126.  
  127.         # --- ECR Repository ---
  128.         # Securely stores Docker images. IAM permissions control access.
  129.         ecr_repository = ecr.Repository(
  130.             self, f"{app_name}EcrRepo", repository_name=f"{app_name.lower()}-app-repo",
  131.             removal_policy=RemovalPolicy.RETAIN, # Keep repo even if stack is deleted
  132.             image_tag_mutability=ecr.TagMutability.MUTABLE # Allows overwriting tags like 'latest'
  133.         )
  134.  
  135.         # --- S3 Bucket ---
  136.         # Securely stores meal images. Access restricted via CloudFront OAC and IAM.
  137.         meal_images_bucket = s3.Bucket(
  138.             self, f"{app_name}MealImagesBucket",
  139.             block_public_access=s3.BlockPublicAccess.BLOCK_ALL, # Essential for security
  140.             encryption=s3.BucketEncryption.S3_MANAGED, # Server-side encryption
  141.             removal_policy=RemovalPolicy.RETAIN, # Keep bucket even if stack is deleted
  142.             auto_delete_objects=False, # Safer default for production
  143.             lifecycle_rules=[s3.LifecycleRule(id="MoveToIA", enabled=True, transitions=[
  144.                 s3.Transition(storage_class=s3.StorageClass.INFREQUENT_ACCESS, transition_after=Duration.days(90))])] # Cost optimization
  145.         )
  146.  
  147.         # --- Cloudfront Distribution with Origin Access Control (OAC) ---
  148.         # OAC is the recommended secure way for CloudFront to access S3 privately.
  149.         cfn_oac = cloudfront.CfnOriginAccessControl(self, f"{app_name}OAC",
  150.             origin_access_control_config=cloudfront.CfnOriginAccessControl.OriginAccessControlConfigProperty(
  151.                 name=f"{app_name}-OAC-{self.region}", origin_access_control_origin_type="s3",
  152.                 signing_behavior="always", signing_protocol="sigv4", description="OAC for S3 Bucket"))
  153.  
  154.         cloudfront_distribution = cloudfront.Distribution(
  155.             self, f"{app_name}CloudfrontDistribution",
  156.             default_behavior=cloudfront.BehaviorOptions(
  157.                 origin=origins.S3Origin(meal_images_bucket), # Using the secure S3 origin
  158.                 viewer_protocol_policy=cloudfront.ViewerProtocolPolicy.REDIRECT_TO_HTTPS, # Enforce HTTPS
  159.                 cache_policy=cloudfront.CachePolicy.CACHING_OPTIMIZED), # Standard caching policy
  160.             comment=f"{app_name} CDN for Meal Images")
  161.  
  162.         # --- Add OAC configuration to the CloudFront Distribution ---
  163.         # This overrides the default settings to ensure OAC is used.
  164.         cfn_distribution = cloudfront_distribution.node.default_child
  165.         cfn_distribution.add_property_override("DistributionConfig.Origins.0.OriginAccessControlId", cfn_oac.attr_id)
  166.         # Remove the legacy Origin Access Identity if it exists (OAC replaces OAI)
  167.         cfn_distribution.add_property_override("DistributionConfig.Origins.0.S3OriginConfig.OriginAccessIdentity", "")
  168.  
  169.         # --- Update S3 Bucket Policy to allow CloudFront access via OAC ---
  170.         # This policy grants CloudFront permission to GetObject, restricted by the distribution ARN.
  171.         meal_images_bucket.add_to_resource_policy(
  172.             iam.PolicyStatement(
  173.                 actions=["s3:GetObject"],
  174.                 resources=[meal_images_bucket.arn_for_objects("*")],
  175.                 principals=[iam.ServicePrincipal("cloudfront.amazonaws.com")],
  176.                 # Condition ensures only THIS CloudFront distribution can access the bucket
  177.                 conditions={"StringEquals": {
  178.                     "AWS:SourceArn": f"arn:aws:cloudfront::{self.account}:distribution/{cloudfront_distribution.distribution_id}"
  179.                 }}
  180.             )
  181.         )
  182.  
  183.         # --- RDS PostgreSQL Instance ---
  184.         # Securely placed in PRIVATE_ISOLATED subnets.
  185.         db_instance_name = "defaultdb"
  186.         db_instance = rds.DatabaseInstance(
  187.             self, f"{app_name}RdsInstance",
  188.             engine=rds.DatabaseInstanceEngine.postgres(version=rds.PostgresEngineVersion.VER_15),
  189.             instance_type=ec2.InstanceType.of(ec2.InstanceClass.BURSTABLE4_GRAVITON, ec2.InstanceSize.MEDIUM),
  190.             vpc=vpc,
  191.             # Place RDS in the PRIVATE_ISOLATED subnets for maximum network security
  192.             vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_ISOLATED),
  193.             security_groups=[rds_sg], # Apply the restrictive RDS Security Group
  194.             allocated_storage=50,
  195.             storage_type=rds.StorageType.GP3, # Cost-effective GP3 storage
  196.             database_name=db_instance_name,
  197.             multi_az=False, # Set to True for production HA (higher cost)
  198.             backup_retention=Duration.days(7), # Adjust as needed
  199.             delete_automated_backups=False, # Keep backups if instance deleted (safer)
  200.             deletion_protection=True, # Prevent accidental deletion (recommended for prod)
  201.             copy_tags_to_snapshot=True,
  202.             enable_performance_insights=True, # Useful for debugging performance
  203.             removal_policy=RemovalPolicy.RETAIN # Keep DB even if stack is deleted
  204.         )
  205.  
  206.         # --- SSM Parameter for Docker Image Tag ---
  207.         # Securely stores the deployment tag, accessible via IAM role.
  208.         ssm_image_tag_parameter = ssm.StringParameter(
  209.             self, f"{app_name}ImageTagParam",
  210.             parameter_name=ssm_image_tag_parameter_name,
  211.             string_value=docker_image_tag_context,
  212.             description=f"Docker image tag for {app_name}",
  213.             tier=ssm.ParameterTier.STANDARD
  214.         )
  215.  
  216.         # --- IAM Role for EC2 Instances ---
  217.         # Defines permissions for EC2 instances, following least privilege principle.
  218.         ec2_role = iam.Role(
  219.             self, f"{app_name}Ec2Role",
  220.             assumed_by=iam.ServicePrincipal("ec2.amazonaws.com"),
  221.             managed_policies=[
  222.                 # Required for SSM Agent to function (used for management, patching, etc.)
  223.                 iam.ManagedPolicy.from_aws_managed_policy_name("AmazonSSMManagedInstanceCore"),
  224.                 # Required for CloudWatch Agent to send logs/metrics
  225.                 iam.ManagedPolicy.from_aws_managed_policy_name("CloudWatchAgentServerPolicy"),
  226.                 # Required to pull images from ECR
  227.                 iam.ManagedPolicy.from_aws_managed_policy_name("AmazonEC2ContainerRegistryReadOnly")
  228.             ]
  229.         )
  230.  
  231.         # Grant specific permissions needed by the application and startup script.
  232.         # Log permissions (scoped to specific log groups)
  233.         ec2_role.add_to_policy(iam.PolicyStatement(
  234.             effect=iam.Effect.ALLOW,
  235.             actions=[
  236.                 "logs:CreateLogGroup",
  237.                 "logs:CreateLogStream",
  238.                 "logs:PutLogEvents",
  239.                 "logs:DescribeLogStreams",
  240.                 "logs:DescribeLogGroups"
  241.             ],
  242.             resources=[
  243.                 f"arn:aws:logs:{self.region}:{self.account}:log-group:/app/{app_name.lower()}/*",
  244.                 f"arn:aws:logs:{self.region}:{self.account}:log-group:/app/{app_name.lower()}/*:*",
  245.                 f"arn:aws:logs:{self.region}:{self.account}:log-group:/var/log/user-data*",
  246.                 f"arn:aws:logs:{self.region}:{self.account}:log-group:/var/log/user-data*:*"
  247.             ]
  248.         ))
  249.         # SSM Parameter Read permission (scoped to the specific parameter)
  250.         ec2_role.add_to_policy(iam.PolicyStatement(
  251.             effect=iam.Effect.ALLOW,
  252.             actions=["ssm:GetParameter", "ssm:GetParameters", "ssm:GetParametersByPath"],
  253.             resources=[ssm_image_tag_parameter.parameter_arn] # Use parameter ARN for least privilege
  254.         ))
  255.         # Autoscaling permissions (needed for UserData error handling to set instance unhealthy)
  256.         ec2_role.add_to_policy(iam.PolicyStatement(
  257.             effect=iam.Effect.ALLOW,
  258.             actions=["autoscaling:SetInstanceHealth", "autoscaling:DescribeAutoScalingInstances"],
  259.             # Scoping this further requires knowing the ASG ARN or using conditions,
  260.             # "*" is often acceptable here for simplicity.
  261.             resources=["*"]
  262.         ))
  263.  
  264.         # Grant resource-based permissions (simpler way to grant access)
  265.         ecr_repository.grant_pull(ec2_role) # Allow role to pull from this ECR repo
  266.         meal_images_bucket.grant_read_write(ec2_role) # Allow role to R/W to this S3 bucket
  267.         app_secret = secretsmanager.Secret.from_secret_complete_arn(self, "AppSecret", app_secret_arn)
  268.         app_secret.grant_read(ec2_role) # Allow role to read the specified secret
  269.         # Note: ssm_image_tag_parameter.grant_read(ec2_role) could also be used instead of the policy above
  270.  
  271.         # --- CloudWatch Log Group for Application Logs ---
  272.         # Centralized logging for the Docker container via awslogs driver.
  273.         app_log_group = logs.LogGroup(
  274.             self, f"{app_name}AppLogGroup",
  275.             log_group_name=f"/app/{app_name.lower()}/docker-logs",
  276.             retention=logs.RetentionDays.TWO_WEEKS, # Adjust retention as needed
  277.             removal_policy=RemovalPolicy.DESTROY # OK to destroy log group if stack deleted
  278.         )
  279.         # Granting write is implicitly handled by CloudWatchAgentServerPolicy and docker config,
  280.         # but explicit grant doesn't hurt.
  281.         app_log_group.grant_write(ec2_role)
  282.  
  283.         # --- User Data Script ---
  284.         # This script runs on instance launch to set up the environment and start the application.
  285.         # No changes needed here due to endpoint removal, as AWS CLI/SDK calls will
  286.         # automatically use the IGW path from the public subnet.
  287.         app_log_group_name = app_log_group.log_group_name
  288.         custom_ami_id = "ami-04c5ae52f42a72334"
  289.  
  290.         user_data_script = f"""#!/bin/bash -xeu
  291. # -x: print commands, -e: exit on error, -u: treat unset variables as error
  292. set -o pipefail
  293.  
  294. # Redirect stdout/stderr to a log file and to the system logger
  295. exec > >(tee /var/log/user-data.log | logger -t user-data -s 2>/dev/console) 2>&1
  296.  
  297. # --- Basic Logging Function (Output to stderr) ---
  298. log() {{
  299.    # Use >&2 to redirect echo output to standard error
  300.    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" >&2
  301. }}
  302.  
  303. log "--- UserData Script Start (Using Custom AMI: {custom_ami_id}) ---"
  304.  
  305. # --- Essential Variables (Injected by CDK f-string) ---
  306. APP_PORT="{app_port}"
  307. CONTAINER_PORT="{container_port}"
  308. AWS_REGION="{self.region}"
  309. AWS_ACCOUNT_ID="{self.account}"
  310. APP_SECRET_ARN="{app_secret_arn}"
  311. SSM_PARAM_NAME="{ssm_image_tag_parameter_name}"
  312. ECR_REPOSITORY_NAME="{ecr_repository.repository_name}"
  313. DB_ENDPOINT_ADDRESS="{db_instance.db_instance_endpoint_address}"
  314. DB_ENDPOINT_PORT="{db_instance.db_instance_endpoint_port}"
  315. DB_NAME="{db_instance_name}"
  316. S3_BUCKET="{meal_images_bucket.bucket_name}"
  317. APP_LOG_GROUP_NAME="{app_log_group_name}" # Use the variable passed from CDK
  318. # -----------------------------------------------------
  319.  
  320. # --- Error Handling Setup ---
  321. handle_error() {{
  322.    local error_msg="$1"
  323.    local exit_code="${{2:-1}}"
  324.    log "ERROR: $error_msg (Exit Code: $exit_code)"
  325.    TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 60" || true)
  326.    INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-id || curl -s http://169.254.169.254/latest/meta-data/instance-id || echo "unknown")
  327.    if [[ "$INSTANCE_ID" != "unknown" ]] && command -v aws &> /dev/null; then
  328.        log "Attempting to signal ASG Unhealthy for instance $INSTANCE_ID in region $AWS_REGION..."
  329.        aws autoscaling set-instance-health --instance-id "$INSTANCE_ID" --health-status Unhealthy --region "$AWS_REGION" || \\
  330.            log "Warning: Failed to set instance health via AWS CLI. Check IAM permissions (autoscaling:SetInstanceHealth)."
  331.    else
  332.        log "Warning: Could not get Instance ID or AWS CLI not available. Cannot signal ASG."
  333.    fi
  334.    exit $exit_code
  335. }}
  336. trap 'handle_error "Script failed on line $LINENO" $?' ERR
  337.  
  338. # --- Verify Prerequisites (Should be installed in AMI) ---
  339. log "Verifying prerequisite tools (should be installed in AMI)..."
  340. command -v docker >/dev/null || handle_error "Docker command not found. AMI build might have failed."
  341. command -v aws >/dev/null || handle_error "AWS CLI command not found. AMI build might have failed."
  342. command -v jq >/dev/null || handle_error "jq command not found. AMI build might have failed."
  343. # command -v git >/dev/null || handle_error "git command not found. AMI build might have failed." # Check if needed
  344. command -v nc >/dev/null || handle_error "netcat (nc) command not found. AMI build might have failed."
  345. log "Prerequisite tools verified."
  346.  
  347. # --- Configure and Start Docker ---
  348. log "Configuring Docker log driver (awslogs)..."
  349. sudo mkdir -p /etc/docker # Ensure directory exists
  350. sudo tee /etc/docker/daemon.json > /dev/null <<EOF
  351. {{
  352.  "log-driver": "awslogs",
  353.  "log-opts": {{
  354.    "awslogs-region": "$AWS_REGION",
  355.    "awslogs-group": "$APP_LOG_GROUP_NAME",
  356.    "awslogs-create-group": "true",
  357.    "tag": "{{{{.Name}}}}/{{{{.ID}}}}"
  358.  }}
  359. }}
  360. EOF
  361. log "Docker daemon.json configured for awslogs."
  362.  
  363. log "Ensuring Docker service is enabled and restarting it..."
  364. # Docker service should be enabled by the AMI build process
  365. # Restart is needed to apply the daemon.json configuration
  366. sudo systemctl restart docker
  367. sleep 5 # Give docker a moment to restart
  368. sudo systemctl is-active --quiet docker || handle_error "Docker service failed to restart after configuration. Check /etc/docker/daemon.json syntax and system logs (journalctl -u docker.service)."
  369. # Add ubuntu user to docker group if not already added (useful for potential debugging)
  370. sudo usermod -a -G docker ubuntu || log "Warning: Failed to add 'ubuntu' user to docker group (might already exist)."
  371. log "Docker service configured and running."
  372.  
  373.  
  374. # --- Fetch Secrets Function (with retries) ---
  375. function get_secret {{
  376.    local secret_arn="$1"
  377.    local retries=3
  378.    local count=0
  379.    local delay=5
  380.    local secret_json=""
  381.  
  382.    log "Attempting to fetch secret: $secret_arn (Retry logic active)"
  383.    while [ $count -lt $retries ]; do
  384.        # AWS CLI call will use IAM role and network path (IGW in this case)
  385.        secret_json=$(aws secretsmanager get-secret-value --secret-id "$secret_arn" --region "$AWS_REGION" --query SecretString --output text 2>/var/log/awscli_secret_error.log)
  386.        local aws_exit_code=$?
  387.        if [ $aws_exit_code -eq 0 ] && [ -n "$secret_json" ] && [ "$secret_json" != "null" ]; then
  388.            # Validate JSON structure
  389.            if echo "$secret_json" | jq -e . > /dev/null; then
  390.                log "Successfully fetched and validated secret JSON (attempt $((count+1))/$retries)."
  391.                rm -f /var/log/awscli_secret_error.log
  392.                echo "$secret_json" # Output ONLY the JSON
  393.                return 0
  394.            else
  395.                log "Warning: Fetched secret for $secret_arn is not valid JSON (attempt $((count+1))/$retries). Retrying in $delay seconds..."
  396.            fi
  397.        else
  398.            log "Warning: Failed to fetch secret $secret_arn (AWS CLI exit code: $aws_exit_code, attempt $((count+1))/$retries). Check /var/log/awscli_secret_error.log. Retrying in $delay seconds..."
  399.        fi
  400.        count=$((count+1))
  401.        sleep $delay
  402.    done
  403.    log "ERROR: Failed to fetch valid secret JSON for $secret_arn after $retries attempts."
  404.    cat /var/log/awscli_secret_error.log # Log the actual error
  405.    return 1
  406. }}
  407.  
  408. log "Fetching App secrets from ARN: $APP_SECRET_ARN"
  409. APP_SECRET_JSON=$(get_secret "$APP_SECRET_ARN") || handle_error "Failed to fetch App secrets from $APP_SECRET_ARN after retries" $?
  410. # Avoid logging secrets to UserData logs if possible
  411. log "Successfully fetched secrets."
  412.  
  413. # Parse secrets using jq (ensure required fields are present)
  414. log "Parsing essential secrets..."
  415. GEMINI_API_KEY=$(echo "$APP_SECRET_JSON" | jq -r '.GEMINI_API_KEY // ""')
  416. ADMIN_EMAILS=$(echo "$APP_SECRET_JSON" | jq -r '.ADMIN_EMAILS // ""')
  417. RATE_LIMIT_MAX=$(echo "$APP_SECRET_JSON" | jq -r '.RATE_LIMIT_MAX // ""')
  418. ENABLE_SWAGGER=$(echo "$APP_SECRET_JSON" | jq -r '.ENABLE_SWAGGER // ""')
  419. CLOUDFRONT_DOMAIN_NAME=$(echo "$APP_SECRET_JSON" | jq -r '.CLOUDFRONT_DOMAIN_NAME // ""')
  420. ENDPOINT=$(echo "$APP_SECRET_JSON" | jq -r '.ENDPOINT // ""')
  421. FIREBASE_PROJECT_ID=$(echo "$APP_SECRET_JSON" | jq -r '.FIREBASE_PROJECT_ID // ""')
  422. FIREBASE_CLIENT_EMAIL=$(echo "$APP_SECRET_JSON" | jq -r '.FIREBASE_CLIENT_EMAIL // ""')
  423. FIREBASE_PRIVATE_KEY=$(echo "$APP_SECRET_JSON" | jq -r '.FIREBASE_PRIVATE_KEY // ""')
  424. DB_USERNAME=$(echo "$APP_SECRET_JSON" | jq -r '.DB_USERNAME // ""')
  425. DB_PASSWORD=$(echo "$APP_SECRET_JSON" | jq -r '.DB_PASSWORD // ""')
  426.  
  427. # Validate essential secrets that are critical for startup
  428. if [ -z "$DB_USERNAME" ]; then handle_error "DB_USERNAME missing from secrets"; fi
  429. if [ -z "$DB_PASSWORD" ]; then handle_error "DB_PASSWORD missing from secrets"; fi
  430. # Add checks for other critical secrets if needed
  431. log "Secrets parsed."
  432.  
  433. # --- Fetch Docker Image Tag from SSM ---
  434. log "Fetching image tag from SSM parameter: $SSM_PARAM_NAME"
  435. # AWS CLI call uses IAM role and network path (IGW)
  436. TARGET_IMAGE_TAG=$(aws ssm get-parameter --name "$SSM_PARAM_NAME" --query "Parameter.Value" --output text --region "$AWS_REGION" 2>/var/log/awscli_ssm_error.log)
  437. ssm_exit_code=$?
  438. if [ $ssm_exit_code -ne 0 ] || [ -z "$TARGET_IMAGE_TAG" ] || [ "$TARGET_IMAGE_TAG" == "None" ] || [ "$TARGET_IMAGE_TAG" == "null" ]; then
  439.    log "Warning: Failed to get tag from SSM $SSM_PARAM_NAME (Exit code: $ssm_exit_code). Check /var/log/awscli_ssm_error.log. Using 'latest' as fallback."
  440.    cat /var/log/awscli_ssm_error.log # Log the actual error
  441.    TARGET_IMAGE_TAG="latest" # Define a safe fallback
  442. fi
  443. rm -f /var/log/awscli_ssm_error.log
  444. log "Using image tag: $TARGET_IMAGE_TAG"
  445.  
  446. # --- ECR Login ---
  447. ECR_LOGIN_SERVER="$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com"
  448. log "Logging into ECR: $ECR_LOGIN_SERVER"
  449. # AWS CLI call uses IAM role and network path (IGW)
  450. aws ecr get-login-password --region "$AWS_REGION" | sudo docker login --username AWS --password-stdin "$ECR_LOGIN_SERVER" || \\
  451.    handle_error "Failed to log into ECR server $ECR_LOGIN_SERVER. Check IAM permissions (ecr:GetAuthorizationToken) and network connectivity."
  452. log "ECR login successful."
  453.  
  454. # --- Prepare and Run Container ---
  455. ECR_REPO_URI="$ECR_LOGIN_SERVER/$ECR_REPOSITORY_NAME"
  456. IMAGE_URI="$ECR_REPO_URI:$TARGET_IMAGE_TAG"
  457. CONTAINER_NAME="myapp" # Use a variable for the container name
  458.  
  459. log "Stopping and removing existing container named '$CONTAINER_NAME' (if any)..."
  460. sudo docker stop "$CONTAINER_NAME" 2>/dev/null || true
  461. sudo docker rm -f "$CONTAINER_NAME" 2>/dev/null || true
  462.  
  463. log "Pulling image: $IMAGE_URI"
  464. PULL_SUCCESS="false"
  465. for i in {{1..3}}; do
  466.    if sudo docker pull "$IMAGE_URI"; then
  467.        log "Image pull successful (attempt $i)."
  468.        PULL_SUCCESS="true"
  469.        break
  470.    else
  471.        log "Warning: Pull attempt $i failed for $IMAGE_URI. Retrying in 5 seconds..."
  472.        sleep 5;
  473.    fi
  474. done
  475. if [ "$PULL_SUCCESS" != "true" ]; then
  476.    handle_error "FATAL: Failed to pull image $IMAGE_URI after 3 attempts. Check ECR repository, tag, network, and IAM permissions (ecr:BatchGetImage, ecr:GetDownloadUrlForLayer)."
  477. fi
  478. log "Image pull successful."
  479.  
  480. log "Running container '$CONTAINER_NAME' from image $IMAGE_URI..."
  481. # Build DATABASE_URL securely (avoiding logging password if possible)
  482. DB_URL_CREDENTIALS=""
  483. if [ -n "$DB_USERNAME" ]; then
  484.    # Note: Password is still visible in process list (`ps aux`) during run.
  485.    # More secure methods involve reading from mounted files or container secrets management.
  486.    DB_URL_CREDENTIALS="$DB_USERNAME:$DB_PASSWORD@"
  487. fi
  488. DATABASE_URL="postgresql://$DB_URL_CREDENTIALS$DB_ENDPOINT_ADDRESS:$DB_ENDPOINT_PORT/$DB_NAME"
  489.  
  490. # Build docker run options array (passing secrets as environment variables)
  491. # Note: Environment variables can potentially be inspected. Consider alternatives for highly sensitive data.
  492. declare -a docker_env_opts
  493. docker_env_opts+=( "-e" "DATABASE_HOST=$DB_ENDPOINT_ADDRESS" )
  494. docker_env_opts+=( "-e" "DATABASE_PORT=$DB_ENDPOINT_PORT" )
  495. docker_env_opts+=( "-e" "DATABASE_NAME=$DB_NAME" )
  496. docker_env_opts+=( "-e" "DATABASE_URL=$DATABASE_URL" ) # App might use this directly
  497. docker_env_opts+=( "-e" "S3_BUCKET_NAME=$S3_BUCKET" )
  498. docker_env_opts+=( "-e" "AWS_REGION=$AWS_REGION" ) # Useful for SDK inside container if needed
  499. docker_env_opts+=( "-e" "NODE_ENV=production" ) # Example for Node.js apps
  500. docker_env_opts+=( "-e" "PORT=$CONTAINER_PORT" ) # Port inside the container
  501.  
  502. # Conditionally add other secrets if they exist
  503. [ -n "$DB_USERNAME" ] && docker_env_opts+=( "-e" "DATABASE_USERNAME=$DB_USERNAME" )
  504. [ -n "$DB_PASSWORD" ] && docker_env_opts+=( "-e" "DATABASE_PASSWORD=$DB_PASSWORD" ) # Sensitive
  505. [ -n "$GEMINI_API_KEY" ] && docker_env_opts+=( "-e" "GEMINI_API_KEY=$GEMINI_API_KEY" ) # Sensitive
  506. [ -n "$ADMIN_EMAILS" ] && docker_env_opts+=( "-e" "ADMIN_EMAILS=$ADMIN_EMAILS" )
  507. [ -n "$RATE_LIMIT_MAX" ] && docker_env_opts+=( "-e" "RATE_LIMIT_MAX=$RATE_LIMIT_MAX" )
  508. [ -n "$ENABLE_SWAGGER" ] && docker_env_opts+=( "-e" "ENABLE_SWAGGER=$ENABLE_SWAGGER" )
  509. [ -n "$CLOUDFRONT_DOMAIN_NAME" ] && docker_env_opts+=( "-e" "CLOUDFRONT_DOMAIN_NAME=$CLOUDFRONT_DOMAIN_NAME" )
  510. [ -n "$ENDPOINT" ] && docker_env_opts+=( "-e" "ENDPOINT=$ENDPOINT" )
  511. [ -n "$FIREBASE_PROJECT_ID" ] && docker_env_opts+=( "-e" "FIREBASE_PROJECT_ID=$FIREBASE_PROJECT_ID" )
  512. [ -n "$FIREBASE_CLIENT_EMAIL" ] && docker_env_opts+=( "-e" "FIREBASE_CLIENT_EMAIL=$FIREBASE_CLIENT_EMAIL" )
  513. [ -n "$FIREBASE_PRIVATE_KEY" ] && docker_env_opts+=( "-e" "FIREBASE_PRIVATE_KEY=$FIREBASE_PRIVATE_KEY" ) # Sensitive
  514.  
  515. # Execute docker run
  516. # Log driver is set globally in daemon.json
  517. sudo docker run -d --name "$CONTAINER_NAME" \\
  518.    --restart always \\
  519.    -p "$APP_PORT":"$CONTAINER_PORT" \\
  520.    "${{docker_env_opts[@]}}" \\
  521.    "$IMAGE_URI" || handle_error "Failed to execute 'docker run' command for image $IMAGE_URI."
  522.  
  523. # Brief pause and check if container started and is running
  524. log "Waiting 10 seconds for container '$CONTAINER_NAME' to stabilize..."
  525. sleep 10
  526. if ! sudo docker ps -q --filter name="^/${{CONTAINER_NAME}}$"; then
  527.    log "--- Docker Status After Failed Start ---"
  528.    sudo docker ps -a || log "Warning: Failed to run 'docker ps -a'."
  529.    log "--- Last 100 lines of '$CONTAINER_NAME' logs (attempting fetch) ---"
  530.    # Attempt to get logs; might fail if container never started properly or logging driver issue
  531.    sudo docker logs --tail 100 "$CONTAINER_NAME" || log "Could not get logs for '$CONTAINER_NAME' via docker logs command."
  532.    handle_error "Container '$CONTAINER_NAME' is not running shortly after start. Check CloudWatch Logs ($APP_LOG_GROUP_NAME), system logs (/var/log/syslog or journalctl -u docker.service), and container logs (if fetchable)."
  533. fi
  534.  
  535. log "Container '$CONTAINER_NAME' started successfully. Application should be initializing."
  536. log "Logs should appear in CloudWatch Group: $APP_LOG_GROUP_NAME"
  537. log "--- UserData Script Finished Successfully ---"
  538. exit 0
  539. """
  540.  
        # --- User Data Object ---
        # Wrap the shell script above so cloud-init runs it verbatim at instance boot.
        user_data = ec2.UserData.custom(user_data_script)

        # --- Use the Custom AMI ---
        # Map the stack's region to the pre-baked AMI id. The AMI must exist in the
        # target region and already contain the tools the user-data script invokes
        # (Docker, AWS CLI, jq, nc).
        custom_calorie_mitra_ami = ec2.MachineImage.generic_linux({
            self.region: custom_ami_id
        })
  549.  
  550.         # --- EC2 Launch Template ---
  551.         # Defines the configuration for instances launched by the Auto Scaling Group.
  552.         launch_template = ec2.LaunchTemplate(
  553.             self, f"{app_name}LaunchTemplate",
  554.             instance_type=ec2.InstanceType.of(ec2.InstanceClass.BURSTABLE4_GRAVITON, ec2.InstanceSize.MEDIUM), # Graviton for cost/performance
  555.             machine_image=custom_calorie_mitra_ami, # Use the prepared custom AMI
  556.             role=ec2_role, # Attach the defined IAM role
  557.             security_group=ec2_sg, # Attach the EC2 Security Group
  558.             user_data=user_data, # Provide the startup script
  559.             # Configure the root EBS volume
  560.             block_devices=[ec2.BlockDevice(
  561.                 device_name="/dev/sda1", # Verify this is the correct root device name for your AMI
  562.                 volume=ec2.BlockDeviceVolume.ebs(
  563.                     volume_size=20, # Adjust size as needed
  564.                     volume_type=ec2.EbsDeviceVolumeType.GP3, # Cost-effective and performant
  565.                     encrypted=True, # Encrypt EBS volume for security at rest
  566.                     delete_on_termination=True # Ensure volume is deleted with instance
  567.                 )
  568.             )],
  569.             # Enable detailed CloudWatch monitoring for instances (optional, incurs cost)
  570.             # detailed_monitoring=True,
  571.             # Ensure IMDSv2 is required for better security (metadata service access)
  572.             http_tokens=ec2.LaunchTemplateHttpTokens.REQUIRED,
  573.             http_put_response_hop_limit=2 # Standard setting when requiring IMDSv2
  574.         )
  575.  
  576.         # --- Define Auto Scaling Group ---
  577.         # Manages the EC2 instances, ensuring desired capacity and scaling.
  578.         asg = autoscaling.AutoScalingGroup(
  579.             self, f"{app_name}Asg",
  580.             vpc=vpc,
  581.             launch_template=launch_template, # Use the defined Launch Template
  582.             # Launch instances into PUBLIC subnets to utilize the Internet Gateway
  583.             vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
  584.             min_capacity=1,
  585.             max_capacity=4, # Adjust max capacity based on expected load
  586.             desired_capacity=1,
  587.             # Use ELB health checks PLUS EC2 health checks for robustness
  588.             health_check=autoscaling.HealthCheck.elb(grace=Duration.minutes(5)), # Shorter grace period maybe? Depends on app start time
  589.         )
  590.         # Simple CPU-based scaling policy
  591.         asg.scale_on_cpu_utilization(
  592.             f"{app_name}CpuScaling",
  593.             target_utilization_percent=75, # Target 75% CPU before scaling out (adjust as needed)
  594.             cooldown=Duration.minutes(5) # Cooldown period after scaling activity
  595.         )
  596.  
  597.         # --- Application Load Balancer ---
  598.         # Securely distributes incoming traffic across instances.
  599.         alb = elbv2.ApplicationLoadBalancer(
  600.             self, f"{app_name}Alb",
  601.             vpc=vpc,
  602.             internet_facing=True, # Receives traffic from the internet
  603.             security_group=alb_sg, # Apply the ALB Security Group
  604.             # Enable deletion protection for production ALBs
  605.             deletion_protection=True,
  606.             # Enable access logs for security auditing and analysis (requires an S3 bucket)
  607.             # access_logs_bucket=s3.Bucket(self, f"{app_name}AlbLogsBucket", ...) # Define an S3 bucket for logs
  608.         )
  609.         # Enable ALB Access Logs (Optional but recommended for security)
  610.         # You would need to create an S3 bucket first
  611.         # alb_logs_bucket = s3.Bucket(self, f"{app_name}AlbLogsBucket", ...)
  612.         # alb.log_access_logs(alb_logs_bucket, prefix=f"{app_name}-alb-logs")
  613.  
  614.         # --- Target Group ---
  615.         # Defines how the ALB routes traffic to the registered instances.
  616.         target_group = elbv2.ApplicationTargetGroup(
  617.             self, f"{app_name}TargetGroup",
  618.             vpc=vpc,
  619.             port=app_port, # Port the application listens on inside the instance
  620.             protocol=elbv2.ApplicationProtocol.HTTP, # ALB communicates with instances over HTTP
  621.             target_type=elbv2.TargetType.INSTANCE,
  622.             targets=[asg], # Automatically register instances from the ASG
  623.             health_check=elbv2.HealthCheck(
  624.                 enabled=True,
  625.                 path="/healthz", # Specific health check endpoint in your application
  626.                 port=str(app_port), # Health check port must match target port
  627.                 protocol=elbv2.Protocol.HTTP,
  628.                 interval=Duration.seconds(60), # Check more frequently
  629.                 timeout=Duration.seconds(30), # Shorter timeout
  630.                 healthy_threshold_count=2,
  631.                 unhealthy_threshold_count=10, # Slightly more tolerant to transient issues
  632.                 healthy_http_codes="200-299" # Be specific about healthy response code
  633.             ),
  634.             # Adjust deregistration delay (time to drain connections before termination)
  635.             deregistration_delay=Duration.seconds(60)
  636.         )
  637.  
  638.         # --- ALB Listener ---
  639.         # Handles incoming HTTPS traffic on port 443.
  640.         certificate = elbv2.ListenerCertificate.from_arn(acm_certificate_arn)
  641.         https_listener = alb.add_listener(f"{app_name}HttpsListener",
  642.             port=443,
  643.             protocol=elbv2.ApplicationProtocol.HTTPS,
  644.             certificates=[certificate], # Attach the ACM SSL/TLS certificate
  645.             # Set a secure default action (e.g., fixed response or redirect) if needed,
  646.             # but usually default_target_groups is correct.
  647.             default_target_groups=[target_group],
  648.             # Recommended: Use a secure SSL policy from AWS
  649.             ssl_policy=elbv2.SslPolicy.RECOMMENDED_TLS # Uses modern TLS versions/ciphers
  650.         )
  651.  
  652.         # --- ALB HTTP to HTTPS Redirect ---
  653.         # Automatically redirects any HTTP traffic to HTTPS for security.
  654.         alb.add_redirect(
  655.             source_protocol=elbv2.ApplicationProtocol.HTTP,
  656.             source_port=80,
  657.             target_protocol=elbv2.ApplicationProtocol.HTTPS,
  658.             target_port=443,
  659.             # redirect_status_code="HTTP_301" # Permanent redirect (default)
  660.         )
  661.  
  662.  
  663.         # --- Outputs ---
  664.         # Provide useful information about the created resources.
  665.         CfnOutput(self, "Region", value=self.region)
  666.         CfnOutput(self, "VpcId", value=vpc.vpc_id)
  667.         CfnOutput(self, "EcrRepositoryUri", value=ecr_repository.repository_uri)
  668.         CfnOutput(self, "MealImagesBucketName", value=meal_images_bucket.bucket_name)
  669.         CfnOutput(self, "CloudfrontDomainName", value=cloudfront_distribution.distribution_domain_name)
  670.         CfnOutput(self, "LoadBalancerDns", value=alb.load_balancer_dns_name)
  671.         CfnOutput(self, "LoadBalancerUrlHttps", value=f"https://{alb.load_balancer_dns_name}")
  672.         CfnOutput(self, "RdsEndpointAddress", value=db_instance.db_instance_endpoint_address)
  673.         CfnOutput(self, "RdsEndpointPort", value=db_instance.db_instance_endpoint_port)
  674.         CfnOutput(self, "EC2RoleArn", value=ec2_role.role_arn)
  675.         CfnOutput(self, "AppLogGroupName", value=app_log_group.log_group_name)
  676.         CfnOutput(self, "SsmImageTagParameterName", value=ssm_image_tag_parameter.parameter_name)
  677.         CfnOutput(self, "SelectedAmiId", value=custom_ami_id)
  678.         CfnOutput(self, "PublicSubnetIds", value=", ".join(vpc.select_subnets(subnet_type=ec2.SubnetType.PUBLIC).subnet_ids))
  679.         CfnOutput(self, "PrivateIsolatedSubnetIds", value=", ".join(vpc.select_subnets(subnet_type=ec2.SubnetType.PRIVATE_ISOLATED).subnet_ids))
Advertisement
Add Comment
Please, Sign In to add comment