Compute Infrastructure: Operational Runbooks

Production procedures and troubleshooting for compute infrastructure operations.

Infrastructure Setup
Instance Deployment
Scaling Operations
Performance Optimization
Monitoring & Alerting
Troubleshooting
Disaster Recovery
Security Hardening

1. Infrastructure Setup

AWS VPC & Networking Setup

Objective: Create production-grade VPC with multi-AZ deployment

# Provisions the production network: one VPC, an internet gateway, two public
# and two private subnets spread over two AZs, a public route table, and the
# application security group. The ID returned by each create call is captured
# so later steps never depend on a hand-edited placeholder.

# Set variables
REGION=us-east-1
ENVIRONMENT=production
PROJECT=myapp
VPC_CIDR="10.0.0.0/16"

# Pin every call below to $REGION so the AZ names derived from it stay valid
# even when the CLI profile defaults to a different region.
export AWS_DEFAULT_REGION="$REGION"
AZ_A="${REGION}a"
AZ_B="${REGION}b"

# Create VPC and capture its ID from the response
VPC_ID=$(aws ec2 create-vpc \
  --cidr-block "$VPC_CIDR" \
  --tag-specifications "ResourceType=vpc,Tags=[{Key=Name,Value=$PROJECT-vpc},{Key=Environment,Value=$ENVIRONMENT}]" \
  --query 'Vpc.VpcId' \
  --output text)

# Every later step needs the VPC ID; stop if it was not returned.
: "${VPC_ID:?VPC creation failed - check AWS credentials and quotas}"

# Create Internet Gateway
IGW_ID=$(aws ec2 create-internet-gateway \
  --tag-specifications "ResourceType=internet-gateway,Tags=[{Key=Name,Value=$PROJECT-igw}]" \
  --query 'InternetGateway.InternetGatewayId' \
  --output text)

# Attach IGW to VPC
aws ec2 attach-internet-gateway \
  --vpc-id "$VPC_ID" \
  --internet-gateway-id "$IGW_ID"

# Create public subnets (2 AZs)
# ${AZ_A##*-} strips the region prefix, e.g. us-east-1a -> 1a, for the Name tag.
SUBNET1=$(aws ec2 create-subnet \
  --vpc-id "$VPC_ID" \
  --cidr-block 10.0.1.0/24 \
  --availability-zone "$AZ_A" \
  --tag-specifications "ResourceType=subnet,Tags=[{Key=Name,Value=$PROJECT-public-${AZ_A##*-}}]" \
  --query 'Subnet.SubnetId' \
  --output text)

SUBNET2=$(aws ec2 create-subnet \
  --vpc-id "$VPC_ID" \
  --cidr-block 10.0.2.0/24 \
  --availability-zone "$AZ_B" \
  --tag-specifications "ResourceType=subnet,Tags=[{Key=Name,Value=$PROJECT-public-${AZ_B##*-}}]" \
  --query 'Subnet.SubnetId' \
  --output text)

# Create private subnets (2 AZs)
# NOTE(review): no NAT gateway or private route table is created here, so
# these subnets have no outbound internet path - confirm that is intended.
SUBNET3=$(aws ec2 create-subnet \
  --vpc-id "$VPC_ID" \
  --cidr-block 10.0.10.0/24 \
  --availability-zone "$AZ_A" \
  --tag-specifications "ResourceType=subnet,Tags=[{Key=Name,Value=$PROJECT-private-${AZ_A##*-}}]" \
  --query 'Subnet.SubnetId' \
  --output text)

SUBNET4=$(aws ec2 create-subnet \
  --vpc-id "$VPC_ID" \
  --cidr-block 10.0.11.0/24 \
  --availability-zone "$AZ_B" \
  --tag-specifications "ResourceType=subnet,Tags=[{Key=Name,Value=$PROJECT-private-${AZ_B##*-}}]" \
  --query 'Subnet.SubnetId' \
  --output text)

# Create route table for public subnets
RT_PUBLIC=$(aws ec2 create-route-table \
  --vpc-id "$VPC_ID" \
  --tag-specifications "ResourceType=route-table,Tags=[{Key=Name,Value=$PROJECT-rt-public}]" \
  --query 'RouteTable.RouteTableId' \
  --output text)

# Add default route to IGW
aws ec2 create-route \
  --route-table-id "$RT_PUBLIC" \
  --destination-cidr-block 0.0.0.0/0 \
  --gateway-id "$IGW_ID"

# Associate public subnets with public route table
aws ec2 associate-route-table --subnet-id "$SUBNET1" --route-table-id "$RT_PUBLIC"
aws ec2 associate-route-table --subnet-id "$SUBNET2" --route-table-id "$RT_PUBLIC"

# Create security group
SG=$(aws ec2 create-security-group \
  --group-name "$PROJECT-app-sg" \
  --description "Security group for $PROJECT application" \
  --vpc-id "$VPC_ID" \
  --tag-specifications "ResourceType=security-group,Tags=[{Key=Name,Value=$PROJECT-app-sg}]" \
  --query 'GroupId' \
  --output text)

# Add inbound rules: HTTP and HTTPS from anywhere
aws ec2 authorize-security-group-ingress \
  --group-id "$SG" \
  --protocol tcp \
  --port 80 \
  --cidr 0.0.0.0/0

aws ec2 authorize-security-group-ingress \
  --group-id "$SG" \
  --protocol tcp \
  --port 443 \
  --cidr 0.0.0.0/0

# SSH only from the operator range
aws ec2 authorize-security-group-ingress \
  --group-id "$SG" \
  --protocol tcp \
  --port 22 \
  --cidr 203.0.113.0/24  # Your IP range

echo "✓ VPC setup complete"
echo "VPC ID: $VPC_ID"
echo "Public Subnets: $SUBNET1, $SUBNET2"
echo "Private Subnets: $SUBNET3, $SUBNET4"
echo "Security Group: $SG"

Azure Resource Group & Network Setup

# Creates the resource group, a virtual network with two public subnets and
# one private subnet, and a network security group that admits HTTP/HTTPS.
# The NSG is attached to the public subnets; an NSG that is not associated
# with a subnet or NIC filters nothing.

# Set variables
RESOURCE_GROUP="myapp-rg"
LOCATION="eastus"
VNET_NAME="myapp-vnet"
VNET_CIDR="10.0.0.0/16"
NSG_NAME="$VNET_NAME-nsg"

# Create resource group
az group create \
  --name "$RESOURCE_GROUP" \
  --location "$LOCATION"

# Create virtual network together with its first subnet
az network vnet create \
  --resource-group "$RESOURCE_GROUP" \
  --name "$VNET_NAME" \
  --address-prefix "$VNET_CIDR" \
  --subnet-name public-subnet-1 \
  --subnet-prefix 10.0.1.0/24

# Add additional subnets
az network vnet subnet create \
  --resource-group "$RESOURCE_GROUP" \
  --vnet-name "$VNET_NAME" \
  --name public-subnet-2 \
  --address-prefix 10.0.2.0/24

az network vnet subnet create \
  --resource-group "$RESOURCE_GROUP" \
  --vnet-name "$VNET_NAME" \
  --name private-subnet-1 \
  --address-prefix 10.0.10.0/24

# Create network security group
az network nsg create \
  --resource-group "$RESOURCE_GROUP" \
  --name "$NSG_NAME"

# Add inbound rules (direction and protocol stated explicitly rather than
# relying on CLI defaults)
az network nsg rule create \
  --resource-group "$RESOURCE_GROUP" \
  --nsg-name "$NSG_NAME" \
  --name AllowHTTP \
  --priority 100 \
  --direction Inbound \
  --protocol Tcp \
  --source-address-prefixes '*' \
  --source-port-ranges '*' \
  --destination-address-prefixes '*' \
  --destination-port-ranges 80 \
  --access Allow

az network nsg rule create \
  --resource-group "$RESOURCE_GROUP" \
  --nsg-name "$NSG_NAME" \
  --name AllowHTTPS \
  --priority 110 \
  --direction Inbound \
  --protocol Tcp \
  --source-address-prefixes '*' \
  --source-port-ranges '*' \
  --destination-address-prefixes '*' \
  --destination-port-ranges 443 \
  --access Allow

# Attach the NSG to both public subnets so the rules above take effect.
# NOTE(review): once attached, other inbound internet traffic (e.g. SSH) is
# denied unless a rule is added for it - confirm the admin access path.
for PUBLIC_SUBNET in public-subnet-1 public-subnet-2; do
  az network vnet subnet update \
    --resource-group "$RESOURCE_GROUP" \
    --vnet-name "$VNET_NAME" \
    --name "$PUBLIC_SUBNET" \
    --network-security-group "$NSG_NAME"
done

echo "✓ Azure network setup complete"
echo "Resource Group: $RESOURCE_GROUP"
echo "Virtual Network: $VNET_NAME"

2. Instance Deployment

AWS EC2 Instance Launch

Objective: Deploy production web server instance

# Launches one web-server instance with an encrypted gp3 root volume,
# detailed monitoring, an instance profile, and a first-boot script that
# installs Docker and the CloudWatch agent.

# Set variables
INSTANCE_TYPE="m5.large"
IMAGE_ID="ami-0c55b159cbfafe1f0"  # Ubuntu 22.04 LTS
KEY_NAME="my-key-pair"
SECURITY_GROUP="sg-12345678"
SUBNET_ID="subnet-12345678"
VOLUME_SIZE=100
VOLUME_TYPE="gp3"
ENVIRONMENT="production"
APPLICATION="webserver"

# Create user data script (runs on first boot)
# The quoted 'EOF' keeps the local shell from expanding anything in the script.
cat > user-data.sh << 'EOF'
#!/bin/bash
set -e

# First boot has no terminal; keep apt/dpkg from stopping on a prompt
export DEBIAN_FRONTEND=noninteractive

# Update system
apt-get update && apt-get upgrade -y

# Install Docker
curl -fsSL https://get.docker.com | sh
usermod -aG docker ubuntu

# Install CloudWatch agent
wget https://s3.amazonaws.com/amazoncloudwatch-agent/ubuntu/amd64/latest/amazon-cloudwatch-agent.deb
dpkg -i amazon-cloudwatch-agent.deb

echo "✓ User data script completed"
EOF

# Launch instance
# Both tag specifications go under ONE --tag-specifications option: when the
# option is repeated the CLI keeps only the last occurrence, which would drop
# the instance tags and apply only the volume tags.
aws ec2 run-instances \
  --image-id "$IMAGE_ID" \
  --instance-type "$INSTANCE_TYPE" \
  --key-name "$KEY_NAME" \
  --security-group-ids "$SECURITY_GROUP" \
  --subnet-id "$SUBNET_ID" \
  --associate-public-ip-address \
  --block-device-mappings "DeviceName=/dev/sda1,Ebs={VolumeSize=$VOLUME_SIZE,VolumeType=$VOLUME_TYPE,DeleteOnTermination=true,Encrypted=true}" \
  --iam-instance-profile Name=ec2-cloudwatch-role \
  --user-data file://user-data.sh \
  --tag-specifications \
    "ResourceType=instance,Tags=[{Key=Name,Value=$APPLICATION},{Key=Environment,Value=$ENVIRONMENT},{Key=Owner,Value=devops-team}]" \
    "ResourceType=volume,Tags=[{Key=Name,Value=$APPLICATION-root},{Key=Environment,Value=$ENVIRONMENT}]" \
  --monitoring Enabled=true

# Output
echo "✓ Instance launched"
echo "Check instance status with:"
echo "aws ec2 describe-instances --query 'Reservations[].Instances[].{ID:InstanceId,State:State.Name,IP:PrivateIpAddress}'"

Azure VM Creation

# Creates a NIC in the existing VNet, a VM with a system-assigned managed
# identity bound to that NIC, and a CustomScript extension that runs the
# bootstrap script.

# Set variables
RESOURCE_GROUP="myapp-rg"
VM_NAME="webserver-01"
IMAGE="Ubuntu2204"  # versioned alias; matches the 22.04 images used for AWS/GCP
SIZE="Standard_B2s"
VNET_NAME="myapp-vnet"
SUBNET_NAME="public-subnet-1"
NIC_NAME="$VM_NAME-nic"
OS_DISK_SIZE=100

# Resolve the active subscription instead of leaving a literal
# "{subscription-id}" placeholder in the role-assignment scope.
SUBSCRIPTION_ID=$(az account show --query id --output text)
: "${SUBSCRIPTION_ID:?could not determine subscription - run 'az login' first}"

# Create network interface
az network nic create \
  --resource-group "$RESOURCE_GROUP" \
  --name "$NIC_NAME" \
  --vnet-name "$VNET_NAME" \
  --subnet "$SUBNET_NAME"

# Create VM
# --assign-identity with no value enables the system-assigned identity, so no
# follow-up "az vm update --set identity.type" step is needed.
# NOTE(review): Contributor on the whole subscription is very broad for a web
# server identity; consider scoping to
# /subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP.
az vm create \
  --resource-group "$RESOURCE_GROUP" \
  --name "$VM_NAME" \
  --nics "$NIC_NAME" \
  --image "$IMAGE" \
  --size "$SIZE" \
  --os-disk-size-gb "$OS_DISK_SIZE" \
  --os-disk-name "$VM_NAME-osdisk" \
  --generate-ssh-keys \
  --assign-identity \
  --role Contributor \
  --scope "/subscriptions/$SUBSCRIPTION_ID"

# Add custom script extension (bootstrap)
az vm extension set \
  --resource-group "$RESOURCE_GROUP" \
  --vm-name "$VM_NAME" \
  --name CustomScript \
  --publisher Microsoft.Azure.Extensions \
  --settings '{"fileUris": ["https://raw.githubusercontent.com/yourepo/bootstrap.sh"], "commandToExecute": "bash bootstrap.sh"}'

echo "✓ Azure VM created"

GCP Compute Engine Instance

# Creates a Compute Engine web server with an SSD boot disk and a separate
# 200GB SSD data disk, then opens HTTP/HTTPS through tag-targeted firewall
# rules.

# Set variables
PROJECT_ID="my-project"
ZONE="us-central1-a"
INSTANCE_NAME="webserver-01"
MACHINE_TYPE="e2-medium"
IMAGE_FAMILY="ubuntu-2204-lts"
IMAGE_PROJECT="ubuntu-os-cloud"
BOOT_DISK_SIZE="100GB"
BOOT_DISK_TYPE="pd-ssd"

# Create instance
# --create-disk both creates the data disk and attaches it to the new
# instance, so no separate "attach-disk" call follows; attaching a disk that
# is already attached fails.
gcloud compute instances create "$INSTANCE_NAME" \
  --project="$PROJECT_ID" \
  --zone="$ZONE" \
  --machine-type="$MACHINE_TYPE" \
  --image-family="$IMAGE_FAMILY" \
  --image-project="$IMAGE_PROJECT" \
  --boot-disk-size="$BOOT_DISK_SIZE" \
  --boot-disk-type="$BOOT_DISK_TYPE" \
  --enable-display-device \
  --tags=webserver,http-server,https-server \
  --labels=environment=production,application=webserver,owner=devops \
  --metadata-from-file startup-script=./bootstrap.sh \
  --create-disk size=200GB,type=pd-ssd,name="$INSTANCE_NAME-data"

# Configure firewall rules (matched to the instance through its network tags)
gcloud compute firewall-rules create allow-http \
  --project="$PROJECT_ID" \
  --allow=tcp:80 \
  --source-ranges=0.0.0.0/0 \
  --target-tags=http-server

gcloud compute firewall-rules create allow-https \
  --project="$PROJECT_ID" \
  --allow=tcp:443 \
  --source-ranges=0.0.0.0/0 \
  --target-tags=https-server

echo "✓ GCP instance created"

3. Scaling Operations

Enable Auto-Scaling on AWS

# Rolls out an Auto Scaling Group in three steps: launch template, the group
# itself, then a CPU target-tracking policy.

# Names
ASG_NAME="app-asg"
LAUNCH_TEMPLATE="app-lt-v1"

# Capacity bounds
MIN_SIZE=2
MAX_SIZE=10
DESIRED_CAPACITY=3

# Subnets the group may place instances in (comma-separated)
SUBNETS="subnet-12345,subnet-67890"

# Instance definition; IMDSv2 tokens are required and limited to one hop.
LAUNCH_TEMPLATE_DATA='{
    "ImageId": "ami-0c55b159cbfafe1f0",
    "InstanceType": "t3.large",
    "KeyName": "my-key",
    "SecurityGroupIds": ["sg-12345678"],
    "Monitoring": {"Enabled": true},
    "MetadataOptions": {
      "HttpTokens": "required",
      "HttpPutResponseHopLimit": 1
    }
  }'

# Hold average CPU at 70%; scale out after 60s, scale in after 300s.
TRACKING_CONFIG='{
    "TargetValue": 70.0,
    "PredefinedMetricSpecification": {
      "PredefinedMetricType": "ASGAverageCPUUtilization"
    },
    "ScaleOutCooldown": 60,
    "ScaleInCooldown": 300
  }'

# Tags copied onto every instance the group launches
ASG_TAGS=(
  "Key=Name,Value=app-instance,PropagateAtLaunch=true"
  "Key=Environment,Value=production,PropagateAtLaunch=true"
)

# Step 1: launch template
aws ec2 create-launch-template \
  --launch-template-name "$LAUNCH_TEMPLATE" \
  --version-description "Production template" \
  --launch-template-data "$LAUNCH_TEMPLATE_DATA"

# Step 2: Auto Scaling Group
# '$Latest' is a literal version alias understood by the API, hence the
# single quotes around it.
aws autoscaling create-auto-scaling-group \
  --auto-scaling-group-name "$ASG_NAME" \
  --launch-template "LaunchTemplateName=$LAUNCH_TEMPLATE,Version="'$Latest' \
  --min-size "$MIN_SIZE" \
  --max-size "$MAX_SIZE" \
  --desired-capacity "$DESIRED_CAPACITY" \
  --vpc-zone-identifier "$SUBNETS" \
  --health-check-type ELB \
  --health-check-grace-period 300 \
  --tags "${ASG_TAGS[@]}"

# Step 3: target-tracking scaling policy
aws autoscaling put-scaling-policy \
  --auto-scaling-group-name "$ASG_NAME" \
  --policy-name "scale-up-policy" \
  --policy-type TargetTrackingScaling \
  --target-tracking-configuration "$TRACKING_CONFIG"

printf '%s\n' "✓ Auto-scaling configured"
printf 'ASG: %s\n' "$ASG_NAME"
printf 'Min: %s, Max: %s, Desired: %s\n' "$MIN_SIZE" "$MAX_SIZE" "$DESIRED_CAPACITY"

Azure Virtual Machine Scale Set

# Creates a VM Scale Set and an autoscale setting with CPU-based scale-out
# and scale-in rules.

# Set variables
RESOURCE_GROUP="myapp-rg"
VMSS_NAME="app-vmss"
MIN_SIZE=2
MAX_SIZE=10
DESIRED_CAPACITY=3
IMAGE="Ubuntu2204"  # versioned alias; matches the 22.04 images used for AWS/GCP
SIZE="Standard_B2s"
# One name shared by the autoscale setting and the rules that attach to it
AUTOSCALE_NAME="autoscale-$VMSS_NAME"

# Create VM Scale Set
# Min/max bounds are not scale-set options; they belong to the autoscale
# setting created in the next step.
az vmss create \
  --resource-group "$RESOURCE_GROUP" \
  --name "$VMSS_NAME" \
  --image "$IMAGE" \
  --vm-sku "$SIZE" \
  --instance-count "$DESIRED_CAPACITY" \
  --admin-username azureuser \
  --generate-ssh-keys \
  --upgrade-policy-mode Automatic

# Create autoscale settings
# --name is given explicitly so the rule commands below can reference it.
az monitor autoscale create \
  --resource-group "$RESOURCE_GROUP" \
  --resource "$VMSS_NAME" \
  --resource-type "Microsoft.Compute/virtualMachineScaleSets" \
  --name "$AUTOSCALE_NAME" \
  --min-count "$MIN_SIZE" \
  --max-count "$MAX_SIZE" \
  --count "$DESIRED_CAPACITY"

# Add scale-up rule (CPU > 70%)
az monitor autoscale rule create \
  --resource-group "$RESOURCE_GROUP" \
  --autoscale-name "$AUTOSCALE_NAME" \
  --condition "Percentage CPU > 70 avg 5m" \
  --scale out 1

# Add scale-down rule (CPU < 30%)
az monitor autoscale rule create \
  --resource-group "$RESOURCE_GROUP" \
  --autoscale-name "$AUTOSCALE_NAME" \
  --condition "Percentage CPU < 30 avg 5m" \
  --scale in 1

echo "✓ Azure VMSS autoscaling configured"

GCP Instance Group Autoscaling

# Creates an instance template, a zonal managed instance group of three
# instances, and a CPU-utilization autoscaling policy for the group.

# Set variables
PROJECT_ID="my-project"
ZONE="us-central1-a"
INSTANCE_GROUP_NAME="app-ig"
MIN_SIZE=2
MAX_SIZE=10
TEMPLATE_NAME="app-template"
TARGET_CPU=70  # percent

# gcloud expects the target as a fraction (0.70), and shell arithmetic is
# integer-only: $((70 / 100)) evaluates to 0. Convert with awk instead;
# LC_ALL=C keeps the decimal separator a dot.
TARGET_CPU_FRACTION=$(LC_ALL=C awk -v pct="$TARGET_CPU" 'BEGIN { printf "%.2f", pct / 100 }')

# Create instance template
gcloud compute instance-templates create "$TEMPLATE_NAME" \
  --project="$PROJECT_ID" \
  --machine-type=e2-medium \
  --image-family=ubuntu-2204-lts \
  --image-project=ubuntu-os-cloud \
  --boot-disk-size=100GB \
  --boot-disk-type=pd-ssd

# Create managed instance group
gcloud compute instance-groups managed create "$INSTANCE_GROUP_NAME" \
  --project="$PROJECT_ID" \
  --base-instance-name=app \
  --template="$TEMPLATE_NAME" \
  --size=3 \
  --zone="$ZONE"

# Set autoscaling policy
gcloud compute instance-groups managed set-autoscaling "$INSTANCE_GROUP_NAME" \
  --project="$PROJECT_ID" \
  --min-num-replicas="$MIN_SIZE" \
  --max-num-replicas="$MAX_SIZE" \
  --target-cpu-utilization="$TARGET_CPU_FRACTION" \
  --zone="$ZONE"

echo "✓ GCP autoscaling configured"
echo "Instance Group: $INSTANCE_GROUP_NAME"

4. Performance Optimization

Linux Kernel Tuning

#!/bin/bash

# Applies network/file-descriptor sysctl settings and block-device udev
# rules, then prints the resulting values. Must run as root.
#
# The sysctl settings are written to their own drop-in file with ">" so that
# re-running this script replaces them; appending to /etc/sysctl.conf would
# add another copy of the whole block on every run.
SYSCTL_FILE=/etc/sysctl.d/99-compute-tuning.conf

# Network optimizations for high-throughput workloads
# NOTE(review): rmem_default/wmem_default of 128MB applies to every socket
# that does not set its own buffer size - confirm this is intended rather
# than raising only the *_max ceilings.
cat > "$SYSCTL_FILE" << 'EOF'

# Increase TCP connection queue lengths
net.core.somaxconn = 65535
net.ipv4.tcp_max_syn_backlog = 65535

# Increase max file descriptors
fs.file-max = 2097152
fs.nr_open = 2097152

# TCP buffer optimization
net.core.rmem_default = 134217728
net.core.wmem_default = 134217728
net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
net.ipv4.tcp_rmem = 4096 87380 67108864
net.ipv4.tcp_wmem = 4096 65536 67108864

# Enable TCP window scaling
net.ipv4.tcp_window_scaling = 1

# Enable IP forward (for NAT, load balancing)
net.ipv4.ip_forward = 1

# Increase UDP queue sizes
net.core.netdev_max_backlog = 65535

# Connection tracking
net.netfilter.nf_conntrack_max = 262144
EOF

# The nf_conntrack_max key only exists once the conntrack module is loaded;
# without it sysctl reports an error for that line. Best-effort: the module
# may be built in or unavailable, so a failure here is deliberately ignored.
modprobe nf_conntrack 2>/dev/null || true

sysctl -p "$SYSCTL_FILE"

# Disk I/O tuning
cat > /etc/udev/rules.d/99-disk-tuning.rules << 'EOF'
# Set deadline scheduler and increase queue depth for NVMe
ACTION=="add|change", SUBSYSTEM=="block", DEVPATH=="*/nvme*", ATTR{queue/scheduler}="mq-deadline", ATTR{queue/nr_requests}="256"
ACTION=="add|change", SUBSYSTEM=="block", DEVPATH=="*/sd*", ATTR{queue/scheduler}="mq-deadline", ATTR{queue/nr_requests}="256", ATTR{queue/read_ahead_kb}="1024"
EOF

udevadm control --reload-rules && udevadm trigger

# Verify settings
sysctl net.core.somaxconn
# Report whichever NVMe/SCSI disks this host actually has instead of assuming
# both nvme0n1 and sda exist.
for BLOCK_DEV in /sys/block/nvme*n* /sys/block/sd*; do
  [ -r "$BLOCK_DEV/queue/scheduler" ] || continue  # glob matched nothing
  printf '%s: scheduler=%s nr_requests=%s\n' \
    "${BLOCK_DEV##*/}" \
    "$(cat "$BLOCK_DEV/queue/scheduler")" \
    "$(cat "$BLOCK_DEV/queue/nr_requests")"
done

echo "✓ Kernel tuning applied"

Database Performance Tuning

PostgreSQL optimized config:

# postgresql.conf settings for 32GB server with 8 vCPU

# Memory settings
shared_buffers = 8GB              # 25% of system RAM
effective_cache_size = 24GB       # 75% of system RAM; planner estimate only
# work_mem is a per-operation limit (each sort/hash in each connection may use
# this much), so it has to be sized against max_connections. 2GB with 400
# connections could request far more than the 32GB this host has;
# 400 x 16MB = 6.4GB for one operation per connection still fits next to
# shared_buffers. Raise it per session (SET work_mem) for heavy reports.
work_mem = 16MB
maintenance_work_mem = 2GB        # used by VACUUM, CREATE INDEX, etc.

# Connection settings
max_connections = 400
superuser_reserved_connections = 3

# Parallelization
max_worker_processes = 8
max_parallel_workers_per_gather = 4
max_parallel_workers = 8

# WAL tuning
wal_buffers = 16MB
checkpoint_timeout = 30min
checkpoint_completion_target = 0.9
wal_level = replica

# Query planning
random_page_cost = 1.1  # For SSD
effective_io_concurrency = 100

# Logging
log_line_prefix = '%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h '
log_statement = 'mod'
log_duration = off
log_min_duration_statement = 1000  # Log queries > 1s

# Apply (shell command, not part of postgresql.conf): shared_buffers,
# max_connections, max_worker_processes and wal_level only take effect after
# a full restart, so a reload is not enough.
systemctl restart postgresql

5. Monitoring & Alerting

Prometheus Metrics Collection

# prometheus.yml - Scrape configuration
#
# Defines global intervals and labels, the Alertmanager endpoint, the rule
# file location, and one scrape job per metrics source.

global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    environment: production
    cluster: us-east-1

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

rule_files:
  - /etc/prometheus/rules/*.yml

scrape_configs:
  # Node exporter (OS metrics)
  - job_name: 'node'
    static_configs:
      - targets: ['localhost:9100', 'server1:9100', 'server2:9100']

  # Application metrics
  - job_name: 'application'
    static_configs:
      - targets: ['app1:8080', 'app2:8080', 'app3:8080']
    relabel_configs:
      # Strip the port so the instance label is just the host name
      - source_labels: [__address__]
        regex: '([^:]+)(?::\d+)?'
        target_label: instance

  # Database metrics
  - job_name: 'postgres'
    static_configs:
      - targets: ['db1:9187']

  # Docker containers
  # Container discovery is configured with docker_sd_configs; the
  # __meta_docker_* labels used below are only produced by that mechanism.
  - job_name: 'docker'
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
    relabel_configs:
      - source_labels: [__meta_docker_container_name]
        target_label: container

  # Kubernetes (if using K8s)
  - job_name: 'kubernetes-apiservers'
    kubernetes_sd_configs:
      - role: endpoints
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    # The API server rejects unauthenticated scrapes; present the pod's
    # service-account token.
    authorization:
      credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    relabel_configs:
      # role: endpoints discovers every endpoint in the cluster; keep only
      # the API server's own https endpoint so the job matches its name.
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https

Alert rules:

# /etc/prometheus/rules/alerts.yml
#
# Host-level alerts built on node_exporter metrics, evaluated every 30s.

groups:
  - name: compute
    interval: 30s
    rules:
      # CPU alerts
      # Utilization is measured as 1 - idle, averaged over all cores of an
      # instance. Watching only mode="user" per core would miss load spent in
      # system, iowait or steal time and would fire per core, not per host.
      - alert: HighCPUUtilization
        expr: (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 0.8
        for: 5m
        annotations:
          summary: "High CPU utilization on {{ $labels.instance }}"
          description: "CPU usage is {{ $value | humanizePercentage }}"

      # Memory alerts
      - alert: LowMemoryAvailable
        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1
        for: 5m
        annotations:
          summary: "Low available memory on {{ $labels.instance }}"
          description: "Only {{ $value | humanizePercentage }} available"

      # Disk space (tmpfs mounts excluded)
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs"} / node_filesystem_size_bytes) < 0.1
        for: 5m
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"

      # Network issues (more than 100 receive errors per second)
      - alert: HighNetworkErrors
        expr: rate(node_network_receive_errs_total[5m]) > 100
        for: 5m
        annotations:
          summary: "High network errors on {{ $labels.instance }}"

      # Instance down
      - alert: InstanceDown
        expr: up{job="node"} == 0
        for: 1m
        annotations:
          summary: "Instance {{ $labels.instance }} is down"

6. Troubleshooting

CPU Exhaustion - Diagnosis & Recovery

#!/bin/bash

# Collects CPU diagnostics (overall usage, top consumers, cgroup throttling,
# load average, per-process details for Java) and then restarts the
# application as the first recovery action.

# 1. Check overall CPU usage
echo "=== CPU Usage Overview ==="
top -bn1 | head -15

# 2. Identify top CPU consumers
echo -e "\n=== Top 10 CPU Consuming Processes ==="
ps aux --sort=-%cpu | head -11 | tail -10

# 3. Check CPU throttling (container)
echo -e "\n=== CPU Throttling ==="
if [ -r /sys/fs/cgroup/cpu.stat ]; then
  # cgroup v2 (unified hierarchy): usage and throttling share one file
  cat /sys/fs/cgroup/cpu.stat
elif [ -r /sys/fs/cgroup/cpu,cpuacct/cpu.stat ]; then
  # cgroup v1
  cat /sys/fs/cgroup/cpuacct/cpuacct.stat
  grep nr_throttled /sys/fs/cgroup/cpu,cpuacct/cpu.stat
else
  echo "No cgroup CPU statistics found"
fi

# 4. Check load average
echo -e "\n=== Load Average ==="
uptime

# 5. Check process-specific CPU
# pgrep can match several processes; inspect each one, since a multi-line
# PID list cannot be used as a single /proc/<pid> path.
for PID in $(pgrep -f "java"); do
  [ -r "/proc/$PID/stat" ] || continue  # process exited since pgrep ran
  echo -e "\n=== Java Process CPU Details (PID $PID) ==="
  # Fields 14 and 15 of /proc/<pid>/stat are user and system time in ticks
  awk '{print "CPU time (ticks):", $14, $15}' "/proc/$PID/stat"
  grep Threads "/proc/$PID/status"
done

# Recovery actions
echo -e "\n=== Recovery Actions ==="

# Option 1: Restart service
echo "1. Restarting application:"
systemctl restart myapp

# Option 2: Scale horizontally (if using orchestration)
echo "2. Scale up instances:"
# aws autoscaling set-desired-capacity --auto-scaling-group-name app-asg --desired-capacity 5

# Option 3: Kill specific process (if runaway found)
# Try a plain "kill <pid>" (SIGTERM) first; use -9 only if it does not exit.
# kill -9 <pid>

echo "✓ CPU troubleshooting complete"

Memory Issues - Diagnosis & Recovery

#!/bin/bash

# Memory diagnostics followed by a service restart: overall usage, heaviest
# processes, swap activity, OOM-killer history, /proc/meminfo highlights and
# cgroup v1 container limits.

# Directory holding the cgroup v1 memory controller files
CGROUP_MEM_DIR=/sys/fs/cgroup/memory

# 1. Memory overview
printf '%s\n' "=== Memory Usage ==="
free -h
printf '\n'

# 2. Largest memory consumers (header line + top 10)
printf '%s\n' "=== Memory per Process ==="
ps aux --sort=-%mem | head -11

# 3. Swap devices and the last three of five one-second vmstat samples
printf '\n%s\n' "=== Swap Usage ==="
swapon -s
vmstat 1 5 | tail -3

# 4. Most recent OOM killer events from the kernel log
printf '\n%s\n' "=== OOM Killer Events ==="
dmesg | grep -i "out of memory" | tail -5

# 5. Totals, free/available memory and page cache
printf '\n%s\n' "=== Memory Details ==="
grep -E "^MemTotal|^MemAvailable|^MemFree|^Cached|^Buffers" /proc/meminfo

# 6. Container memory limits (printed only when the v1 limit file exists)
printf '\n%s\n' "=== Container Memory Limits ==="
if [ -f "$CGROUP_MEM_DIR/memory.limit_in_bytes" ]; then
  # shellcheck disable=SC2046 -- the file holds a single integer
  echo "Memory limit: $(numfmt --to=iec $(< "$CGROUP_MEM_DIR/memory.limit_in_bytes"))"
  # shellcheck disable=SC2046
  echo "Memory usage: $(numfmt --to=iec $(< "$CGROUP_MEM_DIR/memory.usage_in_bytes"))"
fi

# Recovery
printf '\n%s\n' "=== Recovery Actions ==="

# Option 1: Increase memory
printf '%s\n' "1. Increase instance memory (manual scaling)"

# Option 2: Restart the service, give it five seconds, then re-check usage
printf '%s\n' "2. Restarting service to clear memory:"
systemctl restart myapp
sleep 5
free -h

# Option 3: Check for memory leaks
printf '%s\n' "3. Enable verbose garbage collection (Java):"
printf '%s\n' "   Add: -Xlog:gc*:file=gc.log"

printf '%s\n' "✓ Memory troubleshooting complete"

Network Latency Investigation

#!/bin/bash

# Network latency investigation: connectivity, request timing, interface
# state, packet loss, protocol counters, TCP connection totals, DNS and route.

TARGET_HOST="app-server"
HEALTH_URL="http://$TARGET_HOST:8080/health"

# 1. Basic connectivity to a public address
printf '%s\n' "=== Connectivity Check ==="
ping -c 4 8.8.8.8

# 2. Total request time for ten consecutive health-check calls
printf '\n%s\n' "=== Latency to Application ==="
for ((attempt = 1; attempt <= 10; attempt++)); do
  curl -o /dev/null -s -w "%{time_total}\n" "$HEALTH_URL"
done

# 3. Link state of all interfaces and negotiated speed of eth0
printf '\n%s\n' "=== Network Interface Status ==="
ip link show
ethtool eth0 | grep Speed

# 4. Per-hop loss report over 100 probes
printf '\n%s\n' "=== Packet Loss Test ==="
mtr -r -c 100 "$TARGET_HOST"

# 5. Protocol counters of interest
printf '\n%s\n' "=== Network Stack Stats ==="
netstat -s | grep -E "total packets|packet loss|retransmitted"

# 6. Socket summary, then the number of established TCP connections
printf '\n%s\n' "=== TCP Connection Status ==="
ss -s
printf '\n'
ss -tan | grep -c ESTAB

# 7. DNS resolution
printf '\n%s\n' "=== DNS Resolution ==="
nslookup "$TARGET_HOST"
dig "$TARGET_HOST"

# 8. Route to the target, at most ten hops
printf '\n%s\n' "=== Route Trace ==="
traceroute -m 10 "$TARGET_HOST"

printf '%s\n' "✓ Network troubleshooting complete"

7. Disaster Recovery

Automated EBS Snapshot Backup

#!/bin/bash

# Snapshots every EBS volume attached to one instance, then deletes this
# job's own snapshots once they are older than the retention window.
INSTANCE_ID="i-1234567890abcdef0"
# NOTE(review): BACKUP_BUCKET is never used below; the script only creates
# EBS snapshots. Kept so anything that reads it keeps working.
BACKUP_BUCKET="s3://company-backups/compute"
RETENTION_DAYS=30
DATE=$(date +%Y-%m-%d-%H:%M:%S)

# Get all volumes for instance
# "--output text" prints a list of IDs on ONE line separated by tabs, so the
# output is split into one ID per line before the loop reads it; otherwise
# an instance with several volumes yields a single bogus "ID".
aws ec2 describe-instances \
  --instance-ids "$INSTANCE_ID" \
  --query 'Reservations[].Instances[].BlockDeviceMappings[].Ebs.VolumeId' \
  --output text | tr -s '[:space:]' '\n' | while IFS= read -r VOLUME_ID; do
  [ -n "$VOLUME_ID" ] || continue

  # Create snapshot; the Name tag is what the cleanup step matches on
  if SNAPSHOT_ID=$(aws ec2 create-snapshot \
    --volume-id "$VOLUME_ID" \
    --description "Backup-$DATE" \
    --tag-specifications "ResourceType=snapshot,Tags=[{Key=Name,Value=backup-$INSTANCE_ID-$DATE},{Key=Retention,Value=$RETENTION_DAYS}]" \
    --query 'SnapshotId' \
    --output text); then
    echo "✓ Snapshot created: $SNAPSHOT_ID for volume $VOLUME_ID"
  else
    echo "✗ Snapshot failed for volume $VOLUME_ID" >&2
  fi
done

# Clean up old snapshots (older than retention period)
# Restricted with a tag filter to snapshots named backup-<instance>-*; without
# it, every snapshot the account owns that is older than the cutoff would be
# deleted, including ones this job never created.
CUTOFF=$(date -u -d "$RETENTION_DAYS days ago" +%Y-%m-%dT%H:%M:%S.000Z)

aws ec2 describe-snapshots \
  --owner-ids self \
  --filters "Name=tag:Name,Values=backup-$INSTANCE_ID-*" \
  --query "Snapshots[?StartTime<='$CUTOFF'].SnapshotId" \
  --output text | tr -s '[:space:]' '\n' | while IFS= read -r SNAP_ID; do
  [ -n "$SNAP_ID" ] || continue
  if aws ec2 delete-snapshot --snapshot-id "$SNAP_ID"; then
    echo "✓ Deleted old snapshot: $SNAP_ID"
  else
    echo "✗ Could not delete snapshot: $SNAP_ID" >&2
  fi
done

Failover Procedure

#!/bin/bash

# Multi-AZ failover automation
#
# Checks the primary instance; when it is not running or its system status
# check is not "ok", starts the standby instance and repoints the DNS A
# record at it. Every step is checked, so a success line is printed only
# after the step it describes actually succeeded.

PRIMARY_INSTANCE="i-111111111111111"
STANDBY_INSTANCE="i-222222222222222"
ELB_ID="arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/my-app"  # not used below
ROUTE53_ZONE="Z12345ABCDEF"
HOSTNAME="app.example.com"

# Print an error on stderr and abort the procedure.
fail() {
  echo "✗ $*" >&2
  exit 1
}

# 1. Check primary instance health
echo "=== Checking primary instance health ==="
# A failed API call (expired credentials, throttling, network) says nothing
# about the instance itself, so it must not be read as "primary is down".
PRIMARY_STATE=$(aws ec2 describe-instances \
  --instance-ids "$PRIMARY_INSTANCE" \
  --query 'Reservations[].Instances[].State.Name' \
  --output text) \
  || fail "Could not query $PRIMARY_INSTANCE; not failing over on an API error - verify manually"

PRIMARY_STATUS_CHECK=""
if [ "$PRIMARY_STATE" = "running" ]; then
  # NOTE(review): only the system status check is evaluated; the instance
  # status check is not - confirm that is sufficient.
  PRIMARY_STATUS_CHECK=$(aws ec2 describe-instance-status \
    --instance-ids "$PRIMARY_INSTANCE" \
    --query 'InstanceStatuses[0].SystemStatus.Status' \
    --output text) \
    || fail "Could not query status checks for $PRIMARY_INSTANCE - verify manually"
fi

# 2. If primary is down, initiate failover
if [ "$PRIMARY_STATE" != "running" ] || [ "$PRIMARY_STATUS_CHECK" != "ok" ]; then
  echo "⚠ Primary instance unhealthy. Initiating failover..."

  # Start standby instance
  aws ec2 start-instances --instance-ids "$STANDBY_INSTANCE" \
    || fail "Could not start standby instance $STANDBY_INSTANCE"
  echo "✓ Started standby instance"

  # Wait for instance to be ready
  aws ec2 wait instance-running --instance-ids "$STANDBY_INSTANCE" \
    || fail "Standby instance $STANDBY_INSTANCE did not reach the running state"
  echo "✓ Standby instance is running"

  # Update Route53 DNS
  # NOTE(review): the record is pointed at the PRIVATE address - confirm the
  # hosted zone is resolved from inside the VPC.
  STANDBY_IP=$(aws ec2 describe-instances \
    --instance-ids "$STANDBY_INSTANCE" \
    --query 'Reservations[].Instances[].PrivateIpAddress' \
    --output text) \
    || fail "Could not look up the standby instance address"

  # Text output renders a missing value as "None"; never publish that.
  if [ -z "$STANDBY_IP" ] || [ "$STANDBY_IP" = "None" ]; then
    fail "Standby instance has no private IP address; DNS left unchanged"
  fi

  aws route53 change-resource-record-sets \
    --hosted-zone-id "$ROUTE53_ZONE" \
    --change-batch "{
      \"Changes\": [{
        \"Action\": \"UPSERT\",
        \"ResourceRecordSet\": {
          \"Name\": \"$HOSTNAME\",
          \"Type\": \"A\",
          \"TTL\": 300,
          \"ResourceRecords\": [{\"Value\": \"$STANDBY_IP\"}]
        }
      }]
    }" \
    || fail "DNS update failed; $HOSTNAME still points at the previous address"

  echo "✓ DNS updated to standby instance ($STANDBY_IP)"
  echo "✓ Failover complete"
else
  echo "✓ Primary instance is healthy"
fi

8. Security Hardening

Instance Security Scan

#!/bin/bash

# Read-only security audit of the local host: SSH settings, firewall,
# listening sockets, accounts, sudo membership, key file permissions,
# fail2ban, SELinux/AppArmor and pending package updates.

#######################################
# Count upgradable packages in `apt list --upgradable` output.
# Only real package lines (those containing "[upgradable from:") are counted,
# so the leading "Listing..." header is never included.
# Arguments: $1 - optional fixed string a package line must also contain
#                 (e.g. "-security"); empty or omitted counts every package
# Inputs:    apt list output on stdin
# Outputs:   the count on stdout (0 when nothing matches)
# Returns:   0 always
#######################################
count_upgradable() {
  local pocket_filter=${1:-}
  # grep -c exits 1 on a zero count; that is a valid result here, not an error
  grep -F '[upgradable from:' | grep -c -F -- "$pocket_filter" || true
}

echo "=== Instance Security Audit ==="

# 1. SSH Configuration
echo -e "\n1. SSH Security:"
echo "   - Root login: $(grep -E '^PermitRootLogin' /etc/ssh/sshd_config || echo 'Not explicitly set')"
echo "   - Password auth: $(grep -E '^PasswordAuthentication' /etc/ssh/sshd_config || echo 'Not explicitly set')"
echo "   - X11 forwarding: $(grep -E '^X11Forwarding' /etc/ssh/sshd_config || echo 'Not explicitly set')"

# 2. Firewall Status
echo -e "\n2. Firewall Status:"
systemctl status ufw 2>/dev/null || systemctl status firewalld 2>/dev/null || echo "   Firewall not configured"

# 3. Listening Ports
echo -e "\n3. Listening Ports (should be minimal):"
# -l already limits output to listening sockets. Filtering on the word LISTEN
# would hide every UDP listener, because ss reports those as UNCONN.
ss -tuln

# 4. User Accounts
echo -e "\n4. User Accounts (non-system):"
# UID 65534 is the "nobody" placeholder account, not a real user
awk -F: '$3 >= 1000 && $3 != 65534 {print $1 " - UID:" $3}' /etc/passwd

# 5. Sudo Configuration
echo -e "\n5. Sudo Access:"
echo "   Users in sudo group:"
getent group sudo | cut -d: -f4

# 6. File Permissions
echo -e "\n6. Critical File Permissions:"
ls -la /etc/passwd /etc/shadow /etc/sudoers

# 7. Fail2Ban Status
echo -e "\n7. Fail2Ban Status (if enabled):"
systemctl status fail2ban 2>/dev/null || echo "   Not installed"

# 8. SELinux Status
echo -e "\n8. SELinux Status:"
getenforce 2>/dev/null || echo "   Not enabled"

# 9. AppArmor Status
echo -e "\n9. AppArmor Status:"
aa-status 2>/dev/null || echo "   Not enabled"

# 10. Security Updates
echo -e "\n10. Security Updates Pending:"
# apt warns on stderr that its CLI is unstable; that warning is discarded.
UPGRADABLE=$(apt list --upgradable 2>/dev/null)
echo "   Security: $(printf '%s\n' "$UPGRADABLE" | count_upgradable -security)"
echo "   All upgradable: $(printf '%s\n' "$UPGRADABLE" | count_upgradable)"

echo -e "\n✓ Security audit complete"

Document Version: 1.0
Last Updated: January 31, 2026
Audience: DevOps Engineers, System Administrators
Contact: Infrastructure Operations Team

RUNBOOK