From 7b69090262c7d6b5576554c14a4815027b29cc63 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 10 Dec 2025 01:40:33 +0000 Subject: [PATCH 1/6] feat: add RDS/Aurora refresh component for DBLab Add a new component that automates DBLab full refresh using temporary RDS/Aurora database clones created from snapshots. This enables a hassle-free data sync workflow that doesn't impact production. Features: - Create temporary RDS/Aurora clones from latest automated snapshots - Wait for clone availability with proper timeout handling - Trigger DBLab full refresh via API - Poll refresh status until completion - Clean up temporary clones automatically Deployment options: - AWS Lambda with SAM template and EventBridge scheduling - Standalone CLI binary for cron/manual execution - Docker container for containerized environments Includes comprehensive documentation with IAM policy examples and example configuration files. --- engine/cmd/rds-refresh/main.go | 176 ++++++++ engine/configs/rds-refresh.example.yaml | 95 ++++ engine/deploy/rds-refresh/Dockerfile | 39 ++ engine/deploy/rds-refresh/README.md | 356 +++++++++++++++ engine/deploy/rds-refresh/iam-policy.json | 49 +++ engine/deploy/rds-refresh/template.yaml | 241 ++++++++++ engine/go.mod | 16 + engine/go.sum | 32 ++ engine/internal/rdsrefresh/config.go | 190 ++++++++ engine/internal/rdsrefresh/dblab.go | 205 +++++++++ engine/internal/rdsrefresh/lambda.go | 174 ++++++++ engine/internal/rdsrefresh/rds.go | 509 ++++++++++++++++++++++ engine/internal/rdsrefresh/refresher.go | 243 +++++++++++ 13 files changed, 2325 insertions(+) create mode 100644 engine/cmd/rds-refresh/main.go create mode 100644 engine/configs/rds-refresh.example.yaml create mode 100644 engine/deploy/rds-refresh/Dockerfile create mode 100644 engine/deploy/rds-refresh/README.md create mode 100644 engine/deploy/rds-refresh/iam-policy.json create mode 100644 engine/deploy/rds-refresh/template.yaml create mode 100644 engine/internal/rdsrefresh/config.go create mode 
100644 engine/internal/rdsrefresh/dblab.go create mode 100644 engine/internal/rdsrefresh/lambda.go create mode 100644 engine/internal/rdsrefresh/rds.go create mode 100644 engine/internal/rdsrefresh/refresher.go diff --git a/engine/cmd/rds-refresh/main.go b/engine/cmd/rds-refresh/main.go new file mode 100644 index 00000000..b51f0efa --- /dev/null +++ b/engine/cmd/rds-refresh/main.go @@ -0,0 +1,176 @@ +/* +2024 © Postgres.ai +*/ + +// Package main provides the entry point for the rds-refresh CLI tool. +// This tool automates DBLab full refresh using temporary RDS/Aurora clones. +package main + +import ( + "context" + "flag" + "fmt" + "os" + "os/signal" + "syscall" + + "github.com/aws/aws-lambda-go/lambda" + + "gitlab.com/postgres-ai/database-lab/v3/internal/rdsrefresh" +) + +var ( + version = "dev" + buildTime = "unknown" +) + +func main() { + // Check if running in Lambda + if os.Getenv("AWS_LAMBDA_FUNCTION_NAME") != "" { + lambda.Start(rdsrefresh.HandleLambda) + return + } + + // CLI mode + configPath := flag.String("config", "", "Path to configuration file") + dryRun := flag.Bool("dry-run", false, "Validate configuration without creating resources") + showVersion := flag.Bool("version", false, "Show version information") + help := flag.Bool("help", false, "Show help") + + flag.Usage = printUsage + flag.Parse() + + if *help { + printUsage() + os.Exit(0) + } + + if *showVersion { + fmt.Printf("rds-refresh version %s (built: %s)\n", version, buildTime) + os.Exit(0) + } + + if *configPath == "" { + fmt.Fprintln(os.Stderr, "error: -config flag is required") + printUsage() + os.Exit(1) + } + + if err := run(*configPath, *dryRun); err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(1) + } +} + +func run(configPath string, dryRun bool) error { + cfg, err := rdsrefresh.LoadConfig(configPath) + if err != nil { + return fmt.Errorf("failed to load config: %w", err) + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Handle 
interrupt signals + sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) + + go func() { + sig := <-sigCh + fmt.Printf("\nReceived signal %v, initiating graceful shutdown...\n", sig) + cancel() + }() + + logger := &rdsrefresh.DefaultLogger{} + + refresher, err := rdsrefresh.NewRefresher(ctx, cfg, logger) + if err != nil { + return fmt.Errorf("failed to initialize refresher: %w", err) + } + + if dryRun { + return refresher.DryRun(ctx) + } + + result := refresher.Run(ctx) + + fmt.Println() + fmt.Println("=== Refresh Summary ===") + fmt.Printf("Success: %v\n", result.Success) + fmt.Printf("Snapshot: %s\n", result.SnapshotID) + fmt.Printf("Clone ID: %s\n", result.CloneID) + fmt.Printf("Duration: %v\n", result.Duration.Round(1e9)) + + if result.Error != nil { + return result.Error + } + + return nil +} + +func printUsage() { + fmt.Fprintf(os.Stderr, `rds-refresh - Automate DBLab full refresh using RDS/Aurora snapshots + +This tool creates a temporary RDS/Aurora clone from a snapshot, triggers +a DBLab Engine full refresh, and then cleans up the temporary clone. 
+ +USAGE: + rds-refresh -config [options] + +OPTIONS: + -config Path to YAML configuration file (required) + -dry-run Validate configuration without creating resources + -version Show version information + -help Show this help message + +LAMBDA MODE: + When running as an AWS Lambda function (detected via AWS_LAMBDA_FUNCTION_NAME + environment variable), configuration is loaded from environment variables: + + Required: + RDS_SOURCE_IDENTIFIER Source RDS instance or Aurora cluster ID + RDS_CLONE_INSTANCE_CLASS Instance class for the clone (e.g., db.t3.medium) + DBLAB_API_ENDPOINT DBLab Engine API endpoint + DBLAB_TOKEN DBLab verification token + AWS_REGION AWS region + + Optional: + RDS_SOURCE_TYPE "rds" or "aurora-cluster" (default: rds) + RDS_SNAPSHOT_IDENTIFIER Specific snapshot ID (default: latest) + RDS_CLONE_SUBNET_GROUP DB subnet group name + RDS_CLONE_SECURITY_GROUPS JSON array of security group IDs + RDS_CLONE_PUBLIC "true" to make clone publicly accessible + RDS_CLONE_PARAMETER_GROUP DB parameter group name + RDS_CLONE_ENABLE_IAM_AUTH "true" to enable IAM authentication + RDS_CLONE_STORAGE_TYPE Storage type (gp2, gp3, io1, etc.) 
+ RDS_CLONE_TAGS JSON object of additional tags + DBLAB_INSECURE "true" to skip TLS verification + +EXAMPLE CONFIGURATION: + + source: + type: rds + identifier: production-db + + clone: + instanceClass: db.t3.medium + subnetGroup: default-vpc-subnet + securityGroups: + - sg-12345678 + publiclyAccessible: false + enableIAMAuth: true + + dblab: + apiEndpoint: https://dblab.example.com:2345 + token: ${DBLAB_TOKEN} + pollInterval: 30s + timeout: 4h + + aws: + region: us-east-1 + +For more information, see: + https://postgres.ai/docs/database-lab-engine + +`) +} diff --git a/engine/configs/rds-refresh.example.yaml b/engine/configs/rds-refresh.example.yaml new file mode 100644 index 00000000..300d5ab8 --- /dev/null +++ b/engine/configs/rds-refresh.example.yaml @@ -0,0 +1,95 @@ +# Example configuration for rds-refresh component +# +# This component automates DBLab full refresh using temporary RDS/Aurora clones. +# Copy this file and customize for your environment. +# +# For Lambda deployment, see deploy/rds-refresh/template.yaml +# For CLI usage: rds-refresh -config rds-refresh.yaml + +# Source database configuration +source: + # Type of source database: + # - "rds" for RDS DB instance + # - "aurora-cluster" for Aurora cluster + type: rds + + # RDS DB instance identifier or Aurora cluster identifier + identifier: production-db + + # Optional: Specific snapshot identifier to use + # If empty, the latest automated snapshot will be used + # snapshotIdentifier: rds:production-db-2024-01-15-02-00 + +# Temporary clone configuration +clone: + # Instance class for the clone (can be smaller than production) + instanceClass: db.t3.medium + + # DB subnet group (must be in a VPC accessible from DBLab Engine) + subnetGroup: default-vpc-subnet + + # VPC security groups for the clone + # Must allow inbound connections from DBLab Engine on PostgreSQL port + securityGroups: + - sg-12345678 + - sg-87654321 + + # Whether the clone should be publicly accessible + # Set to false if DBLab is 
in the same VPC + publiclyAccessible: false + + # Enable IAM database authentication (recommended) + enableIAMAuth: true + + # Optional: DB parameter group name + # parameterGroup: custom-postgres-params + + # Optional: DB option group name (RDS only) + # optionGroup: custom-options + + # Optional: Cluster parameter group (Aurora only) + # clusterParameterGroup: aurora-postgres-params + + # Optional: Engine version override + # engineVersion: "15.4" + + # Optional: Custom port (default: 5432) + # port: 5432 + + # Optional: Storage type (gp2, gp3, io1, io2) + # storageType: gp3 + + # Deletion protection (should be false for temporary clones) + deletionProtection: false + + # Additional tags for the clone + tags: + Environment: dblab-refresh + Team: platform + CostCenter: engineering + +# DBLab Engine configuration +dblab: + # DBLab Engine API endpoint + apiEndpoint: https://dblab.example.com:2345 + + # Verification token for DBLab API + # Use environment variable expansion for security + token: ${DBLAB_TOKEN} + + # Skip TLS certificate verification (not recommended for production) + insecure: false + + # How often to poll DBLab status during refresh + pollInterval: 30s + + # Maximum time to wait for refresh to complete + timeout: 4h + +# AWS configuration +aws: + # AWS region where RDS/Aurora resources are located + region: us-east-1 + + # Optional: Custom AWS endpoint (for testing with LocalStack) + # endpoint: http://localhost:4566 diff --git a/engine/deploy/rds-refresh/Dockerfile b/engine/deploy/rds-refresh/Dockerfile new file mode 100644 index 00000000..4ff75443 --- /dev/null +++ b/engine/deploy/rds-refresh/Dockerfile @@ -0,0 +1,39 @@ +# Build stage +FROM golang:1.23-alpine AS builder + +RUN apk add --no-cache git ca-certificates + +WORKDIR /build + +# Copy go mod files first for better caching +COPY engine/go.mod engine/go.sum ./ +RUN go mod download + +# Copy source code +COPY engine/ ./ + +# Build the binary +ARG VERSION=dev +ARG BUILD_TIME=unknown + +RUN 
CGO_ENABLED=0 GOOS=linux go build \ + -ldflags="-s -w -X main.version=${VERSION} -X main.buildTime=${BUILD_TIME}" \ + -o /rds-refresh \ + ./cmd/rds-refresh + +# Runtime stage +FROM alpine:3.19 + +RUN apk add --no-cache ca-certificates tzdata + +# Create non-root user +RUN adduser -D -u 1000 appuser + +WORKDIR /app + +COPY --from=builder /rds-refresh /usr/local/bin/rds-refresh + +USER appuser + +ENTRYPOINT ["/usr/local/bin/rds-refresh"] +CMD ["--help"] diff --git a/engine/deploy/rds-refresh/README.md b/engine/deploy/rds-refresh/README.md new file mode 100644 index 00000000..d7d46c82 --- /dev/null +++ b/engine/deploy/rds-refresh/README.md @@ -0,0 +1,356 @@ +# DBLab RDS/Aurora Refresh Component + +Automates DBLab Engine full refresh using temporary RDS or Aurora clones created from snapshots. + +## Overview + +This component provides a hassle-free way to keep your DBLab Engine data synchronized with your production RDS/Aurora database. It: + +1. **Creates a temporary clone** from the latest RDS/Aurora snapshot +2. **Triggers DBLab full refresh** to sync data from the clone +3. **Deletes the temporary clone** after refresh completes + +This approach avoids impacting your production database during the data sync process. + +## Deployment Options + +### Option 1: AWS Lambda (Recommended) + +Deploy as a serverless function with automatic scheduling via EventBridge. 
+ +#### Prerequisites + +- [AWS SAM CLI](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/install-sam-cli.html) +- AWS credentials configured +- Go 1.21+ (for building) + +#### Quick Start + +```bash +# Clone the repository +git clone https://gitlab.com/postgres-ai/database-lab.git +cd database-lab/engine/deploy/rds-refresh + +# Build and deploy +sam build +sam deploy --guided +``` + +During guided deployment, you'll be prompted for: + +| Parameter | Description | Example | +|-----------|-------------|---------| +| `RDSSourceType` | `rds` or `aurora-cluster` | `rds` | +| `RDSSourceIdentifier` | Source DB identifier | `production-db` | +| `RDSCloneInstanceClass` | Clone instance size | `db.t3.medium` | +| `DBLabAPIEndpoint` | DBLab API URL | `https://dblab.example.com:2345` | +| `DBLabToken` | DBLab verification token | `your-secret-token` | +| `ScheduleExpression` | Refresh schedule | `rate(7 days)` | + +#### Manual Invocation + +```bash +# Dry run (validates configuration) +aws lambda invoke --function-name dblab-rds-refresh \ + --cli-binary-format raw-in-base64-out \ + --payload '{"dryRun": true}' \ + response.json && cat response.json + +# Full refresh +aws lambda invoke --function-name dblab-rds-refresh \ + --cli-binary-format raw-in-base64-out \ + --payload '{"dryRun": false}' \ + response.json && cat response.json +``` + +### Option 2: CLI Binary + +Run as a standalone binary via cron or systemd timer. 
+ +#### Build + +```bash +cd engine +go build -o rds-refresh ./cmd/rds-refresh +``` + +#### Usage + +```bash +# Dry run +./rds-refresh -config config.yaml -dry-run + +# Full refresh +./rds-refresh -config config.yaml +``` + +#### Example Configuration + +```yaml +# config.yaml +source: + type: rds # or aurora-cluster + identifier: production-db # RDS instance or Aurora cluster ID + # snapshotIdentifier: "" # optional: specific snapshot (default: latest) + +clone: + instanceClass: db.t3.medium # smaller than prod for cost savings + subnetGroup: default-vpc # same VPC as DBLab Engine + securityGroups: + - sg-12345678 # must allow DBLab to connect + publiclyAccessible: false + enableIAMAuth: true # recommended for secure access + # parameterGroup: "" # optional: custom parameter group + # storageType: gp3 # optional: storage type + +dblab: + apiEndpoint: https://dblab.example.com:2345 + token: ${DBLAB_TOKEN} # environment variable expansion + pollInterval: 30s + timeout: 4h + +aws: + region: us-east-1 +``` + +#### Cron Example + +```bash +# Run every Sunday at 2 AM +0 2 * * 0 /usr/local/bin/rds-refresh -config /etc/dblab/rds-refresh.yaml >> /var/log/rds-refresh.log 2>&1 +``` + +### Option 3: Docker Container + +```bash +# Build (from repository root) +docker build -t dblab-rds-refresh -f engine/deploy/rds-refresh/Dockerfile . 
+ +# Run +docker run -v /path/to/config.yaml:/config.yaml \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DBLAB_TOKEN \ + dblab-rds-refresh -config /config.yaml +``` + +### Option 4: Kubernetes CronJob + +```yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: dblab-rds-refresh +spec: + schedule: "0 2 * * 0" # Every Sunday at 2 AM + jobTemplate: + spec: + template: + spec: + serviceAccountName: dblab-rds-refresh # with IRSA + containers: + - name: rds-refresh + image: postgresai/rds-refresh:latest + args: ["-config", "/config/config.yaml"] + volumeMounts: + - name: config + mountPath: /config + env: + - name: DBLAB_TOKEN + valueFrom: + secretKeyRef: + name: dblab-secrets + key: token + volumes: + - name: config + configMap: + name: rds-refresh-config + restartPolicy: OnFailure +``` + +## AWS IAM Permissions + +### Minimal IAM Policy + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "RDSReadSnapshots", + "Effect": "Allow", + "Action": [ + "rds:DescribeDBSnapshots", + "rds:DescribeDBClusterSnapshots", + "rds:DescribeDBInstances", + "rds:DescribeDBClusters" + ], + "Resource": "*" + }, + { + "Sid": "RDSCreateClone", + "Effect": "Allow", + "Action": [ + "rds:RestoreDBInstanceFromDBSnapshot", + "rds:RestoreDBClusterFromSnapshot", + "rds:CreateDBInstance", + "rds:AddTagsToResource", + "rds:ModifyDBInstance", + "rds:ModifyDBCluster" + ], + "Resource": [ + "arn:aws:rds:*:ACCOUNT_ID:db:dblab-refresh-*", + "arn:aws:rds:*:ACCOUNT_ID:cluster:dblab-refresh-*", + "arn:aws:rds:*:ACCOUNT_ID:snapshot:*", + "arn:aws:rds:*:ACCOUNT_ID:cluster-snapshot:*", + "arn:aws:rds:*:ACCOUNT_ID:subgrp:*", + "arn:aws:rds:*:ACCOUNT_ID:pg:*", + "arn:aws:rds:*:ACCOUNT_ID:og:*" + ] + }, + { + "Sid": "RDSDeleteClone", + "Effect": "Allow", + "Action": [ + "rds:DeleteDBInstance", + "rds:DeleteDBCluster" + ], + "Resource": [ + "arn:aws:rds:*:ACCOUNT_ID:db:dblab-refresh-*", + "arn:aws:rds:*:ACCOUNT_ID:cluster:dblab-refresh-*" + ] + } + ] +} +``` + +Replace 
`ACCOUNT_ID` with your AWS account ID. + +### For IAM Database Authentication + +If using RDS IAM authentication (recommended), the DBLab Engine also needs: + +```json +{ + "Sid": "RDSIAMConnect", + "Effect": "Allow", + "Action": "rds-db:connect", + "Resource": "arn:aws:rds-db:*:ACCOUNT_ID:dbuser:*/dblab_user" +} +``` + +## DBLab Engine Configuration + +Configure DBLab Engine to connect to the temporary clone using RDS IAM authentication: + +```yaml +# server.yml (DBLab Engine config) +retrieval: + refresh: + timetable: "" # Disable built-in scheduler (managed externally) + skipStartRefresh: true + + jobs: + - logicalDump + - logicalRestore + - logicalSnapshot + + spec: + logicalDump: + options: + dockerImage: "postgresai/extended-postgres:17" + dumpLocation: "/var/lib/dblab/dblab_pool/dump" + + source: + type: rdsIam + connection: + dbname: mydb + username: dblab_user + rdsIam: + awsRegion: us-east-1 + # This will be updated by rds-refresh or pre-configured + dbInstanceIdentifier: dblab-refresh-current + sslRootCert: "/cert/rds-combined-ca-bundle.pem" + + parallelJobs: 4 + customOptions: + - "--exclude-schema=rdsdms" +``` + +## Security Best Practices + +1. **Use IAM Database Authentication** - Avoid storing database passwords +2. **Use Secrets Manager** - Store the DBLab token in AWS Secrets Manager +3. **VPC Configuration** - Run clones in a private subnet accessible only to DBLab +4. **Minimal Permissions** - Use the minimal IAM policy above +5. 
**Encryption** - Ensure clones inherit encryption from snapshots + +## Monitoring + +### CloudWatch Metrics (Lambda) + +The Lambda function emits standard metrics: +- `Invocations` - Number of refresh attempts +- `Errors` - Failed refreshes +- `Duration` - Execution time + +### Viewing CloudWatch Logs + +```bash +# View recent logs +aws logs tail /aws/lambda/dblab-rds-refresh --follow +``` + +### Alerting + +Set up CloudWatch Alarms for: +- Lambda errors > 0 +- Lambda duration > threshold +- (Optional) Custom metrics on refresh success/failure + +## Troubleshooting + +### Common Issues + +**Clone creation fails with "DBSubnetGroup not found"** +- Ensure the subnet group exists and is in the same VPC as the clone's security groups + +**Clone creation fails with "VPCSecurityGroupNotFound"** +- Verify security group IDs are correct + +**DBLab refresh timeout** +- Increase `dblab.timeout` in configuration +- Check DBLab Engine logs for issues + +**Clone not accessible from DBLab** +- Verify security groups allow connection from DBLab +- Check if publiclyAccessible setting is correct + +### Debug Mode + +```bash +# CLI: Capture all output (stdout and stderr) to a log file +./rds-refresh -config config.yaml 2>&1 | tee refresh.log + +# Lambda: Check CloudWatch logs +aws logs tail /aws/lambda/dblab-rds-refresh --since 1h +``` + +## Cost Considerations + +- **Clone runtime**: You pay for the clone instance while it exists +- **Storage**: A clone restored from a snapshot gets its own storage, billed while the clone exists +- **Lambda**: Minimal cost (typically < $0.10/month for weekly refreshes) + +**Cost optimization tips**: +- Use a smaller instance class than production +- Use `gp3` storage type for better price/performance +- Schedule refreshes during off-peak hours + +## Contributing + +See the main [Database Lab Engine contributing guide](../../CONTRIBUTING.md). + +## License + +Apache 2.0 - see [LICENSE](../../LICENSE). 
diff --git a/engine/deploy/rds-refresh/iam-policy.json b/engine/deploy/rds-refresh/iam-policy.json new file mode 100644 index 00000000..deb13f67 --- /dev/null +++ b/engine/deploy/rds-refresh/iam-policy.json @@ -0,0 +1,49 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "RDSReadSnapshots", + "Effect": "Allow", + "Action": [ + "rds:DescribeDBSnapshots", + "rds:DescribeDBClusterSnapshots", + "rds:DescribeDBInstances", + "rds:DescribeDBClusters" + ], + "Resource": "*" + }, + { + "Sid": "RDSCreateClone", + "Effect": "Allow", + "Action": [ + "rds:RestoreDBInstanceFromDBSnapshot", + "rds:RestoreDBClusterFromSnapshot", + "rds:CreateDBInstance", + "rds:AddTagsToResource", + "rds:ModifyDBInstance", + "rds:ModifyDBCluster" + ], + "Resource": [ + "arn:aws:rds:*:*:db:dblab-refresh-*", + "arn:aws:rds:*:*:cluster:dblab-refresh-*", + "arn:aws:rds:*:*:snapshot:*", + "arn:aws:rds:*:*:cluster-snapshot:*", + "arn:aws:rds:*:*:subgrp:*", + "arn:aws:rds:*:*:pg:*", + "arn:aws:rds:*:*:og:*" + ] + }, + { + "Sid": "RDSDeleteClone", + "Effect": "Allow", + "Action": [ + "rds:DeleteDBInstance", + "rds:DeleteDBCluster" + ], + "Resource": [ + "arn:aws:rds:*:*:db:dblab-refresh-*", + "arn:aws:rds:*:*:cluster:dblab-refresh-*" + ] + } + ] +} diff --git a/engine/deploy/rds-refresh/template.yaml b/engine/deploy/rds-refresh/template.yaml new file mode 100644 index 00000000..6a7f7826 --- /dev/null +++ b/engine/deploy/rds-refresh/template.yaml @@ -0,0 +1,241 @@ +AWSTemplateFormatVersion: '2010-09-09' +Transform: AWS::Serverless-2016-10-31 +Description: > + DBLab RDS/Aurora Refresh Lambda + + Automates DBLab full refresh using temporary RDS/Aurora clones created from snapshots. 
+ +Metadata: + AWS::ServerlessRepo::Application: + Name: dblab-rds-refresh + Description: Automates DBLab full refresh using temporary RDS/Aurora clones + Author: Postgres.ai + SpdxLicenseId: Apache-2.0 + Labels: ['dblab', 'rds', 'aurora', 'postgresql', 'database'] + HomePageUrl: https://postgres.ai + SourceCodeUrl: https://gitlab.com/postgres-ai/database-lab + +Parameters: + # Source Configuration + RDSSourceType: + Type: String + Default: rds + AllowedValues: + - rds + - aurora-cluster + Description: Type of source database (rds for RDS instance, aurora-cluster for Aurora) + + RDSSourceIdentifier: + Type: String + Description: RDS DB instance identifier or Aurora cluster identifier + + RDSSnapshotIdentifier: + Type: String + Default: '' + Description: Specific snapshot ID to use (leave empty for latest automated snapshot) + + # Clone Configuration + RDSCloneInstanceClass: + Type: String + Default: db.t3.medium + Description: Instance class for the temporary clone + + RDSCloneSubnetGroup: + Type: String + Default: '' + Description: DB subnet group name for the clone + + RDSCloneSecurityGroups: + Type: CommaDelimitedList + Default: '' + Description: Comma-separated list of VPC security group IDs + + RDSClonePubliclyAccessible: + Type: String + Default: 'false' + AllowedValues: + - 'true' + - 'false' + Description: Whether the clone should be publicly accessible + + RDSCloneEnableIAMAuth: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Enable IAM database authentication on the clone + + RDSCloneParameterGroup: + Type: String + Default: '' + Description: DB parameter group name for the clone + + RDSCloneStorageType: + Type: String + Default: '' + Description: Storage type for the clone (gp2, gp3, io1, etc.) 
+ + # DBLab Configuration + DBLabAPIEndpoint: + Type: String + Description: DBLab Engine API endpoint (e.g., https://dblab.example.com:2345) + + DBLabToken: + Type: String + NoEcho: true + Description: DBLab verification token + + DBLabInsecure: + Type: String + Default: 'false' + AllowedValues: + - 'true' + - 'false' + Description: Skip TLS certificate verification for DBLab API + + # Schedule Configuration + ScheduleExpression: + Type: String + Default: 'rate(7 days)' + Description: Schedule expression for automatic refresh (e.g., 'rate(7 days)' or 'cron(0 2 ? * SUN *)') + + EnableSchedule: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Enable scheduled automatic refresh + + # Lambda Configuration + LambdaTimeout: + Type: Number + Default: 900 + MinValue: 60 + MaxValue: 900 + Description: Lambda function timeout in seconds (max 15 minutes) + + LambdaMemorySize: + Type: Number + Default: 256 + MinValue: 128 + MaxValue: 1024 + Description: Lambda function memory size in MB + +Conditions: + ScheduleEnabled: !Equals [!Ref EnableSchedule, 'true'] + HasSubnetGroup: !Not [!Equals [!Ref RDSCloneSubnetGroup, '']] + HasSecurityGroups: !Not [!Equals [!Join ['', !Ref RDSCloneSecurityGroups], '']] + HasParameterGroup: !Not [!Equals [!Ref RDSCloneParameterGroup, '']] + HasStorageType: !Not [!Equals [!Ref RDSCloneStorageType, '']] + HasSnapshotId: !Not [!Equals [!Ref RDSSnapshotIdentifier, '']] + +Globals: + Function: + Timeout: !Ref LambdaTimeout + MemorySize: !Ref LambdaMemorySize + Runtime: provided.al2023 + Architectures: + - arm64 + +Resources: + RDSRefreshFunction: + Type: AWS::Serverless::Function + Metadata: + BuildMethod: go1.x + Properties: + CodeUri: ../../ + Handler: bootstrap + Description: Automates DBLab full refresh using temporary RDS/Aurora clones + Environment: + Variables: + RDS_SOURCE_TYPE: !Ref RDSSourceType + RDS_SOURCE_IDENTIFIER: !Ref RDSSourceIdentifier + RDS_SNAPSHOT_IDENTIFIER: !If [HasSnapshotId, !Ref 
RDSSnapshotIdentifier, ''] + RDS_CLONE_INSTANCE_CLASS: !Ref RDSCloneInstanceClass + RDS_CLONE_SUBNET_GROUP: !If [HasSubnetGroup, !Ref RDSCloneSubnetGroup, ''] + RDS_CLONE_SECURITY_GROUPS: !If [HasSecurityGroups, !Sub '["${RDSCloneSecurityGroups}"]', ''] + RDS_CLONE_PUBLIC: !Ref RDSClonePubliclyAccessible + RDS_CLONE_ENABLE_IAM_AUTH: !Ref RDSCloneEnableIAMAuth + RDS_CLONE_PARAMETER_GROUP: !If [HasParameterGroup, !Ref RDSCloneParameterGroup, ''] + RDS_CLONE_STORAGE_TYPE: !If [HasStorageType, !Ref RDSCloneStorageType, ''] + DBLAB_API_ENDPOINT: !Ref DBLabAPIEndpoint + DBLAB_TOKEN: !Ref DBLabToken + DBLAB_INSECURE: !Ref DBLabInsecure + Policies: + - Version: '2012-10-17' + Statement: + - Sid: RDSReadSnapshots + Effect: Allow + Action: + - rds:DescribeDBSnapshots + - rds:DescribeDBClusterSnapshots + - rds:DescribeDBInstances + - rds:DescribeDBClusters + Resource: '*' + - Sid: RDSCreateClone + Effect: Allow + Action: + - rds:RestoreDBInstanceFromDBSnapshot + - rds:RestoreDBClusterFromSnapshot + - rds:CreateDBInstance + - rds:AddTagsToResource + - rds:ModifyDBInstance + - rds:ModifyDBCluster + Resource: + - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:db:dblab-refresh-*' + - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:cluster:dblab-refresh-*' + - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:snapshot:*' + - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:cluster-snapshot:*' + - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:subgrp:*' + - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:pg:*' + - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:og:*' + - Sid: RDSDeleteClone + Effect: Allow + Action: + - rds:DeleteDBInstance + - rds:DeleteDBCluster + Resource: + - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:db:dblab-refresh-*' + - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:cluster:dblab-refresh-*' + Events: + ScheduledRefresh: + Type: Schedule + Properties: + Schedule: !Ref ScheduleExpression + Description: Scheduled DBLab refresh 
trigger + Enabled: !If [ScheduleEnabled, true, false] + + RDSRefreshLogGroup: + Type: AWS::Logs::LogGroup + Properties: + LogGroupName: !Sub '/aws/lambda/${RDSRefreshFunction}' + RetentionInDays: 30 + +Outputs: + RDSRefreshFunctionArn: + Description: ARN of the RDS Refresh Lambda function + Value: !GetAtt RDSRefreshFunction.Arn + Export: + Name: !Sub '${AWS::StackName}-FunctionArn' + + RDSRefreshFunctionName: + Description: Name of the RDS Refresh Lambda function + Value: !Ref RDSRefreshFunction + + InvocationCommand: + Description: AWS CLI command to manually invoke the function + Value: !Sub | + aws lambda invoke --function-name ${RDSRefreshFunction} \ + --cli-binary-format raw-in-base64-out \ + --payload '{"dryRun": false}' \ + response.json && cat response.json + + DryRunCommand: + Description: AWS CLI command to run a dry-run test + Value: !Sub | + aws lambda invoke --function-name ${RDSRefreshFunction} \ + --cli-binary-format raw-in-base64-out \ + --payload '{"dryRun": true}' \ + response.json && cat response.json diff --git a/engine/go.mod b/engine/go.mod index 81d0e24c..cf8d3ada 100644 --- a/engine/go.mod +++ b/engine/go.mod @@ -42,6 +42,22 @@ require ( dario.cat/mergo v1.0.2 // indirect github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect github.com/Microsoft/go-winio v0.6.2 // indirect + github.com/aws/aws-lambda-go v1.51.0 // indirect + github.com/aws/aws-sdk-go-v2 v1.41.0 // indirect + github.com/aws/aws-sdk-go-v2/config v1.32.5 // indirect + github.com/aws/aws-sdk-go-v2/credentials v1.19.5 // indirect + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.16 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.16 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.16 // indirect + github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.16 // 
indirect + github.com/aws/aws-sdk-go-v2/service/rds v1.113.1 // indirect + github.com/aws/aws-sdk-go-v2/service/signin v1.0.4 // indirect + github.com/aws/aws-sdk-go-v2/service/sso v1.30.7 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.12 // indirect + github.com/aws/aws-sdk-go-v2/service/sts v1.41.5 // indirect + github.com/aws/smithy-go v1.24.0 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/containerd/errdefs v1.0.0 // indirect github.com/containerd/errdefs/pkg v0.3.0 // indirect diff --git a/engine/go.sum b/engine/go.sum index 16595c52..8d6ff5a5 100644 --- a/engine/go.sum +++ b/engine/go.sum @@ -14,8 +14,40 @@ github.com/ahmetalpbalkan/dlog v0.0.0-20170105205344-4fb5f8204f26 h1:pzStYMLAXM7 github.com/ahmetalpbalkan/dlog v0.0.0-20170105205344-4fb5f8204f26/go.mod h1:ilK+u7u1HoqaDk0mjhh27QJB7PyWMreGffEvOCoEKiY= github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de h1:FxWPpzIjnTlhPwqqXc4/vE0f7GvRjuAsbW+HOIe8KnA= github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de/go.mod h1:DCaWoUhZrYW9p1lxo/cm8EmUOOzAPSEZNGF2DK1dJgw= +github.com/aws/aws-lambda-go v1.51.0 h1:/THH60NjiAs3K5TWet3Gx5w8MdR7oPOQH9utaKYY1JQ= +github.com/aws/aws-lambda-go v1.51.0/go.mod h1:dpMpZgvWx5vuQJfBt0zqBha60q7Dd7RfgJv23DymV8A= github.com/aws/aws-sdk-go v1.44.309 h1:IPJOFBzXekakxmEpDwd4RTKmmBR6LIAiXgNsM51bWbU= github.com/aws/aws-sdk-go v1.44.309/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= +github.com/aws/aws-sdk-go-v2 v1.41.0 h1:tNvqh1s+v0vFYdA1xq0aOJH+Y5cRyZ5upu6roPgPKd4= +github.com/aws/aws-sdk-go-v2 v1.41.0/go.mod h1:MayyLB8y+buD9hZqkCW3kX1AKq07Y5pXxtgB+rRFhz0= +github.com/aws/aws-sdk-go-v2/config v1.32.5 h1:pz3duhAfUgnxbtVhIK39PGF/AHYyrzGEyRD9Og0QrE8= +github.com/aws/aws-sdk-go-v2/config v1.32.5/go.mod h1:xmDjzSUs/d0BB7ClzYPAZMmgQdrodNjPPhd6bGASwoE= +github.com/aws/aws-sdk-go-v2/credentials v1.19.5 h1:xMo63RlqP3ZZydpJDMBsH9uJ10hgHYfQFIk1cHDXrR4= +github.com/aws/aws-sdk-go-v2/credentials v1.19.5/go.mod 
h1:hhbH6oRcou+LpXfA/0vPElh/e0M3aFeOblE1sssAAEk= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.16 h1:80+uETIWS1BqjnN9uJ0dBUaETh+P1XwFy5vwHwK5r9k= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.16/go.mod h1:wOOsYuxYuB/7FlnVtzeBYRcjSRtQpAW0hCP7tIULMwo= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.16 h1:rgGwPzb82iBYSvHMHXc8h9mRoOUBZIGFgKb9qniaZZc= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.16/go.mod h1:L/UxsGeKpGoIj6DxfhOWHWQ/kGKcd4I1VncE4++IyKA= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.16 h1:1jtGzuV7c82xnqOVfx2F0xmJcOw5374L7N6juGW6x6U= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.16/go.mod h1:M2E5OQf+XLe+SZGmmpaI2yy+J326aFf6/+54PoxSANc= +github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 h1:WKuaxf++XKWlHWu9ECbMlha8WOEGm0OUEZqm4K/Gcfk= +github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4/go.mod h1:ZWy7j6v1vWGmPReu0iSGvRiise4YI5SkR3OHKTZ6Wuc= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4 h1:0ryTNEdJbzUCEWkVXEXoqlXV72J5keC1GvILMOuD00E= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4/go.mod h1:HQ4qwNZh32C3CBeO6iJLQlgtMzqeG17ziAA/3KDJFow= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.16 h1:oHjJHeUy0ImIV0bsrX0X91GkV5nJAyv1l1CC9lnO0TI= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.16/go.mod h1:iRSNGgOYmiYwSCXxXaKb9HfOEj40+oTKn8pTxMlYkRM= +github.com/aws/aws-sdk-go-v2/service/rds v1.113.1 h1:/vV0g/Su8rCTqT57UUYiFU/aRrPXz//fGDn1dkXblG4= +github.com/aws/aws-sdk-go-v2/service/rds v1.113.1/go.mod h1:q02df+DL73LN+jDXzj86tMsI6kKf1kfv61nB684H+o8= +github.com/aws/aws-sdk-go-v2/service/signin v1.0.4 h1:HpI7aMmJ+mm1wkSHIA2t5EaFFv5EFYXePW30p1EIrbQ= +github.com/aws/aws-sdk-go-v2/service/signin v1.0.4/go.mod h1:C5RdGMYGlfM0gYq/tifqgn4EbyX99V15P2V3R+VHbQU= +github.com/aws/aws-sdk-go-v2/service/sso v1.30.7 h1:eYnlt6QxnFINKzwxP5/Ucs1vkG7VT3Iezmvfgc2waUw= +github.com/aws/aws-sdk-go-v2/service/sso v1.30.7/go.mod 
h1:+fWt2UHSb4kS7Pu8y+BMBvJF0EWx+4H0hzNwtDNRTrg= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.12 h1:AHDr0DaHIAo8c9t1emrzAlVDFp+iMMKnPdYy6XO4MCE= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.12/go.mod h1:GQ73XawFFiWxyWXMHWfhiomvP3tXtdNar/fi8z18sx0= +github.com/aws/aws-sdk-go-v2/service/sts v1.41.5 h1:SciGFVNZ4mHdm7gpD1dgZYnCuVdX1s+lFTg4+4DOy70= +github.com/aws/aws-sdk-go-v2/service/sts v1.41.5/go.mod h1:iW40X4QBmUxdP+fZNOpfmkdMZqsovezbAeO+Ubiv2pk= +github.com/aws/smithy-go v1.24.0 h1:LpilSUItNPFr1eY85RYgTIg5eIEPtvFbskaFcmmIUnk= +github.com/aws/smithy-go v1.24.0/go.mod h1:LEj2LM3rBRQJxPZTB4KuzZkaZYnZPnvgIhb4pu07mx0= github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/cockroachdb/apd v1.1.0 h1:3LFP3629v+1aKXU5Q37mxmRxX/pIu1nijXydLShEq5I= diff --git a/engine/internal/rdsrefresh/config.go b/engine/internal/rdsrefresh/config.go new file mode 100644 index 00000000..4f114f93 --- /dev/null +++ b/engine/internal/rdsrefresh/config.go @@ -0,0 +1,190 @@ +/* +2024 © Postgres.ai +*/ + +// Package rdsrefresh provides functionality to automate DBLab full refresh +// using temporary RDS/Aurora clones created from snapshots. +package rdsrefresh + +import ( + "fmt" + "os" + "time" + + "gopkg.in/yaml.v3" +) + +// Config holds the configuration for the RDS refresh component. +type Config struct { + Source SourceConfig `yaml:"source"` + Clone CloneConfig `yaml:"clone"` + DBLab DBLabConfig `yaml:"dblab"` + AWS AWSConfig `yaml:"aws"` +} + +// SourceConfig defines the source RDS/Aurora database to clone from. +type SourceConfig struct { + // Type specifies the source type: "rds" for RDS instance, "aurora-cluster" for Aurora cluster. + Type string `yaml:"type"` + // Identifier is the RDS DB instance identifier or Aurora cluster identifier. + Identifier string `yaml:"identifier"` + // SnapshotIdentifier is the specific snapshot to use. 
If empty, the latest automated snapshot is used. + SnapshotIdentifier string `yaml:"snapshotIdentifier"` +} + +// CloneConfig defines settings for the temporary clone. +type CloneConfig struct { + // InstanceClass is the DB instance class for the clone (e.g., "db.t3.medium"). + InstanceClass string `yaml:"instanceClass"` + // DBSubnetGroupName is the DB subnet group for the clone. + DBSubnetGroupName string `yaml:"subnetGroup"` + // VPCSecurityGroupIDs are the security group IDs to assign to the clone. + VPCSecurityGroupIDs []string `yaml:"securityGroups"` + // PubliclyAccessible determines if the clone should be publicly accessible. + PubliclyAccessible bool `yaml:"publiclyAccessible"` + // Tags are additional tags to add to the clone. + Tags map[string]string `yaml:"tags"` + // ParameterGroupName is the parameter group to use for the clone. + ParameterGroupName string `yaml:"parameterGroup"` + // OptionGroupName is the option group to use for the clone (RDS only). + OptionGroupName string `yaml:"optionGroup"` + // DBClusterParameterGroupName is the cluster parameter group for Aurora clones. + DBClusterParameterGroupName string `yaml:"clusterParameterGroup"` + // EngineVersion specifies the engine version for the clone. If empty, uses source version. + EngineVersion string `yaml:"engineVersion"` + // Port is the port for the clone. If 0, uses default port. + Port int32 `yaml:"port"` + // EnableIAMAuth enables IAM database authentication. + EnableIAMAuth bool `yaml:"enableIAMAuth"` + // StorageType specifies storage type (gp2, gp3, io1, etc.) for RDS clones. + StorageType string `yaml:"storageType"` + // DeletionProtection enables deletion protection on the clone. + DeletionProtection bool `yaml:"deletionProtection"` +} + +// DBLabConfig defines the DBLab Engine connection settings. +type DBLabConfig struct { + // APIEndpoint is the DBLab Engine API endpoint (e.g., "https://dblab.example.com:2345"). 
+ APIEndpoint string `yaml:"apiEndpoint"` + // Token is the verification token for the DBLab API. + Token string `yaml:"token"` + // Insecure allows connections to DBLab with invalid TLS certificates. + Insecure bool `yaml:"insecure"` + // PollInterval is how often to poll the DBLab status during refresh. + PollInterval Duration `yaml:"pollInterval"` + // Timeout is the maximum time to wait for the refresh to complete. + Timeout Duration `yaml:"timeout"` +} + +// AWSConfig holds AWS-specific settings. +type AWSConfig struct { + // Region is the AWS region where the RDS/Aurora resources are located. + Region string `yaml:"region"` + // Endpoint is a custom AWS endpoint (useful for testing with LocalStack). + Endpoint string `yaml:"endpoint"` +} + +// Duration is a wrapper around time.Duration for YAML parsing. +type Duration time.Duration + +// UnmarshalYAML implements yaml.Unmarshaler for Duration. +func (d *Duration) UnmarshalYAML(value *yaml.Node) error { + var s string + if err := value.Decode(&s); err != nil { + return err + } + + dur, err := time.ParseDuration(s) + if err != nil { + return fmt.Errorf("invalid duration %q: %w", s, err) + } + + *d = Duration(dur) + + return nil +} + +// MarshalYAML implements yaml.Marshaler for Duration. +func (d Duration) MarshalYAML() (interface{}, error) { + return time.Duration(d).String(), nil +} + +// Duration returns the time.Duration value. +func (d Duration) Duration() time.Duration { + return time.Duration(d) +} + +// LoadConfig loads configuration from a YAML file. 
+func LoadConfig(path string) (*Config, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("failed to read config file: %w", err) + } + + // Expand environment variables in the config + data = []byte(os.ExpandEnv(string(data))) + + var cfg Config + if err := yaml.Unmarshal(data, &cfg); err != nil { + return nil, fmt.Errorf("failed to parse config file: %w", err) + } + + if err := cfg.Validate(); err != nil { + return nil, fmt.Errorf("invalid configuration: %w", err) + } + + cfg.SetDefaults() + + return &cfg, nil +} + +// Validate checks that the configuration is valid. +func (c *Config) Validate() error { + if c.Source.Type == "" { + return fmt.Errorf("source.type is required (rds or aurora-cluster)") + } + + if c.Source.Type != "rds" && c.Source.Type != "aurora-cluster" { + return fmt.Errorf("source.type must be 'rds' or 'aurora-cluster', got %q", c.Source.Type) + } + + if c.Source.Identifier == "" { + return fmt.Errorf("source.identifier is required") + } + + if c.Clone.InstanceClass == "" { + return fmt.Errorf("clone.instanceClass is required") + } + + if c.DBLab.APIEndpoint == "" { + return fmt.Errorf("dblab.apiEndpoint is required") + } + + if c.DBLab.Token == "" { + return fmt.Errorf("dblab.token is required") + } + + if c.AWS.Region == "" { + return fmt.Errorf("aws.region is required") + } + + return nil +} + +// SetDefaults sets default values for optional configuration fields. 
+func (c *Config) SetDefaults() { + if c.DBLab.PollInterval == 0 { + c.DBLab.PollInterval = Duration(30 * time.Second) + } + + if c.DBLab.Timeout == 0 { + c.DBLab.Timeout = Duration(4 * time.Hour) + } + + if c.Clone.Tags == nil { + c.Clone.Tags = make(map[string]string) + } + + c.Clone.Tags["ManagedBy"] = "dblab-rds-refresh" + c.Clone.Tags["AutoDelete"] = "true" +} diff --git a/engine/internal/rdsrefresh/dblab.go b/engine/internal/rdsrefresh/dblab.go new file mode 100644 index 00000000..83892e7e --- /dev/null +++ b/engine/internal/rdsrefresh/dblab.go @@ -0,0 +1,205 @@ +/* +2024 © Postgres.ai +*/ + +package rdsrefresh + +import ( + "bytes" + "context" + "crypto/tls" + "encoding/json" + "fmt" + "io" + "net/http" + "time" + + "gitlab.com/postgres-ai/database-lab/v3/pkg/models" +) + +const ( + verificationHeader = "Verification-Token" + contentTypeJSON = "application/json" +) + +// DBLabClient provides methods to interact with the DBLab Engine API. +type DBLabClient struct { + baseURL string + token string + httpClient *http.Client +} + +// NewDBLabClient creates a new DBLab API client. +func NewDBLabClient(cfg *DBLabConfig) *DBLabClient { + transport := &http.Transport{ + TLSClientConfig: &tls.Config{InsecureSkipVerify: cfg.Insecure}, + } + + return &DBLabClient{ + baseURL: cfg.APIEndpoint, + token: cfg.Token, + httpClient: &http.Client{ + Transport: transport, + Timeout: 60 * time.Second, + }, + } +} + +// GetStatus returns the current DBLab Engine instance status. +func (c *DBLabClient) GetStatus(ctx context.Context) (*models.InstanceStatus, error) { + resp, err := c.doRequest(ctx, http.MethodGet, "/status", nil) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + var status models.InstanceStatus + if err := json.NewDecoder(resp.Body).Decode(&status); err != nil { + return nil, fmt.Errorf("failed to decode status response: %w", err) + } + + return &status, nil +} + +// TriggerFullRefresh triggers a full data refresh on the DBLab Engine. 
+func (c *DBLabClient) TriggerFullRefresh(ctx context.Context) error { + resp, err := c.doRequest(ctx, http.MethodPost, "/full-refresh", nil) + if err != nil { + return err + } + defer resp.Body.Close() + + var result models.Response + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return fmt.Errorf("failed to decode response: %w", err) + } + + if result.Status != "OK" { + return fmt.Errorf("full refresh failed: %s", result.Message) + } + + return nil +} + +// UpdateConfig updates the DBLab Engine configuration. +func (c *DBLabClient) UpdateConfig(ctx context.Context, configPatch map[string]interface{}) error { + body, err := json.Marshal(configPatch) + if err != nil { + return fmt.Errorf("failed to marshal config: %w", err) + } + + resp, err := c.doRequest(ctx, http.MethodPatch, "/config", bytes.NewReader(body)) + if err != nil { + return err + } + defer resp.Body.Close() + + return nil +} + +// WaitForRefreshComplete polls the DBLab status until refresh is complete or timeout. 
+func (c *DBLabClient) WaitForRefreshComplete(ctx context.Context, pollInterval, timeout time.Duration) error { + ticker := time.NewTicker(pollInterval) + defer ticker.Stop() + + timeoutTimer := time.NewTimer(timeout) + defer timeoutTimer.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-timeoutTimer.C: + return fmt.Errorf("timeout waiting for refresh to complete after %v", timeout) + case <-ticker.C: + status, err := c.GetStatus(ctx) + if err != nil { + return fmt.Errorf("failed to get status: %w", err) + } + + retrievalStatus := status.Retrieving.Status + + switch retrievalStatus { + case models.Finished: + return nil + case models.Failed: + if len(status.Retrieving.Alerts) > 0 { + for _, alert := range status.Retrieving.Alerts { + return fmt.Errorf("refresh failed: %s", alert.Message) + } + } + + return fmt.Errorf("refresh failed (no details available)") + case models.Refreshing, models.Snapshotting, models.Renewed: + // still in progress + continue + case models.Inactive, models.Pending: + // not started yet or pending + continue + default: + continue + } + } + } +} + +// IsRefreshInProgress checks if a refresh is currently in progress. +func (c *DBLabClient) IsRefreshInProgress(ctx context.Context) (bool, error) { + status, err := c.GetStatus(ctx) + if err != nil { + return false, err + } + + switch status.Retrieving.Status { + case models.Refreshing, models.Snapshotting: + return true, nil + default: + return false, nil + } +} + +// Health checks if the DBLab Engine is healthy. 
+func (c *DBLabClient) Health(ctx context.Context) error { + resp, err := c.doRequest(ctx, http.MethodGet, "/healthz", nil) + if err != nil { + return err + } + defer resp.Body.Close() + + return nil +} + +func (c *DBLabClient) doRequest(ctx context.Context, method, path string, body io.Reader) (*http.Response, error) { + url := c.baseURL + path + + req, err := http.NewRequestWithContext(ctx, method, url, body) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + + req.Header.Set(verificationHeader, c.token) + + if body != nil { + req.Header.Set("Content-Type", contentTypeJSON) + } + + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("request failed: %w", err) + } + + if resp.StatusCode >= http.StatusBadRequest { + defer resp.Body.Close() + + bodyBytes, _ := io.ReadAll(resp.Body) + + var errModel models.Error + if err := json.Unmarshal(bodyBytes, &errModel); err == nil && errModel.Message != "" { + return nil, fmt.Errorf("API error (status %d): %s", resp.StatusCode, errModel.Message) + } + + return nil, fmt.Errorf("API error (status %d): %s", resp.StatusCode, string(bodyBytes)) + } + + return resp, nil +} diff --git a/engine/internal/rdsrefresh/lambda.go b/engine/internal/rdsrefresh/lambda.go new file mode 100644 index 00000000..c38896dc --- /dev/null +++ b/engine/internal/rdsrefresh/lambda.go @@ -0,0 +1,174 @@ +/* +2024 © Postgres.ai +*/ + +package rdsrefresh + +import ( + "context" + "encoding/json" + "fmt" + "os" +) + +// LambdaEvent is the input event for the Lambda function. +type LambdaEvent struct { + // DryRun, if true, only validates configuration without creating resources. + DryRun bool `json:"dryRun"` + // ConfigOverrides allows overriding configuration values. + ConfigOverrides *ConfigOverrides `json:"configOverrides"` +} + +// ConfigOverrides allows partial configuration overrides via the Lambda event. 
type ConfigOverrides struct {
	// SnapshotIdentifier is read from the "snapshotIdentifier" key of the event.
	SnapshotIdentifier string `json:"snapshotIdentifier"`
}

// LambdaResponse is the output response from the Lambda function.
// Every field except Success and Message is omitted from the JSON when empty.
type LambdaResponse struct {
	Success       bool   `json:"success"`
	Message       string `json:"message"`
	SnapshotID    string `json:"snapshotId,omitempty"`
	CloneID       string `json:"cloneId,omitempty"`
	CloneEndpoint string `json:"cloneEndpoint,omitempty"`
	DurationSec   int64  `json:"durationSeconds,omitempty"`
	Error         string `json:"error,omitempty"`
}

// LambdaLogger implements Logger for Lambda/CloudWatch.
// Every level is written to standard output as "[LEVEL] message".
type LambdaLogger struct{}

// lambdaLogf writes one formatted line to standard output, prefixed with the
// bracketed level name. The prefix and the trailing newline are part of the
// format string itself, exactly as if they had been written inline.
func lambdaLogf(level, msg string, args []interface{}) {
	fmt.Printf("["+level+"] "+msg+"\n", args...)
}

// Info logs an info message.
func (l *LambdaLogger) Info(msg string, args ...interface{}) {
	lambdaLogf("INFO", msg, args)
}

// Error logs an error message.
func (l *LambdaLogger) Error(msg string, args ...interface{}) {
	lambdaLogf("ERROR", msg, args)
}

// Debug logs a debug message.
func (l *LambdaLogger) Debug(msg string, args ...interface{}) {
	lambdaLogf("DEBUG", msg, args)
}

// HandleLambda is the Lambda function handler.
+func HandleLambda(ctx context.Context, event LambdaEvent) (LambdaResponse, error) { + logger := &LambdaLogger{} + + cfg, err := loadLambdaConfig() + if err != nil { + return LambdaResponse{ + Success: false, + Error: err.Error(), + Message: "failed to load configuration", + }, nil + } + + // Apply overrides + if event.ConfigOverrides != nil && event.ConfigOverrides.SnapshotIdentifier != "" { + cfg.Source.SnapshotIdentifier = event.ConfigOverrides.SnapshotIdentifier + } + + refresher, err := NewRefresher(ctx, cfg, logger) + if err != nil { + return LambdaResponse{ + Success: false, + Error: err.Error(), + Message: "failed to initialize refresher", + }, nil + } + + if event.DryRun { + if err := refresher.DryRun(ctx); err != nil { + return LambdaResponse{ + Success: false, + Error: err.Error(), + Message: "dry run failed", + }, nil + } + + return LambdaResponse{ + Success: true, + Message: "dry run completed successfully", + }, nil + } + + result := refresher.Run(ctx) + + resp := LambdaResponse{ + Success: result.Success, + SnapshotID: result.SnapshotID, + CloneID: result.CloneID, + CloneEndpoint: result.CloneEndpoint, + DurationSec: int64(result.Duration.Seconds()), + } + + if result.Error != nil { + resp.Error = result.Error.Error() + resp.Message = "refresh failed" + } else { + resp.Message = "refresh completed successfully" + } + + return resp, nil +} + +// loadLambdaConfig loads configuration from environment variables. 
+func loadLambdaConfig() (*Config, error) { + cfg := &Config{} + + // Source configuration + cfg.Source.Type = getEnvOrDefault("RDS_SOURCE_TYPE", "rds") + cfg.Source.Identifier = os.Getenv("RDS_SOURCE_IDENTIFIER") + cfg.Source.SnapshotIdentifier = os.Getenv("RDS_SNAPSHOT_IDENTIFIER") + + // Clone configuration + cfg.Clone.InstanceClass = os.Getenv("RDS_CLONE_INSTANCE_CLASS") + cfg.Clone.DBSubnetGroupName = os.Getenv("RDS_CLONE_SUBNET_GROUP") + + if sgJSON := os.Getenv("RDS_CLONE_SECURITY_GROUPS"); sgJSON != "" { + if err := json.Unmarshal([]byte(sgJSON), &cfg.Clone.VPCSecurityGroupIDs); err != nil { + return nil, fmt.Errorf("invalid RDS_CLONE_SECURITY_GROUPS JSON: %w", err) + } + } + + cfg.Clone.PubliclyAccessible = os.Getenv("RDS_CLONE_PUBLIC") == "true" + cfg.Clone.ParameterGroupName = os.Getenv("RDS_CLONE_PARAMETER_GROUP") + cfg.Clone.OptionGroupName = os.Getenv("RDS_CLONE_OPTION_GROUP") + cfg.Clone.DBClusterParameterGroupName = os.Getenv("RDS_CLONE_CLUSTER_PARAMETER_GROUP") + cfg.Clone.EnableIAMAuth = os.Getenv("RDS_CLONE_ENABLE_IAM_AUTH") == "true" + cfg.Clone.StorageType = os.Getenv("RDS_CLONE_STORAGE_TYPE") + + // Parse tags from JSON + if tagsJSON := os.Getenv("RDS_CLONE_TAGS"); tagsJSON != "" { + if err := json.Unmarshal([]byte(tagsJSON), &cfg.Clone.Tags); err != nil { + return nil, fmt.Errorf("invalid RDS_CLONE_TAGS JSON: %w", err) + } + } + + // DBLab configuration + cfg.DBLab.APIEndpoint = os.Getenv("DBLAB_API_ENDPOINT") + cfg.DBLab.Token = os.Getenv("DBLAB_TOKEN") + cfg.DBLab.Insecure = os.Getenv("DBLAB_INSECURE") == "true" + + // AWS configuration + cfg.AWS.Region = os.Getenv("AWS_REGION") + + if err := cfg.Validate(); err != nil { + return nil, err + } + + cfg.SetDefaults() + + return cfg, nil +} + +func getEnvOrDefault(key, defaultValue string) string { + if v := os.Getenv(key); v != "" { + return v + } + + return defaultValue +} diff --git a/engine/internal/rdsrefresh/rds.go b/engine/internal/rdsrefresh/rds.go new file mode 100644 index 
00000000..f39d382d --- /dev/null +++ b/engine/internal/rdsrefresh/rds.go @@ -0,0 +1,509 @@ +/* +2024 © Postgres.ai +*/ + +package rdsrefresh + +import ( + "context" + "fmt" + "sort" + "time" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/service/rds" + "github.com/aws/aws-sdk-go-v2/service/rds/types" +) + +const ( + cloneNamePrefix = "dblab-refresh-" + waitPollInterval = 30 * time.Second + maxWaitTime = 2 * time.Hour + defaultPort int32 = 5432 +) + +// RDSClient wraps the AWS RDS client with convenience methods. +type RDSClient struct { + client *rds.Client + cfg *Config +} + +// CloneInfo holds information about a created clone. +type CloneInfo struct { + Identifier string + Endpoint string + Port int32 + IsCluster bool +} + +// NewRDSClient creates a new RDS client. +func NewRDSClient(ctx context.Context, cfg *Config) (*RDSClient, error) { + awsCfg, err := config.LoadDefaultConfig(ctx, config.WithRegion(cfg.AWS.Region)) + if err != nil { + return nil, fmt.Errorf("failed to load AWS config: %w", err) + } + + var opts []func(*rds.Options) + if cfg.AWS.Endpoint != "" { + opts = append(opts, func(o *rds.Options) { + o.BaseEndpoint = aws.String(cfg.AWS.Endpoint) + }) + } + + return &RDSClient{ + client: rds.NewFromConfig(awsCfg, opts...), + cfg: cfg, + }, nil +} + +// FindLatestSnapshot finds the latest available snapshot for the source. 
+func (r *RDSClient) FindLatestSnapshot(ctx context.Context) (string, error) { + if r.cfg.Source.SnapshotIdentifier != "" { + return r.cfg.Source.SnapshotIdentifier, nil + } + + if r.cfg.Source.Type == "aurora-cluster" { + return r.findLatestClusterSnapshot(ctx) + } + + return r.findLatestDBSnapshot(ctx) +} + +func (r *RDSClient) findLatestDBSnapshot(ctx context.Context) (string, error) { + input := &rds.DescribeDBSnapshotsInput{ + DBInstanceIdentifier: aws.String(r.cfg.Source.Identifier), + SnapshotType: aws.String("automated"), + } + + result, err := r.client.DescribeDBSnapshots(ctx, input) + if err != nil { + return "", fmt.Errorf("failed to describe DB snapshots: %w", err) + } + + if len(result.DBSnapshots) == 0 { + return "", fmt.Errorf("no automated snapshots found for RDS instance %q", r.cfg.Source.Identifier) + } + + // Sort by creation time (newest first) + sort.Slice(result.DBSnapshots, func(i, j int) bool { + ti := result.DBSnapshots[i].SnapshotCreateTime + tj := result.DBSnapshots[j].SnapshotCreateTime + + if ti == nil || tj == nil { + return ti != nil + } + + return ti.After(*tj) + }) + + // Find the first available snapshot + for _, snap := range result.DBSnapshots { + if snap.Status != nil && *snap.Status == "available" { + return *snap.DBSnapshotIdentifier, nil + } + } + + return "", fmt.Errorf("no available snapshots found for RDS instance %q", r.cfg.Source.Identifier) +} + +func (r *RDSClient) findLatestClusterSnapshot(ctx context.Context) (string, error) { + input := &rds.DescribeDBClusterSnapshotsInput{ + DBClusterIdentifier: aws.String(r.cfg.Source.Identifier), + SnapshotType: aws.String("automated"), + } + + result, err := r.client.DescribeDBClusterSnapshots(ctx, input) + if err != nil { + return "", fmt.Errorf("failed to describe DB cluster snapshots: %w", err) + } + + if len(result.DBClusterSnapshots) == 0 { + return "", fmt.Errorf("no automated snapshots found for Aurora cluster %q", r.cfg.Source.Identifier) + } + + // Sort by creation time 
(newest first) + sort.Slice(result.DBClusterSnapshots, func(i, j int) bool { + ti := result.DBClusterSnapshots[i].SnapshotCreateTime + tj := result.DBClusterSnapshots[j].SnapshotCreateTime + + if ti == nil || tj == nil { + return ti != nil + } + + return ti.After(*tj) + }) + + // Find the first available snapshot + for _, snap := range result.DBClusterSnapshots { + if snap.Status != nil && *snap.Status == "available" { + return *snap.DBClusterSnapshotIdentifier, nil + } + } + + return "", fmt.Errorf("no available snapshots found for Aurora cluster %q", r.cfg.Source.Identifier) +} + +// CreateClone creates a temporary clone from a snapshot. +func (r *RDSClient) CreateClone(ctx context.Context, snapshotID string) (*CloneInfo, error) { + cloneName := fmt.Sprintf("%s%s", cloneNamePrefix, time.Now().UTC().Format("20060102-150405")) + + if r.cfg.Source.Type == "aurora-cluster" { + return r.createAuroraClone(ctx, snapshotID, cloneName) + } + + return r.createRDSClone(ctx, snapshotID, cloneName) +} + +func (r *RDSClient) createRDSClone(ctx context.Context, snapshotID, cloneName string) (*CloneInfo, error) { + tags := r.buildTags() + + input := &rds.RestoreDBInstanceFromDBSnapshotInput{ + DBInstanceIdentifier: aws.String(cloneName), + DBSnapshotIdentifier: aws.String(snapshotID), + DBInstanceClass: aws.String(r.cfg.Clone.InstanceClass), + PubliclyAccessible: aws.Bool(r.cfg.Clone.PubliclyAccessible), + Tags: tags, + DeletionProtection: aws.Bool(r.cfg.Clone.DeletionProtection), + } + + if r.cfg.Clone.DBSubnetGroupName != "" { + input.DBSubnetGroupName = aws.String(r.cfg.Clone.DBSubnetGroupName) + } + + if len(r.cfg.Clone.VPCSecurityGroupIDs) > 0 { + input.VpcSecurityGroupIds = r.cfg.Clone.VPCSecurityGroupIDs + } + + if r.cfg.Clone.ParameterGroupName != "" { + input.DBParameterGroupName = aws.String(r.cfg.Clone.ParameterGroupName) + } + + if r.cfg.Clone.OptionGroupName != "" { + input.OptionGroupName = aws.String(r.cfg.Clone.OptionGroupName) + } + + if r.cfg.Clone.Port > 0 { + 
input.Port = aws.Int32(r.cfg.Clone.Port) + } + + if r.cfg.Clone.EnableIAMAuth { + input.EnableIAMDatabaseAuthentication = aws.Bool(true) + } + + if r.cfg.Clone.StorageType != "" { + input.StorageType = aws.String(r.cfg.Clone.StorageType) + } + + _, err := r.client.RestoreDBInstanceFromDBSnapshot(ctx, input) + if err != nil { + return nil, fmt.Errorf("failed to restore DB instance from snapshot: %w", err) + } + + return &CloneInfo{ + Identifier: cloneName, + IsCluster: false, + }, nil +} + +func (r *RDSClient) createAuroraClone(ctx context.Context, snapshotID, cloneName string) (*CloneInfo, error) { + tags := r.buildTags() + + // First, restore the Aurora cluster + clusterInput := &rds.RestoreDBClusterFromSnapshotInput{ + DBClusterIdentifier: aws.String(cloneName), + SnapshotIdentifier: aws.String(snapshotID), + Tags: tags, + DeletionProtection: aws.Bool(r.cfg.Clone.DeletionProtection), + } + + if r.cfg.Clone.DBSubnetGroupName != "" { + clusterInput.DBSubnetGroupName = aws.String(r.cfg.Clone.DBSubnetGroupName) + } + + if len(r.cfg.Clone.VPCSecurityGroupIDs) > 0 { + clusterInput.VpcSecurityGroupIds = r.cfg.Clone.VPCSecurityGroupIDs + } + + if r.cfg.Clone.DBClusterParameterGroupName != "" { + clusterInput.DBClusterParameterGroupName = aws.String(r.cfg.Clone.DBClusterParameterGroupName) + } + + if r.cfg.Clone.EngineVersion != "" { + clusterInput.EngineVersion = aws.String(r.cfg.Clone.EngineVersion) + } + + if r.cfg.Clone.Port > 0 { + clusterInput.Port = aws.Int32(r.cfg.Clone.Port) + } + + if r.cfg.Clone.EnableIAMAuth { + clusterInput.EnableIAMDatabaseAuthentication = aws.Bool(true) + } + + // Get the engine from the snapshot + snapshotResp, err := r.client.DescribeDBClusterSnapshots(ctx, &rds.DescribeDBClusterSnapshotsInput{ + DBClusterSnapshotIdentifier: aws.String(snapshotID), + }) + if err != nil { + return nil, fmt.Errorf("failed to describe cluster snapshot: %w", err) + } + + if len(snapshotResp.DBClusterSnapshots) == 0 { + return nil, fmt.Errorf("snapshot %q not 
found", snapshotID) + } + + snapshot := snapshotResp.DBClusterSnapshots[0] + clusterInput.Engine = snapshot.Engine + + _, err = r.client.RestoreDBClusterFromSnapshot(ctx, clusterInput) + if err != nil { + return nil, fmt.Errorf("failed to restore DB cluster from snapshot: %w", err) + } + + // Wait for cluster to be available before creating instance + if err := r.waitForClusterAvailable(ctx, cloneName); err != nil { + // Try to clean up the cluster + _ = r.deleteAuroraCluster(ctx, cloneName) + return nil, fmt.Errorf("cluster did not become available: %w", err) + } + + // Create a DB instance in the cluster + instanceName := cloneName + "-instance" + instanceInput := &rds.CreateDBInstanceInput{ + DBInstanceIdentifier: aws.String(instanceName), + DBInstanceClass: aws.String(r.cfg.Clone.InstanceClass), + DBClusterIdentifier: aws.String(cloneName), + Engine: snapshot.Engine, + Tags: tags, + } + + if r.cfg.Clone.ParameterGroupName != "" { + instanceInput.DBParameterGroupName = aws.String(r.cfg.Clone.ParameterGroupName) + } + + _, err = r.client.CreateDBInstance(ctx, instanceInput) + if err != nil { + // Try to clean up the cluster + _ = r.deleteAuroraCluster(ctx, cloneName) + return nil, fmt.Errorf("failed to create DB instance in cluster: %w", err) + } + + return &CloneInfo{ + Identifier: cloneName, + IsCluster: true, + }, nil +} + +func (r *RDSClient) buildTags() []types.Tag { + tags := make([]types.Tag, 0, len(r.cfg.Clone.Tags)) + + for k, v := range r.cfg.Clone.Tags { + tags = append(tags, types.Tag{ + Key: aws.String(k), + Value: aws.String(v), + }) + } + + return tags +} + +// WaitForCloneAvailable waits for the clone to become available and returns connection info. 
+func (r *RDSClient) WaitForCloneAvailable(ctx context.Context, clone *CloneInfo) error { + if clone.IsCluster { + instanceName := clone.Identifier + "-instance" + + if err := r.waitForInstanceAvailable(ctx, instanceName); err != nil { + return err + } + + // Get the cluster endpoint + clusterResp, err := r.client.DescribeDBClusters(ctx, &rds.DescribeDBClustersInput{ + DBClusterIdentifier: aws.String(clone.Identifier), + }) + if err != nil { + return fmt.Errorf("failed to describe cluster: %w", err) + } + + if len(clusterResp.DBClusters) == 0 { + return fmt.Errorf("cluster %q not found", clone.Identifier) + } + + cluster := clusterResp.DBClusters[0] + clone.Endpoint = aws.ToString(cluster.Endpoint) + clone.Port = aws.ToInt32(cluster.Port) + + if clone.Port == 0 { + clone.Port = defaultPort + } + + return nil + } + + if err := r.waitForInstanceAvailable(ctx, clone.Identifier); err != nil { + return err + } + + // Get the instance endpoint + instanceResp, err := r.client.DescribeDBInstances(ctx, &rds.DescribeDBInstancesInput{ + DBInstanceIdentifier: aws.String(clone.Identifier), + }) + if err != nil { + return fmt.Errorf("failed to describe instance: %w", err) + } + + if len(instanceResp.DBInstances) == 0 { + return fmt.Errorf("instance %q not found", clone.Identifier) + } + + instance := instanceResp.DBInstances[0] + + if instance.Endpoint != nil { + clone.Endpoint = aws.ToString(instance.Endpoint.Address) + clone.Port = aws.ToInt32(instance.Endpoint.Port) + } + + if clone.Port == 0 { + clone.Port = defaultPort + } + + return nil +} + +func (r *RDSClient) waitForInstanceAvailable(ctx context.Context, identifier string) error { + waiter := rds.NewDBInstanceAvailableWaiter(r.client) + + return waiter.Wait(ctx, &rds.DescribeDBInstancesInput{ + DBInstanceIdentifier: aws.String(identifier), + }, maxWaitTime) +} + +func (r *RDSClient) waitForClusterAvailable(ctx context.Context, identifier string) error { + waiter := rds.NewDBClusterAvailableWaiter(r.client) + + return 
waiter.Wait(ctx, &rds.DescribeDBClustersInput{ + DBClusterIdentifier: aws.String(identifier), + }, maxWaitTime) +} + +// DeleteClone deletes the temporary clone. +func (r *RDSClient) DeleteClone(ctx context.Context, clone *CloneInfo) error { + if clone.IsCluster { + return r.deleteAuroraCluster(ctx, clone.Identifier) + } + + return r.deleteRDSInstance(ctx, clone.Identifier) +} + +func (r *RDSClient) deleteRDSInstance(ctx context.Context, identifier string) error { + // First, disable deletion protection if enabled + _, _ = r.client.ModifyDBInstance(ctx, &rds.ModifyDBInstanceInput{ + DBInstanceIdentifier: aws.String(identifier), + DeletionProtection: aws.Bool(false), + ApplyImmediately: aws.Bool(true), + }) + + _, err := r.client.DeleteDBInstance(ctx, &rds.DeleteDBInstanceInput{ + DBInstanceIdentifier: aws.String(identifier), + SkipFinalSnapshot: aws.Bool(true), + DeleteAutomatedBackups: aws.Bool(true), + }) + + if err != nil { + return fmt.Errorf("failed to delete DB instance: %w", err) + } + + return nil +} + +func (r *RDSClient) deleteAuroraCluster(ctx context.Context, clusterIdentifier string) error { + // First, delete all instances in the cluster + instancesResp, err := r.client.DescribeDBInstances(ctx, &rds.DescribeDBInstancesInput{ + Filters: []types.Filter{ + { + Name: aws.String("db-cluster-id"), + Values: []string{clusterIdentifier}, + }, + }, + }) + if err != nil { + return fmt.Errorf("failed to list cluster instances: %w", err) + } + + for _, instance := range instancesResp.DBInstances { + if err := r.deleteRDSInstance(ctx, aws.ToString(instance.DBInstanceIdentifier)); err != nil { + return fmt.Errorf("failed to delete cluster instance: %w", err) + } + } + + // Wait for all instances to be deleted + for _, instance := range instancesResp.DBInstances { + waiter := rds.NewDBInstanceDeletedWaiter(r.client) + + if err := waiter.Wait(ctx, &rds.DescribeDBInstancesInput{ + DBInstanceIdentifier: instance.DBInstanceIdentifier, + }, maxWaitTime); err != nil { + 
return fmt.Errorf("failed waiting for instance deletion: %w", err) + } + } + + // Disable deletion protection on cluster + _, _ = r.client.ModifyDBCluster(ctx, &rds.ModifyDBClusterInput{ + DBClusterIdentifier: aws.String(clusterIdentifier), + DeletionProtection: aws.Bool(false), + ApplyImmediately: aws.Bool(true), + }) + + // Delete the cluster + _, err = r.client.DeleteDBCluster(ctx, &rds.DeleteDBClusterInput{ + DBClusterIdentifier: aws.String(clusterIdentifier), + SkipFinalSnapshot: aws.Bool(true), + }) + + if err != nil { + return fmt.Errorf("failed to delete DB cluster: %w", err) + } + + return nil +} + +// GetSourceInfo returns information about the source database. +func (r *RDSClient) GetSourceInfo(ctx context.Context) (string, error) { + if r.cfg.Source.Type == "aurora-cluster" { + resp, err := r.client.DescribeDBClusters(ctx, &rds.DescribeDBClustersInput{ + DBClusterIdentifier: aws.String(r.cfg.Source.Identifier), + }) + if err != nil { + return "", fmt.Errorf("failed to describe source cluster: %w", err) + } + + if len(resp.DBClusters) == 0 { + return "", fmt.Errorf("source cluster %q not found", r.cfg.Source.Identifier) + } + + cluster := resp.DBClusters[0] + + return fmt.Sprintf("Aurora cluster %s (engine: %s, version: %s)", + r.cfg.Source.Identifier, + aws.ToString(cluster.Engine), + aws.ToString(cluster.EngineVersion)), nil + } + + resp, err := r.client.DescribeDBInstances(ctx, &rds.DescribeDBInstancesInput{ + DBInstanceIdentifier: aws.String(r.cfg.Source.Identifier), + }) + if err != nil { + return "", fmt.Errorf("failed to describe source instance: %w", err) + } + + if len(resp.DBInstances) == 0 { + return "", fmt.Errorf("source instance %q not found", r.cfg.Source.Identifier) + } + + instance := resp.DBInstances[0] + + return fmt.Sprintf("RDS instance %s (engine: %s, version: %s)", + r.cfg.Source.Identifier, + aws.ToString(instance.Engine), + aws.ToString(instance.EngineVersion)), nil +} diff --git a/engine/internal/rdsrefresh/refresher.go 
b/engine/internal/rdsrefresh/refresher.go new file mode 100644 index 00000000..e0349383 --- /dev/null +++ b/engine/internal/rdsrefresh/refresher.go @@ -0,0 +1,243 @@ +/* +2024 © Postgres.ai +*/ + +package rdsrefresh + +import ( + "context" + "fmt" + "time" +) + +// Logger defines the logging interface. +type Logger interface { + Info(msg string, args ...interface{}) + Error(msg string, args ...interface{}) + Debug(msg string, args ...interface{}) +} + +// DefaultLogger is a simple stdout logger. +type DefaultLogger struct{} + +// Info logs an info message. +func (l *DefaultLogger) Info(msg string, args ...interface{}) { + fmt.Printf("[INFO] "+msg+"\n", args...) +} + +// Error logs an error message. +func (l *DefaultLogger) Error(msg string, args ...interface{}) { + fmt.Printf("[ERROR] "+msg+"\n", args...) +} + +// Debug logs a debug message. +func (l *DefaultLogger) Debug(msg string, args ...interface{}) { + fmt.Printf("[DEBUG] "+msg+"\n", args...) +} + +// Refresher orchestrates the RDS/Aurora clone and DBLab refresh workflow. +type Refresher struct { + cfg *Config + rds *RDSClient + dblab *DBLabClient + logger Logger +} + +// RefreshResult contains the result of a refresh operation. +type RefreshResult struct { + Success bool + SnapshotID string + CloneID string + StartTime time.Time + EndTime time.Time + Duration time.Duration + Error error + CloneEndpoint string +} + +// NewRefresher creates a new Refresher instance. +func NewRefresher(ctx context.Context, cfg *Config, logger Logger) (*Refresher, error) { + if logger == nil { + logger = &DefaultLogger{} + } + + rdsClient, err := NewRDSClient(ctx, cfg) + if err != nil { + return nil, fmt.Errorf("failed to create RDS client: %w", err) + } + + dblabClient := NewDBLabClient(&cfg.DBLab) + + return &Refresher{ + cfg: cfg, + rds: rdsClient, + dblab: dblabClient, + logger: logger, + }, nil +} + +// Run executes the full refresh workflow: +// 1. Verifies DBLab is healthy and not already refreshing +// 2. 
Finds the latest snapshot +// 3. Creates a temporary clone from the snapshot +// 4. Waits for the clone to be available +// 5. Triggers DBLab full refresh +// 6. Waits for refresh to complete +// 7. Deletes the temporary clone +func (r *Refresher) Run(ctx context.Context) *RefreshResult { + result := &RefreshResult{ + StartTime: time.Now(), + } + + defer func() { + result.EndTime = time.Now() + result.Duration = result.EndTime.Sub(result.StartTime) + }() + + // Step 1: Check DBLab health and status + r.logger.Info("Checking DBLab Engine health...") + + if err := r.dblab.Health(ctx); err != nil { + result.Error = fmt.Errorf("DBLab health check failed: %w", err) + return result + } + + inProgress, err := r.dblab.IsRefreshInProgress(ctx) + if err != nil { + result.Error = fmt.Errorf("failed to check DBLab status: %w", err) + return result + } + + if inProgress { + result.Error = fmt.Errorf("refresh already in progress, skipping") + return result + } + + // Step 2: Get source info + r.logger.Info("Checking source database...") + + sourceInfo, err := r.rds.GetSourceInfo(ctx) + if err != nil { + result.Error = fmt.Errorf("failed to get source info: %w", err) + return result + } + + r.logger.Info("Source: %s", sourceInfo) + + // Step 3: Find latest snapshot + r.logger.Info("Finding latest snapshot...") + + snapshotID, err := r.rds.FindLatestSnapshot(ctx) + if err != nil { + result.Error = fmt.Errorf("failed to find snapshot: %w", err) + return result + } + + result.SnapshotID = snapshotID + r.logger.Info("Using snapshot: %s", snapshotID) + + // Step 4: Create temporary clone + r.logger.Info("Creating temporary RDS clone from snapshot...") + + clone, err := r.rds.CreateClone(ctx, snapshotID) + if err != nil { + result.Error = fmt.Errorf("failed to create clone: %w", err) + return result + } + + result.CloneID = clone.Identifier + r.logger.Info("Created clone: %s", clone.Identifier) + + // Ensure cleanup on any exit + defer func() { + r.logger.Info("Cleaning up temporary 
clone %s...", clone.Identifier) + + if deleteErr := r.rds.DeleteClone(context.Background(), clone); deleteErr != nil { + r.logger.Error("Failed to delete clone %s: %v (manual cleanup may be required)", clone.Identifier, deleteErr) + } else { + r.logger.Info("Successfully deleted temporary clone %s", clone.Identifier) + } + }() + + // Step 5: Wait for clone to be available + r.logger.Info("Waiting for clone to become available (this may take 10-30 minutes)...") + + if err := r.rds.WaitForCloneAvailable(ctx, clone); err != nil { + result.Error = fmt.Errorf("clone did not become available: %w", err) + return result + } + + result.CloneEndpoint = clone.Endpoint + r.logger.Info("Clone available at: %s:%d", clone.Endpoint, clone.Port) + + // Step 6: Trigger DBLab full refresh + r.logger.Info("Triggering DBLab full refresh...") + + if err := r.dblab.TriggerFullRefresh(ctx); err != nil { + result.Error = fmt.Errorf("failed to trigger refresh: %w", err) + return result + } + + r.logger.Info("Full refresh triggered, waiting for completion...") + + // Step 7: Wait for refresh to complete + pollInterval := r.cfg.DBLab.PollInterval.Duration() + timeout := r.cfg.DBLab.Timeout.Duration() + + if err := r.dblab.WaitForRefreshComplete(ctx, pollInterval, timeout); err != nil { + result.Error = fmt.Errorf("refresh did not complete: %w", err) + return result + } + + r.logger.Info("DBLab refresh completed successfully!") + result.Success = true + + return result +} + +// DryRun performs all validation steps without actually creating resources. 
+func (r *Refresher) DryRun(ctx context.Context) error { + r.logger.Info("=== DRY RUN MODE ===") + + // Check DBLab + r.logger.Info("Checking DBLab Engine health...") + + if err := r.dblab.Health(ctx); err != nil { + return fmt.Errorf("DBLab health check failed: %w", err) + } + + r.logger.Info("DBLab Engine is healthy") + + // Check current status + status, err := r.dblab.GetStatus(ctx) + if err != nil { + return fmt.Errorf("failed to get DBLab status: %w", err) + } + + r.logger.Info("DBLab retrieval status: %s", status.Retrieving.Status) + + // Check source + r.logger.Info("Checking source database...") + + sourceInfo, err := r.rds.GetSourceInfo(ctx) + if err != nil { + return fmt.Errorf("failed to get source info: %w", err) + } + + r.logger.Info("Source: %s", sourceInfo) + + // Check snapshot + r.logger.Info("Finding latest snapshot...") + + snapshotID, err := r.rds.FindLatestSnapshot(ctx) + if err != nil { + return fmt.Errorf("failed to find snapshot: %w", err) + } + + r.logger.Info("Would use snapshot: %s", snapshotID) + r.logger.Info("Would create clone with instance class: %s", r.cfg.Clone.InstanceClass) + + r.logger.Info("=== DRY RUN COMPLETE - All checks passed ===") + + return nil +} From 28291d6036e8904b754d41078c8f4d08ab15bc31 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 10 Dec 2025 02:12:57 +0000 Subject: [PATCH 2/6] feat: add standalone rds-refresh component Add a fully standalone version of the RDS/Aurora refresh tool that can be built and deployed immediately without requiring a DBLab Engine release. This version has no dependencies on DBLab internal packages. The standalone component includes: - Self-contained Go module with minimal dependencies - Makefile for easy building (CLI, Lambda, Docker) - SAM template for AWS Lambda deployment - Dockerfile for container builds - Comprehensive documentation Can be used immediately by: 1. go build -o rds-refresh . 2. 
./rds-refresh -config config.yaml --- rds-refresh/Dockerfile | 39 +++ rds-refresh/Makefile | 49 ++++ rds-refresh/README.md | 321 ++++++++++++++++++++ rds-refresh/config.example.yaml | 86 ++++++ rds-refresh/config.go | 186 ++++++++++++ rds-refresh/dblab.go | 238 +++++++++++++++ rds-refresh/go.mod | 28 ++ rds-refresh/go.sum | 42 +++ rds-refresh/main.go | 321 ++++++++++++++++++++ rds-refresh/rds.go | 505 ++++++++++++++++++++++++++++++++ rds-refresh/refresher.go | 243 +++++++++++++++ rds-refresh/template.yaml | 241 +++++++++++++++ 12 files changed, 2299 insertions(+) create mode 100644 rds-refresh/Dockerfile create mode 100644 rds-refresh/Makefile create mode 100644 rds-refresh/README.md create mode 100644 rds-refresh/config.example.yaml create mode 100644 rds-refresh/config.go create mode 100644 rds-refresh/dblab.go create mode 100644 rds-refresh/go.mod create mode 100644 rds-refresh/go.sum create mode 100644 rds-refresh/main.go create mode 100644 rds-refresh/rds.go create mode 100644 rds-refresh/refresher.go create mode 100644 rds-refresh/template.yaml diff --git a/rds-refresh/Dockerfile b/rds-refresh/Dockerfile new file mode 100644 index 00000000..8863d1b6 --- /dev/null +++ b/rds-refresh/Dockerfile @@ -0,0 +1,39 @@ +# Build stage +FROM golang:1.24-alpine AS builder + +RUN apk add --no-cache git ca-certificates + +WORKDIR /build + +# Copy go mod files first for better caching +COPY go.mod go.sum ./ +RUN go mod download + +# Copy source code +COPY *.go ./ + +# Build the binary +ARG VERSION=dev +ARG BUILD_TIME=unknown + +RUN CGO_ENABLED=0 GOOS=linux go build \ + -ldflags="-s -w -X main.version=${VERSION} -X main.buildTime=${BUILD_TIME}" \ + -o /rds-refresh \ + . 
+ +# Runtime stage +FROM alpine:3.19 + +RUN apk add --no-cache ca-certificates tzdata + +# Create non-root user +RUN adduser -D -u 1000 appuser + +WORKDIR /app + +COPY --from=builder /rds-refresh /usr/local/bin/rds-refresh + +USER appuser + +ENTRYPOINT ["/usr/local/bin/rds-refresh"] +CMD ["--help"] diff --git a/rds-refresh/Makefile b/rds-refresh/Makefile new file mode 100644 index 00000000..158083f2 --- /dev/null +++ b/rds-refresh/Makefile @@ -0,0 +1,49 @@ +.PHONY: build build-linux build-lambda clean test fmt vet + +VERSION ?= $(shell git describe --tags --always --dirty 2>/dev/null || echo "dev") +BUILD_TIME ?= $(shell date -u +"%Y-%m-%dT%H:%M:%SZ") +LDFLAGS = -ldflags "-s -w -X main.version=$(VERSION) -X main.buildTime=$(BUILD_TIME)" + +# Build for current platform +build: + go build $(LDFLAGS) -o rds-refresh . + +# Build for Linux (for Docker/Lambda) +build-linux: + GOOS=linux GOARCH=amd64 go build $(LDFLAGS) -o rds-refresh-linux-amd64 . + GOOS=linux GOARCH=arm64 go build $(LDFLAGS) -o rds-refresh-linux-arm64 . + +# Build Lambda bootstrap binary +build-lambda: + GOOS=linux GOARCH=arm64 CGO_ENABLED=0 go build $(LDFLAGS) -o bootstrap . + zip rds-refresh-lambda.zip bootstrap + rm bootstrap + +# Clean build artifacts +clean: + rm -f rds-refresh rds-refresh-linux-* bootstrap rds-refresh-lambda.zip + +# Run tests +test: + go test -v ./... + +# Format code +fmt: + go fmt ./... + +# Run go vet +vet: + go vet ./... + +# Download dependencies +deps: + go mod download + go mod tidy + +# Run locally (requires config.yaml) +run: + go run . -config config.yaml + +# Run dry-run locally +dry-run: + go run . -config config.yaml -dry-run diff --git a/rds-refresh/README.md b/rds-refresh/README.md new file mode 100644 index 00000000..eecd1b68 --- /dev/null +++ b/rds-refresh/README.md @@ -0,0 +1,321 @@ +# DBLab RDS/Aurora Refresh + +A standalone tool that automates DBLab Engine full refresh using temporary RDS or Aurora clones created from snapshots. 
+ +## Overview + +This tool provides a hassle-free way to keep your DBLab Engine data synchronized with your production RDS/Aurora database: + +1. **Creates a temporary clone** from the latest RDS/Aurora snapshot +2. **Triggers DBLab full refresh** to sync data from the clone +3. **Deletes the temporary clone** after refresh completes + +This approach avoids impacting your production database during the data sync process. + +## Quick Start + +### Build + +```bash +# Clone this repository +git clone https://github.com/postgres-ai/rds-refresh.git +cd rds-refresh + +# Build +make build + +# Or build directly +go build -o rds-refresh . +``` + +### Configure + +```bash +# Copy example config +cp config.example.yaml config.yaml + +# Edit with your settings +vim config.yaml +``` + +### Run + +```bash +# Dry run (validates configuration) +./rds-refresh -config config.yaml -dry-run + +# Full refresh +./rds-refresh -config config.yaml +``` + +## Deployment Options + +### Option 1: AWS Lambda (Recommended) + +Deploy as a serverless function with automatic scheduling via EventBridge. 
+ +#### Prerequisites + +- [AWS SAM CLI](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/install-sam-cli.html) +- AWS credentials configured +- Go 1.23+ + +#### Deploy + +```bash +# Build and deploy +sam build +sam deploy --guided +``` + +During guided deployment, you'll be prompted for: + +| Parameter | Description | Example | +|-----------|-------------|---------| +| `RDSSourceType` | `rds` or `aurora-cluster` | `rds` | +| `RDSSourceIdentifier` | Source DB identifier | `production-db` | +| `RDSCloneInstanceClass` | Clone instance size | `db.t3.medium` | +| `DBLabAPIEndpoint` | DBLab API URL | `https://dblab.example.com:2345` | +| `DBLabToken` | DBLab verification token | `your-secret-token` | +| `ScheduleExpression` | Refresh schedule | `rate(7 days)` | + +#### Manual Invocation + +```bash +# Dry run +aws lambda invoke --function-name dblab-rds-refresh \ + --cli-binary-format raw-in-base64-out \ + --payload '{"dryRun": true}' \ + response.json && cat response.json + +# Full refresh +aws lambda invoke --function-name dblab-rds-refresh \ + --cli-binary-format raw-in-base64-out \ + --payload '{"dryRun": false}' \ + response.json && cat response.json +``` + +### Option 2: CLI with Cron + +```bash +# Build +make build + +# Install +sudo mv rds-refresh /usr/local/bin/ + +# Create config +sudo mkdir -p /etc/dblab +sudo cp config.example.yaml /etc/dblab/rds-refresh.yaml +sudo vim /etc/dblab/rds-refresh.yaml + +# Add to crontab (every Sunday at 2 AM) +echo "0 2 * * 0 /usr/local/bin/rds-refresh -config /etc/dblab/rds-refresh.yaml >> /var/log/rds-refresh.log 2>&1" | crontab - +``` + +### Option 3: Docker + +```bash +# Build +docker build -t rds-refresh . 
+ +# Run +docker run \ + -v /path/to/config.yaml:/config.yaml \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DBLAB_TOKEN \ + rds-refresh -config /config.yaml +``` + +### Option 4: Kubernetes CronJob + +```yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: dblab-rds-refresh +spec: + schedule: "0 2 * * 0" # Every Sunday at 2 AM + jobTemplate: + spec: + template: + spec: + serviceAccountName: dblab-rds-refresh # with IRSA + containers: + - name: rds-refresh + image: your-registry/rds-refresh:latest + args: ["-config", "/config/config.yaml"] + volumeMounts: + - name: config + mountPath: /config + env: + - name: DBLAB_TOKEN + valueFrom: + secretKeyRef: + name: dblab-secrets + key: token + volumes: + - name: config + configMap: + name: rds-refresh-config + restartPolicy: OnFailure +``` + +## Configuration + +See [config.example.yaml](config.example.yaml) for a fully documented example. + +### Environment Variables + +When running as Lambda, configuration is loaded from environment variables: + +| Variable | Required | Description | +|----------|----------|-------------| +| `RDS_SOURCE_IDENTIFIER` | Yes | Source RDS instance or Aurora cluster ID | +| `RDS_CLONE_INSTANCE_CLASS` | Yes | Instance class for clone (e.g., `db.t3.medium`) | +| `DBLAB_API_ENDPOINT` | Yes | DBLab Engine API endpoint | +| `DBLAB_TOKEN` | Yes | DBLab verification token | +| `AWS_REGION` | Yes | AWS region | +| `RDS_SOURCE_TYPE` | No | `rds` or `aurora-cluster` (default: `rds`) | +| `RDS_SNAPSHOT_IDENTIFIER` | No | Specific snapshot ID (default: latest) | +| `RDS_CLONE_SUBNET_GROUP` | No | DB subnet group name | +| `RDS_CLONE_SECURITY_GROUPS` | No | JSON array of security group IDs | +| `RDS_CLONE_PUBLIC` | No | `true` to make clone publicly accessible | +| `RDS_CLONE_ENABLE_IAM_AUTH` | No | `true` to enable IAM authentication | +| `RDS_CLONE_STORAGE_TYPE` | No | Storage type (gp2, gp3, io1, etc.) 
| +| `DBLAB_INSECURE` | No | `true` to skip TLS verification | + +## AWS IAM Permissions + +The tool requires the following IAM permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "RDSReadSnapshots", + "Effect": "Allow", + "Action": [ + "rds:DescribeDBSnapshots", + "rds:DescribeDBClusterSnapshots", + "rds:DescribeDBInstances", + "rds:DescribeDBClusters" + ], + "Resource": "*" + }, + { + "Sid": "RDSCreateClone", + "Effect": "Allow", + "Action": [ + "rds:RestoreDBInstanceFromDBSnapshot", + "rds:RestoreDBClusterFromSnapshot", + "rds:CreateDBInstance", + "rds:AddTagsToResource", + "rds:ModifyDBInstance", + "rds:ModifyDBCluster" + ], + "Resource": [ + "arn:aws:rds:*:ACCOUNT_ID:db:dblab-refresh-*", + "arn:aws:rds:*:ACCOUNT_ID:cluster:dblab-refresh-*", + "arn:aws:rds:*:ACCOUNT_ID:snapshot:*", + "arn:aws:rds:*:ACCOUNT_ID:cluster-snapshot:*", + "arn:aws:rds:*:ACCOUNT_ID:subgrp:*", + "arn:aws:rds:*:ACCOUNT_ID:pg:*", + "arn:aws:rds:*:ACCOUNT_ID:og:*" + ] + }, + { + "Sid": "RDSDeleteClone", + "Effect": "Allow", + "Action": [ + "rds:DeleteDBInstance", + "rds:DeleteDBCluster" + ], + "Resource": [ + "arn:aws:rds:*:ACCOUNT_ID:db:dblab-refresh-*", + "arn:aws:rds:*:ACCOUNT_ID:cluster:dblab-refresh-*" + ] + } + ] +} +``` + +Replace `ACCOUNT_ID` with your AWS account ID. + +## DBLab Engine Configuration + +Configure DBLab Engine to connect to the temporary clone. The clone will be named `dblab-refresh-YYYYMMDD-HHMMSS`. 
+ +Example DBLab retrieval configuration: + +```yaml +retrieval: + refresh: + timetable: "" # Disable built-in scheduler (managed externally) + skipStartRefresh: true + + jobs: + - logicalDump + - logicalRestore + - logicalSnapshot + + spec: + logicalDump: + options: + source: + type: rdsIam + connection: + dbname: mydb + username: dblab_user + rdsIam: + awsRegion: us-east-1 + dbInstanceIdentifier: dblab-refresh-current # Will be the temp clone +``` + +## Troubleshooting + +### Common Issues + +**Clone creation fails with "DBSubnetGroup not found"** +- Ensure the subnet group exists and is in the correct VPC + +**Clone not accessible from DBLab** +- Verify security groups allow inbound connections from DBLab +- Check if `publiclyAccessible` setting matches your network topology + +**DBLab refresh timeout** +- Increase `dblab.timeout` in configuration +- Check DBLab Engine logs for issues + +**AWS credentials not found** +- Ensure AWS credentials are configured (env vars, IAM role, or credentials file) + +### Debug Mode + +```bash +# Enable verbose AWS SDK logging +export AWS_SDK_LOAD_CONFIG=1 +./rds-refresh -config config.yaml 2>&1 | tee refresh.log +``` + +## Cost Considerations + +- **Clone runtime**: You pay for the clone instance while it exists +- **Storage**: Clones don't duplicate storage (snapshot-based) +- **Lambda**: Minimal cost (typically < $0.10/month for weekly refreshes) + +**Cost optimization tips**: +- Use a smaller instance class than production +- Use `gp3` storage type for better price/performance +- Schedule refreshes during off-peak hours + +## License + +Apache 2.0 + +## Links + +- [DBLab Engine Documentation](https://postgres.ai/docs/database-lab-engine) +- [Postgres.ai](https://postgres.ai) diff --git a/rds-refresh/config.example.yaml b/rds-refresh/config.example.yaml new file mode 100644 index 00000000..cec82385 --- /dev/null +++ b/rds-refresh/config.example.yaml @@ -0,0 +1,86 @@ +# Example configuration for rds-refresh +# +# Copy this 
file to config.yaml and customize for your environment. + +# Source database configuration +source: + # Type of source database: + # - "rds" for RDS DB instance + # - "aurora-cluster" for Aurora cluster + type: rds + + # RDS DB instance identifier or Aurora cluster identifier + identifier: production-db + + # Optional: Specific snapshot identifier to use + # If empty, the latest automated snapshot will be used + # snapshotIdentifier: rds:production-db-2024-01-15-02-00 + +# Temporary clone configuration +clone: + # Instance class for the clone (can be smaller than production) + instanceClass: db.t3.medium + + # DB subnet group (must be in a VPC accessible from DBLab Engine) + subnetGroup: default-vpc-subnet + + # VPC security groups for the clone + # Must allow inbound connections from DBLab Engine on PostgreSQL port + securityGroups: + - sg-12345678 + + # Whether the clone should be publicly accessible + # Set to false if DBLab is in the same VPC + publiclyAccessible: false + + # Enable IAM database authentication (recommended) + enableIAMAuth: true + + # Optional: DB parameter group name + # parameterGroup: custom-postgres-params + + # Optional: DB option group name (RDS only) + # optionGroup: custom-options + + # Optional: Cluster parameter group (Aurora only) + # clusterParameterGroup: aurora-postgres-params + + # Optional: Custom port (default: 5432) + # port: 5432 + + # Optional: Storage type (gp2, gp3, io1, io2) + # storageType: gp3 + + # Deletion protection (should be false for temporary clones) + deletionProtection: false + + # Additional tags for the clone + tags: + Environment: dblab-refresh + Team: platform + +# DBLab Engine configuration +dblab: + # DBLab Engine API endpoint + apiEndpoint: https://dblab.example.com:2345 + + # Verification token for DBLab API + # Use environment variable expansion for security + token: ${DBLAB_TOKEN} + + # Skip TLS certificate verification (not recommended for production) + insecure: false + + # How often to poll DBLab 
status during refresh + pollInterval: 30s + + # Maximum time to wait for refresh to complete + timeout: 4h + +# AWS configuration +aws: + # AWS region where RDS/Aurora resources are located + region: us-east-1 + + # Optional: Custom AWS endpoint (for testing with LocalStack) + # endpoint: http://localhost:4566 diff --git a/rds-refresh/config.go b/rds-refresh/config.go new file mode 100644 index 00000000..588a9aa4 --- /dev/null +++ b/rds-refresh/config.go @@ -0,0 +1,186 @@ +/* +2024 © Postgres.ai +*/ + +package main + +import ( + "fmt" + "os" + "time" + + "gopkg.in/yaml.v3" +) + +// Config holds the configuration for the RDS refresh component. +type Config struct { + Source SourceConfig `yaml:"source"` + Clone CloneConfig `yaml:"clone"` + DBLab DBLabConfig `yaml:"dblab"` + AWS AWSConfig `yaml:"aws"` +} + +// SourceConfig defines the source RDS/Aurora database to clone from. +type SourceConfig struct { + // Type specifies the source type: "rds" for RDS instance, "aurora-cluster" for Aurora cluster. + Type string `yaml:"type"` + // Identifier is the RDS DB instance identifier or Aurora cluster identifier. + Identifier string `yaml:"identifier"` + // SnapshotIdentifier is the specific snapshot to use. If empty, the latest automated snapshot is used. + SnapshotIdentifier string `yaml:"snapshotIdentifier"` +} + +// CloneConfig defines settings for the temporary clone. +type CloneConfig struct { + // InstanceClass is the DB instance class for the clone (e.g., "db.t3.medium"). + InstanceClass string `yaml:"instanceClass"` + // DBSubnetGroupName is the DB subnet group for the clone. + DBSubnetGroupName string `yaml:"subnetGroup"` + // VPCSecurityGroupIDs are the security group IDs to assign to the clone. + VPCSecurityGroupIDs []string `yaml:"securityGroups"` + // PubliclyAccessible determines if the clone should be publicly accessible. + PubliclyAccessible bool `yaml:"publiclyAccessible"` + // Tags are additional tags to add to the clone. 
+ Tags map[string]string `yaml:"tags"` + // ParameterGroupName is the parameter group to use for the clone. + ParameterGroupName string `yaml:"parameterGroup"` + // OptionGroupName is the option group to use for the clone (RDS only). + OptionGroupName string `yaml:"optionGroup"` + // DBClusterParameterGroupName is the cluster parameter group for Aurora clones. + DBClusterParameterGroupName string `yaml:"clusterParameterGroup"` + // Port is the port for the clone. If 0, uses default port. + Port int32 `yaml:"port"` + // EnableIAMAuth enables IAM database authentication. + EnableIAMAuth bool `yaml:"enableIAMAuth"` + // StorageType specifies storage type (gp2, gp3, io1, etc.) for RDS clones. + StorageType string `yaml:"storageType"` + // DeletionProtection enables deletion protection on the clone. + DeletionProtection bool `yaml:"deletionProtection"` +} + +// DBLabConfig defines the DBLab Engine connection settings. +type DBLabConfig struct { + // APIEndpoint is the DBLab Engine API endpoint (e.g., "https://dblab.example.com:2345"). + APIEndpoint string `yaml:"apiEndpoint"` + // Token is the verification token for the DBLab API. + Token string `yaml:"token"` + // Insecure allows connections to DBLab with invalid TLS certificates. + Insecure bool `yaml:"insecure"` + // PollInterval is how often to poll the DBLab status during refresh. + PollInterval Duration `yaml:"pollInterval"` + // Timeout is the maximum time to wait for the refresh to complete. + Timeout Duration `yaml:"timeout"` +} + +// AWSConfig holds AWS-specific settings. +type AWSConfig struct { + // Region is the AWS region where the RDS/Aurora resources are located. + Region string `yaml:"region"` + // Endpoint is a custom AWS endpoint (useful for testing with LocalStack). + Endpoint string `yaml:"endpoint"` +} + +// Duration is a wrapper around time.Duration for YAML parsing. +type Duration time.Duration + +// UnmarshalYAML implements yaml.Unmarshaler for Duration. 
+func (d *Duration) UnmarshalYAML(value *yaml.Node) error { + var s string + if err := value.Decode(&s); err != nil { + return err + } + + dur, err := time.ParseDuration(s) + if err != nil { + return fmt.Errorf("invalid duration %q: %w", s, err) + } + + *d = Duration(dur) + + return nil +} + +// MarshalYAML implements yaml.Marshaler for Duration. +func (d Duration) MarshalYAML() (interface{}, error) { + return time.Duration(d).String(), nil +} + +// Duration returns the time.Duration value. +func (d Duration) Duration() time.Duration { + return time.Duration(d) +} + +// LoadConfig loads configuration from a YAML file. +func LoadConfig(path string) (*Config, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("failed to read config file: %w", err) + } + + // Expand environment variables in the config + data = []byte(os.ExpandEnv(string(data))) + + var cfg Config + if err := yaml.Unmarshal(data, &cfg); err != nil { + return nil, fmt.Errorf("failed to parse config file: %w", err) + } + + if err := cfg.Validate(); err != nil { + return nil, fmt.Errorf("invalid configuration: %w", err) + } + + cfg.SetDefaults() + + return &cfg, nil +} + +// Validate checks that the configuration is valid. 
+func (c *Config) Validate() error { + if c.Source.Type == "" { + return fmt.Errorf("source.type is required (rds or aurora-cluster)") + } + + if c.Source.Type != "rds" && c.Source.Type != "aurora-cluster" { + return fmt.Errorf("source.type must be 'rds' or 'aurora-cluster', got %q", c.Source.Type) + } + + if c.Source.Identifier == "" { + return fmt.Errorf("source.identifier is required") + } + + if c.Clone.InstanceClass == "" { + return fmt.Errorf("clone.instanceClass is required") + } + + if c.DBLab.APIEndpoint == "" { + return fmt.Errorf("dblab.apiEndpoint is required") + } + + if c.DBLab.Token == "" { + return fmt.Errorf("dblab.token is required") + } + + if c.AWS.Region == "" { + return fmt.Errorf("aws.region is required") + } + + return nil +} + +// SetDefaults sets default values for optional configuration fields. +func (c *Config) SetDefaults() { + if c.DBLab.PollInterval == 0 { + c.DBLab.PollInterval = Duration(30 * time.Second) + } + + if c.DBLab.Timeout == 0 { + c.DBLab.Timeout = Duration(4 * time.Hour) + } + + if c.Clone.Tags == nil { + c.Clone.Tags = make(map[string]string) + } + + c.Clone.Tags["ManagedBy"] = "dblab-rds-refresh" + c.Clone.Tags["AutoDelete"] = "true" +} diff --git a/rds-refresh/dblab.go b/rds-refresh/dblab.go new file mode 100644 index 00000000..57c590f4 --- /dev/null +++ b/rds-refresh/dblab.go @@ -0,0 +1,238 @@ +/* +2024 © Postgres.ai +*/ + +package main + +import ( + "context" + "crypto/tls" + "encoding/json" + "fmt" + "io" + "net/http" + "time" +) + +const ( + verificationHeader = "Verification-Token" + contentTypeJSON = "application/json" +) + +// RetrievalStatus defines status of refreshing data. 
+type RetrievalStatus string + +const ( + StatusInactive RetrievalStatus = "inactive" + StatusPending RetrievalStatus = "pending" + StatusFailed RetrievalStatus = "failed" + StatusRefreshing RetrievalStatus = "refreshing" + StatusRenewed RetrievalStatus = "renewed" + StatusSnapshotting RetrievalStatus = "snapshotting" + StatusFinished RetrievalStatus = "finished" +) + +// InstanceStatus represents the DBLab Engine status response. +type InstanceStatus struct { + Status *Status `json:"status"` + Retrieving Retrieving `json:"retrieving"` +} + +// Status represents a generic status. +type Status struct { + Code string `json:"code"` + Message string `json:"message"` +} + +// Retrieving represents state of retrieval subsystem. +type Retrieving struct { + Mode string `json:"mode"` + Status RetrievalStatus `json:"status"` + LastRefresh string `json:"lastRefresh"` + NextRefresh string `json:"nextRefresh"` + Alerts map[string]Alert `json:"alerts"` +} + +// Alert describes an alert. +type Alert struct { + Level string `json:"level"` + Message string `json:"message"` +} + +// APIResponse represents a generic API response. +type APIResponse struct { + Status string `json:"status"` + Message string `json:"message"` +} + +// APIError represents an API error response. +type APIError struct { + Code string `json:"code"` + Message string `json:"message"` +} + +// DBLabClient provides methods to interact with the DBLab Engine API. +type DBLabClient struct { + baseURL string + token string + httpClient *http.Client +} + +// NewDBLabClient creates a new DBLab API client. +func NewDBLabClient(cfg *DBLabConfig) *DBLabClient { + transport := &http.Transport{ + TLSClientConfig: &tls.Config{InsecureSkipVerify: cfg.Insecure}, + } + + return &DBLabClient{ + baseURL: cfg.APIEndpoint, + token: cfg.Token, + httpClient: &http.Client{ + Transport: transport, + Timeout: 60 * time.Second, + }, + } +} + +// GetStatus returns the current DBLab Engine instance status. 
+func (c *DBLabClient) GetStatus(ctx context.Context) (*InstanceStatus, error) { + resp, err := c.doRequest(ctx, http.MethodGet, "/status", nil) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + var status InstanceStatus + if err := json.NewDecoder(resp.Body).Decode(&status); err != nil { + return nil, fmt.Errorf("failed to decode status response: %w", err) + } + + return &status, nil +} + +// TriggerFullRefresh triggers a full data refresh on the DBLab Engine. +func (c *DBLabClient) TriggerFullRefresh(ctx context.Context) error { + resp, err := c.doRequest(ctx, http.MethodPost, "/full-refresh", nil) + if err != nil { + return err + } + defer resp.Body.Close() + + var result APIResponse + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return fmt.Errorf("failed to decode response: %w", err) + } + + if result.Status != "OK" { + return fmt.Errorf("full refresh failed: %s", result.Message) + } + + return nil +} + +// WaitForRefreshComplete polls the DBLab status until refresh is complete or timeout. 
+func (c *DBLabClient) WaitForRefreshComplete(ctx context.Context, pollInterval, timeout time.Duration) error { + ticker := time.NewTicker(pollInterval) + defer ticker.Stop() + + timeoutTimer := time.NewTimer(timeout) + defer timeoutTimer.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-timeoutTimer.C: + return fmt.Errorf("timeout waiting for refresh to complete after %v", timeout) + case <-ticker.C: + status, err := c.GetStatus(ctx) + if err != nil { + return fmt.Errorf("failed to get status: %w", err) + } + + retrievalStatus := status.Retrieving.Status + + switch retrievalStatus { + case StatusFinished: + return nil + case StatusFailed: + if len(status.Retrieving.Alerts) > 0 { + for _, alert := range status.Retrieving.Alerts { + return fmt.Errorf("refresh failed: %s", alert.Message) + } + } + + return fmt.Errorf("refresh failed (no details available)") + case StatusRefreshing, StatusSnapshotting, StatusRenewed: + // still in progress + continue + case StatusInactive, StatusPending: + // not started yet or pending + continue + default: + continue + } + } + } +} + +// IsRefreshInProgress checks if a refresh is currently in progress. +func (c *DBLabClient) IsRefreshInProgress(ctx context.Context) (bool, error) { + status, err := c.GetStatus(ctx) + if err != nil { + return false, err + } + + switch status.Retrieving.Status { + case StatusRefreshing, StatusSnapshotting: + return true, nil + default: + return false, nil + } +} + +// Health checks if the DBLab Engine is healthy. 
+func (c *DBLabClient) Health(ctx context.Context) error { + resp, err := c.doRequest(ctx, http.MethodGet, "/healthz", nil) + if err != nil { + return err + } + defer resp.Body.Close() + + return nil +} + +func (c *DBLabClient) doRequest(ctx context.Context, method, path string, body io.Reader) (*http.Response, error) { + url := c.baseURL + path + + req, err := http.NewRequestWithContext(ctx, method, url, body) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + + req.Header.Set(verificationHeader, c.token) + + if body != nil { + req.Header.Set("Content-Type", contentTypeJSON) + } + + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("request failed: %w", err) + } + + if resp.StatusCode >= http.StatusBadRequest { + defer resp.Body.Close() + + bodyBytes, _ := io.ReadAll(resp.Body) + + var errModel APIError + if err := json.Unmarshal(bodyBytes, &errModel); err == nil && errModel.Message != "" { + return nil, fmt.Errorf("API error (status %d): %s", resp.StatusCode, errModel.Message) + } + + return nil, fmt.Errorf("API error (status %d): %s", resp.StatusCode, string(bodyBytes)) + } + + return resp, nil +} diff --git a/rds-refresh/go.mod b/rds-refresh/go.mod new file mode 100644 index 00000000..51b1c9d3 --- /dev/null +++ b/rds-refresh/go.mod @@ -0,0 +1,28 @@ +module github.com/postgres-ai/rds-refresh + +go 1.23 + +toolchain go1.24.7 + +require ( + github.com/aws/aws-lambda-go v1.51.0 + github.com/aws/aws-sdk-go-v2 v1.41.0 + github.com/aws/aws-sdk-go-v2/config v1.32.5 + github.com/aws/aws-sdk-go-v2/service/rds v1.113.1 + gopkg.in/yaml.v3 v3.0.1 +) + +require ( + github.com/aws/aws-sdk-go-v2/credentials v1.19.5 // indirect + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.16 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.16 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.16 // indirect + github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 // indirect + 
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.16 // indirect + github.com/aws/aws-sdk-go-v2/service/signin v1.0.4 // indirect + github.com/aws/aws-sdk-go-v2/service/sso v1.30.7 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.12 // indirect + github.com/aws/aws-sdk-go-v2/service/sts v1.41.5 // indirect + github.com/aws/smithy-go v1.24.0 // indirect +) diff --git a/rds-refresh/go.sum b/rds-refresh/go.sum new file mode 100644 index 00000000..2907c9d4 --- /dev/null +++ b/rds-refresh/go.sum @@ -0,0 +1,42 @@ +github.com/aws/aws-lambda-go v1.51.0 h1:/THH60NjiAs3K5TWet3Gx5w8MdR7oPOQH9utaKYY1JQ= +github.com/aws/aws-lambda-go v1.51.0/go.mod h1:dpMpZgvWx5vuQJfBt0zqBha60q7Dd7RfgJv23DymV8A= +github.com/aws/aws-sdk-go-v2 v1.41.0 h1:tNvqh1s+v0vFYdA1xq0aOJH+Y5cRyZ5upu6roPgPKd4= +github.com/aws/aws-sdk-go-v2 v1.41.0/go.mod h1:MayyLB8y+buD9hZqkCW3kX1AKq07Y5pXxtgB+rRFhz0= +github.com/aws/aws-sdk-go-v2/config v1.32.5 h1:pz3duhAfUgnxbtVhIK39PGF/AHYyrzGEyRD9Og0QrE8= +github.com/aws/aws-sdk-go-v2/config v1.32.5/go.mod h1:xmDjzSUs/d0BB7ClzYPAZMmgQdrodNjPPhd6bGASwoE= +github.com/aws/aws-sdk-go-v2/credentials v1.19.5 h1:xMo63RlqP3ZZydpJDMBsH9uJ10hgHYfQFIk1cHDXrR4= +github.com/aws/aws-sdk-go-v2/credentials v1.19.5/go.mod h1:hhbH6oRcou+LpXfA/0vPElh/e0M3aFeOblE1sssAAEk= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.16 h1:80+uETIWS1BqjnN9uJ0dBUaETh+P1XwFy5vwHwK5r9k= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.16/go.mod h1:wOOsYuxYuB/7FlnVtzeBYRcjSRtQpAW0hCP7tIULMwo= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.16 h1:rgGwPzb82iBYSvHMHXc8h9mRoOUBZIGFgKb9qniaZZc= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.16/go.mod h1:L/UxsGeKpGoIj6DxfhOWHWQ/kGKcd4I1VncE4++IyKA= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.16 h1:1jtGzuV7c82xnqOVfx2F0xmJcOw5374L7N6juGW6x6U= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.16/go.mod 
h1:M2E5OQf+XLe+SZGmmpaI2yy+J326aFf6/+54PoxSANc= +github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 h1:WKuaxf++XKWlHWu9ECbMlha8WOEGm0OUEZqm4K/Gcfk= +github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4/go.mod h1:ZWy7j6v1vWGmPReu0iSGvRiise4YI5SkR3OHKTZ6Wuc= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4 h1:0ryTNEdJbzUCEWkVXEXoqlXV72J5keC1GvILMOuD00E= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4/go.mod h1:HQ4qwNZh32C3CBeO6iJLQlgtMzqeG17ziAA/3KDJFow= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.16 h1:oHjJHeUy0ImIV0bsrX0X91GkV5nJAyv1l1CC9lnO0TI= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.16/go.mod h1:iRSNGgOYmiYwSCXxXaKb9HfOEj40+oTKn8pTxMlYkRM= +github.com/aws/aws-sdk-go-v2/service/rds v1.113.1 h1:/vV0g/Su8rCTqT57UUYiFU/aRrPXz//fGDn1dkXblG4= +github.com/aws/aws-sdk-go-v2/service/rds v1.113.1/go.mod h1:q02df+DL73LN+jDXzj86tMsI6kKf1kfv61nB684H+o8= +github.com/aws/aws-sdk-go-v2/service/signin v1.0.4 h1:HpI7aMmJ+mm1wkSHIA2t5EaFFv5EFYXePW30p1EIrbQ= +github.com/aws/aws-sdk-go-v2/service/signin v1.0.4/go.mod h1:C5RdGMYGlfM0gYq/tifqgn4EbyX99V15P2V3R+VHbQU= +github.com/aws/aws-sdk-go-v2/service/sso v1.30.7 h1:eYnlt6QxnFINKzwxP5/Ucs1vkG7VT3Iezmvfgc2waUw= +github.com/aws/aws-sdk-go-v2/service/sso v1.30.7/go.mod h1:+fWt2UHSb4kS7Pu8y+BMBvJF0EWx+4H0hzNwtDNRTrg= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.12 h1:AHDr0DaHIAo8c9t1emrzAlVDFp+iMMKnPdYy6XO4MCE= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.12/go.mod h1:GQ73XawFFiWxyWXMHWfhiomvP3tXtdNar/fi8z18sx0= +github.com/aws/aws-sdk-go-v2/service/sts v1.41.5 h1:SciGFVNZ4mHdm7gpD1dgZYnCuVdX1s+lFTg4+4DOy70= +github.com/aws/aws-sdk-go-v2/service/sts v1.41.5/go.mod h1:iW40X4QBmUxdP+fZNOpfmkdMZqsovezbAeO+Ubiv2pk= +github.com/aws/smithy-go v1.24.0 h1:LpilSUItNPFr1eY85RYgTIg5eIEPtvFbskaFcmmIUnk= +github.com/aws/smithy-go v1.24.0/go.mod h1:LEj2LM3rBRQJxPZTB4KuzZkaZYnZPnvgIhb4pu07mx0= +github.com/davecgh/go-spew v1.1.1 
h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.7.2 h1:4jaiDzPyXQvSd7D0EjG45355tLlV3VOECpq10pLC+8s= +github.com/stretchr/testify v1.7.2/go.mod h1:R6va5+xMeoiuVRoj+gSkQ7d3FALtqAAGI1FQKckRals= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/rds-refresh/main.go b/rds-refresh/main.go new file mode 100644 index 00000000..e353b51a --- /dev/null +++ b/rds-refresh/main.go @@ -0,0 +1,321 @@ +/* +2024 © Postgres.ai + +rds-refresh - Automate DBLab full refresh using RDS/Aurora snapshots + +This tool creates a temporary RDS/Aurora clone from a snapshot, triggers +a DBLab Engine full refresh, and then cleans up the temporary clone. 
+*/ +package main + +import ( + "context" + "encoding/json" + "flag" + "fmt" + "os" + "os/signal" + "syscall" + + "github.com/aws/aws-lambda-go/lambda" +) + +var ( + version = "dev" + buildTime = "unknown" +) + +func main() { + // Check if running in Lambda + if os.Getenv("AWS_LAMBDA_FUNCTION_NAME") != "" { + lambda.Start(HandleLambda) + return + } + + // CLI mode + configPath := flag.String("config", "", "Path to configuration file") + dryRun := flag.Bool("dry-run", false, "Validate configuration without creating resources") + showVersion := flag.Bool("version", false, "Show version information") + help := flag.Bool("help", false, "Show help") + + flag.Usage = printUsage + flag.Parse() + + if *help { + printUsage() + os.Exit(0) + } + + if *showVersion { + fmt.Printf("rds-refresh version %s (built: %s)\n", version, buildTime) + os.Exit(0) + } + + if *configPath == "" { + fmt.Fprintln(os.Stderr, "error: -config flag is required") + printUsage() + os.Exit(1) + } + + if err := run(*configPath, *dryRun); err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(1) + } +} + +func run(configPath string, dryRun bool) error { + cfg, err := LoadConfig(configPath) + if err != nil { + return fmt.Errorf("failed to load config: %w", err) + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Handle interrupt signals + sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) + + go func() { + sig := <-sigCh + fmt.Printf("\nReceived signal %v, initiating graceful shutdown...\n", sig) + cancel() + }() + + logger := &DefaultLogger{} + + refresher, err := NewRefresher(ctx, cfg, logger) + if err != nil { + return fmt.Errorf("failed to initialize refresher: %w", err) + } + + if dryRun { + return refresher.DryRun(ctx) + } + + result := refresher.Run(ctx) + + fmt.Println() + fmt.Println("=== Refresh Summary ===") + fmt.Printf("Success: %v\n", result.Success) + fmt.Printf("Snapshot: %s\n", result.SnapshotID) + 
fmt.Printf("Clone ID: %s\n", result.CloneID) + fmt.Printf("Duration: %v\n", result.Duration.Round(1e9)) + + if result.Error != nil { + return result.Error + } + + return nil +} + +// LambdaEvent is the input event for the Lambda function. +type LambdaEvent struct { + // DryRun, if true, only validates configuration without creating resources. + DryRun bool `json:"dryRun"` + // ConfigOverrides allows overriding configuration values. + ConfigOverrides *ConfigOverrides `json:"configOverrides"` +} + +// ConfigOverrides allows partial configuration overrides via the Lambda event. +type ConfigOverrides struct { + SnapshotIdentifier string `json:"snapshotIdentifier"` +} + +// LambdaResponse is the output response from the Lambda function. +type LambdaResponse struct { + Success bool `json:"success"` + Message string `json:"message"` + SnapshotID string `json:"snapshotId,omitempty"` + CloneID string `json:"cloneId,omitempty"` + CloneEndpoint string `json:"cloneEndpoint,omitempty"` + DurationSec int64 `json:"durationSeconds,omitempty"` + Error string `json:"error,omitempty"` +} + +// HandleLambda is the Lambda function handler. 
+func HandleLambda(ctx context.Context, event LambdaEvent) (LambdaResponse, error) {
+	// failure builds the structured error reply. The handler always returns a nil
+	// Go error; problems are reported through the response body instead.
+	failure := func(message string, cause error) (LambdaResponse, error) {
+		return LambdaResponse{
+			Success: false,
+			Error:   cause.Error(),
+			Message: message,
+		}, nil
+	}
+
+	logger := &DefaultLogger{}
+
+	cfg, cfgErr := loadLambdaConfig()
+	if cfgErr != nil {
+		return failure("failed to load configuration", cfgErr)
+	}
+
+	// An explicit snapshot in the event wins over the environment configuration.
+	if overrides := event.ConfigOverrides; overrides != nil && overrides.SnapshotIdentifier != "" {
+		cfg.Source.SnapshotIdentifier = overrides.SnapshotIdentifier
+	}
+
+	refresher, initErr := NewRefresher(ctx, cfg, logger)
+	if initErr != nil {
+		return failure("failed to initialize refresher", initErr)
+	}
+
+	if event.DryRun {
+		if dryErr := refresher.DryRun(ctx); dryErr != nil {
+			return failure("dry run failed", dryErr)
+		}
+
+		return LambdaResponse{
+			Success: true,
+			Message: "dry run completed successfully",
+		}, nil
+	}
+
+	outcome := refresher.Run(ctx)
+
+	response := LambdaResponse{
+		Success:       outcome.Success,
+		Message:       "refresh completed successfully",
+		SnapshotID:    outcome.SnapshotID,
+		CloneID:       outcome.CloneID,
+		CloneEndpoint: outcome.CloneEndpoint,
+		DurationSec:   int64(outcome.Duration.Seconds()),
+	}
+
+	if outcome.Error != nil {
+		response.Error = outcome.Error.Error()
+		response.Message = "refresh failed"
+	}
+
+	return response, nil
+}
+
+// loadLambdaConfig loads configuration from environment variables.
+func loadLambdaConfig() (*Config, error) { + cfg := &Config{} + + // Source configuration + cfg.Source.Type = getEnvOrDefault("RDS_SOURCE_TYPE", "rds") + cfg.Source.Identifier = os.Getenv("RDS_SOURCE_IDENTIFIER") + cfg.Source.SnapshotIdentifier = os.Getenv("RDS_SNAPSHOT_IDENTIFIER") + + // Clone configuration + cfg.Clone.InstanceClass = os.Getenv("RDS_CLONE_INSTANCE_CLASS") + cfg.Clone.DBSubnetGroupName = os.Getenv("RDS_CLONE_SUBNET_GROUP") + + if sgJSON := os.Getenv("RDS_CLONE_SECURITY_GROUPS"); sgJSON != "" { + if err := json.Unmarshal([]byte(sgJSON), &cfg.Clone.VPCSecurityGroupIDs); err != nil { + return nil, fmt.Errorf("invalid RDS_CLONE_SECURITY_GROUPS JSON: %w", err) + } + } + + cfg.Clone.PubliclyAccessible = os.Getenv("RDS_CLONE_PUBLIC") == "true" + cfg.Clone.ParameterGroupName = os.Getenv("RDS_CLONE_PARAMETER_GROUP") + cfg.Clone.OptionGroupName = os.Getenv("RDS_CLONE_OPTION_GROUP") + cfg.Clone.DBClusterParameterGroupName = os.Getenv("RDS_CLONE_CLUSTER_PARAMETER_GROUP") + cfg.Clone.EnableIAMAuth = os.Getenv("RDS_CLONE_ENABLE_IAM_AUTH") == "true" + cfg.Clone.StorageType = os.Getenv("RDS_CLONE_STORAGE_TYPE") + + // Parse tags from JSON + if tagsJSON := os.Getenv("RDS_CLONE_TAGS"); tagsJSON != "" { + if err := json.Unmarshal([]byte(tagsJSON), &cfg.Clone.Tags); err != nil { + return nil, fmt.Errorf("invalid RDS_CLONE_TAGS JSON: %w", err) + } + } + + // DBLab configuration + cfg.DBLab.APIEndpoint = os.Getenv("DBLAB_API_ENDPOINT") + cfg.DBLab.Token = os.Getenv("DBLAB_TOKEN") + cfg.DBLab.Insecure = os.Getenv("DBLAB_INSECURE") == "true" + + // AWS configuration + cfg.AWS.Region = os.Getenv("AWS_REGION") + + if err := cfg.Validate(); err != nil { + return nil, err + } + + cfg.SetDefaults() + + return cfg, nil +} + +func getEnvOrDefault(key, defaultValue string) string { + if v := os.Getenv(key); v != "" { + return v + } + + return defaultValue +} + +func printUsage() { + fmt.Fprintf(os.Stderr, `rds-refresh - Automate DBLab full refresh using RDS/Aurora snapshots + 
+This tool creates a temporary RDS/Aurora clone from a snapshot, triggers +a DBLab Engine full refresh, and then cleans up the temporary clone. + +USAGE: + rds-refresh -config [options] + +OPTIONS: + -config Path to YAML configuration file (required) + -dry-run Validate configuration without creating resources + -version Show version information + -help Show this help message + +LAMBDA MODE: + When running as an AWS Lambda function (detected via AWS_LAMBDA_FUNCTION_NAME + environment variable), configuration is loaded from environment variables: + + Required: + RDS_SOURCE_IDENTIFIER Source RDS instance or Aurora cluster ID + RDS_CLONE_INSTANCE_CLASS Instance class for the clone (e.g., db.t3.medium) + DBLAB_API_ENDPOINT DBLab Engine API endpoint + DBLAB_TOKEN DBLab verification token + AWS_REGION AWS region + + Optional: + RDS_SOURCE_TYPE "rds" or "aurora-cluster" (default: rds) + RDS_SNAPSHOT_IDENTIFIER Specific snapshot ID (default: latest) + RDS_CLONE_SUBNET_GROUP DB subnet group name + RDS_CLONE_SECURITY_GROUPS JSON array of security group IDs + RDS_CLONE_PUBLIC "true" to make clone publicly accessible + RDS_CLONE_PARAMETER_GROUP DB parameter group name + RDS_CLONE_ENABLE_IAM_AUTH "true" to enable IAM authentication + RDS_CLONE_STORAGE_TYPE Storage type (gp2, gp3, io1, etc.) 
+ RDS_CLONE_TAGS JSON object of additional tags + DBLAB_INSECURE "true" to skip TLS verification + +EXAMPLE CONFIGURATION: + + source: + type: rds + identifier: production-db + + clone: + instanceClass: db.t3.medium + subnetGroup: default-vpc-subnet + securityGroups: + - sg-12345678 + publiclyAccessible: false + enableIAMAuth: true + + dblab: + apiEndpoint: https://dblab.example.com:2345 + token: ${DBLAB_TOKEN} + pollInterval: 30s + timeout: 4h + + aws: + region: us-east-1 + +For more information, see: + https://postgres.ai/docs/database-lab-engine + +`) +} diff --git a/rds-refresh/rds.go b/rds-refresh/rds.go new file mode 100644 index 00000000..91757d0b --- /dev/null +++ b/rds-refresh/rds.go @@ -0,0 +1,505 @@ +/* +2024 © Postgres.ai +*/ + +package main + +import ( + "context" + "fmt" + "sort" + "time" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/service/rds" + "github.com/aws/aws-sdk-go-v2/service/rds/types" +) + +const ( + cloneNamePrefix = "dblab-refresh-" + waitPollInterval = 30 * time.Second + maxWaitTime = 2 * time.Hour + defaultPort int32 = 5432 +) + +// RDSClient wraps the AWS RDS client with convenience methods. +type RDSClient struct { + client *rds.Client + cfg *Config +} + +// CloneInfo holds information about a created clone. +type CloneInfo struct { + Identifier string + Endpoint string + Port int32 + IsCluster bool +} + +// NewRDSClient creates a new RDS client. 
+func NewRDSClient(ctx context.Context, cfg *Config) (*RDSClient, error) {
+	awsConfig, loadErr := config.LoadDefaultConfig(ctx, config.WithRegion(cfg.AWS.Region))
+	if loadErr != nil {
+		return nil, fmt.Errorf("failed to load AWS config: %w", loadErr)
+	}
+
+	// A custom endpoint, when configured, replaces the default RDS endpoint.
+	var clientOptions []func(*rds.Options)
+
+	if endpoint := cfg.AWS.Endpoint; endpoint != "" {
+		clientOptions = append(clientOptions, func(o *rds.Options) {
+			o.BaseEndpoint = aws.String(endpoint)
+		})
+	}
+
+	rdsClient := &RDSClient{
+		client: rds.NewFromConfig(awsConfig, clientOptions...),
+		cfg:    cfg,
+	}
+
+	return rdsClient, nil
+}
+
+// FindLatestSnapshot returns the snapshot to restore from: the explicitly
+// configured snapshot identifier when one is set, otherwise the newest
+// available automated snapshot of the configured source.
+func (r *RDSClient) FindLatestSnapshot(ctx context.Context) (string, error) {
+	source := r.cfg.Source
+
+	switch {
+	case source.SnapshotIdentifier != "":
+		return source.SnapshotIdentifier, nil
+	case source.Type == "aurora-cluster":
+		return r.findLatestClusterSnapshot(ctx)
+	default:
+		return r.findLatestDBSnapshot(ctx)
+	}
+}
+
+// findLatestDBSnapshot returns the identifier of the newest automated snapshot
+// of the source RDS instance whose status is "available".
+func (r *RDSClient) findLatestDBSnapshot(ctx context.Context) (string, error) {
+	sourceID := r.cfg.Source.Identifier
+
+	output, err := r.client.DescribeDBSnapshots(ctx, &rds.DescribeDBSnapshotsInput{
+		DBInstanceIdentifier: aws.String(sourceID),
+		SnapshotType:         aws.String("automated"),
+	})
+	if err != nil {
+		return "", fmt.Errorf("failed to describe DB snapshots: %w", err)
+	}
+
+	snapshots := output.DBSnapshots
+	if len(snapshots) == 0 {
+		return "", fmt.Errorf("no automated snapshots found for RDS instance %q", sourceID)
+	}
+
+	// Newest first; snapshots without a creation time sort after those with one.
+	sort.Slice(snapshots, func(a, b int) bool {
+		left, right := snapshots[a].SnapshotCreateTime, snapshots[b].SnapshotCreateTime
+
+		switch {
+		case left == nil:
+			return false
+		case right == nil:
+			return true
+		default:
+			return left.After(*right)
+		}
+	})
+
+	// The first "available" entry in that order is the one to use.
+	for idx := range snapshots {
+		if state := snapshots[idx].Status; state != nil && *state == "available" {
+			return *snapshots[idx].DBSnapshotIdentifier, nil
+		}
+	}
+
+	return "", fmt.Errorf("no available snapshots found for RDS instance %q", sourceID)
+}
+
+// findLatestClusterSnapshot returns the identifier of the newest automated
+// snapshot of the source Aurora cluster whose status is "available".
+func (r *RDSClient) findLatestClusterSnapshot(ctx context.Context) (string, error) {
+	sourceID := r.cfg.Source.Identifier
+
+	output, err := r.client.DescribeDBClusterSnapshots(ctx, &rds.DescribeDBClusterSnapshotsInput{
+		DBClusterIdentifier: aws.String(sourceID),
+		SnapshotType:        aws.String("automated"),
+	})
+	if err != nil {
+		return "", fmt.Errorf("failed to describe DB cluster snapshots: %w", err)
+	}
+
+	snapshots := output.DBClusterSnapshots
+	if len(snapshots) == 0 {
+		return "", fmt.Errorf("no automated snapshots found for Aurora cluster %q", sourceID)
+	}
+
+	// Newest first; snapshots without a creation time sort after those with one.
+	sort.Slice(snapshots, func(a, b int) bool {
+		left, right := snapshots[a].SnapshotCreateTime, snapshots[b].SnapshotCreateTime
+
+		switch {
+		case left == nil:
+			return false
+		case right == nil:
+			return true
+		default:
+			return left.After(*right)
+		}
+	})
+
+	// The first "available" entry in that order is the one to use.
+	for idx := range snapshots {
+		if state := snapshots[idx].Status; state != nil && *state == "available" {
+			return *snapshots[idx].DBClusterSnapshotIdentifier, nil
+		}
+	}
+
+	return "", fmt.Errorf("no available snapshots found for Aurora cluster %q", sourceID)
+}
+
+// CreateClone creates a temporary clone from a snapshot.
+func (r *RDSClient) CreateClone(ctx context.Context, snapshotID string) (*CloneInfo, error) {
+	// The clone name is the fixed prefix plus a UTC timestamp with one-second resolution.
+	cloneName := fmt.Sprintf("%s%s", cloneNamePrefix, time.Now().UTC().Format("20060102-150405"))
+
+	if r.cfg.Source.Type == "aurora-cluster" {
+		return r.createAuroraClone(ctx, snapshotID, cloneName)
+	}
+
+	return r.createRDSClone(ctx, snapshotID, cloneName)
+}
+
+// createRDSClone starts restoring a standalone RDS instance named cloneName
+// from snapshotID. It returns as soon as the restore request is accepted; the
+// returned CloneInfo has no endpoint yet (see WaitForCloneAvailable).
+func (r *RDSClient) createRDSClone(ctx context.Context, snapshotID, cloneName string) (*CloneInfo, error) {
+	tags := r.buildTags()
+
+	input := &rds.RestoreDBInstanceFromDBSnapshotInput{
+		DBInstanceIdentifier: aws.String(cloneName),
+		DBSnapshotIdentifier: aws.String(snapshotID),
+		DBInstanceClass:      aws.String(r.cfg.Clone.InstanceClass),
+		PubliclyAccessible:   aws.Bool(r.cfg.Clone.PubliclyAccessible),
+		Tags:                 tags,
+		DeletionProtection:   aws.Bool(r.cfg.Clone.DeletionProtection),
+	}
+
+	// Optional settings are only sent when configured.
+	if r.cfg.Clone.DBSubnetGroupName != "" {
+		input.DBSubnetGroupName = aws.String(r.cfg.Clone.DBSubnetGroupName)
+	}
+
+	if len(r.cfg.Clone.VPCSecurityGroupIDs) > 0 {
+		input.VpcSecurityGroupIds = r.cfg.Clone.VPCSecurityGroupIDs
+	}
+
+	if r.cfg.Clone.ParameterGroupName != "" {
+		input.DBParameterGroupName = aws.String(r.cfg.Clone.ParameterGroupName)
+	}
+
+	if r.cfg.Clone.OptionGroupName != "" {
+		input.OptionGroupName = aws.String(r.cfg.Clone.OptionGroupName)
+	}
+
+	if r.cfg.Clone.Port > 0 {
+		input.Port = aws.Int32(r.cfg.Clone.Port)
+	}
+
+	if r.cfg.Clone.EnableIAMAuth {
+		input.EnableIAMDatabaseAuthentication = aws.Bool(true)
+	}
+
+	if r.cfg.Clone.StorageType != "" {
+		input.StorageType = aws.String(r.cfg.Clone.StorageType)
+	}
+
+	_, err := r.client.RestoreDBInstanceFromDBSnapshot(ctx, input)
+	if err != nil {
+		return nil, fmt.Errorf("failed to restore DB instance from snapshot: %w", err)
+	}
+
+	return &CloneInfo{
+		Identifier: cloneName,
+		IsCluster:  false,
+	}, nil
+}
+
+// createAuroraClone restores an Aurora cluster named cloneName from snapshotID,
+// waits for the cluster to become available, and then creates one DB instance
+// ("<cloneName>-instance") inside it. If a step after the cluster restore
+// fails, the partially created cluster is removed on a best-effort basis.
+func (r *RDSClient) createAuroraClone(ctx context.Context, snapshotID, cloneName string) (*CloneInfo, error) {
+	tags := r.buildTags()
+
+	// Get the engine from the snapshot first
+	snapshotResp, err := r.client.DescribeDBClusterSnapshots(ctx, &rds.DescribeDBClusterSnapshotsInput{
+		DBClusterSnapshotIdentifier: aws.String(snapshotID),
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to describe cluster snapshot: %w", err)
+	}
+
+	if len(snapshotResp.DBClusterSnapshots) == 0 {
+		return nil, fmt.Errorf("snapshot %q not found", snapshotID)
+	}
+
+	snapshot := snapshotResp.DBClusterSnapshots[0]
+
+	// Restore the Aurora cluster
+	clusterInput := &rds.RestoreDBClusterFromSnapshotInput{
+		DBClusterIdentifier: aws.String(cloneName),
+		SnapshotIdentifier:  aws.String(snapshotID),
+		Engine:              snapshot.Engine,
+		Tags:                tags,
+		DeletionProtection:  aws.Bool(r.cfg.Clone.DeletionProtection),
+	}
+
+	if r.cfg.Clone.DBSubnetGroupName != "" {
+		clusterInput.DBSubnetGroupName = aws.String(r.cfg.Clone.DBSubnetGroupName)
+	}
+
+	if len(r.cfg.Clone.VPCSecurityGroupIDs) > 0 {
+		clusterInput.VpcSecurityGroupIds = r.cfg.Clone.VPCSecurityGroupIDs
+	}
+
+	if r.cfg.Clone.DBClusterParameterGroupName != "" {
+		clusterInput.DBClusterParameterGroupName = aws.String(r.cfg.Clone.DBClusterParameterGroupName)
+	}
+
+	if r.cfg.Clone.Port > 0 {
+		clusterInput.Port = aws.Int32(r.cfg.Clone.Port)
+	}
+
+	if r.cfg.Clone.EnableIAMAuth {
+		clusterInput.EnableIAMDatabaseAuthentication = aws.Bool(true)
+	}
+
+	_, err = r.client.RestoreDBClusterFromSnapshot(ctx, clusterInput)
+	if err != nil {
+		return nil, fmt.Errorf("failed to restore DB cluster from snapshot: %w", err)
+	}
+
+	// Wait for cluster to be available before creating instance
+	if err := r.waitForClusterAvailable(ctx, cloneName); err != nil {
+		return nil, r.abortAuroraClone(cloneName, fmt.Errorf("cluster did not become available: %w", err))
+	}
+
+	// Create a DB instance in the cluster
+	instanceName := cloneName + "-instance"
+	instanceInput := &rds.CreateDBInstanceInput{
+		DBInstanceIdentifier: aws.String(instanceName),
+		DBInstanceClass:      aws.String(r.cfg.Clone.InstanceClass),
+		DBClusterIdentifier:  aws.String(cloneName),
+		Engine:               snapshot.Engine,
+		Tags:                 tags,
+	}
+
+	if r.cfg.Clone.ParameterGroupName != "" {
+		instanceInput.DBParameterGroupName = aws.String(r.cfg.Clone.ParameterGroupName)
+	}
+
+	_, err = r.client.CreateDBInstance(ctx, instanceInput)
+	if err != nil {
+		return nil, r.abortAuroraClone(cloneName, fmt.Errorf("failed to create DB instance in cluster: %w", err))
+	}
+
+	return &CloneInfo{
+		Identifier: cloneName,
+		IsCluster:  true,
+	}, nil
+}
+
+// abortAuroraClone tries to delete a partially created clone cluster and
+// returns cause, annotated with the cleanup error when the deletion fails.
+//
+// Cleanup runs on its own context: the failure being handled is often a
+// cancelled or expired caller context, and reusing it would make every cleanup
+// call fail immediately and leave the cluster behind.
+func (r *RDSClient) abortAuroraClone(cloneName string, cause error) error {
+	const cleanupTimeout = 10 * time.Minute
+
+	cleanupCtx, cancel := context.WithTimeout(context.Background(), cleanupTimeout)
+	defer cancel()
+
+	if cleanupErr := r.deleteAuroraCluster(cleanupCtx, cloneName); cleanupErr != nil {
+		return fmt.Errorf("%w (cleanup of cluster %q failed, manual deletion may be required: %v)",
+			cause, cloneName, cleanupErr)
+	}
+
+	return cause
+}
+
+// buildTags converts the configured clone tags into the RDS tag list.
+// The result is empty (not nil) when no tags are configured.
+func (r *RDSClient) buildTags() []types.Tag {
+	tags := make([]types.Tag, 0, len(r.cfg.Clone.Tags))
+
+	for k, v := range r.cfg.Clone.Tags {
+		tags = append(tags, types.Tag{
+			Key:   aws.String(k),
+			Value: aws.String(v),
+		})
+	}
+
+	return tags
+}
+
+// WaitForCloneAvailable waits for the clone to become available and then fills
+// in clone.Endpoint and clone.Port in place. For a cluster clone it waits on
+// the "<identifier>-instance" member and records the cluster endpoint; for an
+// instance clone it records the instance endpoint. The port falls back to
+// defaultPort when the API reports none.
+func (r *RDSClient) WaitForCloneAvailable(ctx context.Context, clone *CloneInfo) error {
+	if clone.IsCluster {
+		instanceName := clone.Identifier + "-instance"
+
+		if err := r.waitForInstanceAvailable(ctx, instanceName); err != nil {
+			return err
+		}
+
+		// Get the cluster endpoint
+		clusterResp, err := r.client.DescribeDBClusters(ctx, &rds.DescribeDBClustersInput{
+			DBClusterIdentifier: aws.String(clone.Identifier),
+		})
+		if err != nil {
+			return fmt.Errorf("failed to describe cluster: %w", err)
+		}
+
+		if len(clusterResp.DBClusters) == 0 {
+			return fmt.Errorf("cluster %q not found", clone.Identifier)
+		}
+
+		cluster := clusterResp.DBClusters[0]
+		clone.Endpoint = aws.ToString(cluster.Endpoint)
+		clone.Port = aws.ToInt32(cluster.Port)
+
+		if clone.Port == 0 {
+			clone.Port = defaultPort
+		}
+
+		return nil
+	}
+
+	if err := r.waitForInstanceAvailable(ctx, clone.Identifier); err != nil {
+		return err
+	}
+
+	// Get the instance endpoint
+	instanceResp, err := r.client.DescribeDBInstances(ctx, &rds.DescribeDBInstancesInput{
+		DBInstanceIdentifier: aws.String(clone.Identifier),
+	})
+	if err != nil {
+		return fmt.Errorf("failed to describe instance: %w", err)
+	}
+
+	if len(instanceResp.DBInstances) == 0 {
+		return fmt.Errorf("instance %q not found", clone.Identifier)
+	}
+
+	instance := instanceResp.DBInstances[0]
+
+	if instance.Endpoint != nil {
+		clone.Endpoint = aws.ToString(instance.Endpoint.Address)
+		clone.Port = aws.ToInt32(instance.Endpoint.Port)
+	}
+
+	if clone.Port == 0 {
+		clone.Port = defaultPort
+	}
+
+	return nil
+}
+
+// waitForInstanceAvailable blocks, for at most maxWaitTime, until the SDK
+// waiter reports the DB instance as available.
+func (r *RDSClient) waitForInstanceAvailable(ctx context.Context, identifier string) error {
+	waiter := rds.NewDBInstanceAvailableWaiter(r.client)
+
+	return waiter.Wait(ctx, &rds.DescribeDBInstancesInput{
+		DBInstanceIdentifier: aws.String(identifier),
+	}, maxWaitTime)
+}
+
+// waitForClusterAvailable blocks, for at most maxWaitTime, until the SDK
+// waiter reports the DB cluster as available.
+func (r *RDSClient) waitForClusterAvailable(ctx context.Context, identifier string) error {
+	waiter := rds.NewDBClusterAvailableWaiter(r.client)
+
+	return waiter.Wait(ctx, &rds.DescribeDBClustersInput{
+		DBClusterIdentifier: aws.String(identifier),
+	}, maxWaitTime)
+}
+
+// DeleteClone deletes the temporary clone.
+func (r *RDSClient) DeleteClone(ctx context.Context, clone *CloneInfo) error {
+	if clone.IsCluster {
+		return r.deleteAuroraCluster(ctx, clone.Identifier)
+	}
+
+	return r.deleteRDSInstance(ctx, clone.Identifier)
+}
+
+// deleteRDSInstance deletes a DB instance without a final snapshot and removes
+// its automated backups. Deletion protection is switched off first on a
+// best-effort basis: a failure there does not stop the delete attempt, but it
+// is included in the returned error if the delete then fails, since it is the
+// most likely cause.
+func (r *RDSClient) deleteRDSInstance(ctx context.Context, identifier string) error {
+	// First, disable deletion protection if enabled
+	_, modifyErr := r.client.ModifyDBInstance(ctx, &rds.ModifyDBInstanceInput{
+		DBInstanceIdentifier: aws.String(identifier),
+		DeletionProtection:   aws.Bool(false),
+		ApplyImmediately:     aws.Bool(true),
+	})
+
+	_, err := r.client.DeleteDBInstance(ctx, &rds.DeleteDBInstanceInput{
+		DBInstanceIdentifier:   aws.String(identifier),
+		SkipFinalSnapshot:      aws.Bool(true),
+		DeleteAutomatedBackups: aws.Bool(true),
+	})
+	if err != nil {
+		if modifyErr != nil {
+			return fmt.Errorf("failed to delete DB instance: %w (disabling deletion protection also failed: %v)",
+				err, modifyErr)
+		}
+
+		return fmt.Errorf("failed to delete DB instance: %w", err)
+	}
+
+	return nil
+}
+
+// deleteAuroraCluster deletes every DB instance that belongs to the cluster,
+// waits (up to maxWaitTime per instance) for those deletions to finish, and
+// then deletes the cluster itself without a final snapshot. Deletion
+// protection on the cluster is switched off first on a best-effort basis.
+func (r *RDSClient) deleteAuroraCluster(ctx context.Context, clusterIdentifier string) error {
+	// First, delete all instances in the cluster
+	instancesResp, err := r.client.DescribeDBInstances(ctx, &rds.DescribeDBInstancesInput{
+		Filters: []types.Filter{
+			{
+				Name:   aws.String("db-cluster-id"),
+				Values: []string{clusterIdentifier},
+			},
+		},
+	})
+	if err != nil {
+		return fmt.Errorf("failed to list cluster instances: %w", err)
+	}
+
+	// Issue all delete requests before waiting so the deletions run in parallel.
+	for _, instance := range instancesResp.DBInstances {
+		if err := r.deleteRDSInstance(ctx, aws.ToString(instance.DBInstanceIdentifier)); err != nil {
+			return fmt.Errorf("failed to delete cluster instance: %w", err)
+		}
+	}
+
+	// Wait for all instances to be deleted
+	deletedWaiter := rds.NewDBInstanceDeletedWaiter(r.client)
+
+	for _, instance := range instancesResp.DBInstances {
+		if err := deletedWaiter.Wait(ctx, &rds.DescribeDBInstancesInput{
+			DBInstanceIdentifier: instance.DBInstanceIdentifier,
+		}, maxWaitTime); err != nil {
+			return fmt.Errorf("failed waiting for instance deletion: %w", err)
+		}
+	}
+
+	// Disable deletion protection on cluster
+	_, modifyErr := r.client.ModifyDBCluster(ctx, &rds.ModifyDBClusterInput{
+		DBClusterIdentifier: aws.String(clusterIdentifier),
+		DeletionProtection:  aws.Bool(false),
+		ApplyImmediately:    aws.Bool(true),
+	})
+
+	// Delete the cluster
+	_, err = r.client.DeleteDBCluster(ctx, &rds.DeleteDBClusterInput{
+		DBClusterIdentifier: aws.String(clusterIdentifier),
+		SkipFinalSnapshot:   aws.Bool(true),
+	})
+	if err != nil {
+		if modifyErr != nil {
+			return fmt.Errorf("failed to delete DB cluster: %w (disabling deletion protection also failed: %v)",
+				err, modifyErr)
+		}
+
+		return fmt.Errorf("failed to delete DB cluster: %w", err)
+	}
+
+	return nil
+}
+
+// GetSourceInfo returns information about the source database.
+// The result is a one-line human-readable description (identifier, engine and
+// engine version) of the configured Aurora cluster or RDS instance; an error is
+// returned when the source cannot be described or does not exist.
+func (r *RDSClient) GetSourceInfo(ctx context.Context) (string, error) {
+	if r.cfg.Source.Type == "aurora-cluster" {
+		resp, err := r.client.DescribeDBClusters(ctx, &rds.DescribeDBClustersInput{
+			DBClusterIdentifier: aws.String(r.cfg.Source.Identifier),
+		})
+		if err != nil {
+			return "", fmt.Errorf("failed to describe source cluster: %w", err)
+		}
+
+		if len(resp.DBClusters) == 0 {
+			return "", fmt.Errorf("source cluster %q not found", r.cfg.Source.Identifier)
+		}
+
+		cluster := resp.DBClusters[0]
+
+		return fmt.Sprintf("Aurora cluster %s (engine: %s, version: %s)",
+			r.cfg.Source.Identifier,
+			aws.ToString(cluster.Engine),
+			aws.ToString(cluster.EngineVersion)), nil
+	}
+
+	resp, err := r.client.DescribeDBInstances(ctx, &rds.DescribeDBInstancesInput{
+		DBInstanceIdentifier: aws.String(r.cfg.Source.Identifier),
+	})
+	if err != nil {
+		return "", fmt.Errorf("failed to describe source instance: %w", err)
+	}
+
+	if len(resp.DBInstances) == 0 {
+		return "", fmt.Errorf("source instance %q not found", r.cfg.Source.Identifier)
+	}
+
+	instance := resp.DBInstances[0]
+
+	return fmt.Sprintf("RDS instance %s (engine: %s, version: %s)",
+		r.cfg.Source.Identifier,
+		aws.ToString(instance.Engine),
+		aws.ToString(instance.EngineVersion)), nil
+}
diff --git a/rds-refresh/refresher.go b/rds-refresh/refresher.go
new file mode 100644
index 00000000..7d9bfc4f
--- /dev/null
+++ b/rds-refresh/refresher.go
@@ -0,0 +1,243 @@
+/*
+2024 © Postgres.ai
+*/
+
+package main
+
+import ( + "context" + "fmt" + "time" +) + +// Logger defines the logging interface. +type Logger interface { + Info(msg string, args ...interface{}) + Error(msg string, args ...interface{}) + Debug(msg string, args ...interface{}) +} + +// DefaultLogger is a simple stdout logger. +type DefaultLogger struct{} + +// Info logs an info message. +func (l *DefaultLogger) Info(msg string, args ...interface{}) { + fmt.Printf("[INFO] "+msg+"\n", args...) +} + +// Error logs an error message. +func (l *DefaultLogger) Error(msg string, args ...interface{}) { + fmt.Printf("[ERROR] "+msg+"\n", args...) +} + +// Debug logs a debug message. +func (l *DefaultLogger) Debug(msg string, args ...interface{}) { + fmt.Printf("[DEBUG] "+msg+"\n", args...) +} + +// Refresher orchestrates the RDS/Aurora clone and DBLab refresh workflow. +type Refresher struct { + cfg *Config + rds *RDSClient + dblab *DBLabClient + logger Logger +} + +// RefreshResult contains the result of a refresh operation. +type RefreshResult struct { + Success bool + SnapshotID string + CloneID string + StartTime time.Time + EndTime time.Time + Duration time.Duration + Error error + CloneEndpoint string +} + +// NewRefresher creates a new Refresher instance. +func NewRefresher(ctx context.Context, cfg *Config, logger Logger) (*Refresher, error) { + if logger == nil { + logger = &DefaultLogger{} + } + + rdsClient, err := NewRDSClient(ctx, cfg) + if err != nil { + return nil, fmt.Errorf("failed to create RDS client: %w", err) + } + + dblabClient := NewDBLabClient(&cfg.DBLab) + + return &Refresher{ + cfg: cfg, + rds: rdsClient, + dblab: dblabClient, + logger: logger, + }, nil +} + +// Run executes the full refresh workflow: +// 1. Verifies DBLab is healthy and not already refreshing +// 2. Finds the latest snapshot +// 3. Creates a temporary clone from the snapshot +// 4. Waits for the clone to be available +// 5. Triggers DBLab full refresh +// 6. Waits for refresh to complete +// 7. 
Deletes the temporary clone +func (r *Refresher) Run(ctx context.Context) *RefreshResult { + result := &RefreshResult{ + StartTime: time.Now(), + } + + defer func() { + result.EndTime = time.Now() + result.Duration = result.EndTime.Sub(result.StartTime) + }() + + // Step 1: Check DBLab health and status + r.logger.Info("Checking DBLab Engine health...") + + if err := r.dblab.Health(ctx); err != nil { + result.Error = fmt.Errorf("DBLab health check failed: %w", err) + return result + } + + inProgress, err := r.dblab.IsRefreshInProgress(ctx) + if err != nil { + result.Error = fmt.Errorf("failed to check DBLab status: %w", err) + return result + } + + if inProgress { + result.Error = fmt.Errorf("refresh already in progress, skipping") + return result + } + + // Step 2: Get source info + r.logger.Info("Checking source database...") + + sourceInfo, err := r.rds.GetSourceInfo(ctx) + if err != nil { + result.Error = fmt.Errorf("failed to get source info: %w", err) + return result + } + + r.logger.Info("Source: %s", sourceInfo) + + // Step 3: Find latest snapshot + r.logger.Info("Finding latest snapshot...") + + snapshotID, err := r.rds.FindLatestSnapshot(ctx) + if err != nil { + result.Error = fmt.Errorf("failed to find snapshot: %w", err) + return result + } + + result.SnapshotID = snapshotID + r.logger.Info("Using snapshot: %s", snapshotID) + + // Step 4: Create temporary clone + r.logger.Info("Creating temporary RDS clone from snapshot...") + + clone, err := r.rds.CreateClone(ctx, snapshotID) + if err != nil { + result.Error = fmt.Errorf("failed to create clone: %w", err) + return result + } + + result.CloneID = clone.Identifier + r.logger.Info("Created clone: %s", clone.Identifier) + + // Ensure cleanup on any exit + defer func() { + r.logger.Info("Cleaning up temporary clone %s...", clone.Identifier) + + if deleteErr := r.rds.DeleteClone(context.Background(), clone); deleteErr != nil { + r.logger.Error("Failed to delete clone %s: %v (manual cleanup may be required)", 
clone.Identifier, deleteErr) + } else { + r.logger.Info("Successfully deleted temporary clone %s", clone.Identifier) + } + }() + + // Step 5: Wait for clone to be available + r.logger.Info("Waiting for clone to become available (this may take 10-30 minutes)...") + + if err := r.rds.WaitForCloneAvailable(ctx, clone); err != nil { + result.Error = fmt.Errorf("clone did not become available: %w", err) + return result + } + + result.CloneEndpoint = clone.Endpoint + r.logger.Info("Clone available at: %s:%d", clone.Endpoint, clone.Port) + + // Step 6: Trigger DBLab full refresh + r.logger.Info("Triggering DBLab full refresh...") + + if err := r.dblab.TriggerFullRefresh(ctx); err != nil { + result.Error = fmt.Errorf("failed to trigger refresh: %w", err) + return result + } + + r.logger.Info("Full refresh triggered, waiting for completion...") + + // Step 7: Wait for refresh to complete + pollInterval := r.cfg.DBLab.PollInterval.Duration() + timeout := r.cfg.DBLab.Timeout.Duration() + + if err := r.dblab.WaitForRefreshComplete(ctx, pollInterval, timeout); err != nil { + result.Error = fmt.Errorf("refresh did not complete: %w", err) + return result + } + + r.logger.Info("DBLab refresh completed successfully!") + result.Success = true + + return result +} + +// DryRun performs all validation steps without actually creating resources. 
+func (r *Refresher) DryRun(ctx context.Context) error { + r.logger.Info("=== DRY RUN MODE ===") + + // Check DBLab + r.logger.Info("Checking DBLab Engine health...") + + if err := r.dblab.Health(ctx); err != nil { + return fmt.Errorf("DBLab health check failed: %w", err) + } + + r.logger.Info("DBLab Engine is healthy") + + // Check current status + status, err := r.dblab.GetStatus(ctx) + if err != nil { + return fmt.Errorf("failed to get DBLab status: %w", err) + } + + r.logger.Info("DBLab retrieval status: %s", status.Retrieving.Status) + + // Check source + r.logger.Info("Checking source database...") + + sourceInfo, err := r.rds.GetSourceInfo(ctx) + if err != nil { + return fmt.Errorf("failed to get source info: %w", err) + } + + r.logger.Info("Source: %s", sourceInfo) + + // Check snapshot + r.logger.Info("Finding latest snapshot...") + + snapshotID, err := r.rds.FindLatestSnapshot(ctx) + if err != nil { + return fmt.Errorf("failed to find snapshot: %w", err) + } + + r.logger.Info("Would use snapshot: %s", snapshotID) + r.logger.Info("Would create clone with instance class: %s", r.cfg.Clone.InstanceClass) + + r.logger.Info("=== DRY RUN COMPLETE - All checks passed ===") + + return nil +} diff --git a/rds-refresh/template.yaml b/rds-refresh/template.yaml new file mode 100644 index 00000000..c14dfd9a --- /dev/null +++ b/rds-refresh/template.yaml @@ -0,0 +1,241 @@ +AWSTemplateFormatVersion: '2010-09-09' +Transform: AWS::Serverless-2016-10-31 +Description: > + DBLab RDS/Aurora Refresh Lambda + + Automates DBLab full refresh using temporary RDS/Aurora clones created from snapshots. 
+ +Metadata: + AWS::ServerlessRepo::Application: + Name: dblab-rds-refresh + Description: Automates DBLab full refresh using temporary RDS/Aurora clones + Author: Postgres.ai + SpdxLicenseId: Apache-2.0 + Labels: ['dblab', 'rds', 'aurora', 'postgresql', 'database'] + HomePageUrl: https://postgres.ai + SourceCodeUrl: https://github.com/postgres-ai/rds-refresh + +Parameters: + # Source Configuration + RDSSourceType: + Type: String + Default: rds + AllowedValues: + - rds + - aurora-cluster + Description: Type of source database (rds for RDS instance, aurora-cluster for Aurora) + + RDSSourceIdentifier: + Type: String + Description: RDS DB instance identifier or Aurora cluster identifier + + RDSSnapshotIdentifier: + Type: String + Default: '' + Description: Specific snapshot ID to use (leave empty for latest automated snapshot) + + # Clone Configuration + RDSCloneInstanceClass: + Type: String + Default: db.t3.medium + Description: Instance class for the temporary clone + + RDSCloneSubnetGroup: + Type: String + Default: '' + Description: DB subnet group name for the clone + + RDSCloneSecurityGroups: + Type: String + Default: '' + Description: JSON array of VPC security group IDs (e.g., '["sg-123", "sg-456"]') + + RDSClonePubliclyAccessible: + Type: String + Default: 'false' + AllowedValues: + - 'true' + - 'false' + Description: Whether the clone should be publicly accessible + + RDSCloneEnableIAMAuth: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Enable IAM database authentication on the clone + + RDSCloneParameterGroup: + Type: String + Default: '' + Description: DB parameter group name for the clone + + RDSCloneStorageType: + Type: String + Default: '' + Description: Storage type for the clone (gp2, gp3, io1, etc.) 
+ + # DBLab Configuration + DBLabAPIEndpoint: + Type: String + Description: DBLab Engine API endpoint (e.g., https://dblab.example.com:2345) + + DBLabToken: + Type: String + NoEcho: true + Description: DBLab verification token + + DBLabInsecure: + Type: String + Default: 'false' + AllowedValues: + - 'true' + - 'false' + Description: Skip TLS certificate verification for DBLab API + + # Schedule Configuration + ScheduleExpression: + Type: String + Default: 'rate(7 days)' + Description: Schedule expression for automatic refresh (e.g., 'rate(7 days)' or 'cron(0 2 ? * SUN *)') + + EnableSchedule: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: Enable scheduled automatic refresh + + # Lambda Configuration + LambdaTimeout: + Type: Number + Default: 900 + MinValue: 60 + MaxValue: 900 + Description: Lambda function timeout in seconds (max 15 minutes) + + LambdaMemorySize: + Type: Number + Default: 256 + MinValue: 128 + MaxValue: 1024 + Description: Lambda function memory size in MB + +Conditions: + ScheduleEnabled: !Equals [!Ref EnableSchedule, 'true'] + HasSubnetGroup: !Not [!Equals [!Ref RDSCloneSubnetGroup, '']] + HasSecurityGroups: !Not [!Equals [!Ref RDSCloneSecurityGroups, '']] + HasParameterGroup: !Not [!Equals [!Ref RDSCloneParameterGroup, '']] + HasStorageType: !Not [!Equals [!Ref RDSCloneStorageType, '']] + HasSnapshotId: !Not [!Equals [!Ref RDSSnapshotIdentifier, '']] + +Globals: + Function: + Timeout: !Ref LambdaTimeout + MemorySize: !Ref LambdaMemorySize + Runtime: provided.al2023 + Architectures: + - arm64 + +Resources: + RDSRefreshFunction: + Type: AWS::Serverless::Function + Metadata: + BuildMethod: go1.x + Properties: + CodeUri: . 
+ Handler: bootstrap + Description: Automates DBLab full refresh using temporary RDS/Aurora clones + Environment: + Variables: + RDS_SOURCE_TYPE: !Ref RDSSourceType + RDS_SOURCE_IDENTIFIER: !Ref RDSSourceIdentifier + RDS_SNAPSHOT_IDENTIFIER: !If [HasSnapshotId, !Ref RDSSnapshotIdentifier, ''] + RDS_CLONE_INSTANCE_CLASS: !Ref RDSCloneInstanceClass + RDS_CLONE_SUBNET_GROUP: !If [HasSubnetGroup, !Ref RDSCloneSubnetGroup, ''] + RDS_CLONE_SECURITY_GROUPS: !If [HasSecurityGroups, !Ref RDSCloneSecurityGroups, ''] + RDS_CLONE_PUBLIC: !Ref RDSClonePubliclyAccessible + RDS_CLONE_ENABLE_IAM_AUTH: !Ref RDSCloneEnableIAMAuth + RDS_CLONE_PARAMETER_GROUP: !If [HasParameterGroup, !Ref RDSCloneParameterGroup, ''] + RDS_CLONE_STORAGE_TYPE: !If [HasStorageType, !Ref RDSCloneStorageType, ''] + DBLAB_API_ENDPOINT: !Ref DBLabAPIEndpoint + DBLAB_TOKEN: !Ref DBLabToken + DBLAB_INSECURE: !Ref DBLabInsecure + Policies: + - Version: '2012-10-17' + Statement: + - Sid: RDSReadSnapshots + Effect: Allow + Action: + - rds:DescribeDBSnapshots + - rds:DescribeDBClusterSnapshots + - rds:DescribeDBInstances + - rds:DescribeDBClusters + Resource: '*' + - Sid: RDSCreateClone + Effect: Allow + Action: + - rds:RestoreDBInstanceFromDBSnapshot + - rds:RestoreDBClusterFromSnapshot + - rds:CreateDBInstance + - rds:AddTagsToResource + - rds:ModifyDBInstance + - rds:ModifyDBCluster + Resource: + - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:db:dblab-refresh-*' + - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:cluster:dblab-refresh-*' + - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:snapshot:*' + - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:cluster-snapshot:*' + - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:subgrp:*' + - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:pg:*' + - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:og:*' + - Sid: RDSDeleteClone + Effect: Allow + Action: + - rds:DeleteDBInstance + - rds:DeleteDBCluster + Resource: + - !Sub 
'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:db:dblab-refresh-*' + - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:cluster:dblab-refresh-*' + Events: + ScheduledRefresh: + Type: Schedule + Properties: + Schedule: !Ref ScheduleExpression + Description: Scheduled DBLab refresh trigger + Enabled: !If [ScheduleEnabled, true, false] + + RDSRefreshLogGroup: + Type: AWS::Logs::LogGroup + Properties: + LogGroupName: !Sub '/aws/lambda/${RDSRefreshFunction}' + RetentionInDays: 30 + +Outputs: + RDSRefreshFunctionArn: + Description: ARN of the RDS Refresh Lambda function + Value: !GetAtt RDSRefreshFunction.Arn + Export: + Name: !Sub '${AWS::StackName}-FunctionArn' + + RDSRefreshFunctionName: + Description: Name of the RDS Refresh Lambda function + Value: !Ref RDSRefreshFunction + + InvocationCommand: + Description: AWS CLI command to manually invoke the function + Value: !Sub | + aws lambda invoke --function-name ${RDSRefreshFunction} \ + --cli-binary-format raw-in-base64-out \ + --payload '{"dryRun": false}' \ + response.json && cat response.json + + DryRunCommand: + Description: AWS CLI command to run a dry-run test + Value: !Sub | + aws lambda invoke --function-name ${RDSRefreshFunction} \ + --cli-binary-format raw-in-base64-out \ + --payload '{"dryRun": true}' \ + response.json && cat response.json From b8aadf174bcdd9757370bf256a0a74586e2fba45 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 10 Dec 2025 02:49:30 +0000 Subject: [PATCH 3/6] =?UTF-8?q?chore:=20update=20copyright=20to=202025=20?= =?UTF-8?q?=C2=A9=20PostgresAI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- engine/cmd/rds-refresh/main.go | 2 +- engine/internal/rdsrefresh/config.go | 2 +- engine/internal/rdsrefresh/dblab.go | 2 +- engine/internal/rdsrefresh/lambda.go | 2 +- engine/internal/rdsrefresh/rds.go | 2 +- engine/internal/rdsrefresh/refresher.go | 2 +- rds-refresh/config.go | 2 +- rds-refresh/dblab.go | 2 +- rds-refresh/main.go | 2 +- 
rds-refresh/rds.go | 2 +- rds-refresh/refresher.go | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/engine/cmd/rds-refresh/main.go b/engine/cmd/rds-refresh/main.go index b51f0efa..5d4eed4f 100644 --- a/engine/cmd/rds-refresh/main.go +++ b/engine/cmd/rds-refresh/main.go @@ -1,5 +1,5 @@ /* -2024 © Postgres.ai +2025 © PostgresAI */ // Package main provides the entry point for the rds-refresh CLI tool. diff --git a/engine/internal/rdsrefresh/config.go b/engine/internal/rdsrefresh/config.go index 4f114f93..6bc90643 100644 --- a/engine/internal/rdsrefresh/config.go +++ b/engine/internal/rdsrefresh/config.go @@ -1,5 +1,5 @@ /* -2024 © Postgres.ai +2025 © PostgresAI */ // Package rdsrefresh provides functionality to automate DBLab full refresh diff --git a/engine/internal/rdsrefresh/dblab.go b/engine/internal/rdsrefresh/dblab.go index 83892e7e..90b86144 100644 --- a/engine/internal/rdsrefresh/dblab.go +++ b/engine/internal/rdsrefresh/dblab.go @@ -1,5 +1,5 @@ /* -2024 © Postgres.ai +2025 © PostgresAI */ package rdsrefresh diff --git a/engine/internal/rdsrefresh/lambda.go b/engine/internal/rdsrefresh/lambda.go index c38896dc..bddaa00d 100644 --- a/engine/internal/rdsrefresh/lambda.go +++ b/engine/internal/rdsrefresh/lambda.go @@ -1,5 +1,5 @@ /* -2024 © Postgres.ai +2025 © PostgresAI */ package rdsrefresh diff --git a/engine/internal/rdsrefresh/rds.go b/engine/internal/rdsrefresh/rds.go index f39d382d..b105532d 100644 --- a/engine/internal/rdsrefresh/rds.go +++ b/engine/internal/rdsrefresh/rds.go @@ -1,5 +1,5 @@ /* -2024 © Postgres.ai +2025 © PostgresAI */ package rdsrefresh diff --git a/engine/internal/rdsrefresh/refresher.go b/engine/internal/rdsrefresh/refresher.go index e0349383..a2455ca7 100644 --- a/engine/internal/rdsrefresh/refresher.go +++ b/engine/internal/rdsrefresh/refresher.go @@ -1,5 +1,5 @@ /* -2024 © Postgres.ai +2025 © PostgresAI */ package rdsrefresh diff --git a/rds-refresh/config.go b/rds-refresh/config.go index 588a9aa4..3e16ec30 
100644 --- a/rds-refresh/config.go +++ b/rds-refresh/config.go @@ -1,5 +1,5 @@ /* -2024 © Postgres.ai +2025 © PostgresAI */ package main diff --git a/rds-refresh/dblab.go b/rds-refresh/dblab.go index 57c590f4..25306874 100644 --- a/rds-refresh/dblab.go +++ b/rds-refresh/dblab.go @@ -1,5 +1,5 @@ /* -2024 © Postgres.ai +2025 © PostgresAI */ package main diff --git a/rds-refresh/main.go b/rds-refresh/main.go index e353b51a..452c1a99 100644 --- a/rds-refresh/main.go +++ b/rds-refresh/main.go @@ -1,5 +1,5 @@ /* -2024 © Postgres.ai +2025 © PostgresAI rds-refresh - Automate DBLab full refresh using RDS/Aurora snapshots diff --git a/rds-refresh/rds.go b/rds-refresh/rds.go index 91757d0b..6f6e0faf 100644 --- a/rds-refresh/rds.go +++ b/rds-refresh/rds.go @@ -1,5 +1,5 @@ /* -2024 © Postgres.ai +2025 © PostgresAI */ package main diff --git a/rds-refresh/refresher.go b/rds-refresh/refresher.go index 7d9bfc4f..74741f78 100644 --- a/rds-refresh/refresher.go +++ b/rds-refresh/refresher.go @@ -1,5 +1,5 @@ /* -2024 © Postgres.ai +2025 © PostgresAI */ package main From 869eba1e02f66a8f708d4126353fbf67936a263a Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 10 Dec 2025 03:18:52 +0000 Subject: [PATCH 4/6] feat(rds-refresh): add DBLab config update and remove Lambda code - Add UpdateSourceConfig method to update DBLab source connection before triggering refresh, ensuring DBLab knows the clone endpoint - Add source database credentials (dbName, username, password) to config for DBLab connection configuration - Remove Lambda-specific code, focusing on container/CLI deployment which better suits the long-running nature of refresh operations - Remove duplicate engine-integrated code (engine/internal/rdsrefresh and engine/cmd/rds-refresh) since standalone component is preferred - Update documentation with container deployment options (Docker, ECS Task, Kubernetes CronJob, CLI with cron) - Update workflow to include config update step before refresh trigger --- engine/cmd/rds-refresh/main.go 
| 176 -------- engine/deploy/rds-refresh/Dockerfile | 39 -- engine/deploy/rds-refresh/README.md | 356 --------------- engine/deploy/rds-refresh/iam-policy.json | 49 --- engine/deploy/rds-refresh/template.yaml | 241 ---------- engine/internal/rdsrefresh/config.go | 190 -------- engine/internal/rdsrefresh/dblab.go | 205 --------- engine/internal/rdsrefresh/lambda.go | 174 -------- engine/internal/rdsrefresh/rds.go | 509 ---------------------- engine/internal/rdsrefresh/refresher.go | 243 ----------- rds-refresh/Makefile | 22 +- rds-refresh/README.md | 296 ++++++++----- rds-refresh/config.example.yaml | 8 + rds-refresh/config.go | 18 + rds-refresh/dblab.go | 71 ++- rds-refresh/go.mod | 1 - rds-refresh/main.go | 219 ++-------- rds-refresh/refresher.go | 35 +- rds-refresh/template.yaml | 241 ---------- 19 files changed, 354 insertions(+), 2739 deletions(-) delete mode 100644 engine/cmd/rds-refresh/main.go delete mode 100644 engine/deploy/rds-refresh/Dockerfile delete mode 100644 engine/deploy/rds-refresh/README.md delete mode 100644 engine/deploy/rds-refresh/iam-policy.json delete mode 100644 engine/deploy/rds-refresh/template.yaml delete mode 100644 engine/internal/rdsrefresh/config.go delete mode 100644 engine/internal/rdsrefresh/dblab.go delete mode 100644 engine/internal/rdsrefresh/lambda.go delete mode 100644 engine/internal/rdsrefresh/rds.go delete mode 100644 engine/internal/rdsrefresh/refresher.go delete mode 100644 rds-refresh/template.yaml diff --git a/engine/cmd/rds-refresh/main.go b/engine/cmd/rds-refresh/main.go deleted file mode 100644 index 5d4eed4f..00000000 --- a/engine/cmd/rds-refresh/main.go +++ /dev/null @@ -1,176 +0,0 @@ -/* -2025 © PostgresAI -*/ - -// Package main provides the entry point for the rds-refresh CLI tool. -// This tool automates DBLab full refresh using temporary RDS/Aurora clones. 
-package main - -import ( - "context" - "flag" - "fmt" - "os" - "os/signal" - "syscall" - - "github.com/aws/aws-lambda-go/lambda" - - "gitlab.com/postgres-ai/database-lab/v3/internal/rdsrefresh" -) - -var ( - version = "dev" - buildTime = "unknown" -) - -func main() { - // Check if running in Lambda - if os.Getenv("AWS_LAMBDA_FUNCTION_NAME") != "" { - lambda.Start(rdsrefresh.HandleLambda) - return - } - - // CLI mode - configPath := flag.String("config", "", "Path to configuration file") - dryRun := flag.Bool("dry-run", false, "Validate configuration without creating resources") - showVersion := flag.Bool("version", false, "Show version information") - help := flag.Bool("help", false, "Show help") - - flag.Usage = printUsage - flag.Parse() - - if *help { - printUsage() - os.Exit(0) - } - - if *showVersion { - fmt.Printf("rds-refresh version %s (built: %s)\n", version, buildTime) - os.Exit(0) - } - - if *configPath == "" { - fmt.Fprintln(os.Stderr, "error: -config flag is required") - printUsage() - os.Exit(1) - } - - if err := run(*configPath, *dryRun); err != nil { - fmt.Fprintf(os.Stderr, "error: %v\n", err) - os.Exit(1) - } -} - -func run(configPath string, dryRun bool) error { - cfg, err := rdsrefresh.LoadConfig(configPath) - if err != nil { - return fmt.Errorf("failed to load config: %w", err) - } - - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - // Handle interrupt signals - sigCh := make(chan os.Signal, 1) - signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) - - go func() { - sig := <-sigCh - fmt.Printf("\nReceived signal %v, initiating graceful shutdown...\n", sig) - cancel() - }() - - logger := &rdsrefresh.DefaultLogger{} - - refresher, err := rdsrefresh.NewRefresher(ctx, cfg, logger) - if err != nil { - return fmt.Errorf("failed to initialize refresher: %w", err) - } - - if dryRun { - return refresher.DryRun(ctx) - } - - result := refresher.Run(ctx) - - fmt.Println() - fmt.Println("=== Refresh Summary ===") - 
fmt.Printf("Success: %v\n", result.Success) - fmt.Printf("Snapshot: %s\n", result.SnapshotID) - fmt.Printf("Clone ID: %s\n", result.CloneID) - fmt.Printf("Duration: %v\n", result.Duration.Round(1e9)) - - if result.Error != nil { - return result.Error - } - - return nil -} - -func printUsage() { - fmt.Fprintf(os.Stderr, `rds-refresh - Automate DBLab full refresh using RDS/Aurora snapshots - -This tool creates a temporary RDS/Aurora clone from a snapshot, triggers -a DBLab Engine full refresh, and then cleans up the temporary clone. - -USAGE: - rds-refresh -config [options] - -OPTIONS: - -config Path to YAML configuration file (required) - -dry-run Validate configuration without creating resources - -version Show version information - -help Show this help message - -LAMBDA MODE: - When running as an AWS Lambda function (detected via AWS_LAMBDA_FUNCTION_NAME - environment variable), configuration is loaded from environment variables: - - Required: - RDS_SOURCE_IDENTIFIER Source RDS instance or Aurora cluster ID - RDS_CLONE_INSTANCE_CLASS Instance class for the clone (e.g., db.t3.medium) - DBLAB_API_ENDPOINT DBLab Engine API endpoint - DBLAB_TOKEN DBLab verification token - AWS_REGION AWS region - - Optional: - RDS_SOURCE_TYPE "rds" or "aurora-cluster" (default: rds) - RDS_SNAPSHOT_IDENTIFIER Specific snapshot ID (default: latest) - RDS_CLONE_SUBNET_GROUP DB subnet group name - RDS_CLONE_SECURITY_GROUPS JSON array of security group IDs - RDS_CLONE_PUBLIC "true" to make clone publicly accessible - RDS_CLONE_PARAMETER_GROUP DB parameter group name - RDS_CLONE_ENABLE_IAM_AUTH "true" to enable IAM authentication - RDS_CLONE_STORAGE_TYPE Storage type (gp2, gp3, io1, etc.) 
- RDS_CLONE_TAGS JSON object of additional tags - DBLAB_INSECURE "true" to skip TLS verification - -EXAMPLE CONFIGURATION: - - source: - type: rds - identifier: production-db - - clone: - instanceClass: db.t3.medium - subnetGroup: default-vpc-subnet - securityGroups: - - sg-12345678 - publiclyAccessible: false - enableIAMAuth: true - - dblab: - apiEndpoint: https://dblab.example.com:2345 - token: ${DBLAB_TOKEN} - pollInterval: 30s - timeout: 4h - - aws: - region: us-east-1 - -For more information, see: - https://postgres.ai/docs/database-lab-engine - -`) -} diff --git a/engine/deploy/rds-refresh/Dockerfile b/engine/deploy/rds-refresh/Dockerfile deleted file mode 100644 index 4ff75443..00000000 --- a/engine/deploy/rds-refresh/Dockerfile +++ /dev/null @@ -1,39 +0,0 @@ -# Build stage -FROM golang:1.23-alpine AS builder - -RUN apk add --no-cache git ca-certificates - -WORKDIR /build - -# Copy go mod files first for better caching -COPY engine/go.mod engine/go.sum ./ -RUN go mod download - -# Copy source code -COPY engine/ ./ - -# Build the binary -ARG VERSION=dev -ARG BUILD_TIME=unknown - -RUN CGO_ENABLED=0 GOOS=linux go build \ - -ldflags="-s -w -X main.version=${VERSION} -X main.buildTime=${BUILD_TIME}" \ - -o /rds-refresh \ - ./cmd/rds-refresh - -# Runtime stage -FROM alpine:3.19 - -RUN apk add --no-cache ca-certificates tzdata - -# Create non-root user -RUN adduser -D -u 1000 appuser - -WORKDIR /app - -COPY --from=builder /rds-refresh /usr/local/bin/rds-refresh - -USER appuser - -ENTRYPOINT ["/usr/local/bin/rds-refresh"] -CMD ["--help"] diff --git a/engine/deploy/rds-refresh/README.md b/engine/deploy/rds-refresh/README.md deleted file mode 100644 index d7d46c82..00000000 --- a/engine/deploy/rds-refresh/README.md +++ /dev/null @@ -1,356 +0,0 @@ -# DBLab RDS/Aurora Refresh Component - -Automates DBLab Engine full refresh using temporary RDS or Aurora clones created from snapshots. 
- -## Overview - -This component provides a hassle-free way to keep your DBLab Engine data synchronized with your production RDS/Aurora database. It: - -1. **Creates a temporary clone** from the latest RDS/Aurora snapshot -2. **Triggers DBLab full refresh** to sync data from the clone -3. **Deletes the temporary clone** after refresh completes - -This approach avoids impacting your production database during the data sync process. - -## Deployment Options - -### Option 1: AWS Lambda (Recommended) - -Deploy as a serverless function with automatic scheduling via EventBridge. - -#### Prerequisites - -- [AWS SAM CLI](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/install-sam-cli.html) -- AWS credentials configured -- Go 1.21+ (for building) - -#### Quick Start - -```bash -# Clone the repository -git clone https://gitlab.com/postgres-ai/database-lab.git -cd database-lab/engine/deploy/rds-refresh - -# Build and deploy -sam build -sam deploy --guided -``` - -During guided deployment, you'll be prompted for: - -| Parameter | Description | Example | -|-----------|-------------|---------| -| `RDSSourceType` | `rds` or `aurora-cluster` | `rds` | -| `RDSSourceIdentifier` | Source DB identifier | `production-db` | -| `RDSCloneInstanceClass` | Clone instance size | `db.t3.medium` | -| `DBLabAPIEndpoint` | DBLab API URL | `https://dblab.example.com:2345` | -| `DBLabToken` | DBLab verification token | `your-secret-token` | -| `ScheduleExpression` | Refresh schedule | `rate(7 days)` | - -#### Manual Invocation - -```bash -# Dry run (validates configuration) -aws lambda invoke --function-name dblab-rds-refresh \ - --cli-binary-format raw-in-base64-out \ - --payload '{"dryRun": true}' \ - response.json && cat response.json - -# Full refresh -aws lambda invoke --function-name dblab-rds-refresh \ - --cli-binary-format raw-in-base64-out \ - --payload '{"dryRun": false}' \ - response.json && cat response.json -``` - -### Option 2: CLI Binary - -Run as a 
standalone binary via cron or systemd timer. - -#### Build - -```bash -cd engine -go build -o rds-refresh ./cmd/rds-refresh -``` - -#### Usage - -```bash -# Dry run -./rds-refresh -config config.yaml -dry-run - -# Full refresh -./rds-refresh -config config.yaml -``` - -#### Example Configuration - -```yaml -# config.yaml -source: - type: rds # or aurora-cluster - identifier: production-db # RDS instance or Aurora cluster ID - # snapshotIdentifier: "" # optional: specific snapshot (default: latest) - -clone: - instanceClass: db.t3.medium # smaller than prod for cost savings - subnetGroup: default-vpc # same VPC as DBLab Engine - securityGroups: - - sg-12345678 # must allow DBLab to connect - publiclyAccessible: false - enableIAMAuth: true # recommended for secure access - # parameterGroup: "" # optional: custom parameter group - # storageType: gp3 # optional: storage type - -dblab: - apiEndpoint: https://dblab.example.com:2345 - token: ${DBLAB_TOKEN} # environment variable expansion - pollInterval: 30s - timeout: 4h - -aws: - region: us-east-1 -``` - -#### Cron Example - -```bash -# Run every Sunday at 2 AM -0 2 * * 0 /usr/local/bin/rds-refresh -config /etc/dblab/rds-refresh.yaml >> /var/log/rds-refresh.log 2>&1 -``` - -### Option 3: Docker Container - -```bash -# Build (from repository root) -docker build -t dblab-rds-refresh -f engine/deploy/rds-refresh/Dockerfile . 
- -# Run -docker run -v /path/to/config.yaml:/config.yaml \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DBLAB_TOKEN \ - dblab-rds-refresh -config /config.yaml -``` - -### Option 4: Kubernetes CronJob - -```yaml -apiVersion: batch/v1 -kind: CronJob -metadata: - name: dblab-rds-refresh -spec: - schedule: "0 2 * * 0" # Every Sunday at 2 AM - jobTemplate: - spec: - template: - spec: - serviceAccountName: dblab-rds-refresh # with IRSA - containers: - - name: rds-refresh - image: postgresai/rds-refresh:latest - args: ["-config", "/config/config.yaml"] - volumeMounts: - - name: config - mountPath: /config - env: - - name: DBLAB_TOKEN - valueFrom: - secretKeyRef: - name: dblab-secrets - key: token - volumes: - - name: config - configMap: - name: rds-refresh-config - restartPolicy: OnFailure -``` - -## AWS IAM Permissions - -### Minimal IAM Policy - -```json -{ - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "RDSReadSnapshots", - "Effect": "Allow", - "Action": [ - "rds:DescribeDBSnapshots", - "rds:DescribeDBClusterSnapshots", - "rds:DescribeDBInstances", - "rds:DescribeDBClusters" - ], - "Resource": "*" - }, - { - "Sid": "RDSCreateClone", - "Effect": "Allow", - "Action": [ - "rds:RestoreDBInstanceFromDBSnapshot", - "rds:RestoreDBClusterFromSnapshot", - "rds:CreateDBInstance", - "rds:AddTagsToResource", - "rds:ModifyDBInstance", - "rds:ModifyDBCluster" - ], - "Resource": [ - "arn:aws:rds:*:ACCOUNT_ID:db:dblab-refresh-*", - "arn:aws:rds:*:ACCOUNT_ID:cluster:dblab-refresh-*", - "arn:aws:rds:*:ACCOUNT_ID:snapshot:*", - "arn:aws:rds:*:ACCOUNT_ID:cluster-snapshot:*", - "arn:aws:rds:*:ACCOUNT_ID:subgrp:*", - "arn:aws:rds:*:ACCOUNT_ID:pg:*", - "arn:aws:rds:*:ACCOUNT_ID:og:*" - ] - }, - { - "Sid": "RDSDeleteClone", - "Effect": "Allow", - "Action": [ - "rds:DeleteDBInstance", - "rds:DeleteDBCluster" - ], - "Resource": [ - "arn:aws:rds:*:ACCOUNT_ID:db:dblab-refresh-*", - "arn:aws:rds:*:ACCOUNT_ID:cluster:dblab-refresh-*" - ] - } - ] -} -``` - -Replace 
`ACCOUNT_ID` with your AWS account ID. - -### For IAM Database Authentication - -If using RDS IAM authentication (recommended), the DBLab Engine also needs: - -```json -{ - "Sid": "RDSIAMConnect", - "Effect": "Allow", - "Action": "rds-db:connect", - "Resource": "arn:aws:rds-db:*:ACCOUNT_ID:dbuser:*/dblab_user" -} -``` - -## DBLab Engine Configuration - -Configure DBLab Engine to connect to the temporary clone using RDS IAM authentication: - -```yaml -# server.yml (DBLab Engine config) -retrieval: - refresh: - timetable: "" # Disable built-in scheduler (managed externally) - skipStartRefresh: true - - jobs: - - logicalDump - - logicalRestore - - logicalSnapshot - - spec: - logicalDump: - options: - dockerImage: "postgresai/extended-postgres:17" - dumpLocation: "/var/lib/dblab/dblab_pool/dump" - - source: - type: rdsIam - connection: - dbname: mydb - username: dblab_user - rdsIam: - awsRegion: us-east-1 - # This will be updated by rds-refresh or pre-configured - dbInstanceIdentifier: dblab-refresh-current - sslRootCert: "/cert/rds-combined-ca-bundle.pem" - - parallelJobs: 4 - customOptions: - - "--exclude-schema=rdsdms" -``` - -## Security Best Practices - -1. **Use IAM Database Authentication** - Avoid storing database passwords -2. **Use Secrets Manager** - Store the DBLab token in AWS Secrets Manager -3. **VPC Configuration** - Run clones in a private subnet accessible only to DBLab -4. **Minimal Permissions** - Use the minimal IAM policy above -5. 
**Encryption** - Ensure clones inherit encryption from snapshots - -## Monitoring - -### CloudWatch Metrics (Lambda) - -The Lambda function emits standard metrics: -- `Invocations` - Number of refresh attempts -- `Errors` - Failed refreshes -- `Duration` - Execution time - -### Custom CloudWatch Dashboard - -```bash -# View recent logs -aws logs tail /aws/lambda/dblab-rds-refresh --follow -``` - -### Alerting - -Set up CloudWatch Alarms for: -- Lambda errors > 0 -- Lambda duration > threshold -- (Optional) Custom metrics on refresh success/failure - -## Troubleshooting - -### Common Issues - -**Clone creation fails with "DBSubnetGroup not found"** -- Ensure the subnet group exists and is in the same VPC - -**Clone creation fails with "VPCSecurityGroupNotFound"** -- Verify security group IDs are correct - -**DBLab refresh timeout** -- Increase `dblab.timeout` in configuration -- Check DBLab Engine logs for issues - -**Clone not accessible from DBLab** -- Verify security groups allow connection from DBLab -- Check if publiclyAccessible setting is correct - -### Debug Mode - -```bash -# CLI: Enable verbose logging -./rds-refresh -config config.yaml 2>&1 | tee refresh.log - -# Lambda: Check CloudWatch logs -aws logs tail /aws/lambda/dblab-rds-refresh --since 1h -``` - -## Cost Considerations - -- **Clone runtime**: You pay for the clone instance while it exists -- **Storage**: Clones don't duplicate storage (snapshot-based) -- **Lambda**: Minimal cost (typically < $0.10/month for weekly refreshes) - -**Cost optimization tips**: -- Use a smaller instance class than production -- Use `gp3` storage type for better price/performance -- Schedule refreshes during off-peak hours - -## Contributing - -See the main [Database Lab Engine contributing guide](../../CONTRIBUTING.md). - -## License - -Apache 2.0 - see [LICENSE](../../LICENSE). 
diff --git a/engine/deploy/rds-refresh/iam-policy.json b/engine/deploy/rds-refresh/iam-policy.json deleted file mode 100644 index deb13f67..00000000 --- a/engine/deploy/rds-refresh/iam-policy.json +++ /dev/null @@ -1,49 +0,0 @@ -{ - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "RDSReadSnapshots", - "Effect": "Allow", - "Action": [ - "rds:DescribeDBSnapshots", - "rds:DescribeDBClusterSnapshots", - "rds:DescribeDBInstances", - "rds:DescribeDBClusters" - ], - "Resource": "*" - }, - { - "Sid": "RDSCreateClone", - "Effect": "Allow", - "Action": [ - "rds:RestoreDBInstanceFromDBSnapshot", - "rds:RestoreDBClusterFromSnapshot", - "rds:CreateDBInstance", - "rds:AddTagsToResource", - "rds:ModifyDBInstance", - "rds:ModifyDBCluster" - ], - "Resource": [ - "arn:aws:rds:*:*:db:dblab-refresh-*", - "arn:aws:rds:*:*:cluster:dblab-refresh-*", - "arn:aws:rds:*:*:snapshot:*", - "arn:aws:rds:*:*:cluster-snapshot:*", - "arn:aws:rds:*:*:subgrp:*", - "arn:aws:rds:*:*:pg:*", - "arn:aws:rds:*:*:og:*" - ] - }, - { - "Sid": "RDSDeleteClone", - "Effect": "Allow", - "Action": [ - "rds:DeleteDBInstance", - "rds:DeleteDBCluster" - ], - "Resource": [ - "arn:aws:rds:*:*:db:dblab-refresh-*", - "arn:aws:rds:*:*:cluster:dblab-refresh-*" - ] - } - ] -} diff --git a/engine/deploy/rds-refresh/template.yaml b/engine/deploy/rds-refresh/template.yaml deleted file mode 100644 index 6a7f7826..00000000 --- a/engine/deploy/rds-refresh/template.yaml +++ /dev/null @@ -1,241 +0,0 @@ -AWSTemplateFormatVersion: '2010-09-09' -Transform: AWS::Serverless-2016-10-31 -Description: > - DBLab RDS/Aurora Refresh Lambda - - Automates DBLab full refresh using temporary RDS/Aurora clones created from snapshots. 
- -Metadata: - AWS::ServerlessRepo::Application: - Name: dblab-rds-refresh - Description: Automates DBLab full refresh using temporary RDS/Aurora clones - Author: Postgres.ai - SpdxLicenseId: Apache-2.0 - Labels: ['dblab', 'rds', 'aurora', 'postgresql', 'database'] - HomePageUrl: https://postgres.ai - SourceCodeUrl: https://gitlab.com/postgres-ai/database-lab - -Parameters: - # Source Configuration - RDSSourceType: - Type: String - Default: rds - AllowedValues: - - rds - - aurora-cluster - Description: Type of source database (rds for RDS instance, aurora-cluster for Aurora) - - RDSSourceIdentifier: - Type: String - Description: RDS DB instance identifier or Aurora cluster identifier - - RDSSnapshotIdentifier: - Type: String - Default: '' - Description: Specific snapshot ID to use (leave empty for latest automated snapshot) - - # Clone Configuration - RDSCloneInstanceClass: - Type: String - Default: db.t3.medium - Description: Instance class for the temporary clone - - RDSCloneSubnetGroup: - Type: String - Default: '' - Description: DB subnet group name for the clone - - RDSCloneSecurityGroups: - Type: CommaDelimitedList - Default: '' - Description: Comma-separated list of VPC security group IDs - - RDSClonePubliclyAccessible: - Type: String - Default: 'false' - AllowedValues: - - 'true' - - 'false' - Description: Whether the clone should be publicly accessible - - RDSCloneEnableIAMAuth: - Type: String - Default: 'true' - AllowedValues: - - 'true' - - 'false' - Description: Enable IAM database authentication on the clone - - RDSCloneParameterGroup: - Type: String - Default: '' - Description: DB parameter group name for the clone - - RDSCloneStorageType: - Type: String - Default: '' - Description: Storage type for the clone (gp2, gp3, io1, etc.) 
- - # DBLab Configuration - DBLabAPIEndpoint: - Type: String - Description: DBLab Engine API endpoint (e.g., https://dblab.example.com:2345) - - DBLabToken: - Type: String - NoEcho: true - Description: DBLab verification token - - DBLabInsecure: - Type: String - Default: 'false' - AllowedValues: - - 'true' - - 'false' - Description: Skip TLS certificate verification for DBLab API - - # Schedule Configuration - ScheduleExpression: - Type: String - Default: 'rate(7 days)' - Description: Schedule expression for automatic refresh (e.g., 'rate(7 days)' or 'cron(0 2 ? * SUN *)') - - EnableSchedule: - Type: String - Default: 'true' - AllowedValues: - - 'true' - - 'false' - Description: Enable scheduled automatic refresh - - # Lambda Configuration - LambdaTimeout: - Type: Number - Default: 900 - MinValue: 60 - MaxValue: 900 - Description: Lambda function timeout in seconds (max 15 minutes) - - LambdaMemorySize: - Type: Number - Default: 256 - MinValue: 128 - MaxValue: 1024 - Description: Lambda function memory size in MB - -Conditions: - ScheduleEnabled: !Equals [!Ref EnableSchedule, 'true'] - HasSubnetGroup: !Not [!Equals [!Ref RDSCloneSubnetGroup, '']] - HasSecurityGroups: !Not [!Equals [!Join ['', !Ref RDSCloneSecurityGroups], '']] - HasParameterGroup: !Not [!Equals [!Ref RDSCloneParameterGroup, '']] - HasStorageType: !Not [!Equals [!Ref RDSCloneStorageType, '']] - HasSnapshotId: !Not [!Equals [!Ref RDSSnapshotIdentifier, '']] - -Globals: - Function: - Timeout: !Ref LambdaTimeout - MemorySize: !Ref LambdaMemorySize - Runtime: provided.al2023 - Architectures: - - arm64 - -Resources: - RDSRefreshFunction: - Type: AWS::Serverless::Function - Metadata: - BuildMethod: go1.x - Properties: - CodeUri: ../../ - Handler: bootstrap - Description: Automates DBLab full refresh using temporary RDS/Aurora clones - Environment: - Variables: - RDS_SOURCE_TYPE: !Ref RDSSourceType - RDS_SOURCE_IDENTIFIER: !Ref RDSSourceIdentifier - RDS_SNAPSHOT_IDENTIFIER: !If [HasSnapshotId, !Ref 
RDSSnapshotIdentifier, ''] - RDS_CLONE_INSTANCE_CLASS: !Ref RDSCloneInstanceClass - RDS_CLONE_SUBNET_GROUP: !If [HasSubnetGroup, !Ref RDSCloneSubnetGroup, ''] - RDS_CLONE_SECURITY_GROUPS: !If [HasSecurityGroups, !Sub '["${RDSCloneSecurityGroups}"]', ''] - RDS_CLONE_PUBLIC: !Ref RDSClonePubliclyAccessible - RDS_CLONE_ENABLE_IAM_AUTH: !Ref RDSCloneEnableIAMAuth - RDS_CLONE_PARAMETER_GROUP: !If [HasParameterGroup, !Ref RDSCloneParameterGroup, ''] - RDS_CLONE_STORAGE_TYPE: !If [HasStorageType, !Ref RDSCloneStorageType, ''] - DBLAB_API_ENDPOINT: !Ref DBLabAPIEndpoint - DBLAB_TOKEN: !Ref DBLabToken - DBLAB_INSECURE: !Ref DBLabInsecure - Policies: - - Version: '2012-10-17' - Statement: - - Sid: RDSReadSnapshots - Effect: Allow - Action: - - rds:DescribeDBSnapshots - - rds:DescribeDBClusterSnapshots - - rds:DescribeDBInstances - - rds:DescribeDBClusters - Resource: '*' - - Sid: RDSCreateClone - Effect: Allow - Action: - - rds:RestoreDBInstanceFromDBSnapshot - - rds:RestoreDBClusterFromSnapshot - - rds:CreateDBInstance - - rds:AddTagsToResource - - rds:ModifyDBInstance - - rds:ModifyDBCluster - Resource: - - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:db:dblab-refresh-*' - - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:cluster:dblab-refresh-*' - - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:snapshot:*' - - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:cluster-snapshot:*' - - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:subgrp:*' - - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:pg:*' - - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:og:*' - - Sid: RDSDeleteClone - Effect: Allow - Action: - - rds:DeleteDBInstance - - rds:DeleteDBCluster - Resource: - - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:db:dblab-refresh-*' - - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:cluster:dblab-refresh-*' - Events: - ScheduledRefresh: - Type: Schedule - Properties: - Schedule: !Ref ScheduleExpression - Description: Scheduled DBLab refresh 
trigger - Enabled: !If [ScheduleEnabled, true, false] - - RDSRefreshLogGroup: - Type: AWS::Logs::LogGroup - Properties: - LogGroupName: !Sub '/aws/lambda/${RDSRefreshFunction}' - RetentionInDays: 30 - -Outputs: - RDSRefreshFunctionArn: - Description: ARN of the RDS Refresh Lambda function - Value: !GetAtt RDSRefreshFunction.Arn - Export: - Name: !Sub '${AWS::StackName}-FunctionArn' - - RDSRefreshFunctionName: - Description: Name of the RDS Refresh Lambda function - Value: !Ref RDSRefreshFunction - - InvocationCommand: - Description: AWS CLI command to manually invoke the function - Value: !Sub | - aws lambda invoke --function-name ${RDSRefreshFunction} \ - --cli-binary-format raw-in-base64-out \ - --payload '{"dryRun": false}' \ - response.json && cat response.json - - DryRunCommand: - Description: AWS CLI command to run a dry-run test - Value: !Sub | - aws lambda invoke --function-name ${RDSRefreshFunction} \ - --cli-binary-format raw-in-base64-out \ - --payload '{"dryRun": true}' \ - response.json && cat response.json diff --git a/engine/internal/rdsrefresh/config.go b/engine/internal/rdsrefresh/config.go deleted file mode 100644 index 6bc90643..00000000 --- a/engine/internal/rdsrefresh/config.go +++ /dev/null @@ -1,190 +0,0 @@ -/* -2025 © PostgresAI -*/ - -// Package rdsrefresh provides functionality to automate DBLab full refresh -// using temporary RDS/Aurora clones created from snapshots. -package rdsrefresh - -import ( - "fmt" - "os" - "time" - - "gopkg.in/yaml.v3" -) - -// Config holds the configuration for the RDS refresh component. -type Config struct { - Source SourceConfig `yaml:"source"` - Clone CloneConfig `yaml:"clone"` - DBLab DBLabConfig `yaml:"dblab"` - AWS AWSConfig `yaml:"aws"` -} - -// SourceConfig defines the source RDS/Aurora database to clone from. -type SourceConfig struct { - // Type specifies the source type: "rds" for RDS instance, "aurora-cluster" for Aurora cluster. 
- Type string `yaml:"type"` - // Identifier is the RDS DB instance identifier or Aurora cluster identifier. - Identifier string `yaml:"identifier"` - // SnapshotIdentifier is the specific snapshot to use. If empty, the latest automated snapshot is used. - SnapshotIdentifier string `yaml:"snapshotIdentifier"` -} - -// CloneConfig defines settings for the temporary clone. -type CloneConfig struct { - // InstanceClass is the DB instance class for the clone (e.g., "db.t3.medium"). - InstanceClass string `yaml:"instanceClass"` - // DBSubnetGroupName is the DB subnet group for the clone. - DBSubnetGroupName string `yaml:"subnetGroup"` - // VPCSecurityGroupIDs are the security group IDs to assign to the clone. - VPCSecurityGroupIDs []string `yaml:"securityGroups"` - // PubliclyAccessible determines if the clone should be publicly accessible. - PubliclyAccessible bool `yaml:"publiclyAccessible"` - // Tags are additional tags to add to the clone. - Tags map[string]string `yaml:"tags"` - // ParameterGroupName is the parameter group to use for the clone. - ParameterGroupName string `yaml:"parameterGroup"` - // OptionGroupName is the option group to use for the clone (RDS only). - OptionGroupName string `yaml:"optionGroup"` - // DBClusterParameterGroupName is the cluster parameter group for Aurora clones. - DBClusterParameterGroupName string `yaml:"clusterParameterGroup"` - // EngineVersion specifies the engine version for the clone. If empty, uses source version. - EngineVersion string `yaml:"engineVersion"` - // Port is the port for the clone. If 0, uses default port. - Port int32 `yaml:"port"` - // EnableIAMAuth enables IAM database authentication. - EnableIAMAuth bool `yaml:"enableIAMAuth"` - // StorageType specifies storage type (gp2, gp3, io1, etc.) for RDS clones. - StorageType string `yaml:"storageType"` - // DeletionProtection enables deletion protection on the clone. 
- DeletionProtection bool `yaml:"deletionProtection"` -} - -// DBLabConfig defines the DBLab Engine connection settings. -type DBLabConfig struct { - // APIEndpoint is the DBLab Engine API endpoint (e.g., "https://dblab.example.com:2345"). - APIEndpoint string `yaml:"apiEndpoint"` - // Token is the verification token for the DBLab API. - Token string `yaml:"token"` - // Insecure allows connections to DBLab with invalid TLS certificates. - Insecure bool `yaml:"insecure"` - // PollInterval is how often to poll the DBLab status during refresh. - PollInterval Duration `yaml:"pollInterval"` - // Timeout is the maximum time to wait for the refresh to complete. - Timeout Duration `yaml:"timeout"` -} - -// AWSConfig holds AWS-specific settings. -type AWSConfig struct { - // Region is the AWS region where the RDS/Aurora resources are located. - Region string `yaml:"region"` - // Endpoint is a custom AWS endpoint (useful for testing with LocalStack). - Endpoint string `yaml:"endpoint"` -} - -// Duration is a wrapper around time.Duration for YAML parsing. -type Duration time.Duration - -// UnmarshalYAML implements yaml.Unmarshaler for Duration. -func (d *Duration) UnmarshalYAML(value *yaml.Node) error { - var s string - if err := value.Decode(&s); err != nil { - return err - } - - dur, err := time.ParseDuration(s) - if err != nil { - return fmt.Errorf("invalid duration %q: %w", s, err) - } - - *d = Duration(dur) - - return nil -} - -// MarshalYAML implements yaml.Marshaler for Duration. -func (d Duration) MarshalYAML() (interface{}, error) { - return time.Duration(d).String(), nil -} - -// Duration returns the time.Duration value. -func (d Duration) Duration() time.Duration { - return time.Duration(d) -} - -// LoadConfig loads configuration from a YAML file. 
-func LoadConfig(path string) (*Config, error) { - data, err := os.ReadFile(path) - if err != nil { - return nil, fmt.Errorf("failed to read config file: %w", err) - } - - // Expand environment variables in the config - data = []byte(os.ExpandEnv(string(data))) - - var cfg Config - if err := yaml.Unmarshal(data, &cfg); err != nil { - return nil, fmt.Errorf("failed to parse config file: %w", err) - } - - if err := cfg.Validate(); err != nil { - return nil, fmt.Errorf("invalid configuration: %w", err) - } - - cfg.SetDefaults() - - return &cfg, nil -} - -// Validate checks that the configuration is valid. -func (c *Config) Validate() error { - if c.Source.Type == "" { - return fmt.Errorf("source.type is required (rds or aurora-cluster)") - } - - if c.Source.Type != "rds" && c.Source.Type != "aurora-cluster" { - return fmt.Errorf("source.type must be 'rds' or 'aurora-cluster', got %q", c.Source.Type) - } - - if c.Source.Identifier == "" { - return fmt.Errorf("source.identifier is required") - } - - if c.Clone.InstanceClass == "" { - return fmt.Errorf("clone.instanceClass is required") - } - - if c.DBLab.APIEndpoint == "" { - return fmt.Errorf("dblab.apiEndpoint is required") - } - - if c.DBLab.Token == "" { - return fmt.Errorf("dblab.token is required") - } - - if c.AWS.Region == "" { - return fmt.Errorf("aws.region is required") - } - - return nil -} - -// SetDefaults sets default values for optional configuration fields. 
-func (c *Config) SetDefaults() { - if c.DBLab.PollInterval == 0 { - c.DBLab.PollInterval = Duration(30 * time.Second) - } - - if c.DBLab.Timeout == 0 { - c.DBLab.Timeout = Duration(4 * time.Hour) - } - - if c.Clone.Tags == nil { - c.Clone.Tags = make(map[string]string) - } - - c.Clone.Tags["ManagedBy"] = "dblab-rds-refresh" - c.Clone.Tags["AutoDelete"] = "true" -} diff --git a/engine/internal/rdsrefresh/dblab.go b/engine/internal/rdsrefresh/dblab.go deleted file mode 100644 index 90b86144..00000000 --- a/engine/internal/rdsrefresh/dblab.go +++ /dev/null @@ -1,205 +0,0 @@ -/* -2025 © PostgresAI -*/ - -package rdsrefresh - -import ( - "bytes" - "context" - "crypto/tls" - "encoding/json" - "fmt" - "io" - "net/http" - "time" - - "gitlab.com/postgres-ai/database-lab/v3/pkg/models" -) - -const ( - verificationHeader = "Verification-Token" - contentTypeJSON = "application/json" -) - -// DBLabClient provides methods to interact with the DBLab Engine API. -type DBLabClient struct { - baseURL string - token string - httpClient *http.Client -} - -// NewDBLabClient creates a new DBLab API client. -func NewDBLabClient(cfg *DBLabConfig) *DBLabClient { - transport := &http.Transport{ - TLSClientConfig: &tls.Config{InsecureSkipVerify: cfg.Insecure}, - } - - return &DBLabClient{ - baseURL: cfg.APIEndpoint, - token: cfg.Token, - httpClient: &http.Client{ - Transport: transport, - Timeout: 60 * time.Second, - }, - } -} - -// GetStatus returns the current DBLab Engine instance status. -func (c *DBLabClient) GetStatus(ctx context.Context) (*models.InstanceStatus, error) { - resp, err := c.doRequest(ctx, http.MethodGet, "/status", nil) - if err != nil { - return nil, err - } - defer resp.Body.Close() - - var status models.InstanceStatus - if err := json.NewDecoder(resp.Body).Decode(&status); err != nil { - return nil, fmt.Errorf("failed to decode status response: %w", err) - } - - return &status, nil -} - -// TriggerFullRefresh triggers a full data refresh on the DBLab Engine. 
-func (c *DBLabClient) TriggerFullRefresh(ctx context.Context) error { - resp, err := c.doRequest(ctx, http.MethodPost, "/full-refresh", nil) - if err != nil { - return err - } - defer resp.Body.Close() - - var result models.Response - if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { - return fmt.Errorf("failed to decode response: %w", err) - } - - if result.Status != "OK" { - return fmt.Errorf("full refresh failed: %s", result.Message) - } - - return nil -} - -// UpdateConfig updates the DBLab Engine configuration. -func (c *DBLabClient) UpdateConfig(ctx context.Context, configPatch map[string]interface{}) error { - body, err := json.Marshal(configPatch) - if err != nil { - return fmt.Errorf("failed to marshal config: %w", err) - } - - resp, err := c.doRequest(ctx, http.MethodPatch, "/config", bytes.NewReader(body)) - if err != nil { - return err - } - defer resp.Body.Close() - - return nil -} - -// WaitForRefreshComplete polls the DBLab status until refresh is complete or timeout. 
-func (c *DBLabClient) WaitForRefreshComplete(ctx context.Context, pollInterval, timeout time.Duration) error { - ticker := time.NewTicker(pollInterval) - defer ticker.Stop() - - timeoutTimer := time.NewTimer(timeout) - defer timeoutTimer.Stop() - - for { - select { - case <-ctx.Done(): - return ctx.Err() - case <-timeoutTimer.C: - return fmt.Errorf("timeout waiting for refresh to complete after %v", timeout) - case <-ticker.C: - status, err := c.GetStatus(ctx) - if err != nil { - return fmt.Errorf("failed to get status: %w", err) - } - - retrievalStatus := status.Retrieving.Status - - switch retrievalStatus { - case models.Finished: - return nil - case models.Failed: - if len(status.Retrieving.Alerts) > 0 { - for _, alert := range status.Retrieving.Alerts { - return fmt.Errorf("refresh failed: %s", alert.Message) - } - } - - return fmt.Errorf("refresh failed (no details available)") - case models.Refreshing, models.Snapshotting, models.Renewed: - // still in progress - continue - case models.Inactive, models.Pending: - // not started yet or pending - continue - default: - continue - } - } - } -} - -// IsRefreshInProgress checks if a refresh is currently in progress. -func (c *DBLabClient) IsRefreshInProgress(ctx context.Context) (bool, error) { - status, err := c.GetStatus(ctx) - if err != nil { - return false, err - } - - switch status.Retrieving.Status { - case models.Refreshing, models.Snapshotting: - return true, nil - default: - return false, nil - } -} - -// Health checks if the DBLab Engine is healthy. 
-func (c *DBLabClient) Health(ctx context.Context) error { - resp, err := c.doRequest(ctx, http.MethodGet, "/healthz", nil) - if err != nil { - return err - } - defer resp.Body.Close() - - return nil -} - -func (c *DBLabClient) doRequest(ctx context.Context, method, path string, body io.Reader) (*http.Response, error) { - url := c.baseURL + path - - req, err := http.NewRequestWithContext(ctx, method, url, body) - if err != nil { - return nil, fmt.Errorf("failed to create request: %w", err) - } - - req.Header.Set(verificationHeader, c.token) - - if body != nil { - req.Header.Set("Content-Type", contentTypeJSON) - } - - resp, err := c.httpClient.Do(req) - if err != nil { - return nil, fmt.Errorf("request failed: %w", err) - } - - if resp.StatusCode >= http.StatusBadRequest { - defer resp.Body.Close() - - bodyBytes, _ := io.ReadAll(resp.Body) - - var errModel models.Error - if err := json.Unmarshal(bodyBytes, &errModel); err == nil && errModel.Message != "" { - return nil, fmt.Errorf("API error (status %d): %s", resp.StatusCode, errModel.Message) - } - - return nil, fmt.Errorf("API error (status %d): %s", resp.StatusCode, string(bodyBytes)) - } - - return resp, nil -} diff --git a/engine/internal/rdsrefresh/lambda.go b/engine/internal/rdsrefresh/lambda.go deleted file mode 100644 index bddaa00d..00000000 --- a/engine/internal/rdsrefresh/lambda.go +++ /dev/null @@ -1,174 +0,0 @@ -/* -2025 © PostgresAI -*/ - -package rdsrefresh - -import ( - "context" - "encoding/json" - "fmt" - "os" -) - -// LambdaEvent is the input event for the Lambda function. -type LambdaEvent struct { - // DryRun, if true, only validates configuration without creating resources. - DryRun bool `json:"dryRun"` - // ConfigOverrides allows overriding configuration values. - ConfigOverrides *ConfigOverrides `json:"configOverrides"` -} - -// ConfigOverrides allows partial configuration overrides via the Lambda event. 
-type ConfigOverrides struct { - SnapshotIdentifier string `json:"snapshotIdentifier"` -} - -// LambdaResponse is the output response from the Lambda function. -type LambdaResponse struct { - Success bool `json:"success"` - Message string `json:"message"` - SnapshotID string `json:"snapshotId,omitempty"` - CloneID string `json:"cloneId,omitempty"` - CloneEndpoint string `json:"cloneEndpoint,omitempty"` - DurationSec int64 `json:"durationSeconds,omitempty"` - Error string `json:"error,omitempty"` -} - -// LambdaLogger implements Logger for Lambda/CloudWatch. -type LambdaLogger struct{} - -// Info logs an info message. -func (l *LambdaLogger) Info(msg string, args ...interface{}) { - fmt.Printf("[INFO] "+msg+"\n", args...) -} - -// Error logs an error message. -func (l *LambdaLogger) Error(msg string, args ...interface{}) { - fmt.Printf("[ERROR] "+msg+"\n", args...) -} - -// Debug logs a debug message. -func (l *LambdaLogger) Debug(msg string, args ...interface{}) { - fmt.Printf("[DEBUG] "+msg+"\n", args...) -} - -// HandleLambda is the Lambda function handler. 
-func HandleLambda(ctx context.Context, event LambdaEvent) (LambdaResponse, error) { - logger := &LambdaLogger{} - - cfg, err := loadLambdaConfig() - if err != nil { - return LambdaResponse{ - Success: false, - Error: err.Error(), - Message: "failed to load configuration", - }, nil - } - - // Apply overrides - if event.ConfigOverrides != nil && event.ConfigOverrides.SnapshotIdentifier != "" { - cfg.Source.SnapshotIdentifier = event.ConfigOverrides.SnapshotIdentifier - } - - refresher, err := NewRefresher(ctx, cfg, logger) - if err != nil { - return LambdaResponse{ - Success: false, - Error: err.Error(), - Message: "failed to initialize refresher", - }, nil - } - - if event.DryRun { - if err := refresher.DryRun(ctx); err != nil { - return LambdaResponse{ - Success: false, - Error: err.Error(), - Message: "dry run failed", - }, nil - } - - return LambdaResponse{ - Success: true, - Message: "dry run completed successfully", - }, nil - } - - result := refresher.Run(ctx) - - resp := LambdaResponse{ - Success: result.Success, - SnapshotID: result.SnapshotID, - CloneID: result.CloneID, - CloneEndpoint: result.CloneEndpoint, - DurationSec: int64(result.Duration.Seconds()), - } - - if result.Error != nil { - resp.Error = result.Error.Error() - resp.Message = "refresh failed" - } else { - resp.Message = "refresh completed successfully" - } - - return resp, nil -} - -// loadLambdaConfig loads configuration from environment variables. 
-func loadLambdaConfig() (*Config, error) { - cfg := &Config{} - - // Source configuration - cfg.Source.Type = getEnvOrDefault("RDS_SOURCE_TYPE", "rds") - cfg.Source.Identifier = os.Getenv("RDS_SOURCE_IDENTIFIER") - cfg.Source.SnapshotIdentifier = os.Getenv("RDS_SNAPSHOT_IDENTIFIER") - - // Clone configuration - cfg.Clone.InstanceClass = os.Getenv("RDS_CLONE_INSTANCE_CLASS") - cfg.Clone.DBSubnetGroupName = os.Getenv("RDS_CLONE_SUBNET_GROUP") - - if sgJSON := os.Getenv("RDS_CLONE_SECURITY_GROUPS"); sgJSON != "" { - if err := json.Unmarshal([]byte(sgJSON), &cfg.Clone.VPCSecurityGroupIDs); err != nil { - return nil, fmt.Errorf("invalid RDS_CLONE_SECURITY_GROUPS JSON: %w", err) - } - } - - cfg.Clone.PubliclyAccessible = os.Getenv("RDS_CLONE_PUBLIC") == "true" - cfg.Clone.ParameterGroupName = os.Getenv("RDS_CLONE_PARAMETER_GROUP") - cfg.Clone.OptionGroupName = os.Getenv("RDS_CLONE_OPTION_GROUP") - cfg.Clone.DBClusterParameterGroupName = os.Getenv("RDS_CLONE_CLUSTER_PARAMETER_GROUP") - cfg.Clone.EnableIAMAuth = os.Getenv("RDS_CLONE_ENABLE_IAM_AUTH") == "true" - cfg.Clone.StorageType = os.Getenv("RDS_CLONE_STORAGE_TYPE") - - // Parse tags from JSON - if tagsJSON := os.Getenv("RDS_CLONE_TAGS"); tagsJSON != "" { - if err := json.Unmarshal([]byte(tagsJSON), &cfg.Clone.Tags); err != nil { - return nil, fmt.Errorf("invalid RDS_CLONE_TAGS JSON: %w", err) - } - } - - // DBLab configuration - cfg.DBLab.APIEndpoint = os.Getenv("DBLAB_API_ENDPOINT") - cfg.DBLab.Token = os.Getenv("DBLAB_TOKEN") - cfg.DBLab.Insecure = os.Getenv("DBLAB_INSECURE") == "true" - - // AWS configuration - cfg.AWS.Region = os.Getenv("AWS_REGION") - - if err := cfg.Validate(); err != nil { - return nil, err - } - - cfg.SetDefaults() - - return cfg, nil -} - -func getEnvOrDefault(key, defaultValue string) string { - if v := os.Getenv(key); v != "" { - return v - } - - return defaultValue -} diff --git a/engine/internal/rdsrefresh/rds.go b/engine/internal/rdsrefresh/rds.go deleted file mode 100644 index 
b105532d..00000000 --- a/engine/internal/rdsrefresh/rds.go +++ /dev/null @@ -1,509 +0,0 @@ -/* -2025 © PostgresAI -*/ - -package rdsrefresh - -import ( - "context" - "fmt" - "sort" - "time" - - "github.com/aws/aws-sdk-go-v2/aws" - "github.com/aws/aws-sdk-go-v2/config" - "github.com/aws/aws-sdk-go-v2/service/rds" - "github.com/aws/aws-sdk-go-v2/service/rds/types" -) - -const ( - cloneNamePrefix = "dblab-refresh-" - waitPollInterval = 30 * time.Second - maxWaitTime = 2 * time.Hour - defaultPort int32 = 5432 -) - -// RDSClient wraps the AWS RDS client with convenience methods. -type RDSClient struct { - client *rds.Client - cfg *Config -} - -// CloneInfo holds information about a created clone. -type CloneInfo struct { - Identifier string - Endpoint string - Port int32 - IsCluster bool -} - -// NewRDSClient creates a new RDS client. -func NewRDSClient(ctx context.Context, cfg *Config) (*RDSClient, error) { - awsCfg, err := config.LoadDefaultConfig(ctx, config.WithRegion(cfg.AWS.Region)) - if err != nil { - return nil, fmt.Errorf("failed to load AWS config: %w", err) - } - - var opts []func(*rds.Options) - if cfg.AWS.Endpoint != "" { - opts = append(opts, func(o *rds.Options) { - o.BaseEndpoint = aws.String(cfg.AWS.Endpoint) - }) - } - - return &RDSClient{ - client: rds.NewFromConfig(awsCfg, opts...), - cfg: cfg, - }, nil -} - -// FindLatestSnapshot finds the latest available snapshot for the source. 
-func (r *RDSClient) FindLatestSnapshot(ctx context.Context) (string, error) { - if r.cfg.Source.SnapshotIdentifier != "" { - return r.cfg.Source.SnapshotIdentifier, nil - } - - if r.cfg.Source.Type == "aurora-cluster" { - return r.findLatestClusterSnapshot(ctx) - } - - return r.findLatestDBSnapshot(ctx) -} - -func (r *RDSClient) findLatestDBSnapshot(ctx context.Context) (string, error) { - input := &rds.DescribeDBSnapshotsInput{ - DBInstanceIdentifier: aws.String(r.cfg.Source.Identifier), - SnapshotType: aws.String("automated"), - } - - result, err := r.client.DescribeDBSnapshots(ctx, input) - if err != nil { - return "", fmt.Errorf("failed to describe DB snapshots: %w", err) - } - - if len(result.DBSnapshots) == 0 { - return "", fmt.Errorf("no automated snapshots found for RDS instance %q", r.cfg.Source.Identifier) - } - - // Sort by creation time (newest first) - sort.Slice(result.DBSnapshots, func(i, j int) bool { - ti := result.DBSnapshots[i].SnapshotCreateTime - tj := result.DBSnapshots[j].SnapshotCreateTime - - if ti == nil || tj == nil { - return ti != nil - } - - return ti.After(*tj) - }) - - // Find the first available snapshot - for _, snap := range result.DBSnapshots { - if snap.Status != nil && *snap.Status == "available" { - return *snap.DBSnapshotIdentifier, nil - } - } - - return "", fmt.Errorf("no available snapshots found for RDS instance %q", r.cfg.Source.Identifier) -} - -func (r *RDSClient) findLatestClusterSnapshot(ctx context.Context) (string, error) { - input := &rds.DescribeDBClusterSnapshotsInput{ - DBClusterIdentifier: aws.String(r.cfg.Source.Identifier), - SnapshotType: aws.String("automated"), - } - - result, err := r.client.DescribeDBClusterSnapshots(ctx, input) - if err != nil { - return "", fmt.Errorf("failed to describe DB cluster snapshots: %w", err) - } - - if len(result.DBClusterSnapshots) == 0 { - return "", fmt.Errorf("no automated snapshots found for Aurora cluster %q", r.cfg.Source.Identifier) - } - - // Sort by creation time 
(newest first) - sort.Slice(result.DBClusterSnapshots, func(i, j int) bool { - ti := result.DBClusterSnapshots[i].SnapshotCreateTime - tj := result.DBClusterSnapshots[j].SnapshotCreateTime - - if ti == nil || tj == nil { - return ti != nil - } - - return ti.After(*tj) - }) - - // Find the first available snapshot - for _, snap := range result.DBClusterSnapshots { - if snap.Status != nil && *snap.Status == "available" { - return *snap.DBClusterSnapshotIdentifier, nil - } - } - - return "", fmt.Errorf("no available snapshots found for Aurora cluster %q", r.cfg.Source.Identifier) -} - -// CreateClone creates a temporary clone from a snapshot. -func (r *RDSClient) CreateClone(ctx context.Context, snapshotID string) (*CloneInfo, error) { - cloneName := fmt.Sprintf("%s%s", cloneNamePrefix, time.Now().UTC().Format("20060102-150405")) - - if r.cfg.Source.Type == "aurora-cluster" { - return r.createAuroraClone(ctx, snapshotID, cloneName) - } - - return r.createRDSClone(ctx, snapshotID, cloneName) -} - -func (r *RDSClient) createRDSClone(ctx context.Context, snapshotID, cloneName string) (*CloneInfo, error) { - tags := r.buildTags() - - input := &rds.RestoreDBInstanceFromDBSnapshotInput{ - DBInstanceIdentifier: aws.String(cloneName), - DBSnapshotIdentifier: aws.String(snapshotID), - DBInstanceClass: aws.String(r.cfg.Clone.InstanceClass), - PubliclyAccessible: aws.Bool(r.cfg.Clone.PubliclyAccessible), - Tags: tags, - DeletionProtection: aws.Bool(r.cfg.Clone.DeletionProtection), - } - - if r.cfg.Clone.DBSubnetGroupName != "" { - input.DBSubnetGroupName = aws.String(r.cfg.Clone.DBSubnetGroupName) - } - - if len(r.cfg.Clone.VPCSecurityGroupIDs) > 0 { - input.VpcSecurityGroupIds = r.cfg.Clone.VPCSecurityGroupIDs - } - - if r.cfg.Clone.ParameterGroupName != "" { - input.DBParameterGroupName = aws.String(r.cfg.Clone.ParameterGroupName) - } - - if r.cfg.Clone.OptionGroupName != "" { - input.OptionGroupName = aws.String(r.cfg.Clone.OptionGroupName) - } - - if r.cfg.Clone.Port > 0 { - 
input.Port = aws.Int32(r.cfg.Clone.Port) - } - - if r.cfg.Clone.EnableIAMAuth { - input.EnableIAMDatabaseAuthentication = aws.Bool(true) - } - - if r.cfg.Clone.StorageType != "" { - input.StorageType = aws.String(r.cfg.Clone.StorageType) - } - - _, err := r.client.RestoreDBInstanceFromDBSnapshot(ctx, input) - if err != nil { - return nil, fmt.Errorf("failed to restore DB instance from snapshot: %w", err) - } - - return &CloneInfo{ - Identifier: cloneName, - IsCluster: false, - }, nil -} - -func (r *RDSClient) createAuroraClone(ctx context.Context, snapshotID, cloneName string) (*CloneInfo, error) { - tags := r.buildTags() - - // First, restore the Aurora cluster - clusterInput := &rds.RestoreDBClusterFromSnapshotInput{ - DBClusterIdentifier: aws.String(cloneName), - SnapshotIdentifier: aws.String(snapshotID), - Tags: tags, - DeletionProtection: aws.Bool(r.cfg.Clone.DeletionProtection), - } - - if r.cfg.Clone.DBSubnetGroupName != "" { - clusterInput.DBSubnetGroupName = aws.String(r.cfg.Clone.DBSubnetGroupName) - } - - if len(r.cfg.Clone.VPCSecurityGroupIDs) > 0 { - clusterInput.VpcSecurityGroupIds = r.cfg.Clone.VPCSecurityGroupIDs - } - - if r.cfg.Clone.DBClusterParameterGroupName != "" { - clusterInput.DBClusterParameterGroupName = aws.String(r.cfg.Clone.DBClusterParameterGroupName) - } - - if r.cfg.Clone.EngineVersion != "" { - clusterInput.EngineVersion = aws.String(r.cfg.Clone.EngineVersion) - } - - if r.cfg.Clone.Port > 0 { - clusterInput.Port = aws.Int32(r.cfg.Clone.Port) - } - - if r.cfg.Clone.EnableIAMAuth { - clusterInput.EnableIAMDatabaseAuthentication = aws.Bool(true) - } - - // Get the engine from the snapshot - snapshotResp, err := r.client.DescribeDBClusterSnapshots(ctx, &rds.DescribeDBClusterSnapshotsInput{ - DBClusterSnapshotIdentifier: aws.String(snapshotID), - }) - if err != nil { - return nil, fmt.Errorf("failed to describe cluster snapshot: %w", err) - } - - if len(snapshotResp.DBClusterSnapshots) == 0 { - return nil, fmt.Errorf("snapshot %q not 
found", snapshotID) - } - - snapshot := snapshotResp.DBClusterSnapshots[0] - clusterInput.Engine = snapshot.Engine - - _, err = r.client.RestoreDBClusterFromSnapshot(ctx, clusterInput) - if err != nil { - return nil, fmt.Errorf("failed to restore DB cluster from snapshot: %w", err) - } - - // Wait for cluster to be available before creating instance - if err := r.waitForClusterAvailable(ctx, cloneName); err != nil { - // Try to clean up the cluster - _ = r.deleteAuroraCluster(ctx, cloneName) - return nil, fmt.Errorf("cluster did not become available: %w", err) - } - - // Create a DB instance in the cluster - instanceName := cloneName + "-instance" - instanceInput := &rds.CreateDBInstanceInput{ - DBInstanceIdentifier: aws.String(instanceName), - DBInstanceClass: aws.String(r.cfg.Clone.InstanceClass), - DBClusterIdentifier: aws.String(cloneName), - Engine: snapshot.Engine, - Tags: tags, - } - - if r.cfg.Clone.ParameterGroupName != "" { - instanceInput.DBParameterGroupName = aws.String(r.cfg.Clone.ParameterGroupName) - } - - _, err = r.client.CreateDBInstance(ctx, instanceInput) - if err != nil { - // Try to clean up the cluster - _ = r.deleteAuroraCluster(ctx, cloneName) - return nil, fmt.Errorf("failed to create DB instance in cluster: %w", err) - } - - return &CloneInfo{ - Identifier: cloneName, - IsCluster: true, - }, nil -} - -func (r *RDSClient) buildTags() []types.Tag { - tags := make([]types.Tag, 0, len(r.cfg.Clone.Tags)) - - for k, v := range r.cfg.Clone.Tags { - tags = append(tags, types.Tag{ - Key: aws.String(k), - Value: aws.String(v), - }) - } - - return tags -} - -// WaitForCloneAvailable waits for the clone to become available and returns connection info. 
-func (r *RDSClient) WaitForCloneAvailable(ctx context.Context, clone *CloneInfo) error { - if clone.IsCluster { - instanceName := clone.Identifier + "-instance" - - if err := r.waitForInstanceAvailable(ctx, instanceName); err != nil { - return err - } - - // Get the cluster endpoint - clusterResp, err := r.client.DescribeDBClusters(ctx, &rds.DescribeDBClustersInput{ - DBClusterIdentifier: aws.String(clone.Identifier), - }) - if err != nil { - return fmt.Errorf("failed to describe cluster: %w", err) - } - - if len(clusterResp.DBClusters) == 0 { - return fmt.Errorf("cluster %q not found", clone.Identifier) - } - - cluster := clusterResp.DBClusters[0] - clone.Endpoint = aws.ToString(cluster.Endpoint) - clone.Port = aws.ToInt32(cluster.Port) - - if clone.Port == 0 { - clone.Port = defaultPort - } - - return nil - } - - if err := r.waitForInstanceAvailable(ctx, clone.Identifier); err != nil { - return err - } - - // Get the instance endpoint - instanceResp, err := r.client.DescribeDBInstances(ctx, &rds.DescribeDBInstancesInput{ - DBInstanceIdentifier: aws.String(clone.Identifier), - }) - if err != nil { - return fmt.Errorf("failed to describe instance: %w", err) - } - - if len(instanceResp.DBInstances) == 0 { - return fmt.Errorf("instance %q not found", clone.Identifier) - } - - instance := instanceResp.DBInstances[0] - - if instance.Endpoint != nil { - clone.Endpoint = aws.ToString(instance.Endpoint.Address) - clone.Port = aws.ToInt32(instance.Endpoint.Port) - } - - if clone.Port == 0 { - clone.Port = defaultPort - } - - return nil -} - -func (r *RDSClient) waitForInstanceAvailable(ctx context.Context, identifier string) error { - waiter := rds.NewDBInstanceAvailableWaiter(r.client) - - return waiter.Wait(ctx, &rds.DescribeDBInstancesInput{ - DBInstanceIdentifier: aws.String(identifier), - }, maxWaitTime) -} - -func (r *RDSClient) waitForClusterAvailable(ctx context.Context, identifier string) error { - waiter := rds.NewDBClusterAvailableWaiter(r.client) - - return 
waiter.Wait(ctx, &rds.DescribeDBClustersInput{ - DBClusterIdentifier: aws.String(identifier), - }, maxWaitTime) -} - -// DeleteClone deletes the temporary clone. -func (r *RDSClient) DeleteClone(ctx context.Context, clone *CloneInfo) error { - if clone.IsCluster { - return r.deleteAuroraCluster(ctx, clone.Identifier) - } - - return r.deleteRDSInstance(ctx, clone.Identifier) -} - -func (r *RDSClient) deleteRDSInstance(ctx context.Context, identifier string) error { - // First, disable deletion protection if enabled - _, _ = r.client.ModifyDBInstance(ctx, &rds.ModifyDBInstanceInput{ - DBInstanceIdentifier: aws.String(identifier), - DeletionProtection: aws.Bool(false), - ApplyImmediately: aws.Bool(true), - }) - - _, err := r.client.DeleteDBInstance(ctx, &rds.DeleteDBInstanceInput{ - DBInstanceIdentifier: aws.String(identifier), - SkipFinalSnapshot: aws.Bool(true), - DeleteAutomatedBackups: aws.Bool(true), - }) - - if err != nil { - return fmt.Errorf("failed to delete DB instance: %w", err) - } - - return nil -} - -func (r *RDSClient) deleteAuroraCluster(ctx context.Context, clusterIdentifier string) error { - // First, delete all instances in the cluster - instancesResp, err := r.client.DescribeDBInstances(ctx, &rds.DescribeDBInstancesInput{ - Filters: []types.Filter{ - { - Name: aws.String("db-cluster-id"), - Values: []string{clusterIdentifier}, - }, - }, - }) - if err != nil { - return fmt.Errorf("failed to list cluster instances: %w", err) - } - - for _, instance := range instancesResp.DBInstances { - if err := r.deleteRDSInstance(ctx, aws.ToString(instance.DBInstanceIdentifier)); err != nil { - return fmt.Errorf("failed to delete cluster instance: %w", err) - } - } - - // Wait for all instances to be deleted - for _, instance := range instancesResp.DBInstances { - waiter := rds.NewDBInstanceDeletedWaiter(r.client) - - if err := waiter.Wait(ctx, &rds.DescribeDBInstancesInput{ - DBInstanceIdentifier: instance.DBInstanceIdentifier, - }, maxWaitTime); err != nil { - 
return fmt.Errorf("failed waiting for instance deletion: %w", err) - } - } - - // Disable deletion protection on cluster - _, _ = r.client.ModifyDBCluster(ctx, &rds.ModifyDBClusterInput{ - DBClusterIdentifier: aws.String(clusterIdentifier), - DeletionProtection: aws.Bool(false), - ApplyImmediately: aws.Bool(true), - }) - - // Delete the cluster - _, err = r.client.DeleteDBCluster(ctx, &rds.DeleteDBClusterInput{ - DBClusterIdentifier: aws.String(clusterIdentifier), - SkipFinalSnapshot: aws.Bool(true), - }) - - if err != nil { - return fmt.Errorf("failed to delete DB cluster: %w", err) - } - - return nil -} - -// GetSourceInfo returns information about the source database. -func (r *RDSClient) GetSourceInfo(ctx context.Context) (string, error) { - if r.cfg.Source.Type == "aurora-cluster" { - resp, err := r.client.DescribeDBClusters(ctx, &rds.DescribeDBClustersInput{ - DBClusterIdentifier: aws.String(r.cfg.Source.Identifier), - }) - if err != nil { - return "", fmt.Errorf("failed to describe source cluster: %w", err) - } - - if len(resp.DBClusters) == 0 { - return "", fmt.Errorf("source cluster %q not found", r.cfg.Source.Identifier) - } - - cluster := resp.DBClusters[0] - - return fmt.Sprintf("Aurora cluster %s (engine: %s, version: %s)", - r.cfg.Source.Identifier, - aws.ToString(cluster.Engine), - aws.ToString(cluster.EngineVersion)), nil - } - - resp, err := r.client.DescribeDBInstances(ctx, &rds.DescribeDBInstancesInput{ - DBInstanceIdentifier: aws.String(r.cfg.Source.Identifier), - }) - if err != nil { - return "", fmt.Errorf("failed to describe source instance: %w", err) - } - - if len(resp.DBInstances) == 0 { - return "", fmt.Errorf("source instance %q not found", r.cfg.Source.Identifier) - } - - instance := resp.DBInstances[0] - - return fmt.Sprintf("RDS instance %s (engine: %s, version: %s)", - r.cfg.Source.Identifier, - aws.ToString(instance.Engine), - aws.ToString(instance.EngineVersion)), nil -} diff --git a/engine/internal/rdsrefresh/refresher.go 
b/engine/internal/rdsrefresh/refresher.go deleted file mode 100644 index a2455ca7..00000000 --- a/engine/internal/rdsrefresh/refresher.go +++ /dev/null @@ -1,243 +0,0 @@ -/* -2025 © PostgresAI -*/ - -package rdsrefresh - -import ( - "context" - "fmt" - "time" -) - -// Logger defines the logging interface. -type Logger interface { - Info(msg string, args ...interface{}) - Error(msg string, args ...interface{}) - Debug(msg string, args ...interface{}) -} - -// DefaultLogger is a simple stdout logger. -type DefaultLogger struct{} - -// Info logs an info message. -func (l *DefaultLogger) Info(msg string, args ...interface{}) { - fmt.Printf("[INFO] "+msg+"\n", args...) -} - -// Error logs an error message. -func (l *DefaultLogger) Error(msg string, args ...interface{}) { - fmt.Printf("[ERROR] "+msg+"\n", args...) -} - -// Debug logs a debug message. -func (l *DefaultLogger) Debug(msg string, args ...interface{}) { - fmt.Printf("[DEBUG] "+msg+"\n", args...) -} - -// Refresher orchestrates the RDS/Aurora clone and DBLab refresh workflow. -type Refresher struct { - cfg *Config - rds *RDSClient - dblab *DBLabClient - logger Logger -} - -// RefreshResult contains the result of a refresh operation. -type RefreshResult struct { - Success bool - SnapshotID string - CloneID string - StartTime time.Time - EndTime time.Time - Duration time.Duration - Error error - CloneEndpoint string -} - -// NewRefresher creates a new Refresher instance. -func NewRefresher(ctx context.Context, cfg *Config, logger Logger) (*Refresher, error) { - if logger == nil { - logger = &DefaultLogger{} - } - - rdsClient, err := NewRDSClient(ctx, cfg) - if err != nil { - return nil, fmt.Errorf("failed to create RDS client: %w", err) - } - - dblabClient := NewDBLabClient(&cfg.DBLab) - - return &Refresher{ - cfg: cfg, - rds: rdsClient, - dblab: dblabClient, - logger: logger, - }, nil -} - -// Run executes the full refresh workflow: -// 1. Verifies DBLab is healthy and not already refreshing -// 2. 
Finds the latest snapshot -// 3. Creates a temporary clone from the snapshot -// 4. Waits for the clone to be available -// 5. Triggers DBLab full refresh -// 6. Waits for refresh to complete -// 7. Deletes the temporary clone -func (r *Refresher) Run(ctx context.Context) *RefreshResult { - result := &RefreshResult{ - StartTime: time.Now(), - } - - defer func() { - result.EndTime = time.Now() - result.Duration = result.EndTime.Sub(result.StartTime) - }() - - // Step 1: Check DBLab health and status - r.logger.Info("Checking DBLab Engine health...") - - if err := r.dblab.Health(ctx); err != nil { - result.Error = fmt.Errorf("DBLab health check failed: %w", err) - return result - } - - inProgress, err := r.dblab.IsRefreshInProgress(ctx) - if err != nil { - result.Error = fmt.Errorf("failed to check DBLab status: %w", err) - return result - } - - if inProgress { - result.Error = fmt.Errorf("refresh already in progress, skipping") - return result - } - - // Step 2: Get source info - r.logger.Info("Checking source database...") - - sourceInfo, err := r.rds.GetSourceInfo(ctx) - if err != nil { - result.Error = fmt.Errorf("failed to get source info: %w", err) - return result - } - - r.logger.Info("Source: %s", sourceInfo) - - // Step 3: Find latest snapshot - r.logger.Info("Finding latest snapshot...") - - snapshotID, err := r.rds.FindLatestSnapshot(ctx) - if err != nil { - result.Error = fmt.Errorf("failed to find snapshot: %w", err) - return result - } - - result.SnapshotID = snapshotID - r.logger.Info("Using snapshot: %s", snapshotID) - - // Step 4: Create temporary clone - r.logger.Info("Creating temporary RDS clone from snapshot...") - - clone, err := r.rds.CreateClone(ctx, snapshotID) - if err != nil { - result.Error = fmt.Errorf("failed to create clone: %w", err) - return result - } - - result.CloneID = clone.Identifier - r.logger.Info("Created clone: %s", clone.Identifier) - - // Ensure cleanup on any exit - defer func() { - r.logger.Info("Cleaning up temporary 
clone %s...", clone.Identifier) - - if deleteErr := r.rds.DeleteClone(context.Background(), clone); deleteErr != nil { - r.logger.Error("Failed to delete clone %s: %v (manual cleanup may be required)", clone.Identifier, deleteErr) - } else { - r.logger.Info("Successfully deleted temporary clone %s", clone.Identifier) - } - }() - - // Step 5: Wait for clone to be available - r.logger.Info("Waiting for clone to become available (this may take 10-30 minutes)...") - - if err := r.rds.WaitForCloneAvailable(ctx, clone); err != nil { - result.Error = fmt.Errorf("clone did not become available: %w", err) - return result - } - - result.CloneEndpoint = clone.Endpoint - r.logger.Info("Clone available at: %s:%d", clone.Endpoint, clone.Port) - - // Step 6: Trigger DBLab full refresh - r.logger.Info("Triggering DBLab full refresh...") - - if err := r.dblab.TriggerFullRefresh(ctx); err != nil { - result.Error = fmt.Errorf("failed to trigger refresh: %w", err) - return result - } - - r.logger.Info("Full refresh triggered, waiting for completion...") - - // Step 7: Wait for refresh to complete - pollInterval := r.cfg.DBLab.PollInterval.Duration() - timeout := r.cfg.DBLab.Timeout.Duration() - - if err := r.dblab.WaitForRefreshComplete(ctx, pollInterval, timeout); err != nil { - result.Error = fmt.Errorf("refresh did not complete: %w", err) - return result - } - - r.logger.Info("DBLab refresh completed successfully!") - result.Success = true - - return result -} - -// DryRun performs all validation steps without actually creating resources. 
-func (r *Refresher) DryRun(ctx context.Context) error { - r.logger.Info("=== DRY RUN MODE ===") - - // Check DBLab - r.logger.Info("Checking DBLab Engine health...") - - if err := r.dblab.Health(ctx); err != nil { - return fmt.Errorf("DBLab health check failed: %w", err) - } - - r.logger.Info("DBLab Engine is healthy") - - // Check current status - status, err := r.dblab.GetStatus(ctx) - if err != nil { - return fmt.Errorf("failed to get DBLab status: %w", err) - } - - r.logger.Info("DBLab retrieval status: %s", status.Retrieving.Status) - - // Check source - r.logger.Info("Checking source database...") - - sourceInfo, err := r.rds.GetSourceInfo(ctx) - if err != nil { - return fmt.Errorf("failed to get source info: %w", err) - } - - r.logger.Info("Source: %s", sourceInfo) - - // Check snapshot - r.logger.Info("Finding latest snapshot...") - - snapshotID, err := r.rds.FindLatestSnapshot(ctx) - if err != nil { - return fmt.Errorf("failed to find snapshot: %w", err) - } - - r.logger.Info("Would use snapshot: %s", snapshotID) - r.logger.Info("Would create clone with instance class: %s", r.cfg.Clone.InstanceClass) - - r.logger.Info("=== DRY RUN COMPLETE - All checks passed ===") - - return nil -} diff --git a/rds-refresh/Makefile b/rds-refresh/Makefile index 158083f2..b41afb23 100644 --- a/rds-refresh/Makefile +++ b/rds-refresh/Makefile @@ -1,27 +1,22 @@ -.PHONY: build build-linux build-lambda clean test fmt vet +.PHONY: build build-linux clean test fmt vet docker-build docker-push VERSION ?= $(shell git describe --tags --always --dirty 2>/dev/null || echo "dev") BUILD_TIME ?= $(shell date -u +"%Y-%m-%dT%H:%M:%SZ") LDFLAGS = -ldflags "-s -w -X main.version=$(VERSION) -X main.buildTime=$(BUILD_TIME)" +DOCKER_IMAGE ?= postgresai/rds-refresh # Build for current platform build: go build $(LDFLAGS) -o rds-refresh . 
-# Build for Linux (for Docker/Lambda) +# Build for Linux (for Docker containers) build-linux: GOOS=linux GOARCH=amd64 go build $(LDFLAGS) -o rds-refresh-linux-amd64 . GOOS=linux GOARCH=arm64 go build $(LDFLAGS) -o rds-refresh-linux-arm64 . -# Build Lambda bootstrap binary -build-lambda: - GOOS=linux GOARCH=arm64 CGO_ENABLED=0 go build $(LDFLAGS) -o bootstrap . - zip rds-refresh-lambda.zip bootstrap - rm bootstrap - # Clean build artifacts clean: - rm -f rds-refresh rds-refresh-linux-* bootstrap rds-refresh-lambda.zip + rm -f rds-refresh rds-refresh-linux-* # Run tests test: @@ -47,3 +42,12 @@ run: # Run dry-run locally dry-run: go run . -config config.yaml -dry-run + +# Build Docker image +docker-build: + docker build -t $(DOCKER_IMAGE):$(VERSION) -t $(DOCKER_IMAGE):latest . + +# Push Docker image +docker-push: + docker push $(DOCKER_IMAGE):$(VERSION) + docker push $(DOCKER_IMAGE):latest diff --git a/rds-refresh/README.md b/rds-refresh/README.md index eecd1b68..94e975ef 100644 --- a/rds-refresh/README.md +++ b/rds-refresh/README.md @@ -7,8 +7,9 @@ A standalone tool that automates DBLab Engine full refresh using temporary RDS o This tool provides a hassle-free way to keep your DBLab Engine data synchronized with your production RDS/Aurora database: 1. **Creates a temporary clone** from the latest RDS/Aurora snapshot -2. **Triggers DBLab full refresh** to sync data from the clone -3. **Deletes the temporary clone** after refresh completes +2. **Updates DBLab configuration** with the new clone's endpoint +3. **Triggers DBLab full refresh** to sync data from the clone +4. **Deletes the temporary clone** after refresh completes This approach avoids impacting your production database during the data sync process. 
@@ -17,9 +18,9 @@ This approach avoids impacting your production database during the data sync pro ### Build ```bash -# Clone this repository -git clone https://github.com/postgres-ai/rds-refresh.git -cd rds-refresh +# Clone the repository +git clone https://github.com/postgres-ai/database-lab-engine.git +cd database-lab-engine/rds-refresh # Build make build @@ -41,7 +42,7 @@ vim config.yaml ### Run ```bash -# Dry run (validates configuration) +# Dry run (validates configuration without creating resources) ./rds-refresh -config config.yaml -dry-run # Full refresh @@ -50,85 +51,88 @@ vim config.yaml ## Deployment Options -### Option 1: AWS Lambda (Recommended) +The refresh process can take 1-4 hours depending on database size, so this tool is designed for long-running execution environments. -Deploy as a serverless function with automatic scheduling via EventBridge. - -#### Prerequisites - -- [AWS SAM CLI](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/install-sam-cli.html) -- AWS credentials configured -- Go 1.21+ - -#### Deploy - -```bash -# Build and deploy -sam build -sam deploy --guided -``` - -During guided deployment, you'll be prompted for: - -| Parameter | Description | Example | -|-----------|-------------|---------| -| `RDSSourceType` | `rds` or `aurora-cluster` | `rds` | -| `RDSSourceIdentifier` | Source DB identifier | `production-db` | -| `RDSCloneInstanceClass` | Clone instance size | `db.t3.medium` | -| `DBLabAPIEndpoint` | DBLab API URL | `https://dblab.example.com:2345` | -| `DBLabToken` | DBLab verification token | `your-secret-token` | -| `ScheduleExpression` | Refresh schedule | `rate(7 days)` | - -#### Manual Invocation +### Option 1: Docker (Recommended) ```bash -# Dry run -aws lambda invoke --function-name dblab-rds-refresh \ - --cli-binary-format raw-in-base64-out \ - --payload '{"dryRun": true}' \ - response.json && cat response.json +# Build image +make docker-build -# Full refresh -aws lambda invoke 
--function-name dblab-rds-refresh \ - --cli-binary-format raw-in-base64-out \ - --payload '{"dryRun": false}' \ - response.json && cat response.json +# Run +docker run \ + -v /path/to/config.yaml:/config.yaml \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DB_PASSWORD \ + -e DBLAB_TOKEN \ + postgresai/rds-refresh -config /config.yaml ``` -### Option 2: CLI with Cron - -```bash -# Build -make build - -# Install -sudo mv rds-refresh /usr/local/bin/ +### Option 2: ECS Task -# Create config -sudo mkdir -p /etc/dblab -sudo cp config.example.yaml /etc/dblab/rds-refresh.yaml -sudo vim /etc/dblab/rds-refresh.yaml +Create an ECS Task Definition for scheduled execution: -# Add to crontab (every Sunday at 2 AM) -echo "0 2 * * 0 /usr/local/bin/rds-refresh -config /etc/dblab/rds-refresh.yaml >> /var/log/rds-refresh.log 2>&1" | crontab - +```json +{ + "family": "dblab-rds-refresh", + "networkMode": "awsvpc", + "containerDefinitions": [ + { + "name": "rds-refresh", + "image": "postgresai/rds-refresh:latest", + "command": ["-config", "/config/config.yaml"], + "mountPoints": [ + { + "sourceVolume": "config", + "containerPath": "/config" + } + ], + "secrets": [ + { + "name": "DB_PASSWORD", + "valueFrom": "arn:aws:secretsmanager:us-east-1:123456789:secret:db-password" + }, + { + "name": "DBLAB_TOKEN", + "valueFrom": "arn:aws:secretsmanager:us-east-1:123456789:secret:dblab-token" + } + ], + "logConfiguration": { + "logDriver": "awslogs", + "options": { + "awslogs-group": "/ecs/dblab-rds-refresh", + "awslogs-region": "us-east-1", + "awslogs-stream-prefix": "ecs" + } + } + } + ], + "taskRoleArn": "arn:aws:iam::123456789:role/dblab-rds-refresh-task", + "executionRoleArn": "arn:aws:iam::123456789:role/ecsTaskExecutionRole", + "volumes": [ + { + "name": "config", + "efsVolumeConfiguration": { + "fileSystemId": "fs-12345678" + } + } + ] +} ``` -### Option 3: Docker - +Schedule with EventBridge: ```bash -# Build -docker build -t rds-refresh . 
+aws events put-rule \ + --name dblab-rds-refresh-weekly \ + --schedule-expression "rate(7 days)" -# Run -docker run \ - -v /path/to/config.yaml:/config.yaml \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DBLAB_TOKEN \ - rds-refresh -config /config.yaml +aws events put-targets \ + --rule dblab-rds-refresh-weekly \ + --targets "Id"="1","Arn"="arn:aws:ecs:us-east-1:123456789:cluster/my-cluster","RoleArn"="arn:aws:iam::123456789:role/ecsEventsRole","EcsParameters"="{\"taskDefinitionArn\": \"arn:aws:ecs:us-east-1:123456789:task-definition/dblab-rds-refresh:1\",\"taskCount\": 1}" ``` -### Option 4: Kubernetes CronJob +### Option 3: Kubernetes CronJob ```yaml apiVersion: batch/v1 @@ -137,19 +141,26 @@ metadata: name: dblab-rds-refresh spec: schedule: "0 2 * * 0" # Every Sunday at 2 AM + concurrencyPolicy: Forbid jobTemplate: spec: + backoffLimit: 1 template: spec: - serviceAccountName: dblab-rds-refresh # with IRSA + serviceAccountName: dblab-rds-refresh # with IRSA for AWS access containers: - name: rds-refresh - image: your-registry/rds-refresh:latest + image: postgresai/rds-refresh:latest args: ["-config", "/config/config.yaml"] volumeMounts: - name: config mountPath: /config env: + - name: DB_PASSWORD + valueFrom: + secretKeyRef: + name: dblab-secrets + key: db-password - name: DBLAB_TOKEN valueFrom: secretKeyRef: @@ -159,32 +170,66 @@ spec: - name: config configMap: name: rds-refresh-config - restartPolicy: OnFailure + restartPolicy: Never +``` + +### Option 4: CLI with Cron + +```bash +# Build and install +make build +sudo mv rds-refresh /usr/local/bin/ + +# Create config directory +sudo mkdir -p /etc/dblab +sudo cp config.example.yaml /etc/dblab/rds-refresh.yaml +sudo chmod 600 /etc/dblab/rds-refresh.yaml +sudo vim /etc/dblab/rds-refresh.yaml + +# Add to crontab (every Sunday at 2 AM) +echo "0 2 * * 0 /usr/local/bin/rds-refresh -config /etc/dblab/rds-refresh.yaml >> /var/log/rds-refresh.log 2>&1" | crontab - ``` ## Configuration See 
[config.example.yaml](config.example.yaml) for a fully documented example. +### Key Configuration Fields + +```yaml +source: + type: rds # "rds" or "aurora-cluster" + identifier: production-db # RDS instance or Aurora cluster ID + dbName: myapp # Database name for DBLab to connect to + username: postgres # Database username + password: ${DB_PASSWORD} # Use env var expansion for secrets + +clone: + instanceClass: db.t3.medium # Can be smaller than production + subnetGroup: my-subnet # Must be accessible from DBLab + securityGroups: + - sg-12345678 # Must allow DBLab inbound access + +dblab: + apiEndpoint: https://dblab.example.com:2345 + token: ${DBLAB_TOKEN} + pollInterval: 30s # Status check frequency + timeout: 4h # Max wait for refresh completion + +aws: + region: us-east-1 +``` + ### Environment Variables -When running as Lambda, configuration is loaded from environment variables: - -| Variable | Required | Description | -|----------|----------|-------------| -| `RDS_SOURCE_IDENTIFIER` | Yes | Source RDS instance or Aurora cluster ID | -| `RDS_CLONE_INSTANCE_CLASS` | Yes | Instance class for clone (e.g., `db.t3.medium`) | -| `DBLAB_API_ENDPOINT` | Yes | DBLab Engine API endpoint | -| `DBLAB_TOKEN` | Yes | DBLab verification token | -| `AWS_REGION` | Yes | AWS region | -| `RDS_SOURCE_TYPE` | No | `rds` or `aurora-cluster` (default: `rds`) | -| `RDS_SNAPSHOT_IDENTIFIER` | No | Specific snapshot ID (default: latest) | -| `RDS_CLONE_SUBNET_GROUP` | No | DB subnet group name | -| `RDS_CLONE_SECURITY_GROUPS` | No | JSON array of security group IDs | -| `RDS_CLONE_PUBLIC` | No | `true` to make clone publicly accessible | -| `RDS_CLONE_ENABLE_IAM_AUTH` | No | `true` to enable IAM authentication | -| `RDS_CLONE_STORAGE_TYPE` | No | Storage type (gp2, gp3, io1, etc.) | -| `DBLAB_INSECURE` | No | `true` to skip TLS verification | +The configuration file supports environment variable expansion using `${VAR_NAME}` syntax. 
This is useful for secrets: + +```yaml +source: + password: ${DB_PASSWORD} +dblab: + token: ${DBLAB_TOKEN} +``` ## AWS IAM Permissions @@ -244,16 +289,20 @@ The tool requires the following IAM permissions: Replace `ACCOUNT_ID` with your AWS account ID. -## DBLab Engine Configuration +## DBLab Engine Requirements + +The tool dynamically updates the DBLab Engine's source configuration before triggering refresh. Your DBLab Engine must: -Configure DBLab Engine to connect to the temporary clone. The clone will be named `dblab-refresh-YYYYMMDD-HHMMSS`. +1. **Support config updates via API** - The `/admin/config` endpoint must be available +2. **Run in logical mode** - Using pg_dump/pg_restore for data retrieval +3. **Be accessible** - The API endpoint must be reachable from where this tool runs -Example DBLab retrieval configuration: +Example DBLab configuration: ```yaml retrieval: refresh: - timetable: "" # Disable built-in scheduler (managed externally) + timetable: "" # Disable built-in scheduler (managed by this tool) skipStartRefresh: true jobs: @@ -265,51 +314,78 @@ retrieval: logicalDump: options: source: - type: rdsIam + type: local # Will be updated dynamically connection: dbname: mydb - username: dblab_user - rdsIam: - awsRegion: us-east-1 - dbInstanceIdentifier: dblab-refresh-current # Will be the temp clone + username: postgres + # host and port will be updated by rds-refresh ``` +## Workflow + +The tool executes the following steps: + +1. **Health check** - Verifies DBLab Engine is healthy and not already refreshing +2. **Source validation** - Gets source RDS/Aurora database info +3. **Snapshot discovery** - Finds the latest automated snapshot +4. **Clone creation** - Creates a temporary RDS instance/cluster from the snapshot +5. **Wait for clone** - Polls until clone is available (10-30 minutes typical) +6. **Config update** - Updates DBLab's source configuration with the clone endpoint +7. **Trigger refresh** - Initiates DBLab full refresh +8. 
**Wait for completion** - Polls until refresh completes (1-4 hours typical) +9. **Cleanup** - Deletes the temporary clone + +If any step fails, the clone is automatically deleted (cleanup runs in defer). + ## Troubleshooting ### Common Issues **Clone creation fails with "DBSubnetGroup not found"** - Ensure the subnet group exists and is in the correct VPC +- Verify the subnet group name in your configuration **Clone not accessible from DBLab** -- Verify security groups allow inbound connections from DBLab +- Verify security groups allow inbound connections from DBLab on port 5432 - Check if `publiclyAccessible` setting matches your network topology +- Ensure the clone and DBLab are in the same VPC or have network connectivity + +**DBLab config update fails** +- Verify the DBLab API endpoint is correct +- Check that the verification token is valid +- Ensure DBLab supports the `/admin/config` endpoint **DBLab refresh timeout** -- Increase `dblab.timeout` in configuration -- Check DBLab Engine logs for issues +- Increase `dblab.timeout` in configuration (default is 4 hours) +- Check DBLab Engine logs for issues during refresh +- Consider the database size - larger databases take longer **AWS credentials not found** -- Ensure AWS credentials are configured (env vars, IAM role, or credentials file) +- For ECS/Kubernetes: Use IAM Roles for Service Accounts (IRSA) or ECS Task Roles +- For CLI: Configure AWS credentials via environment variables or credentials file +- Verify IAM permissions are correctly attached ### Debug Mode ```bash -# Enable verbose AWS SDK logging -export AWS_SDK_LOAD_CONFIG=1 +# Enable verbose output ./rds-refresh -config config.yaml 2>&1 | tee refresh.log + +# Check AWS credential chain +aws sts get-caller-identity ``` ## Cost Considerations -- **Clone runtime**: You pay for the clone instance while it exists -- **Storage**: Clones don't duplicate storage (snapshot-based) -- **Lambda**: Minimal cost (typically < $0.10/month for weekly refreshes) +- 
**Clone runtime**: You pay for the clone instance while it exists (typically 2-5 hours) +- **Storage**: Clones don't duplicate storage initially (snapshot-based, copy-on-write) +- **Data transfer**: Minimal if DBLab is in the same region **Cost optimization tips**: -- Use a smaller instance class than production +- Use a smaller instance class than production (e.g., `db.t3.medium`) - Use `gp3` storage type for better price/performance - Schedule refreshes during off-peak hours +- The tool automatically deletes clones after completion ## License diff --git a/rds-refresh/config.example.yaml b/rds-refresh/config.example.yaml index cec82385..93b59f2a 100644 --- a/rds-refresh/config.example.yaml +++ b/rds-refresh/config.example.yaml @@ -12,6 +12,14 @@ source: # RDS DB instance identifier or Aurora cluster identifier identifier: production-db + # Database name to connect to (used when configuring DBLab) + dbName: myapp + + # Database credentials (used when configuring DBLab to connect to clone) + # Use environment variable expansion for security + username: postgres + password: ${DB_PASSWORD} + # Optional: Specific snapshot identifier to use # If empty, the latest automated snapshot will be used # snapshotIdentifier: rds:production-db-2024-01-15-02-00 diff --git a/rds-refresh/config.go b/rds-refresh/config.go index 3e16ec30..99bc0209 100644 --- a/rds-refresh/config.go +++ b/rds-refresh/config.go @@ -28,6 +28,12 @@ type SourceConfig struct { Identifier string `yaml:"identifier"` // SnapshotIdentifier is the specific snapshot to use. If empty, the latest automated snapshot is used. SnapshotIdentifier string `yaml:"snapshotIdentifier"` + // DBName is the database name to connect to (used when updating DBLab config). + DBName string `yaml:"dbName"` + // Username is the database username (used when updating DBLab config). + Username string `yaml:"username"` + // Password is the database password (used when updating DBLab config). 
+ Password string `yaml:"password"` } // CloneConfig defines settings for the temporary clone. @@ -148,6 +154,18 @@ func (c *Config) Validate() error { return fmt.Errorf("source.identifier is required") } + if c.Source.DBName == "" { + return fmt.Errorf("source.dbName is required") + } + + if c.Source.Username == "" { + return fmt.Errorf("source.username is required") + } + + if c.Source.Password == "" { + return fmt.Errorf("source.password is required") + } + if c.Clone.InstanceClass == "" { return fmt.Errorf("clone.instanceClass is required") } diff --git a/rds-refresh/dblab.go b/rds-refresh/dblab.go index 25306874..af78df96 100644 --- a/rds-refresh/dblab.go +++ b/rds-refresh/dblab.go @@ -5,6 +5,7 @@ package main import ( + "bytes" "context" "crypto/tls" "encoding/json" @@ -46,10 +47,10 @@ type Status struct { // Retrieving represents state of retrieval subsystem. type Retrieving struct { - Mode string `json:"mode"` - Status RetrievalStatus `json:"status"` - LastRefresh string `json:"lastRefresh"` - NextRefresh string `json:"nextRefresh"` + Mode string `json:"mode"` + Status RetrievalStatus `json:"status"` + LastRefresh string `json:"lastRefresh"` + NextRefresh string `json:"nextRefresh"` Alerts map[string]Alert `json:"alerts"` } @@ -71,6 +72,25 @@ type APIError struct { Message string `json:"message"` } +// ConfigUpdateRequest represents a request to update DBLab config. +type ConfigUpdateRequest struct { + Retrieval *RetrievalConfigUpdate `json:"retrieval,omitempty"` +} + +// RetrievalConfigUpdate represents retrieval config fields to update. +type RetrievalConfigUpdate struct { + DBSource *DBSourceConfig `json:"dbSource,omitempty"` +} + +// DBSourceConfig represents the source database connection config. 
+type DBSourceConfig struct { + Host string `json:"host,omitempty"` + Port int `json:"port,omitempty"` + DBName string `json:"dbname,omitempty"` + Username string `json:"username,omitempty"` + Password string `json:"password,omitempty"` +} + // DBLabClient provides methods to interact with the DBLab Engine API. type DBLabClient struct { baseURL string @@ -79,7 +99,11 @@ type DBLabClient struct { } // NewDBLabClient creates a new DBLab API client. -func NewDBLabClient(cfg *DBLabConfig) *DBLabClient { +func NewDBLabClient(cfg *DBLabConfig, logger Logger) *DBLabClient { + if cfg.Insecure && logger != nil { + logger.Error("WARNING: TLS certificate verification is disabled. This is insecure for production use.") + } + transport := &http.Transport{ TLSClientConfig: &tls.Config{InsecureSkipVerify: cfg.Insecure}, } @@ -202,6 +226,43 @@ func (c *DBLabClient) Health(ctx context.Context) error { return nil } +// UpdateSourceConfig updates the source database connection in DBLab config. +func (c *DBLabClient) UpdateSourceConfig(ctx context.Context, host string, port int, dbname, username, password string) error { + updateReq := ConfigUpdateRequest{ + Retrieval: &RetrievalConfigUpdate{ + DBSource: &DBSourceConfig{ + Host: host, + Port: port, + DBName: dbname, + Username: username, + Password: password, + }, + }, + } + + bodyBytes, err := json.Marshal(updateReq) + if err != nil { + return fmt.Errorf("failed to marshal config update: %w", err) + } + + resp, err := c.doRequest(ctx, http.MethodPatch, "/admin/config", bytes.NewReader(bodyBytes)) + if err != nil { + return fmt.Errorf("failed to update DBLab config: %w", err) + } + defer resp.Body.Close() + + var result APIResponse + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return fmt.Errorf("failed to decode config update response: %w", err) + } + + if result.Status != "OK" { + return fmt.Errorf("config update failed: %s", result.Message) + } + + return nil +} + func (c *DBLabClient) doRequest(ctx 
context.Context, method, path string, body io.Reader) (*http.Response, error) { url := c.baseURL + path diff --git a/rds-refresh/go.mod b/rds-refresh/go.mod index 51b1c9d3..a11819b1 100644 --- a/rds-refresh/go.mod +++ b/rds-refresh/go.mod @@ -5,7 +5,6 @@ go 1.23 toolchain go1.24.7 require ( - github.com/aws/aws-lambda-go v1.51.0 github.com/aws/aws-sdk-go-v2 v1.41.0 github.com/aws/aws-sdk-go-v2/config v1.32.5 github.com/aws/aws-sdk-go-v2/service/rds v1.113.1 diff --git a/rds-refresh/main.go b/rds-refresh/main.go index 452c1a99..995df956 100644 --- a/rds-refresh/main.go +++ b/rds-refresh/main.go @@ -3,21 +3,19 @@ rds-refresh - Automate DBLab full refresh using RDS/Aurora snapshots -This tool creates a temporary RDS/Aurora clone from a snapshot, triggers -a DBLab Engine full refresh, and then cleans up the temporary clone. +This tool creates a temporary RDS/Aurora clone from a snapshot, updates +DBLab Engine config with the clone endpoint, triggers a full refresh, +and then cleans up the temporary clone. 
*/ package main import ( "context" - "encoding/json" "flag" "fmt" "os" "os/signal" "syscall" - - "github.com/aws/aws-lambda-go/lambda" ) var ( @@ -26,13 +24,6 @@ var ( ) func main() { - // Check if running in Lambda - if os.Getenv("AWS_LAMBDA_FUNCTION_NAME") != "" { - lambda.Start(HandleLambda) - return - } - - // CLI mode configPath := flag.String("config", "", "Path to configuration file") dryRun := flag.Bool("dry-run", false, "Validate configuration without creating resources") showVersion := flag.Bool("version", false, "Show version information") @@ -72,7 +63,7 @@ func run(configPath string, dryRun bool) error { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - // Handle interrupt signals + // handle interrupt signals for graceful shutdown sigCh := make(chan os.Signal, 1) signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) @@ -109,155 +100,12 @@ func run(configPath string, dryRun bool) error { return nil } -// LambdaEvent is the input event for the Lambda function. -type LambdaEvent struct { - // DryRun, if true, only validates configuration without creating resources. - DryRun bool `json:"dryRun"` - // ConfigOverrides allows overriding configuration values. - ConfigOverrides *ConfigOverrides `json:"configOverrides"` -} - -// ConfigOverrides allows partial configuration overrides via the Lambda event. -type ConfigOverrides struct { - SnapshotIdentifier string `json:"snapshotIdentifier"` -} - -// LambdaResponse is the output response from the Lambda function. -type LambdaResponse struct { - Success bool `json:"success"` - Message string `json:"message"` - SnapshotID string `json:"snapshotId,omitempty"` - CloneID string `json:"cloneId,omitempty"` - CloneEndpoint string `json:"cloneEndpoint,omitempty"` - DurationSec int64 `json:"durationSeconds,omitempty"` - Error string `json:"error,omitempty"` -} - -// HandleLambda is the Lambda function handler. 
-func HandleLambda(ctx context.Context, event LambdaEvent) (LambdaResponse, error) { - logger := &DefaultLogger{} - - cfg, err := loadLambdaConfig() - if err != nil { - return LambdaResponse{ - Success: false, - Error: err.Error(), - Message: "failed to load configuration", - }, nil - } - - // Apply overrides - if event.ConfigOverrides != nil && event.ConfigOverrides.SnapshotIdentifier != "" { - cfg.Source.SnapshotIdentifier = event.ConfigOverrides.SnapshotIdentifier - } - - refresher, err := NewRefresher(ctx, cfg, logger) - if err != nil { - return LambdaResponse{ - Success: false, - Error: err.Error(), - Message: "failed to initialize refresher", - }, nil - } - - if event.DryRun { - if err := refresher.DryRun(ctx); err != nil { - return LambdaResponse{ - Success: false, - Error: err.Error(), - Message: "dry run failed", - }, nil - } - - return LambdaResponse{ - Success: true, - Message: "dry run completed successfully", - }, nil - } - - result := refresher.Run(ctx) - - resp := LambdaResponse{ - Success: result.Success, - SnapshotID: result.SnapshotID, - CloneID: result.CloneID, - CloneEndpoint: result.CloneEndpoint, - DurationSec: int64(result.Duration.Seconds()), - } - - if result.Error != nil { - resp.Error = result.Error.Error() - resp.Message = "refresh failed" - } else { - resp.Message = "refresh completed successfully" - } - - return resp, nil -} - -// loadLambdaConfig loads configuration from environment variables. 
-func loadLambdaConfig() (*Config, error) { - cfg := &Config{} - - // Source configuration - cfg.Source.Type = getEnvOrDefault("RDS_SOURCE_TYPE", "rds") - cfg.Source.Identifier = os.Getenv("RDS_SOURCE_IDENTIFIER") - cfg.Source.SnapshotIdentifier = os.Getenv("RDS_SNAPSHOT_IDENTIFIER") - - // Clone configuration - cfg.Clone.InstanceClass = os.Getenv("RDS_CLONE_INSTANCE_CLASS") - cfg.Clone.DBSubnetGroupName = os.Getenv("RDS_CLONE_SUBNET_GROUP") - - if sgJSON := os.Getenv("RDS_CLONE_SECURITY_GROUPS"); sgJSON != "" { - if err := json.Unmarshal([]byte(sgJSON), &cfg.Clone.VPCSecurityGroupIDs); err != nil { - return nil, fmt.Errorf("invalid RDS_CLONE_SECURITY_GROUPS JSON: %w", err) - } - } - - cfg.Clone.PubliclyAccessible = os.Getenv("RDS_CLONE_PUBLIC") == "true" - cfg.Clone.ParameterGroupName = os.Getenv("RDS_CLONE_PARAMETER_GROUP") - cfg.Clone.OptionGroupName = os.Getenv("RDS_CLONE_OPTION_GROUP") - cfg.Clone.DBClusterParameterGroupName = os.Getenv("RDS_CLONE_CLUSTER_PARAMETER_GROUP") - cfg.Clone.EnableIAMAuth = os.Getenv("RDS_CLONE_ENABLE_IAM_AUTH") == "true" - cfg.Clone.StorageType = os.Getenv("RDS_CLONE_STORAGE_TYPE") - - // Parse tags from JSON - if tagsJSON := os.Getenv("RDS_CLONE_TAGS"); tagsJSON != "" { - if err := json.Unmarshal([]byte(tagsJSON), &cfg.Clone.Tags); err != nil { - return nil, fmt.Errorf("invalid RDS_CLONE_TAGS JSON: %w", err) - } - } - - // DBLab configuration - cfg.DBLab.APIEndpoint = os.Getenv("DBLAB_API_ENDPOINT") - cfg.DBLab.Token = os.Getenv("DBLAB_TOKEN") - cfg.DBLab.Insecure = os.Getenv("DBLAB_INSECURE") == "true" - - // AWS configuration - cfg.AWS.Region = os.Getenv("AWS_REGION") - - if err := cfg.Validate(); err != nil { - return nil, err - } - - cfg.SetDefaults() - - return cfg, nil -} - -func getEnvOrDefault(key, defaultValue string) string { - if v := os.Getenv(key); v != "" { - return v - } - - return defaultValue -} - func printUsage() { fmt.Fprintf(os.Stderr, `rds-refresh - Automate DBLab full refresh using RDS/Aurora snapshots -This 
tool creates a temporary RDS/Aurora clone from a snapshot, triggers -a DBLab Engine full refresh, and then cleans up the temporary clone. +This tool creates a temporary RDS/Aurora clone from a snapshot, updates +DBLab Engine config with the clone endpoint, triggers a full refresh, +and then cleans up the temporary clone. USAGE: rds-refresh -config <path> [options] @@ -268,34 +116,29 @@ OPTIONS: -version Show version information -help Show this help message -LAMBDA MODE: - When running as an AWS Lambda function (detected via AWS_LAMBDA_FUNCTION_NAME - environment variable), configuration is loaded from environment variables: - - Required: - RDS_SOURCE_IDENTIFIER Source RDS instance or Aurora cluster ID - RDS_CLONE_INSTANCE_CLASS Instance class for the clone (e.g., db.t3.medium) - DBLAB_API_ENDPOINT DBLab Engine API endpoint - DBLAB_TOKEN DBLab verification token - AWS_REGION AWS region - - Optional: - RDS_SOURCE_TYPE "rds" or "aurora-cluster" (default: rds) - RDS_SNAPSHOT_IDENTIFIER Specific snapshot ID (default: latest) - RDS_CLONE_SUBNET_GROUP DB subnet group name - RDS_CLONE_SECURITY_GROUPS JSON array of security group IDs - RDS_CLONE_PUBLIC "true" to make clone publicly accessible - RDS_CLONE_PARAMETER_GROUP DB parameter group name - RDS_CLONE_ENABLE_IAM_AUTH "true" to enable IAM authentication - RDS_CLONE_STORAGE_TYPE Storage type (gp2, gp3, io1, etc.) - RDS_CLONE_TAGS JSON object of additional tags - DBLAB_INSECURE "true" to skip TLS verification +DEPLOYMENT: + This tool is designed to run as a container (Docker, ECS Task, Kubernetes Job) + or directly from the command line. The refresh process can take 1-4 hours + depending on database size, so long-running execution environments are required. + + Docker: + docker run -v /path/to/config.yaml:/config.yaml \ + postgres-ai/rds-refresh -config /config.yaml + + ECS Task / Kubernetes Job: + Schedule as a periodic task (e.g., daily) using your orchestration platform.
+ + Cron: + 0 2 * * * /usr/local/bin/rds-refresh -config /etc/rds-refresh/config.yaml EXAMPLE CONFIGURATION: source: - type: rds + type: rds # or "aurora-cluster" identifier: production-db + dbName: myapp + username: postgres + password: ${DB_PASSWORD} # supports environment variable expansion clone: instanceClass: db.t3.medium @@ -303,7 +146,6 @@ EXAMPLE CONFIGURATION: securityGroups: - sg-12345678 publiclyAccessible: false - enableIAMAuth: true dblab: apiEndpoint: https://dblab.example.com:2345 @@ -314,6 +156,17 @@ EXAMPLE CONFIGURATION: aws: region: us-east-1 +WORKFLOW: + 1. Verifies DBLab is healthy and not already refreshing + 2. Gets source database info from RDS/Aurora + 3. Finds the latest automated snapshot + 4. Creates a temporary RDS clone from the snapshot + 5. Waits for the clone to be available (10-30 minutes) + 6. Updates DBLab config with the clone endpoint + 7. Triggers DBLab full refresh + 8. Waits for refresh to complete (1-4 hours) + 9. Deletes the temporary clone + For more information, see: https://postgres.ai/docs/database-lab-engine diff --git a/rds-refresh/refresher.go b/rds-refresh/refresher.go index 74741f78..803a6812 100644 --- a/rds-refresh/refresher.go +++ b/rds-refresh/refresher.go @@ -66,7 +66,7 @@ func NewRefresher(ctx context.Context, cfg *Config, logger Logger) (*Refresher, return nil, fmt.Errorf("failed to create RDS client: %w", err) } - dblabClient := NewDBLabClient(&cfg.DBLab) + dblabClient := NewDBLabClient(&cfg.DBLab, logger) return &Refresher{ cfg: cfg, @@ -78,12 +78,14 @@ func NewRefresher(ctx context.Context, cfg *Config, logger Logger) (*Refresher, // Run executes the full refresh workflow: // 1. Verifies DBLab is healthy and not already refreshing -// 2. Finds the latest snapshot -// 3. Creates a temporary clone from the snapshot -// 4. Waits for the clone to be available -// 5. Triggers DBLab full refresh -// 6. Waits for refresh to complete -// 7. Deletes the temporary clone +// 2. Gets source database info +// 3. 
Finds the latest snapshot +// 4. Creates a temporary clone from the snapshot +// 5. Waits for the clone to be available +// 6. Updates DBLab config with the clone endpoint +// 7. Triggers DBLab full refresh +// 8. Waits for refresh to complete +// 9. Deletes the temporary clone func (r *Refresher) Run(ctx context.Context) *RefreshResult { result := &RefreshResult{ StartTime: time.Now(), @@ -170,7 +172,24 @@ func (r *Refresher) Run(ctx context.Context) *RefreshResult { result.CloneEndpoint = clone.Endpoint r.logger.Info("Clone available at: %s:%d", clone.Endpoint, clone.Port) - // Step 6: Trigger DBLab full refresh + // Step 6: Update DBLab config with clone endpoint + r.logger.Info("Updating DBLab source config with clone endpoint...") + + if err := r.dblab.UpdateSourceConfig( + ctx, + clone.Endpoint, + int(clone.Port), + r.cfg.Source.DBName, + r.cfg.Source.Username, + r.cfg.Source.Password, + ); err != nil { + result.Error = fmt.Errorf("failed to update DBLab config: %w", err) + return result + } + + r.logger.Info("DBLab config updated successfully") + + // Step 7: Trigger DBLab full refresh r.logger.Info("Triggering DBLab full refresh...") if err := r.dblab.TriggerFullRefresh(ctx); err != nil { diff --git a/rds-refresh/template.yaml b/rds-refresh/template.yaml deleted file mode 100644 index c14dfd9a..00000000 --- a/rds-refresh/template.yaml +++ /dev/null @@ -1,241 +0,0 @@ -AWSTemplateFormatVersion: '2010-09-09' -Transform: AWS::Serverless-2016-10-31 -Description: > - DBLab RDS/Aurora Refresh Lambda - - Automates DBLab full refresh using temporary RDS/Aurora clones created from snapshots. 
- -Metadata: - AWS::ServerlessRepo::Application: - Name: dblab-rds-refresh - Description: Automates DBLab full refresh using temporary RDS/Aurora clones - Author: Postgres.ai - SpdxLicenseId: Apache-2.0 - Labels: ['dblab', 'rds', 'aurora', 'postgresql', 'database'] - HomePageUrl: https://postgres.ai - SourceCodeUrl: https://github.com/postgres-ai/rds-refresh - -Parameters: - # Source Configuration - RDSSourceType: - Type: String - Default: rds - AllowedValues: - - rds - - aurora-cluster - Description: Type of source database (rds for RDS instance, aurora-cluster for Aurora) - - RDSSourceIdentifier: - Type: String - Description: RDS DB instance identifier or Aurora cluster identifier - - RDSSnapshotIdentifier: - Type: String - Default: '' - Description: Specific snapshot ID to use (leave empty for latest automated snapshot) - - # Clone Configuration - RDSCloneInstanceClass: - Type: String - Default: db.t3.medium - Description: Instance class for the temporary clone - - RDSCloneSubnetGroup: - Type: String - Default: '' - Description: DB subnet group name for the clone - - RDSCloneSecurityGroups: - Type: String - Default: '' - Description: JSON array of VPC security group IDs (e.g., '["sg-123", "sg-456"]') - - RDSClonePubliclyAccessible: - Type: String - Default: 'false' - AllowedValues: - - 'true' - - 'false' - Description: Whether the clone should be publicly accessible - - RDSCloneEnableIAMAuth: - Type: String - Default: 'true' - AllowedValues: - - 'true' - - 'false' - Description: Enable IAM database authentication on the clone - - RDSCloneParameterGroup: - Type: String - Default: '' - Description: DB parameter group name for the clone - - RDSCloneStorageType: - Type: String - Default: '' - Description: Storage type for the clone (gp2, gp3, io1, etc.) 
- - # DBLab Configuration - DBLabAPIEndpoint: - Type: String - Description: DBLab Engine API endpoint (e.g., https://dblab.example.com:2345) - - DBLabToken: - Type: String - NoEcho: true - Description: DBLab verification token - - DBLabInsecure: - Type: String - Default: 'false' - AllowedValues: - - 'true' - - 'false' - Description: Skip TLS certificate verification for DBLab API - - # Schedule Configuration - ScheduleExpression: - Type: String - Default: 'rate(7 days)' - Description: Schedule expression for automatic refresh (e.g., 'rate(7 days)' or 'cron(0 2 ? * SUN *)') - - EnableSchedule: - Type: String - Default: 'true' - AllowedValues: - - 'true' - - 'false' - Description: Enable scheduled automatic refresh - - # Lambda Configuration - LambdaTimeout: - Type: Number - Default: 900 - MinValue: 60 - MaxValue: 900 - Description: Lambda function timeout in seconds (max 15 minutes) - - LambdaMemorySize: - Type: Number - Default: 256 - MinValue: 128 - MaxValue: 1024 - Description: Lambda function memory size in MB - -Conditions: - ScheduleEnabled: !Equals [!Ref EnableSchedule, 'true'] - HasSubnetGroup: !Not [!Equals [!Ref RDSCloneSubnetGroup, '']] - HasSecurityGroups: !Not [!Equals [!Ref RDSCloneSecurityGroups, '']] - HasParameterGroup: !Not [!Equals [!Ref RDSCloneParameterGroup, '']] - HasStorageType: !Not [!Equals [!Ref RDSCloneStorageType, '']] - HasSnapshotId: !Not [!Equals [!Ref RDSSnapshotIdentifier, '']] - -Globals: - Function: - Timeout: !Ref LambdaTimeout - MemorySize: !Ref LambdaMemorySize - Runtime: provided.al2023 - Architectures: - - arm64 - -Resources: - RDSRefreshFunction: - Type: AWS::Serverless::Function - Metadata: - BuildMethod: go1.x - Properties: - CodeUri: . 
- Handler: bootstrap - Description: Automates DBLab full refresh using temporary RDS/Aurora clones - Environment: - Variables: - RDS_SOURCE_TYPE: !Ref RDSSourceType - RDS_SOURCE_IDENTIFIER: !Ref RDSSourceIdentifier - RDS_SNAPSHOT_IDENTIFIER: !If [HasSnapshotId, !Ref RDSSnapshotIdentifier, ''] - RDS_CLONE_INSTANCE_CLASS: !Ref RDSCloneInstanceClass - RDS_CLONE_SUBNET_GROUP: !If [HasSubnetGroup, !Ref RDSCloneSubnetGroup, ''] - RDS_CLONE_SECURITY_GROUPS: !If [HasSecurityGroups, !Ref RDSCloneSecurityGroups, ''] - RDS_CLONE_PUBLIC: !Ref RDSClonePubliclyAccessible - RDS_CLONE_ENABLE_IAM_AUTH: !Ref RDSCloneEnableIAMAuth - RDS_CLONE_PARAMETER_GROUP: !If [HasParameterGroup, !Ref RDSCloneParameterGroup, ''] - RDS_CLONE_STORAGE_TYPE: !If [HasStorageType, !Ref RDSCloneStorageType, ''] - DBLAB_API_ENDPOINT: !Ref DBLabAPIEndpoint - DBLAB_TOKEN: !Ref DBLabToken - DBLAB_INSECURE: !Ref DBLabInsecure - Policies: - - Version: '2012-10-17' - Statement: - - Sid: RDSReadSnapshots - Effect: Allow - Action: - - rds:DescribeDBSnapshots - - rds:DescribeDBClusterSnapshots - - rds:DescribeDBInstances - - rds:DescribeDBClusters - Resource: '*' - - Sid: RDSCreateClone - Effect: Allow - Action: - - rds:RestoreDBInstanceFromDBSnapshot - - rds:RestoreDBClusterFromSnapshot - - rds:CreateDBInstance - - rds:AddTagsToResource - - rds:ModifyDBInstance - - rds:ModifyDBCluster - Resource: - - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:db:dblab-refresh-*' - - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:cluster:dblab-refresh-*' - - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:snapshot:*' - - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:cluster-snapshot:*' - - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:subgrp:*' - - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:pg:*' - - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:og:*' - - Sid: RDSDeleteClone - Effect: Allow - Action: - - rds:DeleteDBInstance - - rds:DeleteDBCluster - Resource: - - !Sub 
'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:db:dblab-refresh-*' - - !Sub 'arn:aws:rds:${AWS::Region}:${AWS::AccountId}:cluster:dblab-refresh-*' - Events: - ScheduledRefresh: - Type: Schedule - Properties: - Schedule: !Ref ScheduleExpression - Description: Scheduled DBLab refresh trigger - Enabled: !If [ScheduleEnabled, true, false] - - RDSRefreshLogGroup: - Type: AWS::Logs::LogGroup - Properties: - LogGroupName: !Sub '/aws/lambda/${RDSRefreshFunction}' - RetentionInDays: 30 - -Outputs: - RDSRefreshFunctionArn: - Description: ARN of the RDS Refresh Lambda function - Value: !GetAtt RDSRefreshFunction.Arn - Export: - Name: !Sub '${AWS::StackName}-FunctionArn' - - RDSRefreshFunctionName: - Description: Name of the RDS Refresh Lambda function - Value: !Ref RDSRefreshFunction - - InvocationCommand: - Description: AWS CLI command to manually invoke the function - Value: !Sub | - aws lambda invoke --function-name ${RDSRefreshFunction} \ - --cli-binary-format raw-in-base64-out \ - --payload '{"dryRun": false}' \ - response.json && cat response.json - - DryRunCommand: - Description: AWS CLI command to run a dry-run test - Value: !Sub | - aws lambda invoke --function-name ${RDSRefreshFunction} \ - --cli-binary-format raw-in-base64-out \ - --payload '{"dryRun": true}' \ - response.json && cat response.json From cb8223a7ea93a19dbe8bcd849dae7d43b1eba81a Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 11 Dec 2025 05:37:28 +0000 Subject: [PATCH 5/6] fix(rds-refresh): use correct DBLab config API format DBLab expects a flat JSON structure matching ConfigProjection fields, not a nested structure. The API automatically reloads the config after update (calls reloadFn internally), so no SIGHUP is needed. 
Changed from nested: {"retrieval": {"dbSource": {"host": "..."}}} To flat projection format: {"host": "...", "port": 5432, "dbname": "...", ...} --- rds-refresh/dblab.go | 48 +++++++++++++------------------------------- 1 file changed, 14 insertions(+), 34 deletions(-) diff --git a/rds-refresh/dblab.go b/rds-refresh/dblab.go index af78df96..1eb09440 100644 --- a/rds-refresh/dblab.go +++ b/rds-refresh/dblab.go @@ -73,22 +73,13 @@ type APIError struct { } // ConfigUpdateRequest represents a request to update DBLab config. +// Uses flat structure matching DBLab's ConfigProjection fields. type ConfigUpdateRequest struct { - Retrieval *RetrievalConfigUpdate `json:"retrieval,omitempty"` -} - -// RetrievalConfigUpdate represents retrieval config fields to update. -type RetrievalConfigUpdate struct { - DBSource *DBSourceConfig `json:"dbSource,omitempty"` -} - -// DBSourceConfig represents the source database connection config. -type DBSourceConfig struct { - Host string `json:"host,omitempty"` - Port int `json:"port,omitempty"` - DBName string `json:"dbname,omitempty"` - Username string `json:"username,omitempty"` - Password string `json:"password,omitempty"` + Host *string `json:"host,omitempty"` + Port *int64 `json:"port,omitempty"` + DBName *string `json:"dbname,omitempty"` + Username *string `json:"username,omitempty"` + Password *string `json:"password,omitempty"` } // DBLabClient provides methods to interact with the DBLab Engine API. @@ -227,17 +218,15 @@ func (c *DBLabClient) Health(ctx context.Context) error { } // UpdateSourceConfig updates the source database connection in DBLab config. +// DBLab automatically reloads the configuration after the update. 
func (c *DBLabClient) UpdateSourceConfig(ctx context.Context, host string, port int, dbname, username, password string) error { + port64 := int64(port) updateReq := ConfigUpdateRequest{ - Retrieval: &RetrievalConfigUpdate{ - DBSource: &DBSourceConfig{ - Host: host, - Port: port, - DBName: dbname, - Username: username, - Password: password, - }, - }, + Host: &host, + Port: &port64, + DBName: &dbname, + Username: &username, + Password: &password, } bodyBytes, err := json.Marshal(updateReq) @@ -245,21 +234,12 @@ func (c *DBLabClient) UpdateSourceConfig(ctx context.Context, host string, port return fmt.Errorf("failed to marshal config update: %w", err) } - resp, err := c.doRequest(ctx, http.MethodPatch, "/admin/config", bytes.NewReader(bodyBytes)) + resp, err := c.doRequest(ctx, http.MethodPut, "/admin/config", bytes.NewReader(bodyBytes)) if err != nil { return fmt.Errorf("failed to update DBLab config: %w", err) } defer resp.Body.Close() - var result APIResponse - if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { - return fmt.Errorf("failed to decode config update response: %w", err) - } - - if result.Status != "OK" { - return fmt.Errorf("config update failed: %s", result.Message) - } - return nil } From ec74bc61bbca01f1d29d1bfabe485c197d4d1271 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 12 Dec 2025 05:48:24 +0000 Subject: [PATCH 6/6] fix(rds-refresh): prevent race condition in WaitForRefreshComplete The function could return success prematurely if DBLab's status still showed StatusFinished from a previous refresh. Since /full-refresh triggers the operation in a goroutine, polling immediately after could see stale status. Now tracks whether refresh has actually started before accepting StatusFinished as success. This prevents premature clone deletion. 
--- rds-refresh/dblab.go | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/rds-refresh/dblab.go b/rds-refresh/dblab.go index 1eb09440..ac773270 100644 --- a/rds-refresh/dblab.go +++ b/rds-refresh/dblab.go @@ -146,6 +146,9 @@ func (c *DBLabClient) TriggerFullRefresh(ctx context.Context) error { } // WaitForRefreshComplete polls the DBLab status until refresh is complete or timeout. +// It first waits for the refresh to start (status changes from finished/inactive), +// then waits for it to complete. This prevents race conditions where stale status +// from a previous refresh could cause premature return. func (c *DBLabClient) WaitForRefreshComplete(ctx context.Context, pollInterval, timeout time.Duration) error { ticker := time.NewTicker(pollInterval) defer ticker.Stop() @@ -153,11 +156,16 @@ func (c *DBLabClient) WaitForRefreshComplete(ctx context.Context, pollInterval, timeoutTimer := time.NewTimer(timeout) defer timeoutTimer.Stop() + refreshStarted := false + for { select { case <-ctx.Done(): return ctx.Err() case <-timeoutTimer.C: + if !refreshStarted { + return fmt.Errorf("timeout waiting for refresh to start after %v", timeout) + } return fmt.Errorf("timeout waiting for refresh to complete after %v", timeout) case <-ticker.C: status, err := c.GetStatus(ctx) @@ -168,21 +176,34 @@ func (c *DBLabClient) WaitForRefreshComplete(ctx context.Context, pollInterval, retrievalStatus := status.Retrieving.Status switch retrievalStatus { + case StatusRefreshing, StatusSnapshotting, StatusRenewed, StatusPending: + // refresh is in progress - mark as started + refreshStarted = true + continue case StatusFinished: + if !refreshStarted { + // still showing old status, refresh hasn't started yet + continue + } + // refresh started and now finished return nil case StatusFailed: + if !refreshStarted { + // old failure status, refresh hasn't started yet + continue + } if len(status.Retrieving.Alerts) > 0 { for _, alert := range 
status.Retrieving.Alerts { return fmt.Errorf("refresh failed: %s", alert.Message) } } - return fmt.Errorf("refresh failed (no details available)") - case StatusRefreshing, StatusSnapshotting, StatusRenewed: - // still in progress - continue - case StatusInactive, StatusPending: - // not started yet or pending + case StatusInactive: + if refreshStarted { + // was running but now inactive - unusual, treat as failure + return fmt.Errorf("refresh stopped unexpectedly (status: inactive)") + } + // not started yet continue default: continue