From b2ca59490b32ca112b69ba25006104782cbaf455 Mon Sep 17 00:00:00 2001 From: Simone Scarduzio Date: Sun, 12 Oct 2025 22:41:48 +0200 Subject: [PATCH] feat: Add EC2 region detection and cost optimization features --- CHANGELOG.md | 12 ++ README.md | 6 + docs/EC2_REGION_DETECTION.md | 242 +++++++++++++++++++++++ pyproject.toml | 2 + src/deltaglider/adapters/__init__.py | 14 +- src/deltaglider/adapters/ec2_metadata.py | 126 ++++++++++++ src/deltaglider/app/cli/aws_compat.py | 99 ++++++++++ src/deltaglider/app/cli/main.py | 11 +- 8 files changed, 504 insertions(+), 8 deletions(-) create mode 100644 docs/EC2_REGION_DETECTION.md create mode 100644 src/deltaglider/adapters/ec2_metadata.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 0cbcdf0..9a9359e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- **EC2 Region Detection & Cost Optimization** + - Automatic detection of EC2 instance region using IMDSv2 + - Warns when EC2 region ≠ S3 client region (potential cross-region charges) + - Different warnings for auto-detected vs. 
explicit `--region` flag mismatches + - Green checkmark when regions are aligned (optimal configuration) + - Can be disabled with `DG_DISABLE_EC2_DETECTION=true` environment variable + - Helps users optimize for cost and performance before migration starts - **New CLI Command**: `deltaglider migrate` for S3-to-S3 bucket migration with compression - Supports resume capability (skips already migrated files) - Real-time progress tracking with file count and statistics @@ -16,6 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Dry run mode with `--dry-run` flag - Include/exclude pattern filtering - Shows compression statistics after migration + - **EC2-aware region logging**: Detects EC2 instance and warns about cross-region charges - **FIXED**: Now correctly preserves original filenames during migration - **S3-to-S3 Recursive Copy**: `deltaglider cp -r s3://source/ s3://dest/` now supported - Automatically uses migration functionality with prefix preservation @@ -32,6 +40,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Previously created files with temp names like `tmp1b9cpdsn.zip` - Now correctly uses original filenames from source S3 keys - Fixed by adding `override_name` parameter to `DeltaService.put()` +- **CLI Region Support**: `--region` flag now properly passes region to boto3 client + - Previously only set environment variable, relied on boto3 auto-detection + - Now explicitly passes `region_name` to `boto3.client()` via `boto3_kwargs` + - Ensures consistent behavior with `DeltaGliderClient` SDK ### Changed - Recursive S3-to-S3 copy operations now preserve source prefix structure by default diff --git a/README.md b/README.md index 82eb9a1..dd40d8d 100644 --- a/README.md +++ b/README.md @@ -558,6 +558,12 @@ deltaglider migrate --exclude "*.log" s3://old/ s3://new/ - **Resume Support**: Migration automatically skips files that already exist in the destination - **Progress 
Tracking**: Shows real-time migration progress and statistics - **Safety First**: Interactive confirmation shows file count before starting +- **EC2 Cost Optimization**: Automatically detects EC2 instance region and warns about cross-region charges + - ✅ Green checkmark when regions align (no extra charges) + - ℹ️ INFO when auto-detected mismatch (suggests optimal region) + - ⚠️ WARNING when user explicitly set wrong `--region` (expect data transfer costs) + - Disable with `DG_DISABLE_EC2_DETECTION=true` if needed +- **AWS Region Transparency**: Displays the actual AWS region being used - **Prefix Preservation**: By default, source prefix is preserved in destination (use `--no-preserve-prefix` to disable) - **S3-to-S3 Transfer**: Both regular S3 and DeltaGlider buckets supported diff --git a/docs/EC2_REGION_DETECTION.md b/docs/EC2_REGION_DETECTION.md new file mode 100644 index 0000000..5ffbc2a --- /dev/null +++ b/docs/EC2_REGION_DETECTION.md @@ -0,0 +1,242 @@ +# EC2 Region Detection & Cost Optimization + +DeltaGlider automatically detects when you're running on an EC2 instance and warns you about potential cross-region data transfer charges. + +## Overview + +When running `deltaglider migrate` on an EC2 instance, DeltaGlider: + +1. **Detects EC2 Environment**: Uses IMDSv2 (Instance Metadata Service v2) to determine if running on EC2 +2. **Retrieves Instance Region**: Gets the actual AWS region where your EC2 instance is running +3. **Compares Regions**: Checks if your EC2 region matches the S3 client region +4. 
**Warns About Costs**: Displays clear warnings when regions don't match + +## Why This Matters + +**AWS Cross-Region Data Transfer Costs**: +- **Same region**: No additional charges for data transfer +- **Cross-region**: $0.02 per GB transferred (can add up quickly for large migrations) +- **NAT Gateway**: Additional charges if going through NAT + +**Example Cost Impact**: +- Migrating 1TB from `us-east-1` EC2 → `us-west-2` S3 = ~$20 in data transfer charges +- Same migration within same region = $0 in data transfer charges + +## Output Examples + +### Scenario 1: Regions Aligned (Optimal) ✅ + +```bash +$ deltaglider migrate s3://old-bucket/ s3://new-bucket/ +EC2 Instance: us-east-1a +S3 Client Region: us-east-1 +✓ Regions aligned - no cross-region charges +Migrating from s3://old-bucket/ + to s3://new-bucket/ +... +``` + +**Result**: No warnings, optimal configuration, no extra charges. + +--- + +### Scenario 2: Auto-Detected Mismatch (INFO) ℹ️ + +```bash +$ deltaglider migrate s3://old-bucket/ s3://new-bucket/ +EC2 Instance: us-west-2a +S3 Client Region: us-east-1 + +ℹ️ INFO: EC2 region (us-west-2) differs from configured S3 region (us-east-1) + Consider using --region us-west-2 to avoid cross-region charges. + +Migrating from s3://old-bucket/ + to s3://new-bucket/ +... +``` + +**Result**: Informational warning, suggests optimal region. User didn't explicitly set wrong region, so it's likely from their AWS config. + +--- + +### Scenario 3: Explicit Region Override Mismatch (WARNING) ⚠️ + +```bash +$ deltaglider migrate --region us-east-1 s3://old-bucket/ s3://new-bucket/ +EC2 Instance: us-west-2a +S3 Client Region: us-east-1 + +⚠️ WARNING: EC2 region=us-west-2 != S3 client region=us-east-1 + Expect cross-region/NAT data charges. Align regions (set client region=us-west-2) + before proceeding. Or drop --region for automatic region resolution. + +Migrating from s3://old-bucket/ + to s3://new-bucket/ +... 
+``` + +**Result**: Strong warning because user explicitly set the wrong region with `--region` flag. They might not realize the cost implications. + +--- + +### Scenario 4: Not on EC2 + +```bash +$ deltaglider migrate s3://old-bucket/ s3://new-bucket/ +S3 Client Region: us-east-1 +Migrating from s3://old-bucket/ + to s3://new-bucket/ +... +``` + +**Result**: Simple region display, no EC2 warnings (not applicable). + +## Configuration + +### Disable EC2 Detection + +If you want to disable EC2 detection (e.g., for testing or if it causes issues): + +```bash +export DG_DISABLE_EC2_DETECTION=true +deltaglider migrate s3://old/ s3://new/ +``` + +Or in your script: + +```python +import os +os.environ["DG_DISABLE_EC2_DETECTION"] = "true" +``` + +### How It Works + +DeltaGlider uses **IMDSv2** (Instance Metadata Service v2) for security: + +1. **Token Request** (PUT with TTL): + ``` + PUT http://169.254.169.254/latest/api/token + X-aws-ec2-metadata-token-ttl-seconds: 21600 + ``` + +2. **Metadata Request** (GET with token): + ``` + GET http://169.254.169.254/latest/meta-data/placement/region + X-aws-ec2-metadata-token: <token> + ``` + +3. **Fast Timeout**: Each IMDS request uses a 1 second timeout, so detection adds at most about 1 second of delay when IMDS is unreachable (e.g., when not on EC2) + +### Security Notes + +- **IMDSv2 Only**: DeltaGlider uses the more secure IMDSv2, not the legacy IMDSv1 +- **No Credentials**: Only reads metadata, never accesses credentials +- **Graceful Fallback**: Silently skips detection if IMDS unavailable +- **No Network Impact**: Uses local-only IP (169.254.169.254), never leaves the instance + +## Best Practices + +### For Cost Optimization + +1. **Same Region**: Always try to keep EC2 instance and S3 bucket in the same region +2. **Check First**: Run with `--dry-run` to verify the setup before actual migration +3. **Use Auto-Detection**: Don't specify `--region` unless you have a specific reason +4. 
**Monitor Costs**: Use AWS Cost Explorer to track cross-region data transfer + +### For Terraform/IaC + +```hcl +# Good: EC2 and S3 in same region +resource "aws_instance" "app" { + region = "us-west-2" +} + +resource "aws_s3_bucket" "data" { + region = "us-west-2" # Same region +} +``` + +### For Multi-Region Setups + +If you MUST do cross-region transfers: + +1. **Use VPC Endpoints**: Reduce NAT Gateway costs +2. **Schedule Off-Peak**: AWS data transfer pricing does not depend on time of day, so schedule large transfers off-peak to avoid competing with production traffic, not to lower the bill +3. **Consider S3 Transfer Acceleration**: May be faster for long-distance transfers, but it adds a per-GB fee on top of normal data transfer charges +4. **Batch Operations**: Minimize number of API calls + +## Technical Details + +### EC2MetadataAdapter + +Location: `src/deltaglider/adapters/ec2_metadata.py` + +Key methods: +- `is_running_on_ec2()`: Detects EC2 environment +- `get_region()`: Returns AWS region code (e.g., "us-east-1") +- `get_availability_zone()`: Returns AZ (e.g., "us-east-1a") + +### Region Logging + +Location: `src/deltaglider/app/cli/aws_compat.py` + +Function: `log_aws_region(service, region_override=False)` + +Logic: +- If not EC2: Show S3 region only +- If EC2 + regions match: Green checkmark ✅ +- If EC2 + auto-detected mismatch: Cyan INFO ℹ️ +- If EC2 + `--region` mismatch: Yellow WARNING ⚠️ + +## Troubleshooting + +### "Cannot connect to IMDS" + +**Cause**: Network policy blocks access to 169.254.169.254 + +**Solution**: +```bash +# Test IMDS connectivity +TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" \ + -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") +curl -H "X-aws-ec2-metadata-token: $TOKEN" \ + http://169.254.169.254/latest/meta-data/placement/region + +# If it fails, disable detection +export DG_DISABLE_EC2_DETECTION=true +``` + +### "Wrong region detected" + +**Cause**: Cached metadata or race condition + +**Solution**: DeltaGlider caches metadata for performance. Restart the process to refresh. 
+ +### "Warning appears but I want cross-region" + +**Cause**: You intentionally need cross-region transfer + +**Solution**: This is just a warning, not an error. The migration will proceed. The warning helps you confirm you understand the cost implications. + +## FAQ + +**Q: Does this slow down my migrations?** +A: No. EC2 detection happens once before migration starts (< 100ms on EC2; at most about 1 second, the IMDS timeout, when IMDS is unreachable). It doesn't affect migration performance. + +**Q: What if I'm not on EC2 but the detection is slow?** +A: The timeout is 1 second. If IMDS is unreachable, it fails fast. Disable with `DG_DISABLE_EC2_DETECTION=true`. + +**Q: Does this work on Fargate/ECS/Lambda?** +A: Only where the instance metadata endpoint (169.254.169.254) is reachable. That is the case on EC2 instances, and on ECS tasks running on EC2 container instances when the task is allowed to reach IMDS. Fargate and Lambda do not expose IMDS, so detection is skipped there and only the S3 client region is shown. + +**Q: Can I use this with LocalStack/MinIO?** +A: Yes. Note that `--endpoint-url` does not disable EC2 detection by itself: when running on EC2, the region comparison is still printed. Set `DG_DISABLE_EC2_DETECTION=true` to suppress it for non-AWS S3 endpoints. + +**Q: Will this detect VPC endpoints?** +A: No. VPC endpoints don't change the "region" from an EC2 perspective. The warning still applies if regions don't match. 
+ +## Related Documentation + +- [AWS Data Transfer Pricing](https://aws.amazon.com/ec2/pricing/on-demand/#Data_Transfer) +- [AWS IMDSv2 Documentation](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html) +- [S3 Transfer Costs](https://aws.amazon.com/s3/pricing/) diff --git a/pyproject.toml b/pyproject.toml index 643fc91..4a10f2a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,7 @@ dependencies = [ "click>=8.1.0", "cryptography>=42.0.0", "python-dateutil>=2.9.0", + "requests>=2.32.0", ] [project.urls] @@ -109,6 +110,7 @@ dev-dependencies = [ "mypy>=1.13.0", "boto3-stubs[s3]>=1.35.0", "types-python-dateutil>=2.9.0", + "types-requests>=2.32.0", "setuptools-scm>=8.0.0", ] diff --git a/src/deltaglider/adapters/__init__.py b/src/deltaglider/adapters/__init__.py index e187d89..9fe3d28 100644 --- a/src/deltaglider/adapters/__init__.py +++ b/src/deltaglider/adapters/__init__.py @@ -6,20 +6,22 @@ from .cache_fs import FsCacheAdapter from .cache_memory import MemoryCache from .clock_utc import UtcClockAdapter from .diff_xdelta import XdeltaAdapter +from .ec2_metadata import EC2MetadataAdapter from .hash_sha import Sha256Adapter from .logger_std import StdLoggerAdapter from .metrics_noop import NoopMetricsAdapter from .storage_s3 import S3StorageAdapter __all__ = [ - "S3StorageAdapter", - "XdeltaAdapter", - "Sha256Adapter", - "FsCacheAdapter", "ContentAddressedCache", + "EC2MetadataAdapter", "EncryptedCache", + "FsCacheAdapter", "MemoryCache", - "UtcClockAdapter", - "StdLoggerAdapter", "NoopMetricsAdapter", + "S3StorageAdapter", + "Sha256Adapter", + "StdLoggerAdapter", + "UtcClockAdapter", + "XdeltaAdapter", ] diff --git a/src/deltaglider/adapters/ec2_metadata.py b/src/deltaglider/adapters/ec2_metadata.py new file mode 100644 index 0000000..828f28d --- /dev/null +++ b/src/deltaglider/adapters/ec2_metadata.py @@ -0,0 +1,126 @@ +"""EC2 Instance Metadata Service (IMDS) adapter. 
+ +Provides access to EC2 instance metadata using IMDSv2 with token-based authentication. +Falls back gracefully when not running on EC2. +""" + +import os + +import requests + + +class EC2MetadataAdapter: + """Adapter for EC2 Instance Metadata Service (IMDSv2).""" + + IMDS_BASE_URL = "http://169.254.169.254/latest" + TOKEN_URL = f"{IMDS_BASE_URL}/api/token" + TOKEN_TTL_SECONDS = 21600 # 6 hours + TOKEN_HEADER = "X-aws-ec2-metadata-token" + TIMEOUT_SECONDS = 1 # Fast timeout for non-EC2 environments + + def __init__(self) -> None: + """Initialize EC2 metadata adapter.""" + self._token: str | None = None + self._is_ec2: bool | None = None + self._region: str | None = None + + def is_running_on_ec2(self) -> bool: + """Check if running on an EC2 instance. + + Returns: + True if running on EC2, False otherwise + + Note: + Result is cached after first check for performance. + """ + if self._is_ec2 is not None: + return self._is_ec2 + + # Skip check if explicitly disabled + if os.environ.get("DG_DISABLE_EC2_DETECTION", "").lower() in ("true", "1", "yes"): + self._is_ec2 = False + return False + + try: + # Try to get IMDSv2 token + self._token = self._get_token() + self._is_ec2 = self._token is not None + except Exception: + self._is_ec2 = False + + return self._is_ec2 + + def get_region(self) -> str | None: + """Get the EC2 instance's AWS region. + + Returns: + AWS region code (e.g., "us-east-1") or None if not on EC2 + + Note: + Result is cached after first successful fetch. 
+ """ + if not self.is_running_on_ec2(): + return None + + if self._region is not None: + return self._region + + try: + if self._token: + response = requests.get( + f"{self.IMDS_BASE_URL}/meta-data/placement/region", + headers={self.TOKEN_HEADER: self._token}, + timeout=self.TIMEOUT_SECONDS, + ) + if response.status_code == 200: + self._region = response.text.strip() + return self._region + except Exception: + pass + + return None + + def get_availability_zone(self) -> str | None: + """Get the EC2 instance's availability zone. + + Returns: + Availability zone (e.g., "us-east-1a") or None if not on EC2 + """ + if not self.is_running_on_ec2(): + return None + + try: + if self._token: + response = requests.get( + f"{self.IMDS_BASE_URL}/meta-data/placement/availability-zone", + headers={self.TOKEN_HEADER: self._token}, + timeout=self.TIMEOUT_SECONDS, + ) + if response.status_code == 200: + return str(response.text.strip()) + except Exception: + pass + + return None + + def _get_token(self) -> str | None: + """Get IMDSv2 token for authenticated metadata requests. + + Returns: + IMDSv2 token or None if unable to retrieve + + Note: + Uses IMDSv2 for security. IMDSv1 is not supported. + """ + try: + response = requests.put( + self.TOKEN_URL, + headers={"X-aws-ec2-metadata-token-ttl-seconds": str(self.TOKEN_TTL_SECONDS)}, + timeout=self.TIMEOUT_SECONDS, + ) + if response.status_code == 200: + return response.text.strip() + except Exception: + pass + + return None diff --git a/src/deltaglider/app/cli/aws_compat.py b/src/deltaglider/app/cli/aws_compat.py index 611fd7b..0859efb 100644 --- a/src/deltaglider/app/cli/aws_compat.py +++ b/src/deltaglider/app/cli/aws_compat.py @@ -17,9 +17,87 @@ __all__ = [ "copy_s3_to_s3", "migrate_s3_to_s3", "handle_recursive", + "log_aws_region", ] +def log_aws_region(service: DeltaService, region_override: bool = False) -> None: + """Log the AWS region being used and warn about cross-region charges. + + This function: + 1. 
Detects if running on EC2 + 2. Compares EC2 region with S3 client region + 3. Warns about potential cross-region data transfer charges + 4. Helps users optimize for cost and performance + + Args: + service: DeltaService instance with storage adapter + region_override: True if user explicitly specified --region flag + """ + try: + from ...adapters.ec2_metadata import EC2MetadataAdapter + from ...adapters.storage_s3 import S3StorageAdapter + + if not isinstance(service.storage, S3StorageAdapter): + return # Not using S3 storage, skip + + # Get S3 client region + s3_region = service.storage.client.meta.region_name + if not s3_region: + s3_region = "us-east-1" # boto3 default + + # Check if running on EC2 + ec2_metadata = EC2MetadataAdapter() + if ec2_metadata.is_running_on_ec2(): + ec2_region = ec2_metadata.get_region() + ec2_az = ec2_metadata.get_availability_zone() + + # Log EC2 context + click.echo(f"EC2 Instance: {ec2_az or ec2_region or 'unknown'}") + click.echo(f"S3 Client Region: {s3_region}") + + # Check for region mismatch + if ec2_region and ec2_region != s3_region: + if region_override: + # User explicitly set --region, warn about costs + click.echo("") + click.secho( + f"⚠️ WARNING: EC2 region={ec2_region} != S3 client region={s3_region}", + fg="yellow", + bold=True, + ) + click.secho( + f" Expect cross-region/NAT data charges. Align regions (set client region={ec2_region})", + fg="yellow", + ) + click.secho( + " before proceeding. 
Or drop --region for automatic region resolution.", + fg="yellow", + ) + click.echo("") + else: + # Auto-detected mismatch, but user can still cancel + click.echo("") + click.secho( + f"ℹ️ INFO: EC2 region ({ec2_region}) differs from configured S3 region ({s3_region})", + fg="cyan", + ) + click.secho( + f" Consider using --region {ec2_region} to avoid cross-region charges.", + fg="cyan", + ) + click.echo("") + elif ec2_region and ec2_region == s3_region: + # Regions match - optimal configuration + click.secho("✓ Regions aligned - no cross-region charges", fg="green") + else: + # Not on EC2, just show S3 region + click.echo(f"S3 Client Region: {s3_region}") + + except Exception: + pass # Silently ignore errors getting region info + + def is_s3_path(path: str) -> bool: """Check if path is an S3 URL.""" return path.startswith("s3://") @@ -239,6 +317,7 @@ def migrate_s3_to_s3( dry_run: bool = False, skip_confirm: bool = False, preserve_prefix: bool = True, + region_override: bool = False, ) -> None: """Migrate objects from one S3 location to another with delta compression. 
@@ -247,6 +326,21 @@ def migrate_s3_to_s3( - Progress tracking: Shows migration progress - Confirmation prompt: Shows file count before starting - Prefix preservation: Optionally preserves source prefix structure in destination + - EC2 region detection: Warns about cross-region data transfer charges + + Args: + service: DeltaService instance + source_url: Source S3 URL + dest_url: Destination S3 URL + exclude: Pattern to exclude files + include: Pattern to include files + quiet: Suppress output + no_delta: Disable delta compression + max_ratio: Maximum delta/file ratio + dry_run: Show what would be migrated without migrating + skip_confirm: Skip confirmation prompt + preserve_prefix: Preserve source prefix in destination + region_override: True if user explicitly specified --region flag """ import fnmatch @@ -269,6 +363,10 @@ def migrate_s3_to_s3( effective_dest_prefix = (dest_prefix or "") + source_prefix_name + "/" if not quiet: + # Log AWS region being used (helps users verify their configuration) + # Pass region_override to warn about cross-region charges if user explicitly set --region + log_aws_region(service, region_override=region_override) + if preserve_prefix and source_prefix: click.echo(f"Migrating from s3://{source_bucket}/{source_prefix}") click.echo(f" to s3://{dest_bucket}/{effective_dest_prefix}") @@ -530,4 +628,5 @@ def handle_recursive( dry_run=False, skip_confirm=True, # Don't prompt for cp command preserve_prefix=True, # Always preserve prefix for cp -r + region_override=False, # cp command doesn't track region override explicitly ) diff --git a/src/deltaglider/app/cli/main.py b/src/deltaglider/app/cli/main.py index 55f8b01..20ad833 100644 --- a/src/deltaglider/app/cli/main.py +++ b/src/deltaglider/app/cli/main.py @@ -7,6 +7,7 @@ import shutil import sys import tempfile from pathlib import Path +from typing import Any import click @@ -50,7 +51,7 @@ def create_service( # Register cleanup handler to remove cache on exit atexit.register(lambda: 
shutil.rmtree(cache_dir, ignore_errors=True)) - # Set AWS environment variables if provided + # Set AWS environment variables if provided (for compatibility with other AWS tools) if endpoint_url: os.environ["AWS_ENDPOINT_URL"] = endpoint_url if region: @@ -58,9 +59,14 @@ def create_service( if profile: os.environ["AWS_PROFILE"] = profile + # Build boto3_kwargs for explicit parameter passing (preferred over env vars) + boto3_kwargs: dict[str, Any] = {} + if region: + boto3_kwargs["region_name"] = region + # Create adapters hasher = Sha256Adapter() - storage = S3StorageAdapter(endpoint_url=endpoint_url) + storage = S3StorageAdapter(endpoint_url=endpoint_url, boto3_kwargs=boto3_kwargs) diff = XdeltaAdapter() # SECURITY: Configurable cache with encryption and backend selection @@ -730,6 +736,7 @@ def migrate( dry_run=dry_run, skip_confirm=yes, preserve_prefix=not no_preserve_prefix, + region_override=region is not None, # True if user explicitly specified --region ) except Exception as e: