feat: Add EC2 region detection and cost optimization features

This commit is contained in:
Simone Scarduzio
2025-10-12 22:41:48 +02:00
parent 4f56c4b600
commit b2ca59490b
8 changed files with 504 additions and 8 deletions

View File

@@ -6,20 +6,22 @@ from .cache_fs import FsCacheAdapter
from .cache_memory import MemoryCache
from .clock_utc import UtcClockAdapter
from .diff_xdelta import XdeltaAdapter
from .ec2_metadata import EC2MetadataAdapter
from .hash_sha import Sha256Adapter
from .logger_std import StdLoggerAdapter
from .metrics_noop import NoopMetricsAdapter
from .storage_s3 import S3StorageAdapter
# Public adapter API re-exported by this package.
# Kept alphabetically sorted with each name listed exactly once.
__all__ = [
    "ContentAddressedCache",
    "EC2MetadataAdapter",
    "EncryptedCache",
    "FsCacheAdapter",
    "MemoryCache",
    "NoopMetricsAdapter",
    "S3StorageAdapter",
    "Sha256Adapter",
    "StdLoggerAdapter",
    "UtcClockAdapter",
    "XdeltaAdapter",
]

View File

@@ -0,0 +1,126 @@
"""EC2 Instance Metadata Service (IMDS) adapter.
Provides access to EC2 instance metadata using IMDSv2 with token-based authentication.
Falls back gracefully when not running on EC2.
"""
import os
import requests
class EC2MetadataAdapter:
"""Adapter for EC2 Instance Metadata Service (IMDSv2)."""
IMDS_BASE_URL = "http://169.254.169.254/latest"
TOKEN_URL = f"{IMDS_BASE_URL}/api/token"
TOKEN_TTL_SECONDS = 21600 # 6 hours
TOKEN_HEADER = "X-aws-ec2-metadata-token"
TIMEOUT_SECONDS = 1 # Fast timeout for non-EC2 environments
def __init__(self) -> None:
"""Initialize EC2 metadata adapter."""
self._token: str | None = None
self._is_ec2: bool | None = None
self._region: str | None = None
def is_running_on_ec2(self) -> bool:
"""Check if running on an EC2 instance.
Returns:
True if running on EC2, False otherwise
Note:
Result is cached after first check for performance.
"""
if self._is_ec2 is not None:
return self._is_ec2
# Skip check if explicitly disabled
if os.environ.get("DG_DISABLE_EC2_DETECTION", "").lower() in ("true", "1", "yes"):
self._is_ec2 = False
return False
try:
# Try to get IMDSv2 token
self._token = self._get_token()
self._is_ec2 = self._token is not None
except Exception:
self._is_ec2 = False
return self._is_ec2
def get_region(self) -> str | None:
"""Get the EC2 instance's AWS region.
Returns:
AWS region code (e.g., "us-east-1") or None if not on EC2
Note:
Result is cached after first successful fetch.
"""
if not self.is_running_on_ec2():
return None
if self._region is not None:
return self._region
try:
if self._token:
response = requests.get(
f"{self.IMDS_BASE_URL}/meta-data/placement/region",
headers={self.TOKEN_HEADER: self._token},
timeout=self.TIMEOUT_SECONDS,
)
if response.status_code == 200:
self._region = response.text.strip()
return self._region
except Exception:
pass
return None
def get_availability_zone(self) -> str | None:
"""Get the EC2 instance's availability zone.
Returns:
Availability zone (e.g., "us-east-1a") or None if not on EC2
"""
if not self.is_running_on_ec2():
return None
try:
if self._token:
response = requests.get(
f"{self.IMDS_BASE_URL}/meta-data/placement/availability-zone",
headers={self.TOKEN_HEADER: self._token},
timeout=self.TIMEOUT_SECONDS,
)
if response.status_code == 200:
return str(response.text.strip())
except Exception:
pass
return None
def _get_token(self) -> str | None:
"""Get IMDSv2 token for authenticated metadata requests.
Returns:
IMDSv2 token or None if unable to retrieve
Note:
Uses IMDSv2 for security. IMDSv1 is not supported.
"""
try:
response = requests.put(
self.TOKEN_URL,
headers={"X-aws-ec2-metadata-token-ttl-seconds": str(self.TOKEN_TTL_SECONDS)},
timeout=self.TIMEOUT_SECONDS,
)
if response.status_code == 200:
return response.text.strip()
except Exception:
pass
return None

View File

@@ -17,9 +17,87 @@ __all__ = [
"copy_s3_to_s3",
"migrate_s3_to_s3",
"handle_recursive",
"log_aws_region",
]
def log_aws_region(service: DeltaService, region_override: bool = False) -> None:
    """Print the S3 client region and flag EC2/S3 region mismatches.

    When the service uses the S3 storage adapter, the S3 client region is
    echoed (falling back to "us-east-1" when the client reports none). If the
    process runs on EC2, the instance location is echoed as well and compared
    with the client region: a mismatch produces a yellow warning when the
    user forced ``--region`` or a cyan hint otherwise, and a match produces a
    green confirmation.

    This is purely informational: any error raised while gathering or
    printing region information is swallowed.

    Args:
        service: DeltaService instance with storage adapter
        region_override: True if user explicitly specified --region flag
    """
    try:
        from ...adapters.ec2_metadata import EC2MetadataAdapter
        from ...adapters.storage_s3 import S3StorageAdapter

        # Nothing to report for non-S3 storage backends.
        if not isinstance(service.storage, S3StorageAdapter):
            return

        # boto3 falls back to us-east-1 when no region is configured.
        client_region = service.storage.client.meta.region_name or "us-east-1"

        imds = EC2MetadataAdapter()
        if not imds.is_running_on_ec2():
            # Off EC2 there is nothing to compare against.
            click.echo(f"S3 Client Region: {client_region}")
            return

        instance_region = imds.get_region()
        instance_az = imds.get_availability_zone()

        click.echo(f"EC2 Instance: {instance_az or instance_region or 'unknown'}")
        click.echo(f"S3 Client Region: {client_region}")

        # Without a known instance region no comparison is possible.
        if not instance_region:
            return

        if instance_region == client_region:
            click.secho("✓ Regions aligned - no cross-region charges", fg="green")
            return

        if region_override:
            # The user chose the region explicitly: warn about the cost.
            notices = (
                (
                    f"⚠️ WARNING: EC2 region={instance_region} != S3 client region={client_region}",
                    {"fg": "yellow", "bold": True},
                ),
                (
                    f" Expect cross-region/NAT data charges. Align regions (set client region={instance_region})",
                    {"fg": "yellow"},
                ),
                (
                    " before proceeding. Or drop --region for automatic region resolution.",
                    {"fg": "yellow"},
                ),
            )
        else:
            # The region was resolved automatically: a softer hint suffices.
            notices = (
                (
                    f" INFO: EC2 region ({instance_region}) differs from configured S3 region ({client_region})",
                    {"fg": "cyan"},
                ),
                (
                    f" Consider using --region {instance_region} to avoid cross-region charges.",
                    {"fg": "cyan"},
                ),
            )

        # Blank lines above and below make the notice stand out.
        click.echo("")
        for text, style in notices:
            click.secho(text, **style)
        click.echo("")
    except Exception:
        pass  # Silently ignore errors getting region info
def is_s3_path(path: str) -> bool:
"""Check if path is an S3 URL."""
return path.startswith("s3://")
@@ -239,6 +317,7 @@ def migrate_s3_to_s3(
dry_run: bool = False,
skip_confirm: bool = False,
preserve_prefix: bool = True,
region_override: bool = False,
) -> None:
"""Migrate objects from one S3 location to another with delta compression.
@@ -247,6 +326,21 @@ def migrate_s3_to_s3(
- Progress tracking: Shows migration progress
- Confirmation prompt: Shows file count before starting
- Prefix preservation: Optionally preserves source prefix structure in destination
- EC2 region detection: Warns about cross-region data transfer charges
Args:
service: DeltaService instance
source_url: Source S3 URL
dest_url: Destination S3 URL
exclude: Pattern to exclude files
include: Pattern to include files
quiet: Suppress output
no_delta: Disable delta compression
max_ratio: Maximum delta/file ratio
dry_run: Show what would be migrated without migrating
skip_confirm: Skip confirmation prompt
preserve_prefix: Preserve source prefix in destination
region_override: True if user explicitly specified --region flag
"""
import fnmatch
@@ -269,6 +363,10 @@ def migrate_s3_to_s3(
effective_dest_prefix = (dest_prefix or "") + source_prefix_name + "/"
if not quiet:
# Log AWS region being used (helps users verify their configuration)
# Pass region_override to warn about cross-region charges if user explicitly set --region
log_aws_region(service, region_override=region_override)
if preserve_prefix and source_prefix:
click.echo(f"Migrating from s3://{source_bucket}/{source_prefix}")
click.echo(f" to s3://{dest_bucket}/{effective_dest_prefix}")
@@ -530,4 +628,5 @@ def handle_recursive(
dry_run=False,
skip_confirm=True, # Don't prompt for cp command
preserve_prefix=True, # Always preserve prefix for cp -r
region_override=False, # cp command doesn't track region override explicitly
)

View File

@@ -7,6 +7,7 @@ import shutil
import sys
import tempfile
from pathlib import Path
from typing import Any
import click
@@ -50,7 +51,7 @@ def create_service(
# Register cleanup handler to remove cache on exit
atexit.register(lambda: shutil.rmtree(cache_dir, ignore_errors=True))
# Set AWS environment variables if provided
# Set AWS environment variables if provided (for compatibility with other AWS tools)
if endpoint_url:
os.environ["AWS_ENDPOINT_URL"] = endpoint_url
if region:
@@ -58,9 +59,14 @@ def create_service(
if profile:
os.environ["AWS_PROFILE"] = profile
# Build boto3_kwargs for explicit parameter passing (preferred over env vars)
boto3_kwargs: dict[str, Any] = {}
if region:
boto3_kwargs["region_name"] = region
# Create adapters
hasher = Sha256Adapter()
storage = S3StorageAdapter(endpoint_url=endpoint_url)
storage = S3StorageAdapter(endpoint_url=endpoint_url, boto3_kwargs=boto3_kwargs)
diff = XdeltaAdapter()
# SECURITY: Configurable cache with encryption and backend selection
@@ -730,6 +736,7 @@ def migrate(
dry_run=dry_run,
skip_confirm=yes,
preserve_prefix=not no_preserve_prefix,
region_override=region is not None, # True if user explicitly specified --region
)
except Exception as e: