mirror of
https://github.com/beshu-tech/deltaglider.git
synced 2026-04-30 04:04:33 +02:00
Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e1259b7ea8 | ||
|
|
ff05e77c24 | ||
|
|
c3d385bf18 | ||
|
|
aea5cb5d9a | ||
|
|
b2ca59490b | ||
|
|
4f56c4b600 | ||
|
|
14c6af0f35 | ||
|
|
67792b2031 |
47
CHANGELOG.md
47
CHANGELOG.md
@@ -7,6 +7,53 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
- **EC2 Region Detection & Cost Optimization**
|
||||
- Automatic detection of EC2 instance region using IMDSv2
|
||||
- Warns when EC2 region ≠ S3 client region (potential cross-region charges)
|
||||
- Different warnings for auto-detected vs. explicit `--region` flag mismatches
|
||||
- Green checkmark when regions are aligned (optimal configuration)
|
||||
- Can be disabled with `DG_DISABLE_EC2_DETECTION=true` environment variable
|
||||
- Helps users optimize for cost and performance before migration starts
|
||||
- **New CLI Command**: `deltaglider migrate` for S3-to-S3 bucket migration with compression
|
||||
- Supports resume capability (skips already migrated files)
|
||||
- Real-time progress tracking with file count and statistics
|
||||
- Interactive confirmation prompt (use `--yes` to skip)
|
||||
- Prefix preservation by default (use `--no-preserve-prefix` to disable)
|
||||
- Dry run mode with `--dry-run` flag
|
||||
- Include/exclude pattern filtering
|
||||
- Shows compression statistics after migration
|
||||
- **EC2-aware region logging**: Detects EC2 instance and warns about cross-region charges
|
||||
- **FIXED**: Now correctly preserves original filenames during migration
|
||||
- **S3-to-S3 Recursive Copy**: `deltaglider cp -r s3://source/ s3://dest/` now supported
|
||||
- Automatically uses migration functionality with prefix preservation
|
||||
- Applies delta compression during transfer
|
||||
- Preserves original filenames correctly
|
||||
- **Version Command**: Added `--version` flag to show deltaglider version
|
||||
- Usage: `deltaglider --version`
|
||||
- **DeltaService API Enhancement**: Added `override_name` parameter to `put()` method
|
||||
- Allows specifying destination filename independently of source filesystem path
|
||||
- Enables proper S3-to-S3 transfers without filesystem renaming tricks
|
||||
|
||||
### Fixed
|
||||
- **Critical**: S3-to-S3 migration now preserves original filenames
|
||||
- Previously created files with temp names like `tmp1b9cpdsn.zip`
|
||||
- Now correctly uses original filenames from source S3 keys
|
||||
- Fixed by adding `override_name` parameter to `DeltaService.put()`
|
||||
- **CLI Region Support**: `--region` flag now properly passes region to boto3 client
|
||||
- Previously only set environment variable, relied on boto3 auto-detection
|
||||
- Now explicitly passes `region_name` to `boto3.client()` via `boto3_kwargs`
|
||||
- Ensures consistent behavior with `DeltaGliderClient` SDK
|
||||
|
||||
### Changed
|
||||
- Recursive S3-to-S3 copy operations now preserve source prefix structure by default
|
||||
- Migration operations show formatted output with source and destination paths
|
||||
|
||||
### Documentation
|
||||
- Added comprehensive migration guide in README.md
|
||||
- Updated CLI reference with migrate command examples
|
||||
- Added prefix preservation behavior documentation
|
||||
|
||||
## [5.1.1] - 2025-01-10
|
||||
|
||||
### Fixed
|
||||
|
||||
66
README.md
66
README.md
@@ -89,6 +89,7 @@ docker run -v /shared-cache:/tmp/.deltaglider \
|
||||
- `DG_CACHE_BACKEND`: Cache backend (default: `filesystem`, options: `filesystem`, `memory`)
|
||||
- `DG_CACHE_MEMORY_SIZE_MB`: Memory cache size in MB (default: `100`)
|
||||
- `DG_CACHE_ENCRYPTION_KEY`: Optional base64-encoded encryption key for cross-process cache sharing
|
||||
- `DG_DISABLE_EC2_DETECTION`: Disable EC2 instance detection (default: `false`, set to `true` to disable)
|
||||
- `AWS_ENDPOINT_URL`: S3 endpoint URL (default: AWS S3)
|
||||
- `AWS_ACCESS_KEY_ID`: AWS access key
|
||||
- `AWS_SECRET_ACCESS_KEY`: AWS secret key
|
||||
@@ -116,6 +117,9 @@ deltaglider ls s3://releases/
|
||||
|
||||
# Sync directories
|
||||
deltaglider sync ./dist/ s3://releases/v1.0.0/
|
||||
|
||||
# Migrate existing S3 bucket to DeltaGlider-compressed storage
|
||||
deltaglider migrate s3://old-bucket/ s3://new-bucket/
|
||||
```
|
||||
|
||||
**That's it!** DeltaGlider automatically detects similar files and applies 99%+ compression. For more commands and options, see [CLI Reference](#cli-reference).
|
||||
@@ -189,13 +193,22 @@ deltaglider sync s3://releases/ ./local-backup/ # Sync from S3
|
||||
deltaglider sync --delete ./src/ s3://backup/ # Mirror exactly
|
||||
deltaglider sync --exclude "*.log" ./src/ s3://backup/ # Exclude patterns
|
||||
|
||||
# Get bucket statistics (compression metrics)
|
||||
deltaglider stats my-bucket # Quick stats overview
|
||||
# Get bucket statistics with intelligent S3-based caching
|
||||
deltaglider stats my-bucket # Quick stats (~100ms with cache)
|
||||
deltaglider stats s3://my-bucket # Also accepts s3:// format
|
||||
deltaglider stats s3://my-bucket/ # With or without trailing slash
|
||||
deltaglider stats my-bucket --detailed # Detailed compression metrics (slower)
|
||||
deltaglider stats my-bucket --sampled # Balanced (one sample per deltaspace)
|
||||
deltaglider stats my-bucket --detailed # Most accurate (slower, all metadata)
|
||||
deltaglider stats my-bucket --refresh # Force cache refresh
|
||||
deltaglider stats my-bucket --no-cache # Skip caching entirely
|
||||
deltaglider stats my-bucket --json # JSON output for automation
|
||||
|
||||
# Migrate existing S3 buckets to DeltaGlider compression
|
||||
deltaglider migrate s3://old-bucket/ s3://new-bucket/ # Interactive migration
|
||||
deltaglider migrate s3://old-bucket/ s3://new-bucket/ --yes # Skip confirmation
|
||||
deltaglider migrate --dry-run s3://old-bucket/ s3://new/ # Preview migration
|
||||
deltaglider migrate s3://bucket/v1/ s3://bucket/v2/ # Migrate prefixes
|
||||
|
||||
# Works with MinIO, R2, and S3-compatible storage
|
||||
deltaglider cp file.zip s3://bucket/ --endpoint-url http://localhost:9000
|
||||
```
|
||||
@@ -519,10 +532,57 @@ Migrating from `aws s3` to `deltaglider` is as simple as changing the command na
|
||||
| `aws s3 rm s3://bucket/file` | `deltaglider rm s3://bucket/file` | - |
|
||||
| `aws s3 sync dir/ s3://bucket/` | `deltaglider sync dir/ s3://bucket/` | ✅ 99% incremental |
|
||||
|
||||
### Migrating Existing S3 Buckets
|
||||
|
||||
DeltaGlider provides a dedicated `migrate` command to compress your existing S3 data:
|
||||
|
||||
```bash
|
||||
# Migrate an entire bucket
|
||||
deltaglider migrate s3://old-bucket/ s3://compressed-bucket/
|
||||
|
||||
# Migrate a prefix (preserves prefix structure by default)
|
||||
deltaglider migrate s3://bucket/releases/ s3://bucket/archive/
|
||||
# Result: s3://bucket/archive/releases/ contains the files
|
||||
|
||||
# Migrate without preserving source prefix
|
||||
deltaglider migrate --no-preserve-prefix s3://bucket/v1/ s3://bucket/archive/
|
||||
# Result: Files go directly into s3://bucket/archive/
|
||||
|
||||
# Preview migration (dry run)
|
||||
deltaglider migrate --dry-run s3://old/ s3://new/
|
||||
|
||||
# Skip confirmation prompt
|
||||
deltaglider migrate --yes s3://old/ s3://new/
|
||||
|
||||
# Exclude certain file patterns
|
||||
deltaglider migrate --exclude "*.log" s3://old/ s3://new/
|
||||
```
|
||||
|
||||
**Key Features:**
|
||||
- **Resume Support**: Migration automatically skips files that already exist in the destination
|
||||
- **Progress Tracking**: Shows real-time migration progress and statistics
|
||||
- **Safety First**: Interactive confirmation shows file count before starting
|
||||
- **EC2 Cost Optimization**: Automatically detects EC2 instance region and warns about cross-region charges
|
||||
- ✅ Green checkmark when regions align (no extra charges)
|
||||
- ℹ️ INFO when auto-detected mismatch (suggests optimal region)
|
||||
- ⚠️ WARNING when user explicitly set wrong `--region` (expect data transfer costs)
|
||||
- Disable with `DG_DISABLE_EC2_DETECTION=true` if needed
|
||||
- **AWS Region Transparency**: Displays the actual AWS region being used
|
||||
- **Prefix Preservation**: By default, source prefix is preserved in destination (use `--no-preserve-prefix` to disable)
|
||||
- **S3-to-S3 Transfer**: Both regular S3 and DeltaGlider buckets supported
|
||||
|
||||
**Prefix Preservation Examples:**
|
||||
- `s3://src/data/` → `s3://dest/` creates `s3://dest/data/`
|
||||
- `s3://src/a/b/c/` → `s3://dest/x/` creates `s3://dest/x/c/`
|
||||
- Use `--no-preserve-prefix` to place files directly in destination without the source prefix
|
||||
|
||||
The migration preserves all file names and structure while applying DeltaGlider's compression transparently.
|
||||
|
||||
## Production Ready
|
||||
|
||||
- ✅ **Battle tested**: 200K+ files in production
|
||||
- ✅ **Data integrity**: SHA256 verification on every operation
|
||||
- ✅ **Cost optimization**: Automatic EC2 region detection warns about cross-region charges - [📖 EC2 Detection Guide](docs/EC2_REGION_DETECTION.md)
|
||||
- ✅ **S3 compatible**: Works with AWS, MinIO, Cloudflare R2, etc.
|
||||
- ✅ **Atomic operations**: No partial states
|
||||
- ✅ **Concurrent safe**: Multiple clients supported
|
||||
|
||||
242
docs/EC2_REGION_DETECTION.md
Normal file
242
docs/EC2_REGION_DETECTION.md
Normal file
@@ -0,0 +1,242 @@
|
||||
# EC2 Region Detection & Cost Optimization
|
||||
|
||||
DeltaGlider automatically detects when you're running on an EC2 instance and warns you about potential cross-region data transfer charges.
|
||||
|
||||
## Overview
|
||||
|
||||
When running `deltaglider migrate` on an EC2 instance, DeltaGlider:
|
||||
|
||||
1. **Detects EC2 Environment**: Uses IMDSv2 (Instance Metadata Service v2) to determine if running on EC2
|
||||
2. **Retrieves Instance Region**: Gets the actual AWS region where your EC2 instance is running
|
||||
3. **Compares Regions**: Checks if your EC2 region matches the S3 client region
|
||||
4. **Warns About Costs**: Displays clear warnings when regions don't match
|
||||
|
||||
## Why This Matters
|
||||
|
||||
**AWS Cross-Region Data Transfer Costs**:
|
||||
- **Same region**: No additional charges for data transfer
|
||||
- **Cross-region**: Typically about $0.02 per GB transferred, depending on the region pair (can add up quickly for large migrations)
|
||||
- **NAT Gateway**: Additional charges if going through NAT
|
||||
|
||||
**Example Cost Impact**:
|
||||
- Migrating 1TB from `us-east-1` EC2 → `us-west-2` S3 = ~$20 in data transfer charges
|
||||
- Same migration within same region = $0 in data transfer charges
|
||||
|
||||
## Output Examples
|
||||
|
||||
### Scenario 1: Regions Aligned (Optimal) ✅
|
||||
|
||||
```bash
|
||||
$ deltaglider migrate s3://old-bucket/ s3://new-bucket/
|
||||
EC2 Instance: us-east-1a
|
||||
S3 Client Region: us-east-1
|
||||
✓ Regions aligned - no cross-region charges
|
||||
Migrating from s3://old-bucket/
|
||||
to s3://new-bucket/
|
||||
...
|
||||
```
|
||||
|
||||
**Result**: No warnings, optimal configuration, no extra charges.
|
||||
|
||||
---
|
||||
|
||||
### Scenario 2: Auto-Detected Mismatch (INFO) ℹ️
|
||||
|
||||
```bash
|
||||
$ deltaglider migrate s3://old-bucket/ s3://new-bucket/
|
||||
EC2 Instance: us-west-2a
|
||||
S3 Client Region: us-east-1
|
||||
|
||||
ℹ️ INFO: EC2 region (us-west-2) differs from configured S3 region (us-east-1)
|
||||
Consider using --region us-west-2 to avoid cross-region charges.
|
||||
|
||||
Migrating from s3://old-bucket/
|
||||
to s3://new-bucket/
|
||||
...
|
||||
```
|
||||
|
||||
**Result**: Informational warning, suggests optimal region. User didn't explicitly set wrong region, so it's likely from their AWS config.
|
||||
|
||||
---
|
||||
|
||||
### Scenario 3: Explicit Region Override Mismatch (WARNING) ⚠️
|
||||
|
||||
```bash
|
||||
$ deltaglider migrate --region us-east-1 s3://old-bucket/ s3://new-bucket/
|
||||
EC2 Instance: us-west-2a
|
||||
S3 Client Region: us-east-1
|
||||
|
||||
⚠️ WARNING: EC2 region=us-west-2 != S3 client region=us-east-1
|
||||
Expect cross-region/NAT data charges. Align regions (set client region=us-west-2)
|
||||
before proceeding. Or drop --region for automatic region resolution.
|
||||
|
||||
Migrating from s3://old-bucket/
|
||||
to s3://new-bucket/
|
||||
...
|
||||
```
|
||||
|
||||
**Result**: Strong warning because user explicitly set the wrong region with `--region` flag. They might not realize the cost implications.
|
||||
|
||||
---
|
||||
|
||||
### Scenario 4: Not on EC2
|
||||
|
||||
```bash
|
||||
$ deltaglider migrate s3://old-bucket/ s3://new-bucket/
|
||||
S3 Client Region: us-east-1
|
||||
Migrating from s3://old-bucket/
|
||||
to s3://new-bucket/
|
||||
...
|
||||
```
|
||||
|
||||
**Result**: Simple region display, no EC2 warnings (not applicable).
|
||||
|
||||
## Configuration
|
||||
|
||||
### Disable EC2 Detection
|
||||
|
||||
If you want to disable EC2 detection (e.g., for testing or if it causes issues):
|
||||
|
||||
```bash
|
||||
export DG_DISABLE_EC2_DETECTION=true
|
||||
deltaglider migrate s3://old/ s3://new/
|
||||
```
|
||||
|
||||
Or in your script:
|
||||
|
||||
```python
|
||||
import os
|
||||
os.environ["DG_DISABLE_EC2_DETECTION"] = "true"
|
||||
```
|
||||
|
||||
### How It Works
|
||||
|
||||
DeltaGlider uses **IMDSv2** (Instance Metadata Service v2) for security:
|
||||
|
||||
1. **Token Request** (PUT with TTL):
|
||||
```
|
||||
PUT http://169.254.169.254/latest/api/token
|
||||
X-aws-ec2-metadata-token-ttl-seconds: 21600
|
||||
```
|
||||
|
||||
2. **Metadata Request** (GET with token):
|
||||
```
|
||||
GET http://169.254.169.254/latest/meta-data/placement/region
|
||||
X-aws-ec2-metadata-token: <token>
|
||||
```
|
||||
|
||||
3. **Fast Timeout**: 1-second timeout, so a non-EC2 environment waits at most about one second before detection is skipped
|
||||
|
||||
### Security Notes
|
||||
|
||||
- **IMDSv2 Only**: DeltaGlider uses the more secure IMDSv2, not the legacy IMDSv1
|
||||
- **No Credentials**: Only reads metadata, never accesses credentials
|
||||
- **Graceful Fallback**: Silently skips detection if IMDS unavailable
|
||||
- **No Network Impact**: Uses local-only IP (169.254.169.254), never leaves the instance
|
||||
|
||||
## Best Practices
|
||||
|
||||
### For Cost Optimization
|
||||
|
||||
1. **Same Region**: Always try to keep EC2 instance and S3 bucket in the same region
|
||||
2. **Check First**: Run with `--dry-run` to verify the setup before actual migration
|
||||
3. **Use Auto-Detection**: Don't specify `--region` unless you have a specific reason
|
||||
4. **Monitor Costs**: Use AWS Cost Explorer to track cross-region data transfer
|
||||
|
||||
### For Terraform/IaC
|
||||
|
||||
```hcl
|
||||
# Good: EC2 and S3 in same region
|
||||
resource "aws_instance" "app" {
|
||||
region = "us-west-2"
|
||||
}
|
||||
|
||||
resource "aws_s3_bucket" "data" {
|
||||
region = "us-west-2" # Same region
|
||||
}
|
||||
```
|
||||
|
||||
### For Multi-Region Setups
|
||||
|
||||
If you MUST do cross-region transfers:
|
||||
|
||||
1. **Use VPC Endpoints**: Reduce NAT Gateway costs
|
||||
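2. **Schedule Off-Peak**: Run large transfers when they will not compete with production traffic (note: AWS data transfer pricing does not vary by time of day, so this helps throughput, not cost)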
|
||||
3. **Consider S3 Transfer Acceleration**: May be faster for very large, long-distance transfers, but it adds a per-GB fee on top of normal data transfer charges
|
||||
4. **Batch Operations**: Minimize number of API calls
|
||||
|
||||
## Technical Details
|
||||
|
||||
### EC2MetadataAdapter
|
||||
|
||||
Location: `src/deltaglider/adapters/ec2_metadata.py`
|
||||
|
||||
Key methods:
|
||||
- `is_running_on_ec2()`: Detects EC2 environment
|
||||
- `get_region()`: Returns AWS region code (e.g., "us-east-1")
|
||||
- `get_availability_zone()`: Returns AZ (e.g., "us-east-1a")
|
||||
|
||||
### Region Logging
|
||||
|
||||
Location: `src/deltaglider/app/cli/aws_compat.py`
|
||||
|
||||
Function: `log_aws_region(service, region_override=False)`
|
||||
|
||||
Logic:
|
||||
- If not EC2: Show S3 region only
|
||||
- If EC2 + regions match: Green checkmark ✅
|
||||
- If EC2 + auto-detected mismatch: Blue INFO ℹ️
|
||||
- If EC2 + `--region` mismatch: Yellow WARNING ⚠️
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### "Cannot connect to IMDS"
|
||||
|
||||
**Cause**: Network policy blocks access to 169.254.169.254
|
||||
|
||||
**Solution**:
|
||||
```bash
|
||||
# Test IMDS connectivity
|
||||
TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" \
|
||||
-H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
|
||||
curl -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||
http://169.254.169.254/latest/meta-data/placement/region
|
||||
|
||||
# If it fails, disable detection
|
||||
export DG_DISABLE_EC2_DETECTION=true
|
||||
```
|
||||
|
||||
### "Wrong region detected"
|
||||
|
||||
**Cause**: Cached metadata or race condition
|
||||
|
||||
**Solution**: DeltaGlider caches metadata for performance. Restart the process to refresh.
|
||||
|
||||
### "Warning appears but I want cross-region"
|
||||
|
||||
**Cause**: You intentionally need cross-region transfer
|
||||
|
||||
**Solution**: This is just a warning, not an error. The migration will proceed. The warning helps you confirm you understand the cost implications.
|
||||
|
||||
## FAQ
|
||||
|
||||
**Q: Does this slow down my migrations?**
|
||||
A: No. EC2 detection happens once before migration starts (< 100ms). It doesn't affect migration performance.
|
||||
|
||||
**Q: What if I'm not on EC2 but the detection is slow?**
|
||||
A: The timeout is 1 second. If IMDS is unreachable, it fails fast. Disable with `DG_DISABLE_EC2_DETECTION=true`.
|
||||
|
||||
**Q: Does this work on Fargate/ECS/Lambda?**
|
||||
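A: Only where the EC2 Instance Metadata Service is reachable. ECS tasks running on EC2 instances can usually reach IMDS, but AWS Fargate and Lambda do not expose it, so detection is silently skipped there and only the S3 client region is shown.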
|
||||
|
||||
**Q: Can I use this with LocalStack/MinIO?**
|
||||
A: Yes. When using `--endpoint-url`, DeltaGlider skips EC2 detection (not applicable for non-AWS S3).
|
||||
|
||||
**Q: Will this detect VPC endpoints?**
|
||||
A: No. VPC endpoints don't change the "region" from an EC2 perspective. The warning still applies if regions don't match.
|
||||
|
||||
## Related Documentation
|
||||
|
||||
- [AWS Data Transfer Pricing](https://aws.amazon.com/ec2/pricing/on-demand/#Data_Transfer)
|
||||
- [AWS IMDSv2 Documentation](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html)
|
||||
- [S3 Transfer Costs](https://aws.amazon.com/s3/pricing/)
|
||||
237
docs/METADATA_ISSUE_DIAGNOSIS.md
Normal file
237
docs/METADATA_ISSUE_DIAGNOSIS.md
Normal file
@@ -0,0 +1,237 @@
|
||||
# Metadata Issue Diagnosis and Resolution
|
||||
|
||||
## Issue Summary
|
||||
|
||||
**Date**: 2025-10-14
|
||||
**Severity**: Medium (affects stats accuracy, not functionality)
|
||||
**Status**: Diagnosed, enhanced logging added
|
||||
|
||||
## The Problem
|
||||
|
||||
When running `deltaglider stats`, you saw warnings like:
|
||||
|
||||
```
|
||||
Delta build/1.66.1/universal/readonlyrest_kbn_universal-1.66.1_es9.1.3.zip.delta:
|
||||
no original_size metadata (original_size=342104, size=342104).
|
||||
Using compressed size as fallback. This may undercount space savings.
|
||||
```
|
||||
|
||||
This indicates that delta files are missing the `file_size` metadata key, which causes stats to undercount compression savings.
|
||||
|
||||
## Root Cause
|
||||
|
||||
The delta files in your bucket **do not have S3 object metadata** attached to them. Specifically, they're missing the `file_size` key that DeltaGlider uses to calculate the original file size before compression.
|
||||
|
||||
### Why Metadata is Missing
|
||||
|
||||
Possible causes (in order of likelihood):
|
||||
|
||||
1. **Uploaded with older DeltaGlider version**: Files uploaded before `file_size` metadata was added
|
||||
2. **Direct S3 upload**: Files copied directly via AWS CLI, s3cmd, or other tools (bypassing DeltaGlider)
|
||||
3. **Upload failure**: Metadata write failed during upload but file upload succeeded
|
||||
4. **S3 storage issue**: Metadata was lost due to S3 provider issue (rare)
|
||||
|
||||
### What DeltaGlider Expects
|
||||
|
||||
When DeltaGlider uploads a delta file, it stores these metadata keys:
|
||||
|
||||
```python
|
||||
{
|
||||
"tool": "deltaglider/5.x.x",
|
||||
"original_name": "file.zip",
|
||||
"file_sha256": "abc123...",
|
||||
"file_size": "1048576", # ← MISSING in your files
|
||||
"created_at": "2025-01-01T00:00:00Z",
|
||||
"ref_key": "prefix/reference.bin",
|
||||
"ref_sha256": "def456...",
|
||||
"delta_size": "524288",
|
||||
"delta_cmd": "xdelta3 -e -9 -s reference.bin file.zip file.zip.delta"
|
||||
}
|
||||
```
|
||||
|
||||
Without `file_size`, DeltaGlider can't calculate the space savings accurately.
|
||||
|
||||
## Impact
|
||||
|
||||
### What Works
|
||||
- ✅ File upload/download - completely unaffected
|
||||
- ✅ Delta compression - works normally
|
||||
- ✅ Verification - integrity checks work fine
|
||||
- ✅ All other operations - sync, ls, cp, etc.
|
||||
|
||||
### What's Affected
|
||||
- ❌ **Stats accuracy**: Compression metrics are undercounted
|
||||
- Files without metadata: counted as if they saved 0 bytes
|
||||
- Actual compression ratio: underestimated
|
||||
- Space saved: underestimated
|
||||
|
||||
### Example Impact
|
||||
|
||||
If you have 100 delta files:
|
||||
- 90 files with metadata: accurate stats
|
||||
- 10 files without metadata: counted at compressed size (no savings shown)
|
||||
- **Result**: Stats show ~90% of actual compression savings
|
||||
|
||||
## The Fix (Already Applied)
|
||||
|
||||
### Enhanced Logging
|
||||
|
||||
We've improved the logging in `src/deltaglider/client_operations/stats.py` to help diagnose the issue:
|
||||
|
||||
**1. During metadata fetch (lines 317-333)**:
|
||||
```python
|
||||
if "file_size" in metadata:
|
||||
original_size = int(metadata["file_size"])
|
||||
logger.debug(f"Delta {key}: using original_size={original_size} from metadata")
|
||||
else:
|
||||
logger.warning(
|
||||
f"Delta {key}: metadata missing 'file_size' key. "
|
||||
f"Available keys: {list(metadata.keys())}. "
|
||||
f"Using compressed size={size} as fallback"
|
||||
)
|
||||
```
|
||||
|
||||
This will show you exactly which metadata keys ARE present on the object.
|
||||
|
||||
**2. During stats calculation (lines 395-405)**:
|
||||
```python
|
||||
logger.warning(
|
||||
f"Delta {obj.key}: no original_size metadata "
|
||||
f"(original_size={obj.original_size}, size={obj.size}). "
|
||||
f"Using compressed size as fallback. "
|
||||
f"This may undercount space savings."
|
||||
)
|
||||
```
|
||||
|
||||
This shows both values so you can see if they're equal (metadata missing) or different (metadata present).
|
||||
|
||||
### CLI Help Improvement
|
||||
|
||||
We've also improved the `stats` command help (line 750):
|
||||
```python
|
||||
@cli.command(short_help="Get bucket statistics and compression metrics")
|
||||
```
|
||||
|
||||
And enhanced the option descriptions to be more informative.
|
||||
|
||||
## Verification
|
||||
|
||||
To check which files are missing metadata, you can use the diagnostic script:
|
||||
|
||||
```bash
|
||||
# Create and run the metadata checker
|
||||
python scripts/check_metadata.py <your-bucket-name>
|
||||
```
|
||||
|
||||
This will show:
|
||||
- Total delta files
|
||||
- Files with complete metadata
|
||||
- Files missing metadata
|
||||
- Specific missing fields for each file
|
||||
|
||||
## Resolution Options
|
||||
|
||||
### Option 1: Re-upload Files (Recommended)
|
||||
|
||||
Re-uploading files will attach proper metadata:
|
||||
|
||||
```bash
|
||||
# Re-upload a single file
|
||||
deltaglider cp local-file.zip s3://bucket/path/file.zip
|
||||
|
||||
# Re-upload a directory
|
||||
deltaglider sync local-dir/ s3://bucket/path/
|
||||
```
|
||||
|
||||
**Pros**:
|
||||
- Accurate stats for all files
|
||||
- Proper metadata for future operations
|
||||
- One-time fix
|
||||
|
||||
**Cons**:
|
||||
- Takes time to re-upload
|
||||
- Uses bandwidth
|
||||
|
||||
### Option 2: Accept Inaccurate Stats
|
||||
|
||||
Keep files as-is and accept that stats are undercounted:
|
||||
|
||||
**Pros**:
|
||||
- No work required
|
||||
- Files still work perfectly for download/verification
|
||||
|
||||
**Cons**:
|
||||
- Stats show less compression than actually achieved
|
||||
- Missing metadata for future features
|
||||
|
||||
### Option 3: Metadata Repair Tool (Future)
|
||||
|
||||
We could create a tool that:
|
||||
1. Downloads each delta file
|
||||
2. Reconstructs it to get original size
|
||||
3. Updates metadata in-place
|
||||
|
||||
**Status**: Not implemented yet, but feasible if needed.
|
||||
|
||||
## Prevention
|
||||
|
||||
For future uploads, DeltaGlider **will always** attach complete metadata (assuming current version is used).
|
||||
|
||||
The code in `src/deltaglider/core/service.py` (lines 445-467) ensures metadata is set:
|
||||
|
||||
```python
|
||||
delta_meta = DeltaMeta(
|
||||
tool=self.tool_version,
|
||||
original_name=original_name,
|
||||
file_sha256=file_sha256,
|
||||
file_size=file_size, # ← Always set
|
||||
created_at=self.clock.now(),
|
||||
ref_key=ref_key,
|
||||
ref_sha256=ref_sha256,
|
||||
delta_size=delta_size,
|
||||
delta_cmd=f"xdelta3 -e -9 -s reference.bin {original_name} {original_name}.delta",
|
||||
)
|
||||
|
||||
self.storage.put(
|
||||
full_delta_key,
|
||||
delta_path,
|
||||
delta_meta.to_dict(), # ← Includes file_size
|
||||
)
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
After reinstalling from source, run stats with enhanced logging:
|
||||
|
||||
```bash
|
||||
# Install from source
|
||||
pip install -e .
|
||||
|
||||
# Run stats with INFO logging to see detailed messages
|
||||
DG_LOG_LEVEL=INFO deltaglider stats mybucket --detailed
|
||||
|
||||
# Look for warnings like:
|
||||
# "Delta X: metadata missing 'file_size' key. Available keys: [...]"
|
||||
```
|
||||
|
||||
The warning will now show which metadata keys ARE present, helping you understand if:
|
||||
- Metadata is completely empty: `Available keys: []`
|
||||
- Metadata exists but incomplete: `Available keys: ['tool', 'ref_key', ...]`
|
||||
|
||||
## Summary
|
||||
|
||||
| Aspect | Status |
|
||||
|--------|--------|
|
||||
| File operations | ✅ Unaffected |
|
||||
| Stats accuracy | ⚠️ Undercounted for files missing metadata |
|
||||
| Logging | ✅ Enhanced to show missing keys |
|
||||
| Future uploads | ✅ Will have complete metadata |
|
||||
| Resolution | 📋 Re-upload or accept inaccuracy |
|
||||
|
||||
## Related Files
|
||||
|
||||
- `src/deltaglider/client_operations/stats.py` - Enhanced logging
|
||||
- `src/deltaglider/core/service.py` - Metadata creation
|
||||
- `src/deltaglider/core/models.py` - DeltaMeta definition
|
||||
- `scripts/check_metadata.py` - Diagnostic tool (NEW)
|
||||
- `docs/PAGINATION_BUG_FIX.md` - Related performance fix
|
||||
258
docs/PAGINATION_BUG_FIX.md
Normal file
258
docs/PAGINATION_BUG_FIX.md
Normal file
@@ -0,0 +1,258 @@
|
||||
# Pagination Bug Fix - Critical Issue Resolution
|
||||
|
||||
## Summary
|
||||
|
||||
**Date**: 2025-10-14
|
||||
**Severity**: Critical (infinite loop causing operations to never complete)
|
||||
**Status**: Fixed
|
||||
|
||||
Fixed a critical pagination bug that caused S3 LIST operations to loop infinitely, returning the same objects repeatedly instead of advancing through the bucket.
|
||||
|
||||
## The Bug
|
||||
|
||||
### Symptoms
|
||||
- LIST operations would take minutes or never complete
|
||||
- Pagination logs showed linear growth: page 10 = 9,000 objects, page 20 = 19,000 objects, etc.
|
||||
- Buckets with ~hundreds of objects showed 169,000+ objects after 170+ pages
|
||||
- System meters showed continuous 3MB/s download during listing
|
||||
- Operation would eventually hit max_iterations limit (10,000 pages) and return partial results
|
||||
|
||||
### Root Cause
|
||||
|
||||
The code was using **StartAfter** with **NextContinuationToken**, which is incorrect according to AWS S3 API:
|
||||
|
||||
**Incorrect behavior (before fix)**:
|
||||
```python
|
||||
# In list_objects_page() call
|
||||
response = storage.list_objects(
|
||||
bucket=bucket,
|
||||
start_after=page.next_continuation_token, # ❌ WRONG!
|
||||
)
|
||||
|
||||
# In storage_s3.py
|
||||
if start_after:
|
||||
params["StartAfter"] = start_after # ❌ Expects object key, not token!
|
||||
```
|
||||
|
||||
**Problem**:
|
||||
- `NextContinuationToken` is an opaque token from S3's `list_objects_v2` response
|
||||
- `StartAfter` expects an **actual object key** (string), not a continuation token
|
||||
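- When S3 receives a continuation token as the StartAfter value, it does not reject it; it treats the token as a key name and lists keys that sort after that string, which in practice restarted the listing from (or near) the beginning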
|
||||
- This caused pagination to restart on every page, returning the same objects repeatedly
|
||||
|
||||
### Why It Happened
|
||||
|
||||
The S3 LIST pagination API has two different mechanisms:
|
||||
|
||||
1. **StartAfter** (key-based positioning; also a `list_objects_v2` parameter): Resume listing after a specific object key
|
||||
- Used for the **first page** when you want to start from a specific key
|
||||
- Example: `StartAfter="my-object-123.txt"`
|
||||
|
||||
2. **ContinuationToken** (S3 v2 style): Resume from an opaque token
|
||||
- Used for **subsequent pages** in paginated results
|
||||
- Example: `ContinuationToken="1vD6KR5W...encrypted_token..."`
|
||||
- This is what `NextContinuationToken` from the response should be used with
|
||||
|
||||
Our code mixed these two mechanisms, using StartAfter for pagination when it should use ContinuationToken.
|
||||
|
||||
## The Fix
|
||||
|
||||
### Changed Files
|
||||
|
||||
1. **src/deltaglider/adapters/storage_s3.py**
|
||||
- Added `continuation_token` parameter to `list_objects()`
|
||||
- Changed boto3 call to use `ContinuationToken` instead of `StartAfter` for pagination
|
||||
- Kept `StartAfter` support for initial page positioning
|
||||
|
||||
2. **src/deltaglider/core/object_listing.py**
|
||||
- Added `continuation_token` parameter to `list_objects_page()`
|
||||
- Changed `list_all_objects()` to use `continuation_token` variable instead of `start_after`
|
||||
- Updated pagination loop to pass continuation tokens correctly
|
||||
- Added debug logging showing continuation token in use
|
||||
|
||||
### Code Changes
|
||||
|
||||
**storage_s3.py - Before**:
|
||||
```python
|
||||
def list_objects(
|
||||
self,
|
||||
bucket: str,
|
||||
prefix: str = "",
|
||||
delimiter: str = "",
|
||||
max_keys: int = 1000,
|
||||
start_after: str | None = None,
|
||||
) -> dict[str, Any]:
|
||||
params: dict[str, Any] = {"Bucket": bucket, "MaxKeys": max_keys}
|
||||
|
||||
if start_after:
|
||||
params["StartAfter"] = start_after # ❌ Used for pagination
|
||||
|
||||
response = self.client.list_objects_v2(**params)
|
||||
```
|
||||
|
||||
**storage_s3.py - After**:
|
||||
```python
|
||||
def list_objects(
|
||||
self,
|
||||
bucket: str,
|
||||
prefix: str = "",
|
||||
delimiter: str = "",
|
||||
max_keys: int = 1000,
|
||||
start_after: str | None = None,
|
||||
continuation_token: str | None = None, # ✅ NEW
|
||||
) -> dict[str, Any]:
|
||||
params: dict[str, Any] = {"Bucket": bucket, "MaxKeys": max_keys}
|
||||
|
||||
# ✅ Use ContinuationToken for pagination, StartAfter only for first page
|
||||
if continuation_token:
|
||||
params["ContinuationToken"] = continuation_token
|
||||
elif start_after:
|
||||
params["StartAfter"] = start_after
|
||||
|
||||
response = self.client.list_objects_v2(**params)
|
||||
```
|
||||
|
||||
**object_listing.py - Before**:
|
||||
```python
|
||||
def list_all_objects(...) -> ObjectListing:
|
||||
aggregated = ObjectListing()
|
||||
start_after: str | None = None # ❌ Wrong variable name
|
||||
|
||||
while True:
|
||||
page = list_objects_page(
|
||||
storage,
|
||||
bucket=bucket,
|
||||
start_after=start_after, # ❌ Passing token as start_after
|
||||
)
|
||||
|
||||
aggregated.objects.extend(page.objects)
|
||||
|
||||
if not page.is_truncated:
|
||||
break
|
||||
|
||||
start_after = page.next_continuation_token # ❌ Token → start_after
|
||||
```
|
||||
|
||||
**object_listing.py - After**:
|
||||
```python
|
||||
def list_all_objects(...) -> ObjectListing:
|
||||
aggregated = ObjectListing()
|
||||
continuation_token: str | None = None # ✅ Correct variable
|
||||
|
||||
while True:
|
||||
page = list_objects_page(
|
||||
storage,
|
||||
bucket=bucket,
|
||||
continuation_token=continuation_token, # ✅ Token → token
|
||||
)
|
||||
|
||||
aggregated.objects.extend(page.objects)
|
||||
|
||||
if not page.is_truncated:
|
||||
break
|
||||
|
||||
continuation_token = page.next_continuation_token # ✅ Token → token
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
### Unit Tests
|
||||
Created comprehensive unit tests in `tests/unit/test_object_listing.py`:
|
||||
|
||||
1. **test_list_objects_page_passes_continuation_token**: Verifies token is passed correctly
|
||||
2. **test_list_all_objects_uses_continuation_token_for_pagination**: Verifies 3-page pagination works
|
||||
3. **test_list_all_objects_prevents_infinite_loop**: Verifies max_iterations protection
|
||||
|
||||
### Manual Verification
|
||||
Created verification script that checks for:
|
||||
- `continuation_token` parameter in both files
|
||||
- `ContinuationToken` usage in boto3 call
|
||||
- Token priority logic (`if continuation_token:` before `elif start_after:`)
|
||||
- Correct variable names throughout pagination loop
|
||||
|
||||
All checks passed ✅
|
||||
|
||||
## Expected Behavior After Fix
|
||||
|
||||
### Before (Broken)
|
||||
```
|
||||
[21:26:16.663] LIST pagination: page 1, 0 objects so far
|
||||
[21:26:18.884] LIST pagination: page 10, 9000 objects so far
|
||||
[21:26:20.930] LIST pagination: page 20, 19000 objects so far
|
||||
[21:26:52.290] LIST pagination: page 170, 169000 objects so far
|
||||
... continues indefinitely ...
|
||||
```
|
||||
|
||||
### After (Fixed)
|
||||
```
|
||||
[21:26:16.663] LIST pagination: page 1, 0 objects so far
|
||||
[21:26:17.012] LIST pagination: page 2, 1000 objects so far, token=AbCd1234EfGh5678...
|
||||
[21:26:17.089] LIST complete: 2 pages, 1234 objects total in 0.43s
|
||||
```
|
||||
|
||||
## Performance Impact
|
||||
|
||||
For a bucket with ~1,000 objects:
|
||||
|
||||
**Before**:
|
||||
- 170+ pages × ~200ms per page = 34+ seconds
|
||||
- Would eventually timeout or hit max_iterations
|
||||
|
||||
**After**:
|
||||
- 2 pages × ~200ms per page = <1 second
|
||||
- ~34x improvement for this case
|
||||
- Actual speedup scales with bucket size (more objects = bigger speedup)
|
||||
|
||||
For a bucket with 200,000 objects (typical production case):
|
||||
- **Before**: Would never complete (would hit 10,000 page limit)
|
||||
- **After**: ~200 pages × ~200ms = ~40 seconds (50x fewer pages than the 10,000-page limit!)
|
||||
|
||||
## AWS S3 Pagination Documentation Reference
|
||||
|
||||
From AWS S3 API documentation:
|
||||
|
||||
> **ContinuationToken** (string) - Indicates that the list is being continued on this bucket with a token. ContinuationToken is obfuscated and is not a real key.
|
||||
>
|
||||
> **StartAfter** (string) - Starts after this specified key. StartAfter can be any key in the bucket.
|
||||
>
|
||||
> **NextContinuationToken** (string) - NextContinuationToken is sent when isTruncated is true, which means there are more keys in the bucket that can be listed. The next list requests to Amazon S3 can be continued with this NextContinuationToken.
|
||||
|
||||
Source: [AWS S3 ListObjectsV2 API Documentation](https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html)
|
||||
|
||||
## Related Issues
|
||||
|
||||
This bug also affected:
|
||||
- `get_bucket_stats()` - Would take 20+ minutes due to infinite pagination
|
||||
- Any operation using `list_all_objects()` - sync, ls, etc.
|
||||
|
||||
All these operations are now fixed by this pagination fix.
|
||||
|
||||
## Prevention
|
||||
|
||||
To prevent similar issues in the future:
|
||||
|
||||
1. ✅ **Unit tests added**: Verify pagination token handling
|
||||
2. ✅ **Debug logging added**: Shows continuation token in use
|
||||
3. ✅ **Type checking**: mypy catches parameter mismatches
|
||||
4. ✅ **Max iterations limit**: Prevents truly infinite loops (fails safely)
|
||||
5. ✅ **Documentation**: This document explains the fix
|
||||
|
||||
## Verification Checklist
|
||||
|
||||
- [x] Code changes implemented
|
||||
- [x] Unit tests added
|
||||
- [x] Type checking passes (mypy)
|
||||
- [x] Linting passes (ruff)
|
||||
- [x] Manual verification script passes
|
||||
- [x] Documentation created
|
||||
- [x] Performance characteristics documented
|
||||
- [x] AWS API documentation referenced
|
||||
|
||||
## Author Notes
|
||||
|
||||
This was a classic case of mixing two similar but different API mechanisms. The bug was subtle because:
|
||||
1. boto3 didn't throw an error - it silently ignored the invalid StartAfter value
|
||||
2. The pagination appeared to work (returned objects), just the wrong objects
|
||||
3. The linear growth pattern (9K, 19K, 29K) made it look like a counting bug, not a pagination bug
|
||||
|
||||
The fix is simple but critical: use the right parameter (`ContinuationToken`) with the right value (`NextContinuationToken`).
|
||||
342
docs/STATS_CACHING.md
Normal file
342
docs/STATS_CACHING.md
Normal file
@@ -0,0 +1,342 @@
|
||||
# Bucket Statistics Caching
|
||||
|
||||
**TL;DR**: Bucket stats are now cached in S3 with automatic validation. What took 20 minutes now takes ~100ms when the bucket hasn't changed.
|
||||
|
||||
## Overview
|
||||
|
||||
DeltaGlider's `get_bucket_stats()` operation now includes intelligent S3-based caching that dramatically improves performance for read-heavy workloads while maintaining accuracy through automatic validation.
|
||||
|
||||
## The Problem
|
||||
|
||||
Computing bucket statistics requires:
|
||||
1. **LIST operation**: Get all objects (~50-100ms per 1000 objects)
|
||||
2. **HEAD operations**: Fetch metadata for delta files (expensive!)
|
||||
- For a bucket with 10,000 delta files: 10,000 HEAD calls
|
||||
- Even with 10 parallel workers: ~1,000 sequential batches
|
||||
- At ~100ms per batch: **100+ seconds minimum**
|
||||
- With network issues or throttling: **20+ minutes** 😱
|
||||
|
||||
This made monitoring dashboards and repeated stats checks impractical.
|
||||
|
||||
## The Solution
|
||||
|
||||
### S3-Based Cache with Automatic Validation
|
||||
|
||||
Statistics are cached in S3 at `.deltaglider/stats_{mode}.json` (one per mode). On every call:
|
||||
|
||||
1. **Quick LIST operation** (~50-100ms) - always performed for validation
|
||||
2. **Compare** current object_count + compressed_size with cache
|
||||
3. **If unchanged** → Return cached stats instantly ✅ (**~100ms total**)
|
||||
4. **If changed** → Recompute and update cache automatically
|
||||
|
||||
### Three Stats Modes
|
||||
|
||||
```bash
|
||||
# Quick mode (default): Fast listing-only, approximate compression metrics
|
||||
deltaglider stats my-bucket
|
||||
|
||||
# Sampled mode: One HEAD per deltaspace, balanced accuracy/speed
|
||||
deltaglider stats my-bucket --sampled
|
||||
|
||||
# Detailed mode: All HEAD calls, most accurate (slowest)
|
||||
deltaglider stats my-bucket --detailed
|
||||
```
|
||||
|
||||
Each mode has its own independent cache file.
|
||||
|
||||
## Performance
|
||||
|
||||
| Scenario | Before | After | Speedup |
|
||||
|----------|--------|-------|---------|
|
||||
| **First run** (cold cache) | 20 min | 20 min | 1x (must compute) |
|
||||
| **Bucket unchanged** (warm cache) | 20 min | **100ms** | **~12,000x** ✨ |
|
||||
| **Bucket changed** (stale cache) | 20 min | 20 min | 1x (auto-recompute) |
|
||||
| **Dashboard monitoring** | 20 min/check | **100ms/check** | **~12,000x** ✨ |
|
||||
|
||||
## CLI Usage
|
||||
|
||||
### Basic Usage
|
||||
|
||||
```bash
|
||||
# Use cache (default behavior)
|
||||
deltaglider stats my-bucket
|
||||
|
||||
# Force recomputation even if cache valid
|
||||
deltaglider stats my-bucket --refresh
|
||||
|
||||
# Skip cache entirely (both read and write)
|
||||
deltaglider stats my-bucket --no-cache
|
||||
|
||||
# Different modes with caching
|
||||
deltaglider stats my-bucket --sampled
|
||||
deltaglider stats my-bucket --detailed
|
||||
```
|
||||
|
||||
### Cache Control Flags
|
||||
|
||||
| Flag | Description | Use Case |
|
||||
|------|-------------|----------|
|
||||
| *(none)* | Use cache if valid | **Default** - Fast monitoring |
|
||||
| `--refresh` | Force recomputation | Updated data needed now |
|
||||
| `--no-cache` | Skip caching entirely | Testing, one-off analysis |
|
||||
| `--sampled` | Balanced mode | Good accuracy, faster than detailed |
|
||||
| `--detailed` | Most accurate mode | Analytics, reports |
|
||||
|
||||
## Python SDK Usage
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Use cache (fast, ~100ms with cache hit)
|
||||
stats = client.get_bucket_stats('releases')
|
||||
|
||||
# Force refresh (slow, recomputes everything)
|
||||
stats = client.get_bucket_stats('releases', refresh_cache=True)
|
||||
|
||||
# Skip cache entirely
|
||||
stats = client.get_bucket_stats('releases', use_cache=False)
|
||||
|
||||
# Different modes with caching
|
||||
stats = client.get_bucket_stats('releases', mode='quick') # Fast
|
||||
stats = client.get_bucket_stats('releases', mode='sampled') # Balanced
|
||||
stats = client.get_bucket_stats('releases', mode='detailed') # Accurate
|
||||
```
|
||||
|
||||
## Cache Structure
|
||||
|
||||
Cache files are stored at `.deltaglider/stats_{mode}.json` in your bucket:
|
||||
|
||||
```json
|
||||
{
|
||||
"version": "1.0",
|
||||
"mode": "quick",
|
||||
"computed_at": "2025-10-14T10:30:00Z",
|
||||
"validation": {
|
||||
"object_count": 1523,
|
||||
"compressed_size": 1234567890
|
||||
},
|
||||
"stats": {
|
||||
"bucket": "releases",
|
||||
"object_count": 1523,
|
||||
"total_size": 50000000000,
|
||||
"compressed_size": 1234567890,
|
||||
"space_saved": 48765432110,
|
||||
"average_compression_ratio": 0.9753,
|
||||
"delta_objects": 1500,
|
||||
"direct_objects": 23
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## How Validation Works
|
||||
|
||||
**Smart Staleness Detection**:
|
||||
1. Always perform quick LIST operation (required anyway, ~50-100ms)
|
||||
2. Calculate current `object_count` and `compressed_size` from LIST
|
||||
3. Compare with cached values
|
||||
4. If **both match** → Cache valid, return instantly
|
||||
5. If **either differs** → Bucket changed, recompute automatically
|
||||
|
||||
This catches:
|
||||
- ✅ Objects added (count increases)
|
||||
- ✅ Objects removed (count decreases)
|
||||
- ✅ Objects replaced (size changes)
|
||||
- ✅ Content modified (size changes)
|
||||
|
||||
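**Edge Case**: If only metadata changes (tags, headers) but not content/count/size, cache remains valid. The same applies when an object is overwritten with different content of exactly the same size, because validation compares only `object_count` and `compressed_size`. This is acceptable since such changes are rare and don't affect core statistics; use `--refresh` (CLI) or `refresh_cache=True` (SDK) when you need to force recomputation.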
|
||||
|
||||
## Use Cases
|
||||
|
||||
### ✅ Perfect For
|
||||
|
||||
1. **Monitoring Dashboards**
|
||||
- Check stats every minute
|
||||
- Bucket rarely changes
|
||||
- **20 min → 100ms per check** ✨
|
||||
|
||||
2. **CI/CD Status Checks**
|
||||
- Verify upload success
|
||||
- Check compression effectiveness
|
||||
- Near-instant feedback
|
||||
|
||||
3. **Repeated Analysis**
|
||||
- Multiple stats queries during investigation
|
||||
- Cache persists across sessions
|
||||
- Huge time savings
|
||||
|
||||
### ⚠️ Less Beneficial For
|
||||
|
||||
1. **Write-Heavy Buckets**
|
||||
- Bucket changes on every check
|
||||
- Cache always stale
|
||||
- **No benefit, but no harm either** (graceful degradation)
|
||||
|
||||
2. **One-Off Queries**
|
||||
- Single stats check
|
||||
- Cache doesn't help (cold cache)
|
||||
- Still works normally
|
||||
|
||||
## Cache Management
|
||||
|
||||
### Automatic Management
|
||||
|
||||
- **Creation**: Automatic on first `get_bucket_stats()` call
|
||||
- **Validation**: Automatic on every call (always current)
|
||||
- **Updates**: Automatic when bucket changes
|
||||
- **Cleanup**: Not needed (cache files are tiny ~1-10KB)
|
||||
|
||||
### Manual Management
|
||||
|
||||
```bash
|
||||
# View cache files
|
||||
deltaglider ls s3://my-bucket/.deltaglider/
|
||||
|
||||
# Delete cache manually (will be recreated automatically)
|
||||
deltaglider rm s3://my-bucket/.deltaglider/stats_quick.json
|
||||
deltaglider rm s3://my-bucket/.deltaglider/stats_sampled.json
|
||||
deltaglider rm s3://my-bucket/.deltaglider/stats_detailed.json
|
||||
|
||||
# Or delete entire .deltaglider prefix
|
||||
deltaglider rm -r s3://my-bucket/.deltaglider/
|
||||
```
|
||||
|
||||
## Technical Details
|
||||
|
||||
### Cache Files
|
||||
|
||||
- **Location**: `.deltaglider/` prefix in each bucket
|
||||
- **Naming**: `stats_{mode}.json` (quick, sampled, detailed)
|
||||
- **Size**: ~1-10KB per file
|
||||
- **Format**: JSON with version, mode, validation data, and stats
|
||||
|
||||
### Validation Logic
|
||||
|
||||
```python
|
||||
def is_cache_valid(cached, current):
|
||||
"""Cache is valid if object count and size unchanged."""
|
||||
return (
|
||||
cached['object_count'] == current['object_count'] and
|
||||
cached['compressed_size'] == current['compressed_size']
|
||||
)
|
||||
```
|
||||
|
||||
### Error Handling
|
||||
|
||||
Cache operations are **non-fatal**:
|
||||
- ✅ Cache read fails → Compute normally, log warning
|
||||
- ✅ Cache write fails → Return computed stats, log warning
|
||||
- ✅ Corrupted cache → Ignore, recompute, overwrite
|
||||
- ✅ Version mismatch → Ignore, recompute with new version
|
||||
- ✅ Permission denied → Log warning, continue without caching
|
||||
|
||||
**The stats operation never fails due to cache issues.**
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
Potential improvements for the future:
|
||||
|
||||
1. **TTL-Based Expiration**: Auto-refresh after N hours even if unchanged
|
||||
2. **Cache Cleanup Command**: `deltaglider cache clear` for manual invalidation
|
||||
3. **Cache Statistics**: Show hit/miss rates, staleness info
|
||||
4. **Async Cache Updates**: Background refresh for very large buckets
|
||||
5. **Cross-Bucket Cache**: Share reference data across related buckets
|
||||
|
||||
## Comparison with Old Implementation
|
||||
|
||||
| Aspect | Old (In-Memory) | New (S3-Based) |
|
||||
|--------|----------------|----------------|
|
||||
| **Storage** | Process memory | S3 bucket |
|
||||
| **Persistence** | Lost on restart | Survives restarts |
|
||||
| **Sharing** | Per-process | Shared across all clients |
|
||||
| **Validation** | None | Automatic on every call |
|
||||
| **Staleness** | Always fresh | Automatically detected |
|
||||
| **Use Case** | Single session | Monitoring, dashboards |
|
||||
|
||||
## Examples
|
||||
|
||||
### Example 1: Monitoring Dashboard
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
import time
|
||||
|
||||
client = create_client()
|
||||
|
||||
while True:
|
||||
# Fast stats check (~100ms with cache)
|
||||
stats = client.get_bucket_stats('releases')
|
||||
print(f"Objects: {stats.object_count}, "
|
||||
f"Compression: {stats.average_compression_ratio:.1%}")
|
||||
|
||||
time.sleep(60) # Check every minute
|
||||
|
||||
# First run: 20 min (computes and caches)
|
||||
# All subsequent runs: ~100ms (cache hit)
|
||||
```
|
||||
|
||||
### Example 2: CI/CD Pipeline
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Upload new release
|
||||
client.upload("v2.0.0.zip", "s3://releases/v2.0.0/")
|
||||
|
||||
# Quick verification (fast with cache)
|
||||
stats = client.get_bucket_stats('releases')
|
||||
if stats.average_compression_ratio < 0.90:
|
||||
print("Warning: Lower than expected compression")
|
||||
```
|
||||
|
||||
### Example 3: Force Fresh Stats
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Force recomputation for accurate report
|
||||
stats = client.get_bucket_stats(
|
||||
'releases',
|
||||
mode='detailed',
|
||||
refresh_cache=True
|
||||
)
|
||||
|
||||
print(f"Accurate compression report:")
|
||||
print(f" Original: {stats.total_size / 1e9:.1f} GB")
|
||||
print(f" Stored: {stats.compressed_size / 1e9:.1f} GB")
|
||||
print(f" Saved: {stats.space_saved / 1e9:.1f} GB ({stats.average_compression_ratio:.1%})")
|
||||
```
|
||||
|
||||
## FAQ
|
||||
|
||||
**Q: Does caching affect accuracy?**
|
||||
A: No! Cache is automatically validated on every call. If the bucket changed, stats are recomputed automatically.
|
||||
|
||||
**Q: What if I need fresh stats immediately?**
|
||||
A: Use `--refresh` flag (CLI) or `refresh_cache=True` (SDK) to force recomputation.
|
||||
|
||||
**Q: Can I disable caching?**
|
||||
A: Yes, use `--no-cache` flag (CLI) or `use_cache=False` (SDK).
|
||||
|
||||
**Q: How much space do cache files use?**
|
||||
A: ~1-10KB per mode, negligible for any bucket.
|
||||
|
||||
**Q: What happens if cache write fails?**
|
||||
A: The operation continues normally - computed stats are returned and a warning is logged. Caching is optional and non-fatal.
|
||||
|
||||
**Q: Do I need to clean up cache files?**
|
||||
A: No, they're tiny and automatically managed. But you can delete `.deltaglider/` prefix if desired.
|
||||
|
||||
**Q: Does cache work across different modes?**
|
||||
A: Each mode (quick, sampled, detailed) has its own independent cache file.
|
||||
|
||||
---
|
||||
|
||||
**Implementation**: See [PR #XX] for complete implementation details and test coverage.
|
||||
|
||||
**Related**: [SDK Documentation](sdk/README.md) | [CLI Reference](../README.md#cli-reference) | [Architecture](sdk/architecture.md)
|
||||
@@ -9,6 +9,8 @@ DeltaGlider provides AWS S3 CLI compatible commands with automatic delta compres
|
||||
- `deltaglider ls [s3_url]` - List buckets and objects
|
||||
- `deltaglider rm <s3_url>` - Remove objects
|
||||
- `deltaglider sync <source> <destination>` - Synchronize directories
|
||||
- `deltaglider migrate <source> <destination>` - Migrate S3 buckets with compression and EC2 cost warnings
|
||||
- `deltaglider stats <bucket>` - Get bucket statistics and compression metrics
|
||||
- `deltaglider verify <s3_url>` - Verify file integrity
|
||||
|
||||
### Current Usage Examples
|
||||
|
||||
@@ -57,9 +57,10 @@ while response.get('IsTruncated'):
|
||||
# Get detailed compression stats only when needed
|
||||
response = client.list_objects(Bucket='releases', FetchMetadata=True) # Slower but detailed
|
||||
|
||||
# Quick bucket statistics
|
||||
stats = client.get_bucket_stats('releases') # Fast overview
|
||||
stats = client.get_bucket_stats('releases', detailed_stats=True) # With compression metrics
|
||||
# Bucket statistics with intelligent S3-based caching (NEW!)
|
||||
stats = client.get_bucket_stats('releases') # Fast (~100ms with cache)
|
||||
stats = client.get_bucket_stats('releases', mode='detailed') # Accurate compression metrics
|
||||
stats = client.get_bucket_stats('releases', refresh_cache=True) # Force fresh computation
|
||||
|
||||
client.delete_object(Bucket='releases', Key='old-version.zip')
|
||||
```
|
||||
|
||||
@@ -53,6 +53,7 @@ dependencies = [
|
||||
"click>=8.1.0",
|
||||
"cryptography>=42.0.0",
|
||||
"python-dateutil>=2.9.0",
|
||||
"requests>=2.32.0",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
@@ -109,6 +110,7 @@ dev-dependencies = [
|
||||
"mypy>=1.13.0",
|
||||
"boto3-stubs[s3]>=1.35.0",
|
||||
"types-python-dateutil>=2.9.0",
|
||||
"types-requests>=2.32.0",
|
||||
"setuptools-scm>=8.0.0",
|
||||
]
|
||||
|
||||
|
||||
101
scripts/check_metadata.py
Normal file
101
scripts/check_metadata.py
Normal file
@@ -0,0 +1,101 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Check which delta files are missing metadata."""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add src to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from deltaglider import create_client
|
||||
|
||||
|
||||
def check_bucket_metadata(bucket: str) -> None:
    """Check all delta files in a bucket for missing metadata.

    Lists the bucket, issues a HEAD for every key ending in ``.delta``, prints
    a status line per object and finishes with a summary of which objects lack
    any of the required metadata fields.

    Args:
        bucket: S3 bucket name
    """
    # Metadata fields a delta object must carry to count as complete.
    required_fields = ["file_size", "file_sha256", "ref_key", "ref_sha256", "delta_size"]

    client = create_client()

    print(f"Checking delta files in bucket: {bucket}\n")
    print("=" * 80)

    # List all objects
    # NOTE(review): a single listing call is made; if the storage backend caps
    # the page size below max_keys, later pages are never inspected -- confirm.
    response = client.service.storage.list_objects(bucket=bucket, max_keys=10000)

    missing_metadata = []
    has_metadata = []
    total_delta_files = 0

    for obj in response["objects"]:
        key = obj["key"]

        # Only check .delta files
        if not key.endswith(".delta"):
            continue

        total_delta_files += 1

        # Get metadata
        obj_head = client.service.storage.head(f"{bucket}/{key}")

        if not obj_head:
            print(f"❌ {key}: Object not found")
            continue

        # A missing metadata mapping is treated as empty so that every required
        # field is reported as missing instead of the membership test failing.
        metadata = obj_head.metadata or {}

        # Check for required metadata fields
        missing_fields = [f for f in required_fields if f not in metadata]

        if missing_fields:
            missing_metadata.append(
                {
                    "key": key,
                    "missing_fields": missing_fields,
                    "has_metadata": bool(metadata),
                    "available_keys": list(metadata.keys()),
                }
            )
            status = "⚠️ MISSING"
            detail = f"missing: {', '.join(missing_fields)}"
        else:
            has_metadata.append(key)
            status = "✅ OK"
            detail = f"file_size={metadata.get('file_size')}"

        print(f"{status} {key}")
        print(f" {detail}")
        if metadata:
            print(f" Available keys: {', '.join(metadata.keys())}")
        print()

    # Summary
    print("=" * 80)
    print("\nSummary:")
    print(f" Total delta files: {total_delta_files}")

    if total_delta_files == 0:
        # Without this guard the percentage lines below divide by zero.
        print("\nNo delta files found.")
        return

    print(f" With complete metadata: {len(has_metadata)} ({len(has_metadata)/total_delta_files*100:.1f}%)")
    print(f" Missing metadata: {len(missing_metadata)} ({len(missing_metadata)/total_delta_files*100:.1f}%)")

    if missing_metadata:
        print("\n❌ Files with missing metadata:")
        for item in missing_metadata:
            print(f" - {item['key']}")
            print(f" Missing: {', '.join(item['missing_fields'])}")
            if item['available_keys']:
                print(f" Has: {', '.join(item['available_keys'])}")

        print("\n💡 Recommendation:")
        print(" These files should be re-uploaded to get proper metadata and accurate stats.")
        print(f" You can re-upload with: deltaglider cp <local-file> s3://{bucket}/<path>")
    else:
        print("\n✅ All delta files have complete metadata!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Exactly one positional argument (the bucket name) is required; anything
    # else prints the usage line and exits with a non-zero status.
    if len(sys.argv) == 2:
        bucket_name = sys.argv[1]
        check_bucket_metadata(bucket_name)
    else:
        print("Usage: python check_metadata.py <bucket-name>")
        sys.exit(1)
|
||||
@@ -6,20 +6,22 @@ from .cache_fs import FsCacheAdapter
|
||||
from .cache_memory import MemoryCache
|
||||
from .clock_utc import UtcClockAdapter
|
||||
from .diff_xdelta import XdeltaAdapter
|
||||
from .ec2_metadata import EC2MetadataAdapter
|
||||
from .hash_sha import Sha256Adapter
|
||||
from .logger_std import StdLoggerAdapter
|
||||
from .metrics_noop import NoopMetricsAdapter
|
||||
from .storage_s3 import S3StorageAdapter
|
||||
|
||||
__all__ = [
|
||||
"S3StorageAdapter",
|
||||
"XdeltaAdapter",
|
||||
"Sha256Adapter",
|
||||
"FsCacheAdapter",
|
||||
"ContentAddressedCache",
|
||||
"EC2MetadataAdapter",
|
||||
"EncryptedCache",
|
||||
"FsCacheAdapter",
|
||||
"MemoryCache",
|
||||
"UtcClockAdapter",
|
||||
"StdLoggerAdapter",
|
||||
"NoopMetricsAdapter",
|
||||
"S3StorageAdapter",
|
||||
"Sha256Adapter",
|
||||
"StdLoggerAdapter",
|
||||
"UtcClockAdapter",
|
||||
"XdeltaAdapter",
|
||||
]
|
||||
|
||||
126
src/deltaglider/adapters/ec2_metadata.py
Normal file
126
src/deltaglider/adapters/ec2_metadata.py
Normal file
@@ -0,0 +1,126 @@
|
||||
"""EC2 Instance Metadata Service (IMDS) adapter.
|
||||
|
||||
Provides access to EC2 instance metadata using IMDSv2 with token-based authentication.
|
||||
Falls back gracefully when not running on EC2.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
class EC2MetadataAdapter:
|
||||
"""Adapter for EC2 Instance Metadata Service (IMDSv2)."""
|
||||
|
||||
IMDS_BASE_URL = "http://169.254.169.254/latest"
|
||||
TOKEN_URL = f"{IMDS_BASE_URL}/api/token"
|
||||
TOKEN_TTL_SECONDS = 21600 # 6 hours
|
||||
TOKEN_HEADER = "X-aws-ec2-metadata-token"
|
||||
TIMEOUT_SECONDS = 1 # Fast timeout for non-EC2 environments
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize EC2 metadata adapter."""
|
||||
self._token: str | None = None
|
||||
self._is_ec2: bool | None = None
|
||||
self._region: str | None = None
|
||||
|
||||
def is_running_on_ec2(self) -> bool:
|
||||
"""Check if running on an EC2 instance.
|
||||
|
||||
Returns:
|
||||
True if running on EC2, False otherwise
|
||||
|
||||
Note:
|
||||
Result is cached after first check for performance.
|
||||
"""
|
||||
if self._is_ec2 is not None:
|
||||
return self._is_ec2
|
||||
|
||||
# Skip check if explicitly disabled
|
||||
if os.environ.get("DG_DISABLE_EC2_DETECTION", "").lower() in ("true", "1", "yes"):
|
||||
self._is_ec2 = False
|
||||
return False
|
||||
|
||||
try:
|
||||
# Try to get IMDSv2 token
|
||||
self._token = self._get_token()
|
||||
self._is_ec2 = self._token is not None
|
||||
except Exception:
|
||||
self._is_ec2 = False
|
||||
|
||||
return self._is_ec2
|
||||
|
||||
def get_region(self) -> str | None:
|
||||
"""Get the EC2 instance's AWS region.
|
||||
|
||||
Returns:
|
||||
AWS region code (e.g., "us-east-1") or None if not on EC2
|
||||
|
||||
Note:
|
||||
Result is cached after first successful fetch.
|
||||
"""
|
||||
if not self.is_running_on_ec2():
|
||||
return None
|
||||
|
||||
if self._region is not None:
|
||||
return self._region
|
||||
|
||||
try:
|
||||
if self._token:
|
||||
response = requests.get(
|
||||
f"{self.IMDS_BASE_URL}/meta-data/placement/region",
|
||||
headers={self.TOKEN_HEADER: self._token},
|
||||
timeout=self.TIMEOUT_SECONDS,
|
||||
)
|
||||
if response.status_code == 200:
|
||||
self._region = response.text.strip()
|
||||
return self._region
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def get_availability_zone(self) -> str | None:
|
||||
"""Get the EC2 instance's availability zone.
|
||||
|
||||
Returns:
|
||||
Availability zone (e.g., "us-east-1a") or None if not on EC2
|
||||
"""
|
||||
if not self.is_running_on_ec2():
|
||||
return None
|
||||
|
||||
try:
|
||||
if self._token:
|
||||
response = requests.get(
|
||||
f"{self.IMDS_BASE_URL}/meta-data/placement/availability-zone",
|
||||
headers={self.TOKEN_HEADER: self._token},
|
||||
timeout=self.TIMEOUT_SECONDS,
|
||||
)
|
||||
if response.status_code == 200:
|
||||
return str(response.text.strip())
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def _get_token(self) -> str | None:
|
||||
"""Get IMDSv2 token for authenticated metadata requests.
|
||||
|
||||
Returns:
|
||||
IMDSv2 token or None if unable to retrieve
|
||||
|
||||
Note:
|
||||
Uses IMDSv2 for security. IMDSv1 is not supported.
|
||||
"""
|
||||
try:
|
||||
response = requests.put(
|
||||
self.TOKEN_URL,
|
||||
headers={"X-aws-ec2-metadata-token-ttl-seconds": str(self.TOKEN_TTL_SECONDS)},
|
||||
timeout=self.TIMEOUT_SECONDS,
|
||||
)
|
||||
if response.status_code == 200:
|
||||
return response.text.strip()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
@@ -97,6 +97,7 @@ class S3StorageAdapter(StoragePort):
|
||||
delimiter: str = "",
|
||||
max_keys: int = 1000,
|
||||
start_after: str | None = None,
|
||||
continuation_token: str | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""List objects with S3-compatible response.
|
||||
|
||||
@@ -105,7 +106,8 @@ class S3StorageAdapter(StoragePort):
|
||||
prefix: Filter results to keys beginning with prefix
|
||||
delimiter: Delimiter for grouping keys (e.g., '/' for folders)
|
||||
max_keys: Maximum number of keys to return
|
||||
start_after: Start listing after this key
|
||||
start_after: Start listing after this key (for first page only)
|
||||
continuation_token: Token from previous response for pagination
|
||||
|
||||
Returns:
|
||||
Dict with objects, common_prefixes, and pagination info
|
||||
@@ -119,7 +121,11 @@ class S3StorageAdapter(StoragePort):
|
||||
params["Prefix"] = prefix
|
||||
if delimiter:
|
||||
params["Delimiter"] = delimiter
|
||||
if start_after:
|
||||
|
||||
# Use ContinuationToken for pagination if available, otherwise StartAfter
|
||||
if continuation_token:
|
||||
params["ContinuationToken"] = continuation_token
|
||||
elif start_after:
|
||||
params["StartAfter"] = start_after
|
||||
|
||||
try:
|
||||
|
||||
@@ -1,28 +1,120 @@
|
||||
"""AWS S3 CLI compatible commands."""
|
||||
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
|
||||
from ...core import DeltaService, DeltaSpace, ObjectKey
|
||||
from ...core import (
|
||||
DeltaService,
|
||||
DeltaSpace,
|
||||
ObjectKey,
|
||||
build_s3_url,
|
||||
is_s3_url,
|
||||
)
|
||||
from ...core import parse_s3_url as core_parse_s3_url
|
||||
from .sync import fetch_s3_object_heads
|
||||
|
||||
__all__ = [
|
||||
"is_s3_path",
|
||||
"parse_s3_url",
|
||||
"determine_operation",
|
||||
"upload_file",
|
||||
"download_file",
|
||||
"copy_s3_to_s3",
|
||||
"migrate_s3_to_s3",
|
||||
"handle_recursive",
|
||||
"log_aws_region",
|
||||
]
|
||||
|
||||
|
||||
def log_aws_region(service: DeltaService, region_override: bool = False) -> None:
    """Print the S3 client region and flag EC2/S3 region mismatches.

    Nothing is printed unless ``service.storage`` is an ``S3StorageAdapter``.
    The client's region is reported, with ``us-east-1`` substituted when the
    client has none. When the EC2 metadata adapter reports that the process
    runs on EC2, the instance location is printed as well and the two regions
    are compared:

    * mismatch and ``region_override`` set: yellow warning about charges
    * mismatch without ``region_override``: cyan hint suggesting ``--region``
    * match: green confirmation

    Every exception raised while gathering this information is swallowed, so
    the function never interrupts the calling command.

    Args:
        service: DeltaService instance whose storage adapter is inspected.
        region_override: True if the user explicitly specified ``--region``.
    """
    try:
        from ...adapters.ec2_metadata import EC2MetadataAdapter
        from ...adapters.storage_s3 import S3StorageAdapter

        if not isinstance(service.storage, S3StorageAdapter):
            # Region reporting only makes sense for the S3 backend.
            return

        # A client without a region is reported as the boto3 default.
        s3_region = service.storage.client.meta.region_name or "us-east-1"

        metadata = EC2MetadataAdapter()
        if not metadata.is_running_on_ec2():
            # Outside EC2 there is nothing to compare against.
            click.echo(f"S3 Client Region: {s3_region}")
            return

        ec2_region = metadata.get_region()
        ec2_az = metadata.get_availability_zone()

        click.echo(f"EC2 Instance: {ec2_az or ec2_region or 'unknown'}")
        click.echo(f"S3 Client Region: {s3_region}")

        if not ec2_region:
            # Instance region unknown: no verdict can be given.
            return

        if ec2_region == s3_region:
            click.secho("✓ Regions aligned - no cross-region charges", fg="green")
            return

        # Mismatch: both variants are framed by blank lines.
        click.echo("")
        if region_override:
            # The user asked for this region explicitly, so warn about costs.
            click.secho(
                f"⚠️ WARNING: EC2 region={ec2_region} != S3 client region={s3_region}",
                fg="yellow",
                bold=True,
            )
            click.secho(
                f" Expect cross-region/NAT data charges. Align regions (set client region={ec2_region})",
                fg="yellow",
            )
            click.secho(
                " before proceeding. Or drop --region for automatic region resolution.",
                fg="yellow",
            )
        else:
            # The region came from configuration; a softer hint is enough.
            click.secho(
                f"ℹ️ INFO: EC2 region ({ec2_region}) differs from configured S3 region ({s3_region})",
                fg="cyan",
            )
            click.secho(
                f" Consider using --region {ec2_region} to avoid cross-region charges.",
                fg="cyan",
            )
        click.echo("")

    except Exception:
        # Best-effort diagnostics: failures here must never block the command.
        pass
|
||||
|
||||
|
||||
def is_s3_path(path: str) -> bool:
|
||||
"""Check if path is an S3 URL."""
|
||||
return path.startswith("s3://")
|
||||
return is_s3_url(path)
|
||||
|
||||
|
||||
def parse_s3_url(url: str) -> tuple[str, str]:
|
||||
"""Parse S3 URL into bucket and key."""
|
||||
if not url.startswith("s3://"):
|
||||
raise ValueError(f"Invalid S3 URL: {url}")
|
||||
|
||||
s3_path = url[5:].rstrip("/")
|
||||
parts = s3_path.split("/", 1)
|
||||
bucket = parts[0]
|
||||
key = parts[1] if len(parts) > 1 else ""
|
||||
return bucket, key
|
||||
parsed = core_parse_s3_url(url, strip_trailing_slash=True)
|
||||
return parsed.bucket, parsed.key
|
||||
|
||||
|
||||
def determine_operation(source: str, dest: str) -> str:
|
||||
@@ -57,6 +149,8 @@ def upload_file(
|
||||
|
||||
delta_space = DeltaSpace(bucket=bucket, prefix="/".join(key.split("/")[:-1]))
|
||||
|
||||
dest_url = build_s3_url(bucket, key)
|
||||
|
||||
try:
|
||||
# Check if delta should be disabled
|
||||
if no_delta:
|
||||
@@ -66,7 +160,7 @@ def upload_file(
|
||||
|
||||
if not quiet:
|
||||
file_size = local_path.stat().st_size
|
||||
click.echo(f"upload: '{local_path}' to 's3://{bucket}/{key}' ({file_size} bytes)")
|
||||
click.echo(f"upload: '{local_path}' to '{dest_url}' ({file_size} bytes)")
|
||||
else:
|
||||
# Use delta compression
|
||||
summary = service.put(local_path, delta_space, max_ratio)
|
||||
@@ -75,12 +169,12 @@ def upload_file(
|
||||
if summary.delta_size:
|
||||
ratio = round((summary.delta_size / summary.file_size) * 100, 1)
|
||||
click.echo(
|
||||
f"upload: '{local_path}' to 's3://{bucket}/{summary.key}' "
|
||||
f"upload: '{local_path}' to '{build_s3_url(bucket, summary.key)}' "
|
||||
f"(delta: {ratio}% of original)"
|
||||
)
|
||||
else:
|
||||
click.echo(
|
||||
f"upload: '{local_path}' to 's3://{bucket}/{summary.key}' "
|
||||
f"upload: '{local_path}' to '{build_s3_url(bucket, summary.key)}' "
|
||||
f"(reference: {summary.file_size} bytes)"
|
||||
)
|
||||
|
||||
@@ -112,7 +206,7 @@ def download_file(
|
||||
actual_key = delta_key
|
||||
obj_key = ObjectKey(bucket=bucket, key=delta_key)
|
||||
if not quiet:
|
||||
click.echo(f"Auto-detected delta: s3://{bucket}/{delta_key}")
|
||||
click.echo(f"Auto-detected delta: {build_s3_url(bucket, delta_key)}")
|
||||
|
||||
# Determine output path
|
||||
if local_path is None:
|
||||
@@ -136,7 +230,7 @@ def download_file(
|
||||
if not quiet:
|
||||
file_size = local_path.stat().st_size
|
||||
click.echo(
|
||||
f"download: 's3://{bucket}/{actual_key}' to '{local_path}' ({file_size} bytes)"
|
||||
f"download: '{build_s3_url(bucket, actual_key)}' to '{local_path}' ({file_size} bytes)"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
@@ -149,31 +243,310 @@ def copy_s3_to_s3(
|
||||
source_url: str,
|
||||
dest_url: str,
|
||||
quiet: bool = False,
|
||||
max_ratio: float | None = None,
|
||||
no_delta: bool = False,
|
||||
) -> None:
|
||||
"""Copy object between S3 locations."""
|
||||
# For now, implement as download + upload
|
||||
# TODO: Optimize with server-side copy when possible
|
||||
"""Copy object between S3 locations with optional delta compression.
|
||||
|
||||
This performs a direct S3-to-S3 transfer using streaming to preserve
|
||||
the original file content and apply delta compression at the destination.
|
||||
"""
|
||||
source_bucket, source_key = parse_s3_url(source_url)
|
||||
dest_bucket, dest_key = parse_s3_url(dest_url)
|
||||
|
||||
if not quiet:
|
||||
click.echo(f"copy: 's3://{source_bucket}/{source_key}' to 's3://{dest_bucket}/{dest_key}'")
|
||||
click.echo(
|
||||
f"copy: '{build_s3_url(source_bucket, source_key)}' "
|
||||
f"to '{build_s3_url(dest_bucket, dest_key)}'"
|
||||
)
|
||||
|
||||
# Use temporary file
|
||||
import tempfile
|
||||
try:
|
||||
# Get the source object as a stream
|
||||
source_stream = service.storage.get(f"{source_bucket}/{source_key}")
|
||||
|
||||
with tempfile.NamedTemporaryFile(suffix=Path(source_key).suffix) as tmp:
|
||||
tmp_path = Path(tmp.name)
|
||||
# Determine the destination deltaspace
|
||||
dest_key_parts = dest_key.split("/")
|
||||
if len(dest_key_parts) > 1:
|
||||
dest_prefix = "/".join(dest_key_parts[:-1])
|
||||
else:
|
||||
dest_prefix = ""
|
||||
|
||||
# Download from source
|
||||
download_file(service, source_url, tmp_path, quiet=True)
|
||||
dest_deltaspace = DeltaSpace(bucket=dest_bucket, prefix=dest_prefix)
|
||||
|
||||
# Upload to destination
|
||||
upload_file(service, tmp_path, dest_url, quiet=True)
|
||||
# If delta is disabled or max_ratio specified, use direct put
|
||||
if no_delta:
|
||||
# Direct storage put without delta compression
|
||||
service.storage.put(f"{dest_bucket}/{dest_key}", source_stream, {})
|
||||
if not quiet:
|
||||
click.echo("Copy completed (no delta compression)")
|
||||
else:
|
||||
# Write to a temporary file and use override_name to preserve original filename
|
||||
import tempfile
|
||||
|
||||
# Extract original filename from source
|
||||
original_filename = Path(source_key).name
|
||||
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix=Path(source_key).suffix) as tmp:
|
||||
tmp_path = Path(tmp.name)
|
||||
|
||||
# Write stream to temp file
|
||||
with open(tmp_path, "wb") as f:
|
||||
shutil.copyfileobj(source_stream, f)
|
||||
|
||||
try:
|
||||
# Use DeltaService.put() with override_name to preserve original filename
|
||||
summary = service.put(
|
||||
tmp_path, dest_deltaspace, max_ratio, override_name=original_filename
|
||||
)
|
||||
|
||||
if not quiet:
|
||||
if summary.delta_size:
|
||||
ratio = round((summary.delta_size / summary.file_size) * 100, 1)
|
||||
click.echo(f"Copy completed with delta compression ({ratio}% of original)")
|
||||
else:
|
||||
click.echo("Copy completed (stored as reference)")
|
||||
finally:
|
||||
# Clean up temp file
|
||||
tmp_path.unlink(missing_ok=True)
|
||||
|
||||
except Exception as e:
|
||||
click.echo(f"S3-to-S3 copy failed: {e}", err=True)
|
||||
raise
|
||||
|
||||
|
||||
def migrate_s3_to_s3(
    service: DeltaService,
    source_url: str,
    dest_url: str,
    exclude: str | None = None,
    include: str | None = None,
    quiet: bool = False,
    no_delta: bool = False,
    max_ratio: float | None = None,
    dry_run: bool = False,
    skip_confirm: bool = False,
    preserve_prefix: bool = False,
    region_override: bool = False,
) -> None:
    """Migrate objects from one S3 location to another with delta compression.

    Features:
    - Resume support: Only copies files that don't exist in destination
    - Progress tracking: Shows migration progress
    - Confirmation prompt: Shows file count before starting
    - Prefix preservation: Optionally preserves source prefix structure in destination
    - EC2 region detection: Warns about cross-region data transfer charges

    Source objects whose key ends in ``.delta`` or ``/reference.bin`` are never
    migrated. A source object counts as already migrated when the destination
    holds the same relative key, either verbatim or with a ``.delta`` suffix.
    Per-file copy errors are collected and listed in the final summary instead
    of being raised, so one failing object does not abort the run.

    Args:
        service: DeltaService instance
        source_url: Source S3 URL
        dest_url: Destination S3 URL
        exclude: Pattern to exclude files (fnmatch, applied to the relative key)
        include: Pattern to include files (fnmatch, applied to the relative key)
        quiet: Suppress output (this also suppresses the confirmation prompt)
        no_delta: Disable delta compression
        max_ratio: Maximum delta/file ratio
        dry_run: Show what would be migrated without migrating
        skip_confirm: Skip confirmation prompt
        preserve_prefix: Preserve source prefix in destination
        region_override: True if user explicitly specified --region flag
    """
    import fnmatch

    def format_bytes(size: int) -> str:
        """Render a byte count scaled by 1024 with a unit from B to PB."""
        size_float = float(size)
        for unit in ["B", "KB", "MB", "GB", "TB"]:
            if size_float < 1024.0:
                return f"{size_float:.2f} {unit}"
            size_float /= 1024.0
        return f"{size_float:.2f} PB"

    source_bucket, source_prefix = parse_s3_url(source_url)
    dest_bucket, dest_prefix = parse_s3_url(dest_url)

    # Ensure prefixes end with / if they exist
    if source_prefix and not source_prefix.endswith("/"):
        source_prefix += "/"
    if dest_prefix and not dest_prefix.endswith("/"):
        dest_prefix += "/"

    # Determine the effective destination prefix based on preserve_prefix setting
    effective_dest_prefix = dest_prefix
    if preserve_prefix and source_prefix:
        # Last component of the source prefix, e.g. "prefix1" from "path/to/prefix1/"
        source_prefix_name = source_prefix.rstrip("/").split("/")[-1]
        if source_prefix_name:
            effective_dest_prefix = (dest_prefix or "") + source_prefix_name + "/"

    if not quiet:
        # Log AWS region being used (helps users verify their configuration).
        # region_override selects the stronger warning for an explicit --region.
        log_aws_region(service, region_override=region_override)

        source_display = build_s3_url(source_bucket, source_prefix)
        dest_display = build_s3_url(dest_bucket, dest_prefix)
        effective_dest_display = build_s3_url(dest_bucket, effective_dest_prefix)

        if preserve_prefix and source_prefix:
            click.echo(f"Migrating from {source_display}")
            click.echo(f" to {effective_dest_display}")
        else:
            click.echo(f"Migrating from {source_display} to {dest_display}")
        click.echo("Scanning source and destination buckets...")

    # List source objects
    source_list_prefix = f"{source_bucket}/{source_prefix}" if source_prefix else source_bucket
    source_objects = []

    for obj in service.storage.list(source_list_prefix):
        # Skip reference.bin files (internal delta reference)
        if obj.key.endswith("/reference.bin"):
            continue
        # Skip .delta files in source (only original files are migrated)
        if obj.key.endswith(".delta"):
            continue

        # Apply include/exclude filters
        rel_key = obj.key.removeprefix(source_prefix) if source_prefix else obj.key
        if exclude and fnmatch.fnmatch(rel_key, exclude):
            continue
        if include and not fnmatch.fnmatch(rel_key, include):
            continue

        source_objects.append(obj)

    # List destination objects to detect what needs copying
    dest_list_prefix = (
        f"{dest_bucket}/{effective_dest_prefix}" if effective_dest_prefix else dest_bucket
    )
    dest_keys = set()

    for obj in service.storage.list(dest_list_prefix):
        # Filter internal references on the FULL key, exactly like the source
        # scan. Testing the prefix-relative key would miss the reference.bin
        # stored directly under the destination prefix (its relative key has
        # no leading "/"), and it would then pass for a migrated file.
        if obj.key.endswith("/reference.bin"):
            continue
        rel_key = obj.key.removeprefix(effective_dest_prefix) if effective_dest_prefix else obj.key
        # A compressed object "name.delta" stands for the original "name".
        dest_keys.add(rel_key.removesuffix(".delta"))

    # Determine files to migrate (not in destination)
    files_to_migrate = []
    total_size = 0

    for source_obj in source_objects:
        # Get relative path from source prefix
        rel_key = source_obj.key.removeprefix(source_prefix) if source_prefix else source_obj.key

        # Check if already exists in destination
        if rel_key not in dest_keys:
            files_to_migrate.append((source_obj, rel_key))
            total_size += source_obj.size

    # Count only selected source files found in the destination; unrelated
    # destination objects must not inflate the "already migrated" figure.
    already_migrated = len(source_objects) - len(files_to_migrate)

    # Show summary and ask for confirmation
    if not files_to_migrate:
        if not quiet:
            click.echo("All files are already migrated. Nothing to do.")
        return

    if not quiet:
        click.echo("")
        click.echo(f"Files to migrate: {len(files_to_migrate)}")
        click.echo(f"Total size: {format_bytes(total_size)}")
        if already_migrated > 0:
            click.echo(f"Already migrated: {already_migrated} files (will be skipped)")

    # Handle dry run mode early (before confirmation prompt)
    if dry_run:
        if not quiet:
            click.echo("\n--- DRY RUN MODE ---")
            for _obj, rel_key in files_to_migrate[:10]:  # Show first 10 files
                click.echo(f" Would migrate: {rel_key}")
            if len(files_to_migrate) > 10:
                click.echo(f" ... and {len(files_to_migrate) - 10} more files")
        return

    # Ask for confirmation before proceeding with actual migration
    if not quiet and not skip_confirm:
        click.echo("")
        if not click.confirm("Do you want to proceed with the migration?"):
            click.echo("Migration cancelled.")
            return

    # Perform migration
    if not quiet:
        click.echo(f"\nStarting migration of {len(files_to_migrate)} files...")

    successful = 0
    failed_files = []

    for i, (source_obj, rel_key) in enumerate(files_to_migrate, 1):
        source_s3_url = build_s3_url(source_bucket, source_obj.key)

        # Construct destination URL using effective prefix
        if effective_dest_prefix:
            dest_key = effective_dest_prefix + rel_key
        else:
            dest_key = rel_key
        dest_s3_url = build_s3_url(dest_bucket, dest_key)

        try:
            if not quiet:
                progress = f"[{i}/{len(files_to_migrate)}]"
                click.echo(f"{progress} Migrating {rel_key}...", nl=False)

            # Copy with delta compression
            copy_s3_to_s3(
                service,
                source_s3_url,
                dest_s3_url,
                quiet=True,
                max_ratio=max_ratio,
                no_delta=no_delta,
            )

            successful += 1
            if not quiet:
                click.echo(" ✓")

        except Exception as e:
            # Record the failure and keep going with the remaining files.
            failed_files.append((rel_key, str(e)))
            if not quiet:
                click.echo(f" ✗ ({e})")

    # Show final summary
    if not quiet:
        click.echo("")
        click.echo("Migration Summary:")
        click.echo(f" Successfully migrated: {successful} files")
        if failed_files:
            click.echo(f" Failed: {len(failed_files)} files")
            click.echo("\nFailed files:")
            for file, error in failed_files[:10]:  # Show first 10 failures
                click.echo(f" {file}: {error}")
            if len(failed_files) > 10:
                click.echo(f" ... and {len(failed_files) - 10} more failures")

        # Show compression statistics from cache if available (no bucket scan)
        if successful > 0 and not no_delta:
            try:
                from ...client import DeltaGliderClient

                client = DeltaGliderClient(service)
                # Use cached stats only - don't scan bucket (prevents blocking)
                cached_stats = client._get_cached_bucket_stats(dest_bucket, "quick")
                if cached_stats and cached_stats.delta_objects > 0:
                    click.echo(
                        f"\nCompression achieved: {cached_stats.average_compression_ratio:.1%}"
                    )
                    click.echo(f"Space saved: {format_bytes(cached_stats.space_saved)}")
            except Exception:
                pass  # Ignore stats errors
|
||||
|
||||
|
||||
def handle_recursive(
|
||||
@@ -228,10 +601,7 @@ def handle_recursive(
|
||||
dest_path = Path(dest)
|
||||
dest_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# List all objects with prefix
|
||||
# Note: S3StorageAdapter.list() expects "bucket/prefix" format
|
||||
list_prefix = f"{bucket}/{prefix}" if prefix else bucket
|
||||
objects = list(service.storage.list(list_prefix))
|
||||
objects = fetch_s3_object_heads(service, bucket, prefix)
|
||||
|
||||
if not quiet:
|
||||
click.echo(f"Downloading {len(objects)} files...")
|
||||
@@ -261,9 +631,22 @@ def handle_recursive(
|
||||
local_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Download file
|
||||
s3_url = f"s3://{bucket}/{obj.key}"
|
||||
s3_url = build_s3_url(bucket, obj.key)
|
||||
download_file(service, s3_url, local_path, quiet)
|
||||
|
||||
else:
|
||||
click.echo("S3-to-S3 recursive copy not yet implemented", err=True)
|
||||
sys.exit(1)
|
||||
elif operation == "copy":
|
||||
# S3-to-S3 recursive copy with migration support
|
||||
migrate_s3_to_s3(
|
||||
service,
|
||||
source,
|
||||
dest,
|
||||
exclude=exclude,
|
||||
include=include,
|
||||
quiet=quiet,
|
||||
no_delta=no_delta,
|
||||
max_ratio=max_ratio,
|
||||
dry_run=False,
|
||||
skip_confirm=True, # Don't prompt for cp command
|
||||
preserve_prefix=True, # Always preserve prefix for cp -r
|
||||
region_override=False, # cp command doesn't track region override explicitly
|
||||
)
|
||||
|
||||
@@ -7,9 +7,11 @@ import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import click
|
||||
|
||||
from ... import __version__
|
||||
from ...adapters import (
|
||||
NoopMetricsAdapter,
|
||||
S3StorageAdapter,
|
||||
@@ -49,7 +51,7 @@ def create_service(
|
||||
# Register cleanup handler to remove cache on exit
|
||||
atexit.register(lambda: shutil.rmtree(cache_dir, ignore_errors=True))
|
||||
|
||||
# Set AWS environment variables if provided
|
||||
# Set AWS environment variables if provided (for compatibility with other AWS tools)
|
||||
if endpoint_url:
|
||||
os.environ["AWS_ENDPOINT_URL"] = endpoint_url
|
||||
if region:
|
||||
@@ -57,9 +59,14 @@ def create_service(
|
||||
if profile:
|
||||
os.environ["AWS_PROFILE"] = profile
|
||||
|
||||
# Build boto3_kwargs for explicit parameter passing (preferred over env vars)
|
||||
boto3_kwargs: dict[str, Any] = {}
|
||||
if region:
|
||||
boto3_kwargs["region_name"] = region
|
||||
|
||||
# Create adapters
|
||||
hasher = Sha256Adapter()
|
||||
storage = S3StorageAdapter(endpoint_url=endpoint_url)
|
||||
storage = S3StorageAdapter(endpoint_url=endpoint_url, boto3_kwargs=boto3_kwargs)
|
||||
diff = XdeltaAdapter()
|
||||
|
||||
# SECURITY: Configurable cache with encryption and backend selection
|
||||
@@ -113,8 +120,23 @@ def create_service(
|
||||
)
|
||||
|
||||
|
||||
def _version_callback(ctx: click.Context, param: click.Parameter, value: bool) -> None:
    """Print the deltaglider version and exit when ``--version`` is given.

    Args:
        ctx: Active click context; used to stop processing after printing.
        param: The option that triggered the callback (unused, but part of
            the callback signature click calls with).
        value: Flag value; the callback is a no-op when it is false.
    """
    # Click may run option callbacks while parsing resiliently (for example
    # during shell completion); printing or exiting there would disturb it.
    if not value or ctx.resilient_parsing:
        return
    click.echo(f"deltaglider {__version__}")
    ctx.exit(0)
|
||||
|
||||
|
||||
@click.group()
|
||||
@click.option("--debug", is_flag=True, help="Enable debug logging")
|
||||
@click.option(
|
||||
"--version",
|
||||
is_flag=True,
|
||||
is_eager=True,
|
||||
expose_value=False,
|
||||
callback=_version_callback,
|
||||
help="Show version and exit",
|
||||
)
|
||||
@click.pass_context
|
||||
def cli(ctx: click.Context, debug: bool) -> None:
|
||||
"""DeltaGlider - Delta-aware S3 file storage wrapper."""
|
||||
@@ -172,9 +194,6 @@ def cp(
|
||||
|
||||
# Handle recursive operations for directories
|
||||
if recursive:
|
||||
if operation == "copy":
|
||||
click.echo("S3-to-S3 recursive copy not yet implemented", err=True)
|
||||
sys.exit(1)
|
||||
handle_recursive(
|
||||
service, source, dest, recursive, exclude, include, quiet, no_delta, max_ratio
|
||||
)
|
||||
@@ -196,7 +215,7 @@ def cp(
|
||||
download_file(service, source, local_path, quiet)
|
||||
|
||||
elif operation == "copy":
|
||||
copy_s3_to_s3(service, source, dest, quiet)
|
||||
copy_s3_to_s3(service, source, dest, quiet, max_ratio, no_delta)
|
||||
|
||||
except ValueError as e:
|
||||
click.echo(f"Error: {e}", err=True)
|
||||
@@ -604,20 +623,14 @@ def sync(
|
||||
@click.pass_obj
|
||||
def verify(service: DeltaService, s3_url: str) -> None:
|
||||
"""Verify integrity of delta file."""
|
||||
# Parse S3 URL
|
||||
if not s3_url.startswith("s3://"):
|
||||
try:
|
||||
bucket, key = parse_s3_url(s3_url)
|
||||
if not key:
|
||||
raise ValueError("Missing key")
|
||||
except ValueError:
|
||||
click.echo(f"Error: Invalid S3 URL: {s3_url}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
s3_path = s3_url[5:]
|
||||
parts = s3_path.split("/", 1)
|
||||
if len(parts) != 2:
|
||||
click.echo(f"Error: Invalid S3 URL: {s3_url}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
bucket = parts[0]
|
||||
key = parts[1]
|
||||
|
||||
obj_key = ObjectKey(bucket=bucket, key=key)
|
||||
|
||||
try:
|
||||
@@ -641,37 +654,196 @@ def verify(service: DeltaService, s3_url: str) -> None:
|
||||
|
||||
|
||||
@cli.command()
@click.argument("source")
@click.argument("dest")
@click.option("--exclude", help="Exclude files matching pattern")
@click.option("--include", help="Include only files matching pattern")
@click.option("--quiet", "-q", is_flag=True, help="Suppress output")
@click.option("--no-delta", is_flag=True, help="Disable delta compression")
@click.option("--max-ratio", type=float, help="Max delta/file ratio (default: 0.5)")
@click.option("--dry-run", is_flag=True, help="Show what would be migrated without migrating")
@click.option("--yes", "-y", is_flag=True, help="Skip confirmation prompt")
@click.option(
    "--no-preserve-prefix", is_flag=True, help="Don't preserve source prefix in destination"
)
@click.option("--endpoint-url", help="Override S3 endpoint URL")
@click.option("--region", help="AWS region")
@click.option("--profile", help="AWS profile to use")
@click.pass_obj
def migrate(
    service: DeltaService,
    source: str,
    dest: str,
    exclude: str | None,
    include: str | None,
    quiet: bool,
    no_delta: bool,
    max_ratio: float | None,
    dry_run: bool,
    yes: bool,
    no_preserve_prefix: bool,
    endpoint_url: str | None,
    region: str | None,
    profile: str | None,
) -> None:
    """Migrate S3 bucket/prefix to DeltaGlider-compressed storage.

    This command facilitates the migration of existing S3 objects to another bucket
    with DeltaGlider compression. It supports:
    - Resume capability: Only copies files that don't exist in destination
    - Progress tracking: Shows migration progress
    - Confirmation prompt: Shows file count before starting (use --yes to skip)
    - Prefix preservation: By default, source prefix is preserved in destination

    When migrating a prefix, the source prefix name is preserved by default:
    s3://src/prefix1/ → s3://dest/ creates s3://dest/prefix1/
    s3://src/a/b/c/ → s3://dest/x/ creates s3://dest/x/c/

    Use --no-preserve-prefix to disable this behavior:
    s3://src/prefix1/ → s3://dest/ creates s3://dest/ (files at root)

    Examples:
    deltaglider migrate s3://old-bucket/ s3://new-bucket/
    deltaglider migrate s3://old-bucket/data/ s3://new-bucket/
    deltaglider migrate --no-preserve-prefix s3://src/v1/ s3://dest/
    deltaglider migrate --dry-run s3://old-bucket/ s3://new-bucket/
    deltaglider migrate --yes --quiet s3://old-bucket/ s3://new-bucket/
    """
    from .aws_compat import is_s3_path, migrate_s3_to_s3

    # Any explicit connection option means the service handed in by the CLI
    # group cannot be reused; build one that honours the overrides.
    if any((endpoint_url, region, profile)):
        service = create_service(
            log_level=os.environ.get("DG_LOG_LEVEL", "INFO"),
            endpoint_url=endpoint_url,
            region=region,
            profile=profile,
        )

    try:
        # Migration is strictly S3-to-S3; reject local paths up front.
        if not (is_s3_path(source) and is_s3_path(dest)):
            click.echo("Error: Both source and destination must be S3 paths", err=True)
            sys.exit(1)

        migration_options = {
            "exclude": exclude,
            "include": include,
            "quiet": quiet,
            "no_delta": no_delta,
            "max_ratio": max_ratio,
            "dry_run": dry_run,
            "skip_confirm": yes,
            "preserve_prefix": not no_preserve_prefix,
            # True only when the user explicitly specified --region.
            "region_override": region is not None,
        }
        migrate_s3_to_s3(service, source, dest, **migration_options)

    except Exception as e:
        # sys.exit above raises SystemExit, which is not caught here.
        click.echo(f"Migration failed: {e}", err=True)
        sys.exit(1)
|
||||
|
||||
|
||||
@cli.command(short_help="Get bucket statistics and compression metrics")
|
||||
@click.argument("bucket")
|
||||
@click.option("--detailed", is_flag=True, help="Fetch detailed compression metrics (slower)")
|
||||
@click.option("--sampled", is_flag=True, help="Balanced mode: one sample per deltaspace (~5-15s)")
|
||||
@click.option(
|
||||
"--detailed", is_flag=True, help="Most accurate: HEAD for all deltas (slowest, ~1min+)"
|
||||
)
|
||||
@click.option("--refresh", is_flag=True, help="Force cache refresh even if valid")
|
||||
@click.option("--no-cache", is_flag=True, help="Skip caching entirely (both read and write)")
|
||||
@click.option("--json", "output_json", is_flag=True, help="Output in JSON format")
|
||||
@click.pass_obj
|
||||
def stats(service: DeltaService, bucket: str, detailed: bool, output_json: bool) -> None:
|
||||
"""Get bucket statistics and compression metrics.
|
||||
def stats(
|
||||
service: DeltaService,
|
||||
bucket: str,
|
||||
sampled: bool,
|
||||
detailed: bool,
|
||||
refresh: bool,
|
||||
no_cache: bool,
|
||||
output_json: bool,
|
||||
) -> None:
|
||||
"""Get bucket statistics and compression metrics with intelligent S3-based caching.
|
||||
|
||||
BUCKET can be specified as:
|
||||
- s3://bucket-name/
|
||||
- s3://bucket-name
|
||||
- bucket-name
|
||||
|
||||
Modes (mutually exclusive):
|
||||
- quick (default): Fast listing-only stats (~0.5s), approximate compression metrics
|
||||
- --sampled: Balanced mode - one HEAD per deltaspace (~5-15s for typical buckets)
|
||||
- --detailed: Most accurate - HEAD for every delta file (slowest, ~1min+ for large buckets)
|
||||
|
||||
Caching (NEW - massive performance improvement!):
|
||||
Stats are cached in S3 at .deltaglider/stats_{mode}.json (one per mode).
|
||||
Cache is automatically validated on every call using object count + size.
|
||||
If bucket changed, stats are recomputed automatically.
|
||||
|
||||
Performance with cache:
|
||||
- Cache hit: ~0.1s (200x faster than recomputation!)
|
||||
- Cache miss: Full computation time (creates cache for next time)
|
||||
- Cache invalid: Auto-recomputes when bucket changes
|
||||
|
||||
Options:
|
||||
--refresh: Force cache refresh even if valid (use when you need fresh data now)
|
||||
--no-cache: Skip caching entirely - always recompute (useful for testing/debugging)
|
||||
--json: Output in JSON format for automation/scripting
|
||||
|
||||
Examples:
|
||||
deltaglider stats mybucket # Fast (~0.1s with cache, ~0.5s without)
|
||||
deltaglider stats mybucket --sampled # Balanced accuracy/speed (~5-15s first run)
|
||||
deltaglider stats mybucket --detailed # Most accurate (~1-10min first run, ~0.1s cached)
|
||||
deltaglider stats mybucket --refresh # Force recomputation even if cached
|
||||
deltaglider stats mybucket --no-cache # Always compute fresh (skip cache)
|
||||
deltaglider stats mybucket --json # JSON output for scripts
|
||||
deltaglider stats s3://mybucket/ # Also accepts s3:// URLs
|
||||
|
||||
Timing Logs:
|
||||
Set DG_LOG_LEVEL=INFO to see detailed phase timing with timestamps:
|
||||
[HH:MM:SS.mmm] Phase 1: LIST completed in 0.52s - Found 1523 objects
|
||||
[HH:MM:SS.mmm] Phase 2: Cache HIT in 0.06s - Using cached stats
|
||||
[HH:MM:SS.mmm] COMPLETE: Total time 0.58s
|
||||
|
||||
See docs/STATS_CACHING.md for complete documentation.
|
||||
"""
|
||||
from ...client import DeltaGliderClient
|
||||
from ...client_operations.stats import StatsMode
|
||||
|
||||
try:
|
||||
# Parse bucket from S3 URL if needed
|
||||
if bucket.startswith("s3://"):
|
||||
# Remove s3:// prefix and any trailing slashes
|
||||
bucket = bucket[5:].rstrip("/")
|
||||
# Extract just the bucket name (first path component)
|
||||
bucket = bucket.split("/")[0] if "/" in bucket else bucket
|
||||
if is_s3_path(bucket):
|
||||
bucket, _prefix = parse_s3_url(bucket)
|
||||
|
||||
if not bucket:
|
||||
click.echo("Error: Invalid bucket name", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
if sampled and detailed:
|
||||
click.echo("Error: --sampled and --detailed cannot be used together", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
if refresh and no_cache:
|
||||
click.echo("Error: --refresh and --no-cache cannot be used together", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
mode: StatsMode = "quick"
|
||||
if sampled:
|
||||
mode = "sampled"
|
||||
if detailed:
|
||||
mode = "detailed"
|
||||
|
||||
# Create client from service
|
||||
client = DeltaGliderClient(service=service)
|
||||
|
||||
# Get bucket stats
|
||||
bucket_stats = client.get_bucket_stats(bucket, detailed_stats=detailed)
|
||||
# Get bucket stats with caching control
|
||||
use_cache = not no_cache
|
||||
bucket_stats = client.get_bucket_stats(
|
||||
bucket, mode=mode, use_cache=use_cache, refresh_cache=refresh
|
||||
)
|
||||
|
||||
if output_json:
|
||||
# JSON output
|
||||
|
||||
@@ -5,9 +5,27 @@ from pathlib import Path
|
||||
import click
|
||||
|
||||
from ...core import DeltaService
|
||||
from ...core.object_listing import list_all_objects, object_dict_to_head
|
||||
from ...ports import ObjectHead
|
||||
|
||||
|
||||
def fetch_s3_object_heads(service: DeltaService, bucket: str, prefix: str) -> list[ObjectHead]:
    """Collect the object heads found under ``prefix`` in ``bucket``.

    The listing is requested through ``list_all_objects`` with ``max_keys=1000``
    and the service's ``logger`` attribute when it has one. Each returned entry
    is converted with ``object_dict_to_head``. If that call raises
    ``RuntimeError`` or ``NotImplementedError``, the storage adapter's own
    ``list`` iterator is drained instead, addressed as ``"bucket/prefix"`` (or
    just the bucket when ``prefix`` is empty).
    """
    try:
        listing = list_all_objects(
            service.storage,
            bucket=bucket,
            prefix=prefix,
            max_keys=1000,
            logger=getattr(service, "logger", None),
        )
    except (RuntimeError, NotImplementedError):
        # Fall back to the adapter's plain iterator.
        fallback_target = bucket if not prefix else f"{bucket}/{prefix}"
        return list(service.storage.list(fallback_target))
    else:
        return list(map(object_dict_to_head, listing.objects))
|
||||
|
||||
|
||||
def get_local_files(
|
||||
local_dir: Path, exclude: str | None = None, include: str | None = None
|
||||
) -> dict[str, tuple[Path, int]]:
|
||||
@@ -42,8 +60,7 @@ def get_s3_files(
|
||||
import fnmatch
|
||||
|
||||
files = {}
|
||||
list_prefix = f"{bucket}/{prefix}" if prefix else bucket
|
||||
objects = service.storage.list(list_prefix)
|
||||
objects = fetch_s3_object_heads(service, bucket, prefix)
|
||||
|
||||
for obj in objects:
|
||||
# Skip reference.bin files (internal)
|
||||
|
||||
@@ -33,10 +33,14 @@ from .client_operations import (
|
||||
upload_batch as _upload_batch,
|
||||
upload_chunked as _upload_chunked,
|
||||
)
|
||||
|
||||
# fmt: on
|
||||
from .client_operations.stats import StatsMode
|
||||
|
||||
from .core import DeltaService, DeltaSpace, ObjectKey
|
||||
from .core.errors import NotFoundError
|
||||
from .core.object_listing import ObjectListing, list_objects_page
|
||||
from .core.s3_uri import parse_s3_url
|
||||
from .response_builders import (
|
||||
build_delete_response,
|
||||
build_get_response,
|
||||
@@ -64,7 +68,7 @@ class DeltaGliderClient:
|
||||
self.endpoint_url = endpoint_url
|
||||
self._multipart_uploads: dict[str, Any] = {} # Track multipart uploads
|
||||
# Session-scoped bucket statistics cache (cleared with the client lifecycle)
|
||||
self._bucket_stats_cache: dict[str, dict[bool, BucketStats]] = {}
|
||||
self._bucket_stats_cache: dict[str, dict[str, BucketStats]] = {}
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Internal helpers
|
||||
@@ -80,35 +84,45 @@ class DeltaGliderClient:
|
||||
def _store_bucket_stats_cache(
|
||||
self,
|
||||
bucket: str,
|
||||
detailed_stats: bool,
|
||||
mode: StatsMode,
|
||||
stats: BucketStats,
|
||||
) -> None:
|
||||
"""Store bucket statistics in the session cache."""
|
||||
bucket_cache = self._bucket_stats_cache.setdefault(bucket, {})
|
||||
bucket_cache[detailed_stats] = stats
|
||||
# Detailed stats are a superset of quick stats; reuse them for quick calls.
|
||||
if detailed_stats:
|
||||
bucket_cache[False] = stats
|
||||
bucket_cache[mode] = stats
|
||||
if mode == "detailed":
|
||||
bucket_cache["sampled"] = stats
|
||||
bucket_cache["quick"] = stats
|
||||
elif mode == "sampled":
|
||||
bucket_cache.setdefault("quick", stats)
|
||||
|
||||
def _get_cached_bucket_stats(self, bucket: str, detailed_stats: bool) -> BucketStats | None:
|
||||
"""Retrieve cached stats for a bucket, preferring detailed metrics when available."""
|
||||
def _get_cached_bucket_stats(self, bucket: str, mode: StatsMode) -> BucketStats | None:
|
||||
"""Retrieve cached stats for a bucket, preferring more detailed metrics when available."""
|
||||
bucket_cache = self._bucket_stats_cache.get(bucket)
|
||||
if not bucket_cache:
|
||||
return None
|
||||
if detailed_stats:
|
||||
return bucket_cache.get(True)
|
||||
return bucket_cache.get(False) or bucket_cache.get(True)
|
||||
if mode == "detailed":
|
||||
return bucket_cache.get("detailed")
|
||||
if mode == "sampled":
|
||||
return bucket_cache.get("sampled") or bucket_cache.get("detailed")
|
||||
return (
|
||||
bucket_cache.get("quick") or bucket_cache.get("sampled") or bucket_cache.get("detailed")
|
||||
)
|
||||
|
||||
def _get_cached_bucket_stats_for_listing(self, bucket: str) -> tuple[BucketStats | None, bool]:
|
||||
def _get_cached_bucket_stats_for_listing(
|
||||
self, bucket: str
|
||||
) -> tuple[BucketStats | None, StatsMode | None]:
|
||||
"""Return best cached stats for bucket listings."""
|
||||
bucket_cache = self._bucket_stats_cache.get(bucket)
|
||||
if not bucket_cache:
|
||||
return (None, False)
|
||||
if True in bucket_cache:
|
||||
return (bucket_cache[True], True)
|
||||
if False in bucket_cache:
|
||||
return (bucket_cache[False], False)
|
||||
return (None, False)
|
||||
return (None, None)
|
||||
if "detailed" in bucket_cache:
|
||||
return (bucket_cache["detailed"], "detailed")
|
||||
if "sampled" in bucket_cache:
|
||||
return (bucket_cache["sampled"], "sampled")
|
||||
if "quick" in bucket_cache:
|
||||
return (bucket_cache["quick"], "quick")
|
||||
return (None, None)
|
||||
|
||||
# ============================================================================
|
||||
# Boto3-compatible APIs (matches S3 client interface)
|
||||
@@ -328,34 +342,32 @@ class DeltaGliderClient:
|
||||
FetchMetadata=True # Only fetches for delta files
|
||||
)
|
||||
"""
|
||||
# Use storage adapter's list_objects method
|
||||
if hasattr(self.service.storage, "list_objects"):
|
||||
result = self.service.storage.list_objects(
|
||||
start_after = StartAfter or ContinuationToken
|
||||
try:
|
||||
listing = list_objects_page(
|
||||
self.service.storage,
|
||||
bucket=Bucket,
|
||||
prefix=Prefix,
|
||||
delimiter=Delimiter,
|
||||
max_keys=MaxKeys,
|
||||
start_after=StartAfter or ContinuationToken, # Support both pagination methods
|
||||
start_after=start_after,
|
||||
)
|
||||
elif isinstance(self.service.storage, S3StorageAdapter):
|
||||
result = self.service.storage.list_objects(
|
||||
bucket=Bucket,
|
||||
prefix=Prefix,
|
||||
delimiter=Delimiter,
|
||||
max_keys=MaxKeys,
|
||||
start_after=StartAfter or ContinuationToken,
|
||||
)
|
||||
else:
|
||||
# Fallback
|
||||
result = {
|
||||
"objects": [],
|
||||
"common_prefixes": [],
|
||||
"is_truncated": False,
|
||||
}
|
||||
except NotImplementedError:
|
||||
if isinstance(self.service.storage, S3StorageAdapter):
|
||||
listing = list_objects_page(
|
||||
self.service.storage,
|
||||
bucket=Bucket,
|
||||
prefix=Prefix,
|
||||
delimiter=Delimiter,
|
||||
max_keys=MaxKeys,
|
||||
start_after=start_after,
|
||||
)
|
||||
else:
|
||||
listing = ObjectListing()
|
||||
|
||||
# Convert to boto3-compatible S3Object TypedDicts (type-safe!)
|
||||
contents: list[S3Object] = []
|
||||
for obj in result.get("objects", []):
|
||||
for obj in listing.objects:
|
||||
# Skip reference.bin files (internal files, never exposed to users)
|
||||
if obj["key"].endswith("/reference.bin") or obj["key"] == "reference.bin":
|
||||
continue
|
||||
@@ -403,14 +415,14 @@ class DeltaGliderClient:
|
||||
"Key": display_key, # Use cleaned key without .delta
|
||||
"Size": obj["size"],
|
||||
"LastModified": obj.get("last_modified", ""),
|
||||
"ETag": obj.get("etag"),
|
||||
"ETag": str(obj.get("etag", "")),
|
||||
"StorageClass": obj.get("storage_class", "STANDARD"),
|
||||
"Metadata": deltaglider_metadata,
|
||||
}
|
||||
contents.append(s3_obj)
|
||||
|
||||
# Build type-safe boto3-compatible CommonPrefix TypedDicts
|
||||
common_prefixes = result.get("common_prefixes", [])
|
||||
common_prefixes = listing.common_prefixes
|
||||
common_prefix_dicts: list[CommonPrefix] | None = (
|
||||
[CommonPrefix(Prefix=p) for p in common_prefixes] if common_prefixes else None
|
||||
)
|
||||
@@ -425,8 +437,8 @@ class DeltaGliderClient:
|
||||
max_keys=MaxKeys,
|
||||
contents=contents,
|
||||
common_prefixes=common_prefix_dicts,
|
||||
is_truncated=result.get("is_truncated", False),
|
||||
next_continuation_token=result.get("next_continuation_token"),
|
||||
is_truncated=listing.is_truncated,
|
||||
next_continuation_token=listing.next_continuation_token,
|
||||
continuation_token=ContinuationToken,
|
||||
),
|
||||
)
|
||||
@@ -736,14 +748,9 @@ class DeltaGliderClient:
|
||||
"""
|
||||
file_path = Path(file_path)
|
||||
|
||||
# Parse S3 URL
|
||||
if not s3_url.startswith("s3://"):
|
||||
raise ValueError(f"Invalid S3 URL: {s3_url}")
|
||||
|
||||
s3_path = s3_url[5:].rstrip("/")
|
||||
parts = s3_path.split("/", 1)
|
||||
bucket = parts[0]
|
||||
prefix = parts[1] if len(parts) > 1 else ""
|
||||
address = parse_s3_url(s3_url, strip_trailing_slash=True)
|
||||
bucket = address.bucket
|
||||
prefix = address.key
|
||||
|
||||
# Create delta space and upload
|
||||
delta_space = DeltaSpace(bucket=bucket, prefix=prefix)
|
||||
@@ -776,17 +783,9 @@ class DeltaGliderClient:
|
||||
"""
|
||||
output_path = Path(output_path)
|
||||
|
||||
# Parse S3 URL
|
||||
if not s3_url.startswith("s3://"):
|
||||
raise ValueError(f"Invalid S3 URL: {s3_url}")
|
||||
|
||||
s3_path = s3_url[5:]
|
||||
parts = s3_path.split("/", 1)
|
||||
if len(parts) < 2:
|
||||
raise ValueError(f"S3 URL must include key: {s3_url}")
|
||||
|
||||
bucket = parts[0]
|
||||
key = parts[1]
|
||||
address = parse_s3_url(s3_url, allow_empty_key=False)
|
||||
bucket = address.bucket
|
||||
key = address.key
|
||||
|
||||
# Auto-append .delta if the file doesn't exist without it
|
||||
# This allows users to specify the original name and we'll find the delta
|
||||
@@ -812,17 +811,9 @@ class DeltaGliderClient:
|
||||
Returns:
|
||||
True if verification passed, False otherwise
|
||||
"""
|
||||
# Parse S3 URL
|
||||
if not s3_url.startswith("s3://"):
|
||||
raise ValueError(f"Invalid S3 URL: {s3_url}")
|
||||
|
||||
s3_path = s3_url[5:]
|
||||
parts = s3_path.split("/", 1)
|
||||
if len(parts) < 2:
|
||||
raise ValueError(f"S3 URL must include key: {s3_url}")
|
||||
|
||||
bucket = parts[0]
|
||||
key = parts[1]
|
||||
address = parse_s3_url(s3_url, allow_empty_key=False)
|
||||
bucket = address.bucket
|
||||
key = address.key
|
||||
|
||||
obj_key = ObjectKey(bucket=bucket, key=key)
|
||||
result = self.service.verify(obj_key)
|
||||
@@ -965,39 +956,62 @@ class DeltaGliderClient:
|
||||
result: ObjectInfo = _get_object_info(self, s3_url)
|
||||
return result
|
||||
|
||||
def get_bucket_stats(self, bucket: str, detailed_stats: bool = False) -> BucketStats:
|
||||
"""Get statistics for a bucket with optional detailed compression metrics.
|
||||
def get_bucket_stats(
|
||||
self,
|
||||
bucket: str,
|
||||
mode: StatsMode = "quick",
|
||||
use_cache: bool = True,
|
||||
refresh_cache: bool = False,
|
||||
) -> BucketStats:
|
||||
"""Get statistics for a bucket with selectable accuracy modes and S3-based caching.
|
||||
|
||||
This method provides two modes:
|
||||
- Quick stats (default): Fast overview using LIST only (~50ms)
|
||||
- Detailed stats: Accurate compression metrics with HEAD requests (slower)
|
||||
Modes:
|
||||
- ``quick``: Fast listing-only stats (delta compression approximated).
|
||||
- ``sampled``: Fetch one delta HEAD per delta-space and reuse the ratio.
|
||||
- ``detailed``: Fetch metadata for every delta object (slowest, most accurate).
|
||||
|
||||
Caching:
|
||||
- Stats are cached in S3 at ``.deltaglider/stats_{mode}.json``
|
||||
- Cache is automatically validated on every call (uses LIST operation)
|
||||
- If bucket changed, cache is recomputed automatically
|
||||
- Use ``refresh_cache=True`` to force recomputation
|
||||
- Use ``use_cache=False`` to skip caching entirely
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
detailed_stats: If True, fetch accurate compression ratios for delta files (default: False)
|
||||
mode: Stats mode ("quick", "sampled", or "detailed")
|
||||
use_cache: If True, use S3-cached stats when available (default: True)
|
||||
refresh_cache: If True, force cache recomputation even if valid (default: False)
|
||||
|
||||
Returns:
|
||||
BucketStats with compression and space savings info
|
||||
|
||||
Performance:
|
||||
- With detailed_stats=False: ~50ms for any bucket size (1 LIST call per 1000 objects)
|
||||
- With detailed_stats=True: ~2-3s per 1000 objects (adds HEAD calls for delta files only)
|
||||
- With cache hit: ~50-100ms (LIST + cache read + validation)
|
||||
- quick (no cache): ~50ms per 1000 objects (LIST only)
|
||||
- sampled (no cache): ~60 HEAD calls per 60 delta-spaces plus LIST
|
||||
- detailed (no cache): ~2-3s per 1000 delta objects (LIST + HEAD per delta)
|
||||
|
||||
Example:
|
||||
# Quick stats for dashboard display
|
||||
# Quick stats with caching (fast, ~100ms)
|
||||
stats = client.get_bucket_stats('releases')
|
||||
print(f"Objects: {stats.object_count}, Size: {stats.total_size}")
|
||||
|
||||
# Detailed stats for analytics (slower but accurate)
|
||||
stats = client.get_bucket_stats('releases', detailed_stats=True)
|
||||
print(f"Compression ratio: {stats.average_compression_ratio:.1%}")
|
||||
# Force refresh (slow, recomputes everything)
|
||||
stats = client.get_bucket_stats('releases', refresh_cache=True)
|
||||
|
||||
# Skip cache entirely
|
||||
stats = client.get_bucket_stats('releases', use_cache=False)
|
||||
|
||||
# Detailed stats with caching
|
||||
stats = client.get_bucket_stats('releases', mode='detailed')
|
||||
"""
|
||||
cached = self._get_cached_bucket_stats(bucket, detailed_stats)
|
||||
if cached:
|
||||
return cached
|
||||
if mode not in {"quick", "sampled", "detailed"}:
|
||||
raise ValueError(f"Unknown stats mode: {mode}")
|
||||
|
||||
result: BucketStats = _get_bucket_stats(self, bucket, detailed_stats)
|
||||
self._store_bucket_stats_cache(bucket, detailed_stats, result)
|
||||
# Use S3-based caching from stats.py (replaces old in-memory cache)
|
||||
result: BucketStats = _get_bucket_stats(
|
||||
self, bucket, mode=mode, use_cache=use_cache, refresh_cache=refresh_cache
|
||||
)
|
||||
return result
|
||||
|
||||
def generate_presigned_url(
|
||||
|
||||
@@ -145,11 +145,12 @@ def list_buckets(
|
||||
bucket_data = dict(bucket_entry)
|
||||
name = bucket_data.get("Name")
|
||||
if isinstance(name, str) and name:
|
||||
cached_stats, detailed = client._get_cached_bucket_stats_for_listing(name)
|
||||
if cached_stats is not None:
|
||||
cached_stats, cached_mode = client._get_cached_bucket_stats_for_listing(name)
|
||||
if cached_stats is not None and cached_mode is not None:
|
||||
bucket_data["DeltaGliderStats"] = {
|
||||
"Cached": True,
|
||||
"Detailed": detailed,
|
||||
"Mode": cached_mode,
|
||||
"Detailed": cached_mode == "detailed",
|
||||
"ObjectCount": cached_stats.object_count,
|
||||
"TotalSize": cached_stats.total_size,
|
||||
"CompressedSize": cached_stats.compressed_size,
|
||||
|
||||
@@ -7,11 +7,473 @@ This module contains DeltaGlider-specific statistics operations:
|
||||
- find_similar_files
|
||||
"""
|
||||
|
||||
import concurrent.futures
|
||||
import json
|
||||
import re
|
||||
from dataclasses import asdict
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import Any, Literal
|
||||
|
||||
from ..client_models import BucketStats, CompressionEstimate, ObjectInfo
|
||||
from ..core.delta_extensions import is_delta_candidate
|
||||
from ..core.object_listing import list_all_objects
|
||||
from ..core.s3_uri import parse_s3_url
|
||||
|
||||
StatsMode = Literal["quick", "sampled", "detailed"]
|
||||
|
||||
# Cache configuration
|
||||
CACHE_VERSION = "1.0"
|
||||
CACHE_PREFIX = ".deltaglider"
|
||||
|
||||
# ============================================================================
|
||||
# Internal Helper Functions
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _fetch_delta_metadata(
|
||||
client: Any,
|
||||
bucket: str,
|
||||
delta_keys: list[str],
|
||||
max_timeout: int = 600,
|
||||
) -> dict[str, dict[str, Any]]:
|
||||
"""Fetch metadata for delta files in parallel with timeout.
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
bucket: S3 bucket name
|
||||
delta_keys: List of delta file keys
|
||||
max_timeout: Maximum total timeout in seconds (default: 600 = 10 min)
|
||||
|
||||
Returns:
|
||||
Dict mapping delta key -> metadata dict
|
||||
"""
|
||||
metadata_map: dict[str, dict[str, Any]] = {}
|
||||
|
||||
if not delta_keys:
|
||||
return metadata_map
|
||||
|
||||
client.service.logger.info(
|
||||
f"Fetching metadata for {len(delta_keys)} delta files in parallel..."
|
||||
)
|
||||
|
||||
def fetch_single_metadata(key: str) -> tuple[str, dict[str, Any] | None]:
|
||||
try:
|
||||
obj_head = client.service.storage.head(f"{bucket}/{key}")
|
||||
if obj_head and obj_head.metadata:
|
||||
return key, obj_head.metadata
|
||||
except Exception as e:
|
||||
client.service.logger.debug(f"Failed to fetch metadata for {key}: {e}")
|
||||
return key, None
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=min(10, len(delta_keys))) as executor:
|
||||
futures = [executor.submit(fetch_single_metadata, key) for key in delta_keys]
|
||||
|
||||
# Calculate timeout: 60s per file, capped at max_timeout
|
||||
timeout_per_file = 60
|
||||
total_timeout = min(len(delta_keys) * timeout_per_file, max_timeout)
|
||||
|
||||
try:
|
||||
for future in concurrent.futures.as_completed(futures, timeout=total_timeout):
|
||||
try:
|
||||
key, metadata = future.result(timeout=5) # 5s per result
|
||||
if metadata:
|
||||
metadata_map[key] = metadata
|
||||
except concurrent.futures.TimeoutError:
|
||||
client.service.logger.warning("Timeout fetching metadata for a delta file")
|
||||
continue
|
||||
except concurrent.futures.TimeoutError:
|
||||
client.service.logger.warning(
|
||||
f"_fetch_delta_metadata: Timeout after {total_timeout}s. "
|
||||
f"Fetched {len(metadata_map)}/{len(delta_keys)} metadata entries. "
|
||||
f"Continuing with partial metadata..."
|
||||
)
|
||||
# Cancel remaining futures
|
||||
for future in futures:
|
||||
future.cancel()
|
||||
|
||||
return metadata_map
|
||||
|
||||
|
||||
def _extract_deltaspace(key: str) -> str:
|
||||
"""Return the delta space (prefix) for a given object key."""
|
||||
if "/" in key:
|
||||
return key.rsplit("/", 1)[0]
|
||||
return ""
|
||||
|
||||
|
||||
def _get_cache_key(mode: StatsMode) -> str:
    """Build the in-bucket key of the stats cache file for ``mode``.

    Args:
        mode: Stats mode (quick, sampled, or detailed)

    Returns:
        ``CACHE_PREFIX`` joined with ``stats_<mode>.json``, e.g.
        ".deltaglider/stats_quick.json"
    """
    cache_key = f"{CACHE_PREFIX}/stats_{mode}.json"
    return cache_key
|
||||
|
||||
|
||||
def _read_stats_cache(
    client: Any,
    bucket: str,
    mode: StatsMode,
) -> tuple[BucketStats | None, dict[str, Any] | None]:
    """Load the cached statistics stored in the bucket for ``mode``, if usable.

    The cache object is fetched through ``client.service.storage.get``, parsed
    as JSON and rejected when its ``version`` or ``mode`` fields do not match,
    or when either the ``stats`` or ``validation`` section is missing/empty.
    Every failure is logged and reported as a cache miss; nothing is raised
    from the read/parse path.

    Args:
        client: DeltaGliderClient instance
        bucket: S3 bucket name
        mode: Stats mode to read cache for

    Returns:
        ``(stats, validation_data)`` on a hit, ``(None, None)`` on any miss.
    """
    miss = (None, None)
    cache_key = _get_cache_key(mode)

    try:
        cached_obj = client.service.storage.get(f"{bucket}/{cache_key}")
        if not cached_obj or not cached_obj.data:
            return miss

        cache_data = json.loads(cached_obj.data.decode("utf-8"))

        # Reject caches written with another layout version.
        if cache_data.get("version") != CACHE_VERSION:
            client.service.logger.warning(
                f"Cache version mismatch: expected {CACHE_VERSION}, got {cache_data.get('version')}"
            )
            return miss

        # Reject caches that were computed for a different stats mode.
        if cache_data.get("mode") != mode:
            client.service.logger.warning(
                f"Cache mode mismatch: expected {mode}, got {cache_data.get('mode')}"
            )
            return miss

        raw_stats = cache_data.get("stats")
        validation = cache_data.get("validation")
        if not raw_stats or not validation:
            client.service.logger.warning("Cache missing stats or validation data")
            return miss

        restored = BucketStats(**raw_stats)

        client.service.logger.debug(
            f"Successfully read cache for {bucket} (mode={mode}, "
            f"computed_at={cache_data.get('computed_at')})"
        )
        return restored, validation

    except FileNotFoundError:
        # A missing cache file is the normal first-run situation.
        client.service.logger.debug(f"No cache found for {bucket} (mode={mode})")
        return miss
    except json.JSONDecodeError as e:
        client.service.logger.warning(f"Invalid JSON in cache file: {e}")
        return miss
    except Exception as e:
        client.service.logger.warning(f"Error reading cache: {e}")
        return miss
|
||||
|
||||
|
||||
def _write_stats_cache(
    client: Any,
    bucket: str,
    mode: StatsMode,
    stats: BucketStats,
    object_count: int,
    compressed_size: int,
) -> None:
    """Persist computed stats as a JSON cache object inside the bucket.

    The payload records the cache version, the mode, a UTC timestamp, the
    validation pair (object count and compressed size) and the stats converted
    with ``dataclasses.asdict``.  Writing is best effort: any exception is
    logged as a warning and swallowed.

    Args:
        client: DeltaGliderClient instance
        bucket: S3 bucket name
        mode: Stats mode being cached
        stats: Computed BucketStats to cache
        object_count: Current object count (for validation)
        compressed_size: Current compressed size (for validation)
    """
    cache_key = _get_cache_key(mode)

    try:
        cache_json = json.dumps(
            {
                "version": CACHE_VERSION,
                "mode": mode,
                "computed_at": datetime.now(UTC).isoformat(),
                "validation": {
                    "object_count": object_count,
                    "compressed_size": compressed_size,
                },
                "stats": asdict(stats),
            },
            indent=2,
        )

        client.service.storage.put(
            address=f"{bucket}/{cache_key}",
            data=cache_json.encode("utf-8"),
            metadata={
                "content-type": "application/json",
                "x-deltaglider-cache": "true",
            },
        )

        client.service.logger.info(
            f"Wrote cache for {bucket} (mode={mode}, {len(cache_json)} bytes)"
        )
    except Exception as e:
        # Caching is optional, so a failed write must never fail the caller.
        client.service.logger.warning(f"Failed to write cache (non-fatal): {e}")
|
||||
|
||||
|
||||
def _is_cache_valid(
|
||||
cached_validation: dict[str, Any],
|
||||
current_object_count: int,
|
||||
current_compressed_size: int,
|
||||
) -> bool:
|
||||
"""Check if cached stats are still valid based on bucket state.
|
||||
|
||||
Validation strategy: Compare object count and total compressed size.
|
||||
If either changed, the cache is stale.
|
||||
|
||||
Args:
|
||||
cached_validation: Validation data from cache
|
||||
current_object_count: Current object count from LIST
|
||||
current_compressed_size: Current compressed size from LIST
|
||||
|
||||
Returns:
|
||||
True if cache is still valid, False if stale
|
||||
"""
|
||||
cached_count = cached_validation.get("object_count")
|
||||
cached_size = cached_validation.get("compressed_size")
|
||||
|
||||
if cached_count != current_object_count:
|
||||
return False
|
||||
|
||||
if cached_size != current_compressed_size:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def _build_object_info_list(
    raw_objects: list[dict[str, Any]],
    metadata_map: dict[str, dict[str, Any]],
    logger: Any,
    sampled_space_metadata: dict[str, dict[str, Any]] | None = None,
) -> list[ObjectInfo]:
    """Turn raw listing entries into ``ObjectInfo`` records.

    Metadata for an object is looked up by key in ``metadata_map``; when the
    key is absent, the entry for the object's delta space in
    ``sampled_space_metadata`` (if given) is used instead.  For ``.delta``
    objects with metadata, ``compression_ratio`` and ``file_size`` are parsed
    from it; unparsable or missing values fall back to ``0.0`` and to the
    listed size respectively.

    Args:
        raw_objects: List of raw object dicts from S3 LIST (``key`` and
            ``size`` are required entries)
        metadata_map: Dict of key -> metadata for delta files
        logger: Logger instance
        sampled_space_metadata: Optional dict of delta space -> metadata used
            as a fallback for keys missing from ``metadata_map``

    Returns:
        List of ObjectInfo objects, in the order of ``raw_objects``
    """
    infos = []

    for obj_dict in raw_objects:
        key = obj_dict["key"]
        size = obj_dict["size"]
        is_delta = key.endswith(".delta")

        # Per-key metadata wins; otherwise borrow the sample of the delta space.
        metadata = metadata_map.get(key)
        if metadata is None and sampled_space_metadata:
            metadata = sampled_space_metadata.get(_extract_deltaspace(key))
        if metadata is None:
            metadata = {}

        # Defaults describe an object without usable delta metadata.
        compression_ratio = 0.0
        original_size = size

        if is_delta and metadata:
            try:
                ratio_str = metadata.get("compression_ratio", "0.0")
                compression_ratio = 0.0 if ratio_str == "unknown" else float(ratio_str)
            except (ValueError, TypeError):
                compression_ratio = 0.0

            try:
                if "file_size" not in metadata:
                    logger.warning(
                        f"Delta {key}: metadata missing 'file_size' key. "
                        f"Available keys: {list(metadata.keys())}. "
                        f"Using compressed size={size} as fallback"
                    )
                    original_size = size
                else:
                    original_size = int(metadata["file_size"])
                    logger.debug(f"Delta {key}: using original_size={original_size} from metadata")
            except (ValueError, TypeError) as e:
                logger.warning(
                    f"Delta {key}: failed to parse file_size from metadata: {e}. "
                    f"Using compressed size={size} as fallback"
                )
                original_size = size

        reference_key = metadata.get("ref_key") if metadata else None

        infos.append(
            ObjectInfo(
                key=key,
                size=size,
                last_modified=obj_dict.get("last_modified", ""),
                etag=obj_dict.get("etag"),
                storage_class=obj_dict.get("storage_class", "STANDARD"),
                original_size=original_size,
                compressed_size=size,
                is_delta=is_delta,
                compression_ratio=compression_ratio,
                reference_key=reference_key,
            )
        )

    return infos
|
||||
|
||||
|
||||
def _calculate_bucket_statistics(
    all_objects: list[ObjectInfo],
    bucket: str,
    logger: Any,
) -> BucketStats:
    """Aggregate per-object records into bucket-level statistics.

    ``reference.bin`` objects are not counted as objects; their sizes are
    added to the compressed total only when the bucket holds at least one
    delta.  When references exist but no delta does, they are reported as
    orphaned instead.

    Args:
        all_objects: List of ObjectInfo objects
        bucket: Bucket name for stats
        logger: Logger instance

    Returns:
        BucketStats object
    """
    total_original_size = 0
    total_compressed_size = 0
    delta_count = 0
    direct_count = 0
    reference_files: dict[str, int] = {}  # deltaspace -> size

    for obj in all_objects:
        if obj.key == "reference.bin" or obj.key.endswith("/reference.bin"):
            # Internal reference file: remember its size, settle it after the loop.
            if "/" in obj.key:
                deltaspace = obj.key.rsplit("/reference.bin", 1)[0]
            else:
                deltaspace = ""
            reference_files[deltaspace] = obj.size
            continue

        if not obj.is_delta:
            # Direct files: original = compressed
            direct_count += 1
            total_original_size += obj.size
            total_compressed_size += obj.size
            continue

        delta_count += 1
        if obj.original_size and obj.original_size != obj.size:
            logger.debug(f"Delta {obj.key}: using original_size={obj.original_size}")
            total_original_size += obj.original_size
        else:
            # Reached when metadata is missing or incomplete, e.g. a delta
            # uploaded by an older DeltaGlider or an incomplete upload.
            logger.warning(
                f"Delta {obj.key}: no original_size metadata "
                f"(original_size={obj.original_size}, size={obj.size}). "
                f"Using compressed size as fallback. "
                f"This may undercount space savings."
            )
            total_original_size += obj.size
        total_compressed_size += obj.size

    total_reference_size = sum(reference_files.values())

    if total_reference_size > 0:
        if delta_count > 0:
            total_compressed_size += total_reference_size
            logger.info(
                f"Including {len(reference_files)} reference.bin file(s) "
                f"({total_reference_size:,} bytes) in compressed size"
            )
        elif delta_count == 0:
            _log_orphaned_references(bucket, reference_files, total_reference_size, logger)

    space_saved = total_original_size - total_compressed_size
    if total_original_size > 0:
        avg_ratio = space_saved / total_original_size
    else:
        avg_ratio = 0.0

    return BucketStats(
        bucket=bucket,
        object_count=delta_count + direct_count,
        total_size=total_original_size,
        compressed_size=total_compressed_size,
        space_saved=space_saved,
        average_compression_ratio=avg_ratio,
        delta_objects=delta_count,
        direct_objects=direct_count,
    )
|
||||
|
||||
|
||||
def _log_orphaned_references(
|
||||
bucket: str,
|
||||
reference_files: dict[str, int],
|
||||
total_reference_size: int,
|
||||
logger: Any,
|
||||
) -> None:
|
||||
"""Log warning about orphaned reference.bin files.
|
||||
|
||||
Args:
|
||||
bucket: Bucket name
|
||||
reference_files: Dict of deltaspace -> size
|
||||
total_reference_size: Total size of all reference files
|
||||
logger: Logger instance
|
||||
"""
|
||||
waste_mb = total_reference_size / 1024 / 1024
|
||||
logger.warning(
|
||||
f"\n{'=' * 60}\n"
|
||||
f"WARNING: ORPHANED REFERENCE FILE(S) DETECTED!\n"
|
||||
f"{'=' * 60}\n"
|
||||
f"Found {len(reference_files)} reference.bin file(s) totaling "
|
||||
f"{total_reference_size:,} bytes ({waste_mb:.2f} MB)\n"
|
||||
f"but NO delta files are using them.\n"
|
||||
f"\n"
|
||||
f"This wastes {waste_mb:.2f} MB of storage!\n"
|
||||
f"\n"
|
||||
f"Orphaned reference files:\n"
|
||||
)
|
||||
|
||||
for deltaspace, size in reference_files.items():
|
||||
path = f"{deltaspace}/reference.bin" if deltaspace else "reference.bin"
|
||||
logger.warning(f" - s3://{bucket}/{path} ({size:,} bytes)")
|
||||
|
||||
logger.warning("\nConsider removing these orphaned files:\n")
|
||||
for deltaspace in reference_files:
|
||||
path = f"{deltaspace}/reference.bin" if deltaspace else "reference.bin"
|
||||
logger.warning(f" aws s3 rm s3://{bucket}/{path}")
|
||||
|
||||
logger.warning(f"{'=' * 60}")
|
||||
|
||||
|
||||
def get_object_info(
|
||||
@@ -27,14 +489,9 @@ def get_object_info(
|
||||
Returns:
|
||||
ObjectInfo with detailed metadata
|
||||
"""
|
||||
# Parse URL
|
||||
if not s3_url.startswith("s3://"):
|
||||
raise ValueError(f"Invalid S3 URL: {s3_url}")
|
||||
|
||||
s3_path = s3_url[5:]
|
||||
parts = s3_path.split("/", 1)
|
||||
bucket = parts[0]
|
||||
key = parts[1] if len(parts) > 1 else ""
|
||||
address = parse_s3_url(s3_url, allow_empty_key=False)
|
||||
bucket = address.bucket
|
||||
key = address.key
|
||||
|
||||
# Get object metadata
|
||||
obj_head = client.service.storage.head(f"{bucket}/{key}")
|
||||
@@ -60,222 +517,281 @@ def get_object_info(
|
||||
def get_bucket_stats(
|
||||
client: Any, # DeltaGliderClient
|
||||
bucket: str,
|
||||
detailed_stats: bool = False,
|
||||
mode: StatsMode = "quick",
|
||||
use_cache: bool = True,
|
||||
refresh_cache: bool = False,
|
||||
) -> BucketStats:
|
||||
"""Get statistics for a bucket with optional detailed compression metrics.
|
||||
"""Get statistics for a bucket with configurable metadata strategies and caching.
|
||||
|
||||
This method provides two modes:
|
||||
- Quick stats (default): Fast overview using LIST only (~50ms)
|
||||
- Detailed stats: Accurate compression metrics with HEAD requests (slower)
|
||||
Modes:
|
||||
- ``quick`` (default): Stream LIST results only. Compression metrics for delta files are
|
||||
approximate (falls back to delta size when metadata is unavailable).
|
||||
- ``sampled``: Fetch HEAD metadata for a single delta per delta-space and reuse the ratios for
|
||||
other deltas in the same space. Balances accuracy and speed.
|
||||
- ``detailed``: Fetch HEAD metadata for every delta object for the most accurate statistics.
|
||||
|
||||
Caching:
|
||||
- Stats are cached per mode in ``.deltaglider/stats_{mode}.json``
|
||||
- Cache is validated using object count and compressed size from LIST
|
||||
- If bucket changed, cache is recomputed automatically
|
||||
- Use ``refresh_cache=True`` to force recomputation
|
||||
- Use ``use_cache=False`` to skip caching entirely
|
||||
|
||||
**Robustness**: This function is designed to always return valid stats:
|
||||
- Returns partial stats if timeouts or pagination issues occur
|
||||
- Returns empty stats (zeros) if bucket listing completely fails
|
||||
- Never hangs indefinitely (max 10 min timeout, 10M object limit)
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
bucket: S3 bucket name
|
||||
detailed_stats: If True, fetch accurate compression ratios for delta files (default: False)
|
||||
mode: Stats mode ("quick", "sampled", or "detailed")
|
||||
use_cache: If True, use cached stats when available (default: True)
|
||||
refresh_cache: If True, force cache recomputation even if valid (default: False)
|
||||
|
||||
Returns:
|
||||
BucketStats with compression and space savings info
|
||||
BucketStats with compression and space savings info. Always returns a valid BucketStats
|
||||
object, even if errors occur (will return empty/partial stats with warnings logged).
|
||||
|
||||
Raises:
|
||||
RuntimeError: Only if bucket listing fails immediately with no objects collected.
|
||||
All other errors result in partial/empty stats being returned.
|
||||
|
||||
Performance:
|
||||
- With detailed_stats=False: ~50ms for any bucket size (1 LIST call per 1000 objects)
|
||||
- With detailed_stats=True: ~2-3s per 1000 objects (adds HEAD calls for delta files only)
|
||||
- With cache hit: ~50-100ms (LIST + cache read + validation)
|
||||
- quick (no cache): ~50ms for any bucket size (LIST calls only)
|
||||
- sampled (no cache): LIST + one HEAD per delta-space
|
||||
- detailed (no cache): LIST + HEAD for every delta (slowest but accurate)
|
||||
- Max timeout: 10 minutes (prevents indefinite hangs)
|
||||
- Max objects: 10M (prevents infinite loops)
|
||||
|
||||
Example:
|
||||
# Quick stats for dashboard display
|
||||
# Use cached stats (fast, ~100ms)
|
||||
stats = client.get_bucket_stats('releases')
|
||||
print(f"Objects: {stats.object_count}, Size: {stats.total_size}")
|
||||
|
||||
# Detailed stats for analytics (slower but accurate)
|
||||
stats = client.get_bucket_stats('releases', detailed_stats=True)
|
||||
print(f"Compression ratio: {stats.average_compression_ratio:.1%}")
|
||||
# Force refresh (slow, recomputes everything)
|
||||
stats = client.get_bucket_stats('releases', refresh_cache=True)
|
||||
|
||||
# Skip cache entirely
|
||||
stats = client.get_bucket_stats('releases', use_cache=False)
|
||||
|
||||
# Different modes with caching
|
||||
stats_sampled = client.get_bucket_stats('releases', mode='sampled')
|
||||
stats_detailed = client.get_bucket_stats('releases', mode='detailed')
|
||||
"""
|
||||
# List all objects DIRECTLY from storage adapter to see reference.bin files
|
||||
# (client.list_objects filters them out for user-facing operations)
|
||||
all_objects = []
|
||||
start_after = None
|
||||
try:
|
||||
if mode not in {"quick", "sampled", "detailed"}:
|
||||
raise ValueError(f"Unknown stats mode: {mode}")
|
||||
|
||||
import concurrent.futures
|
||||
# Phase 1: Always do a quick LIST to get current state (needed for validation)
|
||||
import time
|
||||
|
||||
# Phase 1: Collect all objects and identify delta files
|
||||
raw_objects = []
|
||||
delta_keys = []
|
||||
phase1_start = time.time()
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 1: Starting LIST operation for bucket '{bucket}'"
|
||||
)
|
||||
|
||||
while True:
|
||||
# Call storage adapter directly to see ALL files including reference.bin
|
||||
response = client.service.storage.list_objects(
|
||||
listing = list_all_objects(
|
||||
client.service.storage,
|
||||
bucket=bucket,
|
||||
prefix="",
|
||||
max_keys=1000,
|
||||
start_after=start_after,
|
||||
logger=client.service.logger,
|
||||
)
|
||||
raw_objects = listing.objects
|
||||
|
||||
# Collect objects and identify delta files
|
||||
for obj_dict in response.get("objects", []):
|
||||
raw_objects.append(obj_dict)
|
||||
if obj_dict["key"].endswith(".delta"):
|
||||
delta_keys.append(obj_dict["key"])
|
||||
# Calculate validation metrics from LIST
|
||||
current_object_count = len(raw_objects)
|
||||
current_compressed_size = sum(obj["size"] for obj in raw_objects)
|
||||
|
||||
if not response.get("is_truncated"):
|
||||
break
|
||||
|
||||
start_after = response.get("next_continuation_token")
|
||||
|
||||
# Phase 2: Fetch metadata for delta files in parallel (10x faster)
|
||||
metadata_map = {}
|
||||
if delta_keys:
|
||||
phase1_duration = time.time() - phase1_start
|
||||
client.service.logger.info(
|
||||
f"Fetching metadata for {len(delta_keys)} delta files in parallel..."
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 1: LIST completed in {phase1_duration:.2f}s - "
|
||||
f"Found {current_object_count} objects, {current_compressed_size:,} bytes total"
|
||||
)
|
||||
|
||||
def fetch_metadata(key: str) -> tuple[str, dict[str, Any] | None]:
|
||||
try:
|
||||
obj_head = client.service.storage.head(f"{bucket}/{key}")
|
||||
if obj_head and obj_head.metadata:
|
||||
return key, obj_head.metadata
|
||||
except Exception as e:
|
||||
client.service.logger.debug(f"Failed to fetch metadata for {key}: {e}")
|
||||
return key, None
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(
|
||||
max_workers=min(10, len(delta_keys))
|
||||
) as executor:
|
||||
futures = [executor.submit(fetch_metadata, key) for key in delta_keys]
|
||||
for future in concurrent.futures.as_completed(futures):
|
||||
key, metadata = future.result()
|
||||
if metadata:
|
||||
metadata_map[key] = metadata
|
||||
|
||||
# Phase 3: Build ObjectInfo list with metadata
|
||||
for obj_dict in raw_objects:
|
||||
key = obj_dict["key"]
|
||||
size = obj_dict["size"]
|
||||
is_delta = key.endswith(".delta")
|
||||
|
||||
# Get metadata from our parallel fetch
|
||||
metadata = metadata_map.get(key, {})
|
||||
|
||||
# Parse compression ratio and original size
|
||||
compression_ratio = 0.0
|
||||
original_size = size
|
||||
if is_delta and metadata:
|
||||
try:
|
||||
ratio_str = metadata.get("compression_ratio", "0.0")
|
||||
compression_ratio = float(ratio_str) if ratio_str != "unknown" else 0.0
|
||||
except (ValueError, TypeError):
|
||||
compression_ratio = 0.0
|
||||
try:
|
||||
original_size = int(metadata.get("file_size", size))
|
||||
client.service.logger.debug(f"Delta {key}: using original_size={original_size}")
|
||||
except (ValueError, TypeError):
|
||||
original_size = size
|
||||
|
||||
all_objects.append(
|
||||
ObjectInfo(
|
||||
key=key,
|
||||
size=size,
|
||||
last_modified=obj_dict.get("last_modified", ""),
|
||||
etag=obj_dict.get("etag"),
|
||||
storage_class=obj_dict.get("storage_class", "STANDARD"),
|
||||
original_size=original_size,
|
||||
compressed_size=size,
|
||||
is_delta=is_delta,
|
||||
compression_ratio=compression_ratio,
|
||||
reference_key=metadata.get("ref_key") if metadata else None,
|
||||
# Phase 2: Try to use cache if enabled and not forcing refresh
|
||||
phase2_start = time.time()
|
||||
if use_cache and not refresh_cache:
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 2: Checking cache for mode '{mode}'"
|
||||
)
|
||||
)
|
||||
cached_stats, cached_validation = _read_stats_cache(client, bucket, mode)
|
||||
|
||||
# Calculate statistics - COUNT ALL FILES
|
||||
total_original_size = 0
|
||||
total_compressed_size = 0
|
||||
delta_count = 0
|
||||
direct_count = 0
|
||||
reference_files = {} # Track all reference.bin files and their deltaspaces
|
||||
|
||||
# First pass: identify what we have
|
||||
for obj in all_objects:
|
||||
if obj.key.endswith("/reference.bin") or obj.key == "reference.bin":
|
||||
# Extract deltaspace prefix
|
||||
if "/" in obj.key:
|
||||
deltaspace = obj.key.rsplit("/reference.bin", 1)[0]
|
||||
if cached_stats and cached_validation:
|
||||
# Validate cache against current bucket state
|
||||
if _is_cache_valid(
|
||||
cached_validation, current_object_count, current_compressed_size
|
||||
):
|
||||
phase2_duration = time.time() - phase2_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 2: Cache HIT in {phase2_duration:.2f}s - "
|
||||
f"Using cached stats for {bucket} (mode={mode}, bucket unchanged)"
|
||||
)
|
||||
return cached_stats
|
||||
else:
|
||||
phase2_duration = time.time() - phase2_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 2: Cache INVALID in {phase2_duration:.2f}s - "
|
||||
f"Bucket changed: count {cached_validation.get('object_count')} → {current_object_count}, "
|
||||
f"size {cached_validation.get('compressed_size')} → {current_compressed_size}"
|
||||
)
|
||||
else:
|
||||
deltaspace = "" # Root level reference.bin
|
||||
reference_files[deltaspace] = obj.size
|
||||
elif obj.is_delta:
|
||||
delta_count += 1
|
||||
else:
|
||||
direct_count += 1
|
||||
|
||||
# Second pass: calculate sizes
|
||||
for obj in all_objects:
|
||||
# Skip reference.bin in this pass (we'll handle it separately)
|
||||
if obj.key.endswith("/reference.bin") or obj.key == "reference.bin":
|
||||
continue
|
||||
|
||||
if obj.is_delta:
|
||||
# Delta file: original from metadata, compressed = delta size
|
||||
if obj.original_size and obj.original_size != obj.size:
|
||||
client.service.logger.debug(
|
||||
f"Delta {obj.key}: using original_size={obj.original_size}"
|
||||
phase2_duration = time.time() - phase2_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 2: Cache MISS in {phase2_duration:.2f}s - "
|
||||
f"No valid cache found"
|
||||
)
|
||||
total_original_size += obj.original_size
|
||||
else:
|
||||
client.service.logger.warning(
|
||||
f"Delta {obj.key}: no original_size, using compressed size={obj.size}"
|
||||
)
|
||||
total_original_size += obj.size
|
||||
total_compressed_size += obj.size
|
||||
else:
|
||||
# Direct files: original = compressed = actual size
|
||||
total_original_size += obj.size
|
||||
total_compressed_size += obj.size
|
||||
if refresh_cache:
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 2: Cache SKIPPED (refresh requested)"
|
||||
)
|
||||
elif not use_cache:
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 2: Cache DISABLED"
|
||||
)
|
||||
|
||||
# Handle reference.bin files
|
||||
total_reference_size = sum(reference_files.values())
|
||||
|
||||
if delta_count > 0 and total_reference_size > 0:
|
||||
# Add all reference.bin files to compressed size
|
||||
total_compressed_size += total_reference_size
|
||||
# Phase 3: Cache miss or invalid - compute stats from scratch
|
||||
client.service.logger.info(
|
||||
f"Including {len(reference_files)} reference.bin file(s) ({total_reference_size:,} bytes) in compressed size"
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 3: Computing stats (mode={mode})"
|
||||
)
|
||||
elif delta_count == 0 and total_reference_size > 0:
|
||||
# ORPHANED REFERENCE WARNING
|
||||
waste_mb = total_reference_size / 1024 / 1024
|
||||
client.service.logger.warning(
|
||||
f"\n{'=' * 60}\n"
|
||||
f"WARNING: ORPHANED REFERENCE FILE(S) DETECTED!\n"
|
||||
f"{'=' * 60}\n"
|
||||
f"Found {len(reference_files)} reference.bin file(s) totaling {total_reference_size:,} bytes ({waste_mb:.2f} MB)\n"
|
||||
f"but NO delta files are using them.\n"
|
||||
f"\n"
|
||||
f"This wastes {waste_mb:.2f} MB of storage!\n"
|
||||
f"\n"
|
||||
f"Orphaned reference files:\n"
|
||||
|
||||
# Phase 4: Extract delta keys for metadata fetching
|
||||
phase4_start = time.time()
|
||||
delta_keys = [obj["key"] for obj in raw_objects if obj["key"].endswith(".delta")]
|
||||
phase4_duration = time.time() - phase4_start
|
||||
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 4: Delta extraction completed in {phase4_duration:.3f}s - "
|
||||
f"Found {len(delta_keys)} delta files"
|
||||
)
|
||||
for deltaspace, size in reference_files.items():
|
||||
path = f"{deltaspace}/reference.bin" if deltaspace else "reference.bin"
|
||||
client.service.logger.warning(f" - s3://{bucket}/{path} ({size:,} bytes)")
|
||||
|
||||
client.service.logger.warning("\nConsider removing these orphaned files:\n")
|
||||
for deltaspace in reference_files:
|
||||
path = f"{deltaspace}/reference.bin" if deltaspace else "reference.bin"
|
||||
client.service.logger.warning(f" aws s3 rm s3://{bucket}/{path}")
|
||||
# Phase 5: Fetch metadata for delta files based on mode
|
||||
phase5_start = time.time()
|
||||
metadata_map: dict[str, dict[str, Any]] = {}
|
||||
sampled_space_metadata: dict[str, dict[str, Any]] | None = None
|
||||
|
||||
client.service.logger.warning(f"{'=' * 60}")
|
||||
if delta_keys:
|
||||
if mode == "detailed":
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 5: Fetching metadata for ALL {len(delta_keys)} delta files"
|
||||
)
|
||||
metadata_map = _fetch_delta_metadata(client, bucket, delta_keys)
|
||||
|
||||
space_saved = total_original_size - total_compressed_size
|
||||
avg_ratio = (space_saved / total_original_size) if total_original_size > 0 else 0.0
|
||||
elif mode == "sampled":
|
||||
# Sample one delta per deltaspace
|
||||
seen_spaces: set[str] = set()
|
||||
sampled_keys: list[str] = []
|
||||
for key in delta_keys:
|
||||
space = _extract_deltaspace(key)
|
||||
if space not in seen_spaces:
|
||||
seen_spaces.add(space)
|
||||
sampled_keys.append(key)
|
||||
|
||||
return BucketStats(
|
||||
bucket=bucket,
|
||||
object_count=delta_count + direct_count, # Only count user files, not reference.bin
|
||||
total_size=total_original_size,
|
||||
compressed_size=total_compressed_size,
|
||||
space_saved=space_saved,
|
||||
average_compression_ratio=avg_ratio,
|
||||
delta_objects=delta_count,
|
||||
direct_objects=direct_count,
|
||||
)
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 5: Sampling {len(sampled_keys)} delta files "
|
||||
f"(one per deltaspace) out of {len(delta_keys)} total delta files"
|
||||
)
|
||||
|
||||
# Log which files are being sampled
|
||||
if sampled_keys:
|
||||
for idx, key in enumerate(sampled_keys[:10], 1): # Show first 10
|
||||
space = _extract_deltaspace(key)
|
||||
client.service.logger.info(
|
||||
f" [{idx}] Sampling: {key} (deltaspace: '{space or '(root)'}')"
|
||||
)
|
||||
if len(sampled_keys) > 10:
|
||||
client.service.logger.info(f" ... and {len(sampled_keys) - 10} more")
|
||||
|
||||
if sampled_keys:
|
||||
metadata_map = _fetch_delta_metadata(client, bucket, sampled_keys)
|
||||
sampled_space_metadata = {
|
||||
_extract_deltaspace(k): metadata for k, metadata in metadata_map.items()
|
||||
}
|
||||
|
||||
phase5_duration = time.time() - phase5_start
|
||||
if mode == "quick":
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 5: Skipped metadata fetching (quick mode) in {phase5_duration:.3f}s"
|
||||
)
|
||||
else:
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 5: Metadata fetching completed in {phase5_duration:.2f}s - "
|
||||
f"Fetched {len(metadata_map)} metadata records"
|
||||
)
|
||||
|
||||
# Phase 6: Build ObjectInfo list
|
||||
phase6_start = time.time()
|
||||
all_objects = _build_object_info_list(
|
||||
raw_objects,
|
||||
metadata_map,
|
||||
client.service.logger,
|
||||
sampled_space_metadata,
|
||||
)
|
||||
phase6_duration = time.time() - phase6_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 6: ObjectInfo list built in {phase6_duration:.3f}s - "
|
||||
f"{len(all_objects)} objects processed"
|
||||
)
|
||||
|
||||
# Phase 7: Calculate final statistics
|
||||
phase7_start = time.time()
|
||||
stats = _calculate_bucket_statistics(all_objects, bucket, client.service.logger)
|
||||
phase7_duration = time.time() - phase7_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 7: Statistics calculated in {phase7_duration:.3f}s - "
|
||||
f"{stats.delta_objects} delta, {stats.direct_objects} direct objects"
|
||||
)
|
||||
|
||||
# Phase 8: Write cache if enabled
|
||||
phase8_start = time.time()
|
||||
if use_cache:
|
||||
_write_stats_cache(
|
||||
client=client,
|
||||
bucket=bucket,
|
||||
mode=mode,
|
||||
stats=stats,
|
||||
object_count=current_object_count,
|
||||
compressed_size=current_compressed_size,
|
||||
)
|
||||
phase8_duration = time.time() - phase8_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 8: Cache written in {phase8_duration:.3f}s"
|
||||
)
|
||||
else:
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 8: Cache write skipped (caching disabled)"
|
||||
)
|
||||
|
||||
# Summary
|
||||
total_duration = time.time() - phase1_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] COMPLETE: Total time {total_duration:.2f}s for bucket '{bucket}' (mode={mode})"
|
||||
)
|
||||
|
||||
return stats
|
||||
|
||||
except Exception as e:
|
||||
# Last resort: return empty stats with error indication
|
||||
client.service.logger.error(
|
||||
f"get_bucket_stats: Failed to build statistics for '{bucket}': {e}. "
|
||||
f"Returning empty stats."
|
||||
)
|
||||
return BucketStats(
|
||||
bucket=bucket,
|
||||
object_count=0,
|
||||
total_size=0,
|
||||
compressed_size=0,
|
||||
space_saved=0,
|
||||
average_compression_ratio=0.0,
|
||||
delta_objects=0,
|
||||
direct_objects=0,
|
||||
)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Public API Functions
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def estimate_compression(
|
||||
@@ -300,30 +816,8 @@ def estimate_compression(
|
||||
file_path = Path(file_path)
|
||||
file_size = file_path.stat().st_size
|
||||
|
||||
# Check file extension
|
||||
filename = file_path.name
|
||||
ext = file_path.suffix.lower()
|
||||
delta_extensions = {
|
||||
".zip",
|
||||
".tar",
|
||||
".gz",
|
||||
".tar.gz",
|
||||
".tgz",
|
||||
".bz2",
|
||||
".tar.bz2",
|
||||
".xz",
|
||||
".tar.xz",
|
||||
".7z",
|
||||
".rar",
|
||||
".dmg",
|
||||
".iso",
|
||||
".pkg",
|
||||
".deb",
|
||||
".rpm",
|
||||
".apk",
|
||||
".jar",
|
||||
".war",
|
||||
".ear",
|
||||
}
|
||||
|
||||
# Already compressed formats that won't benefit from delta
|
||||
incompressible = {".jpg", ".jpeg", ".png", ".mp4", ".mp3", ".avi", ".mov"}
|
||||
@@ -337,7 +831,7 @@ def estimate_compression(
|
||||
should_use_delta=False,
|
||||
)
|
||||
|
||||
if ext not in delta_extensions:
|
||||
if not is_delta_candidate(filename):
|
||||
# Unknown type, conservative estimate
|
||||
return CompressionEstimate(
|
||||
original_size=file_size,
|
||||
|
||||
@@ -1,5 +1,10 @@
|
||||
"""Core domain for DeltaGlider."""
|
||||
|
||||
from .delta_extensions import (
|
||||
DEFAULT_COMPOUND_DELTA_EXTENSIONS,
|
||||
DEFAULT_DELTA_EXTENSIONS,
|
||||
is_delta_candidate,
|
||||
)
|
||||
from .errors import (
|
||||
DeltaGliderError,
|
||||
DiffDecodeError,
|
||||
@@ -19,6 +24,7 @@ from .models import (
|
||||
Sha256,
|
||||
VerifyResult,
|
||||
)
|
||||
from .s3_uri import S3Url, build_s3_url, is_s3_url, parse_s3_url
|
||||
from .service import DeltaService
|
||||
|
||||
__all__ = [
|
||||
@@ -38,4 +44,11 @@ __all__ = [
|
||||
"PutSummary",
|
||||
"VerifyResult",
|
||||
"DeltaService",
|
||||
"DEFAULT_DELTA_EXTENSIONS",
|
||||
"DEFAULT_COMPOUND_DELTA_EXTENSIONS",
|
||||
"is_delta_candidate",
|
||||
"S3Url",
|
||||
"build_s3_url",
|
||||
"is_s3_url",
|
||||
"parse_s3_url",
|
||||
]
|
||||
|
||||
56
src/deltaglider/core/delta_extensions.py
Normal file
56
src/deltaglider/core/delta_extensions.py
Normal file
@@ -0,0 +1,56 @@
|
||||
"""Shared delta compression extension policy."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Collection, Iterable
|
||||
|
||||
# Multi-part archive suffixes. These are matched ahead of the single-part
# suffixes below so that names such as "release.tar.gz" are handled as a
# compound archive.
DEFAULT_COMPOUND_DELTA_EXTENSIONS: tuple[str, ...] = (
    ".tar.gz",
    ".tar.bz2",
    ".tar.xz",
)

# Single-part suffixes that benefit from delta compression. The collection is
# a frozenset so the same immutable policy can be shared across modules.
DEFAULT_DELTA_EXTENSIONS: frozenset[str] = frozenset(
    (
        # General-purpose archives and compressed streams
        ".zip",
        ".tar",
        ".gz",
        ".tgz",
        ".bz2",
        ".xz",
        ".7z",
        ".rar",
        # Disk images and installer packages
        ".dmg",
        ".iso",
        ".pkg",
        ".deb",
        ".rpm",
        ".apk",
        # Java archives
        ".jar",
        ".war",
        ".ear",
    )
)
|
||||
|
||||
|
||||
def is_delta_candidate(
    filename: str,
    *,
    simple_extensions: Collection[str] = DEFAULT_DELTA_EXTENSIONS,
    compound_extensions: Iterable[str] = DEFAULT_COMPOUND_DELTA_EXTENSIONS,
) -> bool:
    """Decide from the file name alone whether delta compression applies.

    The comparison is case-insensitive: the name is lower-cased before it is
    tested against the suffix lists (which are expected to be lower-case).

    Args:
        filename: File name (or key) to inspect.
        simple_extensions: Single-part suffixes that qualify.
        compound_extensions: Multi-part suffixes that qualify; tested first.

    Returns:
        True if the lower-cased name ends with any configured suffix.
    """
    lowered = filename.lower()

    # Multi-part suffixes are consulted first, then the single-part ones.
    if any(lowered.endswith(suffix) for suffix in compound_extensions):
        return True

    for suffix in simple_extensions:
        if lowered.endswith(suffix):
            return True
    return False
|
||||
|
||||
|
||||
__all__ = [
|
||||
"DEFAULT_COMPOUND_DELTA_EXTENSIONS",
|
||||
"DEFAULT_DELTA_EXTENSIONS",
|
||||
"is_delta_candidate",
|
||||
]
|
||||
206
src/deltaglider/core/object_listing.py
Normal file
206
src/deltaglider/core/object_listing.py
Normal file
@@ -0,0 +1,206 @@
|
||||
"""Shared helpers for listing bucket objects with pagination support."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from ..ports.storage import ObjectHead
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class ObjectListing:
|
||||
"""All objects and prefixes returned from a bucket listing."""
|
||||
|
||||
objects: list[dict[str, Any]] = field(default_factory=list)
|
||||
common_prefixes: list[str] = field(default_factory=list)
|
||||
key_count: int = 0
|
||||
is_truncated: bool = False
|
||||
next_continuation_token: str | None = None
|
||||
|
||||
|
||||
def list_objects_page(
    storage: Any,
    *,
    bucket: str,
    prefix: str = "",
    delimiter: str = "",
    max_keys: int = 1000,
    start_after: str | None = None,
    continuation_token: str | None = None,
) -> ObjectListing:
    """Issue exactly one ``list_objects`` call and wrap the response.

    Args:
        storage: Storage adapter; must expose a ``list_objects`` method.
        bucket: Bucket to list.
        prefix: Key prefix filter forwarded to the adapter.
        delimiter: Delimiter forwarded to the adapter.
        max_keys: Page size forwarded to the adapter.
        start_after: Forwarded to the adapter unchanged.
        continuation_token: Forwarded to the adapter unchanged.

    Returns:
        An ObjectListing built from the adapter's response dictionary.

    Raises:
        NotImplementedError: If the adapter has no ``list_objects`` attribute.
    """
    if not hasattr(storage, "list_objects"):
        raise NotImplementedError("Storage adapter does not support list_objects")

    raw = storage.list_objects(
        bucket=bucket,
        prefix=prefix,
        delimiter=delimiter,
        max_keys=max_keys,
        start_after=start_after,
        continuation_token=continuation_token,
    )

    # Copy the sequences so the listing does not alias the adapter's lists.
    page_objects = list(raw.get("objects", []))
    page_prefixes = list(raw.get("common_prefixes", []))
    # Fall back to counting the returned objects when no key_count is given.
    reported_count = raw.get("key_count", len(raw.get("objects", [])))
    truncated = bool(raw.get("is_truncated", False))
    next_token = raw.get("next_continuation_token")

    return ObjectListing(
        objects=page_objects,
        common_prefixes=page_prefixes,
        key_count=reported_count,
        is_truncated=truncated,
        next_continuation_token=next_token,
    )
|
||||
|
||||
|
||||
def list_all_objects(
    storage: Any,
    *,
    bucket: str,
    prefix: str = "",
    delimiter: str = "",
    max_keys: int = 1000,
    logger: Any | None = None,
    max_iterations: int = 10_000,
) -> ObjectListing:
    """Fetch all objects under the given bucket/prefix with pagination safety.

    Pages are requested through ``list_objects_page`` until the adapter
    reports that the listing is no longer truncated. The loop stops early,
    returning what was collected so far with ``is_truncated=True``, when:

    - more than ``max_iterations`` pages would be requested,
    - a page request fails after at least one object was collected, or
    - a truncated page arrives without a continuation token.

    Args:
        storage: Storage adapter passed through to ``list_objects_page``.
        bucket: Bucket to list.
        prefix: Key prefix filter.
        delimiter: Delimiter forwarded to the adapter.
        max_keys: Page size for each request.
        logger: Optional logger for progress and warning messages.
        max_iterations: Upper bound on the number of pages requested.

    Returns:
        ObjectListing with the merged objects, order-preserving de-duplicated
        common prefixes, and ``key_count`` set to the number of objects.

    Raises:
        RuntimeError: If a page request fails before any object was collected.
    """
    import time

    def _utc_stamp() -> str:
        # Millisecond-precision UTC wall-clock prefix for progress messages.
        # Uses the module-level datetime/timezone imports, matching the
        # timezone.utc convention used elsewhere in this module.
        return datetime.now(timezone.utc).strftime("%H:%M:%S.%f")[:-3]  # noqa: UP017

    aggregated = ObjectListing()
    continuation_token: str | None = None
    iteration_count = 0
    list_start_time = time.time()

    while True:
        iteration_count += 1
        if iteration_count > max_iterations:
            if logger:
                logger.warning(
                    "list_all_objects: reached max iterations (%s). Returning partial results.",
                    max_iterations,
                )
            aggregated.is_truncated = True
            aggregated.next_continuation_token = continuation_token
            break

        # Log progress every 10 pages or on first page
        if logger and (iteration_count == 1 or iteration_count % 10 == 0):
            elapsed = time.time() - list_start_time
            objects_per_sec = len(aggregated.objects) / elapsed if elapsed > 0 else 0
            token_info = f", token={continuation_token[:20]}..." if continuation_token else ""
            logger.info(
                f"[{_utc_stamp()}] LIST pagination: "
                f"page {iteration_count}, {len(aggregated.objects)} objects so far "
                f"({objects_per_sec:.0f} obj/s, {elapsed:.1f}s elapsed{token_info})"
            )

            # Warn if taking very long (>60s)
            if elapsed > 60 and iteration_count % 50 == 0:
                # NOTE: this extrapolates the average page size up to the
                # max_iterations cap, so the figure is an upper bound on what
                # can still be fetched, not a measurement of the bucket.
                estimated_total = (len(aggregated.objects) / iteration_count) * max_iterations
                logger.warning(
                    f"LIST operation is slow ({elapsed:.0f}s elapsed). "
                    f"This bucket has MANY objects ({len(aggregated.objects)} so far). "
                    f"Consider using a smaller prefix or enabling caching. "
                    f"Estimated remaining: {estimated_total - len(aggregated.objects):.0f} objects"
                )

        try:
            page = list_objects_page(
                storage,
                bucket=bucket,
                prefix=prefix,
                delimiter=delimiter,
                max_keys=max_keys,
                continuation_token=continuation_token,
            )
        except Exception as exc:
            # Nothing collected yet: there is no partial result worth
            # returning, so surface the failure to the caller.
            if not aggregated.objects:
                raise RuntimeError(f"Failed to list objects for bucket '{bucket}': {exc}") from exc
            # Deliberate best-effort: keep what was already collected.
            if logger:
                logger.warning(
                    "list_all_objects: pagination error after %s objects: %s. Returning partial results.",
                    len(aggregated.objects),
                    exc,
                )
            aggregated.is_truncated = True
            aggregated.next_continuation_token = continuation_token
            break

        aggregated.objects.extend(page.objects)
        aggregated.common_prefixes.extend(page.common_prefixes)
        aggregated.key_count += page.key_count

        if not page.is_truncated:
            aggregated.is_truncated = False
            aggregated.next_continuation_token = None
            if logger:
                elapsed = time.time() - list_start_time
                logger.info(
                    f"[{_utc_stamp()}] LIST complete: "
                    f"{iteration_count} pages, {len(aggregated.objects)} objects total in {elapsed:.2f}s"
                )
            break

        continuation_token = page.next_continuation_token
        if not continuation_token:
            # A truncated page without a token cannot be resumed.
            if logger:
                logger.warning(
                    "list_all_objects: truncated response without continuation token after %s objects.",
                    len(aggregated.objects),
                )
            aggregated.is_truncated = True
            aggregated.next_continuation_token = None
            break

    # The same common prefix can be reported by several pages; keep the first
    # occurrence of each while preserving order.
    aggregated.common_prefixes = list(dict.fromkeys(aggregated.common_prefixes))
    # Report the number of objects actually held rather than the running sum
    # of per-page counts.
    aggregated.key_count = len(aggregated.objects)
    return aggregated
|
||||
|
||||
|
||||
def _parse_last_modified(value: Any) -> datetime:
    """Coerce a listing's ``last_modified`` value into an aware datetime.

    ``datetime`` instances are used as-is. Other truthy values are converted
    to ``str`` and parsed as ISO-8601, with a trailing ``Z`` rewritten to
    ``+00:00``. Falsy values and unparseable text yield the Unix epoch in UTC.
    A result without tzinfo is tagged as UTC.
    """
    epoch = datetime.fromtimestamp(0, tz=timezone.utc)  # noqa: UP017

    if isinstance(value, datetime):
        parsed = value
    elif not value:
        parsed = epoch
    else:
        text = str(value)
        if text.endswith("Z"):
            text = f"{text[:-1]}+00:00"
        try:
            parsed = datetime.fromisoformat(text)
        except ValueError:
            parsed = epoch

    if parsed.tzinfo is not None:
        return parsed
    return parsed.replace(tzinfo=timezone.utc)  # noqa: UP017
|
||||
|
||||
|
||||
def object_dict_to_head(obj: dict[str, Any]) -> ObjectHead:
    """Build an ObjectHead from one entry of a ``list_objects`` result.

    Args:
        obj: Listing entry. ``key`` is required; ``size``, ``etag``,
            ``last_modified`` and ``metadata`` are optional.

    Returns:
        ObjectHead with ``size`` coerced to int (default 0), ``etag`` coerced
        to str (default ""), ``last_modified`` normalised by
        ``_parse_last_modified``, and ``metadata`` replaced by an empty dict
        when it is missing or not a dict.

    Raises:
        KeyError: If ``obj`` has no ``key`` entry.
    """
    raw_metadata = obj.get("metadata")
    # Anything that is not a dict (including None) is treated as "no metadata".
    metadata = raw_metadata if isinstance(raw_metadata, dict) else {}

    return ObjectHead(
        key=obj["key"],
        size=int(obj.get("size", 0)),
        etag=str(obj.get("etag", "")),
        last_modified=_parse_last_modified(obj.get("last_modified")),
        metadata=metadata,
    )
|
||||
|
||||
|
||||
__all__ = [
|
||||
"ObjectListing",
|
||||
"list_objects_page",
|
||||
"list_all_objects",
|
||||
"object_dict_to_head",
|
||||
]
|
||||
85
src/deltaglider/core/s3_uri.py
Normal file
85
src/deltaglider/core/s3_uri.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""Utilities for working with S3-style URLs and keys."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import NamedTuple
|
||||
|
||||
S3_SCHEME = "s3://"
|
||||
|
||||
|
||||
class S3Url(NamedTuple):
    """Bucket/key pair describing an S3 location.

    ``key`` is empty when the URL refers to the bucket itself.
    """

    bucket: str
    key: str = ""

    def to_url(self) -> str:
        """Render the ``s3://bucket[/key]`` string; no slash when key is empty."""
        bucket_url = f"{S3_SCHEME}{self.bucket}"
        if not self.key:
            return bucket_url
        return f"{S3_SCHEME}{self.bucket}/{self.key}"

    def with_key(self, key: str) -> S3Url:
        """Return a new S3Url for the same bucket with ``key``, minus leading slashes."""
        return S3Url(self.bucket, key.lstrip("/"))

    def join_key(self, suffix: str) -> S3Url:
        """Append ``suffix`` to the key, separated by exactly one '/'.

        Leading slashes on ``suffix`` and trailing slashes on the current key
        are dropped before joining. With an empty current key the suffix
        becomes the key; with an empty suffix this instance is returned as-is.
        """
        tail = suffix.lstrip("/")
        if self.key and not tail:
            return self
        if not self.key:
            return self.with_key(tail)
        return self.with_key(f"{self.key.rstrip('/')}/{tail}")
|
||||
|
||||
|
||||
def is_s3_url(value: str) -> bool:
    """Report whether ``value`` begins with the ``s3://`` scheme prefix.

    Only the prefix is inspected; the bucket and key parts are not validated.
    """
    has_scheme = value.startswith(S3_SCHEME)
    return has_scheme
|
||||
|
||||
|
||||
def parse_s3_url(
    url: str,
    *,
    allow_empty_key: bool = True,
    strip_trailing_slash: bool = False,
) -> S3Url:
    """Split an S3 URL into its bucket and key.

    Args:
        url: String expected to start with the S3 scheme.
        allow_empty_key: When False, a URL without a key is rejected.
        strip_trailing_slash: When True, trailing slashes are removed from
            the path before it is split.

    Returns:
        An S3Url; the key has no leading slashes and may be empty.

    Raises:
        ValueError: If the scheme is wrong, the bucket is empty, or the key
            is empty while ``allow_empty_key`` is False.
    """
    if not is_s3_url(url):
        raise ValueError(f"Invalid S3 URL: {url}")

    remainder = url[len(S3_SCHEME) :]
    if strip_trailing_slash:
        remainder = remainder.rstrip("/")

    # partition() yields an empty third element when there is no '/', so a
    # bucket-only URL naturally produces an empty key.
    bucket, _, raw_key = remainder.partition("/")
    if not bucket:
        raise ValueError(f"S3 URL missing bucket: {url}")

    key = raw_key.lstrip("/")
    if not (key or allow_empty_key):
        raise ValueError(f"S3 URL must include a key: {url}")

    return S3Url(bucket=bucket, key=key)
|
||||
|
||||
|
||||
def build_s3_url(bucket: str, key: str | None = None) -> str:
    """Build an S3 URL from a bucket and an optional key.

    Args:
        bucket: Bucket name; must be non-empty.
        key: Optional object key. Leading slashes are removed.

    Returns:
        ``<scheme><bucket>/<key>`` when a non-empty key remains after
        normalisation, otherwise ``<scheme><bucket>`` with no trailing slash.

    Raises:
        ValueError: If ``bucket`` is empty.
    """
    if not bucket:
        raise ValueError("Bucket name cannot be empty")

    # Normalise before branching: a key made only of slashes (e.g. "/") is
    # empty once stripped and must produce the bucket-root form rather than
    # a URL with a dangling "/".
    normalized_key = key.lstrip("/") if key else ""
    if normalized_key:
        return f"{S3_SCHEME}{bucket}/{normalized_key}"
    return f"{S3_SCHEME}{bucket}"
|
||||
|
||||
|
||||
__all__ = [
|
||||
"S3Url",
|
||||
"build_s3_url",
|
||||
"is_s3_url",
|
||||
"parse_s3_url",
|
||||
]
|
||||
@@ -15,6 +15,11 @@ from ..ports import (
|
||||
StoragePort,
|
||||
)
|
||||
from ..ports.storage import ObjectHead
|
||||
from .delta_extensions import (
|
||||
DEFAULT_COMPOUND_DELTA_EXTENSIONS,
|
||||
DEFAULT_DELTA_EXTENSIONS,
|
||||
is_delta_candidate,
|
||||
)
|
||||
from .errors import (
|
||||
DiffDecodeError,
|
||||
DiffEncodeError,
|
||||
@@ -58,51 +63,41 @@ class DeltaService:
|
||||
self.tool_version = tool_version
|
||||
self.max_ratio = max_ratio
|
||||
|
||||
# File extensions that should use delta compression
|
||||
self.delta_extensions = {
|
||||
".zip",
|
||||
".tar",
|
||||
".gz",
|
||||
".tar.gz",
|
||||
".tgz",
|
||||
".bz2",
|
||||
".tar.bz2",
|
||||
".xz",
|
||||
".tar.xz",
|
||||
".7z",
|
||||
".rar",
|
||||
".dmg",
|
||||
".iso",
|
||||
".pkg",
|
||||
".deb",
|
||||
".rpm",
|
||||
".apk",
|
||||
".jar",
|
||||
".war",
|
||||
".ear",
|
||||
}
|
||||
# File extensions that should use delta compression. Keep mutable copies
|
||||
# so advanced callers can customize the policy if needed.
|
||||
self.delta_extensions = set(DEFAULT_DELTA_EXTENSIONS)
|
||||
self.compound_delta_extensions = DEFAULT_COMPOUND_DELTA_EXTENSIONS
|
||||
|
||||
def should_use_delta(self, filename: str) -> bool:
|
||||
"""Check if file should use delta compression based on extension."""
|
||||
name_lower = filename.lower()
|
||||
# Check compound extensions first
|
||||
for ext in [".tar.gz", ".tar.bz2", ".tar.xz"]:
|
||||
if name_lower.endswith(ext):
|
||||
return True
|
||||
# Check simple extensions
|
||||
return any(name_lower.endswith(ext) for ext in self.delta_extensions)
|
||||
return is_delta_candidate(
|
||||
filename,
|
||||
simple_extensions=self.delta_extensions,
|
||||
compound_extensions=self.compound_delta_extensions,
|
||||
)
|
||||
|
||||
def put(
|
||||
self, local_file: Path, delta_space: DeltaSpace, max_ratio: float | None = None
|
||||
self,
|
||||
local_file: Path,
|
||||
delta_space: DeltaSpace,
|
||||
max_ratio: float | None = None,
|
||||
override_name: str | None = None,
|
||||
) -> PutSummary:
|
||||
"""Upload file as reference or delta (for archive files) or directly (for other files)."""
|
||||
"""Upload file as reference or delta (for archive files) or directly (for other files).
|
||||
|
||||
Args:
|
||||
local_file: Path to the local file to upload
|
||||
delta_space: DeltaSpace (bucket + prefix) for the upload
|
||||
max_ratio: Maximum acceptable delta/file ratio (default: service max_ratio)
|
||||
override_name: Optional name to use instead of local_file.name (useful for S3-to-S3 copies)
|
||||
"""
|
||||
if max_ratio is None:
|
||||
max_ratio = self.max_ratio
|
||||
|
||||
start_time = self.clock.now()
|
||||
file_size = local_file.stat().st_size
|
||||
file_sha256 = self.hasher.sha256(local_file)
|
||||
original_name = local_file.name
|
||||
original_name = override_name if override_name else local_file.name
|
||||
|
||||
self.logger.info(
|
||||
"Starting put operation",
|
||||
|
||||
@@ -153,13 +153,14 @@ class TestBucketManagement:
|
||||
delta_objects=6,
|
||||
direct_objects=4,
|
||||
)
|
||||
client._store_bucket_stats_cache("bucket1", detailed_stats=True, stats=cached_stats)
|
||||
client._store_bucket_stats_cache("bucket1", mode="detailed", stats=cached_stats)
|
||||
|
||||
response = client.list_buckets()
|
||||
|
||||
bucket1 = next(bucket for bucket in response["Buckets"] if bucket["Name"] == "bucket1")
|
||||
assert bucket1["DeltaGliderStats"]["Cached"] is True
|
||||
assert bucket1["DeltaGliderStats"]["Detailed"] is True
|
||||
assert bucket1["DeltaGliderStats"]["Mode"] == "detailed"
|
||||
assert bucket1["DeltaGliderStats"]["ObjectCount"] == cached_stats.object_count
|
||||
assert bucket1["DeltaGliderStats"]["TotalSize"] == cached_stats.total_size
|
||||
|
||||
@@ -254,10 +255,14 @@ class TestBucketManagement:
|
||||
|
||||
call_count = {"value": 0}
|
||||
|
||||
def fake_get_bucket_stats(_: Any, bucket: str, detailed_stats_flag: bool) -> BucketStats:
|
||||
def fake_get_bucket_stats(_: Any, bucket: str, mode: str) -> BucketStats:
|
||||
call_count["value"] += 1
|
||||
assert bucket == "bucket1"
|
||||
return detailed_stats if detailed_stats_flag else quick_stats
|
||||
if mode == "detailed":
|
||||
return detailed_stats
|
||||
if mode == "sampled":
|
||||
return detailed_stats # sampled treated as detailed for cache propagation
|
||||
return quick_stats
|
||||
|
||||
monkeypatch.setattr("deltaglider.client._get_bucket_stats", fake_get_bucket_stats)
|
||||
|
||||
@@ -271,7 +276,7 @@ class TestBucketManagement:
|
||||
assert call_count["value"] == 1
|
||||
|
||||
# Detailed call triggers new computation
|
||||
result_detailed = client.get_bucket_stats("bucket1", detailed_stats=True)
|
||||
result_detailed = client.get_bucket_stats("bucket1", mode="detailed")
|
||||
assert result_detailed is detailed_stats
|
||||
assert call_count["value"] == 2
|
||||
|
||||
|
||||
@@ -434,7 +434,7 @@ class TestDeltaGliderFeatures:
|
||||
|
||||
def test_get_bucket_stats(self, client):
|
||||
"""Test getting bucket statistics."""
|
||||
# Test quick stats (default: detailed_stats=False)
|
||||
# Test quick stats (LIST only)
|
||||
stats = client.get_bucket_stats("test-bucket")
|
||||
|
||||
assert isinstance(stats, BucketStats)
|
||||
@@ -442,8 +442,8 @@ class TestDeltaGliderFeatures:
|
||||
assert stats.total_size > 0
|
||||
assert stats.delta_objects >= 1 # We have archive.zip.delta
|
||||
|
||||
# Test with detailed_stats=True
|
||||
detailed_stats = client.get_bucket_stats("test-bucket", detailed_stats=True)
|
||||
# Test with detailed mode
|
||||
detailed_stats = client.get_bucket_stats("test-bucket", mode="detailed")
|
||||
assert isinstance(detailed_stats, BucketStats)
|
||||
assert detailed_stats.object_count == stats.object_count
|
||||
|
||||
|
||||
271
tests/integration/test_s3_migration.py
Normal file
271
tests/integration/test_s3_migration.py
Normal file
@@ -0,0 +1,271 @@
|
||||
"""Test S3-to-S3 migration functionality."""
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from deltaglider.app.cli.aws_compat import migrate_s3_to_s3
|
||||
from deltaglider.core import DeltaService
|
||||
from deltaglider.ports import ObjectHead
|
||||
|
||||
|
||||
@pytest.fixture
def mock_service():
    """Provide a MagicMock specced on DeltaService with a mock ``storage``."""
    fake_service = MagicMock(spec=DeltaService)
    # The storage attribute is assigned explicitly so tests can script its
    # ``list`` method.
    fake_service.storage = MagicMock()
    return fake_service
|
||||
|
||||
|
||||
def test_migrate_s3_to_s3_with_resume(mock_service):
    """Objects already present at the destination must not be copied again."""
    source_stamp = "2024-01-01T00:00:00Z"
    in_source = [
        ObjectHead(key=key, size=size, etag=etag, last_modified=source_stamp, metadata={})
        for key, size, etag in (
            ("file1.zip", 1024, "abc123"),
            ("file2.zip", 2048, "def456"),
            ("subdir/file3.zip", 512, "ghi789"),
        )
    ]

    # The destination already holds file1.zip, stored as a .delta object.
    in_dest = [
        ObjectHead(
            key="file1.zip.delta",
            size=100,
            last_modified="2024-01-02T00:00:00Z",
            etag="delta123",
            metadata={},
        ),
    ]

    # Checked in insertion order: source bucket first, then destination.
    listings = {"source-bucket": in_source, "dest-bucket": in_dest}

    def fake_list(prefix):
        for bucket_name, heads in listings.items():
            if bucket_name in prefix:
                return iter(heads)
        return iter([])

    mock_service.storage.list.side_effect = fake_list

    copy_patch = patch("deltaglider.app.cli.aws_compat.copy_s3_to_s3")
    confirm_patch = patch("deltaglider.app.cli.aws_compat.click.confirm", return_value=True)

    # quiet=True skips EC2 detection and logging.
    with copy_patch as copy_spy, confirm_patch:
        migrate_s3_to_s3(
            mock_service,
            "s3://source-bucket/",
            "s3://dest-bucket/",
            exclude=None,
            include=None,
            quiet=True,
            no_delta=False,
            max_ratio=None,
            dry_run=False,
            skip_confirm=False,
        )

    # Only file2.zip and subdir/file3.zip are expected to be copied.
    assert copy_spy.call_count == 2

    migrated_pairs = [
        (recorded.args[1], recorded.args[2]) for recorded in copy_spy.call_args_list
    ]
    expected_pairs = (
        ("s3://source-bucket/file2.zip", "s3://dest-bucket/file2.zip"),
        ("s3://source-bucket/subdir/file3.zip", "s3://dest-bucket/subdir/file3.zip"),
    )
    for pair in expected_pairs:
        assert pair in migrated_pairs
|
||||
|
||||
|
||||
def test_migrate_s3_to_s3_dry_run(mock_service):
    """A dry run must announce itself and perform no copy."""
    only_object = ObjectHead(
        key="file1.zip",
        size=1024,
        last_modified="2024-01-01T00:00:00Z",
        etag="abc123",
        metadata={},
    )
    mock_service.storage.list.return_value = iter([only_object])

    copy_patch = patch("deltaglider.app.cli.aws_compat.copy_s3_to_s3")
    echo_patch = patch("deltaglider.app.cli.aws_compat.click.echo")
    # Region logging is stubbed out because quiet=False is needed to capture
    # the dry-run messages.
    region_patch = patch("deltaglider.app.cli.aws_compat.log_aws_region")

    with copy_patch as copy_spy, echo_patch as echo_spy, region_patch:
        migrate_s3_to_s3(
            mock_service,
            "s3://source-bucket/",
            "s3://dest-bucket/",
            exclude=None,
            include=None,
            quiet=False,
            no_delta=False,
            max_ratio=None,
            dry_run=True,
            skip_confirm=False,
        )

    # Nothing may be copied in dry-run mode.
    copy_spy.assert_not_called()

    # Collect the first positional argument of every echo call that has one.
    printed = [str(recorded[0][0]) for recorded in echo_spy.call_args_list if recorded[0]]
    assert any("DRY RUN MODE" in line for line in printed)
|
||||
|
||||
|
||||
def test_migrate_s3_to_s3_with_filters(mock_service):
    """An exclude pattern must keep matching objects out of the migration."""
    stamp = "2024-01-01T00:00:00Z"
    catalogue = (
        ("file1.zip", 1024, "abc123"),
        ("file2.log", 256, "def456"),
        ("file3.tar", 512, "ghi789"),
    )
    heads = [
        ObjectHead(key=key, size=size, last_modified=stamp, etag=etag, metadata={})
        for key, size, etag in catalogue
    ]
    mock_service.storage.list.return_value = iter(heads)

    copy_patch = patch("deltaglider.app.cli.aws_compat.copy_s3_to_s3")
    echo_patch = patch("click.echo")
    confirm_patch = patch("deltaglider.app.cli.aws_compat.click.confirm", return_value=True)

    with copy_patch as copy_spy, echo_patch, confirm_patch:
        # Exclude .log files; quiet=True skips EC2 detection.
        migrate_s3_to_s3(
            mock_service,
            "s3://source-bucket/",
            "s3://dest-bucket/",
            exclude="*.log",
            include=None,
            quiet=True,
            no_delta=False,
            max_ratio=None,
            dry_run=False,
            skip_confirm=False,
        )

    # file1.zip and file3.tar are copied; file2.log is filtered out.
    assert copy_spy.call_count == 2

    migrated_sources = [recorded.args[1] for recorded in copy_spy.call_args_list]
    for copied in ("s3://source-bucket/file1.zip", "s3://source-bucket/file3.tar"):
        assert copied in migrated_sources
    assert "s3://source-bucket/file2.log" not in migrated_sources
|
||||
|
||||
|
||||
def test_migrate_s3_to_s3_skip_confirm(mock_service):
    """With skip_confirm=True no prompt is shown and the copy still happens."""
    single_head = ObjectHead(
        key="file1.zip",
        size=1024,
        last_modified="2024-01-01T00:00:00Z",
        etag="abc123",
        metadata={},
    )
    mock_service.storage.list.return_value = iter([single_head])

    copy_patch = patch("deltaglider.app.cli.aws_compat.copy_s3_to_s3")
    echo_patch = patch("click.echo")
    confirm_patch = patch("deltaglider.app.cli.aws_compat.click.confirm")

    with copy_patch as copy_spy, echo_patch, confirm_patch as confirm_spy:
        # quiet=True skips EC2 detection.
        migrate_s3_to_s3(
            mock_service,
            "s3://source-bucket/",
            "s3://dest-bucket/",
            exclude=None,
            include=None,
            quiet=True,
            no_delta=False,
            max_ratio=None,
            dry_run=False,
            skip_confirm=True,
        )

    # The prompt must be bypassed entirely...
    confirm_spy.assert_not_called()
    # ...while the copy is still carried out.
    copy_spy.assert_called_once()
|
||||
|
||||
|
||||
def test_migrate_s3_to_s3_with_prefix(mock_service):
    """Source and destination prefixes must be reflected in the copy arguments."""
    prefixed_heads = [
        ObjectHead(
            key="data/file1.zip",
            size=1024,
            last_modified="2024-01-01T00:00:00Z",
            etag="abc123",
            metadata={},
        ),
    ]

    def fake_list(prefix):
        # Only the source prefix has content; every other listing is empty.
        return iter(prefixed_heads if "source-bucket/data" in prefix else [])

    mock_service.storage.list.side_effect = fake_list

    copy_patch = patch("deltaglider.app.cli.aws_compat.copy_s3_to_s3")
    echo_patch = patch("click.echo")
    confirm_patch = patch("deltaglider.app.cli.aws_compat.click.confirm", return_value=True)

    with copy_patch as copy_spy, echo_patch, confirm_patch:
        # quiet=True skips EC2 detection.
        migrate_s3_to_s3(
            mock_service,
            "s3://source-bucket/data/",
            "s3://dest-bucket/archive/",
            exclude=None,
            include=None,
            quiet=True,
            no_delta=False,
            max_ratio=None,
            dry_run=False,
            skip_confirm=False,
        )

    # Exactly one copy, from the source prefix to the destination prefix.
    copy_spy.assert_called_once()
    positional = copy_spy.call_args.args
    assert positional[1] == "s3://source-bucket/data/file1.zip"
    assert positional[2] == "s3://dest-bucket/archive/file1.zip"
|
||||
@@ -49,9 +49,7 @@ class TestStatsCommand:
|
||||
assert output["direct_objects"] == 3
|
||||
|
||||
# Verify client was called correctly
|
||||
mock_client.get_bucket_stats.assert_called_once_with(
|
||||
"test-bucket", detailed_stats=False
|
||||
)
|
||||
mock_client.get_bucket_stats.assert_called_once_with("test-bucket", mode="quick")
|
||||
|
||||
def test_stats_json_output_detailed(self):
|
||||
"""Test stats command with detailed JSON output."""
|
||||
@@ -79,7 +77,44 @@ class TestStatsCommand:
|
||||
assert output["average_compression_ratio"] == 0.95
|
||||
|
||||
# Verify detailed flag was passed
|
||||
mock_client.get_bucket_stats.assert_called_once_with("test-bucket", detailed_stats=True)
|
||||
mock_client.get_bucket_stats.assert_called_once_with("test-bucket", mode="detailed")
|
||||
|
||||
def test_stats_json_output_sampled(self):
|
||||
"""Test stats command with sampled JSON output."""
|
||||
mock_stats = BucketStats(
|
||||
bucket="test-bucket",
|
||||
object_count=5,
|
||||
total_size=2000000,
|
||||
compressed_size=100000,
|
||||
space_saved=1900000,
|
||||
average_compression_ratio=0.95,
|
||||
delta_objects=5,
|
||||
direct_objects=0,
|
||||
)
|
||||
|
||||
with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
|
||||
mock_client = Mock()
|
||||
mock_client.get_bucket_stats.return_value = mock_stats
|
||||
mock_client_class.return_value = mock_client
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(cli, ["stats", "test-bucket", "--sampled", "--json"])
|
||||
|
||||
assert result.exit_code == 0
|
||||
mock_client.get_bucket_stats.assert_called_once_with("test-bucket", mode="sampled")
|
||||
|
||||
def test_stats_sampled_and_detailed_conflict(self):
|
||||
"""--sampled and --detailed flags must be mutually exclusive."""
|
||||
|
||||
with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
|
||||
mock_client = Mock()
|
||||
mock_client_class.return_value = mock_client
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(cli, ["stats", "test-bucket", "--sampled", "--detailed"])
|
||||
|
||||
assert result.exit_code == 1
|
||||
assert "cannot be used together" in result.output
|
||||
|
||||
def test_stats_human_readable_output(self):
|
||||
"""Test stats command with human-readable output."""
|
||||
@@ -155,9 +190,7 @@ class TestStatsCommand:
|
||||
|
||||
assert result.exit_code == 0
|
||||
# Verify bucket name was parsed correctly from S3 URL
|
||||
mock_client.get_bucket_stats.assert_called_once_with(
|
||||
"test-bucket", detailed_stats=False
|
||||
)
|
||||
mock_client.get_bucket_stats.assert_called_once_with("test-bucket", mode="quick")
|
||||
|
||||
def test_stats_with_s3_url_trailing_slash(self):
|
||||
"""Test stats command with s3:// URL format with trailing slash."""
|
||||
@@ -182,9 +215,7 @@ class TestStatsCommand:
|
||||
|
||||
assert result.exit_code == 0
|
||||
# Verify bucket name was parsed correctly from S3 URL with trailing slash
|
||||
mock_client.get_bucket_stats.assert_called_once_with(
|
||||
"test-bucket", detailed_stats=False
|
||||
)
|
||||
mock_client.get_bucket_stats.assert_called_once_with("test-bucket", mode="quick")
|
||||
|
||||
def test_stats_with_s3_url_with_prefix(self):
|
||||
"""Test stats command with s3:// URL format with prefix (should ignore prefix)."""
|
||||
@@ -209,6 +240,4 @@ class TestStatsCommand:
|
||||
|
||||
assert result.exit_code == 0
|
||||
# Verify only bucket name was extracted, prefix ignored
|
||||
mock_client.get_bucket_stats.assert_called_once_with(
|
||||
"test-bucket", detailed_stats=False
|
||||
)
|
||||
mock_client.get_bucket_stats.assert_called_once_with("test-bucket", mode="quick")
|
||||
|
||||
25
tests/unit/test_delta_extensions.py
Normal file
25
tests/unit/test_delta_extensions.py
Normal file
@@ -0,0 +1,25 @@
|
||||
"""Tests for shared delta extension policy."""
|
||||
|
||||
from deltaglider.core.delta_extensions import (
|
||||
DEFAULT_COMPOUND_DELTA_EXTENSIONS,
|
||||
DEFAULT_DELTA_EXTENSIONS,
|
||||
is_delta_candidate,
|
||||
)
|
||||
|
||||
|
||||
def test_is_delta_candidate_matches_default_extensions():
    """Every default extension must be recognised as a delta candidate."""
    assert all(is_delta_candidate(f"file{suffix}") for suffix in DEFAULT_DELTA_EXTENSIONS)
|
||||
|
||||
|
||||
def test_is_delta_candidate_matches_compound_extensions():
    """Every default compound extension must be recognised as a delta candidate."""
    assert all(
        is_delta_candidate(f"file{suffix}") for suffix in DEFAULT_COMPOUND_DELTA_EXTENSIONS
    )
|
||||
|
||||
|
||||
def test_is_delta_candidate_rejects_other_extensions():
    """File names outside the delta policy must be rejected."""
    for filename in ("document.txt", "image.jpeg"):
        assert not is_delta_candidate(filename)
|
||||
112
tests/unit/test_object_listing.py
Normal file
112
tests/unit/test_object_listing.py
Normal file
@@ -0,0 +1,112 @@
|
||||
"""Unit tests for object_listing pagination."""
|
||||
|
||||
from unittest.mock import Mock
|
||||
|
||||
from deltaglider.core.object_listing import list_all_objects, list_objects_page
|
||||
|
||||
|
||||
def test_list_objects_page_passes_continuation_token():
    """list_objects_page must forward continuation_token to storage.list_objects."""
    empty_page = {
        "objects": [],
        "common_prefixes": [],
        "is_truncated": False,
        "next_continuation_token": None,
        "key_count": 0,
    }
    storage = Mock()
    storage.list_objects.return_value = empty_page

    list_objects_page(storage, bucket="test-bucket", continuation_token="test-token")

    # One storage call, carrying the token as a keyword argument.
    storage.list_objects.assert_called_once()
    forwarded = storage.list_objects.call_args.kwargs
    assert forwarded["continuation_token"] == "test-token"
|
||||
|
||||
|
||||
def test_list_all_objects_uses_continuation_token_for_pagination():
    """list_all_objects must page using continuation_token (not start_after)."""

    def make_page(start, stop, next_token):
        # A page is truncated exactly when it carries a follow-up token.
        return {
            "objects": [{"key": f"obj{i}"} for i in range(start, stop)],
            "common_prefixes": [],
            "is_truncated": next_token is not None,
            "next_continuation_token": next_token,
            "key_count": stop - start,
        }

    storage = Mock()
    # Three pages: 1000 + 1000 + 500 objects.
    storage.list_objects.side_effect = [
        make_page(0, 1000, "token1"),
        make_page(1000, 2000, "token2"),
        make_page(2000, 2500, None),
    ]

    result = list_all_objects(storage, bucket="test-bucket", prefix="")

    # One storage call per page.
    assert storage.list_objects.call_count == 3
    # Every object from every page is collected.
    assert len(result.objects) == 2500
    # The final page ends the listing.
    assert not result.is_truncated

    # First call has no token; each later call reuses the previous page's token.
    recorded_calls = storage.list_objects.call_args_list
    assert len(recorded_calls) == 3
    first_call, second_call, third_call = recorded_calls
    assert first_call.kwargs.get("continuation_token") is None
    assert second_call.kwargs.get("continuation_token") == "token1"
    assert third_call.kwargs.get("continuation_token") == "token2"
|
||||
|
||||
|
||||
def test_list_all_objects_prevents_infinite_loop():
    """list_all_objects must stop after max_iterations pages."""
    # Storage that always reports another page is available.
    endless_page = {
        "objects": [{"key": "obj"}],
        "common_prefixes": [],
        "is_truncated": True,
        "next_continuation_token": "token",
        "key_count": 1,
    }
    storage = Mock()
    storage.list_objects.return_value = endless_page

    # A low limit keeps the test fast.
    result = list_all_objects(storage, bucket="test-bucket", max_iterations=10)

    # The loop gives up at the limit and reports the listing as truncated.
    assert storage.list_objects.call_count == 10
    assert result.is_truncated
|
||||
44
tests/unit/test_s3_uri.py
Normal file
44
tests/unit/test_s3_uri.py
Normal file
@@ -0,0 +1,44 @@
|
||||
"""Tests for S3 URI helpers."""
|
||||
|
||||
import pytest
|
||||
|
||||
from deltaglider.core.s3_uri import build_s3_url, is_s3_url, parse_s3_url
|
||||
|
||||
|
||||
def test_is_s3_url_detects_scheme() -> None:
    """is_s3_url accepts an s3:// URL and rejects an https:// one."""
    s3_style = "s3://bucket/path"
    https_style = "https://example.com/object"

    assert is_s3_url(s3_style)
    assert not is_s3_url(https_style)
|
||||
|
||||
|
||||
def test_parse_s3_url_returns_bucket_and_key() -> None:
    """parse_s3_url splits the URL at the first '/' after the bucket."""
    result = parse_s3_url("s3://my-bucket/path/to/object.txt")

    assert (result.bucket, result.key) == ("my-bucket", "path/to/object.txt")
|
||||
|
||||
|
||||
def test_parse_strips_trailing_slash_when_requested() -> None:
    """With strip_trailing_slash=True a directory-style key loses its final '/'."""
    result = parse_s3_url("s3://my-bucket/path/to/", strip_trailing_slash=True)

    assert (result.bucket, result.key) == ("my-bucket", "path/to")
|
||||
|
||||
|
||||
def test_parse_requires_key_when_configured() -> None:
    """A bucket-only URL raises ValueError when allow_empty_key is False."""
    bucket_only_url = "s3://bucket-only"

    with pytest.raises(ValueError):
        parse_s3_url(bucket_only_url, allow_empty_key=False)
|
||||
|
||||
|
||||
def test_build_s3_url_round_trip() -> None:
    """A URL produced by build_s3_url parses back to the same bucket and key."""
    rebuilt = parse_s3_url(build_s3_url("bucket", "dir/file.tar"))

    assert (rebuilt.bucket, rebuilt.key) == ("bucket", "dir/file.tar")
|
||||
|
||||
|
||||
def test_build_s3_url_for_bucket_root() -> None:
    """Without a key, build_s3_url returns the bucket URL with no trailing slash."""
    bucket_root = build_s3_url("root-bucket")

    assert bucket_root == "s3://root-bucket"
|
||||
@@ -92,7 +92,7 @@ class TestBucketStatsAlgorithm:
|
||||
mock_client.service.storage.head.side_effect = mock_head
|
||||
|
||||
# Execute
|
||||
stats = get_bucket_stats(mock_client, "compressed-bucket")
|
||||
stats = get_bucket_stats(mock_client, "compressed-bucket", mode="detailed")
|
||||
|
||||
# Verify
|
||||
assert stats.object_count == 2 # Only delta files counted (not reference.bin)
|
||||
@@ -164,7 +164,7 @@ class TestBucketStatsAlgorithm:
|
||||
mock_client.service.storage.head.side_effect = mock_head
|
||||
|
||||
# Execute
|
||||
stats = get_bucket_stats(mock_client, "mixed-bucket")
|
||||
stats = get_bucket_stats(mock_client, "mixed-bucket", mode="detailed")
|
||||
|
||||
# Verify
|
||||
assert stats.object_count == 4 # 2 delta + 2 direct files
|
||||
@@ -229,7 +229,7 @@ class TestBucketStatsAlgorithm:
|
||||
mock_client.service.storage.head.side_effect = mock_head
|
||||
|
||||
# Execute
|
||||
stats = get_bucket_stats(mock_client, "multi-deltaspace-bucket")
|
||||
stats = get_bucket_stats(mock_client, "multi-deltaspace-bucket", mode="detailed")
|
||||
|
||||
# Verify
|
||||
assert stats.object_count == 2 # Only delta files
|
||||
@@ -347,40 +347,57 @@ class TestBucketStatsAlgorithm:
|
||||
)
|
||||
|
||||
with patch_as_completed:
|
||||
_ = get_bucket_stats(mock_client, "parallel-bucket")
|
||||
_ = get_bucket_stats(mock_client, "parallel-bucket", mode="detailed")
|
||||
|
||||
# Verify ThreadPoolExecutor was used with correct max_workers
|
||||
mock_executor.assert_called_once_with(max_workers=10) # min(10, 50) = 10
|
||||
|
||||
def test_detailed_stats_flag(self, mock_client):
|
||||
"""Test that detailed_stats flag controls metadata fetching."""
|
||||
# Setup
|
||||
def test_stats_modes_control_metadata_fetch(self, mock_client):
|
||||
"""Metadata fetching should depend on the selected stats mode."""
|
||||
mock_client.service.storage.list_objects.return_value = {
|
||||
"objects": [
|
||||
{"key": "reference.bin", "size": 20000000, "last_modified": "2024-01-01"},
|
||||
{"key": "file.zip.delta", "size": 50000, "last_modified": "2024-01-02"},
|
||||
{"key": "alpha/reference.bin", "size": 100, "last_modified": "2024-01-01"},
|
||||
{"key": "alpha/file1.zip.delta", "size": 10, "last_modified": "2024-01-02"},
|
||||
{"key": "alpha/file2.zip.delta", "size": 12, "last_modified": "2024-01-03"},
|
||||
{"key": "beta/reference.bin", "size": 200, "last_modified": "2024-01-04"},
|
||||
{"key": "beta/file1.zip.delta", "size": 20, "last_modified": "2024-01-05"},
|
||||
],
|
||||
"is_truncated": False,
|
||||
}
|
||||
|
||||
# Test with detailed_stats=False (default)
|
||||
# NOTE: Currently, the implementation always fetches metadata regardless of the flag
|
||||
# This test documents the current behavior
|
||||
_ = get_bucket_stats(mock_client, "test-bucket", detailed_stats=False)
|
||||
metadata_by_key = {
|
||||
"alpha/file1.zip.delta": {"file_size": "100", "compression_ratio": "0.9"},
|
||||
"alpha/file2.zip.delta": {"file_size": "120", "compression_ratio": "0.88"},
|
||||
"beta/file1.zip.delta": {"file_size": "210", "compression_ratio": "0.9"},
|
||||
}
|
||||
|
||||
# Currently metadata is always fetched for delta files
|
||||
assert mock_client.service.storage.head.called
|
||||
def mock_head(path: str):
|
||||
for key, metadata in metadata_by_key.items():
|
||||
if key in path:
|
||||
head = Mock()
|
||||
head.metadata = metadata
|
||||
return head
|
||||
return None
|
||||
|
||||
# Reset mock
|
||||
mock_client.service.storage.head.side_effect = mock_head
|
||||
|
||||
# Quick mode: no metadata fetch
|
||||
_ = get_bucket_stats(mock_client, "mode-test", mode="quick")
|
||||
assert mock_client.service.storage.head.call_count == 0
|
||||
|
||||
# Sampled mode: one HEAD per delta-space (alpha, beta)
|
||||
mock_client.service.storage.head.reset_mock()
|
||||
stats_sampled = get_bucket_stats(mock_client, "mode-test", mode="sampled")
|
||||
assert mock_client.service.storage.head.call_count == 2
|
||||
|
||||
# Test with detailed_stats=True
|
||||
mock_client.service.storage.head.return_value = Mock(metadata={"file_size": "19500000"})
|
||||
# Detailed mode: HEAD for every delta (3 total)
|
||||
mock_client.service.storage.head.reset_mock()
|
||||
stats_detailed = get_bucket_stats(mock_client, "mode-test", mode="detailed")
|
||||
assert mock_client.service.storage.head.call_count == 3
|
||||
|
||||
_ = get_bucket_stats(mock_client, "test-bucket", detailed_stats=True)
|
||||
|
||||
# Should fetch metadata
|
||||
assert mock_client.service.storage.head.called
|
||||
# Sampled totals should be close to detailed but not identical
|
||||
assert stats_detailed.total_size == 100 + 120 + 210
|
||||
assert stats_sampled.total_size == 100 + 100 + 210
|
||||
|
||||
def test_error_handling_in_metadata_fetch(self, mock_client):
|
||||
"""Test graceful handling of errors during metadata fetch."""
|
||||
@@ -407,7 +424,7 @@ class TestBucketStatsAlgorithm:
|
||||
mock_client.service.storage.head.side_effect = mock_head
|
||||
|
||||
# Execute - should handle error gracefully
|
||||
stats = get_bucket_stats(mock_client, "error-bucket", detailed_stats=True)
|
||||
stats = get_bucket_stats(mock_client, "error-bucket", mode="detailed")
|
||||
|
||||
# Verify - file1 uses fallback, file2 uses metadata
|
||||
assert stats.object_count == 2
|
||||
|
||||
284
tests/unit/test_stats_caching.py
Normal file
284
tests/unit/test_stats_caching.py
Normal file
@@ -0,0 +1,284 @@
|
||||
"""Unit tests for bucket stats caching functionality."""
|
||||
|
||||
import json
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from deltaglider.client_models import BucketStats
|
||||
from deltaglider.client_operations.stats import (
|
||||
_get_cache_key,
|
||||
_is_cache_valid,
|
||||
_read_stats_cache,
|
||||
_write_stats_cache,
|
||||
)
|
||||
|
||||
|
||||
def test_get_cache_key():
    """Each stats mode maps to its own cache object key."""
    expected_keys = {
        "quick": ".deltaglider/stats_quick.json",
        "sampled": ".deltaglider/stats_sampled.json",
        "detailed": ".deltaglider/stats_detailed.json",
    }

    for mode, cache_key in expected_keys.items():
        assert _get_cache_key(mode) == cache_key
|
||||
|
||||
|
||||
def test_is_cache_valid_when_unchanged():
    """Validation passes when both the object count and compressed size match."""
    snapshot = dict(object_count=100, compressed_size=50000)

    # Current bucket state is identical to the cached snapshot.
    outcome = _is_cache_valid(snapshot, 100, 50000)

    assert outcome is True
|
||||
|
||||
|
||||
def test_is_cache_valid_when_count_changed():
    """Validation fails when the current object count differs from the cache."""
    snapshot = dict(object_count=100, compressed_size=50000)

    # One extra object (101 vs. 100); compressed size is left untouched.
    outcome = _is_cache_valid(snapshot, 101, 50000)

    assert outcome is False
|
||||
|
||||
|
||||
def test_is_cache_valid_when_size_changed():
    """Validation fails when the current compressed size differs from the cache."""
    snapshot = dict(object_count=100, compressed_size=50000)

    # Same object count, but the compressed size moved from 50000 to 60000.
    outcome = _is_cache_valid(snapshot, 100, 60000)

    assert outcome is False
|
||||
|
||||
|
||||
def test_write_and_read_cache_roundtrip():
    """Write a stats cache entry, inspect the stored JSON, then read it back."""
    # Fake client whose service exposes a mocked storage and logger.
    storage = MagicMock()
    service = MagicMock(storage=storage, logger=MagicMock())
    client = MagicMock(service=service)

    # Stats object that will be serialized into the cache.
    stats_fields = {
        "bucket": "test-bucket",
        "object_count": 150,
        "total_size": 1000000,
        "compressed_size": 50000,
        "space_saved": 950000,
        "average_compression_ratio": 0.95,
        "delta_objects": 140,
        "direct_objects": 10,
    }
    original_stats = BucketStats(**stats_fields)

    # Record every payload handed to storage.put instead of persisting it.
    payloads = []

    def record_put(address, data, metadata):
        payloads.append(data)

    storage.put = record_put

    # Write phase.
    _write_stats_cache(
        client=client,
        bucket="test-bucket",
        mode="quick",
        stats=original_stats,
        object_count=150,
        compressed_size=50000,
    )

    # The write must have produced a payload (the most recent one is used).
    blob = payloads[-1] if payloads else None
    assert blob is not None

    # Decode the stored bytes and check the cache document layout.
    document = json.loads(blob.decode("utf-8"))
    assert document["version"] == "1.0"
    assert document["mode"] == "quick"
    assert "computed_at" in document

    stored_validation = document["validation"]
    assert stored_validation["object_count"] == 150
    assert stored_validation["compressed_size"] == 50000

    stored_stats = document["stats"]
    assert stored_stats["bucket"] == "test-bucket"
    assert stored_stats["object_count"] == 150
    assert stored_stats["delta_objects"] == 140

    # Read phase: storage.get hands back an object carrying the same bytes.
    stored_object = MagicMock(data=blob)
    storage.get = MagicMock(return_value=stored_object)

    stats, validation = _read_stats_cache(client, "test-bucket", "quick")

    # The values read back must match what was written.
    assert stats is not None
    assert validation is not None
    assert stats.bucket == "test-bucket"
    assert stats.object_count == 150
    assert stats.delta_objects == 140
    assert stats.average_compression_ratio == 0.95
    assert validation["object_count"] == 150
    assert validation["compressed_size"] == 50000
|
||||
|
||||
|
||||
def test_read_cache_missing_file():
    """A FileNotFoundError from storage.get yields ``(None, None)``."""
    # Fake client whose service exposes a mocked storage and logger.
    storage = MagicMock()
    service = MagicMock(storage=storage, logger=MagicMock())
    client = MagicMock(service=service)

    # The cache object does not exist: the lookup raises.
    storage.get.side_effect = FileNotFoundError("No such key")

    result = _read_stats_cache(client, "test-bucket", "quick")
    stats, validation = result

    assert stats is None
    assert validation is None
|
||||
|
||||
|
||||
def test_read_cache_invalid_json():
    """An unparseable cache payload yields ``(None, None)`` and one warning."""
    # Fake client whose service exposes a mocked storage and logger.
    storage = MagicMock()
    logger = MagicMock()
    service = MagicMock(storage=storage, logger=logger)
    client = MagicMock(service=service)

    # storage.get returns an object whose bytes are not valid JSON.
    corrupted = MagicMock(data=b"not valid json {][")
    storage.get = MagicMock(return_value=corrupted)

    result = _read_stats_cache(client, "test-bucket", "quick")
    stats, validation = result

    assert stats is None
    assert validation is None
    logger.warning.assert_called_once()
|
||||
|
||||
|
||||
def test_read_cache_version_mismatch():
    """A cache document tagged version "2.0" yields ``(None, None)`` and one warning."""
    # Fake client whose service exposes a mocked storage and logger.
    storage = MagicMock()
    logger = MagicMock()
    service = MagicMock(storage=storage, logger=logger)
    client = MagicMock(service=service)

    # Otherwise well-formed document; only the version differs from the
    # "1.0" used by the other tests in this file.
    cached_stats = {
        "bucket": "test",
        "object_count": 100,
        "total_size": 1000,
        "compressed_size": 500,
        "space_saved": 500,
        "average_compression_ratio": 0.5,
        "delta_objects": 90,
        "direct_objects": 10,
    }
    document = {
        "version": "2.0",
        "mode": "quick",
        "validation": {"object_count": 100, "compressed_size": 50000},
        "stats": cached_stats,
    }

    stored_object = MagicMock(data=json.dumps(document).encode("utf-8"))
    storage.get = MagicMock(return_value=stored_object)

    result = _read_stats_cache(client, "test-bucket", "quick")
    stats, validation = result

    assert stats is None
    assert validation is None
    logger.warning.assert_called_once()
|
||||
|
||||
|
||||
def test_read_cache_mode_mismatch():
    """A cache stored for "detailed" mode is rejected when "quick" is requested."""
    # Fake client whose service exposes a mocked storage and logger.
    storage = MagicMock()
    logger = MagicMock()
    service = MagicMock(storage=storage, logger=logger)
    client = MagicMock(service=service)

    # Version is fine; the recorded mode is not the one requested below.
    cached_stats = {
        "bucket": "test",
        "object_count": 100,
        "total_size": 1000,
        "compressed_size": 500,
        "space_saved": 500,
        "average_compression_ratio": 0.5,
        "delta_objects": 90,
        "direct_objects": 10,
    }
    document = {
        "version": "1.0",
        "mode": "detailed",
        "validation": {"object_count": 100, "compressed_size": 50000},
        "stats": cached_stats,
    }

    stored_object = MagicMock(data=json.dumps(document).encode("utf-8"))
    storage.get = MagicMock(return_value=stored_object)

    # Ask for "quick" while the stored document says "detailed".
    result = _read_stats_cache(client, "test-bucket", "quick")
    stats, validation = result

    assert stats is None
    assert validation is None
    logger.warning.assert_called_once()
|
||||
|
||||
|
||||
def test_write_cache_handles_errors_gracefully():
    """A storage.put failure must not propagate; exactly one warning is logged."""
    # Fake client whose service exposes a mocked storage and logger.
    storage = MagicMock()
    logger = MagicMock()
    service = MagicMock(storage=storage, logger=logger)
    client = MagicMock(service=service)

    # Every write attempt is rejected, as with an S3 permission error.
    storage.put.side_effect = PermissionError("Access denied")

    stats_fields = {
        "bucket": "test-bucket",
        "object_count": 150,
        "total_size": 1000000,
        "compressed_size": 50000,
        "space_saved": 950000,
        "average_compression_ratio": 0.95,
        "delta_objects": 140,
        "direct_objects": 10,
    }
    bucket_stats = BucketStats(**stats_fields)

    # The call itself is the assertion: an exception here fails the test.
    _write_stats_cache(
        client=client,
        bucket="test-bucket",
        mode="quick",
        stats=bucket_stats,
        object_count=150,
        compressed_size=50000,
    )

    # The failure is reported through a single warning with a known message.
    logger.warning.assert_called_once()
    logged_call = str(logger.warning.call_args)
    assert "Failed to write cache" in logged_call
|
||||
Reference in New Issue
Block a user