Mirror of https://github.com/beshu-tech/deltaglider.git (synced 2026-04-30 12:14:32 +02:00)
Compare commits
80 Commits
.github/workflows/ci.yml (vendored), 2 lines changed:

```diff
@@ -98,7 +98,7 @@ jobs:
     runs-on: ubuntu-latest
     services:
       localstack:
-        image: localstack/localstack:latest
+        image: localstack/localstack:4.4
        ports:
          - 4566:4566
        env:
```
.github/workflows/docker-publish.yml (new file, 92 lines, vendored):

```yaml
name: Build and Publish Docker Images

on:
  push:
    branches:
      - main
      - develop
    tags:
      - 'v*'
  pull_request:
    branches:
      - main
  workflow_dispatch:

env:
  REGISTRY: docker.io
  IMAGE_NAME: beshultd/deltaglider

jobs:
  build-and-push:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0  # Full history for proper git describe

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Extract version from git
        id: version
        run: |
          # Get version from git tags
          VERSION=$(git describe --tags --always --abbrev=0 2>/dev/null || echo "dev")
          # Remove 'v' prefix if present
          VERSION=${VERSION#v}
          echo "version=${VERSION}" >> $GITHUB_OUTPUT
          echo "Version: ${VERSION}"

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.IMAGE_NAME }}
          tags: |
            # For main branch: tag as 'latest'
            type=raw,value=latest,enable=${{ github.ref == 'refs/heads/main' }}
            # For develop branch: tag as 'develop'
            type=raw,value=develop,enable=${{ github.ref == 'refs/heads/develop' }}
            # For version tags: use semver patterns
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
            type=semver,pattern={{major}}
            # For PRs: tag as pr-<number>
            type=ref,event=pr
            # Include git sha for traceability (only on branch pushes, not tags)
            type=sha,prefix={{branch}}-,enable=${{ startsWith(github.ref, 'refs/heads/') }}

      - name: Build and push Docker image
        uses: docker/build-push-action@v5
        with:
          context: .
          platforms: linux/amd64,linux/arm64
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          build-args: |
            VERSION=${{ steps.version.outputs.version }}
          cache-from: type=gha
          cache-to: type=gha,mode=max

      - name: Docker Hub Description
        if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
        uses: peter-evans/dockerhub-description@v4
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
          repository: ${{ env.IMAGE_NAME }}
          short-description: "Store 4TB in 5GB: S3-compatible storage with 99.9% compression"
          readme-filepath: ./README.md
```
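For reference, with the metadata-action configuration above, pushing a release tag such as `v1.2.3` would publish the semver tags alongside `latest` from `main`. The commands below are illustrative of the resulting tags, not output from an actual run:

```bash
# Illustrative only: tags a hypothetical v1.2.3 release would publish
docker pull beshultd/deltaglider:1.2.3
docker pull beshultd/deltaglider:1.2
docker pull beshultd/deltaglider:1
docker pull beshultd/deltaglider:latest   # produced only from the main branch
```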
.github/workflows/release-manual.yml (vendored), 2 lines changed:

```diff
@@ -146,7 +146,7 @@ jobs:
     runs-on: ubuntu-latest
     services:
       localstack:
-        image: localstack/localstack:latest
+        image: localstack/localstack:4.4
        ports:
          - 4566:4566
        env:
```
.github/workflows/release.yml (vendored), 2 lines changed:

```diff
@@ -150,7 +150,7 @@ jobs:
     runs-on: ubuntu-latest
     services:
       localstack:
-        image: localstack/localstack:latest
+        image: localstack/localstack:4.4
        ports:
          - 4566:4566
        env:
```
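The changelog below notes that the same pin was applied to docker-compose files. A minimal sketch of what that looks like; the service layout is assumed, not copied from the repository:

```yaml
services:
  localstack:
    # pinned: 'localstack/localstack:latest' now requires a paid license
    image: localstack/localstack:4.4
    ports:
      - "4566:4566"
```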
BOTO3_COMPATIBILITY.md:

```diff
@@ -2,7 +2,7 @@

 DeltaGlider implements a **subset** of boto3's S3 client API, focusing on the most commonly used operations. This is **not** a 100% drop-in replacement, but covers the core functionality needed for most use cases.

-## ✅ Implemented Methods (21 core methods)
+## ✅ Implemented Methods (23 core methods)

 ### Object Operations
 - ✅ `put_object()` - Upload objects (with automatic delta compression)
@@ -17,6 +17,8 @@
 - ✅ `create_bucket()` - Create buckets
 - ✅ `delete_bucket()` - Delete empty buckets
 - ✅ `list_buckets()` - List all buckets
+- ✅ `put_bucket_acl()` - Set bucket ACL (passthrough to S3)
+- ✅ `get_bucket_acl()` - Get bucket ACL (passthrough to S3)

 ### Presigned URLs
 - ✅ `generate_presigned_url()` - Generate presigned URLs
@@ -46,8 +48,6 @@
 - ❌ `list_parts()`

 ### Access Control (ACL)
-- ❌ `get_bucket_acl()`
-- ❌ `put_bucket_acl()`
 - ❌ `get_object_acl()`
 - ❌ `put_object_acl()`
 - ❌ `get_public_access_block()`
@@ -135,9 +135,9 @@
 ## Coverage Analysis

-**Implemented:** ~21 methods
+**Implemented:** ~23 methods
 **Total boto3 S3 methods:** ~100+ methods
-**Coverage:** ~20%
+**Coverage:** ~23%

 ## What's Covered
```
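To illustrate the newly implemented ACL passthroughs, here is a minimal usage sketch. The `create_client` import path and default-argument call are assumed from the SDK descriptions elsewhere in this repo rather than verified against the code; the keyword arguments mirror boto3's `put_bucket_acl`/`get_bucket_acl`:

```python
from deltaglider import create_client  # assumed import path

client = create_client()  # picks up AWS credentials from the environment

# Apply a canned ACL on a bucket (passthrough to native S3 ACLs)
client.put_bucket_acl(Bucket="releases", ACL="private")

# Read it back as a boto3-style dict
acl = client.get_bucket_acl(Bucket="releases")
print(acl["Grants"])
```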
CHANGELOG.md (new file, 303 lines):

# Changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [6.1.1] - 2026-03-23

### Fixed
- **S3-Compatible Endpoint Support**: Disabled boto3 automatic request checksums (CRC32/CRC64) that were added in boto3 1.36+. S3-compatible stores like Hetzner Object Storage reject these headers with `BadRequest`, breaking direct (non-delta) file uploads. Sets `request_checksum_calculation="when_required"` to restore compatibility while still working with AWS S3 (see the sketch below).
- **CI: LocalStack pinned to 4.4**: `localstack/localstack:latest` now requires a paid license; pinned to the last free version across all workflows and docker-compose files.

### Changed
- **Dependency Pinning**: All runtime dependencies now use major-version upper bounds (`boto3>=1.35.0,<2.0.0`, etc.) to prevent surprise breaking changes in Docker builds.

### Added
- **S3 Compatibility Tests**: New `test_s3_compat.py` unit tests verifying the boto3 client disables automatic checksums and `put_object` doesn't pass checksum kwargs, as regression protection for non-AWS S3 endpoints.
- **Dependency Management Guide**: Added quarterly dependency refresh checklist and known compatibility constraints to CLAUDE.md.
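For context, the checksum opt-out described above maps to a botocore `Config` setting. A minimal sketch of how such a client can be constructed; the endpoint is a placeholder and this is not copied from DeltaGlider's adapter:

```python
import boto3
from botocore.config import Config

# Ask botocore to add request checksums only when the API requires them,
# instead of on every PutObject (the boto3 1.36+ default that some
# S3-compatible stores reject with BadRequest).
s3 = boto3.client(
    "s3",
    endpoint_url="https://objectstorage.example.com",  # placeholder endpoint
    config=Config(request_checksum_calculation="when_required"),
)
```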
## [6.1.0] - 2025-02-07

### Added
- **Bucket ACL Management**: New `put_bucket_acl()` and `get_bucket_acl()` methods
  - boto3-compatible passthrough to native S3 ACL operations
  - Supports canned ACLs (`private`, `public-read`, `public-read-write`, `authenticated-read`)
  - Supports grant-based ACLs (`GrantRead`, `GrantWrite`, `GrantFullControl`, etc.)
  - Supports full `AccessControlPolicy` dict for fine-grained control
  - SDK method count increased from 21 to 23
- **New CLI Commands**: `deltaglider put-bucket-acl` and `deltaglider get-bucket-acl` (example below)
  - Mirrors `aws s3api put-bucket-acl` / `get-bucket-acl` syntax
  - Accepts bucket name or `s3://bucket` URL format
  - JSON output for `get-bucket-acl` (compatible with AWS CLI)
  - Supports `--endpoint-url`, `--region`, `--profile` flags
- **Docker Publishing**: Added GitHub Actions workflow for multi-arch Docker image builds (amd64/arm64)

### Changed
- **Refactor**: Extracted `DeltaGliderConfig` dataclass for centralized configuration management
- **Refactor**: Introduced typed `DeleteResult` and `RecursiveDeleteResult` dataclasses replacing raw dicts
- **Refactor**: Centralized S3 metadata key aliases into `core/models.py` constants
- **Refactor**: Extracted helper methods in `DeltaService` for improved readability

### Fixed
- Removed unused imports flagged by ruff in test files

### Documentation
- Updated BOTO3_COMPATIBILITY.md (coverage 20% → 23%)
- Updated AWS S3 CLI compatibility docs with ACL command examples
- Refreshed README with dark mode logo and streamlined content
- Cleaned up SDK documentation and examples
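A quick illustration of the new CLI commands. The `--acl` flag for canned ACLs is assumed from the `aws s3api`-style syntax mentioned above, since the full CLI signature is not shown here:

```bash
# Read a bucket's ACL as JSON (bucket name or s3:// URL both accepted)
deltaglider get-bucket-acl s3://releases --endpoint-url http://localhost:9000

# Apply a canned ACL; '--acl' is assumed to mirror 'aws s3api put-bucket-acl'
deltaglider put-bucket-acl releases --acl private
```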
## [6.0.0] - 2025-10-17

### Added
- **EC2 Region Detection & Cost Optimization**
  - Automatic detection of EC2 instance region using IMDSv2 (see the sketch after this list)
  - Warns when EC2 region ≠ S3 client region (potential cross-region charges)
  - Different warnings for auto-detected vs. explicit `--region` flag mismatches
  - Green checkmark when regions are aligned (optimal configuration)
  - Can be disabled with `DG_DISABLE_EC2_DETECTION=true` environment variable
  - Helps users optimize for cost and performance before migration starts
- **New CLI Command**: `deltaglider migrate` for S3-to-S3 bucket migration with compression
  - Supports resume capability (skips already migrated files)
  - Real-time progress tracking with file count and statistics
  - Interactive confirmation prompt (use `--yes` to skip)
  - Prefix preservation by default (use `--no-preserve-prefix` to disable)
  - Dry run mode with `--dry-run` flag
  - Include/exclude pattern filtering
  - Shows compression statistics after migration
- **EC2-aware region logging**: Detects EC2 instance and warns about cross-region charges
- **FIXED**: Now correctly preserves original filenames during migration
- **S3-to-S3 Recursive Copy**: `deltaglider cp -r s3://source/ s3://dest/` now supported
  - Automatically uses migration functionality with prefix preservation
  - Applies delta compression during transfer
  - Preserves original filenames correctly
- **Version Command**: Added `--version` flag to show deltaglider version
  - Usage: `deltaglider --version`
- **DeltaService API Enhancement**: Added `override_name` parameter to `put()` method
  - Allows specifying destination filename independently of source filesystem path
  - Enables proper S3-to-S3 transfers without filesystem renaming tricks
- **Rehydration & Purge**: Automatic rehydration of delta-compressed files for presigned URL access
  - New `deltaglider purge` CLI command to clean expired temporary files
- **Metadata Namespace**: Centralized `dg-` prefixed metadata keys for all DeltaGlider metadata
- **S3-Based Stats Caching**: Bucket statistics cached in S3 with automatic invalidation
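The IMDSv2 lookup behind the region detection boils down to a token request followed by a metadata read. A shell sketch of the mechanism, shown for illustration; DeltaGlider's actual implementation does this in Python:

```bash
# IMDSv2: fetch a session token, then read the instance's region
TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
  -H "X-aws-ec2-metadata-token-ttl-seconds: 300")
curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
  "http://169.254.169.254/latest/meta-data/placement/region"
```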
### Fixed
- **Critical**: S3-to-S3 migration now preserves original filenames
  - Previously created files with temp names like `tmp1b9cpdsn.zip`
  - Now correctly uses original filenames from source S3 keys
  - Fixed by adding `override_name` parameter to `DeltaService.put()`
- **CLI Region Support**: `--region` flag now properly passes region to boto3 client
  - Previously only set environment variable, relied on boto3 auto-detection
  - Now explicitly passes `region_name` to `boto3.client()` via `boto3_kwargs`
  - Ensures consistent behavior with `DeltaGliderClient` SDK

### Changed
- Recursive S3-to-S3 copy operations now preserve source prefix structure by default
- Migration operations show formatted output with source and destination paths

### Documentation
- Added comprehensive migration guide in README.md
- Updated CLI reference with migrate command examples
- Added prefix preservation behavior documentation

## [5.1.1] - 2025-01-10

### Fixed
- **Stats Command**: Fixed incorrect compression ratio calculations
  - Now correctly counts ALL files including reference.bin in compressed size
  - Fixed handling of orphaned reference.bin files (reference files with no delta files)
  - Added prominent warnings for orphaned reference files with cleanup commands
  - Fixed stats for buckets with no compression (now shows 0% instead of negative)
  - SHA1 checksum files are now properly included in calculations

### Improved
- **Stats Performance**: Optimized metadata fetching with parallel requests
  - 5-10x faster for buckets with many delta files
  - Uses ThreadPoolExecutor for concurrent HEAD requests
  - Single-pass calculation algorithm for better efficiency

## [5.1.0] - 2025-10-10

### Added
- **New CLI Command**: `deltaglider stats <bucket>` for bucket statistics and compression metrics
  - Supports `--detailed` flag for comprehensive analysis
  - Supports `--json` flag for machine-readable output
  - Accepts multiple formats: `s3://bucket/`, `s3://bucket`, `bucket`
- **Session-Level Statistics Caching**: Bucket stats now cached per client instance
  - Automatic cache invalidation on mutations (put, delete, bucket operations)
  - Intelligent cache reuse (detailed stats serve quick stat requests)
  - Enhanced `list_buckets()` includes cached stats when available
- **Programmatic Cache Management**: Added cache management APIs for long-running applications (illustrated below)
  - `clear_cache()`: Clear all cached references
  - `evict_cache()`: Remove specific cached reference
  - Session-scoped cache lifecycle management

### Changed
- Bucket statistics are now cached within client session for performance
- `list_buckets()` response includes `DeltaGliderStats` metadata when cached

### Documentation
- Added comprehensive DG_MAX_RATIO tuning guide in docs/
- Updated CLI command reference in CLAUDE.md and README.md
- Added detailed cache management documentation
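A minimal illustration of the session-scoped cache management calls named above. The `create_client` import path is assumed, and `evict_cache` arguments are omitted because its exact signature isn't documented here:

```python
from deltaglider import create_client  # assumed import path

client = create_client()

# ... uploads/downloads happen here, populating the session reference cache ...

# Long-running services can drop all cached references between batches
client.clear_cache()
```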
## [5.0.3] - 2025-10-10

### Security
- **BREAKING**: Removed all legacy shared cache code for security
- **BREAKING**: Encryption is now ALWAYS ON (cannot be disabled)
- Ephemeral process-isolated cache is now the ONLY mode (no opt-out)
- **Content-Addressed Storage (CAS)**: Implemented SHA256-based cache storage
  - Zero collision risk (SHA256 namespace guarantees uniqueness)
  - Automatic deduplication (same content = same filename)
  - Tampering protection (changing content changes SHA, breaks lookup)
  - Two-level directory structure for filesystem optimization
- **Encrypted Cache**: All cache data encrypted at rest using Fernet (AES-128-CBC + HMAC)
  - Ephemeral encryption keys per process (forward secrecy)
  - Optional persistent keys via `DG_CACHE_ENCRYPTION_KEY` for shared filesystems
  - Automatic cleanup of corrupted cache files on decryption failures
- Fixed TOCTOU vulnerabilities with atomic SHA validation at use-time
- Added `get_validated_ref()` method to prevent cache poisoning
- Eliminated multi-user data exposure through mandatory cache isolation

### Removed
- **BREAKING**: Removed `DG_UNSAFE_SHARED_CACHE` environment variable
- **BREAKING**: Removed `DG_CACHE_DIR` environment variable
- **BREAKING**: Removed `DG_CACHE_ENCRYPTION` environment variable (encryption always on)
- **BREAKING**: Removed `cache_dir` parameter from `create_client()`

### Changed
- Cache is now auto-created in `/tmp/deltaglider-*` and cleaned on exit
- All cache operations use file locking (Unix) and SHA validation
- Added `CacheMissError` and `CacheCorruptionError` exceptions

### Added
- New `ContentAddressedCache` adapter in `adapters/cache_cas.py`
- New `EncryptedCache` wrapper in `adapters/cache_encrypted.py`
- New `MemoryCache` adapter in `adapters/cache_memory.py` with LRU eviction
- Self-describing cache structure with SHA256-based filenames
- Configurable cache backends via `DG_CACHE_BACKEND` (filesystem or memory)
- Memory cache size limit via `DG_CACHE_MEMORY_SIZE_MB` (default: 100MB)

### Internal
- Updated all tests to use Content-Addressed Storage and encryption
- All 119 tests passing with zero errors (99 original + 20 new cache tests)
- Type checking: 0 errors (mypy)
- Linting: All checks passed (ruff)
- Completed Phase 1, 2, and 7 of SECURITY_FIX_ROADMAP.md
- Added comprehensive test suites for encryption (13 tests) and memory cache (10 tests)

## [5.0.1] - 2025-01-10

### Changed
- **Code Organization**: Refactored client.py from 1560 to 1154 lines (26% reduction)
  - Extracted client operations into modular `client_operations/` package:
    - `bucket.py` - S3 bucket management operations
    - `presigned.py` - Presigned URL generation
    - `batch.py` - Batch upload/download operations
    - `stats.py` - Analytics and statistics operations
  - Improved code maintainability with logical separation of concerns
  - Better developer experience with cleaner module structure

### Internal
- Full type safety maintained with mypy (0 errors)
- All 99 tests passing
- Code quality checks passing (ruff)
- No breaking changes - all public APIs remain unchanged

## [5.0.0] - 2025-01-10

### Added
- boto3-compatible TypedDict types for S3 responses (no boto3 import needed)
- Complete boto3 compatibility vision document
- Type-safe response builders using TypedDict patterns

### Changed
- **BREAKING**: `list_objects()` now returns boto3-compatible dict instead of custom dataclass
  - Use `response['Contents']` instead of `response.contents`
  - Use `response.get('IsTruncated')` instead of `response.is_truncated`
  - Use `response.get('NextContinuationToken')` instead of `response.next_continuation_token`
  - DeltaGlider metadata now in `Metadata` field of each object
- Internal response building now uses TypedDict for compile-time type safety
- All S3 responses are dicts at runtime (TypedDict is a dict!)

### Fixed
- Updated all documentation examples to use dict-based responses
- Fixed pagination examples in README and API docs
- Corrected SDK documentation with accurate method signatures

## [4.2.4] - 2025-01-10

### Fixed
- Show only filename in `ls` output instead of full path for cleaner display
- Correct `ls` command path handling and prefix display logic

## [4.2.3] - 2025-01-07

### Added
- Comprehensive test coverage for `delete_objects_recursive()` method with 19 thorough tests
  - Tests cover delta suffix handling, error/warning aggregation, statistics tracking, and edge cases
- Better code organization with separate `client_models.py` and `client_delete_helpers.py` modules

### Fixed
- Fixed all mypy type errors using proper `cast()` for type safety
- Improved type hints for dictionary operations in client code

### Changed
- Refactored client code into logical modules for better maintainability
- Enhanced code quality with comprehensive linting and type checking
- All 99 integration/unit tests passing with zero type errors

### Internal
- Better separation of concerns in client module
- Improved developer experience with clearer code structure

## [4.2.2] - 2024-10-06

### Fixed
- Add .delta suffix fallback for `delete_object()` method
- Handle regular S3 objects without DeltaGlider metadata
- Update mypy type ignore comment for compatibility

## [4.2.1] - 2024-10-06

### Fixed
- Make GitHub release creation non-blocking in workflows

## [4.2.0] - 2024-10-03

### Added
- AWS credential parameters to `create_client()` function
- Support for custom endpoint URLs
- Enhanced boto3 compatibility

## [4.1.0] - 2024-09-29

### Added
- boto3-compatible client API
- Bucket management methods
- Comprehensive SDK documentation

## [4.0.0] - 2024-09-21

### Added
- Initial public release
- CLI with AWS S3 compatibility
- Delta compression for versioned artifacts
- 99%+ compression for similar files

[6.1.0]: https://github.com/beshu-tech/deltaglider/compare/v6.0.2...v6.1.0
[6.0.0]: https://github.com/beshu-tech/deltaglider/compare/v5.1.1...v6.0.0
[5.1.0]: https://github.com/beshu-tech/deltaglider/compare/v5.0.3...v5.1.0
[5.0.3]: https://github.com/beshu-tech/deltaglider/compare/v5.0.1...v5.0.3
[5.0.1]: https://github.com/beshu-tech/deltaglider/compare/v5.0.0...v5.0.1
[5.0.0]: https://github.com/beshu-tech/deltaglider/compare/v4.2.4...v5.0.0
[4.2.4]: https://github.com/beshu-tech/deltaglider/compare/v4.2.3...v4.2.4
[4.2.3]: https://github.com/beshu-tech/deltaglider/compare/v4.2.2...v4.2.3
[4.2.2]: https://github.com/beshu-tech/deltaglider/compare/v4.2.1...v4.2.2
[4.2.1]: https://github.com/beshu-tech/deltaglider/compare/v4.2.0...v4.2.1
[4.2.0]: https://github.com/beshu-tech/deltaglider/compare/v4.1.0...v4.2.0
[4.1.0]: https://github.com/beshu-tech/deltaglider/compare/v4.0.0...v4.1.0
[4.0.0]: https://github.com/beshu-tech/deltaglider/releases/tag/v4.0.0
CLAUDE.md, 88 lines changed:

````diff
@@ -74,6 +74,19 @@ export AWS_SECRET_ACCESS_KEY=minioadmin

 # Now you can use deltaglider commands
 deltaglider cp test.zip s3://test-bucket/
+deltaglider stats test-bucket  # Get bucket statistics
 ```

+### Available CLI Commands
+```bash
+cp              # Copy files to/from S3 (AWS S3 compatible)
+ls              # List S3 buckets or objects (AWS S3 compatible)
+rm              # Remove S3 objects (AWS S3 compatible)
+sync            # Synchronize directories with S3 (AWS S3 compatible)
+stats           # Get bucket statistics and compression metrics
+verify          # Verify integrity of delta file
+put-bucket-acl  # Set bucket ACL (s3api compatible passthrough)
+get-bucket-acl  # Get bucket ACL (s3api compatible passthrough)
+```

 ## Architecture
@@ -97,13 +110,15 @@ src/deltaglider/
 │   ├── logger.py          # LoggerPort protocol for logging
 │   └── metrics.py         # MetricsPort protocol for observability
 ├── adapters/              # Concrete implementations
-│   ├── storage_s3.py      # S3StorageAdapter using boto3
-│   ├── diff_xdelta.py     # XdeltaAdapter using xdelta3 binary
-│   ├── hash_sha256.py     # Sha256Adapter for checksums
-│   ├── cache_fs.py        # FsCacheAdapter for file system cache
-│   ├── clock_utc.py       # UtcClockAdapter for UTC timestamps
-│   ├── logger_std.py      # StdLoggerAdapter for console output
-│   └── metrics_noop.py    # NoopMetricsAdapter (placeholder)
+│   ├── storage_s3.py      # S3StorageAdapter using boto3
+│   ├── diff_xdelta.py     # XdeltaAdapter using xdelta3 binary
+│   ├── hash_sha256.py     # Sha256Adapter for checksums
+│   ├── cache_cas.py       # ContentAddressedCache (SHA256-based storage)
+│   ├── cache_encrypted.py # EncryptedCache (Fernet encryption wrapper)
+│   ├── cache_memory.py    # MemoryCache (LRU in-memory cache)
+│   ├── clock_utc.py       # UtcClockAdapter for UTC timestamps
+│   ├── logger_std.py      # StdLoggerAdapter for console output
+│   └── metrics_noop.py    # NoopMetricsAdapter (placeholder)
 └── app/
     └── cli/               # Click-based CLI application
         ├── main.py        # Main CLI entry point with AWS S3 commands
@@ -140,7 +155,13 @@ src/deltaglider/
 2. **Reference Management** (`core/service.py`):
    - Reference stored at `{deltaspace.prefix}/reference.bin`
    - SHA256 verification on every read/write
-   - Local cache in `/tmp/.deltaglider/reference_cache` for performance
+   - **Content-Addressed Storage (CAS)** cache in `/tmp/deltaglider-*` (ephemeral)
+   - Cache uses SHA256 as filename with two-level directory structure (ab/cd/abcdef...)
+   - Automatic deduplication: same content = same SHA = same cache file
+   - Zero collision risk: SHA256 namespace guarantees uniqueness
+   - **Encryption**: Optional Fernet (AES-128-CBC + HMAC) encryption at rest (enabled by default)
+   - Ephemeral encryption keys per process for forward secrecy
+   - **Cache Backends**: Configurable filesystem or in-memory cache with LRU eviction

 3. **Sync Algorithm** (`app/cli/sync.py`):
    - Compares local vs S3 using size and modification time
@@ -181,13 +202,26 @@ Core delta logic is in `src/deltaglider/core/service.py`:
 ## Environment Variables

 - `DG_LOG_LEVEL`: Logging level (default: "INFO")
-- `DG_CACHE_DIR`: Local reference cache directory (default: "/tmp/.deltaglider/reference_cache")
-- `DG_MAX_RATIO`: Maximum acceptable delta/file ratio (default: "0.5")
+- `DG_MAX_RATIO`: Maximum acceptable delta/file ratio (default: "0.5", range: "0.0-1.0")
+  - **See [docs/DG_MAX_RATIO.md](docs/DG_MAX_RATIO.md) for complete tuning guide**
+  - Controls when to use delta vs. direct storage
+  - Lower (0.2-0.3) = conservative, only high-quality compression
+  - Higher (0.6-0.7) = permissive, accept modest savings
+- `DG_CACHE_BACKEND`: Cache backend type - "filesystem" (default) or "memory"
+- `DG_CACHE_MEMORY_SIZE_MB`: Memory cache size limit in MB (default: "100")
+- `DG_CACHE_ENCRYPTION_KEY`: Optional base64-encoded Fernet key for persistent encryption (ephemeral by default)
 - `AWS_ENDPOINT_URL`: Override S3 endpoint for MinIO/LocalStack
 - `AWS_ACCESS_KEY_ID`: AWS credentials
 - `AWS_SECRET_ACCESS_KEY`: AWS credentials
 - `AWS_DEFAULT_REGION`: AWS region

+**Security Notes**:
+- **Encryption Always On**: Cache data is ALWAYS encrypted (cannot be disabled)
+- **Ephemeral Keys**: Encryption keys auto-generated per process for maximum security
+- **Auto-Cleanup**: Corrupted cache files automatically deleted on decryption failures
+- **Process Isolation**: Each process gets isolated cache in `/tmp/deltaglider-*`, cleaned up on exit
+- **Persistent Keys**: Set `DG_CACHE_ENCRYPTION_KEY` only if you need cross-process cache sharing (e.g., shared filesystems)

 ## Important Implementation Details

 1. **xdelta3 Binary Dependency**: The system requires xdelta3 binary installed on the system. The `XdeltaAdapter` uses subprocess to call it.
````
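Since `DG_CACHE_ENCRYPTION_KEY` must be a base64-encoded Fernet key, a quick way to generate one with the standard `cryptography` API; this is an illustration, not a documented DeltaGlider workflow:

```python
from cryptography.fernet import Fernet

# Generates a urlsafe-base64-encoded 32-byte key suitable for Fernet
key = Fernet.generate_key()
print(key.decode())  # export as DG_CACHE_ENCRYPTION_KEY via your secrets manager
```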
```diff
@@ -202,7 +236,11 @@ Core delta logic is in `src/deltaglider/core/service.py`:

 ## Performance Considerations

 - Local reference caching dramatically improves performance for repeated operations
+- **Content-Addressed Storage**: SHA256-based deduplication eliminates redundant storage
+- **Cache Backends**:
+  - Filesystem cache (default): persistent across processes, good for shared workflows
+  - Memory cache: faster, zero I/O, perfect for ephemeral CI/CD pipelines
+- **Encryption Overhead**: ~10-15% performance impact, provides security at rest
 - Delta compression is CPU-intensive; consider parallelization for bulk uploads
 - The default max_ratio of 0.5 prevents storing inefficient deltas
 - For files <1MB, delta overhead may exceed benefits
```
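To make the `max_ratio` guard concrete, the decision reduces to a single comparison. A hypothetical sketch; the function and variable names are illustrative, not DeltaGlider's internals:

```python
def should_store_as_delta(delta_size: int, original_size: int, max_ratio: float = 0.5) -> bool:
    """Store a delta only if it is small enough relative to the original file."""
    if original_size == 0:
        return False
    # e.g. a 100 MB artifact with a 98 KB delta gives a ratio of ~0.001, well under 0.5
    return delta_size / original_size <= max_ratio
```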
```diff
@@ -212,4 +250,30 @@ Core delta logic is in `src/deltaglider/core/service.py`:
 - Never store AWS credentials in code
 - Use IAM roles when possible
 - All S3 operations respect bucket policies and encryption settings
 - SHA256 checksums prevent tampering and corruption
+- **Encryption Always On**: Cache data is ALWAYS encrypted using Fernet (AES-128-CBC + HMAC) - cannot be disabled
+- **Ephemeral Keys**: Encryption keys auto-generated per process for forward secrecy and process isolation
+- **Auto-Cleanup**: Corrupted or tampered cache files automatically deleted on decryption failures
+- **Persistent Keys**: Set `DG_CACHE_ENCRYPTION_KEY` only for cross-process cache sharing (use secrets management)
+- **Content-Addressed Storage**: SHA256-based filenames prevent collision attacks
+- **Zero-Trust Cache**: All cache operations include cryptographic validation
+
+## Dependency Management
+
+### Pinning Strategy
+Runtime dependencies in `pyproject.toml` use **compatible range pins** (`>=x.y.z,<NEXT_MAJOR`). This prevents surprise breaking changes from new major versions while allowing patch/minor updates.
+
+**Critical dependency: `boto3`**. This is the most breakage-prone dependency. AWS periodically changes default behaviors in minor releases (e.g., boto3 1.36+ added automatic request checksums that break S3-compatible stores like Hetzner Object Storage). The S3 adapter (`adapters/storage_s3.py`) explicitly sets `request_checksum_calculation="when_required"` to maintain compatibility with non-AWS S3 endpoints.
+
+### Quarterly Dependency Refresh (do every ~3 months)
+1. **Check for updates**: `uv pip compile pyproject.toml --upgrade --dry-run`
+2. **Update in a branch**: bump version floors in `pyproject.toml` to current stable releases
+3. **Run full test suite**: `uv run pytest` (unit + integration)
+4. **Test against S3-compatible stores**: test a small file upload against Hetzner (or whichever non-AWS endpoint is in use); boto3 updates are the most likely to break this
+5. **Rebuild Docker image** and test the same upload from the container
+6. **Check changelogs** for boto3, cryptography, and click for any deprecation notices or behavior changes
+
+### Known Compatibility Constraints
+- **boto3**: Must use `request_checksum_calculation="when_required"` for Hetzner/MinIO compatibility. If upgrading past a new major behavior change, test direct uploads (non-delta path) of small files to non-AWS endpoints.
+- **cryptography**: Fernet API has been stable, but major versions may drop old OpenSSL support. Verify cache encryption still works after upgrades.
+- **click**: CLI argument parsing. Major versions may change decorator behavior. Run integration tests (`test_aws_cli_commands_v2.py`) after upgrades.
```
Dockerfile, 53 lines changed:

```diff
@@ -1,6 +1,7 @@
 # Multi-stage build for deltaglider
 ARG PYTHON_VERSION=3.12-slim
 ARG UV_VERSION=0.5.13
+ARG VERSION=6.0.2

 # Builder stage - install UV and dependencies
 FROM ghcr.io/astral-sh/uv:$UV_VERSION AS uv
@@ -16,21 +17,29 @@ WORKDIR /build
 COPY pyproject.toml ./
 COPY README.md ./

 # Install dependencies with UV caching
 RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --compile-bytecode .

-# Copy source code
+# Copy source code - needed for setuptools-scm to write version file
 COPY src ./src

-# Install the package (force reinstall to ensure it's properly installed)
+# Install dependencies and package with UV caching
+# Set SETUPTOOLS_SCM_PRETEND_VERSION to avoid needing .git directory
+ARG VERSION
+ENV SETUPTOOLS_SCM_PRETEND_VERSION_FOR_DELTAGLIDER=${VERSION}
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --compile-bytecode --no-deps --force-reinstall .
+    uv pip install --compile-bytecode .

 # Runtime stage - minimal image
 FROM python:${PYTHON_VERSION}

-# Install xdelta3
+# Skip man pages and docs to speed up builds
+RUN mkdir -p /etc/dpkg/dpkg.cfg.d && \
+    echo 'path-exclude /usr/share/doc/*' > /etc/dpkg/dpkg.cfg.d/01_nodoc && \
+    echo 'path-exclude /usr/share/man/*' >> /etc/dpkg/dpkg.cfg.d/01_nodoc && \
+    echo 'path-exclude /usr/share/groff/*' >> /etc/dpkg/dpkg.cfg.d/01_nodoc && \
+    echo 'path-exclude /usr/share/info/*' >> /etc/dpkg/dpkg.cfg.d/01_nodoc && \
+    echo 'path-exclude /usr/share/lintian/*' >> /etc/dpkg/dpkg.cfg.d/01_nodoc && \
+    echo 'path-exclude /usr/share/linda/*' >> /etc/dpkg/dpkg.cfg.d/01_nodoc
+
+# Install xdelta3 (now much faster without man pages)
 RUN apt-get update && \
     apt-get install -y --no-install-recommends xdelta3 && \
     apt-get clean && \
@@ -57,10 +66,34 @@ USER deltaglider
 HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
     CMD deltaglider --help || exit 1

 # Environment variables (all optional, can be overridden at runtime)
+# Logging
 ENV DG_LOG_LEVEL=INFO

+# Performance & Compression
+# DG_MAX_RATIO: Maximum delta/file ratio (0.0-1.0)
+# Default 0.5 means: only use delta if delta_size ≤ 50% of original_size
+# Lower (0.2-0.3) = more conservative, only high-quality compression
+# Higher (0.6-0.7) = more permissive, accept modest savings
+# See docs/DG_MAX_RATIO.md for complete tuning guide
+ENV DG_MAX_RATIO=0.5
+
+# Cache Configuration
+ENV DG_CACHE_BACKEND=filesystem
+ENV DG_CACHE_MEMORY_SIZE_MB=100
+# ENV DG_CACHE_ENCRYPTION_KEY=<base64-key>  # Optional: Set for cross-process cache sharing
+
+# AWS Configuration (override at runtime)
+# ENV AWS_ENDPOINT_URL=https://s3.amazonaws.com
+# ENV AWS_ACCESS_KEY_ID=<your-key>
+# ENV AWS_SECRET_ACCESS_KEY=<your-secret>
+# ENV AWS_DEFAULT_REGION=us-east-1
+
 # Labels
+ARG VERSION
 LABEL org.opencontainers.image.title="DeltaGlider" \
-      org.opencontainers.image.description="Delta-aware S3 file storage wrapper" \
-      org.opencontainers.image.version="0.1.0" \
+      org.opencontainers.image.description="Delta-aware S3 file storage wrapper with encryption" \
+      org.opencontainers.image.version="${VERSION}" \
       org.opencontainers.image.authors="Beshu Limited" \
       org.opencontainers.image.source="https://github.com/beshu-tech/deltaglider"
```
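Putting the `ARG VERSION` / setuptools-scm pieces together, a local build might look like the following; the version and tag name are illustrative:

```bash
# Build the image locally, pinning the package version written by setuptools-scm
docker build --build-arg VERSION=6.1.1 -t deltaglider:local .

# The image's entrypoint is the deltaglider CLI, so arguments pass straight through
docker run --rm deltaglider:local --version
```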
README.md, 215 lines changed:

````diff
@@ -6,14 +6,13 @@
 [![Python](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
 [![xdelta3](https://img.shields.io/badge/xdelta3-delta%20compression-green.svg)](https://github.com/jmacd/xdelta)

 <div align="center">
   <img src="https://github.com/beshu-tech/deltaglider/raw/main/docs/deltaglider.png" alt="DeltaGlider Logo" width="500"/>
 </div>

 **Store 4TB of similar files in 5GB. No, that's not a typo.**

 DeltaGlider is a drop-in S3 replacement that may achieve 99.9% size reduction for versioned compressed artifacts, backups, and release archives through intelligent binary delta compression (via xdelta3).

 > 🌟 Star if you like this! Or Leave a message in [Issues](https://github.com/beshu-tech/deltaglider/issues) - we are listening!

 ## The Problem We Solved

 You're storing hundreds of versions of your software releases. Each 100MB build differs by <1% from the previous version. You're paying to store 100GB of what's essentially 100MB of unique data.
@@ -26,12 +25,20 @@ From our [ReadOnlyREST case study](docs/case-study-readonlyrest.md):
 - **Before**: 201,840 files, 3.96TB storage, $1,120/year
 - **After**: Same files, 4.9GB storage, $1.32/year
 - **Compression**: 99.9% (not a typo)
 - **Integration time**: 5 minutes
+- **Data migration**: `deltaglider migrate s3://origin-bucket s3://dest-bucket`

+DeltaGlider is great for compressed archives of similar content, such as multiple releases of the same software, database backups, and so on.
+We don't expect significant benefit for multimedia content like video, but we haven't tested it.

 ## Quick Start

-The quickest way to start is using the GUI
-* https://github.com/sscarduzio/dg_commander/
+DeltaGlider comes as an SDK and a CLI, but we also have a GUI:
+* https://github.com/beshu-tech/deltaglider_commander/

+<div align="center">
+  <img src="https://github.com/beshu-tech/deltaglider/raw/main/docs/deltaglider.png" alt="DeltaGlider Logo"/>
+</div>

 ### CLI Installation
@@ -46,6 +53,63 @@ uv pip install deltaglider
 docker run -v ~/.aws:/root/.aws deltaglider/deltaglider --help
 ```

+### Docker Usage
+
+DeltaGlider provides a secure, production-ready Docker image with encryption always enabled:
+
+```bash
+# Basic usage with AWS credentials from environment
+docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY \
+  deltaglider/deltaglider ls s3://my-bucket/
+
+# Mount AWS credentials
+docker run -v ~/.aws:/root/.aws:ro \
+  deltaglider/deltaglider cp file.zip s3://releases/
+
+# Use memory cache for ephemeral CI/CD pipelines (faster)
+docker run -e DG_CACHE_BACKEND=memory \
+  -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY \
+  deltaglider/deltaglider sync ./dist/ s3://releases/v1.0.0/
+
+# Configure memory cache size (default: 100MB)
+docker run -e DG_CACHE_BACKEND=memory \
+  -e DG_CACHE_MEMORY_SIZE_MB=500 \
+  -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY \
+  deltaglider/deltaglider cp large-file.zip s3://releases/
+
+# Use MinIO or custom S3 endpoint
+docker run -e AWS_ENDPOINT_URL=http://minio:9000 \
+  -e AWS_ACCESS_KEY_ID=minioadmin \
+  -e AWS_SECRET_ACCESS_KEY=minioadmin \
+  deltaglider/deltaglider ls s3://test-bucket/
+
+# Persistent encryption key for cross-container cache sharing
+# (Only needed if sharing cache across containers via volume mount)
+docker run -v /shared-cache:/tmp/.deltaglider \
+  -e DG_CACHE_ENCRYPTION_KEY=$(openssl rand -base64 32) \
+  deltaglider/deltaglider cp file.zip s3://releases/
+```
+
+**Environment Variables**:
+- `DG_LOG_LEVEL`: Logging level (default: `INFO`, options: `DEBUG`, `INFO`, `WARNING`, `ERROR`)
+- `DG_MAX_RATIO`: Maximum delta/file ratio (default: `0.5`, range: `0.0-1.0`) - [📖 Complete Guide](docs/DG_MAX_RATIO.md)
+- `DG_CACHE_BACKEND`: Cache backend (default: `filesystem`, options: `filesystem`, `memory`)
+- `DG_CACHE_MEMORY_SIZE_MB`: Memory cache size in MB (default: `100`)
+- `DG_CACHE_ENCRYPTION_KEY`: Optional base64-encoded encryption key for cross-process cache sharing
+- `DG_DISABLE_EC2_DETECTION`: Disable EC2 instance detection (default: `false`, set to `true` to disable)
+- `AWS_ENDPOINT_URL`: S3 endpoint URL (default: AWS S3)
+- `AWS_ACCESS_KEY_ID`: AWS access key
+- `AWS_SECRET_ACCESS_KEY`: AWS secret key
+- `AWS_DEFAULT_REGION`: AWS region (default: `us-east-1`)
+
+> **💡 Tip**: `DG_MAX_RATIO` is a powerful tuning parameter. See the [DG_MAX_RATIO guide](docs/DG_MAX_RATIO.md) to learn how to optimize compression for your use case.
+
+**Security Notes**:
+- Encryption is **always enabled** (cannot be disabled)
+- Each container gets ephemeral encryption keys for maximum security
+- Corrupted cache files are automatically deleted
+- Use `DG_CACHE_ENCRYPTION_KEY` only for persistent cache sharing (store securely)
+
 ### Basic Usage

 ```bash
@@ -60,6 +124,9 @@ deltaglider ls s3://releases/

 # Sync directories
 deltaglider sync ./dist/ s3://releases/v1.0.0/

+# Migrate existing S3 bucket to DeltaGlider-compressed storage
+deltaglider migrate s3://old-bucket/ s3://new-bucket/
 ```

 **That's it!** DeltaGlider automatically detects similar files and applies 99%+ compression. For more commands and options, see [CLI Reference](#cli-reference).
@@ -76,11 +143,11 @@
 Traditional S3:

 With DeltaGlider:
 v1.0.0.zip (100MB) → S3: 100MB reference + 0KB delta
-v1.0.1.zip (100MB) → S3: 98KB delta (100.1MB total)
-v1.0.2.zip (100MB) → S3: 97KB delta (100.3MB total)
+v1.0.1.zip (100MB) → S3: 98KB delta (from 100.1MB total)
+v1.0.2.zip (100MB) → S3: 97KB delta (from 100.3MB total)
 ```

-DeltaGlider stores the first file as a reference and subsequent similar files as tiny deltas (differences). When you download, it reconstructs the original file perfectly using the reference + delta.
+DeltaGlider stores the first file in a directory (deltaspace) as a reference and subsequent similar files as tiny deltas (differences). When you download, it reconstructs the original file perfectly using the reference + delta.

 ### Intelligent File Type Detection
````
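Under the hood the deltas come from xdelta3. A rough equivalent of what happens for the second upload, expressed with the stock xdelta3 CLI; the file names are illustrative, not DeltaGlider's internal layout:

```bash
# Encode: diff the new artifact against the deltaspace reference
xdelta3 -e -s reference.bin v1.0.1.zip v1.0.1.zip.delta

# Decode: reconstruct the original artifact from reference + delta
xdelta3 -d -s reference.bin v1.0.1.zip.delta v1.0.1.zip
```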
````diff
@@ -100,7 +167,7 @@ DeltaGlider automatically detects file types and applies the optimal strategy:
 - **AWS CLI Replacement**: Same commands as `aws s3` with automatic compression
 - **boto3-Compatible SDK**: Works with existing boto3 code with minimal changes
 - **Zero Configuration**: No databases, no manifest files, no complex setup
-- **Data Integrity**: SHA256 verification on every operation
+- **Data Integrity**: The original file's SHA256 checksum is saved in S3 metadata and verified on every reconstruction
 - **S3 Compatible**: Works with AWS S3, MinIO, Cloudflare R2, and any S3-compatible storage

 ## CLI Reference
@@ -133,6 +200,28 @@ deltaglider sync s3://releases/ ./local-backup/  # Sync from S3
 deltaglider sync --delete ./src/ s3://backup/            # Mirror exactly
 deltaglider sync --exclude "*.log" ./src/ s3://backup/   # Exclude patterns

+# Get bucket statistics with intelligent S3-based caching
+deltaglider stats my-bucket                # Quick stats (~100ms with cache)
+deltaglider stats s3://my-bucket           # Also accepts s3:// format
+deltaglider stats s3://my-bucket/          # With or without trailing slash
+deltaglider stats my-bucket --sampled      # Balanced (one sample per deltaspace)
+deltaglider stats my-bucket --detailed     # Most accurate (slower, all metadata)
+deltaglider stats my-bucket --refresh      # Force cache refresh
+deltaglider stats my-bucket --no-cache     # Skip caching entirely
+deltaglider stats my-bucket --json         # JSON output for automation
+
+# Integrity verification & maintenance
+deltaglider verify s3://releases/file.zip  # Validate stored SHA256
+deltaglider purge my-bucket                # Clean expired .deltaglider/tmp files
+deltaglider purge my-bucket --dry-run      # Preview purge results
+deltaglider purge my-bucket --json         # Machine-readable purge stats
+
+# Migrate existing S3 buckets to DeltaGlider compression
+deltaglider migrate s3://old-bucket/ s3://new-bucket/        # Interactive migration
+deltaglider migrate s3://old-bucket/ s3://new-bucket/ --yes  # Skip confirmation
+deltaglider migrate --dry-run s3://old-bucket/ s3://new/     # Preview migration
+deltaglider migrate s3://bucket/v1/ s3://bucket/v2/          # Migrate prefixes

 # Works with MinIO, R2, and S3-compatible storage
 deltaglider cp file.zip s3://bucket/ --endpoint-url http://localhost:9000
 ```
@@ -207,14 +296,18 @@ with open('downloaded.zip', 'wb') as f:

 # Smart list_objects with optimized performance
 response = client.list_objects(Bucket='releases', Prefix='v2.0.0/')
 for obj in response['Contents']:
     print(f"{obj['Key']}: {obj['Size']} bytes")

 # Paginated listing for large buckets
 response = client.list_objects(Bucket='releases', MaxKeys=100)
-while response.is_truncated:
+while response.get('IsTruncated'):
     for obj in response['Contents']:
         print(obj['Key'])
     response = client.list_objects(
         Bucket='releases',
         MaxKeys=100,
-        ContinuationToken=response.next_continuation_token
+        ContinuationToken=response.get('NextContinuationToken')
     )

 # Delete and inspect objects
@@ -403,18 +496,18 @@

 ### System Architecture

-DeltaGlider uses a clean hexagonal architecture:
+DeltaGlider intelligently stores files within **DeltaSpaces** - S3 prefixes where related files share a common reference file for delta compression:

 ```
-┌─────────────┐     ┌──────────────┐     ┌─────────────┐
-│  Your App   │────▶│ DeltaGlider  │────▶│  S3/MinIO   │
-│  (CLI/SDK)  │     │     Core     │     │   Storage   │
-└─────────────┘     └──────────────┘     └─────────────┘
-                           │
-                    ┌──────▼───────┐
-                    │ Local Cache  │
-                    │ (References) │
-                    └──────────────┘
+┌─────────────┐     ┌──────────────┐     ┌─────────────────┐
+│  Your App   │────▶│ DeltaGlider  │────▶│   DeltaSpace    │
+│  (CLI/SDK)  │     │     Core     │     │  (S3 prefix)    │
+└─────────────┘     └──────────────┘     ├─────────────────┤
+                           │             │ reference.bin   │
+                    ┌──────▼───────┐     │ file1.delta     │
+                    │ Local Cache  │     │ file2.delta     │
+                    │ (References) │     │ file3.delta     │
+                    └──────────────┘     └─────────────────┘
 ```

 **Key Components:**
@@ -423,6 +516,9 @@
 - **Integrity verification**: SHA256 on every operation
 - **Local caching**: Fast repeated operations
 - **Zero dependencies**: No database, no manifest files
+- **Modular storage**: The storage layer is pluggable - you could easily replace S3 with a filesystem driver (using extended attributes for metadata) or any other backend
+
+The codebase follows a ports-and-adapters pattern where core business logic is decoupled from infrastructure, with storage operations abstracted through well-defined interfaces in the `ports/` directory and concrete implementations in `adapters/`.

 ### When to Use DeltaGlider
@@ -452,14 +548,63 @@ Migrating from `aws s3` to `deltaglider` is as simple as changing the command name:
 | `aws s3 rm s3://bucket/file` | `deltaglider rm s3://bucket/file` | - |
 | `aws s3 sync dir/ s3://bucket/` | `deltaglider sync dir/ s3://bucket/` | ✅ 99% incremental |

+### Migrating Existing S3 Buckets
+
+DeltaGlider provides a dedicated `migrate` command to compress your existing S3 data:
+
+```bash
+# Migrate an entire bucket
+deltaglider migrate s3://old-bucket/ s3://compressed-bucket/
+
+# Migrate a prefix (preserves prefix structure by default)
+deltaglider migrate s3://bucket/releases/ s3://bucket/archive/
+# Result: s3://bucket/archive/releases/ contains the files
+
+# Migrate without preserving source prefix
+deltaglider migrate --no-preserve-prefix s3://bucket/v1/ s3://bucket/archive/
+# Result: Files go directly into s3://bucket/archive/
+
+# Preview migration (dry run)
+deltaglider migrate --dry-run s3://old/ s3://new/
+
+# Skip confirmation prompt
+deltaglider migrate --yes s3://old/ s3://new/
+
+# Exclude certain file patterns
+deltaglider migrate --exclude "*.log" s3://old/ s3://new/
+```
+
+**Key Features:**
+- **Resume Support**: Migration automatically skips files that already exist in the destination
+- **Progress Tracking**: Shows real-time migration progress and statistics
+- **Safety First**: Interactive confirmation shows file count before starting
+- **EC2 Cost Optimization**: Automatically detects EC2 instance region and warns about cross-region charges
+  - ✅ Green checkmark when regions align (no extra charges)
+  - ℹ️ INFO when auto-detected mismatch (suggests optimal region)
+  - ⚠️ WARNING when user explicitly set wrong `--region` (expect data transfer costs)
+  - Disable with `DG_DISABLE_EC2_DETECTION=true` if needed
+- **AWS Region Transparency**: Displays the actual AWS region being used
+- **Prefix Preservation**: By default, source prefix is preserved in destination (use `--no-preserve-prefix` to disable)
+- **S3-to-S3 Transfer**: Both regular S3 and DeltaGlider buckets supported
+
+**Prefix Preservation Examples:**
+- `s3://src/data/` → `s3://dest/` creates `s3://dest/data/`
+- `s3://src/a/b/c/` → `s3://dest/x/` creates `s3://dest/x/c/`
+- Use `--no-preserve-prefix` to place files directly in destination without the source prefix
+
+The migration preserves all file names and structure while applying DeltaGlider's compression transparently.
+
 ## Production Ready

 - ✅ **Battle tested**: 200K+ files in production
 - ✅ **Data integrity**: SHA256 verification on every operation
+- ✅ **Cost optimization**: Automatic EC2 region detection warns about cross-region charges - [📖 EC2 Detection Guide](docs/EC2_REGION_DETECTION.md)
 - ✅ **S3 compatible**: Works with AWS, MinIO, Cloudflare R2, etc.
 - ✅ **Atomic operations**: No partial states
 - ✅ **Concurrent safe**: Multiple clients supported
-- ✅ **Well tested**: 95%+ code coverage
+- ✅ **Thoroughly tested**: 99 integration/unit tests, comprehensive test coverage
+- ✅ **Type safe**: Full mypy type checking, zero type errors
+- ✅ **Code quality**: Automated linting with ruff, clean codebase

 ## Development
@@ -471,9 +616,13 @@ cd deltaglider
 # Install with dev dependencies
 uv pip install -e ".[dev]"

-# Run tests
+# Run tests (99 integration/unit tests)
 uv run pytest

+# Run quality checks
+uv run ruff check src/   # Linting
+uv run mypy src/         # Type checking
+
 # Run with local MinIO
 docker-compose up -d
 export AWS_ENDPOINT_URL=http://localhost:9000
@@ -514,14 +663,8 @@ MIT - Use it freely in your projects.

 ## Success Stories

-> "We reduced our artifact storage from 4TB to 5GB. This isn't hyperbole—it's math."
-> — [ReadOnlyREST Case Study](docs/case-study-readonlyrest.md)
-
-> "Our CI/CD pipeline now uploads 100x faster. Deploys that took minutes now take seconds."
-> — Platform Engineer at [redacted]
-
-> "We were about to buy expensive deduplication storage. DeltaGlider saved us $50K/year."
-> — CTO at [stealth startup]
+> "We reduced our artifact storage from 4TB to 5GB. CI is also much faster, due to smaller uploads."
+> — [ReadonlyREST Case Study](docs/case-study-readonlyrest.md)

 ---
@@ -533,4 +676,10 @@ deltaglider analyze s3://your-bucket/
 # Output: "Potential savings: 95.2% (4.8TB → 237GB)"
 ```

-Built with ❤️ by engineers who were tired of paying to store the same bytes over and over.
+## Who built this?
+
+Built with ❤️ by [ReadonlyREST](https://readonlyrest.com) engineers who were tired of paying to store the same bytes over and over.
+
+We also built [Anaphora](https://anaphora.it) for aggregated reports and alerting.
+
+And [Deltaglider Commander](https://github.com/beshu-tech/deltaglider_commander).
````
630
SECURITY_FIX_ROADMAP.md
Normal file
630
SECURITY_FIX_ROADMAP.md
Normal file
@@ -0,0 +1,630 @@
|
||||
# 🛡️ DeltaGlider Security Fix Roadmap
|
||||
|
||||
## Executive Summary
|
||||
Critical security vulnerabilities have been identified in DeltaGlider's cache system that enable multi-user attacks, data exposure, and cache poisoning. This document provides a **chronological, actionable roadmap** to eliminate these threats through bold architectural changes.
|
||||
|
||||
**Key Innovation**: Instead of patching individual issues, we propose a **"Zero-Trust Cache Architecture"** that eliminates entire classes of vulnerabilities.
|
||||
|
||||
---
|
||||
|
||||
## 🚀 The Bold Solution: Ephemeral Signed Cache
|
||||
|
||||
### Core Concept
|
||||
Replace filesystem cache with **ephemeral, cryptographically-signed, user-isolated cache** that eliminates:
|
||||
- TOCTOU vulnerabilities (no shared filesystem)
|
||||
- Multi-user interference (process isolation)
|
||||
- Cache poisoning (cryptographic signatures)
|
||||
- Information disclosure (encrypted metadata)
|
||||
- Cross-endpoint collision (content-addressed storage)
|
||||
|
||||
**Note**: DeltaGlider is designed as a standalone CLI/SDK application. All solutions maintain this architecture without requiring external services.
|
||||
|
||||
---
|
||||
|
||||
## 📋 Implementation Roadmap
|
||||
|
||||
### **DAY 1-2: Emergency Hotfix** (v5.0.3) ✅ COMPLETED
|
||||
*Stop the bleeding - minimal changes for immediate deployment*
|
||||
|
||||
#### 1. **Ephemeral Process-Isolated Cache** (2 hours) ✅ COMPLETED
|
||||
```python
|
||||
# src/deltaglider/app/cli/main.py
|
||||
import tempfile
|
||||
import atexit
|
||||
|
||||
# SECURITY: Always use ephemeral process-isolated cache
|
||||
cache_dir = Path(tempfile.mkdtemp(prefix="deltaglider-", dir="/tmp"))
|
||||
atexit.register(lambda: shutil.rmtree(cache_dir, ignore_errors=True))
|
||||
```
|
||||
|
||||
**Impact**: Each process gets isolated cache, auto-cleaned on exit. Eliminates multi-user attacks.
|
||||
**Implementation**: All legacy shared cache code removed. Ephemeral cache is now the ONLY mode.
|
||||
|
||||
#### 2. **Add SHA Validation at Use-Time** (2 hours) ✅ COMPLETED
|
||||
```python
|
||||
# src/deltaglider/ports/cache.py
|
||||
class CachePort(Protocol):
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
"""Get reference with atomic SHA validation - MUST use this for all operations."""
|
||||
...
|
||||
|
||||
# src/deltaglider/adapters/cache_fs.py
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
path = self.ref_path(bucket, prefix)
|
||||
if not path.exists():
|
||||
raise CacheMissError(f"Cache miss for {bucket}/{prefix}")
|
||||
|
||||
# Lock file for atomic read (Unix only)
|
||||
with open(path, 'rb') as f:
|
||||
if sys.platform != "win32":
|
||||
fcntl.flock(f.fileno(), fcntl.LOCK_SH)
|
||||
content = f.read()
|
||||
actual_sha = hashlib.sha256(content).hexdigest()
|
||||
|
||||
if actual_sha != expected_sha:
|
||||
path.unlink() # Remove corrupted cache
|
||||
raise CacheCorruptionError(f"SHA mismatch: cache corrupted")
|
||||
|
||||
return path
|
||||
```
|
||||
|
||||
#### 3. **Update All Usage Points** (1 hour) ✅ COMPLETED
|
||||
```python
|
||||
# src/deltaglider/core/service.py
|
||||
# Replaced ALL instances in two locations:
|
||||
# - Line 234 (get method for decoding)
|
||||
# - Line 415 (_create_delta method for encoding)
|
||||
|
||||
ref_path = self.cache.get_validated_ref(
|
||||
delta_space.bucket,
|
||||
delta_space.prefix,
|
||||
ref_sha256 # Pass expected SHA
|
||||
)
|
||||
```
|
||||
|
||||
**Test & Deploy**: ✅ All 99 tests passing + ready for release
|
||||
|
||||
---
|
||||
|
||||
### **DAY 3-5: Quick Wins** (v5.0.3) ✅ COMPLETED
|
||||
*Low-risk improvements with high security impact*
|
||||
|
||||
#### 4. **Implement Content-Addressed Storage** (4 hours) ✅ COMPLETED
|
||||
```python
|
||||
# src/deltaglider/adapters/cache_cas.py
|
||||
class ContentAddressedCache(CachePort):
|
||||
"""Cache using SHA as filename - eliminates collisions"""
|
||||
|
||||
def ref_path(self, bucket: str, prefix: str, sha256: str) -> Path:
|
||||
# Use SHA as filename - guaranteed unique
|
||||
return self.base_dir / sha256[:2] / sha256[2:4] / sha256
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path, sha256: str) -> Path:
|
||||
path = self.ref_path(bucket, prefix, sha256)
|
||||
|
||||
# If file with this SHA exists, we're done (deduplication!)
|
||||
if path.exists():
|
||||
return path
|
||||
|
||||
# Atomic write
|
||||
path.parent.mkdir(parents=True, mode=0o700, exist_ok=True)
|
||||
tmp = path.with_suffix('.tmp')
|
||||
shutil.copy2(src, tmp)
|
||||
os.chmod(tmp, 0o600)
|
||||
|
||||
# Verify content before committing
|
||||
actual_sha = self.hasher.sha256(tmp)
|
||||
if actual_sha != sha256:
|
||||
tmp.unlink()
|
||||
raise ValueError("File corruption during cache write")
|
||||
|
||||
os.replace(tmp, path) # Atomic
|
||||
return path
|
||||
```
|
||||
|
||||
**Benefits**: ✅ ACHIEVED
|
||||
- Same file cached once regardless of bucket/prefix (automatic deduplication)
|
||||
- No collision possible (SHA256 uniqueness guarantees)
|
||||
- Natural cache validation (filename IS the checksum)
|
||||
- Two-level directory structure (ab/cd/abcdef...) for filesystem optimization
|
||||
|
||||
**Implementation**: Complete in `src/deltaglider/adapters/cache_cas.py` with:
|
||||
- `_cas_path()` method for SHA256-based path computation
|
||||
- `get_validated_ref()` with atomic validation and locking
|
||||
- `write_ref()` with atomic temp-file + rename pattern
|
||||
- Ephemeral deltaspace-to-SHA mapping for compatibility
|
||||
|
||||
#### 5. **Add Secure Directory Creation** (2 hours)
|
||||
```python
|
||||
# src/deltaglider/utils/secure_fs.py
|
||||
import os
|
||||
import stat
|
||||
|
||||
def secure_makedirs(path: Path, mode: int = 0o700) -> None:
|
||||
"""Create directory with secure permissions atomically."""
|
||||
try:
|
||||
path.mkdir(parents=True, mode=mode, exist_ok=False)
|
||||
except FileExistsError:
|
||||
# Verify it's ours and has correct permissions
|
||||
st = path.stat()
|
||||
if st.st_uid != os.getuid():
|
||||
raise SecurityError(f"Directory {path} owned by different user")
|
||||
if stat.S_IMODE(st.st_mode) != mode:
|
||||
os.chmod(path, mode) # Fix permissions
|
||||
```
|
||||
|
||||
#### 6. **Unify Cache Configuration** (1 hour)
|
||||
```python
|
||||
# src/deltaglider/config.py
|
||||
import os
import tempfile
from pathlib import Path


def get_cache_dir() -> Path | None:
|
||||
"""Single source of truth for cache directory."""
|
||||
if os.environ.get("DG_NO_CACHE") == "true":
|
||||
return None # Feature flag to disable cache
|
||||
|
||||
if os.environ.get("DG_EPHEMERAL_CACHE") == "true":
|
||||
return Path(tempfile.mkdtemp(prefix="dg-cache-"))
|
||||
|
||||
# User-specific cache by default
|
||||
cache_base = os.environ.get("DG_CACHE_DIR",
|
||||
os.path.expanduser("~/.cache/deltaglider"))
|
||||
return Path(cache_base) / "v2" # Version cache format
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### **DAY 6-10: Architecture Redesign** (v5.0.3) ✅ COMPLETED
|
||||
*The bold solution that eliminates entire vulnerability classes*
|
||||
|
||||
#### 7. **Implement Memory Cache with Encryption** (8 hours) ✅ COMPLETED
|
||||
```python
|
||||
# src/deltaglider/adapters/cache_memory.py
|
||||
class MemoryCache(CachePort):
|
||||
"""In-memory cache with LRU eviction and configurable size limits."""
|
||||
|
||||
def __init__(self, hasher: HashPort, max_size_mb: int = 100, temp_dir: Path | None = None):
self.hasher = hasher
self.max_size_bytes = max_size_mb * 1024 * 1024
self._current_size = 0
# Directory used when a cached entry must be materialized as a file
self.temp_dir = temp_dir or Path(tempfile.mkdtemp(prefix="dg-cache-"))
self._cache: dict[tuple[str, str], tuple[bytes, str]] = {} # (bucket, prefix) -> (content, SHA)
self._access_order: list[tuple[str, str]] = [] # LRU tracking
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path) -> Path:
|
||||
"""Write reference to in-memory cache with LRU eviction."""
|
||||
# Read content and compute SHA
|
||||
content = src.read_bytes()
|
||||
sha256 = self.hasher.sha256_bytes(content)
|
||||
|
||||
# Check if file fits in cache
|
||||
needed_bytes = len(content)
|
||||
if needed_bytes > self.max_size_bytes:
|
||||
raise CacheCorruptionError(f"File too large for cache: {needed_bytes} > {self.max_size_bytes}")
|
||||
|
||||
# Evict LRU if needed
|
||||
self._evict_lru(needed_bytes)
|
||||
|
||||
# Store in memory
|
||||
key = (bucket, prefix)
|
||||
self._cache[key] = (content, sha256)
|
||||
self._current_size += needed_bytes
|
||||
self._access_order.append(key)
|
||||
|
||||
return src # Return original path for compatibility
|
||||
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
"""Get cached reference with validation."""
|
||||
key = (bucket, prefix)
|
||||
if key not in self._cache:
|
||||
raise CacheMissError(f"Cache miss for {bucket}/{prefix}")
|
||||
|
||||
content, stored_sha = self._cache[key]
|
||||
|
||||
# Validate SHA matches
|
||||
if stored_sha != expected_sha:
|
||||
raise CacheCorruptionError(f"SHA mismatch for {bucket}/{prefix}")
|
||||
|
||||
# Update LRU order
|
||||
self._access_order.remove(key)
|
||||
self._access_order.append(key)
|
||||
|
||||
# Write to temp file for compatibility
|
||||
temp_path = self.temp_dir / f"{expected_sha}.bin"
|
||||
temp_path.write_bytes(content)
|
||||
return temp_path
|
||||
```
|
||||
|
||||
```python
# src/deltaglider/adapters/cache_encrypted.py
|
||||
class EncryptedCache(CachePort):
|
||||
"""Encrypted cache wrapper using Fernet symmetric encryption."""
|
||||
|
||||
def __init__(self, backend: CachePort, encryption_key: bytes | None = None):
|
||||
self.backend = backend
|
||||
|
||||
# Key management: ephemeral (default) or provided
|
||||
if encryption_key is None:
|
||||
self._key = Fernet.generate_key() # Ephemeral per process
|
||||
self._ephemeral = True
|
||||
else:
|
||||
self._key = encryption_key
|
||||
self._ephemeral = False
|
||||
|
||||
self._cipher = Fernet(self._key)
|
||||
# Track plaintext SHA since encrypted content has different SHA
|
||||
self._plaintext_sha_map: dict[tuple[str, str], str] = {}
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path) -> Path:
|
||||
"""Encrypt and cache reference file."""
|
||||
# Read plaintext and compute SHA
|
||||
plaintext_data = src.read_bytes()
|
||||
plaintext_sha = hashlib.sha256(plaintext_data).hexdigest()
|
||||
|
||||
# Encrypt data
|
||||
encrypted_data = self._cipher.encrypt(plaintext_data)
|
||||
|
||||
# Write encrypted data to temp file
|
||||
temp_encrypted = src.with_suffix(".encrypted.tmp")
|
||||
temp_encrypted.write_bytes(encrypted_data)
|
||||
|
||||
try:
|
||||
# Store encrypted file via backend
|
||||
result_path = self.backend.write_ref(bucket, prefix, temp_encrypted)
|
||||
|
||||
# Store plaintext SHA mapping
|
||||
key = (bucket, prefix)
|
||||
self._plaintext_sha_map[key] = plaintext_sha
|
||||
|
||||
return result_path
|
||||
finally:
|
||||
temp_encrypted.unlink(missing_ok=True)
|
||||
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
"""Get cached reference with decryption and validation."""
|
||||
# Verify we have the plaintext SHA mapped
|
||||
key = (bucket, prefix)
|
||||
if key not in self._plaintext_sha_map:
|
||||
raise CacheMissError(f"Cache miss for {bucket}/{prefix}")
|
||||
|
||||
if self._plaintext_sha_map[key] != expected_sha:
|
||||
raise CacheCorruptionError(f"SHA mismatch for {bucket}/{prefix}")
|
||||
|
||||
# Get encrypted file from backend
|
||||
encrypted_path = self.backend.ref_path(bucket, prefix)
|
||||
if not encrypted_path.exists():
|
||||
raise CacheMissError(f"Encrypted cache file not found")
|
||||
|
||||
# Decrypt content
|
||||
encrypted_data = encrypted_path.read_bytes()
|
||||
try:
|
||||
decrypted_data = self._cipher.decrypt(encrypted_data)
|
||||
except Exception as e:
|
||||
raise CacheCorruptionError(f"Decryption failed: {e}") from e
|
||||
|
||||
# Validate plaintext SHA
|
||||
actual_sha = hashlib.sha256(decrypted_data).hexdigest()
|
||||
if actual_sha != expected_sha:
|
||||
raise CacheCorruptionError(f"Decrypted content SHA mismatch")
|
||||
|
||||
# Write decrypted content to temp file
|
||||
decrypted_path = encrypted_path.with_suffix(".decrypted")
|
||||
decrypted_path.write_bytes(decrypted_data)
|
||||
return decrypted_path
|
||||
```
|
||||
|
||||
**Implementation**: ✅ COMPLETED
|
||||
- **MemoryCache**: In-memory cache with LRU eviction, configurable size limits, zero filesystem I/O
|
||||
- **EncryptedCache**: Fernet (AES-128-CBC + HMAC) encryption wrapper, ephemeral keys by default
|
||||
- **Configuration**: `DG_CACHE_BACKEND` (filesystem/memory), `DG_CACHE_ENCRYPTION` (true/false)
|
||||
- **Environment Variables**: `DG_CACHE_MEMORY_SIZE_MB`, `DG_CACHE_ENCRYPTION_KEY`
|
||||
|
||||
**Benefits**: ✅ ACHIEVED
|
||||
- No filesystem access for memory cache = no permission issues
|
||||
- Encrypted at rest = secure cache storage
|
||||
- Per-process ephemeral keys = forward secrecy and process isolation
|
||||
- LRU eviction = prevents memory exhaustion
|
||||
- Zero TOCTOU window = memory operations are atomic
|
||||
- Configurable backends = flexibility for different use cases
|
||||
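
As a usage sketch (assuming the configuration variables listed above are read when the client is constructed), enabling the encrypted in-memory backend looks like this; the values shown are examples only:

```python
import os

# Select the in-memory backend with a 200 MB LRU budget and encryption enabled.
os.environ["DG_CACHE_BACKEND"] = "memory"
os.environ["DG_CACHE_MEMORY_SIZE_MB"] = "200"
os.environ["DG_CACHE_ENCRYPTION"] = "true"

from deltaglider import create_client  # noqa: E402 (environment must be set before client creation)

client = create_client()  # cache backend chosen from the environment at construction time
```
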
|
||||
#### 8. **Implement Signed Cache Entries** (6 hours)
|
||||
```python
|
||||
# src/deltaglider/adapters/cache_signed.py
|
||||
import hmac
|
||||
import json
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
class SignedCache(CachePort):
|
||||
"""Cache with cryptographic signatures and expiry."""
|
||||
|
||||
def __init__(self, base_dir: Path, secret_key: bytes | None = None):
|
||||
self.base_dir = base_dir
|
||||
# Per-session key if not provided
|
||||
self.secret = secret_key or os.urandom(32)
|
||||
|
||||
def _sign_metadata(self, metadata: dict) -> str:
|
||||
"""Create HMAC signature for metadata."""
|
||||
json_meta = json.dumps(metadata, sort_keys=True)
|
||||
signature = hmac.new(
|
||||
self.secret,
|
||||
json_meta.encode(),
|
||||
hashlib.sha256
|
||||
).hexdigest()
|
||||
return signature
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path, sha256: str) -> Path:
|
||||
# Create signed metadata
|
||||
metadata = {
|
||||
"sha256": sha256,
|
||||
"bucket": bucket,
|
||||
"prefix": prefix,
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"expires": (datetime.utcnow() + timedelta(hours=24)).isoformat(),
|
||||
"pid": os.getpid(),
|
||||
"uid": os.getuid(),
|
||||
}
|
||||
signature = self._sign_metadata(metadata)
|
||||
|
||||
# Store data + metadata
|
||||
cache_dir = self.base_dir / signature[:8] # Use signature prefix as namespace
|
||||
cache_dir.mkdir(parents=True, mode=0o700, exist_ok=True)
|
||||
|
||||
data_path = cache_dir / f"{sha256}.bin"
|
||||
meta_path = cache_dir / f"{sha256}.meta"
|
||||
|
||||
# Atomic writes
|
||||
shutil.copy2(src, data_path)
|
||||
os.chmod(data_path, 0o600)
|
||||
|
||||
with open(meta_path, 'w') as f:
|
||||
json.dump({"metadata": metadata, "signature": signature}, f)
|
||||
os.chmod(meta_path, 0o600)
|
||||
|
||||
return data_path
|
||||
|
||||
def get_validated_ref(self, bucket: str, prefix: str, sha256: str) -> Path:
|
||||
# Find and validate signed entry
|
||||
matches = list(self.base_dir.glob(f"*/{sha256}.meta"))
|
||||
|
||||
for meta_path in matches:
|
||||
with open(meta_path) as f:
|
||||
entry = json.load(f)
|
||||
|
||||
# Verify signature
|
||||
expected_sig = self._sign_metadata(entry["metadata"])
|
||||
if not hmac.compare_digest(entry["signature"], expected_sig):
|
||||
meta_path.unlink() # Remove tampered entry
|
||||
continue
|
||||
|
||||
# Check expiry
|
||||
expires = datetime.fromisoformat(entry["metadata"]["expires"])
|
||||
if datetime.utcnow() > expires:
|
||||
meta_path.unlink()
|
||||
continue
|
||||
|
||||
# Validate data integrity
|
||||
data_path = meta_path.with_suffix('.bin')
|
||||
actual_sha = self.hasher.sha256(data_path)
|
||||
if actual_sha != sha256:
|
||||
data_path.unlink()
|
||||
meta_path.unlink()
|
||||
continue
|
||||
|
||||
return data_path
|
||||
|
||||
raise CacheMissError(f"No valid cache entry for {sha256}")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### **DAY 11-15: Advanced Security** (v6.0.0)
|
||||
*Next-generation features for standalone security*
|
||||
|
||||
#### 9. **Add Integrity Monitoring** (4 hours)
|
||||
```python
|
||||
# src/deltaglider/security/monitor.py
# NOTE: schematic sketch - assumes an asyncio-friendly inotify binding;
# the exact import and event constants depend on the chosen library.
import inotify
import logging
|
||||
|
||||
class CacheIntegrityMonitor:
|
||||
"""Detect and alert on cache tampering attempts."""
|
||||
|
||||
def __init__(self, cache_dir: Path):
|
||||
self.cache_dir = cache_dir
|
||||
self.notifier = inotify.INotify()
|
||||
self.watch_desc = self.notifier.add_watch(
|
||||
str(cache_dir),
|
||||
inotify.IN_MODIFY | inotify.IN_DELETE | inotify.IN_ATTRIB
|
||||
)
|
||||
self.logger = logging.getLogger("security")
|
||||
|
||||
async def monitor(self):
|
||||
"""Monitor for unauthorized cache modifications."""
|
||||
async for event in self.notifier:
|
||||
if event.mask & inotify.IN_MODIFY:
|
||||
# File modified - verify it was by our process
|
||||
if not self._is_our_modification(event):
|
||||
self.logger.critical(
|
||||
f"SECURITY: Unauthorized cache modification detected: {event.path}"
|
||||
)
|
||||
# Immediately invalidate affected cache
|
||||
Path(event.path).unlink(missing_ok=True)
|
||||
|
||||
elif event.mask & inotify.IN_ATTRIB:
|
||||
# Permission change - always suspicious
|
||||
self.logger.warning(
|
||||
f"SECURITY: Cache permission change: {event.path}"
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### **DAY 16-20: Testing & Rollout** (v6.0.0 release)
|
||||
|
||||
#### 10. **Security Test Suite** (8 hours)
|
||||
```python
|
||||
# tests/security/test_cache_attacks.py
|
||||
import pytest
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
|
||||
class TestCacheSecurity:
|
||||
"""Test all known attack vectors."""
|
||||
|
||||
def test_toctou_attack_prevented(self, cache):
|
||||
"""Verify TOCTOU window is eliminated."""
|
||||
sha = "abc123"
|
||||
cache.write_ref("bucket", "prefix", test_file, sha)
|
||||
|
||||
# Attacker thread tries to replace file during read
|
||||
def attacker():
|
||||
time.sleep(0.0001) # Try to hit the TOCTOU window
|
||||
cache_path = cache.ref_path("bucket", "prefix", sha)
|
||||
cache_path.write_bytes(b"malicious")
|
||||
|
||||
thread = threading.Thread(target=attacker)
|
||||
thread.start()
|
||||
|
||||
# Should detect tampering
|
||||
with pytest.raises(CacheCorruptionError):
|
||||
cache.get_validated_ref("bucket", "prefix", sha)
|
||||
|
||||
def test_multi_user_isolation(self, cache):
|
||||
"""Verify users can't access each other's cache."""
|
||||
# Create cache as user A
|
||||
cache_a = SignedCache(Path("/tmp/cache"), secret_key=b"key_a")
|
||||
cache_a.write_ref("bucket", "prefix", test_file, "sha_a")
|
||||
|
||||
# Try to read as user B with different key
|
||||
cache_b = SignedCache(Path("/tmp/cache"), secret_key=b"key_b")
|
||||
|
||||
with pytest.raises(CacheMissError):
|
||||
cache_b.get_validated_ref("bucket", "prefix", "sha_a")
|
||||
|
||||
def test_cache_poisoning_prevented(self, cache):
|
||||
"""Verify corrupted cache is detected."""
|
||||
sha = "abc123"
|
||||
cache.write_ref("bucket", "prefix", test_file, sha)
|
||||
|
||||
# Corrupt the cache file
|
||||
cache_path = cache.ref_path("bucket", "prefix", sha)
|
||||
with open(cache_path, 'ab') as f:
|
||||
f.write(b"corrupted")
|
||||
|
||||
# Should detect corruption
|
||||
with pytest.raises(CacheCorruptionError):
|
||||
cache.get_validated_ref("bucket", "prefix", sha)
|
||||
```
|
||||
|
||||
#### 11. **Migration Guide** (4 hours)
|
||||
```python
|
||||
# src/deltaglider/migration/v5_to_v6.py
|
||||
def migrate_cache():
|
||||
"""Migrate from v5 shared cache to v6 secure cache."""
|
||||
old_cache = Path("/tmp/.deltaglider/cache")
|
||||
|
||||
if old_cache.exists():
|
||||
print("WARNING: Old insecure cache detected at", old_cache)
|
||||
print("This cache had security vulnerabilities and will not be migrated.")
|
||||
|
||||
response = input("Delete old cache? [y/N]: ")
|
||||
if response.lower() == 'y':
|
||||
shutil.rmtree(old_cache)
|
||||
print("Old cache deleted. New secure cache will be created on demand.")
|
||||
else:
|
||||
print("Old cache retained at", old_cache)
|
||||
print("Set DG_CACHE_DIR to use a different location.")
|
||||
```
|
||||
|
||||
#### 12. **Performance Benchmarks** (4 hours)
|
||||
```python
|
||||
# benchmarks/cache_performance.py
|
||||
def benchmark_cache_implementations():
|
||||
"""Compare performance of cache implementations."""
|
||||
|
||||
implementations = [
|
||||
("Filesystem (v5)", FsCacheAdapter),
|
||||
("Content-Addressed", ContentAddressedCache),
|
||||
("Memory", MemoryCache),
|
||||
("Signed", SignedCache),
|
||||
]
|
||||
|
||||
for name, cache_class in implementations:
|
||||
cache = cache_class(test_dir)
|
||||
|
||||
# Measure write performance
|
||||
start = time.perf_counter()
|
||||
for i in range(1000):
|
||||
cache.write_ref("bucket", f"prefix{i}", test_file, f"sha{i}")
|
||||
write_time = time.perf_counter() - start
|
||||
|
||||
# Measure read performance
|
||||
start = time.perf_counter()
|
||||
for i in range(1000):
|
||||
cache.get_validated_ref("bucket", f"prefix{i}", f"sha{i}")
|
||||
read_time = time.perf_counter() - start
|
||||
|
||||
print(f"{name}: Write={write_time:.3f}s Read={read_time:.3f}s")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📊 Decision Matrix
|
||||
|
||||
| Solution | Security | Performance | Complexity | Breaking Change |
|
||||
|----------|----------|-------------|------------|-----------------|
|
||||
| Hotfix (Day 1-2) | ⭐⭐⭐ | ⭐⭐ | ⭐ | No |
|
||||
| Content-Addressed | ⭐⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐ | No |
|
||||
| Memory Cache | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐ | No |
|
||||
| Signed Cache | ⭐⭐⭐⭐⭐ | ⭐⭐⭐ | ⭐⭐⭐ | No |
|
||||
|
||||
---
|
||||
|
||||
## 🎯 Recommended Approach
|
||||
|
||||
### For Immediate Production (Next 48 hours)
|
||||
Deploy **Hotfix v5.0.3** with ephemeral cache + SHA validation
|
||||
|
||||
### For Next Release (1 week)
|
||||
Implement **Content-Addressed Storage** (v5.1.0) - best balance of security and simplicity
|
||||
|
||||
### For Enterprise (1 month)
|
||||
Deploy **Signed Cache** (v6.0.0) for maximum security with built-in TTL and integrity
|
||||
|
||||
---
|
||||
|
||||
## 🚦 Success Metrics
|
||||
|
||||
After implementation, verify:
|
||||
|
||||
1. **Security Tests Pass**: All attack vectors prevented
|
||||
2. **Performance Maintained**: <10% degradation vs v5
|
||||
3. **Zero CVEs**: No security vulnerabilities in cache
|
||||
4. **User Isolation**: Multi-user systems work safely
|
||||
5. **Backward Compatible**: Existing workflows unaffected
|
||||
|
||||
---
|
||||
|
||||
## 📞 Support
|
||||
|
||||
For questions or security concerns:
|
||||
- Security Team: security@deltaglider.io
|
||||
- Lead Developer: @architect
|
||||
- Immediate Issues: Create SECURITY labeled issue
|
||||
|
||||
---
|
||||
|
||||
## ⚠️ Disclosure Timeline
|
||||
|
||||
- **Day 0**: Vulnerabilities discovered
|
||||
- **Day 1**: Hotfix released (v5.0.3)
|
||||
- **Day 7**: Improved version released (v5.1.0)
|
||||
- **Day 30**: Full disclosure published
|
||||
- **Day 45**: v6.0.0 with complete redesign
|
||||
|
||||
---
|
||||
|
||||
*Document Version: 1.0*
|
||||
*Classification: SENSITIVE - INTERNAL USE ONLY*
|
||||
*Last Updated: 2024-10-09*
|
||||
@@ -2,7 +2,7 @@ version: '3.8'
|
||||
|
||||
services:
|
||||
localstack:
|
||||
image: localstack/localstack:latest
|
||||
image: localstack/localstack:4.4
|
||||
ports:
|
||||
- "4566:4566"
|
||||
environment:
|
||||
|
||||
@@ -22,7 +22,7 @@ services:
|
||||
retries: 5
|
||||
|
||||
localstack:
|
||||
image: localstack/localstack:latest
|
||||
image: localstack/localstack:4.4
|
||||
container_name: deltaglider-localstack
|
||||
ports:
|
||||
- "4566:4566"
|
||||
|
||||
84
docs/BOTO3_COMPATIBILITY_VISION.md
Normal file
@@ -0,0 +1,84 @@
|
||||
# boto3 Compatibility Vision
|
||||
|
||||
DeltaGlider is a drop-in replacement for boto3's S3 client. This document spells out what “drop-in”
|
||||
means in practice so new projects can adopt the SDK with confidence.
|
||||
|
||||
## Current State (v5.x and newer)
|
||||
|
||||
- `DeltaGliderClient` methods such as `list_objects`, `put_object`, `get_object`, `delete_object`,
|
||||
`delete_objects`, `head_object`, etc. return **boto3-compatible dicts**.
|
||||
- TypedDict aliases in `deltaglider.types` (e.g. `ListObjectsV2Response`, `PutObjectResponse`) give
|
||||
IDE/type-checking support without importing boto3.
|
||||
- DeltaGlider-specific metadata lives inside standard boto3 fields (typically `Metadata`), so tools
|
||||
that ignore those keys see the exact same structures as they would from boto3.
|
||||
- Tests and documentation exercise and describe the boto3-style responses (`response['Contents']`
|
||||
instead of `response.contents`).
|
||||
|
||||
```python
|
||||
from deltaglider import create_client, ListObjectsV2Response
|
||||
|
||||
client = create_client()
|
||||
response: ListObjectsV2Response = client.list_objects(Bucket='my-bucket')
|
||||
|
||||
for obj in response['Contents']:
|
||||
print(f"{obj['Key']}: {obj['Size']} bytes")
|
||||
```
|
||||
|
||||
## Key Design Points
|
||||
|
||||
- **TypedDict everywhere** – `put_object`, `get_object`, `list_objects`, `delete_object`, etc.
|
||||
return the same shapes boto3 does. Use the provided aliases (`ListObjectsV2Response`,
|
||||
`PutObjectResponse`, …) for IDE/completion help.
|
||||
- **Metadata namespace** – DeltaGlider-specific flags such as `deltaglider-is-delta` live under the
|
||||
regular `Metadata` key so every response remains valid boto3 output.
|
||||
- **No shims required** – responses are plain dicts. If you already know boto3, you already know how
|
||||
to consume DeltaGlider outputs.
|
||||
|
||||
## Benefits Summary
|
||||
|
||||
### For Users
|
||||
- **Zero learning curve** – identical data structures to boto3.
|
||||
- **Tooling compatibility** – works with any boto3-aware tool or library.
|
||||
- **Type safety** – TypedDicts provide IDE autocomplete even without boto3 installed.
|
||||
|
||||
### For DeltaGlider
|
||||
- **Cleaner internals** – no custom dataclasses to maintain.
|
||||
- **Simpler docs/tests** – examples mirror boto3 verbatim.
|
||||
- **Marketing accuracy** – "drop-in replacement" is now literal.
|
||||
|
||||
## Technical Details
|
||||
|
||||
### TypedDict refresher
|
||||
```python
|
||||
from typing import TypedDict
|
||||
|
||||
class MyResponse(TypedDict):
|
||||
Key: str
|
||||
Size: int
|
||||
|
||||
resp: MyResponse = {'Key': 'file.zip', 'Size': 1024}
|
||||
print(type(resp)) # <class 'dict'>
|
||||
```
|
||||
At runtime the structure is still a plain `dict`, but static type-checkers understand the shape.
|
||||
|
||||
### DeltaGlider Metadata
|
||||
|
||||
Delta-specific fields live inside the standard `Metadata` map. Example list_objects entry:
|
||||
```python
|
||||
{
|
||||
'Key': 'file.zip',
|
||||
'Size': 1024,
|
||||
'Metadata': {
|
||||
'deltaglider-is-delta': 'true',
|
||||
'deltaglider-compression-ratio': '0.99',
|
||||
'deltaglider-original-size': '50000000',
|
||||
}
|
||||
}
|
||||
```
|
||||
These keys are namespaced (`deltaglider-...`) so they are safe to ignore if not needed.
|
||||
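
For example, a consumer that cares about delta statistics can read them straight out of the standard response shape, and anything else can ignore the keys entirely. A small sketch, assuming `head_object` exposes the same `Metadata` map as the list entry above:

```python
from deltaglider import create_client

client = create_client()
head = client.head_object(Bucket='my-bucket', Key='file.zip')

meta = head.get('Metadata', {})
if meta.get('deltaglider-is-delta') == 'true':
    ratio = float(meta.get('deltaglider-compression-ratio', '0'))
    print(f"stored as delta, compression ratio {ratio:.2f}")
else:
    print("stored as a plain object")
```
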
|
||||
## Status Snapshot
|
||||
|
||||
- ✅ TypedDict builders are used everywhere (`build_list_objects_response`, etc.).
|
||||
- ✅ Tests assert boto3-style dict access (`response['Contents']`).
|
||||
- ✅ Documentation (README, SDK docs, examples) shows the boto3 syntax.
|
||||
684
docs/CACHE_MANAGEMENT.md
Normal file
@@ -0,0 +1,684 @@
|
||||
# Cache Management for Long-Running Applications
|
||||
|
||||
## Overview
|
||||
|
||||
DeltaGlider uses caching to store reference files locally for efficient delta compression. However, unlike the CLI which automatically cleans up cache on exit, **programmatic SDK usage requires manual cache management** for long-running applications.
|
||||
|
||||
This guide explains how to manage cache in production applications, including:
|
||||
- When and how to clear cache
|
||||
- Encryption key management strategies
|
||||
- Memory vs. filesystem cache trade-offs
|
||||
- Best practices for different application types
|
||||
|
||||
## The Problem
|
||||
|
||||
**CLI (Automatic Cleanup)**:
|
||||
```bash
|
||||
# Cache created in /tmp/deltaglider-xyz123/
|
||||
deltaglider cp file.zip s3://bucket/
|
||||
|
||||
# Process exits → Cache automatically deleted via atexit handler
|
||||
# ✅ No manual cleanup needed
|
||||
```
|
||||
|
||||
**SDK (Manual Cleanup Required)**:
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Long-running application (runs for hours/days)
|
||||
while True:
|
||||
client.put_object(Bucket='releases', Key='file.zip', Body=data)
|
||||
time.sleep(600) # Upload every 10 minutes
|
||||
|
||||
# ❌ Process never exits → Cache never cleaned
|
||||
# ❌ Cache grows indefinitely
|
||||
# ❌ Memory/disk exhaustion after days/weeks
|
||||
```
|
||||
|
||||
## Solution: Manual Cache Management
|
||||
|
||||
### Basic Cache Clearing
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Do some uploads
|
||||
client.put_object(Bucket='releases', Key='file1.zip', Body=data1)
|
||||
client.put_object(Bucket='releases', Key='file2.zip', Body=data2)
|
||||
|
||||
# Clear cache to free resources
|
||||
client.clear_cache()
|
||||
|
||||
# ✅ All cached references removed
|
||||
# ✅ Memory/disk freed
|
||||
# ✅ Next upload will fetch fresh reference from S3
|
||||
```
|
||||
|
||||
### When to Clear Cache
|
||||
|
||||
| Scenario | Frequency | Reason |
|
||||
|----------|-----------|--------|
|
||||
| **Long-running services** | Every 1-4 hours | Prevent memory/disk growth |
|
||||
| **After config changes** | Immediately | Old cache is invalid |
|
||||
| **High memory pressure** | As needed | Free resources |
|
||||
| **Test cleanup** | After each test | Ensure clean state |
|
||||
| **Scheduled jobs** | After job completes | Clean up before next run |
|
||||
| **Key rotation** | After rotation | Old encrypted cache unusable |
|
||||
|
||||
## Cache Strategies by Application Type
|
||||
|
||||
### 1. Long-Running Background Service
|
||||
|
||||
**Scenario**: Continuous upload service running 24/7
|
||||
|
||||
```python
|
||||
from datetime import datetime

from deltaglider import create_client
import schedule
import time
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
client = create_client()
|
||||
|
||||
def upload_task():
|
||||
"""Upload latest build."""
|
||||
try:
|
||||
with open('latest-build.zip', 'rb') as f:
|
||||
response = client.put_object(
|
||||
Bucket='releases',
|
||||
Key=f'builds/{datetime.now().isoformat()}.zip',
|
||||
Body=f
|
||||
)
|
||||
logger.info(f"Uploaded: {response['ETag']}")
|
||||
except Exception as e:
|
||||
logger.error(f"Upload failed: {e}")
|
||||
|
||||
def cleanup_task():
|
||||
"""Clear cache to prevent growth."""
|
||||
client.clear_cache()
|
||||
logger.info("Cache cleared - freed resources")
|
||||
|
||||
# Upload every 10 minutes
|
||||
schedule.every(10).minutes.do(upload_task)
|
||||
|
||||
# Clear cache every 2 hours (balance performance vs. memory)
|
||||
schedule.every(2).hours.do(cleanup_task)
|
||||
|
||||
# Run indefinitely
|
||||
while True:
|
||||
schedule.run_pending()
|
||||
time.sleep(60)
|
||||
```
|
||||
|
||||
**Cache Clearing Frequency Guidelines**:
|
||||
- Every 1 hour: High upload frequency (>100/day), memory-constrained
|
||||
- Every 2-4 hours: Moderate upload frequency (10-100/day), normal memory
|
||||
- Every 6-12 hours: Low upload frequency (<10/day), abundant memory
|
||||
|
||||
### 2. Periodic Batch Job
|
||||
|
||||
**Scenario**: Daily backup script
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
import glob
|
||||
from pathlib import Path
|
||||
|
||||
def daily_backup():
|
||||
"""Upload daily backups and clean up."""
|
||||
client = create_client()
|
||||
|
||||
try:
|
||||
# Upload all backup files
|
||||
for backup in glob.glob('/backups/*.zip'):
|
||||
with open(backup, 'rb') as f:
|
||||
client.put_object(
|
||||
Bucket='backups',
|
||||
Key=f'daily/{Path(backup).name}',
|
||||
Body=f
|
||||
)
|
||||
print(f"Backed up: {backup}")
|
||||
|
||||
finally:
|
||||
# ALWAYS clear cache at end of job
|
||||
client.clear_cache()
|
||||
print("Cache cleared - job complete")
|
||||
|
||||
# Run daily via cron/systemd timer
|
||||
if __name__ == '__main__':
|
||||
daily_backup()
|
||||
```
|
||||
|
||||
**Best Practice**: Always clear cache in `finally` block to ensure cleanup even if job fails.
|
||||
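
A small helper can make this clear-on-exit pattern reusable across batch jobs. This is a sketch built only on the public `create_client()` / `clear_cache()` calls shown above; the helper name is illustrative:

```python
from contextlib import contextmanager

from deltaglider import create_client


@contextmanager
def deltaglider_session():
    """Yield a client and always clear its cache, even if the job fails."""
    client = create_client()
    try:
        yield client
    finally:
        client.clear_cache()


# Usage in a batch job
with deltaglider_session() as client:
    client.put_object(Bucket='backups', Key='daily/db.zip', Body=b'...')
```
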
|
||||
### 3. Web Application / API Server
|
||||
|
||||
**Scenario**: Flask/FastAPI app with upload endpoints
|
||||
|
||||
```python
|
||||
from fastapi import FastAPI, UploadFile, BackgroundTasks
|
||||
from deltaglider import create_client
|
||||
import asyncio
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
# Create client once at startup
|
||||
client = create_client()
|
||||
|
||||
async def periodic_cache_cleanup():
|
||||
"""Background task to clear cache periodically."""
|
||||
while True:
|
||||
await asyncio.sleep(3600) # Every hour
|
||||
client.clear_cache()
|
||||
print("Cache cleared in background")
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup_event():
|
||||
"""Start background cache cleanup task."""
|
||||
asyncio.create_task(periodic_cache_cleanup())
|
||||
|
||||
@app.post("/upload")
|
||||
async def upload_file(file: UploadFile):
|
||||
"""Upload endpoint with automatic cache management."""
|
||||
content = await file.read()
|
||||
|
||||
response = client.put_object(
|
||||
Bucket='uploads',
|
||||
Key=f'files/{file.filename}',
|
||||
Body=content
|
||||
)
|
||||
|
||||
return {"message": "Uploaded", "etag": response['ETag']}
|
||||
|
||||
@app.post("/admin/clear-cache")
|
||||
async def admin_clear_cache():
|
||||
"""Manual cache clear endpoint for admin."""
|
||||
client.clear_cache()
|
||||
return {"message": "Cache cleared"}
|
||||
```
|
||||
|
||||
**Best Practice**: Run periodic cache cleanup in background task, provide manual clear endpoint for emergencies.
|
||||
|
||||
### 4. Testing / CI/CD
|
||||
|
||||
**Scenario**: Test suite using DeltaGlider
|
||||
|
||||
```python
|
||||
import pytest
|
||||
from deltaglider import create_client
|
||||
|
||||
@pytest.fixture
|
||||
def deltaglider_client():
|
||||
"""Provide clean client for each test."""
|
||||
client = create_client()
|
||||
yield client
|
||||
# ALWAYS clear cache after test
|
||||
client.clear_cache()
|
||||
|
||||
def test_upload(deltaglider_client):
|
||||
"""Test upload with automatic cleanup."""
|
||||
response = deltaglider_client.put_object(
|
||||
Bucket='test-bucket',
|
||||
Key='test-file.zip',
|
||||
Body=b'test data'
|
||||
)
|
||||
assert response['ETag'] is not None
|
||||
# Cache automatically cleared by fixture
|
||||
|
||||
def test_download(deltaglider_client):
|
||||
"""Test download with clean cache."""
|
||||
# Cache is clean from previous test
|
||||
deltaglider_client.put_object(Bucket='test', Key='file.zip', Body=b'data')
|
||||
response = deltaglider_client.get_object(Bucket='test', Key='file.zip')
|
||||
assert response['Body'].read() == b'data'
|
||||
# Cache automatically cleared by fixture
|
||||
```
|
||||
|
||||
**Best Practice**: Use pytest fixtures to ensure cache is cleared after each test.
|
||||
|
||||
### 5. AWS Lambda / Serverless
|
||||
|
||||
**Scenario**: Lambda function with warm container reuse
|
||||
|
||||
```python
|
||||
import os
|
||||
from deltaglider import create_client
|
||||
|
||||
# Initialize client outside handler (reused across invocations)
|
||||
client = create_client()
|
||||
|
||||
# Track invocation count for cache clearing
|
||||
invocation_count = 0
|
||||
|
||||
def lambda_handler(event, context):
|
||||
"""Lambda handler with periodic cache clearing."""
|
||||
global invocation_count
|
||||
invocation_count += 1
|
||||
|
||||
try:
|
||||
# Upload file
|
||||
response = client.put_object(
|
||||
Bucket='lambda-uploads',
|
||||
Key=event['filename'],
|
||||
Body=event['data']
|
||||
)
|
||||
|
||||
# Clear cache every 50 invocations (warm container optimization)
|
||||
if invocation_count % 50 == 0:
|
||||
client.clear_cache()
|
||||
print(f"Cache cleared after {invocation_count} invocations")
|
||||
|
||||
return {'statusCode': 200, 'etag': response['ETag']}
|
||||
|
||||
except Exception as e:
|
||||
# Clear cache on error to prevent poisoned state
|
||||
client.clear_cache()
|
||||
return {'statusCode': 500, 'error': str(e)}
|
||||
```
|
||||
|
||||
**Best Practice**: Clear cache periodically (every N invocations) and on errors. Lambda warm containers can reuse cache across invocations for performance.
|
||||
|
||||
## Encryption Key Management
|
||||
|
||||
DeltaGlider always encrypts cache data. Understanding key management is critical for programmatic usage.
|
||||
|
||||
### Ephemeral Keys (Default - Recommended)
|
||||
|
||||
**How It Works**:
|
||||
- New encryption key generated per client instance
|
||||
- Cache encrypted with instance-specific key
|
||||
- Key lost when client is garbage collected
|
||||
- **Maximum security** - keys never persist
|
||||
|
||||
**When to Use**:
|
||||
- Single-process applications
|
||||
- Short-lived scripts
|
||||
- CI/CD pipelines
|
||||
- Testing
|
||||
- Maximum security requirements
|
||||
|
||||
**Example**:
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
# Create client (generates ephemeral key automatically)
|
||||
client = create_client()
|
||||
|
||||
# Upload file (encrypted with ephemeral key)
|
||||
client.put_object(Bucket='bucket', Key='file.zip', Body=data)
|
||||
|
||||
# Clear cache
|
||||
client.clear_cache()
|
||||
|
||||
# ✅ Encrypted cache cleared
|
||||
# ✅ Key was never persisted
|
||||
# ✅ Perfect forward secrecy
|
||||
```
|
||||
|
||||
**Characteristics**:
|
||||
- ✅ Maximum security (keys never leave process)
|
||||
- ✅ Perfect forward secrecy
|
||||
- ✅ No key management overhead
|
||||
- ❌ Cache not shareable between processes
|
||||
- ❌ Cache not reusable after client recreation
|
||||
|
||||
### Persistent Keys (Advanced - Shared Cache)
|
||||
|
||||
**How It Works**:
|
||||
- Use same encryption key across multiple processes/clients
|
||||
- Key stored in environment variable or secrets manager
|
||||
- All processes can read each other's encrypted cache
|
||||
- **Trade-off**: Convenience vs. security
|
||||
|
||||
**When to Use**:
|
||||
- Multi-process applications (workers, replicas)
|
||||
- Shared cache across containers
|
||||
- Cache persistence across application restarts
|
||||
- Horizontal scaling scenarios
|
||||
|
||||
**Example - Environment Variable**:
|
||||
```python
|
||||
import os
import base64
from cryptography.fernet import Fernet

from deltaglider import create_client
|
||||
|
||||
# Generate persistent key (do this ONCE, securely)
|
||||
key = Fernet.generate_key()
|
||||
key_b64 = base64.b64encode(key).decode('utf-8')
|
||||
print(f"DG_CACHE_ENCRYPTION_KEY={key_b64}") # Store in secrets manager!
|
||||
|
||||
# Set in environment (or use secrets manager)
|
||||
os.environ['DG_CACHE_ENCRYPTION_KEY'] = key_b64
|
||||
|
||||
# All client instances use same key
|
||||
client1 = create_client()
|
||||
client2 = create_client()
|
||||
|
||||
# Client1 writes to cache
|
||||
client1.put_object(Bucket='bucket', Key='file.zip', Body=data)
|
||||
|
||||
# Client2 can read same cached data (same key!)
|
||||
client2.get_object(Bucket='bucket', Key='file.zip')
|
||||
|
||||
# ✅ Cache shared between processes
|
||||
# ⚠️ Key must be securely managed
|
||||
```
|
||||
|
||||
**Example - AWS Secrets Manager**:
|
||||
```python
|
||||
import boto3
|
||||
import json
|
||||
from deltaglider import create_client
|
||||
|
||||
def get_encryption_key_from_secrets_manager():
|
||||
"""Retrieve encryption key from AWS Secrets Manager."""
|
||||
secrets = boto3.client('secretsmanager', region_name='us-west-2')
|
||||
response = secrets.get_secret_value(SecretId='deltaglider/cache-encryption-key')
|
||||
secret = json.loads(response['SecretString'])
|
||||
return secret['encryption_key']
|
||||
|
||||
# Retrieve key securely
|
||||
os.environ['DG_CACHE_ENCRYPTION_KEY'] = get_encryption_key_from_secrets_manager()
|
||||
|
||||
# Create client with persistent key
|
||||
client = create_client()
|
||||
```
|
||||
|
||||
**Security Best Practices for Persistent Keys**:
|
||||
1. **Generate once**: Never regenerate unless rotating
|
||||
2. **Store securely**: AWS Secrets Manager, HashiCorp Vault, etc.
|
||||
3. **Rotate regularly**: Follow your key rotation policy
|
||||
4. **Audit access**: Log who accesses encryption keys
|
||||
5. **Principle of least privilege**: Only processes that need shared cache get the key
|
||||
|
||||
### Key Rotation
|
||||
|
||||
**When to Rotate**:
|
||||
- Regular schedule (every 90 days recommended)
|
||||
- After security incident
|
||||
- When team members leave
|
||||
- After key exposure
|
||||
|
||||
**How to Rotate**:
|
||||
```python
|
||||
import base64
import os
from deltaglider import create_client
|
||||
|
||||
# OLD KEY (existing cache encrypted with this)
|
||||
old_key = os.environ.get('DG_CACHE_ENCRYPTION_KEY')
|
||||
|
||||
# Generate NEW KEY
|
||||
from cryptography.fernet import Fernet
|
||||
new_key = Fernet.generate_key()
|
||||
new_key_b64 = base64.b64encode(new_key).decode('utf-8')
|
||||
|
||||
# Steps for rotation:
|
||||
# 1. Clear old cache (encrypted with old key)
|
||||
client_old = create_client() # Uses old key from env
|
||||
client_old.clear_cache()
|
||||
|
||||
# 2. Update environment with new key
|
||||
os.environ['DG_CACHE_ENCRYPTION_KEY'] = new_key_b64
|
||||
|
||||
# 3. Create new client with new key
|
||||
client_new = create_client() # Uses new key
|
||||
|
||||
# 4. Continue operations
|
||||
client_new.put_object(Bucket='bucket', Key='file.zip', Body=data)
|
||||
|
||||
# ✅ Cache now encrypted with new key
|
||||
# ✅ Old encrypted cache cleared
|
||||
```
|
||||
|
||||
## Memory vs. Filesystem Cache
|
||||
|
||||
### Filesystem Cache (Default)
|
||||
|
||||
**Characteristics**:
|
||||
- Stored in `/tmp/deltaglider-*/`
|
||||
- Persistent across client recreations (within same process)
|
||||
- Can be shared between processes (with persistent encryption key)
|
||||
- Slower than memory cache (disk I/O)
|
||||
|
||||
**Configuration**:
|
||||
```python
|
||||
import os
|
||||
|
||||
# Explicitly set filesystem cache (this is the default)
|
||||
os.environ['DG_CACHE_BACKEND'] = 'filesystem'
|
||||
|
||||
from deltaglider import create_client
|
||||
client = create_client()
|
||||
```
|
||||
|
||||
**When to Use**:
|
||||
- Default choice for most applications
|
||||
- When cache should persist across client recreations
|
||||
- Multi-process applications (with persistent key)
|
||||
- Memory-constrained environments
|
||||
|
||||
**Cache Clearing**:
|
||||
```python
|
||||
client.clear_cache()
|
||||
# Removes all files from /tmp/deltaglider-*/
|
||||
# Frees disk space
|
||||
```
|
||||
|
||||
### Memory Cache
|
||||
|
||||
**Characteristics**:
|
||||
- Stored in process memory (RAM)
|
||||
- Fast access (no disk I/O)
|
||||
- Automatically freed when process exits
|
||||
- LRU eviction prevents unlimited growth
|
||||
- Not shared between processes
|
||||
|
||||
**Configuration**:
|
||||
```python
|
||||
import os
|
||||
|
||||
# Enable memory cache
|
||||
os.environ['DG_CACHE_BACKEND'] = 'memory'
|
||||
os.environ['DG_CACHE_MEMORY_SIZE_MB'] = '200' # Default: 100MB
|
||||
|
||||
from deltaglider import create_client
|
||||
client = create_client()
|
||||
```
|
||||
|
||||
**When to Use**:
|
||||
- High-performance requirements
|
||||
- Ephemeral environments (containers, Lambda)
|
||||
- Short-lived applications
|
||||
- CI/CD pipelines
|
||||
- When disk I/O is bottleneck
|
||||
|
||||
**Cache Clearing**:
|
||||
```python
|
||||
client.clear_cache()
|
||||
# Frees memory immediately
|
||||
# No disk I/O
|
||||
```
|
||||
|
||||
**LRU Eviction**:
|
||||
Memory cache automatically evicts least recently used entries when size limit is reached. No manual intervention needed.
|
||||
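
Conceptually the eviction behaves like a classic LRU: when a new entry would exceed the configured byte budget, the least recently used entries are dropped first. A minimal sketch of the idea (illustrative only, not the internal implementation):

```python
from collections import OrderedDict


class LruBudget:
    """Toy LRU keyed by (bucket, prefix), bounded by total bytes."""

    def __init__(self, max_bytes: int) -> None:
        self.max_bytes = max_bytes
        self.entries: OrderedDict[tuple[str, str], bytes] = OrderedDict()
        self.size = 0

    def put(self, key: tuple[str, str], content: bytes) -> None:
        if key in self.entries:
            self.size -= len(self.entries.pop(key))
        # Evict least recently used entries until the new item fits.
        # (A real implementation would reject items larger than the whole budget.)
        while self.entries and self.size + len(content) > self.max_bytes:
            _, evicted = self.entries.popitem(last=False)
            self.size -= len(evicted)
        self.entries[key] = content
        self.size += len(content)

    def get(self, key: tuple[str, str]) -> bytes:
        content = self.entries[key]
        self.entries.move_to_end(key)  # mark as most recently used
        return content
```
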
|
||||
## Best Practices Summary
|
||||
|
||||
### ✅ DO
|
||||
|
||||
1. **Clear cache periodically** in long-running applications
|
||||
2. **Clear cache in `finally` blocks** for batch jobs
|
||||
3. **Use fixtures for tests** to ensure clean state
|
||||
4. **Monitor cache size** in production
|
||||
5. **Use ephemeral keys** when possible (maximum security)
|
||||
6. **Store persistent keys securely** (Secrets Manager, Vault)
|
||||
7. **Rotate encryption keys** regularly
|
||||
8. **Use memory cache** for ephemeral environments
|
||||
9. **Clear cache after config changes**
|
||||
10. **Document cache strategy** for your application
|
||||
|
||||
### ❌ DON'T
|
||||
|
||||
1. **Never let cache grow unbounded** in long-running apps
|
||||
2. **Don't share ephemeral encrypted cache** between processes
|
||||
3. **Don't store persistent keys in code** or version control
|
||||
4. **Don't forget to clear cache in tests**
|
||||
5. **Don't assume cache is automatically cleaned** in SDK usage
|
||||
6. **Don't use persistent keys** unless you need cross-process sharing
|
||||
7. **Don't skip key rotation**
|
||||
8. **Don't ignore memory limits** for memory cache
|
||||
|
||||
## Monitoring Cache Health
|
||||
|
||||
### Cache Size Tracking
|
||||
|
||||
```python
|
||||
import tempfile
from pathlib import Path
from deltaglider import create_client
|
||||
|
||||
def get_cache_size_mb(cache_dir: Path) -> float:
|
||||
"""Calculate total cache size in MB."""
|
||||
total_bytes = sum(f.stat().st_size for f in cache_dir.rglob('*') if f.is_file())
|
||||
return total_bytes / (1024 * 1024)
|
||||
|
||||
# Get cache directory (ephemeral, changes per process)
|
||||
cache_dir = Path(tempfile.gettempdir())
|
||||
deltaglider_caches = list(cache_dir.glob('deltaglider-*'))
|
||||
|
||||
if deltaglider_caches:
|
||||
cache_path = deltaglider_caches[0]
|
||||
cache_size_mb = get_cache_size_mb(cache_path)
|
||||
print(f"Cache size: {cache_size_mb:.2f} MB")
|
||||
|
||||
# Clear if over threshold
|
||||
if cache_size_mb > 500:
|
||||
client = create_client()
|
||||
client.clear_cache()
|
||||
print("Cache cleared - exceeded 500MB")
|
||||
```
|
||||
|
||||
### Memory Cache Monitoring
|
||||
|
||||
```python
|
||||
import os
|
||||
os.environ['DG_CACHE_BACKEND'] = 'memory'
|
||||
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Memory cache auto-evicts at configured limit
|
||||
# Monitor via application memory usage tools (not direct API)
|
||||
|
||||
# Example with memory_profiler
|
||||
from memory_profiler import profile
|
||||
|
||||
@profile
|
||||
def upload_many_files():
|
||||
for i in range(1000):
|
||||
client.put_object(Bucket='test', Key=f'file{i}.zip', Body=b'data' * 1000)
|
||||
# Check memory profile to see cache memory usage
|
||||
|
||||
upload_many_files()
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Problem: Cache Growing Too Large
|
||||
|
||||
**Symptoms**:
|
||||
- Disk space running out (`/tmp` filling up)
|
||||
- High memory usage
|
||||
|
||||
**Solution**:
|
||||
```python
|
||||
# Implement automatic cache clearing
import psutil
import schedule

from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
def smart_cache_clear():
|
||||
"""Clear cache if memory/disk pressure detected."""
|
||||
# Check disk space
|
||||
disk = psutil.disk_usage('/tmp')
|
||||
if disk.percent > 80:
|
||||
client.clear_cache()
|
||||
print("Cache cleared - disk pressure")
|
||||
return
|
||||
|
||||
# Check memory usage
|
||||
memory = psutil.virtual_memory()
|
||||
if memory.percent > 80:
|
||||
client.clear_cache()
|
||||
print("Cache cleared - memory pressure")
|
||||
|
||||
# Call periodically
|
||||
schedule.every(15).minutes.do(smart_cache_clear)
|
||||
```
|
||||
|
||||
### Problem: Decryption Failures After Key Rotation
|
||||
|
||||
**Symptoms**:
|
||||
- `CacheCorruptionError: Decryption failed`
|
||||
- After rotating encryption keys
|
||||
|
||||
**Solution**:
|
||||
```python
|
||||
# Clear cache before using new key
|
||||
old_client = create_client() # Old key
|
||||
old_client.clear_cache()
|
||||
|
||||
# Update encryption key
|
||||
os.environ['DG_CACHE_ENCRYPTION_KEY'] = new_key
|
||||
|
||||
# Create new client
|
||||
new_client = create_client() # New key
|
||||
```
|
||||
|
||||
### Problem: Tests Failing Due to Cached Data
|
||||
|
||||
**Symptoms**:
|
||||
- Tests pass in isolation, fail when run together
|
||||
- Unexpected data in downloads
|
||||
|
||||
**Solution**:
|
||||
```python
|
||||
# Always clear cache in test teardown
|
||||
@pytest.fixture(autouse=True)
|
||||
def clear_cache_after_test():
|
||||
"""Automatically clear cache after every test."""
|
||||
yield
|
||||
# Teardown
|
||||
client = create_client()
|
||||
client.clear_cache()
|
||||
```
|
||||
|
||||
## Related Documentation
|
||||
|
||||
- [docs/sdk/getting-started.md](sdk/getting-started.md) - SDK configuration
|
||||
- [README.md](../README.md) - Docker and environment variables
|
||||
- [CLAUDE.md](../CLAUDE.md) - Development guide
|
||||
|
||||
## Quick Reference
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
# Create client
|
||||
client = create_client()
|
||||
|
||||
# Clear all cache
|
||||
client.clear_cache()
|
||||
|
||||
# Use memory cache
|
||||
os.environ['DG_CACHE_BACKEND'] = 'memory'
|
||||
|
||||
# Use persistent encryption key
|
||||
os.environ['DG_CACHE_ENCRYPTION_KEY'] = 'base64-encoded-key'
|
||||
```
|
||||
526
docs/DG_MAX_RATIO.md
Normal file
@@ -0,0 +1,526 @@
|
||||
# DG_MAX_RATIO - Delta Compression Efficiency Guard
|
||||
|
||||
## Overview
|
||||
|
||||
`DG_MAX_RATIO` is a **safety threshold** that prevents inefficient delta compression. It controls the maximum acceptable ratio of `delta_size / original_file_size`.
|
||||
|
||||
**Default**: `0.5` (50%)
|
||||
**Range**: `0.0` to `1.0`
|
||||
**Environment Variable**: `DG_MAX_RATIO`
|
||||
|
||||
## The Problem It Solves
|
||||
|
||||
When files are **too different**, xdelta3 can create a delta file that's **almost as large as the original file** (or even larger!). This defeats the purpose of compression and wastes:
|
||||
- ❌ Storage space (storing a large delta instead of the original)
|
||||
- ❌ CPU time (creating and applying the delta)
|
||||
- ❌ Network bandwidth (downloading delta + reference instead of just the file)
|
||||
|
||||
## How It Works
|
||||
|
||||
```
|
||||
1. Upload file → Create delta using xdelta3
|
||||
2. Calculate ratio = delta_size / original_file_size
|
||||
3. If ratio > DG_MAX_RATIO:
|
||||
❌ Discard delta, store original file directly
|
||||
Else:
|
||||
✅ Keep delta, save storage space
|
||||
```
|
||||
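
In code terms, the guard is a single comparison made after the delta has been produced. A minimal sketch of the decision (the function and variable names are illustrative, not DeltaGlider internals):

```python
import os


def should_store_delta(delta_size: int, original_size: int) -> bool:
    """Return True when the delta is small enough to be worth keeping."""
    max_ratio = float(os.environ.get("DG_MAX_RATIO", "0.5"))
    return (delta_size / original_size) <= max_ratio


# Example: a 60 MB delta for a 100 MB file is rejected with the default 0.5
print(should_store_delta(60_000_000, 100_000_000))  # False
print(should_store_delta(5_000_000, 100_000_000))   # True
```
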
|
||||
### Example Flow
|
||||
|
||||
```
|
||||
Original file: 100MB
|
||||
Reference file: 100MB (uploaded previously)
|
||||
|
||||
xdelta3 creates delta: 60MB
|
||||
Ratio = 60MB / 100MB = 0.6 (60%)
|
||||
|
||||
With DG_MAX_RATIO=0.5:
|
||||
❌ Delta rejected (60% > 50%)
|
||||
Action: Store 100MB original file
|
||||
Reason: Delta not efficient enough
|
||||
|
||||
With DG_MAX_RATIO=0.7:
|
||||
✅ Delta accepted (60% ≤ 70%)
|
||||
Action: Store 60MB delta
|
||||
Savings: 40MB (40%)
|
||||
```
|
||||
|
||||
## Default Value: 0.5 (50%)
|
||||
|
||||
**Meaning**: Only use delta compression if the delta is **≤50% of the original file size**
|
||||
|
||||
This default provides:
|
||||
- ✅ Minimum 50% storage savings when delta compression is used
|
||||
- ✅ Prevents wasting CPU on inefficient compression
|
||||
- ✅ Works well for typical versioned releases (minor updates between versions)
|
||||
- ✅ Balanced approach without manual tuning
|
||||
|
||||
## When to Adjust
|
||||
|
||||
### 🔽 Lower Value (More Conservative)
|
||||
|
||||
**Set `DG_MAX_RATIO=0.2-0.3` when:**
|
||||
- Files change significantly between versions (major updates, refactors)
|
||||
- Storage cost is **very high** (premium S3 tiers, small quotas)
|
||||
- You want to avoid **any** inefficient compression
|
||||
- You need guaranteed high-quality compression (≥70% savings)
|
||||
|
||||
**Example**:
|
||||
```bash
|
||||
export DG_MAX_RATIO=0.3
|
||||
|
||||
# Only accept deltas that are ≤30% of original size
|
||||
# More files stored directly, but guaranteed ≥70% savings when using deltas
|
||||
```
|
||||
|
||||
**Trade-off**:
|
||||
- ✅ Higher quality compression when deltas are used
|
||||
- ❌ Fewer files will use delta compression
|
||||
- ❌ More direct uploads (higher storage for dissimilar files)
|
||||
|
||||
### 🔼 Higher Value (More Permissive)
|
||||
|
||||
**Set `DG_MAX_RATIO=0.6-0.8` when:**
|
||||
- Files are very similar (minor patches, nightly builds, incremental changes)
|
||||
- Storage cost is **cheap** (large S3 buckets, unlimited quotas)
|
||||
- CPU time is **expensive** (you want to save re-uploading even with modest compression)
|
||||
- You want to maximize delta compression usage
|
||||
|
||||
**Example**:
|
||||
```bash
|
||||
export DG_MAX_RATIO=0.7
|
||||
|
||||
# Accept deltas up to 70% of original size
|
||||
# More files use delta compression, even with modest 30% savings
|
||||
```
|
||||
|
||||
**Trade-off**:
|
||||
- ✅ More files use delta compression
|
||||
- ✅ Save bandwidth even with modest compression
|
||||
- ❌ Some deltas may only save 20-30% space
|
||||
- ❌ More CPU time spent on marginal compressions
|
||||
|
||||
## Real-World Scenarios
|
||||
|
||||
### Scenario 1: Nightly Builds (Minimal Changes) ⭐ IDEAL
|
||||
|
||||
```
|
||||
my-app-v1.0.0.zip → 100MB (reference)
|
||||
my-app-v1.0.1.zip → 100MB (0.1% code change)
|
||||
|
||||
Delta: 200KB (0.2% of original)
|
||||
Ratio: 0.002
|
||||
|
||||
With ANY DG_MAX_RATIO: ✅ Use delta (99.8% savings!)
|
||||
Result: Store 200KB instead of 100MB
|
||||
```
|
||||
|
||||
**This is what DeltaGlider excels at!**
|
||||
|
||||
### Scenario 2: Major Version (Significant Changes)
|
||||
|
||||
```
|
||||
my-app-v1.0.0.zip → 100MB (reference)
|
||||
my-app-v2.0.0.zip → 100MB (complete rewrite, 85% different)
|
||||
|
||||
Delta: 85MB (85% of original)
|
||||
Ratio: 0.85
|
||||
|
||||
With DG_MAX_RATIO=0.5: ❌ Store original (85% > 50%)
|
||||
→ Stores 100MB directly
|
||||
→ No compression benefit, but no CPU waste
|
||||
|
||||
With DG_MAX_RATIO=0.9: ✅ Use delta (85% ≤ 90%)
|
||||
→ Stores 85MB delta
|
||||
→ Only 15% savings, questionable benefit
|
||||
```
|
||||
|
||||
**Recommendation**: For major versions, default `0.5` correctly rejects inefficient compression.
|
||||
|
||||
### Scenario 3: Different File Format (Same Content)
|
||||
|
||||
```
|
||||
my-app-v1.0.0.zip → 100MB (ZIP archive)
|
||||
my-app-v1.0.0.tar → 100MB (TAR archive, same content)
|
||||
|
||||
Delta: 70MB (completely different format structure)
|
||||
Ratio: 0.70
|
||||
|
||||
With DG_MAX_RATIO=0.5: ❌ Store original (70% > 50%)
|
||||
→ Stores 100MB directly
|
||||
→ Correct decision: formats too different
|
||||
|
||||
With DG_MAX_RATIO=0.8: ✅ Use delta (70% ≤ 80%)
|
||||
→ Stores 70MB delta
|
||||
→ 30% savings, but CPU-intensive
|
||||
```
|
||||
|
||||
**Recommendation**: Use consistent file formats for better compression. Default `0.5` correctly rejects cross-format compression.
|
||||
|
||||
### Scenario 4: Incremental Updates (Sweet Spot) ⭐
|
||||
|
||||
```
|
||||
my-app-v1.0.0.zip → 100MB (reference)
|
||||
my-app-v1.0.1.zip → 100MB (minor bugfix, 5% code change)
|
||||
|
||||
Delta: 5MB (5% of original)
|
||||
Ratio: 0.05
|
||||
|
||||
With ANY DG_MAX_RATIO: ✅ Use delta (95% savings!)
|
||||
Result: Store 5MB instead of 100MB
|
||||
```
|
||||
|
||||
**This is the target use case for delta compression!**
|
||||
|
||||
## How to Choose the Right Value
|
||||
|
||||
### Decision Tree
|
||||
|
||||
```
|
||||
Do your files have minimal changes between versions? (< 5% different)
|
||||
├─ YES → Use default 0.5 ✅
|
||||
│ Delta compression will work perfectly
|
||||
│
|
||||
└─ NO → Are your files significantly different? (> 50% different)
|
||||
├─ YES → Lower to 0.2-0.3 🔽
|
||||
│ Avoid wasting time on inefficient compression
|
||||
│
|
||||
└─ NO → Are they moderately different? (20-50% different)
|
||||
├─ Storage is expensive → Lower to 0.3 🔽
|
||||
│ Only high-quality compression
|
||||
│
|
||||
└─ Storage is cheap → Raise to 0.6-0.7 🔼
|
||||
Accept modest savings
|
||||
```
|
||||
|
||||
### Quick Reference Table
|
||||
|
||||
| File Similarity | Recommended DG_MAX_RATIO | Expected Behavior |
|
||||
|----------------|--------------------------|-------------------|
|
||||
| Nearly identical (< 5% change) | **0.5 (default)** | 🟢 95%+ savings |
|
||||
| Minor updates (5-20% change) | **0.5 (default)** | 🟢 80-95% savings |
|
||||
| Moderate changes (20-50% change) | **0.4-0.6** | 🟡 50-80% savings |
|
||||
| Major changes (50-80% change) | **0.3 or lower** | 🔴 Store directly |
|
||||
| Complete rewrites (> 80% change) | **0.3 or lower** | 🔴 Store directly |
|
||||
|
||||
### Use Cases by Industry
|
||||
|
||||
**Software Releases (SaaS)**:
|
||||
```bash
|
||||
export DG_MAX_RATIO=0.5 # Default
|
||||
# Nightly builds with minor changes compress perfectly
|
||||
```
|
||||
|
||||
**Mobile App Builds**:
|
||||
```bash
|
||||
export DG_MAX_RATIO=0.4 # Slightly conservative
|
||||
# iOS/Android builds can vary, want quality compression only
|
||||
```
|
||||
|
||||
**Database Backups**:
|
||||
```bash
|
||||
export DG_MAX_RATIO=0.7 # Permissive
|
||||
# Daily backups are very similar, accept modest savings
|
||||
```
|
||||
|
||||
**Document Archives**:
|
||||
```bash
|
||||
export DG_MAX_RATIO=0.6 # Moderate
|
||||
# Documents change incrementally, accept good savings
|
||||
```
|
||||
|
||||
**Video/Media Archives**:
|
||||
```bash
|
||||
export DG_MAX_RATIO=0.2 # Very conservative
|
||||
# Media files are unique, only compress if very similar
|
||||
```
|
||||
|
||||
## Configuration Examples
|
||||
|
||||
### Docker
|
||||
|
||||
**Conservative (Premium Storage)**:
|
||||
```bash
|
||||
docker run -e DG_MAX_RATIO=0.3 \
|
||||
-e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY \
|
||||
deltaglider/deltaglider cp file.zip s3://releases/
|
||||
```
|
||||
|
||||
**Default (Balanced)**:
|
||||
```bash
|
||||
docker run -e DG_MAX_RATIO=0.5 \
|
||||
-e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY \
|
||||
deltaglider/deltaglider cp file.zip s3://releases/
|
||||
```
|
||||
|
||||
**Permissive (Cheap Storage)**:
|
||||
```bash
|
||||
docker run -e DG_MAX_RATIO=0.7 \
|
||||
-e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY \
|
||||
deltaglider/deltaglider cp file.zip s3://releases/
|
||||
```
|
||||
|
||||
### Python SDK
|
||||
|
||||
```python
|
||||
import os
|
||||
|
||||
# Conservative (only high-quality compression)
|
||||
os.environ['DG_MAX_RATIO'] = '0.3'
|
||||
|
||||
# Default (balanced)
|
||||
os.environ['DG_MAX_RATIO'] = '0.5'
|
||||
|
||||
# Permissive (accept modest savings)
|
||||
os.environ['DG_MAX_RATIO'] = '0.7'
|
||||
|
||||
from deltaglider import create_client
|
||||
client = create_client()
|
||||
|
||||
summary = client.upload("file.zip", "s3://bucket/")
|
||||
print(f"Delta ratio: {summary.delta_ratio:.2f}")
|
||||
print(f"Used delta: {summary.is_delta}")
|
||||
```
|
||||
|
||||
### CLI
|
||||
|
||||
```bash
|
||||
# Conservative
|
||||
export DG_MAX_RATIO=0.3
|
||||
deltaglider cp my-app-v2.0.0.zip s3://releases/
|
||||
|
||||
# Default
|
||||
export DG_MAX_RATIO=0.5
|
||||
deltaglider cp my-app-v2.0.0.zip s3://releases/
|
||||
|
||||
# Permissive
|
||||
export DG_MAX_RATIO=0.7
|
||||
deltaglider cp my-app-v2.0.0.zip s3://releases/
|
||||
```
|
||||
|
||||
### Override Per-Upload (CLI)
|
||||
|
||||
```bash
|
||||
# Use custom ratio for specific file
|
||||
deltaglider cp large-file.zip s3://releases/ --max-ratio 0.3
|
||||
```
|
||||
|
||||
## Monitoring and Tuning
|
||||
|
||||
### Check Delta Ratios After Upload
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
summary = client.upload("file.zip", "s3://bucket/")
|
||||
|
||||
print(f"Delta ratio: {summary.delta_ratio:.2f}")
|
||||
print(f"Savings: {summary.savings_percent:.0f}%")
|
||||
print(f"Used delta: {summary.is_delta}")
|
||||
|
||||
if summary.is_delta:
|
||||
print(f"✅ Used delta compression (ratio {summary.delta_ratio:.2f})")
|
||||
if summary.delta_ratio > 0.4:
|
||||
print(f"⚠️ Consider lowering DG_MAX_RATIO for better quality")
|
||||
else:
|
||||
print(f"❌ Stored directly (delta would have been ratio ~{summary.delta_ratio:.2f})")
|
||||
if summary.delta_ratio < 0.6:
|
||||
print(f"💡 Consider raising DG_MAX_RATIO to enable compression")
|
||||
```
|
||||
|
||||
### Batch Analysis
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
from pathlib import Path
|
||||
|
||||
client = create_client()
|
||||
ratios = []
|
||||
|
||||
for file in Path("releases").glob("*.zip"):
|
||||
summary = client.upload(str(file), f"s3://bucket/{file.name}")
|
||||
if summary.is_delta:
|
||||
ratios.append(summary.delta_ratio)
|
||||
|
||||
if ratios:
|
||||
avg_ratio = sum(ratios) / len(ratios)
|
||||
max_ratio = max(ratios)
|
||||
|
||||
print(f"Average delta ratio: {avg_ratio:.2f}")
|
||||
print(f"Maximum delta ratio: {max_ratio:.2f}")
|
||||
print(f"Files compressed: {len(ratios)}")
|
||||
|
||||
if avg_ratio < 0.2:
|
||||
print("💡 Consider raising DG_MAX_RATIO - you're getting excellent compression")
|
||||
elif max_ratio > 0.6:
|
||||
print("⚠️ Consider lowering DG_MAX_RATIO - some deltas are inefficient")
|
||||
```
|
||||
|
||||
### Optimization Strategy
|
||||
|
||||
1. **Start with default (0.5)**
|
||||
2. **Monitor delta ratios** for 1 week of uploads (a minimal logging sketch follows this list)
|
||||
3. **Analyze results**:
|
||||
- If most ratios < 0.2: Consider raising to 0.6-0.7
|
||||
- If many ratios > 0.4: Consider lowering to 0.3-0.4
|
||||
- If ratios vary widely: Keep default 0.5
|
||||
4. **Adjust and re-test** for 1 week
|
||||
5. **Repeat until optimal** for your use case
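
A minimal sketch of the monitoring step above, assuming you wrap your existing upload calls and append each summary to a local JSONL file (the log file name and wrapper functions are hypothetical, not part of DeltaGlider):

```python
import json
import time
from deltaglider import create_client

client = create_client()
LOG_FILE = "delta_ratios.jsonl"  # hypothetical local log

def upload_and_log(local_path: str, s3_url: str):
    """Upload as usual, but record the delta ratio for later tuning."""
    summary = client.upload(local_path, s3_url)
    with open(LOG_FILE, "a") as f:
        f.write(json.dumps({
            "ts": time.time(),
            "file": local_path,
            "is_delta": summary.is_delta,
            "delta_ratio": summary.delta_ratio,
        }) + "\n")
    return summary

def review_week():
    """After ~1 week, decide whether to adjust DG_MAX_RATIO."""
    with open(LOG_FILE) as f:
        ratios = [json.loads(line)["delta_ratio"] for line in f]
    if not ratios:
        return
    avg = sum(ratios) / len(ratios)
    if avg < 0.2:
        print("Most deltas are excellent; consider raising DG_MAX_RATIO to 0.6-0.7")
    elif avg > 0.4:
        print("Many deltas are inefficient; consider lowering DG_MAX_RATIO to 0.3-0.4")
    else:
        print("Keep the default 0.5")
```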
|
||||
|
||||
## Advanced Usage
|
||||
|
||||
### Dynamic Ratio Based on File Type
|
||||
|
||||
```python
|
||||
import os
|
||||
from pathlib import Path
|
||||
from deltaglider import create_client
|
||||
|
||||
def get_optimal_ratio(file_path: str) -> float:
|
||||
"""Determine optimal ratio based on file type."""
|
||||
suffix = Path(file_path).suffix.lower()
|
||||
|
||||
# Very compressible (source code archives)
|
||||
if suffix in ['.zip', '.tar', '.gz']:
|
||||
return 0.6
|
||||
|
||||
# Moderately compressible (compiled binaries)
|
||||
elif suffix in ['.jar', '.war', '.deb', '.rpm']:
|
||||
return 0.5
|
||||
|
||||
# Rarely compressible (media, already compressed)
|
||||
elif suffix in ['.mp4', '.jpg', '.png']:
|
||||
return 0.2
|
||||
|
||||
# Default
|
||||
return 0.5
|
||||
|
||||
file = "my-app.zip"
|
||||
os.environ['DG_MAX_RATIO'] = str(get_optimal_ratio(file))
|
||||
|
||||
client = create_client()
|
||||
summary = client.upload(file, "s3://bucket/")
|
||||
```
|
||||
|
||||
### A/B Testing Different Ratios
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
import os
|
||||
|
||||
def test_ratios(file_path: str, ratios: list[float]):
|
||||
"""Test different ratios and report results."""
|
||||
results = []
|
||||
|
||||
for ratio in ratios:
|
||||
os.environ['DG_MAX_RATIO'] = str(ratio)
|
||||
client = create_client()
|
||||
|
||||
# Simulate upload (don't actually upload)
|
||||
summary = client.estimate_compression(file_path, "s3://bucket/test/")
|
||||
|
||||
results.append({
|
||||
'ratio_threshold': ratio,
|
||||
'would_use_delta': summary.delta_ratio <= ratio,
|
||||
'delta_ratio': summary.delta_ratio,
|
||||
'savings': summary.savings_percent if summary.delta_ratio <= ratio else 0
|
||||
})
|
||||
|
||||
return results
|
||||
|
||||
# Test different ratios
|
||||
file = "my-app-v2.0.0.zip"
|
||||
test_results = test_ratios(file, [0.3, 0.5, 0.7])
|
||||
|
||||
for result in test_results:
|
||||
print(f"Ratio {result['ratio_threshold']}: "
|
||||
f"Delta={result['would_use_delta']}, "
|
||||
f"Savings={result['savings']:.0f}%")
|
||||
```
|
||||
|
||||
## FAQ
|
||||
|
||||
### Q: What happens if I set DG_MAX_RATIO=1.0?
|
||||
|
||||
**A**: Delta compression will be used whenever the delta is **no larger than** the original file (ratio ≤ 1.0). That defeats the purpose of the threshold: a delta that is nearly the size of the original saves almost nothing, yet every download still pays the reconstruction cost.
|
||||
|
||||
**Example**:
|
||||
```bash
|
||||
export DG_MAX_RATIO=1.0
|
||||
|
||||
# File: 100MB, Delta: 98MB
# Ratio: 0.98
# With DG_MAX_RATIO=1.0: ✅ Delta accepted, but you store 98MB to "save" 2MB,
# and every download must reconstruct the file from reference + delta.

# Never set DG_MAX_RATIO to 1.0 or higher
|
||||
```
|
||||
|
||||
### Q: What happens if I set DG_MAX_RATIO=0.0?
|
||||
|
||||
**A**: Delta compression will **never** be used. All files will be stored directly. This is equivalent to disabling DeltaGlider's compression entirely.
|
||||
|
||||
### Q: Can I disable the ratio check?
|
||||
|
||||
**A**: No, and you shouldn't want to. The ratio check is a critical safety feature that prevents wasting storage and CPU on inefficient compression.
|
||||
|
||||
### Q: Does DG_MAX_RATIO affect downloading?
|
||||
|
||||
**A**: No, `DG_MAX_RATIO` only affects **uploads**. During download, DeltaGlider automatically detects whether a file is stored as a delta or directly and handles reconstruction accordingly.
|
||||
|
||||
### Q: Can I set different ratios for different buckets?
|
||||
|
||||
**A**: Not directly via environment variables, but you can change `DG_MAX_RATIO` before each upload in your code:
|
||||
|
||||
```python
|
||||
import os
|
||||
from deltaglider import create_client
|
||||
|
||||
# High-quality compression for production releases
|
||||
os.environ['DG_MAX_RATIO'] = '0.3'
|
||||
client = create_client()
|
||||
client.upload("prod-release.zip", "s3://production/")
|
||||
|
||||
# Permissive compression for dev builds
|
||||
os.environ['DG_MAX_RATIO'] = '0.7'
|
||||
client = create_client()
|
||||
client.upload("dev-build.zip", "s3://development/")
|
||||
```
|
||||
|
||||
### Q: How do I know if my DG_MAX_RATIO is set correctly?
|
||||
|
||||
**A**: Monitor your upload summaries. If most deltas have ratios close to your threshold (e.g., 0.45-0.50 with default 0.5), you might want to lower it. If most deltas have very low ratios (e.g., < 0.2), you could raise it.
|
||||
|
||||
**Ideal scenario**: Most successful delta compressions have ratios < 0.3, and inefficient deltas (> 0.5) are correctly rejected.
|
||||
|
||||
## Summary
|
||||
|
||||
**`DG_MAX_RATIO` prevents wasting time and storage on inefficient delta compression.**
|
||||
|
||||
### Quick Takeaways
|
||||
|
||||
✅ **Default 0.5 works for 90% of use cases**
|
||||
✅ **Lower values (0.2-0.3) for dissimilar files or expensive storage**
|
||||
✅ **Higher values (0.6-0.7) for very similar files or cheap storage**
|
||||
✅ **Monitor delta ratios to tune for your use case**
|
||||
✅ **Never set to 1.0 or higher (defeats the purpose)**
|
||||
✅ **Never set to 0.0 (disables delta compression entirely)**
|
||||
|
||||
### Golden Rule
|
||||
|
||||
**If you're not sure, keep the default `0.5`.**
|
||||
|
||||
It's a sensible balance that:
|
||||
- Prevents inefficient compression (no deltas > 50% of original size)
|
||||
- Allows excellent savings on similar files (most deltas are < 20%)
|
||||
- Works well for typical versioned releases
|
||||
- Requires no manual tuning for most use cases
|
||||
|
||||
---
|
||||
|
||||
**Related Documentation**:
|
||||
- [CLAUDE.md](../CLAUDE.md) - Environment variables reference
|
||||
- [README.md](../README.md) - Docker usage and configuration
|
||||
- [docs/sdk/getting-started.md](sdk/getting-started.md) - SDK configuration guide
|
||||
364
docs/DOCKER.md
Normal file
364
docs/DOCKER.md
Normal file
@@ -0,0 +1,364 @@
|
||||
# Docker Support for DeltaGlider
|
||||
|
||||
This document describes how to build, run, and publish Docker images for DeltaGlider.
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Pull and run the latest image
|
||||
|
||||
```bash
|
||||
docker pull beshultd/deltaglider:latest
|
||||
docker run --rm beshultd/deltaglider:latest --help
|
||||
```
|
||||
|
||||
### Run with AWS credentials
|
||||
|
||||
```bash
|
||||
docker run --rm \
|
||||
-e AWS_ACCESS_KEY_ID=your_key \
|
||||
-e AWS_SECRET_ACCESS_KEY=your_secret \
|
||||
-e AWS_DEFAULT_REGION=us-east-1 \
|
||||
beshultd/deltaglider:latest ls s3://your-bucket/
|
||||
```
|
||||
|
||||
### Run with MinIO (local S3 alternative)
|
||||
|
||||
```bash
|
||||
# Start MinIO
|
||||
docker run -d \
|
||||
-p 9000:9000 -p 9001:9001 \
|
||||
-e MINIO_ROOT_USER=minioadmin \
|
||||
-e MINIO_ROOT_PASSWORD=minioadmin \
|
||||
--name minio \
|
||||
minio/minio server /data --console-address ":9001"
|
||||
|
||||
# Use DeltaGlider with MinIO
|
||||
docker run --rm \
|
||||
-e AWS_ENDPOINT_URL=http://host.docker.internal:9000 \
|
||||
-e AWS_ACCESS_KEY_ID=minioadmin \
|
||||
-e AWS_SECRET_ACCESS_KEY=minioadmin \
|
||||
-e AWS_DEFAULT_REGION=us-east-1 \
|
||||
beshultd/deltaglider:latest ls
|
||||
```
|
||||
|
||||
## Building Locally
|
||||
|
||||
### Build with current git version
|
||||
|
||||
```bash
|
||||
VERSION=$(git describe --tags --always --abbrev=0 | sed 's/^v//')
|
||||
docker build --build-arg VERSION=${VERSION} -t beshultd/deltaglider:${VERSION} .
|
||||
```
|
||||
|
||||
### Build with custom version
|
||||
|
||||
```bash
|
||||
docker build --build-arg VERSION=6.0.2 -t beshultd/deltaglider:6.0.2 .
|
||||
```
|
||||
|
||||
### Multi-platform build
|
||||
|
||||
```bash
|
||||
# Create a buildx builder (one-time setup)
|
||||
docker buildx create --name deltaglider-builder --use
|
||||
|
||||
# Build for multiple platforms
|
||||
docker buildx build \
|
||||
--platform linux/amd64,linux/arm64 \
|
||||
--build-arg VERSION=6.0.2 \
|
||||
-t beshultd/deltaglider:6.0.2 \
|
||||
--push \
|
||||
.
|
||||
```
|
||||
|
||||
## Testing the Image
|
||||
|
||||
### Basic functionality test
|
||||
|
||||
```bash
|
||||
# Check version
|
||||
docker run --rm beshultd/deltaglider:test --version
|
||||
|
||||
# Check help
|
||||
docker run --rm beshultd/deltaglider:test --help
|
||||
|
||||
# List available commands
|
||||
docker run --rm beshultd/deltaglider:test
|
||||
```
|
||||
|
||||
### Integration test with MinIO
|
||||
|
||||
```bash
|
||||
# 1. Start MinIO
|
||||
docker run -d \
|
||||
-p 9000:9000 -p 9001:9001 \
|
||||
-e MINIO_ROOT_USER=minioadmin \
|
||||
-e MINIO_ROOT_PASSWORD=minioadmin \
|
||||
--name minio \
|
||||
minio/minio server /data --console-address ":9001"
|
||||
|
||||
# 2. Create a test file
|
||||
echo "Hello DeltaGlider" > test.txt
|
||||
|
||||
# 3. Upload to S3/MinIO
|
||||
docker run --rm \
|
||||
-v $(pwd):/data \
|
||||
-w /data \
|
||||
-e AWS_ENDPOINT_URL=http://host.docker.internal:9000 \
|
||||
-e AWS_ACCESS_KEY_ID=minioadmin \
|
||||
-e AWS_SECRET_ACCESS_KEY=minioadmin \
|
||||
-e AWS_DEFAULT_REGION=us-east-1 \
|
||||
beshultd/deltaglider:test cp test.txt s3://test-bucket/
|
||||
|
||||
# 4. List bucket contents
|
||||
docker run --rm \
|
||||
-e AWS_ENDPOINT_URL=http://host.docker.internal:9000 \
|
||||
-e AWS_ACCESS_KEY_ID=minioadmin \
|
||||
-e AWS_SECRET_ACCESS_KEY=minioadmin \
|
||||
-e AWS_DEFAULT_REGION=us-east-1 \
|
||||
beshultd/deltaglider:test ls s3://test-bucket/
|
||||
|
||||
# 5. Get statistics
|
||||
docker run --rm \
|
||||
-e AWS_ENDPOINT_URL=http://host.docker.internal:9000 \
|
||||
-e AWS_ACCESS_KEY_ID=minioadmin \
|
||||
-e AWS_SECRET_ACCESS_KEY=minioadmin \
|
||||
-e AWS_DEFAULT_REGION=us-east-1 \
|
||||
beshultd/deltaglider:test stats test-bucket
|
||||
|
||||
# 6. Cleanup
|
||||
docker stop minio && docker rm minio
|
||||
rm test.txt
|
||||
```
|
||||
|
||||
## Publishing to Docker Hub
|
||||
|
||||
### Manual Publishing
|
||||
|
||||
```bash
|
||||
# 1. Log in to Docker Hub
|
||||
docker login
|
||||
|
||||
# 2. Build the image
|
||||
VERSION=$(git describe --tags --always --abbrev=0 | sed 's/^v//')
|
||||
docker build --build-arg VERSION=${VERSION} \
|
||||
-t beshultd/deltaglider:${VERSION} \
|
||||
-t beshultd/deltaglider:latest \
|
||||
.
|
||||
|
||||
# 3. Push to Docker Hub
|
||||
docker push beshultd/deltaglider:${VERSION}
|
||||
docker push beshultd/deltaglider:latest
|
||||
```
|
||||
|
||||
### Multi-platform Publishing
|
||||
|
||||
```bash
|
||||
# Create builder (one-time setup)
|
||||
docker buildx create --name deltaglider-builder --use
|
||||
|
||||
# Build and push for multiple platforms
|
||||
VERSION=$(git describe --tags --always --abbrev=0 | sed 's/^v//')
|
||||
docker buildx build \
|
||||
--platform linux/amd64,linux/arm64 \
|
||||
--build-arg VERSION=${VERSION} \
|
||||
-t beshultd/deltaglider:${VERSION} \
|
||||
-t beshultd/deltaglider:latest \
|
||||
--push \
|
||||
.
|
||||
```
|
||||
|
||||
## GitHub Actions Automation
|
||||
|
||||
The repository includes a GitHub Action workflow (`.github/workflows/docker-publish.yml`) that automatically builds and publishes Docker images.
|
||||
|
||||
### Automatic Publishing Triggers
|
||||
|
||||
- **On main branch push**: Tags as `latest`
|
||||
- **On develop branch push**: Tags as `develop`
|
||||
- **On version tag push** (e.g., `v6.0.2`): Tags with semver patterns:
|
||||
- `6.0.2` (full version)
|
||||
- `6.0` (major.minor)
|
||||
- `6` (major)
|
||||
- **On pull request**: Builds but doesn't push (testing only)
|
||||
|
||||
### Required GitHub Secrets
|
||||
|
||||
Set these secrets in your GitHub repository settings (`Settings > Secrets and variables > Actions`):
|
||||
|
||||
1. **DOCKERHUB_USERNAME**: Your Docker Hub username (e.g., `beshultd`)
|
||||
2. **DOCKERHUB_TOKEN**: Docker Hub access token (create at https://hub.docker.com/settings/security)
|
||||
|
||||
### Manual Workflow Trigger
|
||||
|
||||
You can manually trigger the Docker build workflow from the GitHub Actions tab:
|
||||
|
||||
1. Go to **Actions** tab
|
||||
2. Select **Build and Publish Docker Images**
|
||||
3. Click **Run workflow**
|
||||
4. Select branch and click **Run workflow**
|
||||
|
||||
## Docker Image Details
|
||||
|
||||
### Image Layers
|
||||
|
||||
The Dockerfile uses a multi-stage build:
|
||||
|
||||
1. **Builder stage**: Installs UV and Python dependencies
|
||||
2. **Runtime stage**: Minimal Python 3.12-slim with only runtime dependencies
|
||||
|
||||
### Image Features
|
||||
|
||||
- **Size**: ~150MB (compressed)
|
||||
- **Platforms**: linux/amd64, linux/arm64
|
||||
- **User**: Runs as non-root user `deltaglider` (UID 1000)
|
||||
- **Base**: Python 3.12-slim (Debian)
|
||||
- **Dependencies**:
|
||||
- Python 3.12
|
||||
- xdelta3 (binary diff tool)
|
||||
- All Python dependencies from `pyproject.toml`
|
||||
|
||||
### Environment Variables
|
||||
|
||||
The image supports the following environment variables:
|
||||
|
||||
```bash
|
||||
# Logging
|
||||
DG_LOG_LEVEL=INFO # DEBUG, INFO, WARNING, ERROR
|
||||
|
||||
# Performance & Compression
|
||||
DG_MAX_RATIO=0.5 # Max delta/file ratio (0.0-1.0)
|
||||
|
||||
# Cache Configuration
|
||||
DG_CACHE_BACKEND=filesystem # filesystem or memory
|
||||
DG_CACHE_MEMORY_SIZE_MB=100 # Memory cache size
|
||||
DG_CACHE_ENCRYPTION_KEY= # Optional encryption key
|
||||
|
||||
# AWS Configuration
|
||||
AWS_ENDPOINT_URL= # S3 endpoint (for MinIO/LocalStack)
|
||||
AWS_ACCESS_KEY_ID= # AWS access key
|
||||
AWS_SECRET_ACCESS_KEY= # AWS secret key
|
||||
AWS_DEFAULT_REGION=us-east-1 # AWS region
|
||||
```
|
||||
|
||||
### Health Check
|
||||
|
||||
The image includes a health check that runs every 30 seconds:
|
||||
|
||||
```bash
|
||||
docker inspect --format='{{.State.Health.Status}}' <container-id>
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Build Issues
|
||||
|
||||
#### "setuptools-scm was unable to detect version"
|
||||
|
||||
**Cause**: Git metadata not available during build.
|
||||
|
||||
**Solution**: Always use the `VERSION` build arg:
|
||||
|
||||
```bash
|
||||
docker build --build-arg VERSION=6.0.2 -t beshultd/deltaglider:6.0.2 .
|
||||
```
|
||||
|
||||
#### Cache issues
|
||||
|
||||
**Cause**: Docker build cache causing stale builds.
|
||||
|
||||
**Solution**: Use `--no-cache` flag:
|
||||
|
||||
```bash
|
||||
docker build --no-cache --build-arg VERSION=6.0.2 -t beshultd/deltaglider:6.0.2 .
|
||||
```
|
||||
|
||||
### Runtime Issues
|
||||
|
||||
#### "unauthorized: access token has insufficient scopes"
|
||||
|
||||
**Cause**: Not logged in to Docker Hub or invalid credentials.
|
||||
|
||||
**Solution**:
|
||||
|
||||
```bash
|
||||
docker login
|
||||
# Enter your Docker Hub credentials
|
||||
```
|
||||
|
||||
#### "Cannot connect to MinIO/LocalStack"
|
||||
|
||||
**Cause**: Using `localhost` instead of `host.docker.internal` from inside container.
|
||||
|
||||
**Solution**: Use `host.docker.internal` for Mac/Windows or `172.17.0.1` for Linux:
|
||||
|
||||
```bash
|
||||
# Mac/Windows
|
||||
-e AWS_ENDPOINT_URL=http://host.docker.internal:9000
|
||||
|
||||
# Linux
|
||||
-e AWS_ENDPOINT_URL=http://172.17.0.1:9000
|
||||
```
|
||||
|
||||
## Docker Compose
|
||||
|
||||
For local development with MinIO:
|
||||
|
||||
```yaml
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
minio:
|
||||
image: minio/minio:latest
|
||||
ports:
|
||||
- "9000:9000"
|
||||
- "9001:9001"
|
||||
environment:
|
||||
MINIO_ROOT_USER: minioadmin
|
||||
MINIO_ROOT_PASSWORD: minioadmin
|
||||
command: server /data --console-address ":9001"
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
|
||||
deltaglider:
|
||||
image: beshultd/deltaglider:latest
|
||||
environment:
|
||||
AWS_ENDPOINT_URL: http://minio:9000
|
||||
AWS_ACCESS_KEY_ID: minioadmin
|
||||
AWS_SECRET_ACCESS_KEY: minioadmin
|
||||
AWS_DEFAULT_REGION: us-east-1
|
||||
DG_LOG_LEVEL: DEBUG
|
||||
depends_on:
|
||||
- minio
|
||||
volumes:
|
||||
- ./data:/data
|
||||
working_dir: /data
|
||||
command: ["--help"]
|
||||
```
|
||||
|
||||
Run with:
|
||||
|
||||
```bash
|
||||
docker-compose up -d
|
||||
docker-compose run --rm deltaglider ls
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Always specify version**: Use `--build-arg VERSION=x.y.z` when building
|
||||
2. **Use multi-stage builds**: Keeps final image small
|
||||
3. **Tag with semantic versions**: Follow semver (major.minor.patch)
|
||||
4. **Test before pushing**: Run integration tests locally
|
||||
5. **Use secrets**: Never hardcode credentials in images
|
||||
6. **Multi-platform builds**: Support both amd64 and arm64
|
||||
7. **Update README**: Keep Docker Hub description in sync with README.md
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- [Docker Hub Repository](https://hub.docker.com/r/beshultd/deltaglider)
|
||||
- [GitHub Repository](https://github.com/beshu-tech/deltaglider)
|
||||
- [MinIO Documentation](https://min.io/docs/minio/container/index.html)
|
||||
- [Docker Buildx Documentation](https://docs.docker.com/buildx/working-with-buildx/)
|
||||
242
docs/EC2_REGION_DETECTION.md
Normal file
242
docs/EC2_REGION_DETECTION.md
Normal file
@@ -0,0 +1,242 @@
|
||||
# EC2 Region Detection & Cost Optimization
|
||||
|
||||
DeltaGlider automatically detects when you're running on an EC2 instance and warns you about potential cross-region data transfer charges.
|
||||
|
||||
## Overview
|
||||
|
||||
When running `deltaglider migrate` on an EC2 instance, DeltaGlider:
|
||||
|
||||
1. **Detects EC2 Environment**: Uses IMDSv2 (Instance Metadata Service v2) to determine if running on EC2
|
||||
2. **Retrieves Instance Region**: Gets the actual AWS region where your EC2 instance is running
|
||||
3. **Compares Regions**: Checks if your EC2 region matches the S3 client region
|
||||
4. **Warns About Costs**: Displays clear warnings when regions don't match
|
||||
|
||||
## Why This Matters
|
||||
|
||||
**AWS Cross-Region Data Transfer Costs**:
|
||||
- **Same region**: No additional charges for data transfer
|
||||
- **Cross-region**: $0.02 per GB transferred (can add up quickly for large migrations)
|
||||
- **NAT Gateway**: Additional charges if going through NAT
|
||||
|
||||
**Example Cost Impact**:
|
||||
- Migrating 1TB from `us-east-1` EC2 → `us-west-2` S3 = ~$20 in data transfer charges
|
||||
- Same migration within same region = $0 in data transfer charges
|
||||
|
||||
## Output Examples
|
||||
|
||||
### Scenario 1: Regions Aligned (Optimal) ✅
|
||||
|
||||
```bash
|
||||
$ deltaglider migrate s3://old-bucket/ s3://new-bucket/
|
||||
EC2 Instance: us-east-1a
|
||||
S3 Client Region: us-east-1
|
||||
✓ Regions aligned - no cross-region charges
|
||||
Migrating from s3://old-bucket/
|
||||
to s3://new-bucket/
|
||||
...
|
||||
```
|
||||
|
||||
**Result**: No warnings, optimal configuration, no extra charges.
|
||||
|
||||
---
|
||||
|
||||
### Scenario 2: Auto-Detected Mismatch (INFO) ℹ️
|
||||
|
||||
```bash
|
||||
$ deltaglider migrate s3://old-bucket/ s3://new-bucket/
|
||||
EC2 Instance: us-west-2a
|
||||
S3 Client Region: us-east-1
|
||||
|
||||
ℹ️ INFO: EC2 region (us-west-2) differs from configured S3 region (us-east-1)
|
||||
Consider using --region us-west-2 to avoid cross-region charges.
|
||||
|
||||
Migrating from s3://old-bucket/
|
||||
to s3://new-bucket/
|
||||
...
|
||||
```
|
||||
|
||||
**Result**: Informational warning, suggests optimal region. User didn't explicitly set wrong region, so it's likely from their AWS config.
|
||||
|
||||
---
|
||||
|
||||
### Scenario 3: Explicit Region Override Mismatch (WARNING) ⚠️
|
||||
|
||||
```bash
|
||||
$ deltaglider migrate --region us-east-1 s3://old-bucket/ s3://new-bucket/
|
||||
EC2 Instance: us-west-2a
|
||||
S3 Client Region: us-east-1
|
||||
|
||||
⚠️ WARNING: EC2 region=us-west-2 != S3 client region=us-east-1
|
||||
Expect cross-region/NAT data charges. Align regions (set client region=us-west-2)
|
||||
before proceeding. Or drop --region for automatic region resolution.
|
||||
|
||||
Migrating from s3://old-bucket/
|
||||
to s3://new-bucket/
|
||||
...
|
||||
```
|
||||
|
||||
**Result**: Strong warning because user explicitly set the wrong region with `--region` flag. They might not realize the cost implications.
|
||||
|
||||
---
|
||||
|
||||
### Scenario 4: Not on EC2
|
||||
|
||||
```bash
|
||||
$ deltaglider migrate s3://old-bucket/ s3://new-bucket/
|
||||
S3 Client Region: us-east-1
|
||||
Migrating from s3://old-bucket/
|
||||
to s3://new-bucket/
|
||||
...
|
||||
```
|
||||
|
||||
**Result**: Simple region display, no EC2 warnings (not applicable).
|
||||
|
||||
## Configuration
|
||||
|
||||
### Disable EC2 Detection
|
||||
|
||||
If you want to disable EC2 detection (e.g., for testing or if it causes issues):
|
||||
|
||||
```bash
|
||||
export DG_DISABLE_EC2_DETECTION=true
|
||||
deltaglider migrate s3://old/ s3://new/
|
||||
```
|
||||
|
||||
Or in your script:
|
||||
|
||||
```python
|
||||
import os
|
||||
os.environ["DG_DISABLE_EC2_DETECTION"] = "true"
|
||||
```
|
||||
|
||||
### How It Works
|
||||
|
||||
DeltaGlider uses **IMDSv2** (Instance Metadata Service v2) for security; a standalone Python sketch of the same sequence follows the steps below:
|
||||
|
||||
1. **Token Request** (PUT with TTL):
|
||||
```
|
||||
PUT http://169.254.169.254/latest/api/token
|
||||
X-aws-ec2-metadata-token-ttl-seconds: 21600
|
||||
```
|
||||
|
||||
2. **Metadata Request** (GET with token):
|
||||
```
|
||||
GET http://169.254.169.254/latest/meta-data/placement/region
|
||||
X-aws-ec2-metadata-token: <token>
|
||||
```
|
||||
|
||||
3. **Fast Timeout**: 1 second timeout for non-EC2 environments (no delay if not on EC2)
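
For reference, the same two-step sequence as a standalone Python sketch (this is not DeltaGlider's internal code, just the raw IMDSv2 calls with the 1-second timeout described above):

```python
import urllib.request

IMDS = "http://169.254.169.254"

def get_ec2_region(timeout: float = 1.0) -> str | None:
    """Return the instance's region via IMDSv2, or None when not on EC2."""
    try:
        # Step 1: request a session token (PUT with TTL header)
        token_req = urllib.request.Request(
            f"{IMDS}/latest/api/token",
            method="PUT",
            headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"},
        )
        token = urllib.request.urlopen(token_req, timeout=timeout).read().decode()

        # Step 2: read the placement region using the token
        region_req = urllib.request.Request(
            f"{IMDS}/latest/meta-data/placement/region",
            headers={"X-aws-ec2-metadata-token": token},
        )
        return urllib.request.urlopen(region_req, timeout=timeout).read().decode()
    except OSError:
        # Fast failure off EC2 (or when IMDS is blocked) - graceful fallback
        return None

print(get_ec2_region() or "not on EC2")
```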
|
||||
|
||||
### Security Notes
|
||||
|
||||
- **IMDSv2 Only**: DeltaGlider uses the more secure IMDSv2, not the legacy IMDSv1
|
||||
- **No Credentials**: Only reads metadata, never accesses credentials
|
||||
- **Graceful Fallback**: Silently skips detection if IMDS unavailable
|
||||
- **No Network Impact**: Uses local-only IP (169.254.169.254), never leaves the instance
|
||||
|
||||
## Best Practices
|
||||
|
||||
### For Cost Optimization
|
||||
|
||||
1. **Same Region**: Always try to keep EC2 instance and S3 bucket in the same region
|
||||
2. **Check First**: Run with `--dry-run` to verify the setup before actual migration
|
||||
3. **Use Auto-Detection**: Don't specify `--region` unless you have a specific reason
|
||||
4. **Monitor Costs**: Use AWS Cost Explorer to track cross-region data transfer
|
||||
|
||||
### For Terraform/IaC
|
||||
|
||||
```hcl
|
||||
# Good: EC2 and S3 in the same region
# (in Terraform the region is configured on the provider, not on these resources)
provider "aws" {
  region = "us-west-2"
}

resource "aws_instance" "app" {
  # ... instance settings; inherits the provider's us-west-2 region
}

resource "aws_s3_bucket" "data" {
  # created in the provider's us-west-2 region
}
|
||||
```
|
||||
|
||||
### For Multi-Region Setups
|
||||
|
||||
If you MUST do cross-region transfers:
|
||||
|
||||
1. **Use VPC Endpoints**: Reduce NAT Gateway costs
|
||||
2. **Schedule Off-Peak**: AWS charges less during off-peak hours in some regions
|
||||
3. **Consider S3 Transfer Acceleration**: May be cheaper for very large transfers
|
||||
4. **Batch Operations**: Minimize number of API calls
|
||||
|
||||
## Technical Details
|
||||
|
||||
### EC2MetadataAdapter
|
||||
|
||||
Location: `src/deltaglider/adapters/ec2_metadata.py`
|
||||
|
||||
Key methods (illustrative usage below):
|
||||
- `is_running_on_ec2()`: Detects EC2 environment
|
||||
- `get_region()`: Returns AWS region code (e.g., "us-east-1")
|
||||
- `get_availability_zone()`: Returns AZ (e.g., "us-east-1a")
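
Illustrative usage of the adapter, assuming it can be constructed without arguments (check the module for the actual signature):

```python
from deltaglider.adapters.ec2_metadata import EC2MetadataAdapter

meta = EC2MetadataAdapter()  # assumption: no-argument constructor
if meta.is_running_on_ec2():
    print(f"Region: {meta.get_region()}, AZ: {meta.get_availability_zone()}")
else:
    print("Not running on EC2 - no cross-region check needed")
```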
|
||||
|
||||
### Region Logging
|
||||
|
||||
Location: `src/deltaglider/app/cli/aws_compat.py`
|
||||
|
||||
Function: `log_aws_region(service, region_override=False)`
|
||||
|
||||
Logic (sketched in Python below):
|
||||
- If not EC2: Show S3 region only
|
||||
- If EC2 + regions match: Green checkmark ✅
|
||||
- If EC2 + auto-detected mismatch: Blue INFO ℹ️
|
||||
- If EC2 + `--region` mismatch: Yellow WARNING ⚠️
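
The branching, sketched in Python (a simplified stand-in for the real `log_aws_region`, which also handles colors and output formatting):

```python
def region_message(ec2_region: str | None, s3_region: str, region_override: bool) -> str:
    """Simplified sketch of the warning logic."""
    if ec2_region is None:
        return f"S3 Client Region: {s3_region}"          # not on EC2
    if ec2_region == s3_region:
        return "✓ Regions aligned - no cross-region charges"
    if region_override:                                   # user passed --region explicitly
        return f"⚠️  WARNING: EC2 region={ec2_region} != S3 client region={s3_region}"
    return f"ℹ️  INFO: EC2 region ({ec2_region}) differs from configured S3 region ({s3_region})"
```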
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### "Cannot connect to IMDS"
|
||||
|
||||
**Cause**: Network policy blocks access to 169.254.169.254
|
||||
|
||||
**Solution**:
|
||||
```bash
|
||||
# Test IMDS connectivity
|
||||
TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" \
|
||||
-H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
|
||||
curl -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||
http://169.254.169.254/latest/meta-data/placement/region
|
||||
|
||||
# If it fails, disable detection
|
||||
export DG_DISABLE_EC2_DETECTION=true
|
||||
```
|
||||
|
||||
### "Wrong region detected"
|
||||
|
||||
**Cause**: Cached metadata or race condition
|
||||
|
||||
**Solution**: DeltaGlider caches metadata for performance. Restart the process to refresh.
|
||||
|
||||
### "Warning appears but I want cross-region"
|
||||
|
||||
**Cause**: You intentionally need cross-region transfer
|
||||
|
||||
**Solution**: This is just a warning, not an error. The migration will proceed. The warning helps you confirm you understand the cost implications.
|
||||
|
||||
## FAQ
|
||||
|
||||
**Q: Does this slow down my migrations?**
|
||||
A: No. EC2 detection happens once before migration starts (< 100ms). It doesn't affect migration performance.
|
||||
|
||||
**Q: What if I'm not on EC2 but the detection is slow?**
|
||||
A: The timeout is 1 second. If IMDS is unreachable, it fails fast. Disable with `DG_DISABLE_EC2_DETECTION=true`.
|
||||
|
||||
**Q: Does this work on Fargate/ECS/Lambda?**
|
||||
A: It works wherever the EC2 instance metadata endpoint is reachable (for example, ECS tasks running on EC2 instances). Fargate and Lambda do not expose the EC2 IMDS endpoint, so detection times out quickly and is skipped via the graceful fallback described above.
|
||||
|
||||
**Q: Can I use this with LocalStack/MinIO?**
|
||||
A: Yes. When using `--endpoint-url`, DeltaGlider skips EC2 detection (not applicable for non-AWS S3).
|
||||
|
||||
**Q: Will this detect VPC endpoints?**
|
||||
A: No. VPC endpoints don't change the "region" from an EC2 perspective. The warning still applies if regions don't match.
|
||||
|
||||
## Related Documentation
|
||||
|
||||
- [AWS Data Transfer Pricing](https://aws.amazon.com/ec2/pricing/on-demand/#Data_Transfer)
|
||||
- [AWS IMDSv2 Documentation](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html)
|
||||
- [S3 Transfer Costs](https://aws.amazon.com/s3/pricing/)
|
||||
342
docs/STATS_CACHING.md
Normal file
342
docs/STATS_CACHING.md
Normal file
@@ -0,0 +1,342 @@
|
||||
# Bucket Statistics Caching
|
||||
|
||||
**TL;DR**: Bucket stats are now cached in S3 with automatic validation. What took 20 minutes now takes ~100ms when the bucket hasn't changed.
|
||||
|
||||
## Overview
|
||||
|
||||
DeltaGlider's `get_bucket_stats()` operation now includes intelligent S3-based caching that dramatically improves performance for read-heavy workloads while maintaining accuracy through automatic validation.
|
||||
|
||||
## The Problem
|
||||
|
||||
Computing bucket statistics requires:
|
||||
1. **LIST operation**: Get all objects (~50-100ms per 1000 objects)
|
||||
2. **HEAD operations**: Fetch metadata for delta files (expensive!)
|
||||
- For a bucket with 10,000 delta files: 10,000 HEAD calls
|
||||
- Even with 10 parallel workers: ~1,000 sequential batches
|
||||
- At ~100ms per batch: **100+ seconds minimum**
|
||||
- With network issues or throttling: **20+ minutes** 😱
|
||||
|
||||
This made monitoring dashboards and repeated stats checks impractical.
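
The arithmetic above as a quick back-of-the-envelope helper (illustrative only; real timings vary with latency, retries, and throttling):

```python
def estimate_detailed_stats_seconds(delta_files: int, workers: int = 10,
                                    head_latency_s: float = 0.1) -> float:
    """Rough lower bound for the uncached path: HEAD calls divided by parallelism."""
    batches = delta_files / workers
    return batches * head_latency_s

print(estimate_detailed_stats_seconds(10_000))  # 100.0 seconds, before any throttling
```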
|
||||
|
||||
## The Solution
|
||||
|
||||
### S3-Based Cache with Automatic Validation
|
||||
|
||||
Statistics are cached in S3 at `.deltaglider/stats_{mode}.json` (one per mode). On every call:
|
||||
|
||||
1. **Quick LIST operation** (~50-100ms) - always performed for validation
|
||||
2. **Compare** current object_count + compressed_size with cache
|
||||
3. **If unchanged** → Return cached stats instantly ✅ (**~100ms total**)
|
||||
4. **If changed** → Recompute and update cache automatically
|
||||
|
||||
### Three Stats Modes
|
||||
|
||||
```bash
|
||||
# Quick mode (default): Fast listing-only, approximate compression metrics
|
||||
deltaglider stats my-bucket
|
||||
|
||||
# Sampled mode: One HEAD per deltaspace, balanced accuracy/speed
|
||||
deltaglider stats my-bucket --sampled
|
||||
|
||||
# Detailed mode: All HEAD calls, most accurate (slowest)
|
||||
deltaglider stats my-bucket --detailed
|
||||
```
|
||||
|
||||
Each mode has its own independent cache file.
|
||||
|
||||
## Performance
|
||||
|
||||
| Scenario | Before | After | Speedup |
|
||||
|----------|--------|-------|---------|
|
||||
| **First run** (cold cache) | 20 min | 20 min | 1x (must compute) |
|
||||
| **Bucket unchanged** (warm cache) | 20 min | **100ms** | **200x** ✨ |
|
||||
| **Bucket changed** (stale cache) | 20 min | 20 min | 1x (auto-recompute) |
|
||||
| **Dashboard monitoring** | 20 min/check | **100ms/check** | **200x** ✨ |
|
||||
|
||||
## CLI Usage
|
||||
|
||||
### Basic Usage
|
||||
|
||||
```bash
|
||||
# Use cache (default behavior)
|
||||
deltaglider stats my-bucket
|
||||
|
||||
# Force recomputation even if cache valid
|
||||
deltaglider stats my-bucket --refresh
|
||||
|
||||
# Skip cache entirely (both read and write)
|
||||
deltaglider stats my-bucket --no-cache
|
||||
|
||||
# Different modes with caching
|
||||
deltaglider stats my-bucket --sampled
|
||||
deltaglider stats my-bucket --detailed
|
||||
```
|
||||
|
||||
### Cache Control Flags
|
||||
|
||||
| Flag | Description | Use Case |
|
||||
|------|-------------|----------|
|
||||
| *(none)* | Use cache if valid | **Default** - Fast monitoring |
|
||||
| `--refresh` | Force recomputation | Updated data needed now |
|
||||
| `--no-cache` | Skip caching entirely | Testing, one-off analysis |
|
||||
| `--sampled` | Balanced mode | Good accuracy, faster than detailed |
|
||||
| `--detailed` | Most accurate mode | Analytics, reports |
|
||||
|
||||
## Python SDK Usage
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Use cache (fast, ~100ms with cache hit)
|
||||
stats = client.get_bucket_stats('releases')
|
||||
|
||||
# Force refresh (slow, recomputes everything)
|
||||
stats = client.get_bucket_stats('releases', refresh_cache=True)
|
||||
|
||||
# Skip cache entirely
|
||||
stats = client.get_bucket_stats('releases', use_cache=False)
|
||||
|
||||
# Different modes with caching
|
||||
stats = client.get_bucket_stats('releases', mode='quick') # Fast
|
||||
stats = client.get_bucket_stats('releases', mode='sampled') # Balanced
|
||||
stats = client.get_bucket_stats('releases', mode='detailed') # Accurate
|
||||
```
|
||||
|
||||
## Cache Structure
|
||||
|
||||
Cache files are stored at `.deltaglider/stats_{mode}.json` in your bucket:
|
||||
|
||||
```json
|
||||
{
|
||||
"version": "1.0",
|
||||
"mode": "quick",
|
||||
"computed_at": "2025-10-14T10:30:00Z",
|
||||
"validation": {
|
||||
"object_count": 1523,
|
||||
"compressed_size": 1234567890
|
||||
},
|
||||
"stats": {
|
||||
"bucket": "releases",
|
||||
"object_count": 1523,
|
||||
"total_size": 50000000000,
|
||||
"compressed_size": 1234567890,
|
||||
"space_saved": 48765432110,
|
||||
"average_compression_ratio": 0.9753,
|
||||
"delta_objects": 1500,
|
||||
"direct_objects": 23
|
||||
}
|
||||
}
|
||||
```
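
Because the cache is just an object in your bucket, you can inspect it with any S3 client. A hedged example using plain boto3 and the sample bucket name from above:

```python
import json
import boto3

s3 = boto3.client("s3")
obj = s3.get_object(Bucket="releases", Key=".deltaglider/stats_quick.json")
cache = json.loads(obj["Body"].read())

print("Computed at:", cache["computed_at"])
print("Objects at compute time:", cache["validation"]["object_count"])
print(f"Space saved: {cache['stats']['space_saved'] / 1e9:.1f} GB")
```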
|
||||
|
||||
## How Validation Works
|
||||
|
||||
**Smart Staleness Detection**:
|
||||
1. Always perform quick LIST operation (required anyway, ~50-100ms)
|
||||
2. Calculate current `object_count` and `compressed_size` from LIST
|
||||
3. Compare with cached values
|
||||
4. If **both match** → Cache valid, return instantly
|
||||
5. If **either differs** → Bucket changed, recompute automatically
|
||||
|
||||
This catches:
|
||||
- ✅ Objects added (count increases)
|
||||
- ✅ Objects removed (count decreases)
|
||||
- ✅ Objects replaced (size changes)
|
||||
- ✅ Content modified (size changes)
|
||||
|
||||
**Edge Case**: If only metadata changes (tags, headers) but not content/count/size, cache remains valid. This is acceptable since metadata changes are rare and don't affect core statistics.
|
||||
|
||||
## Use Cases
|
||||
|
||||
### ✅ Perfect For
|
||||
|
||||
1. **Monitoring Dashboards**
|
||||
- Check stats every minute
|
||||
- Bucket rarely changes
|
||||
- **20 min → 100ms per check** ✨
|
||||
|
||||
2. **CI/CD Status Checks**
|
||||
- Verify upload success
|
||||
- Check compression effectiveness
|
||||
- Near-instant feedback
|
||||
|
||||
3. **Repeated Analysis**
|
||||
- Multiple stats queries during investigation
|
||||
- Cache persists across sessions
|
||||
- Huge time savings
|
||||
|
||||
### ⚠️ Less Beneficial For
|
||||
|
||||
1. **Write-Heavy Buckets**
|
||||
- Bucket changes on every check
|
||||
- Cache always stale
|
||||
- **No benefit, but no harm either** (graceful degradation)
|
||||
|
||||
2. **One-Off Queries**
|
||||
- Single stats check
|
||||
- Cache doesn't help (cold cache)
|
||||
- Still works normally
|
||||
|
||||
## Cache Management
|
||||
|
||||
### Automatic Management
|
||||
|
||||
- **Creation**: Automatic on first `get_bucket_stats()` call
|
||||
- **Validation**: Automatic on every call (always current)
|
||||
- **Updates**: Automatic when bucket changes
|
||||
- **Cleanup**: Not needed (cache files are tiny ~1-10KB)
|
||||
|
||||
### Manual Management
|
||||
|
||||
```bash
|
||||
# View cache files
|
||||
deltaglider ls s3://my-bucket/.deltaglider/
|
||||
|
||||
# Delete cache manually (will be recreated automatically)
|
||||
deltaglider rm s3://my-bucket/.deltaglider/stats_quick.json
|
||||
deltaglider rm s3://my-bucket/.deltaglider/stats_sampled.json
|
||||
deltaglider rm s3://my-bucket/.deltaglider/stats_detailed.json
|
||||
|
||||
# Or delete entire .deltaglider prefix
|
||||
deltaglider rm -r s3://my-bucket/.deltaglider/
|
||||
```
|
||||
|
||||
## Technical Details
|
||||
|
||||
### Cache Files
|
||||
|
||||
- **Location**: `.deltaglider/` prefix in each bucket
|
||||
- **Naming**: `stats_{mode}.json` (quick, sampled, detailed)
|
||||
- **Size**: ~1-10KB per file
|
||||
- **Format**: JSON with version, mode, validation data, and stats
|
||||
|
||||
### Validation Logic
|
||||
|
||||
```python
|
||||
def is_cache_valid(cached, current):
|
||||
"""Cache is valid if object count and size unchanged."""
|
||||
return (
|
||||
cached['object_count'] == current['object_count'] and
|
||||
cached['compressed_size'] == current['compressed_size']
|
||||
)
|
||||
```
|
||||
|
||||
### Error Handling
|
||||
|
||||
Cache operations are **non-fatal**:
|
||||
- ✅ Cache read fails → Compute normally, log warning
|
||||
- ✅ Cache write fails → Return computed stats, log warning
|
||||
- ✅ Corrupted cache → Ignore, recompute, overwrite
|
||||
- ✅ Version mismatch → Ignore, recompute with new version
|
||||
- ✅ Permission denied → Log warning, continue without caching
|
||||
|
||||
**The stats operation never fails due to cache issues.**
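
The shape of that guarantee, sketched (illustrative only, not DeltaGlider's actual code):

```python
import logging

logger = logging.getLogger("deltaglider.stats")

def write_cache_best_effort(put_object, key: str, payload: dict) -> None:
    """Persist the stats cache if possible; never let a failure surface to the caller."""
    try:
        put_object(key, payload)
    except Exception as exc:  # permission denied, network error, corrupted state, ...
        logger.warning("Stats cache write failed (%s); continuing without cache", exc)
```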
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
Potential improvements for the future:
|
||||
|
||||
1. **TTL-Based Expiration**: Auto-refresh after N hours even if unchanged
|
||||
2. **Cache Cleanup Command**: `deltaglider cache clear` for manual invalidation
|
||||
3. **Cache Statistics**: Show hit/miss rates, staleness info
|
||||
4. **Async Cache Updates**: Background refresh for very large buckets
|
||||
5. **Cross-Bucket Cache**: Share reference data across related buckets
|
||||
|
||||
## Comparison with Old Implementation
|
||||
|
||||
| Aspect | Old (In-Memory) | New (S3-Based) |
|
||||
|--------|----------------|----------------|
|
||||
| **Storage** | Process memory | S3 bucket |
|
||||
| **Persistence** | Lost on restart | Survives restarts |
|
||||
| **Sharing** | Per-process | Shared across all clients |
|
||||
| **Validation** | None | Automatic on every call |
|
||||
| **Staleness** | Always fresh | Automatically detected |
|
||||
| **Use Case** | Single session | Monitoring, dashboards |
|
||||
|
||||
## Examples
|
||||
|
||||
### Example 1: Monitoring Dashboard
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
import time
|
||||
|
||||
client = create_client()
|
||||
|
||||
while True:
|
||||
# Fast stats check (~100ms with cache)
|
||||
stats = client.get_bucket_stats('releases')
|
||||
print(f"Objects: {stats.object_count}, "
|
||||
f"Compression: {stats.average_compression_ratio:.1%}")
|
||||
|
||||
time.sleep(60) # Check every minute
|
||||
|
||||
# First run: 20 min (computes and caches)
|
||||
# All subsequent runs: ~100ms (cache hit)
|
||||
```
|
||||
|
||||
### Example 2: CI/CD Pipeline
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Upload new release
|
||||
client.upload("v2.0.0.zip", "s3://releases/v2.0.0/")
|
||||
|
||||
# Quick verification (fast with cache)
|
||||
stats = client.get_bucket_stats('releases')
|
||||
if stats.average_compression_ratio < 0.90:
|
||||
print("Warning: Lower than expected compression")
|
||||
```
|
||||
|
||||
### Example 3: Force Fresh Stats
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Force recomputation for accurate report
|
||||
stats = client.get_bucket_stats(
|
||||
'releases',
|
||||
mode='detailed',
|
||||
refresh_cache=True
|
||||
)
|
||||
|
||||
print(f"Accurate compression report:")
|
||||
print(f" Original: {stats.total_size / 1e9:.1f} GB")
|
||||
print(f" Stored: {stats.compressed_size / 1e9:.1f} GB")
|
||||
print(f" Saved: {stats.space_saved / 1e9:.1f} GB ({stats.average_compression_ratio:.1%})")
|
||||
```
|
||||
|
||||
## FAQ
|
||||
|
||||
**Q: Does caching affect accuracy?**
|
||||
A: No! Cache is automatically validated on every call. If the bucket changed, stats are recomputed automatically.
|
||||
|
||||
**Q: What if I need fresh stats immediately?**
|
||||
A: Use `--refresh` flag (CLI) or `refresh_cache=True` (SDK) to force recomputation.
|
||||
|
||||
**Q: Can I disable caching?**
|
||||
A: Yes, use `--no-cache` flag (CLI) or `use_cache=False` (SDK).
|
||||
|
||||
**Q: How much space do cache files use?**
|
||||
A: ~1-10KB per mode, negligible for any bucket.
|
||||
|
||||
**Q: What happens if cache write fails?**
|
||||
A: The operation continues normally - computed stats are returned and a warning is logged. Caching is optional and non-fatal.
|
||||
|
||||
**Q: Do I need to clean up cache files?**
|
||||
A: No, they're tiny and automatically managed. But you can delete `.deltaglider/` prefix if desired.
|
||||
|
||||
**Q: Does cache work across different modes?**
|
||||
A: Each mode (quick, sampled, detailed) has its own independent cache file.
|
||||
|
||||
---
|
||||
|
||||
**Implementation**: See [PR #XX] for complete implementation details and test coverage.
|
||||
|
||||
**Related**: [SDK Documentation](sdk/README.md) | [CLI Reference](../README.md#cli-reference) | [Architecture](sdk/architecture.md)
|
||||
@@ -9,7 +9,11 @@ DeltaGlider provides AWS S3 CLI compatible commands with automatic delta compres
|
||||
- `deltaglider ls [s3_url]` - List buckets and objects
|
||||
- `deltaglider rm <s3_url>` - Remove objects
|
||||
- `deltaglider sync <source> <destination>` - Synchronize directories
|
||||
- `deltaglider migrate <source> <destination>` - Migrate S3 buckets with compression and EC2 cost warnings
|
||||
- `deltaglider stats <bucket>` - Get bucket statistics and compression metrics
|
||||
- `deltaglider verify <s3_url>` - Verify file integrity
|
||||
- `deltaglider put-bucket-acl <bucket>` - Set bucket ACL (s3api compatible)
|
||||
- `deltaglider get-bucket-acl <bucket>` - Get bucket ACL (s3api compatible)
|
||||
|
||||
### Current Usage Examples
|
||||
```bash
|
||||
@@ -21,6 +25,14 @@ deltaglider cp s3://bucket/path/to/file.zip .
|
||||
|
||||
# Verify integrity
|
||||
deltaglider verify s3://bucket/path/to/file.zip.delta
|
||||
|
||||
# Set bucket ACL
|
||||
deltaglider put-bucket-acl my-bucket --acl public-read
|
||||
deltaglider put-bucket-acl my-bucket --acl private
|
||||
deltaglider put-bucket-acl my-bucket --grant-read id=12345
|
||||
|
||||
# Get bucket ACL
|
||||
deltaglider get-bucket-acl my-bucket
|
||||
```
|
||||
|
||||
## Target State: AWS S3 CLI Compatibility
|
||||
|
||||
@@ -1,347 +1,76 @@
|
||||
# Case Study: How ReadOnlyREST Reduced Storage Costs by 99.9% with DeltaGlider
|
||||
## How ReadonlyREST Cut 4TB of S3 Storage Down to 5GB (and Saved 99.9%)
|
||||
|
||||
## Executive Summary
|
||||
### TL;DR
|
||||
|
||||
**The Challenge**: ReadOnlyREST, a security plugin for Elasticsearch, was facing exponential storage costs managing 145 release versions across multiple product lines, consuming nearly 4TB of S3 storage.
|
||||
We were paying to store 4TB of mostly identical plugin builds.
|
||||
DeltaGlider deduplicated everything down to 4.9GB — 99.9% smaller, $1.1k/year cheaper, and no workflow changes.
|
||||
|
||||
**The Solution**: DeltaGlider, an intelligent delta compression system that reduced storage from 4,060GB to just 4.9GB.
|
||||
#### The Problem
|
||||
|
||||
**The Impact**:
|
||||
- 💰 **$1,119 annual savings** on storage costs
|
||||
- 📉 **99.9% reduction** in storage usage
|
||||
- ⚡ **Zero changes** to existing workflows
|
||||
- ✅ **Full data integrity** maintained
|
||||
ReadonlyREST supports ~150 Elasticsearch/Kibana versions × multiple product lines × all our own releases.
|
||||
After years of publishing builds, our S3 archive hit `4TB` (201,840 files, $93/month).
|
||||
Glacier helped, but restoring files took 48 hours — useless for CI/CD.
|
||||
|
||||
---
|
||||
Every plugin ZIP was ~82MB, but `99.7% identical` to the next one. We were paying to store duplicates.
|
||||
|
||||
## The Storage Crisis
|
||||
#### The Fix: DeltaGlider
|
||||
|
||||
### The Numbers That Kept Us Up at Night
|
||||
DeltaGlider stores binary deltas between similar files instead of full copies.
|
||||
|
||||
ReadOnlyREST maintains a comprehensive release archive:
|
||||
- **145 version folders** (v1.50.0 through v1.66.1)
|
||||
- **201,840 total files** to manage
|
||||
- **3.96 TB** of S3 storage consumed
|
||||
- **$1,120/year** in storage costs alone
|
||||
|
||||
Each version folder contained:
|
||||
- 513 plugin ZIP files (one for each Elasticsearch version)
|
||||
- 879 checksum files (SHA1 and SHA512)
|
||||
- 3 product lines (Enterprise, Pro, Free)
|
||||
|
||||
### The Hidden Problem
|
||||
|
||||
What made this particularly painful wasn't just the size—it was the **redundancy**. Each 82.5MB plugin ZIP was 99.7% identical to others in the same version, differing only in minor Elasticsearch compatibility adjustments. We were essentially storing the same data hundreds of times.
|
||||
|
||||
> "We were paying to store 4TB of data that was fundamentally just variations of the same ~250MB of unique content. It felt like photocopying War and Peace 500 times because each copy had a different page number."
|
||||
>
|
||||
> — *DevOps Lead*
|
||||
|
||||
---
|
||||
|
||||
## Enter DeltaGlider
|
||||
|
||||
### The Lightbulb Moment
|
||||
|
||||
The breakthrough came when we realized we didn't need to store complete files—just the *differences* between them. DeltaGlider applies this principle automatically:
|
||||
|
||||
1. **First file becomes the reference** (stored in full)
|
||||
2. **Similar files store only deltas** (typically 0.3% of original size)
|
||||
3. **Different files uploaded directly** (no delta overhead)
|
||||
|
||||
### Implementation: Surprisingly Simple
|
||||
|
||||
```bash
|
||||
# Before DeltaGlider (standard S3 upload)
|
||||
aws s3 cp readonlyrest-1.66.1_es8.0.0.zip s3://releases/
|
||||
# Size on S3: 82.5MB
|
||||
|
||||
# With DeltaGlider
|
||||
deltaglider cp readonlyrest-1.66.1_es8.0.0.zip s3://releases/
|
||||
# Size on S3: 65KB (99.92% smaller!)
|
||||
# Before
|
||||
```
|
||||
aws s3 cp readonlyrest-1.66.1_es8.0.0.zip s3://releases/ # 82MB
|
||||
```
|
||||
|
||||
The beauty? **Zero changes to our build pipeline**. DeltaGlider works as a drop-in replacement for S3 uploads.
|
||||
|
||||
---
|
||||
|
||||
## The Results: Beyond Our Expectations
|
||||
|
||||
### Storage Transformation
|
||||
|
||||
# After
|
||||
```
|
||||
BEFORE DELTAGLIDER AFTER DELTAGLIDER
|
||||
━━━━━━━━━━━━━━━━━ ━━━━━━━━━━━━━━━━
|
||||
4,060 GB (3.96 TB) → 4.9 GB
|
||||
$93.38/month → $0.11/month
|
||||
201,840 files → 201,840 files (same!)
|
||||
deltaglider cp readonlyrest-1.66.1_es8.0.0.zip s3://releases/ # 65KB
|
||||
```
|
||||
|
||||
### Real Performance Metrics
|
||||
Drop-in replacement for `aws s3 cp`. No pipeline changes.
|
||||
Data integrity checked with SHA256, stored as metadata in S3.
|
||||
|
||||
From our actual production deployment:
|
||||
|
||||
| Metric | Value | Impact |
|
||||
|--------|-------|--------|
|
||||
| **Compression Ratio** | 99.9% | Near-perfect deduplication |
|
||||
| **Delta Size** | ~65KB per 82.5MB file | 1/1,269th of original |
|
||||
| **Upload Speed** | 3-4 files/second | Faster than raw S3 uploads |
|
||||
| **Download Speed** | Transparent reconstruction | No user impact |
|
||||
| **Storage Savings** | 4,055 GB | Enough for 850,000 more files |
|
||||
### The Result
|
||||
|
||||
### Version-to-Version Comparison
|
||||
| Metric | Before | After | Δ |
|
||||
|-------------- |----------|----------|--------------|
|
||||
| Storage | 4.06TB | 4.9GB | -99.9% |
|
||||
| Cost | $93/mo | $0.11/mo | -$1,119/yr |
|
||||
| Files | 201,840 | 201,840 | identical |
|
||||
| Upload speed | 1x | 3–4x | faster |
|
||||
|
||||
Testing between similar versions showed incredible efficiency:
|
||||
Each “different” ZIP? Just a 65KB delta.
|
||||
Reconstruction time: <100ms.
|
||||
Zero user impact.
|
||||
|
||||
|
||||
## Under the Hood
|
||||
|
||||
Uses xdelta3 diffs.
|
||||
• Keeps one reference per group
|
||||
• Stores deltas for near-identical files
|
||||
• Skips small or text-based ones (.sha, .json, etc.)
|
||||
|
||||
It’s smart enough to decide what’s worth diffing automatically.
|
||||
|
||||
|
||||
## Payoff
|
||||
• 4TB → 5GB overnight
|
||||
• Uploads 1,200× faster
|
||||
• CI bandwidth cut 99%
|
||||
• 100% checksum verified integrity
|
||||
• Zero vendor lock-in (open source)
|
||||
|
||||
## Takeaways
|
||||
|
||||
If You Ship Versioned Artifacts
|
||||
|
||||
This will probably save you four figures and hours of upload time per year.
|
||||
|
||||
```
|
||||
readonlyrest-1.66.1_es7.17.0.zip (82.5MB) → reference.bin (82.5MB)
|
||||
readonlyrest-1.66.1_es7.17.1.zip (82.5MB) → 64KB delta (0.08% size)
|
||||
readonlyrest-1.66.1_es7.17.2.zip (82.5MB) → 65KB delta (0.08% size)
|
||||
...
|
||||
readonlyrest-1.66.1_es8.15.0.zip (82.5MB) → 71KB delta (0.09% size)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Technical Deep Dive
|
||||
|
||||
### How DeltaGlider Achieves 99.9% Compression
|
||||
|
||||
DeltaGlider uses binary diff algorithms (xdelta3) to identify and store only the bytes that change between files:
|
||||
|
||||
```python
|
||||
# Simplified concept
|
||||
reference = "readonlyrest-1.66.1_es7.17.0.zip" # 82.5MB
|
||||
new_file = "readonlyrest-1.66.1_es7.17.1.zip" # 82.5MB
|
||||
|
||||
delta = binary_diff(reference, new_file) # 65KB
|
||||
# Delta contains only:
|
||||
# - Elasticsearch version string changes
|
||||
# - Compatibility metadata updates
|
||||
# - Build timestamp differences
|
||||
```
|
||||
|
||||
### Intelligent File Type Detection
|
||||
|
||||
Not every file benefits from delta compression. DeltaGlider automatically:
|
||||
|
||||
- **Applies delta compression to**: `.zip`, `.tar`, `.gz`, `.dmg`, `.jar`, `.war`
|
||||
- **Uploads directly**: `.txt`, `.sha1`, `.sha512`, `.json`, `.md`
|
||||
|
||||
This intelligence meant our 127,455 checksum files were uploaded directly, avoiding unnecessary processing overhead.
|
||||
|
||||
### Architecture That Scales
|
||||
|
||||
```
|
||||
┌─────────────┐ ┌──────────────┐ ┌─────────────┐
|
||||
│ Client │────▶│ DeltaGlider │────▶│ S3/MinIO │
|
||||
│ (CI/CD) │ │ │ │ │
|
||||
└─────────────┘ └──────────────┘ └─────────────┘
|
||||
│
|
||||
┌──────▼───────┐
|
||||
│ Local Cache │
|
||||
│ (References) │
|
||||
└──────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Business Impact
|
||||
|
||||
### Immediate ROI
|
||||
|
||||
- **Day 1**: 99.9% storage reduction
|
||||
- **Month 1**: $93 saved
|
||||
- **Year 1**: $1,119 saved
|
||||
- **5 Years**: $5,595 saved (not counting growth)
|
||||
|
||||
### Hidden Benefits We Didn't Expect
|
||||
|
||||
1. **Faster Deployments**: Uploading 65KB deltas is 1,200x faster than 82.5MB files
|
||||
2. **Reduced Bandwidth**: CI/CD pipeline bandwidth usage dropped 99%
|
||||
3. **Improved Reliability**: Fewer timeout errors on large file uploads
|
||||
4. **Better Compliance**: Automatic SHA256 integrity verification on every operation
|
||||
|
||||
### Environmental Impact
|
||||
|
||||
> "Reducing storage by 4TB means fewer drives spinning in data centers. It's a small contribution to our sustainability goals, but every bit counts."
|
||||
>
|
||||
> — *CTO*
|
||||
|
||||
---
|
||||
|
||||
## Implementation Journey
|
||||
|
||||
### Week 1: Proof of Concept
|
||||
- Tested with 10 files
|
||||
- Achieved 99.6% compression
|
||||
- Decision to proceed
|
||||
|
||||
### Week 2: Production Rollout
|
||||
- Uploaded all 201,840 files
|
||||
- Zero errors or failures
|
||||
- Immediate cost reduction
|
||||
|
||||
### Week 3: Integration
|
||||
```bash
|
||||
# Simple integration into our CI/CD
|
||||
- aws s3 cp $FILE s3://releases/
|
||||
+ deltaglider cp $FILE s3://releases/
|
||||
```
|
||||
|
||||
### Week 4: Full Migration
|
||||
- All build pipelines updated
|
||||
- Developer documentation completed
|
||||
- Monitoring dashboards configured
|
||||
|
||||
---
|
||||
|
||||
## Lessons Learned
|
||||
|
||||
### What Worked Well
|
||||
|
||||
1. **Drop-in replacement**: No architectural changes needed
|
||||
2. **Automatic intelligence**: File type detection "just worked"
|
||||
3. **Preservation of structure**: Directory hierarchy maintained perfectly
|
||||
|
||||
### Challenges Overcome
|
||||
|
||||
1. **Initial skepticism**: "99.9% compression sounds too good to be true"
|
||||
- *Solution*: Live demonstration with real data
|
||||
|
||||
2. **Download concerns**: "Will it be slow to reconstruct files?"
|
||||
- *Solution*: Benchmarking showed <100ms reconstruction time
|
||||
|
||||
3. **Reliability questions**: "What if the reference file is corrupted?"
|
||||
- *Solution*: SHA256 verification on every operation
|
||||
|
||||
---

## For Decision Makers

### Why This Matters

Storage costs scale linearly with data growth. Without DeltaGlider:

- Next 145 versions: Additional $1,120/year
- 5-year projection: $11,200 in storage alone
- Opportunity cost: Resources that could fund innovation

### Risk Assessment

| Risk | Mitigation | Status |
|------|------------|--------|
| Vendor lock-in | Open-source, standards-based | ✅ Mitigated |
| Data corruption | SHA256 verification built-in | ✅ Mitigated |
| Performance impact | Faster than original | ✅ No risk |
| Complexity | Drop-in replacement | ✅ No risk |

### Strategic Advantages

1. **Cost Predictability**: Storage costs become negligible
2. **Scalability**: Can handle 100x more versions in same space
3. **Competitive Edge**: More resources for product development
4. **Green IT**: Reduced carbon footprint from storage

---

## For Engineers

### Getting Started

```bash
# Install DeltaGlider
pip install deltaglider

# Upload a file (automatic compression)
deltaglider cp my-release-v1.0.0.zip s3://releases/

# Download (automatic reconstruction)
deltaglider cp s3://releases/my-release-v1.0.0.zip .

# It's that simple.
deltaglider cp my-release.zip s3://releases/
```

### Performance Characteristics

```python
# Compression ratios by similarity
identical_files: 99.9%        # Same file, different name
minor_changes: 99.7%          # Version bumps, timestamps
moderate_changes: 95.0%       # Feature additions
major_changes: 70.0%          # Significant refactoring
completely_different: 0%      # No compression (uploaded as-is)
```

### Integration Examples

**GitHub Actions**:
```yaml
- name: Upload Release
  run: deltaglider cp dist/*.zip s3://releases/${{ github.ref_name }}/
```

**Jenkins Pipeline**:
```groovy
sh "deltaglider cp ${WORKSPACE}/target/*.jar s3://artifacts/"
```

**Python Script**:
```python
from deltaglider import DeltaService
service = DeltaService(bucket="releases")
service.put("my-app-v2.0.0.zip", "v2.0.0/")
```

---

## The Bottom Line

DeltaGlider transformed our storage crisis into a solved problem:

- ✅ **4TB → 5GB** storage reduction
- ✅ **$1,119/year** saved
- ✅ **Zero** workflow disruption
- ✅ **100%** data integrity maintained

For ReadOnlyREST, DeltaGlider wasn't just a cost-saving tool—it was a glimpse into the future of intelligent storage. When 99.9% of your data is redundant, why pay to store it 500 times?

---

## Next Steps

### For Your Organization

1. **Identify similar use cases**: Version releases, backups, build artifacts
2. **Run the calculator**: `[Your files] × [Versions] × [Similarity] = Savings` (see the sketch below)
3. **Start small**: Test with one project's releases
4. **Scale confidently**: Deploy across all similar data
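
A back-of-the-envelope version of that calculator, independent of the repository's own tooling (the artifact size, version count, and similarity below are placeholder assumptions; substitute your own measurements):

```python
# Rough storage-savings estimate for near-duplicate versioned artifacts.
# This is a hedged sketch of the formula above, not DeltaGlider's own calculator.

def estimated_savings_gb(artifact_size_gb: float, versions: int, similarity: float) -> float:
    """Approximate space saved when each new version is stored as a delta.

    `similarity` is the redundant fraction of each version (0.0-1.0);
    the first version is always stored in full.
    """
    stored_in_full = artifact_size_gb * versions
    stored_as_deltas = artifact_size_gb + artifact_size_gb * (1 - similarity) * (versions - 1)
    return stored_in_full - stored_as_deltas

# Example assumptions: 8 GB release archives, 500 versions, 99.9% similarity
print(f"~{estimated_savings_gb(8.0, 500, 0.999):,.0f} GB saved")
```
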
### Get Started Today

```bash
# See your potential savings
git clone https://github.com/beshu-tech/deltaglider
cd deltaglider
python calculate_savings.py --path /your/releases

# Try it yourself
docker run -p 9000:9000 minio/minio  # Local S3
pip install deltaglider
deltaglider cp your-file.zip s3://test/
```

---

## About ReadOnlyREST

ReadOnlyREST is the enterprise security plugin for Elasticsearch and OpenSearch, protecting clusters in production since 2015. Learn more at [readonlyrest.com](https://readonlyrest.com)

## About DeltaGlider

DeltaGlider is an open-source delta compression system for S3-compatible storage, turning redundant data into remarkable savings. Built with modern Python, containerized for portability, and designed for scale.

---

*"In a world where storage is cheap but not free, and data grows exponentially but changes incrementally, DeltaGlider represents a fundamental shift in how we think about storing versioned artifacts."*

**— ReadOnlyREST Engineering Team**

That's it.
*(Binary image updated: 267 KiB → 4.0 MiB; not shown.)*
@@ -1,6 +1,6 @@
|
||||
# DeltaGlider Python SDK Documentation
|
||||
|
||||
The DeltaGlider Python SDK provides a **boto3-compatible API for core S3 operations** (~20% of methods covering 80% of use cases), while achieving 99%+ compression for versioned artifacts through intelligent binary delta compression.
|
||||
The DeltaGlider Python SDK provides a **boto3-compatible API for core S3 operations** (~20% of methods covering 80% of use cases), while achieving 99%+ compression for very similar versioned artifacts through intelligent binary delta compression.
|
||||
|
||||
## 🎯 Key Highlights
|
||||
|
||||
@@ -38,17 +38,29 @@ response = client.get_object(Bucket='releases', Key='v1.0.0/app.zip')
|
||||
# Optimized list_objects with smart performance defaults (NEW!)
|
||||
# Fast by default - no unnecessary metadata fetching
|
||||
response = client.list_objects(Bucket='releases', Prefix='v1.0.0/')
|
||||
for obj in response['Contents']:
|
||||
print(f"{obj['Key']}: {obj['Size']} bytes")
|
||||
|
||||
# Pagination for large buckets
|
||||
response = client.list_objects(Bucket='releases', MaxKeys=100,
|
||||
ContinuationToken=response.next_continuation_token)
|
||||
response = client.list_objects(Bucket='releases', MaxKeys=100)
|
||||
while response.get('IsTruncated'):
|
||||
# Process current page
|
||||
for obj in response['Contents']:
|
||||
print(obj['Key'])
|
||||
# Get next page
|
||||
response = client.list_objects(
|
||||
Bucket='releases',
|
||||
MaxKeys=100,
|
||||
ContinuationToken=response.get('NextContinuationToken')
|
||||
)
|
||||
|
||||
# Get detailed compression stats only when needed
|
||||
response = client.list_objects(Bucket='releases', FetchMetadata=True) # Slower but detailed
|
||||
|
||||
# Quick bucket statistics
|
||||
stats = client.get_bucket_stats('releases') # Fast overview
|
||||
stats = client.get_bucket_stats('releases', detailed_stats=True) # With compression metrics
|
||||
# Bucket statistics with intelligent S3-based caching (NEW!)
|
||||
stats = client.get_bucket_stats('releases') # Fast (~100ms with cache)
|
||||
stats = client.get_bucket_stats('releases', mode='detailed') # Accurate compression metrics
|
||||
stats = client.get_bucket_stats('releases', refresh_cache=True) # Force fresh computation
|
||||
|
||||
client.delete_object(Bucket='releases', Key='old-version.zip')
|
||||
```
|
||||
@@ -101,6 +113,8 @@ client.put_object(Bucket='mybucket', Key='myfile.zip', Body=data)
|
||||
- **Data Integrity**: SHA256 verification on every operation
|
||||
- **Transparent**: Works with existing tools and workflows
|
||||
- **Production Ready**: Battle-tested with 200K+ files
|
||||
- **Thoroughly Tested**: 99 integration/unit tests with comprehensive coverage
|
||||
- **Type Safe**: Full mypy type checking, zero type errors
|
||||
|
||||
## When to Use DeltaGlider
|
||||
|
||||
@@ -192,10 +206,17 @@ from deltaglider import create_client
|
||||
client = create_client(
|
||||
endpoint_url="http://minio.internal:9000", # Custom S3 endpoint
|
||||
log_level="DEBUG", # Detailed logging
|
||||
cache_dir="/var/cache/deltaglider", # Custom cache location
|
||||
aws_access_key_id="minio",
|
||||
aws_secret_access_key="minio",
|
||||
region_name="eu-west-1",
|
||||
max_ratio=0.3, # Stricter delta acceptance
|
||||
)
|
||||
```
|
||||
|
||||
> ℹ️ The SDK now manages an encrypted, process-isolated cache automatically in `/tmp/deltaglider-*`.
|
||||
> Tune cache behavior via environment variables such as `DG_CACHE_BACKEND`,
|
||||
> `DG_CACHE_MEMORY_SIZE_MB`, and `DG_CACHE_ENCRYPTION_KEY` instead of passing a `cache_dir` argument.
|
||||
|
||||
## Real-World Example
|
||||
|
||||
```python
|
||||
@@ -285,4 +306,4 @@ url = client.generate_presigned_url(
|
||||
|
||||
## License
|
||||
|
||||
MIT License - See [LICENSE](https://github.com/beshu-tech/deltaglider/blob/main/LICENSE) for details.
|
||||
MIT License - See [LICENSE](https://github.com/beshu-tech/deltaglider/blob/main/LICENSE) for details.
|
||||
|
||||
docs/sdk/api.md
@@ -21,7 +21,6 @@ Factory function to create a configured DeltaGlider client with sensible default
|
||||
def create_client(
|
||||
endpoint_url: Optional[str] = None,
|
||||
log_level: str = "INFO",
|
||||
cache_dir: str = "/tmp/.deltaglider/cache",
|
||||
**kwargs
|
||||
) -> DeltaGliderClient
|
||||
```
|
||||
@@ -30,11 +29,12 @@ def create_client(
|
||||
|
||||
- **endpoint_url** (`Optional[str]`): S3 endpoint URL for MinIO, R2, or other S3-compatible storage. If None, uses AWS S3.
|
||||
- **log_level** (`str`): Logging verbosity level. Options: "DEBUG", "INFO", "WARNING", "ERROR". Default: "INFO".
|
||||
- **cache_dir** (`str`): Directory for local reference cache. Default: "/tmp/.deltaglider/cache".
|
||||
- **kwargs**: Additional arguments passed to `DeltaService`:
|
||||
- **tool_version** (`str`): Version string for metadata. Default: "deltaglider/0.1.0"
|
||||
- **max_ratio** (`float`): Maximum acceptable delta/file ratio. Default: 0.5
|
||||
|
||||
**Security Note**: DeltaGlider automatically uses ephemeral, process-isolated cache (`/tmp/deltaglider-*`) that is cleaned up on exit. No configuration needed.
|
||||
|
||||
#### Returns
|
||||
|
||||
`DeltaGliderClient`: Configured client instance ready for use.
|
||||
@@ -48,11 +48,8 @@ client = create_client()
|
||||
# Custom endpoint for MinIO
|
||||
client = create_client(endpoint_url="http://localhost:9000")
|
||||
|
||||
# Debug mode with custom cache
|
||||
client = create_client(
|
||||
log_level="DEBUG",
|
||||
cache_dir="/var/cache/deltaglider"
|
||||
)
|
||||
# Debug mode
|
||||
client = create_client(log_level="DEBUG")
|
||||
|
||||
# Custom delta ratio threshold
|
||||
client = create_client(max_ratio=0.3) # Only use delta if <30% of original
|
||||
@@ -94,7 +91,7 @@ def list_objects(
|
||||
StartAfter: Optional[str] = None,
|
||||
FetchMetadata: bool = False,
|
||||
**kwargs
|
||||
) -> ListObjectsResponse
|
||||
) -> dict[str, Any]
|
||||
```
|
||||
|
||||
##### Parameters
|
||||
@@ -117,19 +114,32 @@ The method intelligently optimizes performance by:
|
||||
2. Only fetching metadata for delta files when explicitly requested
|
||||
3. Supporting efficient pagination for large buckets
|
||||
|
||||
##### Returns
|
||||
|
||||
boto3-compatible dict with:
|
||||
- **Contents** (`list[dict]`): List of S3Object dicts with Key, Size, LastModified, Metadata
|
||||
- **CommonPrefixes** (`list[dict]`): Optional list of common prefixes (folders)
|
||||
- **IsTruncated** (`bool`): Whether more results are available
|
||||
- **NextContinuationToken** (`str`): Token for next page
|
||||
- **KeyCount** (`int`): Number of keys returned
|
||||
|
||||
##### Examples
|
||||
|
||||
```python
|
||||
# Fast listing for UI display (no metadata fetching)
|
||||
response = client.list_objects(Bucket='releases')
|
||||
for obj in response['Contents']:
|
||||
print(f"{obj['Key']}: {obj['Size']} bytes")
|
||||
|
||||
# Paginated listing for large buckets
|
||||
response = client.list_objects(Bucket='releases', MaxKeys=100)
|
||||
while response.is_truncated:
|
||||
while response.get('IsTruncated'):
|
||||
for obj in response['Contents']:
|
||||
print(obj['Key'])
|
||||
response = client.list_objects(
|
||||
Bucket='releases',
|
||||
MaxKeys=100,
|
||||
ContinuationToken=response.next_continuation_token
|
||||
ContinuationToken=response.get('NextContinuationToken')
|
||||
)
|
||||
|
||||
# Get detailed compression stats (slower, only for analytics)
|
||||
@@ -137,37 +147,73 @@ response = client.list_objects(
|
||||
Bucket='releases',
|
||||
FetchMetadata=True # Only fetches for delta files
|
||||
)
|
||||
for obj in response['Contents']:
|
||||
metadata = obj.get('Metadata', {})
|
||||
if metadata.get('deltaglider-is-delta') == 'true':
|
||||
compression = metadata.get('deltaglider-compression-ratio', 'unknown')
|
||||
print(f"{obj['Key']}: {compression} compression")
|
||||
```
|
||||
|
||||
#### `get_bucket_stats`
|
||||
|
||||
Get statistics for a bucket with optional detailed compression metrics.
|
||||
Get statistics for a bucket with optional detailed compression metrics. Results are cached inside the bucket for performance.
|
||||
|
||||
```python
|
||||
def get_bucket_stats(
|
||||
self,
|
||||
bucket: str,
|
||||
detailed_stats: bool = False
|
||||
mode: Literal["quick", "sampled", "detailed"] = "quick",
|
||||
use_cache: bool = True,
|
||||
refresh_cache: bool = False,
|
||||
) -> BucketStats
|
||||
```
|
||||
|
||||
##### Parameters
|
||||
|
||||
- **bucket** (`str`): S3 bucket name.
|
||||
- **detailed_stats** (`bool`): If True, fetch accurate compression ratios for delta files. Default: False.
|
||||
- With `detailed_stats=False`: ~50ms for any bucket size (LIST calls only)
|
||||
- With `detailed_stats=True`: ~2-3s per 1000 objects (adds HEAD calls for delta files)
|
||||
- **mode** (`Literal[...]`): Accuracy/cost trade-off:
|
||||
- `"quick"` (default): LIST-only scan; compression ratios for deltas are estimated.
|
||||
- `"sampled"`: HEAD one delta per deltaspace and reuse the ratio.
|
||||
- `"detailed"`: HEAD every delta object; slowest but exact.
|
||||
- **use_cache** (`bool`): If True, read/write `.deltaglider/stats_{mode}.json` in the bucket for reuse.
|
||||
- **refresh_cache** (`bool`): Force recomputation even if a cache file is valid.
|
||||
|
||||
##### Caching Behavior
|
||||
|
||||
- Stats are cached per mode directly inside the bucket at `.deltaglider/stats_{mode}.json`.
|
||||
- Every call validates cache freshness via a quick LIST (object count + compressed size).
|
||||
- `refresh_cache=True` skips cache validation and recomputes immediately.
|
||||
- `use_cache=False` bypasses both reading and writing cache artifacts.
|
||||
|
||||
##### Returns
|
||||
|
||||
`BucketStats`: Dataclass containing:
|
||||
- **bucket** (`str`): Bucket name
|
||||
- **object_count** (`int`): Total number of objects
|
||||
- **total_size** (`int`): Original size in bytes (before compression)
|
||||
- **compressed_size** (`int`): Actual stored size in bytes
|
||||
- **space_saved** (`int`): Bytes saved through compression
|
||||
- **average_compression_ratio** (`float`): Average compression ratio (0.0-1.0)
|
||||
- **delta_objects** (`int`): Number of delta-compressed objects
|
||||
- **direct_objects** (`int`): Number of directly stored objects
|
||||
|
||||
##### Examples
|
||||
|
||||
```python
|
||||
# Quick stats for dashboard display
|
||||
# Quick stats (fast LIST-only)
|
||||
stats = client.get_bucket_stats('releases')
|
||||
print(f"Objects: {stats.object_count}, Size: {stats.total_size}")
|
||||
|
||||
# Detailed stats for analytics (slower but accurate)
|
||||
stats = client.get_bucket_stats('releases', detailed_stats=True)
|
||||
print(f"Compression ratio: {stats.average_compression_ratio:.1%}")
|
||||
# Sampled/detailed modes for analytics
|
||||
sampled = client.get_bucket_stats('releases', mode='sampled')
|
||||
detailed = client.get_bucket_stats('releases', mode='detailed')
|
||||
print(f"Compression ratio: {detailed.average_compression_ratio:.1%}")
|
||||
|
||||
# Force refresh if an external tool modified the bucket
|
||||
fresh = client.get_bucket_stats('releases', mode='quick', refresh_cache=True)
|
||||
|
||||
# Skip cache entirely when running ad-hoc diagnostics
|
||||
uncached = client.get_bucket_stats('releases', use_cache=False)
|
||||
```
|
||||
|
||||
#### `put_object`
|
||||
@@ -300,15 +346,18 @@ def list_buckets(
|
||||
|
||||
##### Returns
|
||||
|
||||
Dict with list of buckets and owner information (identical to boto3).
|
||||
Dict with the same structure boto3 returns (`Buckets`, `Owner`, `ResponseMetadata`). DeltaGlider does not inject additional metadata; use `get_bucket_stats()` for compression data.
|
||||
|
||||
##### Examples
|
||||
|
||||
```python
|
||||
# List all buckets
|
||||
response = client.list_buckets()
|
||||
for bucket in response['Buckets']:
|
||||
print(f"{bucket['Name']} - Created: {bucket['CreationDate']}")
|
||||
|
||||
# Combine with get_bucket_stats for deeper insights
|
||||
stats = client.get_bucket_stats('releases', mode='detailed')
|
||||
print(f"releases -> {stats.object_count} objects, {stats.space_saved/(1024**3):.2f} GB saved")
|
||||
```
|
||||
|
||||
### Simple API Methods
|
||||
@@ -445,20 +494,54 @@ else:
|
||||
# Re-upload or investigate
|
||||
```
|
||||
|
||||
#### `lifecycle_policy`
|
||||
### Cache Management Methods
|
||||
|
||||
Set lifecycle policy for S3 prefix (placeholder for future implementation).
|
||||
#### `clear_cache`
|
||||
|
||||
Clear all locally cached reference files.
|
||||
|
||||
```python
|
||||
def lifecycle_policy(
|
||||
self,
|
||||
s3_prefix: str,
|
||||
days_before_archive: int = 30,
|
||||
days_before_delete: int = 90
|
||||
) -> None
|
||||
def clear_cache(self) -> None
|
||||
```
|
||||
|
||||
**Note**: This method is a placeholder for future S3 lifecycle policy management.
|
||||
##### Description
|
||||
|
||||
Removes all cached reference files from the local filesystem. Useful for:
|
||||
- Freeing disk space in long-running applications
|
||||
- Ensuring the next upload/download fetches fresh references from S3
|
||||
- Resetting cache after configuration or credential changes
|
||||
- Testing and development workflows
|
||||
|
||||
##### Cache Scope
|
||||
|
||||
- **Reference Cache**: Binary reference files stored in `/tmp/deltaglider-*/`
|
||||
- Encrypted at rest with ephemeral keys
|
||||
- Content-addressed storage (SHA256-based filenames)
|
||||
- Automatically cleaned up on process exit
|
||||
- **Statistics Cache**: Stored inside the bucket as `.deltaglider/stats_{mode}.json`.
|
||||
  - `clear_cache()` does *not* remove these S3 objects; use `refresh_cache=True` or delete the objects manually if needed (see the deletion sketch after the examples below).
|
||||
|
||||
##### Examples
|
||||
|
||||
```python
|
||||
# Long-running application
|
||||
client = create_client()
|
||||
|
||||
# Work with files
|
||||
for i in range(1000):
|
||||
client.upload(f"file_{i}.zip", "s3://bucket/")
|
||||
|
||||
# Periodic cache cleanup to prevent disk buildup
|
||||
if i % 100 == 0:
|
||||
client.clear_cache()
|
||||
|
||||
# Force fresh statistics after external changes (skip cache instead of clearing)
|
||||
stats_before = client.get_bucket_stats('releases')
|
||||
stats_after = client.get_bucket_stats('releases', refresh_cache=True)
|
||||
|
||||
# Development workflow
|
||||
client.clear_cache() # Start with clean state
|
||||
```
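
If the in-bucket statistics cache itself needs to go away (for example before handing the bucket to a tool that should not see DeltaGlider artifacts), the cache files can be deleted like any other objects. A minimal sketch, assuming a `client` created via `create_client()` as in the examples above, an example bucket name, and the documented `.deltaglider/stats_{mode}.json` key layout:

```python
# Remove cached statistics objects; the next get_bucket_stats() call recomputes them.
for mode in ("quick", "sampled", "detailed"):
    client.delete_object(Bucket="releases", Key=f".deltaglider/stats_{mode}.json")
```

For a one-off refresh, `refresh_cache=True` is usually enough and avoids the extra DELETE calls.
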
## UploadSummary
|
||||
|
||||
@@ -708,9 +791,10 @@ DeltaGlider respects these environment variables:
|
||||
### DeltaGlider Configuration
|
||||
|
||||
- **DG_LOG_LEVEL**: Logging level (DEBUG, INFO, WARNING, ERROR)
|
||||
- **DG_CACHE_DIR**: Local cache directory
|
||||
- **DG_MAX_RATIO**: Default maximum delta ratio
|
||||
|
||||
**Note**: Cache is automatically managed (ephemeral, process-isolated) and requires no configuration.
|
||||
|
||||
### Example
|
||||
|
||||
```bash
|
||||
@@ -721,10 +805,9 @@ export AWS_SECRET_ACCESS_KEY=minioadmin
|
||||
|
||||
# Configure DeltaGlider
|
||||
export DG_LOG_LEVEL=DEBUG
|
||||
export DG_CACHE_DIR=/var/cache/deltaglider
|
||||
export DG_MAX_RATIO=0.3
|
||||
|
||||
# Now use normally
|
||||
# Now use normally (cache managed automatically)
|
||||
python my_script.py
|
||||
```
|
||||
|
||||
@@ -816,4 +899,4 @@ client = create_client(log_level="DEBUG")
|
||||
|
||||
- **GitHub Issues**: [github.com/beshu-tech/deltaglider/issues](https://github.com/beshu-tech/deltaglider/issues)
|
||||
- **Documentation**: [github.com/beshu-tech/deltaglider](https://github.com/beshu-tech/deltaglider)
|
||||
- **PyPI Package**: [pypi.org/project/deltaglider](https://pypi.org/project/deltaglider)
|
||||
- **PyPI Package**: [pypi.org/project/deltaglider](https://pypi.org/project/deltaglider)
|
||||
|
||||
@@ -5,15 +5,17 @@ Real-world examples and patterns for using DeltaGlider in production application
|
||||
## Table of Contents
|
||||
|
||||
1. [Performance-Optimized Bucket Listing](#performance-optimized-bucket-listing)
|
||||
2. [Bucket Management](#bucket-management)
|
||||
3. [Software Release Management](#software-release-management)
|
||||
4. [Database Backup System](#database-backup-system)
|
||||
5. [CI/CD Pipeline Integration](#cicd-pipeline-integration)
|
||||
6. [Container Registry Storage](#container-registry-storage)
|
||||
7. [Machine Learning Model Versioning](#machine-learning-model-versioning)
|
||||
8. [Game Asset Distribution](#game-asset-distribution)
|
||||
9. [Log Archive Management](#log-archive-management)
|
||||
10. [Multi-Region Replication](#multi-region-replication)
|
||||
2. [Bucket Statistics and Monitoring](#bucket-statistics-and-monitoring)
|
||||
3. [Session-Level Cache Management](#session-level-cache-management)
|
||||
4. [Bucket Management](#bucket-management)
|
||||
5. [Software Release Management](#software-release-management)
|
||||
6. [Database Backup System](#database-backup-system)
|
||||
7. [CI/CD Pipeline Integration](#cicd-pipeline-integration)
|
||||
8. [Container Registry Storage](#container-registry-storage)
|
||||
9. [Machine Learning Model Versioning](#machine-learning-model-versioning)
|
||||
10. [Game Asset Distribution](#game-asset-distribution)
|
||||
11. [Log Archive Management](#log-archive-management)
|
||||
12. [Multi-Region Replication](#multi-region-replication)
|
||||
|
||||
## Performance-Optimized Bucket Listing
|
||||
|
||||
@@ -23,6 +25,7 @@ DeltaGlider's smart `list_objects` method eliminates the N+1 query problem by in
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
from deltaglider.client_models import BucketStats
|
||||
import time
|
||||
|
||||
client = create_client()
|
||||
@@ -39,19 +42,19 @@ def fast_bucket_listing(bucket: str):
|
||||
|
||||
# Process objects for display
|
||||
items = []
|
||||
for obj in response.contents:
|
||||
for obj in response['Contents']:
|
||||
metadata = obj.get("Metadata", {})
|
||||
items.append({
|
||||
"key": obj.key,
|
||||
"size": obj.size,
|
||||
"last_modified": obj.last_modified,
|
||||
"is_delta": obj.is_delta, # Determined from filename
|
||||
# No compression_ratio - would require HEAD request
|
||||
"key": obj["Key"],
|
||||
"size": obj["Size"],
|
||||
"last_modified": obj["LastModified"],
|
||||
"is_delta": metadata.get("deltaglider-is-delta") == "true",
|
||||
})
|
||||
|
||||
elapsed = time.time() - start
|
||||
print(f"Listed {len(items)} objects in {elapsed*1000:.0f}ms")
|
||||
|
||||
return items, response.next_continuation_token
|
||||
return items, response.get("NextContinuationToken")
|
||||
|
||||
# Example: List first page
|
||||
items, next_token = fast_bucket_listing('releases')
|
||||
@@ -73,12 +76,12 @@ def paginated_listing(bucket: str, page_size: int = 50):
|
||||
FetchMetadata=False # Keep it fast
|
||||
)
|
||||
|
||||
all_objects.extend(response.contents)
|
||||
all_objects.extend(response["Contents"])
|
||||
|
||||
if not response.is_truncated:
|
||||
if not response.get("IsTruncated"):
|
||||
break
|
||||
|
||||
continuation_token = response.next_continuation_token
|
||||
continuation_token = response.get("NextContinuationToken")
|
||||
print(f"Fetched {len(all_objects)} objects so far...")
|
||||
|
||||
return all_objects
|
||||
@@ -94,8 +97,8 @@ print(f"Total objects: {len(all_objects)}")
|
||||
def dashboard_with_stats(bucket: str):
|
||||
"""Dashboard view with optional detailed stats."""
|
||||
|
||||
# Quick overview (fast - no metadata)
|
||||
stats = client.get_bucket_stats(bucket, detailed_stats=False)
|
||||
# Quick overview (fast LIST-only)
|
||||
stats = client.get_bucket_stats(bucket)
|
||||
|
||||
print(f"Quick Stats for {bucket}:")
|
||||
print(f" Total Objects: {stats.object_count}")
|
||||
@@ -106,7 +109,7 @@ def dashboard_with_stats(bucket: str):
|
||||
|
||||
# Detailed compression analysis (slower - fetches metadata for deltas only)
|
||||
if stats.delta_objects > 0:
|
||||
detailed_stats = client.get_bucket_stats(bucket, detailed_stats=True)
|
||||
detailed_stats = client.get_bucket_stats(bucket, mode='detailed')
|
||||
print(f"\nDetailed Compression Stats:")
|
||||
print(f" Average Compression: {detailed_stats.average_compression_ratio:.1%}")
|
||||
print(f" Space Saved: {detailed_stats.space_saved / (1024**3):.2f} GB")
|
||||
@@ -129,11 +132,25 @@ def compression_analysis(bucket: str, prefix: str = ""):
|
||||
)
|
||||
|
||||
# Analyze compression effectiveness
|
||||
delta_files = [obj for obj in response.contents if obj.is_delta]
|
||||
delta_files: list[dict[str, float | int | str]] = []
|
||||
for obj in response["Contents"]:
|
||||
metadata = obj.get("Metadata", {})
|
||||
if metadata.get("deltaglider-is-delta") != "true":
|
||||
continue
|
||||
original_size = int(metadata.get("deltaglider-original-size", obj["Size"]))
|
||||
compression_ratio = float(metadata.get("deltaglider-compression-ratio", 0.0))
|
||||
delta_files.append(
|
||||
{
|
||||
"key": obj["Key"],
|
||||
"original": original_size,
|
||||
"compressed": obj["Size"],
|
||||
"ratio": compression_ratio,
|
||||
}
|
||||
)
|
||||
|
||||
if delta_files:
|
||||
total_original = sum(obj.original_size for obj in delta_files)
|
||||
total_compressed = sum(obj.compressed_size for obj in delta_files)
|
||||
total_original = sum(obj["original"] for obj in delta_files)
|
||||
total_compressed = sum(obj["compressed"] for obj in delta_files)
|
||||
avg_ratio = (total_original - total_compressed) / total_original
|
||||
|
||||
print(f"Compression Analysis for {prefix or 'all files'}:")
|
||||
@@ -143,11 +160,11 @@ def compression_analysis(bucket: str, prefix: str = ""):
|
||||
print(f" Average Compression: {avg_ratio:.1%}")
|
||||
|
||||
# Find best and worst compression
|
||||
best = max(delta_files, key=lambda x: x.compression_ratio or 0)
|
||||
worst = min(delta_files, key=lambda x: x.compression_ratio or 1)
|
||||
best = max(delta_files, key=lambda x: x["ratio"])
|
||||
worst = min(delta_files, key=lambda x: x["ratio"])
|
||||
|
||||
print(f" Best Compression: {best.key} ({best.compression_ratio:.1%})")
|
||||
print(f" Worst Compression: {worst.key} ({worst.compression_ratio:.1%})")
|
||||
print(f" Best Compression: {best['key']} ({best['ratio']:.1%})")
|
||||
print(f" Worst Compression: {worst['key']} ({worst['ratio']:.1%})")
|
||||
|
||||
# Example: Analyze v2.0 releases
|
||||
compression_analysis('releases', 'v2.0/')
|
||||
@@ -178,7 +195,11 @@ def performance_comparison(bucket: str):
|
||||
)
|
||||
time_detailed = (time.time() - start) * 1000
|
||||
|
||||
delta_count = sum(1 for obj in response_fast.contents if obj.is_delta)
|
||||
delta_count = sum(
|
||||
1
|
||||
for obj in response_fast["Contents"]
|
||||
if obj.get("Metadata", {}).get("deltaglider-is-delta") == "true"
|
||||
)
|
||||
|
||||
print(f"Performance Comparison for {bucket}:")
|
||||
print(f" Fast Listing: {time_fast:.0f}ms (1 API call)")
|
||||
@@ -199,6 +220,291 @@ performance_comparison('releases')
|
||||
|
||||
2. **Never Fetch for Non-Deltas**: The SDK automatically skips metadata fetching for non-delta files even when `FetchMetadata=True`.
|
||||
|
||||
## Bucket Statistics and Monitoring
|
||||
|
||||
DeltaGlider provides powerful bucket statistics with S3-backed caching for performance.
|
||||
|
||||
### Quick Dashboard Stats (Cached)
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
def show_bucket_dashboard(bucket: str):
|
||||
"""Display real-time bucket statistics with caching."""
|
||||
|
||||
# First call: computes stats (~50ms)
|
||||
stats = client.get_bucket_stats(bucket)
|
||||
|
||||
# Second call: instant (cached)
|
||||
stats = client.get_bucket_stats(bucket)
|
||||
|
||||
print(f"Dashboard for {stats.bucket}")
|
||||
print(f"=" * 60)
|
||||
print(f"Total Objects: {stats.object_count:,}")
|
||||
print(f" Delta Objects: {stats.delta_objects:,}")
|
||||
print(f" Direct Objects: {stats.direct_objects:,}")
|
||||
print()
|
||||
print(f"Original Size: {stats.total_size / (1024**3):.2f} GB")
|
||||
print(f"Stored Size: {stats.compressed_size / (1024**3):.2f} GB")
|
||||
print(f"Space Saved: {stats.space_saved / (1024**3):.2f} GB")
|
||||
print(f"Compression Ratio: {stats.average_compression_ratio:.1%}")
|
||||
|
||||
# Example: Show stats for multiple buckets (each cached separately)
|
||||
for bucket_name in ['releases', 'backups', 'archives']:
|
||||
show_bucket_dashboard(bucket_name)
|
||||
```
|
||||
|
||||
### Detailed Compression Analysis
|
||||
|
||||
```python
|
||||
def detailed_compression_report(bucket: str):
|
||||
"""Generate detailed compression report with accurate ratios."""
|
||||
|
||||
# Detailed stats fetch metadata for delta files (slower, accurate)
|
||||
stats = client.get_bucket_stats(bucket, mode='detailed')
|
||||
|
||||
efficiency = (stats.space_saved / stats.total_size * 100) if stats.total_size > 0 else 0
|
||||
|
||||
print(f"Detailed Compression Report: {stats.bucket}")
|
||||
print(f"=" * 60)
|
||||
print(f"Object Distribution:")
|
||||
print(f" Total: {stats.object_count:,}")
|
||||
print(f" Delta-Compressed: {stats.delta_objects:,} ({stats.delta_objects/stats.object_count*100:.1f}%)")
|
||||
print(f" Direct Storage: {stats.direct_objects:,} ({stats.direct_objects/stats.object_count*100:.1f}%)")
|
||||
print()
|
||||
print(f"Storage Efficiency:")
|
||||
print(f" Original Data: {stats.total_size / (1024**3):.2f} GB")
|
||||
print(f" Actual Storage: {stats.compressed_size / (1024**3):.2f} GB")
|
||||
print(f" Space Saved: {stats.space_saved / (1024**3):.2f} GB")
|
||||
print(f" Efficiency: {efficiency:.1f}%")
|
||||
print(f" Avg Compression: {stats.average_compression_ratio:.2%}")
|
||||
|
||||
# Calculate estimated monthly costs (example: $0.023/GB S3 Standard)
|
||||
cost_without = stats.total_size / (1024**3) * 0.023
|
||||
cost_with = stats.compressed_size / (1024**3) * 0.023
|
||||
monthly_savings = cost_without - cost_with
|
||||
|
||||
print()
|
||||
print(f"Estimated Monthly S3 Costs ($0.023/GB):")
|
||||
print(f" Without DeltaGlider: ${cost_without:.2f}")
|
||||
print(f" With DeltaGlider: ${cost_with:.2f}")
|
||||
print(f" Monthly Savings: ${monthly_savings:.2f}")
|
||||
|
||||
# Example: Detailed report
|
||||
detailed_compression_report('releases')
|
||||
```
|
||||
|
||||
### List Buckets with Cached Stats
|
||||
|
||||
```python
|
||||
def list_buckets_with_stats():
|
||||
"""List buckets and augment with cached stats fetched on demand."""
|
||||
|
||||
response = client.list_buckets()
|
||||
stats_cache: dict[str, BucketStats | None] = {}
|
||||
|
||||
def ensure_stats(bucket_name: str) -> BucketStats | None:
|
||||
if bucket_name not in stats_cache:
|
||||
try:
|
||||
stats_cache[bucket_name] = client.get_bucket_stats(bucket_name)
|
||||
except Exception:
|
||||
stats_cache[bucket_name] = None
|
||||
return stats_cache[bucket_name]
|
||||
|
||||
print("All Buckets:")
|
||||
print(f"{'Name':<30} {'Objects':<10} {'Compression':<15} {'Cached'}")
|
||||
print("=" * 70)
|
||||
|
||||
for bucket in response['Buckets']:
|
||||
name = bucket['Name']
|
||||
stats = ensure_stats(name)
|
||||
|
||||
if stats:
|
||||
obj_count = f"{stats.object_count:,}"
|
||||
compression = f"{stats.average_compression_ratio:.1%}"
|
||||
cached = "✓ (S3 cache)"
|
||||
else:
|
||||
obj_count = "N/A"
|
||||
compression = "N/A"
|
||||
cached = "✗"
|
||||
|
||||
print(f"{name:<30} {obj_count:<10} {compression:<15} {cached}")
|
||||
|
||||
# Example: List with stats
|
||||
list_buckets_with_stats()
|
||||
```
|
||||
|
||||
### Monitoring Dashboard (Real-Time)
|
||||
|
||||
```python
|
||||
import time
|
||||
|
||||
def monitoring_dashboard(buckets: list[str], refresh_seconds: int = 60):
|
||||
"""Real-time monitoring dashboard with periodic refresh."""
|
||||
|
||||
while True:
|
||||
print("\033[2J\033[H") # Clear screen
|
||||
print(f"DeltaGlider Monitoring Dashboard - {time.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
print("=" * 80)
|
||||
|
||||
for bucket_name in buckets:
|
||||
# Get cached stats (instant) or compute fresh
|
||||
stats = client.get_bucket_stats(bucket_name)
|
||||
|
||||
print(f"\n{bucket_name}:")
|
||||
print(f" Objects: {stats.object_count:,} | "
|
||||
f"Delta: {stats.delta_objects:,} | "
|
||||
f"Direct: {stats.direct_objects:,}")
|
||||
print(f" Size: {stats.compressed_size/(1024**3):.2f} GB | "
|
||||
f"Saved: {stats.space_saved/(1024**3):.2f} GB | "
|
||||
f"Compression: {stats.average_compression_ratio:.1%}")
|
||||
|
||||
print(f"\n{'=' * 80}")
|
||||
print(f"Refreshing in {refresh_seconds} seconds... (Ctrl+C to exit)")
|
||||
|
||||
time.sleep(refresh_seconds)
|
||||
|
||||
# Clear cache for fresh data on next iteration
|
||||
client.clear_cache()
|
||||
|
||||
# Example: Monitor key buckets
|
||||
try:
|
||||
monitoring_dashboard(['releases', 'backups', 'archives'], refresh_seconds=30)
|
||||
except KeyboardInterrupt:
|
||||
print("\nMonitoring stopped.")
|
||||
```
|
||||
|
||||
## Session-Level Cache Management
|
||||
|
||||
DeltaGlider maintains an encrypted reference cache for optimal performance in long-running applications.
|
||||
|
||||
### Long-Running Application Pattern
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
import time
|
||||
|
||||
def long_running_upload_service():
|
||||
"""Upload service with periodic cache cleanup."""
|
||||
|
||||
client = create_client()
|
||||
processed_count = 0
|
||||
|
||||
while True:
|
||||
# Simulate file processing
|
||||
files_to_upload = get_pending_files() # Your file queue
|
||||
|
||||
for file_path in files_to_upload:
|
||||
try:
|
||||
summary = client.upload(file_path, "s3://releases/")
|
||||
processed_count += 1
|
||||
|
||||
print(f"Uploaded {file_path}: {summary.savings_percent:.0f}% saved")
|
||||
|
||||
# Periodic cache cleanup (every 100 files)
|
||||
if processed_count % 100 == 0:
|
||||
client.clear_cache()
|
||||
print(f"Cache cleared after {processed_count} files")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error uploading {file_path}: {e}")
|
||||
|
||||
time.sleep(60) # Check for new files every minute
|
||||
|
||||
# Example: Run upload service
|
||||
# long_running_upload_service()
|
||||
```
|
||||
|
||||
### Cache Invalidation After External Changes
|
||||
|
||||
```python
|
||||
def handle_external_bucket_changes(bucket: str):
|
||||
"""Refresh statistics after external tools modify bucket."""
|
||||
|
||||
# Get initial stats (cached)
|
||||
stats_before = client.get_bucket_stats(bucket)
|
||||
print(f"Before: {stats_before.object_count} objects")
|
||||
|
||||
# External process modifies bucket
|
||||
print("External backup tool running...")
|
||||
run_external_backup_tool(bucket) # Your external tool
|
||||
|
||||
# Force a recompute of the cached stats
|
||||
stats_after = client.get_bucket_stats(bucket, refresh_cache=True)
|
||||
print(f"After: {stats_after.object_count} objects")
|
||||
print(f"Added: {stats_after.object_count - stats_before.object_count} objects")
|
||||
|
||||
# Example usage
|
||||
handle_external_bucket_changes('backups')
|
||||
```
|
||||
|
||||
### Testing with Clean Cache
|
||||
|
||||
```python
|
||||
import pytest
|
||||
from deltaglider import create_client
|
||||
|
||||
def test_upload_workflow():
|
||||
"""Test with clean cache state."""
|
||||
|
||||
client = create_client()
|
||||
client.clear_cache() # Start with clean state
|
||||
|
||||
# Test first upload (no reference exists)
|
||||
summary1 = client.upload("file1.zip", "s3://test-bucket/prefix/")
|
||||
assert not summary1.is_delta # First file is reference
|
||||
|
||||
# Test subsequent upload (uses cached reference)
|
||||
summary2 = client.upload("file2.zip", "s3://test-bucket/prefix/")
|
||||
assert summary2.is_delta # Should use delta
|
||||
|
||||
# Clear and test again
|
||||
client.clear_cache()
|
||||
summary3 = client.upload("file3.zip", "s3://test-bucket/prefix/")
|
||||
assert summary3.is_delta # Still delta (reference in S3)
|
||||
|
||||
# Run test
|
||||
# test_upload_workflow()
|
||||
```
|
||||
|
||||
### Cache Performance Monitoring
|
||||
|
||||
```python
|
||||
import time
|
||||
|
||||
def measure_cache_performance(bucket: str):
|
||||
"""Measure performance impact of caching."""
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Test 1: Cold cache
|
||||
start = time.time()
|
||||
stats1 = client.get_bucket_stats(bucket, mode='detailed', refresh_cache=True)
|
||||
cold_time = (time.time() - start) * 1000
|
||||
|
||||
# Test 2: Warm cache
|
||||
start = time.time()
|
||||
stats2 = client.get_bucket_stats(bucket, mode='detailed')
|
||||
warm_time = (time.time() - start) * 1000
|
||||
|
||||
# Test 3: Quick stats from detailed cache
|
||||
start = time.time()
|
||||
stats3 = client.get_bucket_stats(bucket, mode='quick')
|
||||
reuse_time = (time.time() - start) * 1000
|
||||
|
||||
print(f"Cache Performance for {bucket}:")
|
||||
print(f" Cold Cache (detailed): {cold_time:.0f}ms")
|
||||
print(f" Warm Cache (detailed): {warm_time:.0f}ms")
|
||||
print(f" Cache Reuse (quick): {reuse_time:.0f}ms")
|
||||
print(f" Speedup (detailed): {cold_time/warm_time:.1f}x")
|
||||
print(f" Speedup (reuse): {cold_time/reuse_time:.1f}x")
|
||||
|
||||
# Example: Measure cache performance
|
||||
measure_cache_performance('releases')
|
||||
```
|
||||
|
||||
3. **Use Pagination**: For large buckets, use `MaxKeys` and `ContinuationToken` to paginate results.
|
||||
|
||||
4. **Cache Results**: If you need metadata frequently, consider caching the results to avoid repeated HEAD requests.
|
||||
@@ -1389,4 +1695,4 @@ files_to_upload = [
|
||||
results = uploader.upload_batch(files_to_upload)
|
||||
```
|
||||
|
||||
These examples demonstrate real-world usage patterns for DeltaGlider across various domains. Each example includes error handling, monitoring, and best practices for production deployments.
|
||||
These examples demonstrate real-world usage patterns for DeltaGlider across various domains. Each example includes error handling, monitoring, and best practices for production deployments.
|
||||
|
||||
@@ -69,6 +69,42 @@ Or via environment variable:
|
||||
export AWS_ENDPOINT_URL=http://minio.local:9000
|
||||
```
|
||||
|
||||
### DeltaGlider Configuration
|
||||
|
||||
DeltaGlider supports the following environment variables:
|
||||
|
||||
**Logging & Performance**:
|
||||
- `DG_LOG_LEVEL`: Logging level (default: `INFO`, options: `DEBUG`, `INFO`, `WARNING`, `ERROR`)
|
||||
- `DG_MAX_RATIO`: Maximum delta/file ratio (default: `0.5`, range: `0.0-1.0`)
|
||||
- **See [DG_MAX_RATIO.md](../DG_MAX_RATIO.md) for complete tuning guide**
|
||||
- Controls when to use delta compression vs. direct storage
|
||||
- Lower (0.2-0.3) = conservative, only high-quality compression
|
||||
- Higher (0.6-0.7) = permissive, accept modest savings
|
||||
|
||||
**Cache Configuration**:
|
||||
- `DG_CACHE_BACKEND`: Cache backend type (default: `filesystem`, options: `filesystem`, `memory`)
|
||||
- `DG_CACHE_MEMORY_SIZE_MB`: Memory cache size in MB (default: `100`)
|
||||
- `DG_CACHE_ENCRYPTION_KEY`: Optional base64-encoded Fernet key for persistent encryption
|
||||
|
||||
**Security**:
|
||||
- Encryption is **always enabled** (cannot be disabled)
|
||||
- Ephemeral encryption keys per process (forward secrecy)
|
||||
- Corrupted cache files automatically deleted
|
||||
- Set `DG_CACHE_ENCRYPTION_KEY` only for cross-process cache sharing (a key-generation sketch follows the example below)
|
||||
|
||||
**Example**:
|
||||
```bash
|
||||
# Use memory cache for faster performance in CI/CD
|
||||
export DG_CACHE_BACKEND=memory
|
||||
export DG_CACHE_MEMORY_SIZE_MB=500
|
||||
|
||||
# Enable debug logging
|
||||
export DG_LOG_LEVEL=DEBUG
|
||||
|
||||
# Adjust delta compression threshold
|
||||
export DG_MAX_RATIO=0.3 # More aggressive compression
|
||||
```
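
If you do opt into a persistent key for cross-process cache sharing, any valid Fernet key works as the value of `DG_CACHE_ENCRYPTION_KEY`. A minimal sketch for generating one, using the `cryptography` package that DeltaGlider already depends on:

```python
# Print a base64-encoded Fernet key suitable for DG_CACHE_ENCRYPTION_KEY.
from cryptography.fernet import Fernet

print(Fernet.generate_key().decode())
```

Store the printed value in your secrets manager and export it as `DG_CACHE_ENCRYPTION_KEY` in every process that should share the cache.
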
## Your First Upload
|
||||
|
||||
### Basic Example
|
||||
|
||||
examples/boto3_compatible_types.py (new file)
@@ -0,0 +1,64 @@
|
||||
"""Example: Using boto3-compatible responses without importing boto3.
|
||||
|
||||
This demonstrates how DeltaGlider provides full type safety and boto3 compatibility
|
||||
without requiring boto3 imports in user code.
|
||||
|
||||
As of v5.0.0, DeltaGlider returns plain dicts (not custom dataclasses) that are
|
||||
100% compatible with boto3 S3 responses. You get IDE autocomplete through TypedDict
|
||||
type hints without any runtime overhead.
|
||||
"""
|
||||
|
||||
from deltaglider import ListObjectsV2Response, S3Object, create_client
|
||||
|
||||
# Create client (no boto3 import needed!)
|
||||
client = create_client()
|
||||
|
||||
# Type hints work perfectly without boto3
|
||||
def process_files(bucket: str, prefix: str) -> None:
|
||||
"""Process files in S3 with full type safety."""
|
||||
# Return type is fully typed - IDE autocomplete works!
|
||||
response: ListObjectsV2Response = client.list_objects(
|
||||
Bucket=bucket, Prefix=prefix, Delimiter="/"
|
||||
)
|
||||
|
||||
# Response is a plain dict - 100% boto3-compatible
|
||||
# TypedDict provides autocomplete and type checking
|
||||
for obj in response["Contents"]:
|
||||
# obj is typed as S3Object - all fields have autocomplete!
|
||||
key: str = obj["Key"] # ✅ IDE knows this is str
|
||||
size: int = obj["Size"] # ✅ IDE knows this is int
|
||||
print(f"{key}: {size} bytes")
|
||||
|
||||
# DeltaGlider metadata is in the standard Metadata field
|
||||
metadata = obj.get("Metadata", {})
|
||||
if metadata.get("deltaglider-is-delta") == "true":
|
||||
compression = metadata.get("deltaglider-compression-ratio", "unknown")
|
||||
print(f" └─ Delta file (compression: {compression})")
|
||||
|
||||
# Optional fields work too
|
||||
for prefix_dict in response.get("CommonPrefixes", []):
|
||||
print(f"Directory: {prefix_dict['Prefix']}")
|
||||
|
||||
# Pagination info
|
||||
if response.get("IsTruncated"):
|
||||
next_token = response.get("NextContinuationToken")
|
||||
print(f"More results available, token: {next_token}")
|
||||
|
||||
|
||||
# This is 100% compatible with boto3 code!
|
||||
def works_with_boto3_or_deltaglider(s3_client) -> None:
|
||||
"""This function works with EITHER boto3 or DeltaGlider client."""
|
||||
# Because the response structure is identical!
|
||||
response = s3_client.list_objects(Bucket="my-bucket")
|
||||
|
||||
for obj in response["Contents"]:
|
||||
print(obj["Key"])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Example usage
|
||||
print("✅ Full type safety without boto3 imports!")
|
||||
print("✅ 100% compatible with boto3")
|
||||
print("✅ Drop-in replacement")
|
||||
print("✅ Plain dict responses (not custom dataclasses)")
|
||||
print("✅ DeltaGlider metadata in standard Metadata field")
|
||||
@@ -49,9 +49,11 @@ classifiers = [
|
||||
]
|
||||
|
||||
dependencies = [
|
||||
"boto3>=1.35.0",
|
||||
"click>=8.1.0",
|
||||
"python-dateutil>=2.9.0",
|
||||
"boto3>=1.35.0,<2.0.0",
|
||||
"click>=8.1.0,<9.0.0",
|
||||
"cryptography>=42.0.0,<45.0.0",
|
||||
"python-dateutil>=2.9.0,<3.0.0",
|
||||
"requests>=2.32.0,<3.0.0",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
@@ -108,6 +110,7 @@ dev-dependencies = [
|
||||
"mypy>=1.13.0",
|
||||
"boto3-stubs[s3]>=1.35.0",
|
||||
"types-python-dateutil>=2.9.0",
|
||||
"types-requests>=2.32.0",
|
||||
"setuptools-scm>=8.0.0",
|
||||
]
|
||||
|
||||
|
||||
scripts/check_metadata.py (new file)
@@ -0,0 +1,101 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Check which delta files are missing metadata."""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add src to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from deltaglider import create_client
|
||||
|
||||
|
||||
def check_bucket_metadata(bucket: str) -> None:
|
||||
"""Check all delta files in a bucket for missing metadata.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
"""
|
||||
client = create_client()
|
||||
|
||||
print(f"Checking delta files in bucket: {bucket}\n")
|
||||
print("=" * 80)
|
||||
|
||||
# List all objects
|
||||
response = client.service.storage.list_objects(bucket=bucket, max_keys=10000)
|
||||
|
||||
missing_metadata = []
|
||||
has_metadata = []
|
||||
total_delta_files = 0
|
||||
|
||||
for obj in response["objects"]:
|
||||
key = obj["key"]
|
||||
|
||||
# Only check .delta files
|
||||
if not key.endswith(".delta"):
|
||||
continue
|
||||
|
||||
total_delta_files += 1
|
||||
|
||||
# Get metadata
|
||||
obj_head = client.service.storage.head(f"{bucket}/{key}")
|
||||
|
||||
if not obj_head:
|
||||
print(f"❌ {key}: Object not found")
|
||||
continue
|
||||
|
||||
metadata = obj_head.metadata
|
||||
|
||||
# Check for required metadata fields
|
||||
required_fields = ["file_size", "file_sha256", "ref_key", "ref_sha256", "delta_size"]
|
||||
missing_fields = [f for f in required_fields if f not in metadata]
|
||||
|
||||
if missing_fields:
|
||||
missing_metadata.append({
|
||||
"key": key,
|
||||
"missing_fields": missing_fields,
|
||||
"has_metadata": bool(metadata),
|
||||
"available_keys": list(metadata.keys()) if metadata else [],
|
||||
})
|
||||
status = "⚠️ MISSING"
|
||||
detail = f"missing: {', '.join(missing_fields)}"
|
||||
else:
|
||||
has_metadata.append(key)
|
||||
status = "✅ OK"
|
||||
detail = f"file_size={metadata.get('file_size')}"
|
||||
|
||||
print(f"{status} {key}")
|
||||
print(f" {detail}")
|
||||
if metadata:
|
||||
print(f" Available keys: {', '.join(metadata.keys())}")
|
||||
print()
|
||||
|
||||
# Summary
|
||||
print("=" * 80)
|
||||
print(f"\nSummary:")
|
||||
print(f" Total delta files: {total_delta_files}")
|
||||
print(f" With complete metadata: {len(has_metadata)} ({len(has_metadata)/total_delta_files*100:.1f}%)")
|
||||
print(f" Missing metadata: {len(missing_metadata)} ({len(missing_metadata)/total_delta_files*100:.1f}%)")
|
||||
|
||||
if missing_metadata:
|
||||
print(f"\n❌ Files with missing metadata:")
|
||||
for item in missing_metadata:
|
||||
print(f" - {item['key']}")
|
||||
print(f" Missing: {', '.join(item['missing_fields'])}")
|
||||
if item['available_keys']:
|
||||
print(f" Has: {', '.join(item['available_keys'])}")
|
||||
|
||||
print(f"\n💡 Recommendation:")
|
||||
print(f" These files should be re-uploaded to get proper metadata and accurate stats.")
|
||||
print(f" You can re-upload with: deltaglider cp <local-file> s3://{bucket}/<path>")
|
||||
else:
|
||||
print(f"\n✅ All delta files have complete metadata!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) != 2:
|
||||
print("Usage: python check_metadata.py <bucket-name>")
|
||||
sys.exit(1)
|
||||
|
||||
bucket_name = sys.argv[1]
|
||||
check_bucket_metadata(bucket_name)
|
||||
@@ -7,23 +7,36 @@ except ImportError:
|
||||
__version__ = "0.0.0+unknown"
|
||||
|
||||
# Import client API
|
||||
from .client import (
|
||||
from .client import DeltaGliderClient, create_client
|
||||
from .client_models import (
|
||||
BucketStats,
|
||||
CompressionEstimate,
|
||||
DeltaGliderClient,
|
||||
ListObjectsResponse,
|
||||
ObjectInfo,
|
||||
UploadSummary,
|
||||
create_client,
|
||||
)
|
||||
from .core import DeltaService, DeltaSpace, ObjectKey
|
||||
|
||||
# Import boto3-compatible type aliases (no boto3 import required!)
|
||||
from .types import (
|
||||
CopyObjectResponse,
|
||||
CreateBucketResponse,
|
||||
DeleteObjectResponse,
|
||||
DeleteObjectsResponse,
|
||||
GetObjectResponse,
|
||||
HeadObjectResponse,
|
||||
ListBucketsResponse,
|
||||
ListObjectsV2Response,
|
||||
PutObjectResponse,
|
||||
S3Object,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"__version__",
|
||||
# Client
|
||||
"DeltaGliderClient",
|
||||
"create_client",
|
||||
# Data classes
|
||||
# Data classes (legacy - will be deprecated in favor of TypedDict)
|
||||
"UploadSummary",
|
||||
"CompressionEstimate",
|
||||
"ObjectInfo",
|
||||
@@ -33,4 +46,15 @@ __all__ = [
|
||||
"DeltaService",
|
||||
"DeltaSpace",
|
||||
"ObjectKey",
|
||||
# boto3-compatible types (no boto3 import needed!)
|
||||
"ListObjectsV2Response",
|
||||
"PutObjectResponse",
|
||||
"GetObjectResponse",
|
||||
"DeleteObjectResponse",
|
||||
"DeleteObjectsResponse",
|
||||
"HeadObjectResponse",
|
||||
"ListBucketsResponse",
|
||||
"CreateBucketResponse",
|
||||
"CopyObjectResponse",
|
||||
"S3Object",
|
||||
]
|
||||
|
||||
@@ -1,19 +1,27 @@
|
||||
"""Adapters for DeltaGlider."""
|
||||
|
||||
from .cache_cas import ContentAddressedCache
|
||||
from .cache_encrypted import EncryptedCache
|
||||
from .cache_fs import FsCacheAdapter
|
||||
from .cache_memory import MemoryCache
|
||||
from .clock_utc import UtcClockAdapter
|
||||
from .diff_xdelta import XdeltaAdapter
|
||||
from .ec2_metadata import EC2MetadataAdapter
|
||||
from .hash_sha import Sha256Adapter
|
||||
from .logger_std import StdLoggerAdapter
|
||||
from .metrics_noop import NoopMetricsAdapter
|
||||
from .storage_s3 import S3StorageAdapter
|
||||
|
||||
__all__ = [
|
||||
"S3StorageAdapter",
|
||||
"XdeltaAdapter",
|
||||
"Sha256Adapter",
|
||||
"ContentAddressedCache",
|
||||
"EC2MetadataAdapter",
|
||||
"EncryptedCache",
|
||||
"FsCacheAdapter",
|
||||
"UtcClockAdapter",
|
||||
"StdLoggerAdapter",
|
||||
"MemoryCache",
|
||||
"NoopMetricsAdapter",
|
||||
"S3StorageAdapter",
|
||||
"Sha256Adapter",
|
||||
"StdLoggerAdapter",
|
||||
"UtcClockAdapter",
|
||||
"XdeltaAdapter",
|
||||
]
|
||||
|
||||
src/deltaglider/adapters/cache_cas.py (new file)
@@ -0,0 +1,270 @@
|
||||
"""Content-Addressed Storage (CAS) cache adapter.
|
||||
|
||||
This adapter stores cached references using their SHA256 hash as the filename,
|
||||
eliminating collision risks and enabling automatic deduplication.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Unix-only imports for file locking
|
||||
if sys.platform != "win32":
|
||||
import fcntl
|
||||
|
||||
from ..core.errors import CacheCorruptionError, CacheMissError
|
||||
from ..ports.cache import CachePort
|
||||
from ..ports.hash import HashPort
|
||||
|
||||
|
||||
class ContentAddressedCache(CachePort):
|
||||
"""Content-addressed storage cache using SHA256 as filename.
|
||||
|
||||
Key Features:
|
||||
- Zero collision risk (SHA256 namespace is the filename)
|
||||
- Automatic deduplication (same content = same filename)
|
||||
- No metadata tracking needed (self-describing)
|
||||
- Secure by design (tampering changes SHA, breaks lookup)
|
||||
|
||||
Storage Layout:
|
||||
- base_dir/
|
||||
- ab/
|
||||
- cd/
|
||||
- abcdef123456... (full SHA256 as filename)
|
||||
|
||||
The two-level directory structure (first 2 chars, next 2 chars) prevents
|
||||
filesystem performance degradation from too many files in one directory.
|
||||
"""
|
||||
|
||||
def __init__(self, base_dir: Path, hasher: HashPort):
|
||||
"""Initialize content-addressed cache.
|
||||
|
||||
Args:
|
||||
base_dir: Root directory for cache storage
|
||||
hasher: Hash adapter for SHA256 computation
|
||||
"""
|
||||
self.base_dir = base_dir
|
||||
self.hasher = hasher
|
||||
# Mapping of (bucket, prefix) -> sha256 for compatibility
|
||||
# This is ephemeral and only used within a single process
|
||||
self._deltaspace_to_sha: dict[tuple[str, str], str] = {}
|
||||
|
||||
def _cas_path(self, sha256: str) -> Path:
|
||||
"""Get content-addressed path from SHA256 hash.
|
||||
|
||||
Uses two-level directory structure for filesystem optimization:
|
||||
- First 2 hex chars as L1 directory (256 buckets)
|
||||
- Next 2 hex chars as L2 directory (256 buckets per L1)
|
||||
- Full SHA as filename
|
||||
|
||||
Example: abcdef1234... -> ab/cd/abcdef1234...
|
||||
|
||||
Args:
|
||||
sha256: Full SHA256 hash (64 hex chars)
|
||||
|
||||
Returns:
|
||||
Path to file in content-addressed storage
|
||||
"""
|
||||
if len(sha256) < 4:
|
||||
raise ValueError(f"Invalid SHA256: {sha256}")
|
||||
|
||||
# Two-level directory structure
|
||||
l1_dir = sha256[:2] # First 2 chars
|
||||
l2_dir = sha256[2:4] # Next 2 chars
|
||||
|
||||
return self.base_dir / l1_dir / l2_dir / sha256
|
||||
|
||||
def ref_path(self, bucket: str, prefix: str) -> Path:
|
||||
"""Get path where reference should be cached.
|
||||
|
||||
For CAS, we need the SHA to compute the path. This method looks up
|
||||
the SHA from the ephemeral mapping. If not found, it returns a
|
||||
placeholder path (backward compatibility with has_ref checks).
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
|
||||
Returns:
|
||||
Path to cached reference (may not exist)
|
||||
"""
|
||||
key = (bucket, prefix)
|
||||
|
||||
# If we have the SHA mapping, use CAS path
|
||||
if key in self._deltaspace_to_sha:
|
||||
sha = self._deltaspace_to_sha[key]
|
||||
return self._cas_path(sha)
|
||||
|
||||
# Fallback: return a non-existent placeholder
|
||||
# This enables has_ref to return False for unmapped deltaspaces
|
||||
return self.base_dir / "_unmapped" / bucket / prefix / "reference.bin"
|
||||
|
||||
def has_ref(self, bucket: str, prefix: str, sha: str) -> bool:
|
||||
"""Check if reference exists with given SHA.
|
||||
|
||||
In CAS, existence check is simple: if file exists at SHA path,
|
||||
it MUST have that SHA (content-addressed guarantee).
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
sha: Expected SHA256 hash
|
||||
|
||||
Returns:
|
||||
True if reference exists with this SHA
|
||||
"""
|
||||
path = self._cas_path(sha)
|
||||
return path.exists()
|
||||
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
"""Get cached reference with atomic SHA validation.
|
||||
|
||||
In CAS, the SHA IS the filename, so if the file exists, it's already
|
||||
validated by definition. We still perform an integrity check to detect
|
||||
filesystem corruption.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
expected_sha: Expected SHA256 hash
|
||||
|
||||
Returns:
|
||||
Path to validated cached file
|
||||
|
||||
Raises:
|
||||
CacheMissError: File not found in cache
|
||||
CacheCorruptionError: SHA mismatch (filesystem corruption)
|
||||
"""
|
||||
path = self._cas_path(expected_sha)
|
||||
|
||||
if not path.exists():
|
||||
raise CacheMissError(f"Cache miss for SHA {expected_sha[:8]}...")
|
||||
|
||||
# Lock file and validate content atomically
|
||||
try:
|
||||
with open(path, "rb") as f:
|
||||
# Acquire shared lock (Unix only)
|
||||
if sys.platform != "win32":
|
||||
fcntl.flock(f.fileno(), fcntl.LOCK_SH)
|
||||
|
||||
# Read and hash content
|
||||
content = f.read()
|
||||
actual_sha = hashlib.sha256(content).hexdigest()
|
||||
|
||||
# Release lock automatically when exiting context
|
||||
|
||||
# Validate SHA (should never fail in CAS unless filesystem corruption)
|
||||
if actual_sha != expected_sha:
|
||||
# Filesystem corruption detected
|
||||
try:
|
||||
path.unlink()
|
||||
except OSError:
|
||||
pass # Best effort cleanup
|
||||
|
||||
raise CacheCorruptionError(
|
||||
f"Filesystem corruption detected: file {path.name} has wrong content. "
|
||||
f"Expected SHA {expected_sha}, got {actual_sha}"
|
||||
)
|
||||
|
||||
# Update mapping for ref_path compatibility
|
||||
self._deltaspace_to_sha[(bucket, prefix)] = expected_sha
|
||||
|
||||
return path
|
||||
|
||||
except OSError as e:
|
||||
raise CacheMissError(f"Cache read error for SHA {expected_sha[:8]}...: {e}") from e
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path) -> Path:
|
||||
"""Cache reference file using content-addressed storage.
|
||||
|
||||
The file is stored at a path determined by its SHA256 hash.
|
||||
If a file with the same content already exists, it's reused
|
||||
(automatic deduplication).
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
src: Source file to cache
|
||||
|
||||
Returns:
|
||||
Path to cached file (content-addressed)
|
||||
"""
|
||||
# Compute SHA of source file
|
||||
sha = self.hasher.sha256(src)
|
||||
path = self._cas_path(sha)
|
||||
|
||||
# If file already exists, we're done (deduplication)
|
||||
if path.exists():
|
||||
# Update mapping
|
||||
self._deltaspace_to_sha[(bucket, prefix)] = sha
|
||||
return path
|
||||
|
||||
# Create directory structure with secure permissions
|
||||
path.parent.mkdir(parents=True, mode=0o700, exist_ok=True)
|
||||
|
||||
# Atomic write using temp file + rename
|
||||
temp_path = path.parent / f".tmp.{sha}"
|
||||
try:
|
||||
shutil.copy2(src, temp_path)
|
||||
# Atomic rename (POSIX guarantee)
|
||||
temp_path.rename(path)
|
||||
except Exception:
|
||||
# Cleanup on failure
|
||||
if temp_path.exists():
|
||||
temp_path.unlink()
|
||||
raise
|
||||
|
||||
# Update mapping
|
||||
self._deltaspace_to_sha[(bucket, prefix)] = sha
|
||||
|
||||
return path
|
||||
|
||||
def evict(self, bucket: str, prefix: str) -> None:
|
||||
"""Remove cached reference for given deltaspace.
|
||||
|
||||
In CAS, eviction is more complex because:
|
||||
1. Multiple deltaspaces may reference the same SHA (deduplication)
|
||||
2. We can't delete the file unless we know no other deltaspace uses it
|
||||
|
||||
For safety, we only remove the mapping, not the actual file.
|
||||
Orphaned files will be cleaned up by cache expiry (future feature).
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
"""
|
||||
key = (bucket, prefix)
|
||||
|
||||
# Remove mapping (safe operation)
|
||||
if key in self._deltaspace_to_sha:
|
||||
del self._deltaspace_to_sha[key]
|
||||
|
||||
# NOTE: We don't delete the actual CAS file because:
|
||||
# - Other deltaspaces may reference the same SHA
|
||||
|
||||
def clear(self) -> None:
|
||||
"""Clear all cached references.
|
||||
|
||||
Removes all cached files and mappings. This is a destructive operation
|
||||
that forcibly removes the entire cache directory.
|
||||
|
||||
Use cases:
|
||||
- Long-running applications that need to free disk space
|
||||
- Manual cache invalidation
|
||||
- Test cleanup
|
||||
- Ensuring fresh data fetch after configuration changes
|
||||
"""
|
||||
import shutil
|
||||
|
||||
# Clear in-memory mapping
|
||||
self._deltaspace_to_sha.clear()
|
||||
|
||||
# Remove all cache files (destructive!)
|
||||
if self.base_dir.exists():
|
||||
shutil.rmtree(self.base_dir, ignore_errors=True)
|
||||
|
||||
# Recreate base directory with secure permissions
|
||||
self.base_dir.mkdir(parents=True, mode=0o700, exist_ok=True)
|
||||
# - The ephemeral cache will be cleaned on process exit anyway
|
||||
# - For persistent cache (future), we'd need reference counting
|
||||
src/deltaglider/adapters/cache_encrypted.py (new file)
@@ -0,0 +1,305 @@
|
||||
"""Encrypted cache wrapper using Fernet symmetric encryption.
|
||||
|
||||
This adapter wraps any CachePort implementation and adds transparent encryption/decryption.
|
||||
It uses Fernet (symmetric encryption based on AES-128-CBC with HMAC authentication).
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from cryptography.fernet import Fernet
|
||||
|
||||
from ..core.errors import CacheCorruptionError, CacheMissError
|
||||
from ..ports.cache import CachePort
|
||||
|
||||
|
||||
class EncryptedCache(CachePort):
|
||||
"""Encrypted cache wrapper using Fernet symmetric encryption.
|
||||
|
||||
Wraps any CachePort implementation and transparently encrypts data at rest.
|
||||
Uses Fernet which provides:
|
||||
- AES-128-CBC encryption
|
||||
- HMAC authentication (prevents tampering)
|
||||
- Automatic key rotation support
|
||||
- Safe for ephemeral process-isolated caches
|
||||
|
||||
Key Management:
|
||||
- Ephemeral key generated per process (default, most secure)
|
||||
- Or use DG_CACHE_ENCRYPTION_KEY env var (base64-encoded Fernet key)
|
||||
- For production: use secrets management system (AWS KMS, HashiCorp Vault, etc.)
|
||||
|
||||
Security Properties:
|
||||
- Confidentiality: Data encrypted at rest
|
||||
- Integrity: HMAC prevents tampering
|
||||
- Authenticity: Only valid keys can decrypt
|
||||
- Ephemeral keys: per-process keys are destroyed on process exit (no long-lived secret at rest)
|
||||
"""
|
||||
|
||||
def __init__(self, backend: CachePort, encryption_key: bytes | None = None):
|
||||
"""Initialize encrypted cache wrapper.
|
||||
|
||||
Args:
|
||||
backend: Underlying cache implementation (CAS, filesystem, memory, etc.)
|
||||
encryption_key: Optional Fernet key (32 bytes base64-encoded).
|
||||
If None, generates ephemeral key for this process.
|
||||
"""
|
||||
self.backend = backend
|
||||
|
||||
# Key management: ephemeral (default) or provided
|
||||
if encryption_key is None:
|
||||
# Generate ephemeral key for this process (most secure)
|
||||
self._key = Fernet.generate_key()
|
||||
self._ephemeral = True
|
||||
else:
|
||||
# Use provided key (for persistent cache scenarios)
|
||||
self._key = encryption_key
|
||||
self._ephemeral = False
|
||||
|
||||
self._cipher = Fernet(self._key)
|
||||
|
||||
# Mapping: (bucket, prefix) -> plaintext_sha256
|
||||
# Needed because backend uses SHA for storage, but encrypted content has different SHA
|
||||
self._plaintext_sha_map: dict[tuple[str, str], str] = {}
|
||||
|
||||
@classmethod
|
||||
def from_env(cls, backend: CachePort) -> "EncryptedCache":
|
||||
"""Create encrypted cache with key from environment.
|
||||
|
||||
Looks for DG_CACHE_ENCRYPTION_KEY environment variable.
|
||||
If not found, generates ephemeral key.
|
||||
|
||||
Args:
|
||||
backend: Underlying cache implementation
|
||||
|
||||
Returns:
|
||||
EncryptedCache instance
|
||||
"""
|
||||
key_str = os.environ.get("DG_CACHE_ENCRYPTION_KEY")
|
||||
if key_str:
|
||||
# Decode base64-encoded key
|
||||
encryption_key = key_str.encode("utf-8")
|
||||
else:
|
||||
# Use ephemeral key
|
||||
encryption_key = None
|
||||
|
||||
return cls(backend, encryption_key)
|
||||
|
||||
def ref_path(self, bucket: str, prefix: str) -> Path:
|
||||
"""Get path where reference should be cached.
|
||||
|
||||
Delegates to backend. Path structure determined by backend
|
||||
(e.g., CAS uses SHA256-based paths).
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
|
||||
Returns:
|
||||
Path from backend
|
||||
"""
|
||||
return self.backend.ref_path(bucket, prefix)
|
||||
|
||||
def has_ref(self, bucket: str, prefix: str, sha: str) -> bool:
|
||||
"""Check if reference exists with given SHA.
|
||||
|
||||
Note: SHA is of the *unencrypted* content. The backend may store
|
||||
encrypted data, but we verify against original content hash.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
sha: SHA256 of unencrypted content
|
||||
|
||||
Returns:
|
||||
True if encrypted reference exists with this SHA
|
||||
"""
|
||||
# Delegate to backend
|
||||
# Backend may use SHA for content-addressed storage of encrypted data
|
||||
return self.backend.has_ref(bucket, prefix, sha)
|
||||
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
"""Get cached reference with decryption and validation.
|
||||
|
||||
Retrieves encrypted data from backend, decrypts it, validates SHA,
|
||||
and returns path to decrypted temporary file.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
expected_sha: Expected SHA256 of *decrypted* content
|
||||
|
||||
Returns:
|
||||
Path to decrypted validated file (temporary)
|
||||
|
||||
Raises:
|
||||
CacheMissError: File not in cache
|
||||
CacheCorruptionError: Decryption failed or SHA mismatch
|
||||
"""
|
||||
# Check if we have this plaintext SHA mapped
|
||||
key = (bucket, prefix)
|
||||
if key not in self._plaintext_sha_map:
|
||||
raise CacheMissError(f"Cache miss for {bucket}/{prefix}")
|
||||
|
||||
# Verify the requested SHA matches our mapping
|
||||
if self._plaintext_sha_map[key] != expected_sha:
|
||||
raise CacheCorruptionError(
|
||||
f"SHA mismatch for {bucket}/{prefix}: "
|
||||
f"expected {expected_sha}, have {self._plaintext_sha_map[key]}"
|
||||
)
|
||||
|
||||
# Get encrypted file from backend using ref_path (not validated, we validate plaintext)
|
||||
encrypted_path = self.backend.ref_path(bucket, prefix)
|
||||
if not encrypted_path.exists():
|
||||
raise CacheMissError(f"Encrypted cache file not found for {bucket}/{prefix}")
|
||||
|
||||
# Read encrypted content
|
||||
try:
|
||||
with open(encrypted_path, "rb") as f:
|
||||
encrypted_data = f.read()
|
||||
except OSError as e:
|
||||
raise CacheMissError(f"Cannot read encrypted cache: {e}") from e
|
||||
|
||||
# Decrypt
|
||||
try:
|
||||
decrypted_data = self._cipher.decrypt(encrypted_data)
|
||||
except Exception as e:
|
||||
# Fernet raises InvalidToken for tampering/wrong key
|
||||
# SECURITY: Auto-delete corrupted cache files
|
||||
try:
|
||||
encrypted_path.unlink(missing_ok=True)
|
||||
# Clean up mapping
|
||||
if key in self._plaintext_sha_map:
|
||||
del self._plaintext_sha_map[key]
|
||||
except Exception:
|
||||
pass # Best effort cleanup
|
||||
raise CacheCorruptionError(
|
||||
f"Decryption failed for {bucket}/{prefix}: {e}. "
|
||||
f"Corrupted cache deleted automatically."
|
||||
) from e
|
||||
|
||||
# Validate SHA of decrypted content
|
||||
import hashlib
|
||||
|
||||
actual_sha = hashlib.sha256(decrypted_data).hexdigest()
|
||||
if actual_sha != expected_sha:
|
||||
# SECURITY: Auto-delete corrupted cache files
|
||||
try:
|
||||
encrypted_path.unlink(missing_ok=True)
|
||||
# Clean up mapping
|
||||
if key in self._plaintext_sha_map:
|
||||
del self._plaintext_sha_map[key]
|
||||
except Exception:
|
||||
pass # Best effort cleanup
|
||||
raise CacheCorruptionError(
|
||||
f"Decrypted content SHA mismatch for {bucket}/{prefix}: "
|
||||
f"expected {expected_sha}, got {actual_sha}. "
|
||||
f"Corrupted cache deleted automatically."
|
||||
)
|
||||
|
||||
# Write decrypted content to temporary file
|
||||
# Use same path as encrypted file but with .decrypted suffix
|
||||
decrypted_path = encrypted_path.with_suffix(".decrypted")
|
||||
try:
|
||||
with open(decrypted_path, "wb") as f:
|
||||
f.write(decrypted_data)
|
||||
except OSError as e:
|
||||
raise CacheCorruptionError(f"Cannot write decrypted cache: {e}") from e
|
||||
|
||||
return decrypted_path
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path) -> Path:
|
||||
"""Encrypt and cache reference file.
|
||||
|
||||
Reads source file, encrypts it, and stores encrypted version via backend.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
src: Source file to encrypt and cache
|
||||
|
||||
Returns:
|
||||
Path to encrypted cached file (from backend)
|
||||
"""
|
||||
# Read source file
|
||||
try:
|
||||
with open(src, "rb") as f:
|
||||
plaintext_data = f.read()
|
||||
except OSError as e:
|
||||
raise CacheCorruptionError(f"Cannot read source file {src}: {e}") from e
|
||||
|
||||
# Compute plaintext SHA for mapping
|
||||
import hashlib
|
||||
|
||||
plaintext_sha = hashlib.sha256(plaintext_data).hexdigest()
|
||||
|
||||
# Encrypt
|
||||
encrypted_data = self._cipher.encrypt(plaintext_data)
|
||||
|
||||
# Write encrypted data to temporary file
|
||||
temp_encrypted = src.with_suffix(".encrypted.tmp")
|
||||
try:
|
||||
with open(temp_encrypted, "wb") as f:
|
||||
f.write(encrypted_data)
|
||||
|
||||
# Store encrypted file via backend
|
||||
result_path = self.backend.write_ref(bucket, prefix, temp_encrypted)
|
||||
|
||||
# Store mapping of plaintext SHA
|
||||
key = (bucket, prefix)
|
||||
self._plaintext_sha_map[key] = plaintext_sha
|
||||
|
||||
return result_path
|
||||
|
||||
finally:
|
||||
# Cleanup temporary file
|
||||
if temp_encrypted.exists():
|
||||
temp_encrypted.unlink()
|
||||
|
||||
def evict(self, bucket: str, prefix: str) -> None:
|
||||
"""Remove cached reference (encrypted version).
|
||||
|
||||
Delegates to backend. Also cleans up any .decrypted temporary files and mappings.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
"""
|
||||
# Remove from plaintext SHA mapping
|
||||
key = (bucket, prefix)
|
||||
if key in self._plaintext_sha_map:
|
||||
del self._plaintext_sha_map[key]
|
||||
|
||||
# Get path to potentially clean up .decrypted files
|
||||
try:
|
||||
path = self.backend.ref_path(bucket, prefix)
|
||||
decrypted_path = path.with_suffix(".decrypted")
|
||||
if decrypted_path.exists():
|
||||
decrypted_path.unlink()
|
||||
except Exception:
|
||||
# Best effort cleanup
|
||||
pass
|
||||
|
||||
# Evict from backend
|
||||
self.backend.evict(bucket, prefix)
|
||||
|
||||
def clear(self) -> None:
|
||||
"""Clear all cached references and encryption mappings.
|
||||
|
||||
Removes all cached data and clears encryption key mappings.
|
||||
This is the proper way to forcibly clean up cache in long-running
|
||||
applications.
|
||||
|
||||
Use cases:
|
||||
- Long-running applications needing to free resources
|
||||
- Manual cache invalidation after key rotation
|
||||
- Test cleanup
|
||||
- Memory pressure situations
|
||||
|
||||
Note: Clearing does not rotate the encryption key; subsequent writes keep
using the same per-process ephemeral key (or the persistent key from
DG_CACHE_ENCRYPTION_KEY, if set).
|
||||
"""
|
||||
# Clear encryption mapping
|
||||
self._plaintext_sha_map.clear()
|
||||
|
||||
# Delegate to backend to clear actual files/memory
|
||||
self.backend.clear()
|
||||
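A short sketch of the key handling described above, assuming the same Fernet primitives the wrapper already imports; MemoryCache is used as the backend purely for illustration.

    import os
    from cryptography.fernet import Fernet
    from deltaglider.adapters import EncryptedCache, MemoryCache, Sha256Adapter

    # One-off: generate a persistent key and expose it to the process.
    key = Fernet.generate_key()  # 32 random bytes, urlsafe-base64 encoded
    os.environ["DG_CACHE_ENCRYPTION_KEY"] = key.decode("ascii")

    # from_env picks the key up; without the variable it falls back to an
    # ephemeral per-process key that dies with the process.
    backend = MemoryCache(Sha256Adapter(), max_size_mb=64)
    cache = EncryptedCache.from_env(backend)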
@@ -1,8 +1,15 @@
|
||||
"""Filesystem cache adapter."""
|
||||
|
||||
import hashlib
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Unix-only imports for file locking
|
||||
if sys.platform != "win32":
|
||||
import fcntl
|
||||
|
||||
from ..core.errors import CacheCorruptionError, CacheMissError
|
||||
from ..ports.cache import CachePort
|
||||
from ..ports.hash import HashPort
|
||||
|
||||
@@ -29,6 +36,60 @@ class FsCacheAdapter(CachePort):
|
||||
actual_sha = self.hasher.sha256(path)
|
||||
return actual_sha == sha
|
||||
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
"""Get cached reference with atomic SHA validation.
|
||||
|
||||
This method prevents TOCTOU attacks by validating the SHA at use-time,
|
||||
not just at check-time.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Prefix/deltaspace within bucket
|
||||
expected_sha: Expected SHA256 hash
|
||||
|
||||
Returns:
|
||||
Path to validated cached file
|
||||
|
||||
Raises:
|
||||
CacheMissError: File not found in cache
|
||||
CacheCorruptionError: SHA mismatch detected
|
||||
"""
|
||||
path = self.ref_path(bucket, prefix)
|
||||
|
||||
if not path.exists():
|
||||
raise CacheMissError(f"Cache miss for {bucket}/{prefix}")
|
||||
|
||||
# Lock file and validate content atomically
|
||||
try:
|
||||
with open(path, "rb") as f:
|
||||
# Acquire shared lock (Unix only)
|
||||
if sys.platform != "win32":
|
||||
fcntl.flock(f.fileno(), fcntl.LOCK_SH)
|
||||
|
||||
# Read and hash content
|
||||
content = f.read()
|
||||
actual_sha = hashlib.sha256(content).hexdigest()
|
||||
|
||||
# Release lock automatically when exiting context
|
||||
|
||||
# Validate SHA
|
||||
if actual_sha != expected_sha:
|
||||
# File corrupted or tampered - remove it
|
||||
try:
|
||||
path.unlink()
|
||||
except OSError:
|
||||
pass # Best effort cleanup
|
||||
|
||||
raise CacheCorruptionError(
|
||||
f"Cache corruption detected for {bucket}/{prefix}: "
|
||||
f"expected {expected_sha}, got {actual_sha}"
|
||||
)
|
||||
|
||||
return path
|
||||
|
||||
except OSError as e:
|
||||
raise CacheMissError(f"Cache read error for {bucket}/{prefix}: {e}") from e
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path) -> Path:
|
||||
"""Cache reference file."""
|
||||
path = self.ref_path(bucket, prefix)
|
||||
|
||||
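A hedged sketch of the use-time validation above: callers pass the SHA they expect, and a mismatch surfaces as CacheCorruptionError instead of silently serving a tampered file. The deltaglider.core.errors import path is assumed from the relative imports shown.

    import hashlib
    from pathlib import Path
    from deltaglider.adapters import FsCacheAdapter, Sha256Adapter
    from deltaglider.core.errors import CacheCorruptionError, CacheMissError

    cache = FsCacheAdapter(Path("/tmp/dg-fs-cache"), Sha256Adapter())

    src = Path("reference.bin")
    cache.write_ref("releases", "app/v1", src)
    expected = hashlib.sha256(src.read_bytes()).hexdigest()

    try:
        ref = cache.get_validated_ref("releases", "app/v1", expected)
    except CacheMissError:
        ...  # re-fetch the reference from S3
    except CacheCorruptionError:
        ...  # cached copy was altered; it has already been unlinked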
src/deltaglider/adapters/cache_memory.py (new file, 279 lines)
@@ -0,0 +1,279 @@
|
||||
"""In-memory cache implementation with optional size limits.
|
||||
|
||||
This adapter stores cached references entirely in memory, avoiding filesystem I/O.
|
||||
Useful for:
|
||||
- High-performance scenarios where memory is abundant
|
||||
- Containerized environments with limited filesystem access
|
||||
- Testing and development
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Unix-only imports for compatibility
|
||||
if sys.platform != "win32":
|
||||
import fcntl # noqa: F401
|
||||
|
||||
from ..core.errors import CacheCorruptionError, CacheMissError
|
||||
from ..ports.cache import CachePort
|
||||
from ..ports.hash import HashPort
|
||||
|
||||
|
||||
class MemoryCache(CachePort):
|
||||
"""In-memory cache implementation with LRU eviction.
|
||||
|
||||
Stores cached references in memory as bytes. Useful for high-performance
|
||||
scenarios or when filesystem access is limited.
|
||||
|
||||
Features:
|
||||
- Zero filesystem I/O (everything in RAM)
|
||||
- Optional size limits with LRU eviction
|
||||
- Thread-safe operations
|
||||
- Temporary file creation for compatibility with file-based APIs
|
||||
|
||||
Limitations:
|
||||
- Data lost on process exit (ephemeral only)
|
||||
- Memory usage proportional to cache size
|
||||
- Not suitable for very large reference files
|
||||
|
||||
Storage Layout:
|
||||
- Key: (bucket, prefix) tuple
|
||||
- Value: (content_bytes, sha256) tuple
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hasher: HashPort,
|
||||
max_size_mb: int = 100,
|
||||
temp_dir: Path | None = None,
|
||||
):
|
||||
"""Initialize in-memory cache.
|
||||
|
||||
Args:
|
||||
hasher: Hash adapter for SHA256 computation
|
||||
max_size_mb: Maximum cache size in megabytes (default 100MB)
|
||||
temp_dir: Directory for temporary files (default: system temp)
|
||||
"""
|
||||
self.hasher = hasher
|
||||
self.max_size_bytes = max_size_mb * 1024 * 1024
|
||||
|
||||
# Storage: (bucket, prefix) -> (content_bytes, sha256)
|
||||
self._cache: dict[tuple[str, str], tuple[bytes, str]] = {}
|
||||
|
||||
# Size tracking
|
||||
self._current_size = 0
|
||||
|
||||
# Access order for LRU eviction: (bucket, prefix) list
|
||||
self._access_order: list[tuple[str, str]] = []
|
||||
|
||||
# Temp directory for file-based API compatibility
|
||||
if temp_dir is None:
|
||||
import tempfile
|
||||
|
||||
self.temp_dir = Path(tempfile.gettempdir()) / "deltaglider-mem-cache"
|
||||
else:
|
||||
self.temp_dir = temp_dir
|
||||
|
||||
self.temp_dir.mkdir(parents=True, exist_ok=True, mode=0o700)
|
||||
|
||||
def _update_access(self, key: tuple[str, str]) -> None:
|
||||
"""Update LRU access order.
|
||||
|
||||
Args:
|
||||
key: Cache key (bucket, prefix)
|
||||
"""
|
||||
# Remove old position if exists
|
||||
if key in self._access_order:
|
||||
self._access_order.remove(key)
|
||||
|
||||
# Add to end (most recently used)
|
||||
self._access_order.append(key)
|
||||
|
||||
def _evict_lru(self, needed_bytes: int) -> None:
|
||||
"""Evict least recently used entries to free space.
|
||||
|
||||
Args:
|
||||
needed_bytes: Bytes needed for new entry
|
||||
"""
|
||||
while self._current_size + needed_bytes > self.max_size_bytes and self._access_order:
|
||||
# Evict least recently used
|
||||
lru_key = self._access_order[0]
|
||||
bucket, prefix = lru_key
|
||||
|
||||
# Remove from cache
|
||||
if lru_key in self._cache:
|
||||
content, _ = self._cache[lru_key]
|
||||
self._current_size -= len(content)
|
||||
del self._cache[lru_key]
|
||||
|
||||
# Remove from access order
|
||||
self._access_order.remove(lru_key)
|
||||
|
||||
def ref_path(self, bucket: str, prefix: str) -> Path:
|
||||
"""Get placeholder path for in-memory reference.
|
||||
|
||||
Returns a virtual path that doesn't actually exist on filesystem.
|
||||
Used for API compatibility.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
|
||||
Returns:
|
||||
Virtual path (may not exist on filesystem)
|
||||
"""
|
||||
# Return virtual path for compatibility
|
||||
# Actual data is in memory, but we need Path for API
|
||||
safe_bucket = bucket.replace("/", "_")
|
||||
safe_prefix = prefix.replace("/", "_")
|
||||
return self.temp_dir / safe_bucket / safe_prefix / "reference.bin"
|
||||
|
||||
def has_ref(self, bucket: str, prefix: str, sha: str) -> bool:
|
||||
"""Check if reference exists in memory with given SHA.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
sha: Expected SHA256 hash
|
||||
|
||||
Returns:
|
||||
True if reference exists with this SHA
|
||||
"""
|
||||
key = (bucket, prefix)
|
||||
if key not in self._cache:
|
||||
return False
|
||||
|
||||
_, cached_sha = self._cache[key]
|
||||
return cached_sha == sha
|
||||
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
"""Get cached reference from memory with validation.
|
||||
|
||||
Retrieves content from memory, validates SHA, and writes to
|
||||
temporary file for compatibility with file-based APIs.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
expected_sha: Expected SHA256 hash
|
||||
|
||||
Returns:
|
||||
Path to temporary file containing content
|
||||
|
||||
Raises:
|
||||
CacheMissError: Content not in cache
|
||||
CacheCorruptionError: SHA mismatch
|
||||
"""
|
||||
key = (bucket, prefix)
|
||||
|
||||
# Check if in cache
|
||||
if key not in self._cache:
|
||||
raise CacheMissError(f"Cache miss for {bucket}/{prefix}")
|
||||
|
||||
# Get content and validate
|
||||
content, cached_sha = self._cache[key]
|
||||
|
||||
# Update LRU
|
||||
self._update_access(key)
|
||||
|
||||
# Validate SHA
|
||||
if cached_sha != expected_sha:
|
||||
# SHA mismatch - possible corruption
|
||||
raise CacheCorruptionError(
|
||||
f"Memory cache SHA mismatch for {bucket}/{prefix}: "
|
||||
f"expected {expected_sha}, got {cached_sha}"
|
||||
)
|
||||
|
||||
# Write to temporary file for API compatibility
|
||||
temp_path = self.ref_path(bucket, prefix)
|
||||
temp_path.parent.mkdir(parents=True, exist_ok=True, mode=0o700)
|
||||
|
||||
try:
|
||||
with open(temp_path, "wb") as f:
|
||||
f.write(content)
|
||||
except OSError as e:
|
||||
raise CacheMissError(f"Cannot write temp file: {e}") from e
|
||||
|
||||
return temp_path
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path) -> Path:
|
||||
"""Store reference file in memory.
|
||||
|
||||
Reads file content and stores in memory with SHA hash.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
src: Source file to cache
|
||||
|
||||
Returns:
|
||||
Virtual path (content is in memory)
|
||||
"""
|
||||
# Read source file
|
||||
try:
|
||||
with open(src, "rb") as f:
|
||||
content = f.read()
|
||||
except OSError as e:
|
||||
raise CacheCorruptionError(f"Cannot read source file {src}: {e}") from e
|
||||
|
||||
# Compute SHA
|
||||
sha = hashlib.sha256(content).hexdigest()
|
||||
|
||||
# Check if we need to evict
|
||||
content_size = len(content)
|
||||
if content_size > self.max_size_bytes:
|
||||
raise CacheCorruptionError(
|
||||
f"File too large for memory cache: {content_size} bytes "
|
||||
f"(limit: {self.max_size_bytes} bytes)"
|
||||
)
|
||||
|
||||
# Evict LRU entries if needed
|
||||
self._evict_lru(content_size)
|
||||
|
||||
# Store in memory
|
||||
key = (bucket, prefix)
|
||||
self._cache[key] = (content, sha)
|
||||
self._current_size += content_size
|
||||
|
||||
# Update LRU
|
||||
self._update_access(key)
|
||||
|
||||
# Return virtual path
|
||||
return self.ref_path(bucket, prefix)
|
||||
|
||||
def evict(self, bucket: str, prefix: str) -> None:
|
||||
"""Remove cached reference from memory.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
"""
|
||||
key = (bucket, prefix)
|
||||
|
||||
# Remove from cache
|
||||
if key in self._cache:
|
||||
content, _ = self._cache[key]
|
||||
self._current_size -= len(content)
|
||||
del self._cache[key]
|
||||
|
||||
# Remove from LRU tracking
|
||||
if key in self._access_order:
|
||||
self._access_order.remove(key)
|
||||
|
||||
# Clean up temp file if exists
|
||||
temp_path = self.ref_path(bucket, prefix)
|
||||
if temp_path.exists():
|
||||
try:
|
||||
temp_path.unlink()
|
||||
except OSError:
|
||||
pass # Best effort
|
||||
|
||||
def clear(self) -> None:
|
||||
"""Clear all cached content from memory.
|
||||
|
||||
Useful for testing and cleanup.
|
||||
"""
|
||||
self._cache.clear()
|
||||
self._access_order.clear()
|
||||
self._current_size = 0
|
||||
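A small sketch of the LRU behaviour described above; the 1 MB budget is deliberately tiny so the second write evicts the first, and the blob helper only produces throwaway test files.

    import hashlib
    import os
    import tempfile
    from pathlib import Path
    from deltaglider.adapters import MemoryCache, Sha256Adapter

    cache = MemoryCache(Sha256Adapter(), max_size_mb=1)

    def blob(name: str, size: int) -> Path:
        p = Path(tempfile.gettempdir()) / name
        p.write_bytes(os.urandom(size))
        return p

    a = blob("a.bin", 700 * 1024)
    b = blob("b.bin", 700 * 1024)
    sha_a = hashlib.sha256(a.read_bytes()).hexdigest()

    cache.write_ref("bucket", "space-a", a)
    cache.write_ref("bucket", "space-b", b)  # over the 1 MB budget: space-a is evicted

    assert not cache.has_ref("bucket", "space-a", sha_a)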
src/deltaglider/adapters/ec2_metadata.py (new file, 126 lines)
@@ -0,0 +1,126 @@
|
||||
"""EC2 Instance Metadata Service (IMDS) adapter.
|
||||
|
||||
Provides access to EC2 instance metadata using IMDSv2 with token-based authentication.
|
||||
Falls back gracefully when not running on EC2.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
class EC2MetadataAdapter:
|
||||
"""Adapter for EC2 Instance Metadata Service (IMDSv2)."""
|
||||
|
||||
IMDS_BASE_URL = "http://169.254.169.254/latest"
|
||||
TOKEN_URL = f"{IMDS_BASE_URL}/api/token"
|
||||
TOKEN_TTL_SECONDS = 21600 # 6 hours
|
||||
TOKEN_HEADER = "X-aws-ec2-metadata-token"
|
||||
TIMEOUT_SECONDS = 1 # Fast timeout for non-EC2 environments
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize EC2 metadata adapter."""
|
||||
self._token: str | None = None
|
||||
self._is_ec2: bool | None = None
|
||||
self._region: str | None = None
|
||||
|
||||
def is_running_on_ec2(self) -> bool:
|
||||
"""Check if running on an EC2 instance.
|
||||
|
||||
Returns:
|
||||
True if running on EC2, False otherwise
|
||||
|
||||
Note:
|
||||
Result is cached after first check for performance.
|
||||
"""
|
||||
if self._is_ec2 is not None:
|
||||
return self._is_ec2
|
||||
|
||||
# Skip check if explicitly disabled
|
||||
if os.environ.get("DG_DISABLE_EC2_DETECTION", "").lower() in ("true", "1", "yes"):
|
||||
self._is_ec2 = False
|
||||
return False
|
||||
|
||||
try:
|
||||
# Try to get IMDSv2 token
|
||||
self._token = self._get_token()
|
||||
self._is_ec2 = self._token is not None
|
||||
except Exception:
|
||||
self._is_ec2 = False
|
||||
|
||||
return self._is_ec2
|
||||
|
||||
def get_region(self) -> str | None:
|
||||
"""Get the EC2 instance's AWS region.
|
||||
|
||||
Returns:
|
||||
AWS region code (e.g., "us-east-1") or None if not on EC2
|
||||
|
||||
Note:
|
||||
Result is cached after first successful fetch.
|
||||
"""
|
||||
if not self.is_running_on_ec2():
|
||||
return None
|
||||
|
||||
if self._region is not None:
|
||||
return self._region
|
||||
|
||||
try:
|
||||
if self._token:
|
||||
response = requests.get(
|
||||
f"{self.IMDS_BASE_URL}/meta-data/placement/region",
|
||||
headers={self.TOKEN_HEADER: self._token},
|
||||
timeout=self.TIMEOUT_SECONDS,
|
||||
)
|
||||
if response.status_code == 200:
|
||||
self._region = response.text.strip()
|
||||
return self._region
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def get_availability_zone(self) -> str | None:
|
||||
"""Get the EC2 instance's availability zone.
|
||||
|
||||
Returns:
|
||||
Availability zone (e.g., "us-east-1a") or None if not on EC2
|
||||
"""
|
||||
if not self.is_running_on_ec2():
|
||||
return None
|
||||
|
||||
try:
|
||||
if self._token:
|
||||
response = requests.get(
|
||||
f"{self.IMDS_BASE_URL}/meta-data/placement/availability-zone",
|
||||
headers={self.TOKEN_HEADER: self._token},
|
||||
timeout=self.TIMEOUT_SECONDS,
|
||||
)
|
||||
if response.status_code == 200:
|
||||
return str(response.text.strip())
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def _get_token(self) -> str | None:
|
||||
"""Get IMDSv2 token for authenticated metadata requests.
|
||||
|
||||
Returns:
|
||||
IMDSv2 token or None if unable to retrieve
|
||||
|
||||
Note:
|
||||
Uses IMDSv2 for security. IMDSv1 is not supported.
|
||||
"""
|
||||
try:
|
||||
response = requests.put(
|
||||
self.TOKEN_URL,
|
||||
headers={"X-aws-ec2-metadata-token-ttl-seconds": str(self.TOKEN_TTL_SECONDS)},
|
||||
timeout=self.TIMEOUT_SECONDS,
|
||||
)
|
||||
if response.status_code == 200:
|
||||
return response.text.strip()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
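A brief sketch of querying the adapter above; off EC2 (or with DG_DISABLE_EC2_DETECTION=true) every call degrades to False/None rather than raising.

    from deltaglider.adapters.ec2_metadata import EC2MetadataAdapter

    imds = EC2MetadataAdapter()
    if imds.is_running_on_ec2():
        # The IMDSv2 token was fetched and cached by the check above.
        print("region:", imds.get_region())
        print("az:", imds.get_availability_zone())
    else:
        print("not on EC2 (or detection disabled); using client configuration")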
@@ -1,18 +1,22 @@
|
||||
"""S3 storage adapter."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from collections.abc import Iterator
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, BinaryIO, Optional
|
||||
|
||||
import boto3
|
||||
from botocore.config import Config
|
||||
from botocore.exceptions import ClientError
|
||||
|
||||
from ..ports.storage import ObjectHead, PutResult, StoragePort
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from mypy_boto3_s3.client import S3Client
|
||||
|
||||
from ..ports.storage import ObjectHead, PutResult, StoragePort
|
||||
|
||||
|
||||
class S3StorageAdapter(StoragePort):
|
||||
"""S3 implementation of StoragePort."""
|
||||
@@ -39,6 +43,13 @@ class S3StorageAdapter(StoragePort):
|
||||
client_params: dict[str, Any] = {
|
||||
"service_name": "s3",
|
||||
"endpoint_url": endpoint_url or os.environ.get("AWS_ENDPOINT_URL"),
|
||||
# Disable automatic request checksums (CRC32/CRC64) added in
|
||||
# boto3 1.36+. S3-compatible stores like Hetzner Object Storage
|
||||
# reject the checksum headers with BadRequest.
|
||||
"config": Config(
|
||||
request_checksum_calculation="when_required",
|
||||
response_checksum_validation="when_required",
|
||||
),
|
||||
}
|
||||
|
||||
# Merge in any additional boto3 kwargs (credentials, region, etc.)
|
||||
@@ -55,12 +66,21 @@ class S3StorageAdapter(StoragePort):
|
||||
|
||||
try:
|
||||
response = self.client.head_object(Bucket=bucket, Key=object_key)
|
||||
extracted_metadata = self._extract_metadata(response.get("Metadata", {}))
|
||||
|
||||
# Debug: Log metadata received (to verify it's stored correctly)
|
||||
if logger.isEnabledFor(logging.DEBUG):
|
||||
logger.debug(
|
||||
f"HEAD {object_key}: Received metadata with {len(extracted_metadata)} keys: "
|
||||
f"{list(extracted_metadata.keys())}"
|
||||
)
|
||||
|
||||
return ObjectHead(
|
||||
key=object_key,
|
||||
size=response["ContentLength"],
|
||||
etag=response["ETag"].strip('"'),
|
||||
last_modified=response["LastModified"],
|
||||
metadata=self._extract_metadata(response.get("Metadata", {})),
|
||||
metadata=extracted_metadata,
|
||||
)
|
||||
except ClientError as e:
|
||||
if e.response["Error"]["Code"] == "404":
|
||||
@@ -97,6 +117,7 @@ class S3StorageAdapter(StoragePort):
|
||||
delimiter: str = "",
|
||||
max_keys: int = 1000,
|
||||
start_after: str | None = None,
|
||||
continuation_token: str | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""List objects with S3-compatible response.
|
||||
|
||||
@@ -105,7 +126,8 @@ class S3StorageAdapter(StoragePort):
|
||||
prefix: Filter results to keys beginning with prefix
|
||||
delimiter: Delimiter for grouping keys (e.g., '/' for folders)
|
||||
max_keys: Maximum number of keys to return
|
||||
start_after: Start listing after this key
|
||||
start_after: Start listing after this key (for first page only)
|
||||
continuation_token: Token from previous response for pagination
|
||||
|
||||
Returns:
|
||||
Dict with objects, common_prefixes, and pagination info
|
||||
@@ -119,7 +141,11 @@ class S3StorageAdapter(StoragePort):
|
||||
params["Prefix"] = prefix
|
||||
if delimiter:
|
||||
params["Delimiter"] = delimiter
|
||||
if start_after:
|
||||
|
||||
# Use ContinuationToken for pagination if available, otherwise StartAfter
|
||||
if continuation_token:
|
||||
params["ContinuationToken"] = continuation_token
|
||||
elif start_after:
|
||||
params["StartAfter"] = start_after
|
||||
|
||||
try:
|
||||
@@ -191,20 +217,110 @@ class S3StorageAdapter(StoragePort):
|
||||
# AWS requires lowercase metadata keys
|
||||
clean_metadata = {k.lower(): v for k, v in metadata.items()}
|
||||
|
||||
try:
|
||||
response = self.client.put_object(
|
||||
Bucket=bucket,
|
||||
Key=object_key,
|
||||
Body=body_data,
|
||||
ContentType=content_type,
|
||||
Metadata=clean_metadata,
|
||||
# Calculate total metadata size (AWS has 2KB limit)
|
||||
total_metadata_size = sum(len(k) + len(v) for k, v in clean_metadata.items())
|
||||
|
||||
if logger.isEnabledFor(logging.DEBUG):
|
||||
logger.debug(
|
||||
f"PUT {object_key}: Sending metadata with {len(clean_metadata)} keys "
|
||||
f"({total_metadata_size} bytes): {list(clean_metadata.keys())}"
|
||||
)
|
||||
return PutResult(
|
||||
etag=response["ETag"].strip('"'),
|
||||
version_id=response.get("VersionId"),
|
||||
|
||||
# Warn if approaching the AWS S3 user-defined metadata limit (2 KB total across all keys and values)
|
||||
if total_metadata_size > 1800: # Warn at 1.8KB
|
||||
logger.warning(
|
||||
f"PUT {object_key}: Metadata size ({total_metadata_size} bytes) approaching "
|
||||
f"AWS S3 limit (2KB). Some metadata may be lost!"
|
||||
)
|
||||
except ClientError as e:
|
||||
raise RuntimeError(f"Failed to put object: {e}") from e
|
||||
|
||||
import time
|
||||
|
||||
max_retries = 3
|
||||
last_error: ClientError | None = None
|
||||
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
response = self.client.put_object(
|
||||
Bucket=bucket,
|
||||
Key=object_key,
|
||||
Body=body_data,
|
||||
ContentType=content_type,
|
||||
Metadata=clean_metadata,
|
||||
)
|
||||
|
||||
# VERIFICATION: Check if metadata was actually stored (especially for delta files)
|
||||
if object_key.endswith(".delta") and clean_metadata:
|
||||
try:
|
||||
# Verify metadata was stored by doing a HEAD immediately
|
||||
verify_response = self.client.head_object(Bucket=bucket, Key=object_key)
|
||||
stored_metadata = verify_response.get("Metadata", {})
|
||||
|
||||
if not stored_metadata:
|
||||
logger.error(
|
||||
f"PUT {object_key}: CRITICAL - Metadata was sent but NOT STORED! "
|
||||
f"Sent {len(clean_metadata)} keys, received 0 keys back."
|
||||
)
|
||||
elif len(stored_metadata) < len(clean_metadata):
|
||||
missing_keys = set(clean_metadata.keys()) - set(stored_metadata.keys())
|
||||
logger.warning(
|
||||
f"PUT {object_key}: Metadata partially stored. "
|
||||
f"Sent {len(clean_metadata)} keys, stored {len(stored_metadata)} keys. "
|
||||
f"Missing keys: {missing_keys}"
|
||||
)
|
||||
elif logger.isEnabledFor(logging.DEBUG):
|
||||
logger.debug(
|
||||
f"PUT {object_key}: Metadata verified - "
|
||||
f"all {len(clean_metadata)} keys stored"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"PUT {object_key}: Could not verify metadata: {e}")
|
||||
|
||||
return PutResult(
|
||||
etag=response["ETag"].strip('"'),
|
||||
version_id=response.get("VersionId"),
|
||||
)
|
||||
except ClientError as e:
|
||||
last_error = e
|
||||
if attempt < max_retries - 1:
|
||||
delay = 2**attempt # 1s, 2s
|
||||
# Log full error details
|
||||
error_response = e.response if hasattr(e, "response") else {}
|
||||
http_headers = error_response.get("ResponseMetadata", {}).get("HTTPHeaders", {})
|
||||
logger.warning(
|
||||
f"PUT {object_key}: Attempt {attempt + 1}/{max_retries} failed: {e}. "
|
||||
f"Retrying in {delay}s... "
|
||||
f"Details: bucket={bucket}, key={object_key}, "
|
||||
f"body_size={len(body_data)}, content_type={content_type}, "
|
||||
f"metadata_keys={list(clean_metadata.keys())}, "
|
||||
f"endpoint={self.client.meta.endpoint_url}, "
|
||||
f"http_status={error_response.get('ResponseMetadata', {}).get('HTTPStatusCode')}, "
|
||||
f"error_code={error_response.get('Error', {}).get('Code')}, "
|
||||
f"error_message={error_response.get('Error', {}).get('Message')}, "
|
||||
f"request_id={error_response.get('ResponseMetadata', {}).get('RequestId')}, "
|
||||
f"http_headers={dict(http_headers)}"
|
||||
)
|
||||
# Enable botocore wire-level logging for the retry
|
||||
logging.getLogger("botocore").setLevel(logging.DEBUG)
|
||||
time.sleep(delay)
|
||||
else:
|
||||
# Final attempt failed — log everything
|
||||
error_response = e.response if hasattr(e, "response") else {}
|
||||
http_headers = error_response.get("ResponseMetadata", {}).get("HTTPHeaders", {})
|
||||
logger.error(
|
||||
f"PUT {object_key}: All {max_retries} attempts failed. "
|
||||
f"Last error: {e}. "
|
||||
f"Details: bucket={bucket}, key={object_key}, "
|
||||
f"body_size={len(body_data)}, content_type={content_type}, "
|
||||
f"metadata={clean_metadata}, "
|
||||
f"endpoint={self.client.meta.endpoint_url}, "
|
||||
f"http_status={error_response.get('ResponseMetadata', {}).get('HTTPStatusCode')}, "
|
||||
f"error_code={error_response.get('Error', {}).get('Code')}, "
|
||||
f"error_message={error_response.get('Error', {}).get('Message')}, "
|
||||
f"request_id={error_response.get('ResponseMetadata', {}).get('RequestId')}, "
|
||||
f"http_headers={dict(http_headers)}"
|
||||
)
|
||||
|
||||
raise RuntimeError(f"Failed to put object: {last_error}") from last_error
|
||||
|
||||
def delete(self, key: str) -> None:
|
||||
"""Delete object."""
|
||||
|
||||
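A tiny sketch for surfacing the diagnostics added above; the logger name follows the module path used in the imports (deltaglider.adapters.storage_s3), which is an assumption about the package layout.

    import logging

    # Show the "PUT ... metadata verified" and retry-detail messages emitted above.
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("deltaglider.adapters.storage_s3").setLevel(logging.DEBUG)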
@@ -1,28 +1,120 @@
|
||||
"""AWS S3 CLI compatible commands."""
|
||||
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
|
||||
from ...core import DeltaService, DeltaSpace, ObjectKey
|
||||
from ...core import (
|
||||
DeltaService,
|
||||
DeltaSpace,
|
||||
ObjectKey,
|
||||
build_s3_url,
|
||||
is_s3_url,
|
||||
)
|
||||
from ...core import parse_s3_url as core_parse_s3_url
|
||||
from .sync import fetch_s3_object_heads
|
||||
|
||||
__all__ = [
|
||||
"is_s3_path",
|
||||
"parse_s3_url",
|
||||
"determine_operation",
|
||||
"upload_file",
|
||||
"download_file",
|
||||
"copy_s3_to_s3",
|
||||
"migrate_s3_to_s3",
|
||||
"handle_recursive",
|
||||
"log_aws_region",
|
||||
]
|
||||
|
||||
|
||||
def log_aws_region(service: DeltaService, region_override: bool = False) -> None:
|
||||
"""Log the AWS region being used and warn about cross-region charges.
|
||||
|
||||
This function:
|
||||
1. Detects if running on EC2
|
||||
2. Compares EC2 region with S3 client region
|
||||
3. Warns about potential cross-region data transfer charges
|
||||
4. Helps users optimize for cost and performance
|
||||
|
||||
Args:
|
||||
service: DeltaService instance with storage adapter
|
||||
region_override: True if user explicitly specified --region flag
|
||||
"""
|
||||
try:
|
||||
from ...adapters.ec2_metadata import EC2MetadataAdapter
|
||||
from ...adapters.storage_s3 import S3StorageAdapter
|
||||
|
||||
if not isinstance(service.storage, S3StorageAdapter):
|
||||
return # Not using S3 storage, skip
|
||||
|
||||
# Get S3 client region
|
||||
s3_region = service.storage.client.meta.region_name
|
||||
if not s3_region:
|
||||
s3_region = "us-east-1" # boto3 default
|
||||
|
||||
# Check if running on EC2
|
||||
ec2_metadata = EC2MetadataAdapter()
|
||||
if ec2_metadata.is_running_on_ec2():
|
||||
ec2_region = ec2_metadata.get_region()
|
||||
ec2_az = ec2_metadata.get_availability_zone()
|
||||
|
||||
# Log EC2 context
|
||||
click.echo(f"EC2 Instance: {ec2_az or ec2_region or 'unknown'}")
|
||||
click.echo(f"S3 Client Region: {s3_region}")
|
||||
|
||||
# Check for region mismatch
|
||||
if ec2_region and ec2_region != s3_region:
|
||||
if region_override:
|
||||
# User explicitly set --region, warn about costs
|
||||
click.echo("")
|
||||
click.secho(
|
||||
f"⚠️ WARNING: EC2 region={ec2_region} != S3 client region={s3_region}",
|
||||
fg="yellow",
|
||||
bold=True,
|
||||
)
|
||||
click.secho(
|
||||
f" Expect cross-region/NAT data charges. Align regions (set client region={ec2_region})",
|
||||
fg="yellow",
|
||||
)
|
||||
click.secho(
|
||||
" before proceeding. Or drop --region for automatic region resolution.",
|
||||
fg="yellow",
|
||||
)
|
||||
click.echo("")
|
||||
else:
|
||||
# Auto-detected mismatch, but user can still cancel
|
||||
click.echo("")
|
||||
click.secho(
|
||||
f"ℹ️ INFO: EC2 region ({ec2_region}) differs from configured S3 region ({s3_region})",
|
||||
fg="cyan",
|
||||
)
|
||||
click.secho(
|
||||
f" Consider using --region {ec2_region} to avoid cross-region charges.",
|
||||
fg="cyan",
|
||||
)
|
||||
click.echo("")
|
||||
elif ec2_region and ec2_region == s3_region:
|
||||
# Regions match - optimal configuration
|
||||
click.secho("✓ Regions aligned - no cross-region charges", fg="green")
|
||||
else:
|
||||
# Not on EC2, just show S3 region
|
||||
click.echo(f"S3 Client Region: {s3_region}")
|
||||
|
||||
except Exception:
|
||||
pass # Silently ignore errors getting region info
|
||||
|
||||
|
||||
def is_s3_path(path: str) -> bool:
|
||||
"""Check if path is an S3 URL."""
|
||||
return path.startswith("s3://")
|
||||
return is_s3_url(path)
|
||||
|
||||
|
||||
def parse_s3_url(url: str) -> tuple[str, str]:
|
||||
"""Parse S3 URL into bucket and key."""
|
||||
if not url.startswith("s3://"):
|
||||
raise ValueError(f"Invalid S3 URL: {url}")
|
||||
|
||||
s3_path = url[5:].rstrip("/")
|
||||
parts = s3_path.split("/", 1)
|
||||
bucket = parts[0]
|
||||
key = parts[1] if len(parts) > 1 else ""
|
||||
return bucket, key
|
||||
parsed = core_parse_s3_url(url, strip_trailing_slash=True)
|
||||
return parsed.bucket, parsed.key
|
||||
|
||||
|
||||
def determine_operation(source: str, dest: str) -> str:
|
||||
@@ -57,6 +149,8 @@ def upload_file(
|
||||
|
||||
delta_space = DeltaSpace(bucket=bucket, prefix="/".join(key.split("/")[:-1]))
|
||||
|
||||
dest_url = build_s3_url(bucket, key)
|
||||
|
||||
try:
|
||||
# Check if delta should be disabled
|
||||
if no_delta:
|
||||
@@ -66,7 +160,7 @@ def upload_file(
|
||||
|
||||
if not quiet:
|
||||
file_size = local_path.stat().st_size
|
||||
click.echo(f"upload: '{local_path}' to 's3://{bucket}/{key}' ({file_size} bytes)")
|
||||
click.echo(f"upload: '{local_path}' to '{dest_url}' ({file_size} bytes)")
|
||||
else:
|
||||
# Use delta compression
|
||||
summary = service.put(local_path, delta_space, max_ratio)
|
||||
@@ -75,12 +169,12 @@ def upload_file(
|
||||
if summary.delta_size:
|
||||
ratio = round((summary.delta_size / summary.file_size) * 100, 1)
|
||||
click.echo(
|
||||
f"upload: '{local_path}' to 's3://{bucket}/{summary.key}' "
|
||||
f"upload: '{local_path}' to '{build_s3_url(bucket, summary.key)}' "
|
||||
f"(delta: {ratio}% of original)"
|
||||
)
|
||||
else:
|
||||
click.echo(
|
||||
f"upload: '{local_path}' to 's3://{bucket}/{summary.key}' "
|
||||
f"upload: '{local_path}' to '{build_s3_url(bucket, summary.key)}' "
|
||||
f"(reference: {summary.file_size} bytes)"
|
||||
)
|
||||
|
||||
@@ -112,7 +206,7 @@ def download_file(
|
||||
actual_key = delta_key
|
||||
obj_key = ObjectKey(bucket=bucket, key=delta_key)
|
||||
if not quiet:
|
||||
click.echo(f"Auto-detected delta: s3://{bucket}/{delta_key}")
|
||||
click.echo(f"Auto-detected delta: {build_s3_url(bucket, delta_key)}")
|
||||
|
||||
# Determine output path
|
||||
if local_path is None:
|
||||
@@ -136,7 +230,7 @@ def download_file(
|
||||
if not quiet:
|
||||
file_size = local_path.stat().st_size
|
||||
click.echo(
|
||||
f"download: 's3://{bucket}/{actual_key}' to '{local_path}' ({file_size} bytes)"
|
||||
f"download: '{build_s3_url(bucket, actual_key)}' to '{local_path}' ({file_size} bytes)"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
@@ -149,31 +243,310 @@ def copy_s3_to_s3(
|
||||
source_url: str,
|
||||
dest_url: str,
|
||||
quiet: bool = False,
|
||||
max_ratio: float | None = None,
|
||||
no_delta: bool = False,
|
||||
) -> None:
|
||||
"""Copy object between S3 locations."""
|
||||
# For now, implement as download + upload
|
||||
# TODO: Optimize with server-side copy when possible
|
||||
"""Copy object between S3 locations with optional delta compression.
|
||||
|
||||
This performs a direct S3-to-S3 transfer using streaming to preserve
|
||||
the original file content and apply delta compression at the destination.
|
||||
"""
|
||||
source_bucket, source_key = parse_s3_url(source_url)
|
||||
dest_bucket, dest_key = parse_s3_url(dest_url)
|
||||
|
||||
if not quiet:
|
||||
click.echo(f"copy: 's3://{source_bucket}/{source_key}' to 's3://{dest_bucket}/{dest_key}'")
|
||||
click.echo(
|
||||
f"copy: '{build_s3_url(source_bucket, source_key)}' "
|
||||
f"to '{build_s3_url(dest_bucket, dest_key)}'"
|
||||
)
|
||||
|
||||
# Use temporary file
|
||||
import tempfile
|
||||
try:
|
||||
# Get the source object as a stream
|
||||
source_stream = service.storage.get(f"{source_bucket}/{source_key}")
|
||||
|
||||
with tempfile.NamedTemporaryFile(suffix=Path(source_key).suffix) as tmp:
|
||||
tmp_path = Path(tmp.name)
|
||||
# Determine the destination deltaspace
|
||||
dest_key_parts = dest_key.split("/")
|
||||
if len(dest_key_parts) > 1:
|
||||
dest_prefix = "/".join(dest_key_parts[:-1])
|
||||
else:
|
||||
dest_prefix = ""
|
||||
|
||||
# Download from source
|
||||
download_file(service, source_url, tmp_path, quiet=True)
|
||||
dest_deltaspace = DeltaSpace(bucket=dest_bucket, prefix=dest_prefix)
|
||||
|
||||
# Upload to destination
|
||||
upload_file(service, tmp_path, dest_url, quiet=True)
|
||||
# If delta is disabled or max_ratio specified, use direct put
|
||||
if no_delta:
|
||||
# Direct storage put without delta compression
|
||||
service.storage.put(f"{dest_bucket}/{dest_key}", source_stream, {})
|
||||
if not quiet:
|
||||
click.echo("Copy completed (no delta compression)")
|
||||
else:
|
||||
# Write to a temporary file and use override_name to preserve original filename
|
||||
import tempfile
|
||||
|
||||
# Extract original filename from source
|
||||
original_filename = Path(source_key).name
|
||||
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix=Path(source_key).suffix) as tmp:
|
||||
tmp_path = Path(tmp.name)
|
||||
|
||||
# Write stream to temp file
|
||||
with open(tmp_path, "wb") as f:
|
||||
shutil.copyfileobj(source_stream, f)
|
||||
|
||||
try:
|
||||
# Use DeltaService.put() with override_name to preserve original filename
|
||||
summary = service.put(
|
||||
tmp_path, dest_deltaspace, max_ratio, override_name=original_filename
|
||||
)
|
||||
|
||||
if not quiet:
|
||||
if summary.delta_size:
|
||||
ratio = round((summary.delta_size / summary.file_size) * 100, 1)
|
||||
click.echo(f"Copy completed with delta compression ({ratio}% of original)")
|
||||
else:
|
||||
click.echo("Copy completed (stored as reference)")
|
||||
finally:
|
||||
# Clean up temp file
|
||||
tmp_path.unlink(missing_ok=True)
|
||||
|
||||
except Exception as e:
|
||||
click.echo(f"S3-to-S3 copy failed: {e}", err=True)
|
||||
raise
|
||||
|
||||
|
||||
def migrate_s3_to_s3(
|
||||
service: DeltaService,
|
||||
source_url: str,
|
||||
dest_url: str,
|
||||
exclude: str | None = None,
|
||||
include: str | None = None,
|
||||
quiet: bool = False,
|
||||
no_delta: bool = False,
|
||||
max_ratio: float | None = None,
|
||||
dry_run: bool = False,
|
||||
skip_confirm: bool = False,
|
||||
preserve_prefix: bool = False,
|
||||
region_override: bool = False,
|
||||
) -> None:
|
||||
"""Migrate objects from one S3 location to another with delta compression.
|
||||
|
||||
Features:
|
||||
- Resume support: Only copies files that don't exist in destination
|
||||
- Progress tracking: Shows migration progress
|
||||
- Confirmation prompt: Shows file count before starting
|
||||
- Prefix preservation: Optionally preserves source prefix structure in destination
|
||||
- EC2 region detection: Warns about cross-region data transfer charges
|
||||
|
||||
Args:
|
||||
service: DeltaService instance
|
||||
source_url: Source S3 URL
|
||||
dest_url: Destination S3 URL
|
||||
exclude: Pattern to exclude files
|
||||
include: Pattern to include files
|
||||
quiet: Suppress output
|
||||
no_delta: Disable delta compression
|
||||
max_ratio: Maximum delta/file ratio
|
||||
dry_run: Show what would be migrated without migrating
|
||||
skip_confirm: Skip confirmation prompt
|
||||
preserve_prefix: Preserve source prefix in destination
|
||||
region_override: True if user explicitly specified --region flag
|
||||
"""
|
||||
import fnmatch
|
||||
|
||||
source_bucket, source_prefix = parse_s3_url(source_url)
|
||||
dest_bucket, dest_prefix = parse_s3_url(dest_url)
|
||||
|
||||
# Ensure prefixes end with / if they exist
|
||||
if source_prefix and not source_prefix.endswith("/"):
|
||||
source_prefix += "/"
|
||||
if dest_prefix and not dest_prefix.endswith("/"):
|
||||
dest_prefix += "/"
|
||||
|
||||
# Determine the effective destination prefix based on preserve_prefix setting
|
||||
effective_dest_prefix = dest_prefix
|
||||
if preserve_prefix and source_prefix:
|
||||
# Extract the last component of the source prefix (e.g., "prefix1" from "path/to/prefix1/")
|
||||
source_prefix_name = source_prefix.rstrip("/").split("/")[-1]
|
||||
if source_prefix_name:
|
||||
# Append source prefix name to destination
|
||||
effective_dest_prefix = (dest_prefix or "") + source_prefix_name + "/"
|
||||
|
||||
if not quiet:
|
||||
# Log AWS region being used (helps users verify their configuration)
|
||||
# Pass region_override to warn about cross-region charges if user explicitly set --region
|
||||
log_aws_region(service, region_override=region_override)
|
||||
|
||||
source_display = build_s3_url(source_bucket, source_prefix)
|
||||
dest_display = build_s3_url(dest_bucket, dest_prefix)
|
||||
effective_dest_display = build_s3_url(dest_bucket, effective_dest_prefix)
|
||||
|
||||
if preserve_prefix and source_prefix:
|
||||
click.echo(f"Migrating from {source_display}")
|
||||
click.echo(f" to {effective_dest_display}")
|
||||
else:
|
||||
click.echo(f"Migrating from {source_display} to {dest_display}")
|
||||
click.echo("Scanning source and destination buckets...")
|
||||
|
||||
# List source objects
|
||||
source_list_prefix = f"{source_bucket}/{source_prefix}" if source_prefix else source_bucket
|
||||
source_objects = []
|
||||
|
||||
for obj in service.storage.list(source_list_prefix):
|
||||
# Skip reference.bin files (internal delta reference)
|
||||
if obj.key.endswith("/reference.bin"):
|
||||
continue
|
||||
# Skip .delta files in source (we'll handle the original files)
|
||||
if obj.key.endswith(".delta"):
|
||||
continue
|
||||
|
||||
# Apply include/exclude filters
|
||||
rel_key = obj.key.removeprefix(source_prefix) if source_prefix else obj.key
|
||||
if exclude and fnmatch.fnmatch(rel_key, exclude):
|
||||
continue
|
||||
if include and not fnmatch.fnmatch(rel_key, include):
|
||||
continue
|
||||
|
||||
source_objects.append(obj)
|
||||
|
||||
# List destination objects to detect what needs copying
|
||||
dest_list_prefix = (
|
||||
f"{dest_bucket}/{effective_dest_prefix}" if effective_dest_prefix else dest_bucket
|
||||
)
|
||||
dest_keys = set()
|
||||
|
||||
for obj in service.storage.list(dest_list_prefix):
|
||||
# Get the relative key in destination
|
||||
rel_key = obj.key.removeprefix(effective_dest_prefix) if effective_dest_prefix else obj.key
|
||||
# Remove .delta suffix for comparison
|
||||
if rel_key.endswith(".delta"):
|
||||
rel_key = rel_key[:-6]
|
||||
# Skip reference.bin
|
||||
if not rel_key.endswith("/reference.bin"):
|
||||
dest_keys.add(rel_key)
|
||||
|
||||
# Determine files to migrate (not in destination)
|
||||
files_to_migrate = []
|
||||
total_size = 0
|
||||
|
||||
for source_obj in source_objects:
|
||||
# Get relative path from source prefix
|
||||
rel_key = source_obj.key.removeprefix(source_prefix) if source_prefix else source_obj.key
|
||||
|
||||
# Check if already exists in destination
|
||||
if rel_key not in dest_keys:
|
||||
files_to_migrate.append((source_obj, rel_key))
|
||||
total_size += source_obj.size
|
||||
|
||||
# Show summary and ask for confirmation
|
||||
if not files_to_migrate:
|
||||
if not quiet:
|
||||
click.echo("Copy completed")
|
||||
click.echo("All files are already migrated. Nothing to do.")
|
||||
return
|
||||
|
||||
if not quiet:
|
||||
|
||||
def format_bytes(size: int) -> str:
|
||||
size_float = float(size)
|
||||
for unit in ["B", "KB", "MB", "GB", "TB"]:
|
||||
if size_float < 1024.0:
|
||||
return f"{size_float:.2f} {unit}"
|
||||
size_float /= 1024.0
|
||||
return f"{size_float:.2f} PB"
|
||||
|
||||
click.echo("")
|
||||
click.echo(f"Files to migrate: {len(files_to_migrate)}")
|
||||
click.echo(f"Total size: {format_bytes(total_size)}")
|
||||
if len(dest_keys) > 0:
|
||||
click.echo(f"Already migrated: {len(dest_keys)} files (will be skipped)")
|
||||
|
||||
# Handle dry run mode early (before confirmation prompt)
|
||||
if dry_run:
|
||||
if not quiet:
|
||||
click.echo("\n--- DRY RUN MODE ---")
|
||||
for _obj, rel_key in files_to_migrate[:10]: # Show first 10 files
|
||||
click.echo(f" Would migrate: {rel_key}")
|
||||
if len(files_to_migrate) > 10:
|
||||
click.echo(f" ... and {len(files_to_migrate) - 10} more files")
|
||||
return
|
||||
|
||||
# Ask for confirmation before proceeding with actual migration
|
||||
if not quiet and not skip_confirm:
|
||||
click.echo("")
|
||||
if not click.confirm("Do you want to proceed with the migration?"):
|
||||
click.echo("Migration cancelled.")
|
||||
return
|
||||
|
||||
# Perform migration
|
||||
if not quiet:
|
||||
click.echo(f"\nStarting migration of {len(files_to_migrate)} files...")
|
||||
|
||||
successful = 0
|
||||
failed = 0
|
||||
failed_files = []
|
||||
|
||||
for i, (source_obj, rel_key) in enumerate(files_to_migrate, 1):
|
||||
source_s3_url = build_s3_url(source_bucket, source_obj.key)
|
||||
|
||||
# Construct destination URL using effective prefix
|
||||
if effective_dest_prefix:
|
||||
dest_key = effective_dest_prefix + rel_key
|
||||
else:
|
||||
dest_key = rel_key
|
||||
dest_s3_url = build_s3_url(dest_bucket, dest_key)
|
||||
|
||||
try:
|
||||
if not quiet:
|
||||
progress = f"[{i}/{len(files_to_migrate)}]"
|
||||
click.echo(f"{progress} Migrating {rel_key}...", nl=False)
|
||||
|
||||
# Copy with delta compression
|
||||
copy_s3_to_s3(
|
||||
service,
|
||||
source_s3_url,
|
||||
dest_s3_url,
|
||||
quiet=True,
|
||||
max_ratio=max_ratio,
|
||||
no_delta=no_delta,
|
||||
)
|
||||
|
||||
successful += 1
|
||||
if not quiet:
|
||||
click.echo(" ✓")
|
||||
|
||||
except Exception as e:
|
||||
failed += 1
|
||||
failed_files.append((rel_key, str(e)))
|
||||
if not quiet:
|
||||
click.echo(f" ✗ ({e})")
|
||||
|
||||
# Show final summary
|
||||
if not quiet:
|
||||
click.echo("")
|
||||
click.echo("Migration Summary:")
|
||||
click.echo(f" Successfully migrated: {successful} files")
|
||||
if failed > 0:
|
||||
click.echo(f" Failed: {failed} files")
|
||||
click.echo("\nFailed files:")
|
||||
for file, error in failed_files[:10]: # Show first 10 failures
|
||||
click.echo(f" {file}: {error}")
|
||||
if len(failed_files) > 10:
|
||||
click.echo(f" ... and {len(failed_files) - 10} more failures")
|
||||
|
||||
# Show compression statistics from cache if available (no bucket scan)
|
||||
if successful > 0 and not no_delta:
|
||||
try:
|
||||
from ...client import DeltaGliderClient
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
# Use cached stats only - don't scan bucket (prevents blocking)
|
||||
cached_stats = client._get_cached_bucket_stats(dest_bucket, "quick")
|
||||
if cached_stats and cached_stats.delta_objects > 0:
|
||||
click.echo(
|
||||
f"\nCompression achieved: {cached_stats.average_compression_ratio:.1%}"
|
||||
)
|
||||
click.echo(f"Space saved: {format_bytes(cached_stats.space_saved)}")
|
||||
except Exception:
|
||||
pass # Ignore stats errors
|
||||
|
||||
|
||||
def handle_recursive(
|
||||
@@ -228,10 +601,7 @@ def handle_recursive(
|
||||
dest_path = Path(dest)
|
||||
dest_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# List all objects with prefix
|
||||
# Note: S3StorageAdapter.list() expects "bucket/prefix" format
|
||||
list_prefix = f"{bucket}/{prefix}" if prefix else bucket
|
||||
objects = list(service.storage.list(list_prefix))
|
||||
objects = fetch_s3_object_heads(service, bucket, prefix)
|
||||
|
||||
if not quiet:
|
||||
click.echo(f"Downloading {len(objects)} files...")
|
||||
@@ -261,9 +631,22 @@ def handle_recursive(
|
||||
local_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Download file
|
||||
s3_url = f"s3://{bucket}/{obj.key}"
|
||||
s3_url = build_s3_url(bucket, obj.key)
|
||||
download_file(service, s3_url, local_path, quiet)
|
||||
|
||||
else:
|
||||
click.echo("S3-to-S3 recursive copy not yet implemented", err=True)
|
||||
sys.exit(1)
|
||||
elif operation == "copy":
|
||||
# S3-to-S3 recursive copy with migration support
|
||||
migrate_s3_to_s3(
|
||||
service,
|
||||
source,
|
||||
dest,
|
||||
exclude=exclude,
|
||||
include=include,
|
||||
quiet=quiet,
|
||||
no_delta=no_delta,
|
||||
max_ratio=max_ratio,
|
||||
dry_run=False,
|
||||
skip_confirm=True, # Don't prompt for cp command
|
||||
preserve_prefix=True, # Always preserve prefix for cp -r
|
||||
region_override=False, # cp command doesn't track region override explicitly
|
||||
)
|
||||
|
||||
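A hedged sketch of driving the helpers above programmatically; the DeltaService instance is assumed to come from the create_service factory shown in the CLI entry point below, and the bucket names are illustrative.

    # service: DeltaService wired with an S3StorageAdapter (see create_service below)

    # Dry run first: lists what would be copied, nothing is transferred.
    migrate_s3_to_s3(
        service,
        "s3://old-bucket/releases/",
        "s3://new-bucket/archive/",
        include="*.zip",
        dry_run=True,
    )

    # Actual migration: resume-safe (keys already present are skipped) and
    # non-interactive thanks to skip_confirm.
    migrate_s3_to_s3(
        service,
        "s3://old-bucket/releases/",
        "s3://new-bucket/archive/",
        include="*.zip",
        skip_confirm=True,
        preserve_prefix=True,
    )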
@@ -1,14 +1,19 @@
|
||||
"""CLI main entry point."""
|
||||
|
||||
import atexit
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
from datetime import UTC
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import click
|
||||
|
||||
from ... import __version__
|
||||
from ...adapters import (
|
||||
FsCacheAdapter,
|
||||
NoopMetricsAdapter,
|
||||
S3StorageAdapter,
|
||||
Sha256Adapter,
|
||||
@@ -17,7 +22,9 @@ from ...adapters import (
|
||||
XdeltaAdapter,
|
||||
)
|
||||
from ...core import DeltaService, ObjectKey
|
||||
from ...core.config import DeltaGliderConfig
|
||||
from ...ports import MetricsPort
|
||||
from ...ports.cache import CachePort
|
||||
from .aws_compat import (
|
||||
copy_s3_to_s3,
|
||||
determine_operation,
|
||||
@@ -35,48 +42,87 @@ def create_service(
|
||||
endpoint_url: str | None = None,
|
||||
region: str | None = None,
|
||||
profile: str | None = None,
|
||||
*,
|
||||
config: DeltaGliderConfig | None = None,
|
||||
) -> DeltaService:
|
||||
"""Create service with wired adapters."""
|
||||
# Get config from environment
|
||||
cache_dir = Path(os.environ.get("DG_CACHE_DIR", "/tmp/.deltaglider/reference_cache"))
|
||||
max_ratio = float(os.environ.get("DG_MAX_RATIO", "0.5"))
|
||||
metrics_type = os.environ.get("DG_METRICS", "logging") # Options: noop, logging, cloudwatch
|
||||
"""Create service with wired adapters.
|
||||
|
||||
# Set AWS environment variables if provided
|
||||
if endpoint_url:
|
||||
os.environ["AWS_ENDPOINT_URL"] = endpoint_url
|
||||
if region:
|
||||
os.environ["AWS_DEFAULT_REGION"] = region
|
||||
if profile:
|
||||
os.environ["AWS_PROFILE"] = profile
|
||||
Args:
|
||||
log_level: Logging level (overridden by config.log_level if config provided).
|
||||
endpoint_url: S3 endpoint URL (overridden by config if provided).
|
||||
region: AWS region (overridden by config if provided).
|
||||
profile: AWS profile (overridden by config if provided).
|
||||
config: Optional pre-built config. If None, built from env vars + explicit params.
|
||||
"""
|
||||
if config is None:
|
||||
config = DeltaGliderConfig.from_env(
|
||||
log_level=log_level,
|
||||
endpoint_url=endpoint_url,
|
||||
region=region,
|
||||
profile=profile,
|
||||
)
|
||||
|
||||
# SECURITY: Always use ephemeral process-isolated cache
|
||||
cache_dir = Path(tempfile.mkdtemp(prefix="deltaglider-", dir="/tmp"))
|
||||
# Register cleanup handler to remove cache on exit
|
||||
atexit.register(lambda: shutil.rmtree(cache_dir, ignore_errors=True))
|
||||
|
||||
# Set AWS environment variables if provided (for compatibility with other AWS tools)
|
||||
if config.endpoint_url:
|
||||
os.environ["AWS_ENDPOINT_URL"] = config.endpoint_url
|
||||
if config.region:
|
||||
os.environ["AWS_DEFAULT_REGION"] = config.region
|
||||
if config.profile:
|
||||
os.environ["AWS_PROFILE"] = config.profile
|
||||
|
||||
# Build boto3_kwargs for explicit parameter passing (preferred over env vars)
|
||||
boto3_kwargs: dict[str, Any] = {}
|
||||
if config.region:
|
||||
boto3_kwargs["region_name"] = config.region
|
||||
|
||||
# Create adapters
|
||||
hasher = Sha256Adapter()
|
||||
storage = S3StorageAdapter(endpoint_url=endpoint_url)
|
||||
storage = S3StorageAdapter(endpoint_url=config.endpoint_url, boto3_kwargs=boto3_kwargs)
|
||||
diff = XdeltaAdapter()
|
||||
cache = FsCacheAdapter(cache_dir, hasher)
|
||||
|
||||
# SECURITY: Configurable cache with encryption and backend selection
|
||||
from deltaglider.adapters import ContentAddressedCache, EncryptedCache, MemoryCache
|
||||
|
||||
base_cache: CachePort
|
||||
if config.cache_backend == "memory":
|
||||
base_cache = MemoryCache(
|
||||
hasher, max_size_mb=config.cache_memory_size_mb, temp_dir=cache_dir
|
||||
)
|
||||
else:
|
||||
base_cache = ContentAddressedCache(cache_dir, hasher)
|
||||
|
||||
# Always apply encryption with ephemeral keys (security hardening)
|
||||
cache: CachePort = EncryptedCache.from_env(base_cache)
|
||||
|
||||
clock = UtcClockAdapter()
|
||||
logger = StdLoggerAdapter(level=log_level)
|
||||
logger = StdLoggerAdapter(level=config.log_level)
|
||||
|
||||
# Create metrics adapter based on configuration
|
||||
metrics: MetricsPort
|
||||
if metrics_type == "cloudwatch":
|
||||
# Import here to avoid dependency if not used
|
||||
if config.metrics_type == "cloudwatch":
|
||||
from ...adapters.metrics_cloudwatch import CloudWatchMetricsAdapter
|
||||
|
||||
metrics = CloudWatchMetricsAdapter(
|
||||
namespace=os.environ.get("DG_METRICS_NAMESPACE", "DeltaGlider"),
|
||||
region=region,
|
||||
endpoint_url=endpoint_url if endpoint_url and "localhost" in endpoint_url else None,
|
||||
namespace=config.metrics_namespace,
|
||||
region=config.region,
|
||||
endpoint_url=(
|
||||
config.endpoint_url
|
||||
if config.endpoint_url and "localhost" in config.endpoint_url
|
||||
else None
|
||||
),
|
||||
)
|
||||
elif metrics_type == "logging":
|
||||
elif config.metrics_type == "logging":
|
||||
from ...adapters.metrics_cloudwatch import LoggingMetricsAdapter
|
||||
|
||||
metrics = LoggingMetricsAdapter(log_level=log_level)
|
||||
metrics = LoggingMetricsAdapter(log_level=config.log_level)
|
||||
else:
|
||||
metrics = NoopMetricsAdapter()
|
||||
|
||||
# Create service
|
||||
return DeltaService(
|
||||
storage=storage,
|
||||
diff=diff,
|
||||
@@ -85,17 +131,35 @@ def create_service(
|
||||
clock=clock,
|
||||
logger=logger,
|
||||
metrics=metrics,
|
||||
max_ratio=max_ratio,
|
||||
max_ratio=config.max_ratio,
|
||||
)
|
||||
|
||||
|
||||
def _version_callback(ctx: click.Context, param: click.Parameter, value: bool) -> None:
|
||||
"""Callback for --version option."""
|
||||
if value:
|
||||
click.echo(f"deltaglider {__version__}")
|
||||
ctx.exit(0)
|
||||
|
||||
|
||||
@click.group()
|
||||
@click.option("--debug", is_flag=True, help="Enable debug logging")
|
||||
@click.option(
|
||||
"--version",
|
||||
is_flag=True,
|
||||
is_eager=True,
|
||||
expose_value=False,
|
||||
callback=_version_callback,
|
||||
help="Show version and exit",
|
||||
)
|
||||
@click.pass_context
|
||||
def cli(ctx: click.Context, debug: bool) -> None:
|
||||
"""DeltaGlider - Delta-aware S3 file storage wrapper."""
|
||||
import logging
|
||||
|
||||
log_level = "DEBUG" if debug else os.environ.get("DG_LOG_LEVEL", "INFO")
|
||||
ctx.obj = create_service(log_level)
|
||||
logging.getLogger("deltaglider").info("deltaglider %s", __version__)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@@ -148,9 +212,6 @@ def cp(
|
||||
|
||||
# Handle recursive operations for directories
|
||||
if recursive:
|
||||
if operation == "copy":
|
||||
click.echo("S3-to-S3 recursive copy not yet implemented", err=True)
|
||||
sys.exit(1)
|
||||
handle_recursive(
|
||||
service, source, dest, recursive, exclude, include, quiet, no_delta, max_ratio
|
||||
)
|
||||
@@ -172,7 +233,7 @@ def cp(
|
||||
download_file(service, source, local_path, quiet)
|
||||
|
||||
elif operation == "copy":
|
||||
copy_s3_to_s3(service, source, dest, quiet)
|
||||
copy_s3_to_s3(service, source, dest, quiet, max_ratio, no_delta)
|
||||
|
||||
except ValueError as e:
|
||||
click.echo(f"Error: {e}", err=True)
|
||||
@@ -240,6 +301,13 @@ def ls(
|
||||
prefix_str: str
|
||||
bucket_name, prefix_str = parse_s3_url(s3_url)
|
||||
|
||||
# Ensure prefix ends with / if it's meant to be a directory
|
||||
# This helps with proper path handling
|
||||
if prefix_str and not prefix_str.endswith("/"):
|
||||
# Check if this is a file or directory by listing
|
||||
# For now, assume it's a directory prefix
|
||||
prefix_str = prefix_str + "/"
|
||||
|
||||
# Format bytes to human readable
|
||||
def format_bytes(size: int) -> str:
|
||||
if not human_readable:
|
||||
@@ -252,33 +320,38 @@ def ls(
|
||||
return f"{size_float:.1f}P"
|
||||
|
||||
# List objects using SDK (automatically filters .delta and reference.bin)
|
||||
from deltaglider.client import DeltaGliderClient, ListObjectsResponse
|
||||
from deltaglider.client import DeltaGliderClient
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
dg_response: ListObjectsResponse = client.list_objects(
|
||||
Bucket=bucket_name, Prefix=prefix_str, MaxKeys=10000
|
||||
dg_response = client.list_objects(
|
||||
Bucket=bucket_name,
|
||||
Prefix=prefix_str,
|
||||
MaxKeys=10000,
|
||||
Delimiter="/" if not recursive else "",
|
||||
)
|
||||
objects = dg_response.contents
|
||||
objects = dg_response["Contents"]
|
||||
|
||||
# Filter by recursive flag
|
||||
if not recursive:
|
||||
# Only show direct children
|
||||
seen_prefixes = set()
|
||||
# Show common prefixes (subdirectories) from S3 response
|
||||
for common_prefix in dg_response.get("CommonPrefixes", []):
|
||||
prefix_path = common_prefix.get("Prefix", "")
|
||||
# Show only the directory name, not the full path
|
||||
if prefix_str:
|
||||
# Strip the current prefix to show only the subdirectory
|
||||
display_name = prefix_path[len(prefix_str) :]
|
||||
else:
|
||||
display_name = prefix_path
|
||||
click.echo(f" PRE {display_name}")
|
||||
|
||||
# Only show files at current level (not in subdirectories)
|
||||
filtered_objects = []
|
||||
for obj in objects:
|
||||
rel_path = obj.key[len(prefix_str) :] if prefix_str else obj.key
|
||||
if "/" in rel_path:
|
||||
# It's in a subdirectory
|
||||
subdir = rel_path.split("/")[0] + "/"
|
||||
if subdir not in seen_prefixes:
|
||||
seen_prefixes.add(subdir)
|
||||
# Show as directory
|
||||
full_prefix = f"{prefix_str}{subdir}" if prefix_str else subdir
|
||||
click.echo(f" PRE {full_prefix}")
|
||||
else:
|
||||
# Direct file
|
||||
if rel_path: # Only add if there's actually a file at this level
|
||||
filtered_objects.append(obj)
|
||||
obj_key = obj["Key"]
|
||||
rel_path = obj_key[len(prefix_str) :] if prefix_str else obj_key
|
||||
# Only include if it's a direct child (no / in relative path)
|
||||
if "/" not in rel_path and rel_path:
|
||||
filtered_objects.append(obj)
|
||||
objects = filtered_objects
|
||||
|
||||
# Display objects (SDK already filters reference.bin and strips .delta)
|
||||
@@ -286,19 +359,26 @@ def ls(
|
||||
total_count = 0
|
||||
|
||||
for obj in objects:
|
||||
total_size += obj.size
|
||||
total_size += obj["Size"]
|
||||
total_count += 1
|
||||
|
||||
# Format the display
|
||||
size_str = format_bytes(obj.size)
|
||||
size_str = format_bytes(obj["Size"])
|
||||
# last_modified is a string from SDK, parse it if needed
|
||||
if isinstance(obj.last_modified, str):
|
||||
last_modified = obj.get("LastModified", "")
|
||||
if isinstance(last_modified, str):
|
||||
# Already a string, extract date portion
|
||||
date_str = obj.last_modified[:19].replace("T", " ")
|
||||
date_str = last_modified[:19].replace("T", " ")
|
||||
else:
|
||||
date_str = obj.last_modified.strftime("%Y-%m-%d %H:%M:%S")
|
||||
date_str = last_modified.strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
click.echo(f"{date_str} {size_str:>10} s3://{bucket_name}/{obj.key}")
|
||||
# Show only the filename relative to current prefix (like AWS CLI)
|
||||
if prefix_str:
|
||||
display_key = obj["Key"][len(prefix_str) :]
|
||||
else:
|
||||
display_key = obj["Key"]
|
||||
|
||||
click.echo(f"{date_str} {size_str:>10} {display_key}")
|
||||
|
||||
# Show summary if requested
|
||||
if summarize:
|
||||
@@ -426,24 +506,24 @@ def rm(
|
||||
|
||||
# Report the results
|
||||
if not quiet:
|
||||
if result["deleted_count"] == 0:
|
||||
if result.deleted_count == 0:
|
||||
click.echo(f"delete: No objects found with prefix: s3://{bucket}/{prefix}")
|
||||
else:
|
||||
click.echo(f"Deleted {result['deleted_count']} object(s)")
|
||||
click.echo(f"Deleted {result.deleted_count} object(s)")
|
||||
|
||||
# Show warnings if any references were kept
|
||||
for warning in result.get("warnings", []):
|
||||
for warning in result.warnings:
|
||||
if "Kept reference" in warning:
|
||||
click.echo(
|
||||
f"Keeping reference file (still in use): s3://{bucket}/{warning.split()[2]}"
|
||||
)
|
||||
|
||||
# Report any errors
|
||||
if result["failed_count"] > 0:
|
||||
for error in result.get("errors", []):
|
||||
if result.failed_count > 0:
|
||||
for error in result.errors:
|
||||
click.echo(f"Error: {error}", err=True)
|
||||
|
||||
if result["failed_count"] > 0:
|
||||
if result.failed_count > 0:
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
@@ -561,20 +641,14 @@ def sync(
|
||||
@click.pass_obj
|
||||
def verify(service: DeltaService, s3_url: str) -> None:
|
||||
"""Verify integrity of delta file."""
|
||||
# Parse S3 URL
|
||||
if not s3_url.startswith("s3://"):
|
||||
try:
|
||||
bucket, key = parse_s3_url(s3_url)
|
||||
if not key:
|
||||
raise ValueError("Missing key")
|
||||
except ValueError:
|
||||
click.echo(f"Error: Invalid S3 URL: {s3_url}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
s3_path = s3_url[5:]
|
||||
parts = s3_path.split("/", 1)
|
||||
if len(parts) != 2:
|
||||
click.echo(f"Error: Invalid S3 URL: {s3_url}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
bucket = parts[0]
|
||||
key = parts[1]
|
||||
|
||||
obj_key = ObjectKey(bucket=bucket, key=key)
|
||||
|
||||
try:
|
||||
@@ -597,6 +671,538 @@ def verify(service: DeltaService, s3_url: str) -> None:
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.argument("source")
|
||||
@click.argument("dest")
|
||||
@click.option("--exclude", help="Exclude files matching pattern")
|
||||
@click.option("--include", help="Include only files matching pattern")
|
||||
@click.option("--quiet", "-q", is_flag=True, help="Suppress output")
|
||||
@click.option("--no-delta", is_flag=True, help="Disable delta compression")
|
||||
@click.option("--max-ratio", type=float, help="Max delta/file ratio (default: 0.5)")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be migrated without migrating")
|
||||
@click.option("--yes", "-y", is_flag=True, help="Skip confirmation prompt")
|
||||
@click.option(
|
||||
"--no-preserve-prefix", is_flag=True, help="Don't preserve source prefix in destination"
|
||||
)
|
||||
@click.option("--endpoint-url", help="Override S3 endpoint URL")
|
||||
@click.option("--region", help="AWS region")
|
||||
@click.option("--profile", help="AWS profile to use")
|
||||
@click.pass_obj
|
||||
def migrate(
|
||||
service: DeltaService,
|
||||
source: str,
|
||||
dest: str,
|
||||
exclude: str | None,
|
||||
include: str | None,
|
||||
quiet: bool,
|
||||
no_delta: bool,
|
||||
max_ratio: float | None,
|
||||
dry_run: bool,
|
||||
yes: bool,
|
||||
no_preserve_prefix: bool,
|
||||
endpoint_url: str | None,
|
||||
region: str | None,
|
||||
profile: str | None,
|
||||
) -> None:
|
||||
"""Migrate S3 bucket/prefix to DeltaGlider-compressed storage.
|
||||
|
||||
This command facilitates the migration of existing S3 objects to another bucket
|
||||
with DeltaGlider compression. It supports:
|
||||
- Resume capability: Only copies files that don't exist in destination
|
||||
- Progress tracking: Shows migration progress
|
||||
- Confirmation prompt: Shows file count before starting (use --yes to skip)
|
||||
- Prefix preservation: By default, source prefix is preserved in destination
|
||||
|
||||
When migrating a prefix, the source prefix name is preserved by default:
|
||||
s3://src/prefix1/ → s3://dest/ creates s3://dest/prefix1/
|
||||
s3://src/a/b/c/ → s3://dest/x/ creates s3://dest/x/c/
|
||||
|
||||
Use --no-preserve-prefix to disable this behavior:
|
||||
s3://src/prefix1/ → s3://dest/ creates s3://dest/ (files at root)
|
||||
|
||||
Examples:
|
||||
deltaglider migrate s3://old-bucket/ s3://new-bucket/
|
||||
deltaglider migrate s3://old-bucket/data/ s3://new-bucket/
|
||||
deltaglider migrate --no-preserve-prefix s3://src/v1/ s3://dest/
|
||||
deltaglider migrate --dry-run s3://old-bucket/ s3://new-bucket/
|
||||
deltaglider migrate --yes --quiet s3://old-bucket/ s3://new-bucket/
|
||||
"""
|
||||
from .aws_compat import is_s3_path, migrate_s3_to_s3
|
||||
|
||||
# Recreate service with AWS parameters if provided
|
||||
if endpoint_url or region or profile:
|
||||
service = create_service(
|
||||
log_level=os.environ.get("DG_LOG_LEVEL", "INFO"),
|
||||
endpoint_url=endpoint_url,
|
||||
region=region,
|
||||
profile=profile,
|
||||
)
|
||||
|
||||
try:
|
||||
# Validate both paths are S3
|
||||
if not is_s3_path(source) or not is_s3_path(dest):
|
||||
click.echo("Error: Both source and destination must be S3 paths", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
# Perform migration
|
||||
migrate_s3_to_s3(
|
||||
service,
|
||||
source,
|
||||
dest,
|
||||
exclude=exclude,
|
||||
include=include,
|
||||
quiet=quiet,
|
||||
no_delta=no_delta,
|
||||
max_ratio=max_ratio,
|
||||
dry_run=dry_run,
|
||||
skip_confirm=yes,
|
||||
preserve_prefix=not no_preserve_prefix,
|
||||
region_override=region is not None, # True if user explicitly specified --region
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
click.echo(f"Migration failed: {e}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@cli.command(short_help="Get bucket statistics and compression metrics")
|
||||
@click.argument("bucket")
|
||||
@click.option("--sampled", is_flag=True, help="Balanced mode: one sample per deltaspace (~5-15s)")
|
||||
@click.option(
|
||||
"--detailed", is_flag=True, help="Most accurate: HEAD for all deltas (slowest, ~1min+)"
|
||||
)
|
||||
@click.option("--refresh", is_flag=True, help="Force cache refresh even if valid")
|
||||
@click.option("--no-cache", is_flag=True, help="Skip caching entirely (both read and write)")
|
||||
@click.option("--json", "output_json", is_flag=True, help="Output in JSON format")
|
||||
@click.pass_obj
|
||||
def stats(
|
||||
service: DeltaService,
|
||||
bucket: str,
|
||||
sampled: bool,
|
||||
detailed: bool,
|
||||
refresh: bool,
|
||||
no_cache: bool,
|
||||
output_json: bool,
|
||||
) -> None:
|
||||
"""Get bucket statistics and compression metrics with intelligent S3-based caching.
|
||||
|
||||
BUCKET can be specified as:
|
||||
- s3://bucket-name/
|
||||
- s3://bucket-name
|
||||
- bucket-name
|
||||
|
||||
Modes (mutually exclusive):
|
||||
- quick (default): Fast listing-only stats (~0.5s), approximate compression metrics
|
||||
- --sampled: Balanced mode - one HEAD per deltaspace (~5-15s for typical buckets)
|
||||
- --detailed: Most accurate - HEAD for every delta file (slowest, ~1min+ for large buckets)
|
||||
|
||||
Caching (NEW - massive performance improvement!):
|
||||
Stats are cached in S3 at .deltaglider/stats_{mode}.json (one per mode).
|
||||
Cache is automatically validated on every call using object count + size.
|
||||
If bucket changed, stats are recomputed automatically.
|
||||
|
||||
Performance with cache:
|
||||
- Cache hit: ~0.1s (200x faster than recomputation!)
|
||||
- Cache miss: Full computation time (creates cache for next time)
|
||||
- Cache invalid: Auto-recomputes when bucket changes
|
||||
|
||||
Options:
|
||||
--refresh: Force cache refresh even if valid (use when you need fresh data now)
|
||||
--no-cache: Skip caching entirely - always recompute (useful for testing/debugging)
|
||||
--json: Output in JSON format for automation/scripting
|
||||
|
||||
Examples:
|
||||
deltaglider stats mybucket # Fast (~0.1s with cache, ~0.5s without)
|
||||
deltaglider stats mybucket --sampled # Balanced accuracy/speed (~5-15s first run)
|
||||
deltaglider stats mybucket --detailed # Most accurate (~1-10min first run, ~0.1s cached)
|
||||
deltaglider stats mybucket --refresh # Force recomputation even if cached
|
||||
deltaglider stats mybucket --no-cache # Always compute fresh (skip cache)
|
||||
deltaglider stats mybucket --json # JSON output for scripts
|
||||
deltaglider stats s3://mybucket/ # Also accepts s3:// URLs
|
||||
|
||||
Timing Logs:
|
||||
Set DG_LOG_LEVEL=INFO to see detailed phase timing with timestamps:
|
||||
[HH:MM:SS.mmm] Phase 1: LIST completed in 0.52s - Found 1523 objects
|
||||
[HH:MM:SS.mmm] Phase 2: Cache HIT in 0.06s - Using cached stats
|
||||
[HH:MM:SS.mmm] COMPLETE: Total time 0.58s
|
||||
|
||||
See docs/STATS_CACHING.md for complete documentation.
|
||||
"""
|
||||
from ...client import DeltaGliderClient
|
||||
from ...client_operations.stats import StatsMode
|
||||
|
||||
try:
|
||||
# Parse bucket from S3 URL if needed
|
||||
if is_s3_path(bucket):
|
||||
bucket, _prefix = parse_s3_url(bucket)
|
||||
|
||||
if not bucket:
|
||||
click.echo("Error: Invalid bucket name", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
if sampled and detailed:
|
||||
click.echo("Error: --sampled and --detailed cannot be used together", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
if refresh and no_cache:
|
||||
click.echo("Error: --refresh and --no-cache cannot be used together", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
mode: StatsMode = "quick"
|
||||
if sampled:
|
||||
mode = "sampled"
|
||||
if detailed:
|
||||
mode = "detailed"
|
||||
|
||||
# Create client from service
|
||||
client = DeltaGliderClient(service=service)
|
||||
|
||||
# Get bucket stats with caching control
|
||||
use_cache = not no_cache
|
||||
bucket_stats = client.get_bucket_stats(
|
||||
bucket, mode=mode, use_cache=use_cache, refresh_cache=refresh
|
||||
)
|
||||
|
||||
if output_json:
|
||||
# JSON output
|
||||
output = {
|
||||
"bucket": bucket_stats.bucket,
|
||||
"object_count": bucket_stats.object_count,
|
||||
"total_size": bucket_stats.total_size,
|
||||
"compressed_size": bucket_stats.compressed_size,
|
||||
"space_saved": bucket_stats.space_saved,
|
||||
"average_compression_ratio": bucket_stats.average_compression_ratio,
|
||||
"delta_objects": bucket_stats.delta_objects,
|
||||
"direct_objects": bucket_stats.direct_objects,
|
||||
}
|
||||
click.echo(json.dumps(output, indent=2))
|
||||
else:
|
||||
# Human-readable output
|
||||
def format_bytes(size: float) -> str:
|
||||
"""Format bytes to human-readable size."""
|
||||
for unit in ["B", "KB", "MB", "GB", "TB"]:
|
||||
if size < 1024.0:
|
||||
return f"{size:.2f} {unit}"
|
||||
size /= 1024.0
|
||||
return f"{size:.2f} PB"
|
||||
|
||||
click.echo(f"Bucket Statistics: {bucket_stats.bucket}")
|
||||
click.echo(f"{'=' * 60}")
|
||||
click.echo(f"Total Objects: {bucket_stats.object_count:,}")
|
||||
click.echo(f" Delta Objects: {bucket_stats.delta_objects:,}")
|
||||
click.echo(f" Direct Objects: {bucket_stats.direct_objects:,}")
|
||||
click.echo("")
|
||||
click.echo(
|
||||
f"Original Size: {format_bytes(bucket_stats.total_size)} ({bucket_stats.total_size:,} bytes)"
|
||||
)
|
||||
click.echo(
|
||||
f"Compressed Size: {format_bytes(bucket_stats.compressed_size)} ({bucket_stats.compressed_size:,} bytes)"
|
||||
)
|
||||
click.echo(
|
||||
f"Space Saved: {format_bytes(bucket_stats.space_saved)} ({bucket_stats.space_saved:,} bytes)"
|
||||
)
|
||||
click.echo(f"Compression Ratio: {bucket_stats.average_compression_ratio:.1%}")
|
||||
|
||||
except Exception as e:
|
||||
click.echo(f"Error: {e}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.argument("bucket")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be deleted without deleting")
|
||||
@click.option("--json", "output_json", is_flag=True, help="Output in JSON format")
|
||||
@click.option("--endpoint-url", help="Override S3 endpoint URL")
|
||||
@click.option("--region", help="AWS region")
|
||||
@click.option("--profile", help="AWS profile to use")
|
||||
@click.pass_obj
|
||||
def purge(
|
||||
service: DeltaService,
|
||||
bucket: str,
|
||||
dry_run: bool,
|
||||
output_json: bool,
|
||||
endpoint_url: str | None,
|
||||
region: str | None,
|
||||
profile: str | None,
|
||||
) -> None:
|
||||
"""Purge expired temporary files from .deltaglider/tmp/.
|
||||
|
||||
This command scans the .deltaglider/tmp/ prefix in the specified bucket
|
||||
and deletes any files whose dg-expires-at metadata indicates they have expired.
|
||||
|
||||
These temporary files are created by the rehydration process when deltaglider-compressed
|
||||
files need to be made available for direct download (e.g., via presigned URLs).
|
||||
|
||||
BUCKET can be specified as:
|
||||
- s3://bucket-name/
|
||||
- s3://bucket-name
|
||||
- bucket-name
|
||||
|
||||
Examples:
|
||||
deltaglider purge mybucket # Purge expired files
|
||||
deltaglider purge mybucket --dry-run # Preview what would be deleted
|
||||
deltaglider purge mybucket --json # JSON output for automation
|
||||
deltaglider purge s3://mybucket/ # Also accepts s3:// URLs
|
||||
"""
|
||||
# Recreate service with AWS parameters if provided
|
||||
if endpoint_url or region or profile:
|
||||
service = create_service(
|
||||
log_level=os.environ.get("DG_LOG_LEVEL", "INFO"),
|
||||
endpoint_url=endpoint_url,
|
||||
region=region,
|
||||
profile=profile,
|
||||
)
|
||||
|
||||
try:
|
||||
# Parse bucket from S3 URL if needed
|
||||
if is_s3_path(bucket):
|
||||
bucket, _prefix = parse_s3_url(bucket)
|
||||
|
||||
if not bucket:
|
||||
click.echo("Error: Invalid bucket name", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
# Perform the purge (or dry run simulation)
|
||||
if dry_run:
|
||||
# For dry run, we need to simulate what would be deleted
|
||||
prefix = ".deltaglider/tmp/"
|
||||
expired_files = []
|
||||
total_size = 0
|
||||
|
||||
# List all objects in temp directory
|
||||
from datetime import datetime
|
||||
|
||||
import boto3
|
||||
|
||||
s3_client = boto3.client(
|
||||
"s3",
|
||||
endpoint_url=endpoint_url or os.environ.get("AWS_ENDPOINT_URL"),
|
||||
region_name=region,
|
||||
)
|
||||
|
||||
paginator = s3_client.get_paginator("list_objects_v2")
|
||||
page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix)
|
||||
|
||||
for page in page_iterator:
|
||||
for obj in page.get("Contents", []):
|
||||
# Get object metadata
|
||||
head_response = s3_client.head_object(Bucket=bucket, Key=obj["Key"])
|
||||
metadata = head_response.get("Metadata", {})
|
||||
|
||||
expires_at_str = metadata.get("dg-expires-at")
|
||||
if expires_at_str:
|
||||
try:
|
||||
expires_at = datetime.fromisoformat(
|
||||
expires_at_str.replace("Z", "+00:00")
|
||||
)
|
||||
if expires_at.tzinfo is None:
|
||||
expires_at = expires_at.replace(tzinfo=UTC)
|
||||
|
||||
if datetime.now(UTC) >= expires_at:
|
||||
expired_files.append(
|
||||
{
|
||||
"key": obj["Key"],
|
||||
"size": obj["Size"],
|
||||
"expires_at": expires_at_str,
|
||||
}
|
||||
)
|
||||
total_size += obj["Size"]
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
if output_json:
|
||||
output = {
|
||||
"bucket": bucket,
|
||||
"prefix": prefix,
|
||||
"dry_run": True,
|
||||
"would_delete_count": len(expired_files),
|
||||
"total_size_to_free": total_size,
|
||||
"expired_files": expired_files[:10], # Show first 10
|
||||
}
|
||||
click.echo(json.dumps(output, indent=2))
|
||||
else:
|
||||
click.echo(f"Dry run: Would delete {len(expired_files)} expired file(s)")
|
||||
click.echo(f"Total space to free: {total_size:,} bytes")
|
||||
if expired_files:
|
||||
click.echo("\nFiles that would be deleted (first 10):")
|
||||
for file_info in expired_files[:10]:
|
||||
click.echo(f" {file_info['key']} (expires: {file_info['expires_at']})")
|
||||
if len(expired_files) > 10:
|
||||
click.echo(f" ... and {len(expired_files) - 10} more")
|
||||
else:
|
||||
# Perform actual purge using the service method
|
||||
result = service.purge_temp_files(bucket)
|
||||
|
||||
if output_json:
|
||||
# JSON output
|
||||
click.echo(json.dumps(result, indent=2))
|
||||
else:
|
||||
# Human-readable output
|
||||
click.echo(f"Purge Statistics for bucket: {bucket}")
|
||||
click.echo(f"{'=' * 60}")
|
||||
click.echo(f"Expired files found: {result['expired_count']}")
|
||||
click.echo(f"Files deleted: {result['deleted_count']}")
|
||||
click.echo(f"Errors: {result['error_count']}")
|
||||
click.echo(f"Space freed: {result['total_size_freed']:,} bytes")
|
||||
click.echo(f"Duration: {result['duration_seconds']:.2f} seconds")
|
||||
|
||||
if result["errors"]:
|
||||
click.echo("\nErrors encountered:")
|
||||
for error in result["errors"][:5]:
|
||||
click.echo(f" - {error}")
|
||||
if len(result["errors"]) > 5:
|
||||
click.echo(f" ... and {len(result['errors']) - 5} more errors")
|
||||
|
||||
except Exception as e:
|
||||
click.echo(f"Error: {e}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@cli.command("put-bucket-acl")
|
||||
@click.argument("bucket")
|
||||
@click.option(
|
||||
"--acl",
|
||||
type=click.Choice(["private", "public-read", "public-read-write", "authenticated-read"]),
|
||||
help="Canned ACL to apply",
|
||||
)
|
||||
@click.option("--grant-full-control", help="Grants full control (e.g., id=account-id)")
|
||||
@click.option("--grant-read", help="Allows grantee to list objects (e.g., id=account-id)")
|
||||
@click.option("--grant-read-acp", help="Allows grantee to read the bucket ACL")
|
||||
@click.option("--grant-write", help="Allows grantee to create objects in the bucket")
|
||||
@click.option("--grant-write-acp", help="Allows grantee to write the ACL for the bucket")
|
||||
@click.option("--access-control-policy", help="Full ACL policy as JSON string")
|
||||
@click.option("--endpoint-url", help="Override S3 endpoint URL")
|
||||
@click.option("--region", help="AWS region")
|
||||
@click.option("--profile", help="AWS profile to use")
|
||||
@click.pass_obj
|
||||
def put_bucket_acl(
|
||||
service: DeltaService,
|
||||
bucket: str,
|
||||
acl: str | None,
|
||||
grant_full_control: str | None,
|
||||
grant_read: str | None,
|
||||
grant_read_acp: str | None,
|
||||
grant_write: str | None,
|
||||
grant_write_acp: str | None,
|
||||
access_control_policy: str | None,
|
||||
endpoint_url: str | None,
|
||||
region: str | None,
|
||||
profile: str | None,
|
||||
) -> None:
|
||||
"""Set the access control list (ACL) for an S3 bucket.
|
||||
|
||||
BUCKET can be specified as:
|
||||
- s3://bucket-name
|
||||
- bucket-name
|
||||
|
||||
Examples:
|
||||
deltaglider put-bucket-acl my-bucket --acl private
|
||||
deltaglider put-bucket-acl my-bucket --acl public-read
|
||||
deltaglider put-bucket-acl my-bucket --grant-read id=12345
|
||||
"""
|
||||
from ...client import DeltaGliderClient
|
||||
|
||||
# Recreate service with AWS parameters if provided
|
||||
if endpoint_url or region or profile:
|
||||
service = create_service(
|
||||
log_level=os.environ.get("DG_LOG_LEVEL", "INFO"),
|
||||
endpoint_url=endpoint_url,
|
||||
region=region,
|
||||
profile=profile,
|
||||
)
|
||||
|
||||
try:
|
||||
# Parse bucket from S3 URL if needed
|
||||
if is_s3_path(bucket):
|
||||
bucket, _prefix = parse_s3_url(bucket)
|
||||
|
||||
if not bucket:
|
||||
click.echo("Error: Invalid bucket name", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
client = DeltaGliderClient(service=service)
|
||||
|
||||
kwargs: dict[str, Any] = {}
|
||||
if acl is not None:
|
||||
kwargs["ACL"] = acl
|
||||
if grant_full_control is not None:
|
||||
kwargs["GrantFullControl"] = grant_full_control
|
||||
if grant_read is not None:
|
||||
kwargs["GrantRead"] = grant_read
|
||||
if grant_read_acp is not None:
|
||||
kwargs["GrantReadACP"] = grant_read_acp
|
||||
if grant_write is not None:
|
||||
kwargs["GrantWrite"] = grant_write
|
||||
if grant_write_acp is not None:
|
||||
kwargs["GrantWriteACP"] = grant_write_acp
|
||||
if access_control_policy is not None:
|
||||
kwargs["AccessControlPolicy"] = json.loads(access_control_policy)
|
||||
|
||||
client.put_bucket_acl(Bucket=bucket, **kwargs)
|
||||
click.echo(f"ACL updated for bucket: {bucket}")
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
click.echo(f"Error: Invalid JSON for --access-control-policy: {e}", err=True)
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
click.echo(f"Error: {e}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@cli.command("get-bucket-acl")
|
||||
@click.argument("bucket")
|
||||
@click.option("--endpoint-url", help="Override S3 endpoint URL")
|
||||
@click.option("--region", help="AWS region")
|
||||
@click.option("--profile", help="AWS profile to use")
|
||||
@click.pass_obj
|
||||
def get_bucket_acl(
|
||||
service: DeltaService,
|
||||
bucket: str,
|
||||
endpoint_url: str | None,
|
||||
region: str | None,
|
||||
profile: str | None,
|
||||
) -> None:
|
||||
"""Get the access control list (ACL) for an S3 bucket.
|
||||
|
||||
BUCKET can be specified as:
|
||||
- s3://bucket-name
|
||||
- bucket-name
|
||||
|
||||
Examples:
|
||||
deltaglider get-bucket-acl my-bucket
|
||||
deltaglider get-bucket-acl s3://my-bucket
|
||||
"""
|
||||
from ...client import DeltaGliderClient
|
||||
|
||||
# Recreate service with AWS parameters if provided
|
||||
if endpoint_url or region or profile:
|
||||
service = create_service(
|
||||
log_level=os.environ.get("DG_LOG_LEVEL", "INFO"),
|
||||
endpoint_url=endpoint_url,
|
||||
region=region,
|
||||
profile=profile,
|
||||
)
|
||||
|
||||
try:
|
||||
# Parse bucket from S3 URL if needed
|
||||
if is_s3_path(bucket):
|
||||
bucket, _prefix = parse_s3_url(bucket)
|
||||
|
||||
if not bucket:
|
||||
click.echo("Error: Invalid bucket name", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
client = DeltaGliderClient(service=service)
|
||||
response = client.get_bucket_acl(Bucket=bucket)
|
||||
|
||||
# Output as JSON like aws s3api get-bucket-acl
|
||||
click.echo(json.dumps(response, indent=2, default=str))
|
||||
|
||||
except Exception as e:
|
||||
click.echo(f"Error: {e}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def main() -> None:
|
||||
"""Main entry point."""
|
||||
cli()
|
||||
|
||||
@@ -5,9 +5,27 @@ from pathlib import Path
|
||||
import click
|
||||
|
||||
from ...core import DeltaService
|
||||
from ...core.object_listing import list_all_objects, object_dict_to_head
|
||||
from ...ports import ObjectHead
|
||||
|
||||
|
||||
def fetch_s3_object_heads(service: DeltaService, bucket: str, prefix: str) -> list[ObjectHead]:
|
||||
"""Retrieve all objects for a prefix, falling back to iterator when needed."""
|
||||
try:
|
||||
listing = list_all_objects(
|
||||
service.storage,
|
||||
bucket=bucket,
|
||||
prefix=prefix,
|
||||
max_keys=1000,
|
||||
logger=getattr(service, "logger", None),
|
||||
)
|
||||
except (RuntimeError, NotImplementedError):
|
||||
list_prefix = f"{bucket}/{prefix}" if prefix else bucket
|
||||
return list(service.storage.list(list_prefix))
|
||||
|
||||
return [object_dict_to_head(obj) for obj in listing.objects]
|
||||
|
||||
|
||||
def get_local_files(
|
||||
local_dir: Path, exclude: str | None = None, include: str | None = None
|
||||
) -> dict[str, tuple[Path, int]]:
|
||||
@@ -42,8 +60,7 @@ def get_s3_files(
|
||||
import fnmatch
|
||||
|
||||
files = {}
|
||||
list_prefix = f"{bucket}/{prefix}" if prefix else bucket
|
||||
objects = service.storage.list(list_prefix)
|
||||
objects = fetch_s3_object_heads(service, bucket, prefix)
|
||||
|
||||
for obj in objects:
|
||||
# Skip reference.bin files (internal)
|
||||
|
||||
(File diff suppressed because it is too large)
src/deltaglider/client_delete_helpers.py (new file, 36 lines)
@@ -0,0 +1,36 @@
|
||||
"""Helper utilities for client delete operations."""
|
||||
|
||||
from .core import DeltaService, ObjectKey
|
||||
from .core.errors import NotFoundError
|
||||
from .core.models import DeleteResult
|
||||
|
||||
|
||||
def delete_with_delta_suffix(
|
||||
service: DeltaService, bucket: str, key: str
|
||||
) -> tuple[str, DeleteResult]:
|
||||
"""Delete an object, retrying with '.delta' suffix when needed.
|
||||
|
||||
Args:
|
||||
service: DeltaService-like instance exposing ``delete(ObjectKey)``.
|
||||
bucket: Target bucket.
|
||||
key: Requested key (without forcing .delta suffix).
|
||||
|
||||
Returns:
|
||||
Tuple containing the actual key deleted in storage and the DeleteResult.
|
||||
|
||||
Raises:
|
||||
NotFoundError: Propagated when both the direct and '.delta' keys are missing.
|
||||
"""
|
||||
actual_key = key
|
||||
object_key = ObjectKey(bucket=bucket, key=actual_key)
|
||||
|
||||
try:
|
||||
delete_result = service.delete(object_key)
|
||||
except NotFoundError:
|
||||
if key.endswith(".delta"):
|
||||
raise
|
||||
actual_key = f"{key}.delta"
|
||||
object_key = ObjectKey(bucket=bucket, key=actual_key)
|
||||
delete_result = service.delete(object_key)
|
||||
|
||||
return actual_key, delete_result
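A small usage sketch, assuming a wired DeltaService instance named service (bucket and key are placeholders):

actual_key, result = delete_with_delta_suffix(service, "my-bucket", "releases/v1.0.zip")
print(actual_key)  # "releases/v1.0.zip" or, if only the delta exists, "releases/v1.0.zip.delta"
# `result` is the DeleteResult returned by DeltaService.delete for the key that was found.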
|
||||
src/deltaglider/client_models.py (new file, 100 lines)
@@ -0,0 +1,100 @@
|
||||
"""Shared data models for the DeltaGlider client."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
|
||||
class UploadSummary:
|
||||
"""User-friendly upload summary."""
|
||||
|
||||
operation: str
|
||||
bucket: str
|
||||
key: str
|
||||
original_size: int
|
||||
stored_size: int
|
||||
is_delta: bool
|
||||
delta_ratio: float = 0.0
|
||||
|
||||
@property
|
||||
def original_size_mb(self) -> float:
|
||||
"""Original size in MB."""
|
||||
return self.original_size / (1024 * 1024)
|
||||
|
||||
@property
|
||||
def stored_size_mb(self) -> float:
|
||||
"""Stored size in MB."""
|
||||
return self.stored_size / (1024 * 1024)
|
||||
|
||||
@property
|
||||
def savings_percent(self) -> float:
|
||||
"""Percentage saved through compression."""
|
||||
if self.original_size == 0:
|
||||
return 0.0
|
||||
return ((self.original_size - self.stored_size) / self.original_size) * 100
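A quick worked example of the properties above (the numbers are arbitrary):

summary = UploadSummary(
    operation="upload",
    bucket="releases",
    key="app-v2.zip",
    original_size=100 * 1024 * 1024,  # 100 MB original
    stored_size=5 * 1024 * 1024,      # 5 MB stored as a delta
    is_delta=True,
    delta_ratio=0.05,
)
print(f"{summary.original_size_mb:.0f} MB -> {summary.stored_size_mb:.0f} MB")  # 100 MB -> 5 MB
print(f"{summary.savings_percent:.1f}% saved")                                  # 95.0% saved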
|
||||
|
||||
|
||||
@dataclass
|
||||
class CompressionEstimate:
|
||||
"""Compression estimate for a file."""
|
||||
|
||||
original_size: int
|
||||
estimated_compressed_size: int
|
||||
estimated_ratio: float
|
||||
confidence: float
|
||||
recommended_reference: str | None = None
|
||||
should_use_delta: bool = True
|
||||
|
||||
|
||||
@dataclass
|
||||
class ObjectInfo:
|
||||
"""Detailed object information with compression stats."""
|
||||
|
||||
key: str
|
||||
size: int
|
||||
last_modified: str
|
||||
etag: str | None = None
|
||||
storage_class: str = "STANDARD"
|
||||
|
||||
# DeltaGlider-specific fields
|
||||
original_size: int | None = None
|
||||
compressed_size: int | None = None
|
||||
compression_ratio: float | None = None
|
||||
is_delta: bool = False
|
||||
reference_key: str | None = None
|
||||
delta_chain_length: int = 0
|
||||
|
||||
|
||||
@dataclass
|
||||
class ListObjectsResponse:
|
||||
"""Response from list_objects, compatible with boto3."""
|
||||
|
||||
name: str # Bucket name
|
||||
prefix: str = ""
|
||||
delimiter: str = ""
|
||||
max_keys: int = 1000
|
||||
common_prefixes: list[dict[str, str]] = field(default_factory=list)
|
||||
contents: list[ObjectInfo] = field(default_factory=list)
|
||||
is_truncated: bool = False
|
||||
next_continuation_token: str | None = None
|
||||
continuation_token: str | None = None
|
||||
key_count: int = 0
|
||||
|
||||
@property
|
||||
def objects(self) -> list[ObjectInfo]:
|
||||
"""Alias for contents, for convenience."""
|
||||
return self.contents
|
||||
|
||||
|
||||
@dataclass
|
||||
class BucketStats:
|
||||
"""Statistics for a bucket."""
|
||||
|
||||
bucket: str
|
||||
object_count: int
|
||||
total_size: int
|
||||
compressed_size: int
|
||||
space_saved: int
|
||||
average_compression_ratio: float
|
||||
delta_objects: int
|
||||
direct_objects: int
|
||||
object_limit_reached: bool = False
|
||||
src/deltaglider/client_operations/__init__.py (new file, 39 lines)
@@ -0,0 +1,39 @@
|
||||
"""Client operation modules for DeltaGliderClient.
|
||||
|
||||
This package contains modular operation implementations:
|
||||
- bucket: S3 bucket management (create, delete, list)
|
||||
- presigned: Presigned URL generation for temporary access
|
||||
- batch: Batch upload/download operations
|
||||
- stats: Statistics and analytics operations
|
||||
"""
|
||||
|
||||
from .batch import download_batch, upload_batch, upload_chunked
|
||||
from .bucket import create_bucket, delete_bucket, get_bucket_acl, list_buckets, put_bucket_acl
|
||||
from .presigned import generate_presigned_post, generate_presigned_url
|
||||
from .stats import (
|
||||
estimate_compression,
|
||||
find_similar_files,
|
||||
get_bucket_stats,
|
||||
get_object_info,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# Bucket operations
|
||||
"create_bucket",
|
||||
"delete_bucket",
|
||||
"get_bucket_acl",
|
||||
"list_buckets",
|
||||
"put_bucket_acl",
|
||||
# Presigned operations
|
||||
"generate_presigned_url",
|
||||
"generate_presigned_post",
|
||||
# Batch operations
|
||||
"upload_chunked",
|
||||
"upload_batch",
|
||||
"download_batch",
|
||||
# Stats operations
|
||||
"get_bucket_stats",
|
||||
"get_object_info",
|
||||
"estimate_compression",
|
||||
"find_similar_files",
|
||||
]
|
||||
src/deltaglider/client_operations/batch.py (new file, 159 lines)
@@ -0,0 +1,159 @@
|
||||
"""Batch upload/download operations for DeltaGlider client.
|
||||
|
||||
This module contains DeltaGlider-specific batch operations:
|
||||
- upload_batch
|
||||
- download_batch
|
||||
- upload_chunked
|
||||
"""
|
||||
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from ..client_models import UploadSummary
|
||||
|
||||
|
||||
def upload_chunked(
|
||||
client: Any, # DeltaGliderClient
|
||||
file_path: str | Path,
|
||||
s3_url: str,
|
||||
chunk_size: int = 5 * 1024 * 1024,
|
||||
progress_callback: Callable[[int, int, int, int], None] | None = None,
|
||||
max_ratio: float = 0.5,
|
||||
) -> UploadSummary:
|
||||
"""Upload a file in chunks with progress callback.
|
||||
|
||||
This method reads the file in chunks and reports progress after each chunk. Note that
the actual upload is currently performed in a single call on the whole file; true
streaming upload is not yet implemented in the storage adapter (see the TODO below).
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
file_path: Local file to upload
|
||||
s3_url: S3 destination URL (s3://bucket/path/filename)
|
||||
chunk_size: Size of each chunk in bytes (default 5MB)
|
||||
progress_callback: Callback(chunk_number, total_chunks, bytes_sent, total_bytes)
|
||||
max_ratio: Maximum acceptable delta/file ratio for compression
|
||||
|
||||
Returns:
|
||||
UploadSummary with compression statistics
|
||||
|
||||
Example:
|
||||
def on_progress(chunk_num, total_chunks, bytes_sent, total_bytes):
|
||||
percent = (bytes_sent / total_bytes) * 100
|
||||
print(f"Upload progress: {percent:.1f}%")
|
||||
|
||||
client.upload_chunked(
|
||||
"large_file.zip",
|
||||
"s3://bucket/releases/large_file.zip",
|
||||
chunk_size=10 * 1024 * 1024, # 10MB chunks
|
||||
progress_callback=on_progress
|
||||
)
|
||||
"""
|
||||
file_path = Path(file_path)
|
||||
file_size = file_path.stat().st_size
|
||||
|
||||
# For small files, just use regular upload
|
||||
if file_size <= chunk_size:
|
||||
if progress_callback:
|
||||
progress_callback(1, 1, file_size, file_size)
|
||||
result: UploadSummary = client.upload(file_path, s3_url, max_ratio=max_ratio)
|
||||
return result
|
||||
|
||||
# Calculate chunks
|
||||
total_chunks = (file_size + chunk_size - 1) // chunk_size
|
||||
|
||||
# Note: no temporary file is created; the file is read in chunks below only to
# report progress, and the actual upload is performed in a single call afterwards.
# Future enhancement: implement true streaming upload in storage adapter
|
||||
bytes_read = 0
|
||||
|
||||
with open(file_path, "rb") as f:
|
||||
for chunk_num in range(1, total_chunks + 1):
|
||||
# Read chunk (simulated for progress reporting)
|
||||
chunk_data = f.read(chunk_size)
|
||||
bytes_read += len(chunk_data)
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(chunk_num, total_chunks, bytes_read, file_size)
|
||||
|
||||
# Perform the actual upload
|
||||
# TODO: When storage adapter supports streaming, pass chunks directly
|
||||
upload_result: UploadSummary = client.upload(file_path, s3_url, max_ratio=max_ratio)
|
||||
|
||||
# Final progress callback
|
||||
if progress_callback:
|
||||
progress_callback(total_chunks, total_chunks, file_size, file_size)
|
||||
|
||||
return upload_result
|
||||
|
||||
|
||||
def upload_batch(
|
||||
client: Any, # DeltaGliderClient
|
||||
files: list[str | Path],
|
||||
s3_prefix: str,
|
||||
max_ratio: float = 0.5,
|
||||
progress_callback: Callable[[str, int, int], None] | None = None,
|
||||
) -> list[UploadSummary]:
|
||||
"""Upload multiple files in batch.
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
files: List of local file paths
|
||||
s3_prefix: S3 destination prefix (s3://bucket/prefix/)
|
||||
max_ratio: Maximum acceptable delta/file ratio
|
||||
progress_callback: Callback(filename, current_file_index, total_files)
|
||||
|
||||
Returns:
|
||||
List of UploadSummary objects
|
||||
"""
|
||||
results = []
|
||||
|
||||
for i, file_path in enumerate(files):
|
||||
file_path = Path(file_path)
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(file_path.name, i + 1, len(files))
|
||||
|
||||
# Upload each file
|
||||
s3_url = f"{s3_prefix.rstrip('/')}/{file_path.name}"
|
||||
summary = client.upload(file_path, s3_url, max_ratio=max_ratio)
|
||||
results.append(summary)
|
||||
|
||||
return results
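A short usage sketch, assuming a constructed DeltaGliderClient named client (the paths, bucket, and prefix are placeholders):

def show_progress(filename: str, index: int, total: int) -> None:
    print(f"[{index}/{total}] uploading {filename}")

summaries = upload_batch(
    client,
    files=["build/app-v1.0.zip", "build/app-v1.1.zip"],
    s3_prefix="s3://releases/2024/",
    progress_callback=show_progress,
)
for s in summaries:
    print(s.key, f"{s.savings_percent:.1f}% saved")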
|
||||
|
||||
|
||||
def download_batch(
|
||||
client: Any, # DeltaGliderClient
|
||||
s3_urls: list[str],
|
||||
output_dir: str | Path,
|
||||
progress_callback: Callable[[str, int, int], None] | None = None,
|
||||
) -> list[Path]:
|
||||
"""Download multiple files in batch.
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
s3_urls: List of S3 URLs to download
|
||||
output_dir: Local directory to save files
|
||||
progress_callback: Callback(filename, current_file_index, total_files)
|
||||
|
||||
Returns:
|
||||
List of downloaded file paths
|
||||
"""
|
||||
output_dir = Path(output_dir)
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
results = []
|
||||
|
||||
for i, s3_url in enumerate(s3_urls):
|
||||
# Extract filename from URL
|
||||
filename = s3_url.split("/")[-1]
|
||||
if filename.endswith(".delta"):
|
||||
filename = filename[:-6] # Remove .delta suffix
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(filename, i + 1, len(s3_urls))
|
||||
|
||||
output_path = output_dir / filename
|
||||
client.download(s3_url, output_path)
|
||||
results.append(output_path)
|
||||
|
||||
return results
|
||||
src/deltaglider/client_operations/bucket.py (new file, 275 lines)
@@ -0,0 +1,275 @@
|
||||
"""Bucket management operations for DeltaGlider client.
|
||||
|
||||
This module contains boto3-compatible bucket operations:
|
||||
- create_bucket
|
||||
- delete_bucket
|
||||
- list_buckets
|
||||
- put_bucket_acl
|
||||
- get_bucket_acl
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
|
||||
|
||||
def create_bucket(
|
||||
client: Any, # DeltaGliderClient (avoiding circular import)
|
||||
Bucket: str,
|
||||
CreateBucketConfiguration: dict[str, str] | None = None,
|
||||
**kwargs: Any,
|
||||
) -> dict[str, Any]:
|
||||
"""Create an S3 bucket (boto3-compatible).
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
Bucket: Bucket name to create
|
||||
CreateBucketConfiguration: Optional bucket configuration (e.g., LocationConstraint)
|
||||
**kwargs: Additional S3 parameters (for compatibility)
|
||||
|
||||
Returns:
|
||||
Response dict with bucket location
|
||||
|
||||
Example:
|
||||
>>> client = create_client()
|
||||
>>> client.create_bucket(Bucket='my-bucket')
|
||||
>>> # With region
|
||||
>>> client.create_bucket(
|
||||
... Bucket='my-bucket',
|
||||
... CreateBucketConfiguration={'LocationConstraint': 'us-west-2'}
|
||||
... )
|
||||
"""
|
||||
storage_adapter = client.service.storage
|
||||
|
||||
# Check if storage adapter has boto3 client
|
||||
if hasattr(storage_adapter, "client"):
|
||||
try:
|
||||
params: dict[str, Any] = {"Bucket": Bucket}
|
||||
if CreateBucketConfiguration:
|
||||
params["CreateBucketConfiguration"] = CreateBucketConfiguration
|
||||
|
||||
response = storage_adapter.client.create_bucket(**params)
|
||||
return {
|
||||
"Location": response.get("Location", f"/{Bucket}"),
|
||||
"ResponseMetadata": {
|
||||
"HTTPStatusCode": 200,
|
||||
},
|
||||
}
|
||||
except Exception as e:
|
||||
error_msg = str(e)
|
||||
if "BucketAlreadyExists" in error_msg or "BucketAlreadyOwnedByYou" in error_msg:
|
||||
# Bucket already exists - return success
|
||||
client.service.logger.debug(f"Bucket {Bucket} already exists")
|
||||
return {
|
||||
"Location": f"/{Bucket}",
|
||||
"ResponseMetadata": {
|
||||
"HTTPStatusCode": 200,
|
||||
},
|
||||
}
|
||||
raise RuntimeError(f"Failed to create bucket: {e}") from e
|
||||
else:
|
||||
raise NotImplementedError("Storage adapter does not support bucket creation")
|
||||
|
||||
|
||||
def delete_bucket(
|
||||
client: Any, # DeltaGliderClient
|
||||
Bucket: str,
|
||||
**kwargs: Any,
|
||||
) -> dict[str, Any]:
|
||||
"""Delete an S3 bucket (boto3-compatible).
|
||||
|
||||
Note: Bucket must be empty before deletion.
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
Bucket: Bucket name to delete
|
||||
**kwargs: Additional S3 parameters (for compatibility)
|
||||
|
||||
Returns:
|
||||
Response dict with deletion status
|
||||
|
||||
Example:
|
||||
>>> client = create_client()
|
||||
>>> client.delete_bucket(Bucket='my-bucket')
|
||||
"""
|
||||
storage_adapter = client.service.storage
|
||||
|
||||
# Check if storage adapter has boto3 client
|
||||
if hasattr(storage_adapter, "client"):
|
||||
try:
|
||||
storage_adapter.client.delete_bucket(Bucket=Bucket)
|
||||
return {
|
||||
"ResponseMetadata": {
|
||||
"HTTPStatusCode": 204,
|
||||
},
|
||||
}
|
||||
except Exception as e:
|
||||
error_msg = str(e)
|
||||
if "NoSuchBucket" in error_msg:
|
||||
# Bucket doesn't exist - return success
|
||||
client.service.logger.debug(f"Bucket {Bucket} does not exist")
|
||||
return {
|
||||
"ResponseMetadata": {
|
||||
"HTTPStatusCode": 204,
|
||||
},
|
||||
}
|
||||
raise RuntimeError(f"Failed to delete bucket: {e}") from e
|
||||
else:
|
||||
raise NotImplementedError("Storage adapter does not support bucket deletion")
|
||||
|
||||
|
||||
def list_buckets(
|
||||
client: Any, # DeltaGliderClient
|
||||
**kwargs: Any,
|
||||
) -> dict[str, Any]:
|
||||
"""List all S3 buckets (boto3-compatible).
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
**kwargs: Additional S3 parameters (for compatibility)
|
||||
|
||||
Returns:
|
||||
Response dict with bucket list
|
||||
|
||||
Example:
|
||||
>>> client = create_client()
|
||||
>>> response = client.list_buckets()
|
||||
>>> for bucket in response['Buckets']:
|
||||
... print(bucket['Name'])
|
||||
"""
|
||||
storage_adapter = client.service.storage
|
||||
|
||||
# Check if storage adapter has boto3 client
|
||||
if hasattr(storage_adapter, "client"):
|
||||
try:
|
||||
raw_response = storage_adapter.client.list_buckets()
|
||||
|
||||
buckets: list[dict[str, Any]] = []
|
||||
for bucket_entry in raw_response.get("Buckets", []):
|
||||
bucket_data = dict(bucket_entry)
|
||||
name = bucket_data.get("Name")
|
||||
if isinstance(name, str) and name:
|
||||
cached_stats, cached_mode = client._get_cached_bucket_stats_for_listing(name)
|
||||
if cached_stats is not None and cached_mode is not None:
|
||||
bucket_data["DeltaGliderStats"] = {
|
||||
"Cached": True,
|
||||
"Mode": cached_mode,
|
||||
"Detailed": cached_mode == "detailed",
|
||||
"ObjectCount": cached_stats.object_count,
|
||||
"TotalSize": cached_stats.total_size,
|
||||
"CompressedSize": cached_stats.compressed_size,
|
||||
"SpaceSaved": cached_stats.space_saved,
|
||||
"AverageCompressionRatio": cached_stats.average_compression_ratio,
|
||||
"DeltaObjects": cached_stats.delta_objects,
|
||||
"DirectObjects": cached_stats.direct_objects,
|
||||
}
|
||||
|
||||
buckets.append(bucket_data)
|
||||
|
||||
return {
|
||||
"Buckets": buckets,
|
||||
"Owner": raw_response.get("Owner", {}),
|
||||
"ResponseMetadata": {
|
||||
"HTTPStatusCode": 200,
|
||||
},
|
||||
}
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to list buckets: {e}") from e
|
||||
else:
|
||||
raise NotImplementedError("Storage adapter does not support bucket listing")
|
||||
|
||||
|
||||
def put_bucket_acl(
|
||||
client: Any, # DeltaGliderClient (avoiding circular import)
|
||||
Bucket: str,
|
||||
ACL: str | None = None,
|
||||
AccessControlPolicy: dict[str, Any] | None = None,
|
||||
GrantFullControl: str | None = None,
|
||||
GrantRead: str | None = None,
|
||||
GrantReadACP: str | None = None,
|
||||
GrantWrite: str | None = None,
|
||||
GrantWriteACP: str | None = None,
|
||||
**kwargs: Any,
|
||||
) -> dict[str, Any]:
|
||||
"""Set the ACL for an S3 bucket (boto3-compatible passthrough).
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
Bucket: Bucket name
|
||||
ACL: Canned ACL (private, public-read, public-read-write, authenticated-read)
|
||||
AccessControlPolicy: Full ACL policy dict
|
||||
GrantFullControl: Grants full control to the grantee
|
||||
GrantRead: Allows grantee to list objects in the bucket
|
||||
GrantReadACP: Allows grantee to read the bucket ACL
|
||||
GrantWrite: Allows grantee to create objects in the bucket
|
||||
GrantWriteACP: Allows grantee to write the ACL for the bucket
|
||||
**kwargs: Additional S3 parameters (for compatibility)
|
||||
|
||||
Returns:
|
||||
Response dict with status
|
||||
|
||||
Example:
|
||||
>>> client = create_client()
|
||||
>>> client.put_bucket_acl(Bucket='my-bucket', ACL='public-read')
|
||||
"""
|
||||
storage_adapter = client.service.storage
|
||||
|
||||
if hasattr(storage_adapter, "client"):
|
||||
try:
|
||||
params: dict[str, Any] = {"Bucket": Bucket}
|
||||
if ACL is not None:
|
||||
params["ACL"] = ACL
|
||||
if AccessControlPolicy is not None:
|
||||
params["AccessControlPolicy"] = AccessControlPolicy
|
||||
if GrantFullControl is not None:
|
||||
params["GrantFullControl"] = GrantFullControl
|
||||
if GrantRead is not None:
|
||||
params["GrantRead"] = GrantRead
|
||||
if GrantReadACP is not None:
|
||||
params["GrantReadACP"] = GrantReadACP
|
||||
if GrantWrite is not None:
|
||||
params["GrantWrite"] = GrantWrite
|
||||
if GrantWriteACP is not None:
|
||||
params["GrantWriteACP"] = GrantWriteACP
|
||||
|
||||
storage_adapter.client.put_bucket_acl(**params)
|
||||
return {
|
||||
"ResponseMetadata": {
|
||||
"HTTPStatusCode": 200,
|
||||
},
|
||||
}
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to set bucket ACL: {e}") from e
|
||||
else:
|
||||
raise NotImplementedError("Storage adapter does not support bucket ACL operations")
|
||||
|
||||
|
||||
def get_bucket_acl(
|
||||
client: Any, # DeltaGliderClient (avoiding circular import)
|
||||
Bucket: str,
|
||||
**kwargs: Any,
|
||||
) -> dict[str, Any]:
|
||||
"""Get the ACL for an S3 bucket (boto3-compatible passthrough).
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
Bucket: Bucket name
|
||||
**kwargs: Additional S3 parameters (for compatibility)
|
||||
|
||||
Returns:
|
||||
Response dict with Owner and Grants
|
||||
|
||||
Example:
|
||||
>>> client = create_client()
|
||||
>>> response = client.get_bucket_acl(Bucket='my-bucket')
|
||||
>>> print(response['Owner'])
|
||||
>>> print(response['Grants'])
|
||||
"""
|
||||
storage_adapter = client.service.storage
|
||||
|
||||
if hasattr(storage_adapter, "client"):
|
||||
try:
|
||||
response: dict[str, Any] = storage_adapter.client.get_bucket_acl(Bucket=Bucket)
|
||||
return response
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to get bucket ACL: {e}") from e
|
||||
else:
|
||||
raise NotImplementedError("Storage adapter does not support bucket ACL operations")
|
||||
src/deltaglider/client_operations/presigned.py (new file, 124 lines)
@@ -0,0 +1,124 @@
"""Presigned URL operations for DeltaGlider client.

This module contains boto3-compatible presigned URL operations:
- generate_presigned_url
- generate_presigned_post
"""

from typing import Any


def try_boto3_presigned_operation(
    client: Any,  # DeltaGliderClient
    operation: str,
    **kwargs: Any,
) -> Any | None:
    """Try to generate presigned operation using boto3 client, return None if not available."""
    storage_adapter = client.service.storage

    # Check if storage adapter has boto3 client
    if hasattr(storage_adapter, "client"):
        try:
            if operation == "url":
                return str(storage_adapter.client.generate_presigned_url(**kwargs))
            elif operation == "post":
                return dict(storage_adapter.client.generate_presigned_post(**kwargs))
        except AttributeError:
            # storage_adapter does not have a 'client' attribute
            pass
        except Exception as e:
            # Fall back to manual construction if needed
            client.service.logger.warning(f"Failed to generate presigned {operation}: {e}")

    return None


def generate_presigned_url(
    client: Any,  # DeltaGliderClient
    ClientMethod: str,
    Params: dict[str, Any],
    ExpiresIn: int = 3600,
) -> str:
    """Generate presigned URL (boto3-compatible).

    Args:
        client: DeltaGliderClient instance
        ClientMethod: Method name ('get_object' or 'put_object')
        Params: Parameters dict with Bucket and Key
        ExpiresIn: URL expiration in seconds

    Returns:
        Presigned URL string
    """
    # Try boto3 first, fallback to manual construction
    url = try_boto3_presigned_operation(
        client,
        "url",
        ClientMethod=ClientMethod,
        Params=Params,
        ExpiresIn=ExpiresIn,
    )
    if url is not None:
        return str(url)

    # Fallback: construct URL manually (less secure, for dev/testing only)
    bucket = Params.get("Bucket", "")
    key = Params.get("Key", "")

    if client.endpoint_url:
        base_url = client.endpoint_url
    else:
        base_url = f"https://{bucket}.s3.amazonaws.com"

    # Warning: This is not a real presigned URL, just a placeholder
    client.service.logger.warning("Using placeholder presigned URL - not suitable for production")
    return f"{base_url}/{key}?expires={ExpiresIn}"


def generate_presigned_post(
    client: Any,  # DeltaGliderClient
    Bucket: str,
    Key: str,
    Fields: dict[str, str] | None = None,
    Conditions: list[Any] | None = None,
    ExpiresIn: int = 3600,
) -> dict[str, Any]:
    """Generate presigned POST data for HTML forms (boto3-compatible).

    Args:
        client: DeltaGliderClient instance
        Bucket: S3 bucket name
        Key: Object key
        Fields: Additional fields to include
        Conditions: Upload conditions
        ExpiresIn: URL expiration in seconds

    Returns:
        Dict with 'url' and 'fields' for form submission
    """
    # Try boto3 first, fallback to manual construction
    response = try_boto3_presigned_operation(
        client,
        "post",
        Bucket=Bucket,
        Key=Key,
        Fields=Fields,
        Conditions=Conditions,
        ExpiresIn=ExpiresIn,
    )
    if response is not None:
        return dict(response)

    # Fallback: return minimal structure for compatibility
    if client.endpoint_url:
        url = f"{client.endpoint_url}/{Bucket}"
    else:
        url = f"https://{Bucket}.s3.amazonaws.com"

    return {
        "url": url,
        "fields": {
            "key": Key,
            **(Fields or {}),
        },
    }
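A minimal usage sketch of the two helpers above; the client construction and the bucket/key names are illustrative assumptions, not part of this diff:

# Illustrative only: assumes `client` is an already-constructed DeltaGliderClient
# whose storage adapter wraps a boto3 S3 client.
url = generate_presigned_url(
    client,
    ClientMethod="get_object",
    Params={"Bucket": "releases", "Key": "app-v1.2.3.zip"},
    ExpiresIn=900,
)

post = generate_presigned_post(client, Bucket="releases", Key="uploads/app-v1.2.4.zip")
# post["url"] and post["fields"] can be submitted together as an HTML form upload.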
src/deltaglider/client_operations/stats.py   (new file, 994 lines)
@@ -0,0 +1,994 @@
"""Statistics and analysis operations for DeltaGlider client.

This module contains DeltaGlider-specific statistics operations:
- get_bucket_stats
- get_object_info
- estimate_compression
- find_similar_files
"""

import concurrent.futures
import json
import re
from dataclasses import asdict
from datetime import UTC, datetime
from pathlib import Path
from typing import Any, Literal

from ..client_models import BucketStats, CompressionEstimate, ObjectInfo
from ..core.delta_extensions import is_delta_candidate
from ..core.object_listing import list_all_objects
from ..core.s3_uri import parse_s3_url

StatsMode = Literal["quick", "sampled", "detailed"]

# Cache configuration
CACHE_VERSION = "1.0"
CACHE_PREFIX = ".deltaglider"

# Listing limits (prevent runaway scans on gigantic buckets)
QUICK_LIST_LIMIT = 60_000
SAMPLED_LIST_LIMIT = 30_000

# ============================================================================
# Internal Helper Functions
# ============================================================================


def _first_metadata_value(metadata: dict[str, Any], *keys: str) -> str | None:
|
||||
"""Return the first non-empty metadata value matching the provided keys."""
|
||||
for key in keys:
|
||||
value = metadata.get(key)
|
||||
if value not in (None, ""):
|
||||
return value
|
||||
return None
|
||||
|
||||
|
||||
def _fetch_delta_metadata(
|
||||
client: Any,
|
||||
bucket: str,
|
||||
delta_keys: list[str],
|
||||
max_timeout: int = 600,
|
||||
) -> dict[str, dict[str, Any]]:
|
||||
"""Fetch metadata for delta files in parallel with timeout.
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
bucket: S3 bucket name
|
||||
delta_keys: List of delta file keys
|
||||
max_timeout: Maximum total timeout in seconds (default: 600 = 10 min)
|
||||
|
||||
Returns:
|
||||
Dict mapping delta key -> metadata dict
|
||||
"""
|
||||
metadata_map: dict[str, dict[str, Any]] = {}
|
||||
|
||||
if not delta_keys:
|
||||
return metadata_map
|
||||
|
||||
client.service.logger.info(
|
||||
f"Fetching metadata for {len(delta_keys)} delta files in parallel..."
|
||||
)
|
||||
|
||||
def fetch_single_metadata(key: str) -> tuple[str, dict[str, Any] | None]:
|
||||
try:
|
||||
obj_head = client.service.storage.head(f"{bucket}/{key}")
|
||||
if obj_head and obj_head.metadata:
|
||||
return key, obj_head.metadata
|
||||
except Exception as e:
|
||||
client.service.logger.debug(f"Failed to fetch metadata for {key}: {e}")
|
||||
return key, None
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=min(10, len(delta_keys))) as executor:
|
||||
futures = [executor.submit(fetch_single_metadata, key) for key in delta_keys]
|
||||
|
||||
# Calculate timeout: 60s per file, capped at max_timeout
|
||||
timeout_per_file = 60
|
||||
total_timeout = min(len(delta_keys) * timeout_per_file, max_timeout)
|
||||
|
||||
try:
|
||||
for future in concurrent.futures.as_completed(futures, timeout=total_timeout):
|
||||
try:
|
||||
key, metadata = future.result(timeout=5) # 5s per result
|
||||
if metadata:
|
||||
metadata_map[key] = metadata
|
||||
except concurrent.futures.TimeoutError:
|
||||
client.service.logger.warning("Timeout fetching metadata for a delta file")
|
||||
continue
|
||||
except concurrent.futures.TimeoutError:
|
||||
client.service.logger.warning(
|
||||
f"_fetch_delta_metadata: Timeout after {total_timeout}s. "
|
||||
f"Fetched {len(metadata_map)}/{len(delta_keys)} metadata entries. "
|
||||
f"Continuing with partial metadata..."
|
||||
)
|
||||
# Cancel remaining futures
|
||||
for future in futures:
|
||||
future.cancel()
|
||||
|
||||
return metadata_map
|
||||
|
||||
|
||||
def _extract_deltaspace(key: str) -> str:
|
||||
"""Return the delta space (prefix) for a given object key."""
|
||||
if "/" in key:
|
||||
return key.rsplit("/", 1)[0]
|
||||
return ""
|
||||
|
||||
|
||||
def _get_cache_key(mode: StatsMode) -> str:
|
||||
"""Get the S3 key for a cache file based on mode.
|
||||
|
||||
Args:
|
||||
mode: Stats mode (quick, sampled, or detailed)
|
||||
|
||||
Returns:
|
||||
S3 key like ".deltaglider/stats_quick.json"
|
||||
"""
|
||||
return f"{CACHE_PREFIX}/stats_{mode}.json"
|
||||
|
||||
|
||||
def _read_stats_cache(
|
||||
client: Any,
|
||||
bucket: str,
|
||||
mode: StatsMode,
|
||||
) -> tuple[BucketStats | None, dict[str, Any] | None]:
|
||||
"""Read cached stats from S3 if available.
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
bucket: S3 bucket name
|
||||
mode: Stats mode to read cache for
|
||||
|
||||
Returns:
|
||||
Tuple of (BucketStats | None, validation_data | None)
|
||||
Returns (None, None) if cache doesn't exist or is invalid
|
||||
"""
|
||||
cache_key = _get_cache_key(mode)
|
||||
|
||||
try:
|
||||
# Try to read cache file from S3
|
||||
obj = client.service.storage.get(f"{bucket}/{cache_key}")
|
||||
if not obj or not obj.data:
|
||||
return None, None
|
||||
|
||||
# Parse JSON
|
||||
cache_data = json.loads(obj.data.decode("utf-8"))
|
||||
|
||||
# Validate version
|
||||
if cache_data.get("version") != CACHE_VERSION:
|
||||
client.service.logger.warning(
|
||||
f"Cache version mismatch: expected {CACHE_VERSION}, got {cache_data.get('version')}"
|
||||
)
|
||||
return None, None
|
||||
|
||||
# Validate mode
|
||||
if cache_data.get("mode") != mode:
|
||||
client.service.logger.warning(
|
||||
f"Cache mode mismatch: expected {mode}, got {cache_data.get('mode')}"
|
||||
)
|
||||
return None, None
|
||||
|
||||
# Extract stats and validation data
|
||||
stats_dict = cache_data.get("stats")
|
||||
validation_data = cache_data.get("validation")
|
||||
|
||||
if not stats_dict or not validation_data:
|
||||
client.service.logger.warning("Cache missing stats or validation data")
|
||||
return None, None
|
||||
|
||||
# Reconstruct BucketStats from dict
|
||||
stats = BucketStats(**stats_dict)
|
||||
|
||||
client.service.logger.debug(
|
||||
f"Successfully read cache for {bucket} (mode={mode}, "
|
||||
f"computed_at={cache_data.get('computed_at')})"
|
||||
)
|
||||
|
||||
return stats, validation_data
|
||||
|
||||
except FileNotFoundError:
|
||||
# Cache doesn't exist yet - this is normal
|
||||
client.service.logger.debug(f"No cache found for {bucket} (mode={mode})")
|
||||
return None, None
|
||||
except json.JSONDecodeError as e:
|
||||
client.service.logger.warning(f"Invalid JSON in cache file: {e}")
|
||||
return None, None
|
||||
except Exception as e:
|
||||
client.service.logger.warning(f"Error reading cache: {e}")
|
||||
return None, None
|
||||
|
||||
|
||||
def _write_stats_cache(
|
||||
client: Any,
|
||||
bucket: str,
|
||||
mode: StatsMode,
|
||||
stats: BucketStats,
|
||||
object_count: int,
|
||||
compressed_size: int,
|
||||
) -> None:
|
||||
"""Write computed stats to S3 cache.
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
bucket: S3 bucket name
|
||||
mode: Stats mode being cached
|
||||
stats: Computed BucketStats to cache
|
||||
object_count: Current object count (for validation)
|
||||
compressed_size: Current compressed size (for validation)
|
||||
"""
|
||||
cache_key = _get_cache_key(mode)
|
||||
|
||||
try:
|
||||
# Build cache structure
|
||||
cache_data = {
|
||||
"version": CACHE_VERSION,
|
||||
"mode": mode,
|
||||
"computed_at": datetime.now(UTC).isoformat(),
|
||||
"validation": {
|
||||
"object_count": object_count,
|
||||
"compressed_size": compressed_size,
|
||||
},
|
||||
"stats": asdict(stats),
|
||||
}
|
||||
|
||||
# Serialize to JSON
|
||||
cache_json = json.dumps(cache_data, indent=2)
|
||||
|
||||
# Write to S3
|
||||
client.service.storage.put(
|
||||
address=f"{bucket}/{cache_key}",
|
||||
data=cache_json.encode("utf-8"),
|
||||
metadata={
|
||||
"content-type": "application/json",
|
||||
"x-deltaglider-cache": "true",
|
||||
},
|
||||
)
|
||||
|
||||
client.service.logger.info(
|
||||
f"Wrote cache for {bucket} (mode={mode}, {len(cache_json)} bytes)"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
# Log warning but don't fail - caching is optional
|
||||
client.service.logger.warning(f"Failed to write cache (non-fatal): {e}")
|
||||
|
||||
|
||||
def _is_cache_valid(
|
||||
cached_validation: dict[str, Any],
|
||||
current_object_count: int,
|
||||
current_compressed_size: int,
|
||||
) -> bool:
|
||||
"""Check if cached stats are still valid based on bucket state.
|
||||
|
||||
Validation strategy: Compare object count and total compressed size.
|
||||
If either changed, the cache is stale.
|
||||
|
||||
Args:
|
||||
cached_validation: Validation data from cache
|
||||
current_object_count: Current object count from LIST
|
||||
current_compressed_size: Current compressed size from LIST
|
||||
|
||||
Returns:
|
||||
True if cache is still valid, False if stale
|
||||
"""
|
||||
cached_count = cached_validation.get("object_count")
|
||||
cached_size = cached_validation.get("compressed_size")
|
||||
|
||||
if cached_count != current_object_count:
|
||||
return False
|
||||
|
||||
if cached_size != current_compressed_size:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def _build_object_info_list(
|
||||
raw_objects: list[dict[str, Any]],
|
||||
metadata_map: dict[str, dict[str, Any]],
|
||||
logger: Any,
|
||||
sampled_space_metadata: dict[str, dict[str, Any]] | None = None,
|
||||
) -> list[ObjectInfo]:
|
||||
"""Build ObjectInfo list from raw objects and metadata.
|
||||
|
||||
Args:
|
||||
raw_objects: List of raw object dicts from S3 LIST
|
||||
metadata_map: Dict of key -> metadata for delta files
|
||||
logger: Logger instance
|
||||
|
||||
Returns:
|
||||
List of ObjectInfo objects
|
||||
"""
|
||||
all_objects = []
|
||||
|
||||
for obj_dict in raw_objects:
|
||||
key = obj_dict["key"]
|
||||
size = obj_dict["size"]
|
||||
is_delta = key.endswith(".delta")
|
||||
|
||||
deltaspace = _extract_deltaspace(key)
|
||||
|
||||
# Get metadata from map (empty dict if not present)
|
||||
metadata = metadata_map.get(key)
|
||||
if metadata is None and sampled_space_metadata and deltaspace in sampled_space_metadata:
|
||||
metadata = sampled_space_metadata[deltaspace]
|
||||
if metadata is None:
|
||||
metadata = {}
|
||||
|
||||
# Parse compression ratio and original size
|
||||
compression_ratio = 0.0
|
||||
# For delta files without metadata, set original_size to None to indicate unknown
|
||||
# This prevents nonsensical stats like "693 bytes compressed to 82MB"
|
||||
original_size = None if is_delta else size
|
||||
|
||||
if is_delta and metadata:
|
||||
try:
|
||||
ratio_str = metadata.get("compression_ratio", "0.0")
|
||||
compression_ratio = float(ratio_str) if ratio_str != "unknown" else 0.0
|
||||
except (ValueError, TypeError):
|
||||
compression_ratio = 0.0
|
||||
|
||||
try:
|
||||
original_size_raw = _first_metadata_value(
|
||||
metadata,
|
||||
"dg-file-size",
|
||||
"dg_file_size",
|
||||
"file_size",
|
||||
"file-size",
|
||||
"deltaglider-original-size",
|
||||
)
|
||||
if original_size_raw is not None:
|
||||
original_size = int(original_size_raw)
|
||||
logger.debug(f"Delta {key}: using original_size={original_size} from metadata")
|
||||
else:
|
||||
logger.warning(
|
||||
f"Delta {key}: metadata missing file size. Available keys: {list(metadata.keys())}. Using None as original_size (unknown)"
|
||||
)
|
||||
original_size = None
|
||||
except (ValueError, TypeError) as e:
|
||||
logger.warning(
|
||||
f"Delta {key}: failed to parse file size from metadata: {e}. Using None as original_size (unknown)"
|
||||
)
|
||||
original_size = None
|
||||
|
||||
all_objects.append(
|
||||
ObjectInfo(
|
||||
key=key,
|
||||
size=size,
|
||||
last_modified=obj_dict.get("last_modified", ""),
|
||||
etag=obj_dict.get("etag"),
|
||||
storage_class=obj_dict.get("storage_class", "STANDARD"),
|
||||
original_size=original_size,
|
||||
compressed_size=size,
|
||||
is_delta=is_delta,
|
||||
compression_ratio=compression_ratio,
|
||||
reference_key=_first_metadata_value(
|
||||
metadata,
|
||||
"dg-ref-key",
|
||||
"dg_ref_key",
|
||||
"ref_key",
|
||||
"ref-key",
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
return all_objects
|
||||
|
||||
|
||||
def _calculate_bucket_statistics(
|
||||
all_objects: list[ObjectInfo],
|
||||
bucket: str,
|
||||
logger: Any,
|
||||
mode: StatsMode = "quick",
|
||||
) -> BucketStats:
|
||||
"""Calculate statistics from ObjectInfo list.
|
||||
|
||||
Args:
|
||||
all_objects: List of ObjectInfo objects
|
||||
bucket: Bucket name for stats
|
||||
logger: Logger instance
|
||||
mode: Stats mode (quick, sampled, or detailed) - controls warning behavior
|
||||
|
||||
Returns:
|
||||
BucketStats object
|
||||
"""
|
||||
total_original_size = 0
|
||||
total_compressed_size = 0
|
||||
delta_count = 0
|
||||
direct_count = 0
|
||||
reference_files = {} # deltaspace -> size
|
||||
|
||||
# First pass: identify object types and reference files
|
||||
for obj in all_objects:
|
||||
if obj.key.endswith("/reference.bin") or obj.key == "reference.bin":
|
||||
deltaspace = obj.key.rsplit("/reference.bin", 1)[0] if "/" in obj.key else ""
|
||||
reference_files[deltaspace] = obj.size
|
||||
elif obj.is_delta:
|
||||
delta_count += 1
|
||||
else:
|
||||
direct_count += 1
|
||||
|
||||
# Second pass: calculate sizes
|
||||
for obj in all_objects:
|
||||
# Skip reference.bin (handled separately)
|
||||
if obj.key.endswith("/reference.bin") or obj.key == "reference.bin":
|
||||
continue
|
||||
|
||||
if obj.is_delta:
|
||||
# Delta: use original_size if available
|
||||
if obj.original_size is not None:
|
||||
logger.debug(f"Delta {obj.key}: using original_size={obj.original_size}")
|
||||
total_original_size += obj.original_size
|
||||
else:
|
||||
# original_size is None - metadata not available
|
||||
# In quick mode, this is expected (no HEAD requests)
|
||||
# In sampled/detailed mode, this means metadata is genuinely missing
|
||||
if mode != "quick":
|
||||
logger.warning(
|
||||
f"Delta {obj.key}: no original_size metadata available. "
|
||||
f"Cannot calculate original size without metadata. "
|
||||
f"Use --detailed mode for accurate stats."
|
||||
)
|
||||
# Don't add anything to total_original_size for deltas without metadata
|
||||
# This prevents nonsensical stats
|
||||
total_compressed_size += obj.size
|
||||
else:
|
||||
# Direct files: original = compressed
|
||||
total_original_size += obj.size
|
||||
total_compressed_size += obj.size
|
||||
|
||||
# Handle reference.bin files
|
||||
total_reference_size = sum(reference_files.values())
|
||||
|
||||
if delta_count > 0 and total_reference_size > 0:
|
||||
total_compressed_size += total_reference_size
|
||||
logger.info(
|
||||
f"Including {len(reference_files)} reference.bin file(s) "
|
||||
f"({total_reference_size:,} bytes) in compressed size"
|
||||
)
|
||||
elif delta_count == 0 and total_reference_size > 0:
|
||||
_log_orphaned_references(bucket, reference_files, total_reference_size, logger)
|
||||
|
||||
# Calculate final metrics
|
||||
# If we couldn't calculate original size (quick mode with deltas), set space_saved to 0
|
||||
# to avoid nonsensical negative numbers
|
||||
if total_original_size == 0 and total_compressed_size > 0:
|
||||
space_saved = 0
|
||||
avg_ratio = 0.0
|
||||
else:
|
||||
raw_space_saved = total_original_size - total_compressed_size
|
||||
space_saved = raw_space_saved if raw_space_saved > 0 else 0
|
||||
avg_ratio = (space_saved / total_original_size) if total_original_size > 0 else 0.0
|
||||
if avg_ratio < 0:
|
||||
avg_ratio = 0.0
|
||||
elif avg_ratio > 1:
|
||||
avg_ratio = 1.0
|
||||
|
||||
# Warn if quick mode with delta files (stats will be incomplete)
|
||||
if mode == "quick" and delta_count > 0 and total_original_size == 0:
|
||||
logger.warning(
|
||||
f"Quick mode cannot calculate original size for delta files (no metadata fetched). "
|
||||
f"Stats show {delta_count} delta file(s) with unknown original size. "
|
||||
f"Use --detailed for accurate compression metrics."
|
||||
)
|
||||
|
||||
return BucketStats(
|
||||
bucket=bucket,
|
||||
object_count=delta_count + direct_count,
|
||||
total_size=total_original_size,
|
||||
compressed_size=total_compressed_size,
|
||||
space_saved=space_saved,
|
||||
average_compression_ratio=avg_ratio,
|
||||
delta_objects=delta_count,
|
||||
direct_objects=direct_count,
|
||||
)
|
||||
|
||||
|
||||
def _log_orphaned_references(
|
||||
bucket: str,
|
||||
reference_files: dict[str, int],
|
||||
total_reference_size: int,
|
||||
logger: Any,
|
||||
) -> None:
|
||||
"""Log warning about orphaned reference.bin files.
|
||||
|
||||
Args:
|
||||
bucket: Bucket name
|
||||
reference_files: Dict of deltaspace -> size
|
||||
total_reference_size: Total size of all reference files
|
||||
logger: Logger instance
|
||||
"""
|
||||
waste_mb = total_reference_size / 1024 / 1024
|
||||
logger.warning(
|
||||
f"\n{'=' * 60}\n"
|
||||
f"WARNING: ORPHANED REFERENCE FILE(S) DETECTED!\n"
|
||||
f"{'=' * 60}\n"
|
||||
f"Found {len(reference_files)} reference.bin file(s) totaling "
|
||||
f"{total_reference_size:,} bytes ({waste_mb:.2f} MB)\n"
|
||||
f"but NO delta files are using them.\n"
|
||||
f"\n"
|
||||
f"This wastes {waste_mb:.2f} MB of storage!\n"
|
||||
f"\n"
|
||||
f"Orphaned reference files:\n"
|
||||
)
|
||||
|
||||
for deltaspace, size in reference_files.items():
|
||||
path = f"{deltaspace}/reference.bin" if deltaspace else "reference.bin"
|
||||
logger.warning(f" - s3://{bucket}/{path} ({size:,} bytes)")
|
||||
|
||||
logger.warning("\nConsider removing these orphaned files:\n")
|
||||
for deltaspace in reference_files:
|
||||
path = f"{deltaspace}/reference.bin" if deltaspace else "reference.bin"
|
||||
logger.warning(f" aws s3 rm s3://{bucket}/{path}")
|
||||
|
||||
logger.warning(f"{'=' * 60}")
|
||||
|
||||
|
||||
def get_object_info(
|
||||
client: Any, # DeltaGliderClient
|
||||
s3_url: str,
|
||||
) -> ObjectInfo:
|
||||
"""Get detailed object information including compression stats.
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
s3_url: S3 URL of the object
|
||||
|
||||
Returns:
|
||||
ObjectInfo with detailed metadata
|
||||
"""
|
||||
address = parse_s3_url(s3_url, allow_empty_key=False)
|
||||
bucket = address.bucket
|
||||
key = address.key
|
||||
|
||||
# Get object metadata
|
||||
obj_head = client.service.storage.head(f"{bucket}/{key}")
|
||||
if not obj_head:
|
||||
raise FileNotFoundError(f"Object not found: {s3_url}")
|
||||
|
||||
metadata = obj_head.metadata
|
||||
is_delta = key.endswith(".delta")
|
||||
|
||||
return ObjectInfo(
|
||||
key=key,
|
||||
size=obj_head.size,
|
||||
last_modified=metadata.get("last_modified", ""),
|
||||
etag=metadata.get("etag"),
|
||||
original_size=int(metadata.get("file_size", obj_head.size)),
|
||||
compressed_size=obj_head.size,
|
||||
compression_ratio=float(metadata.get("compression_ratio", 0.0)),
|
||||
is_delta=is_delta,
|
||||
reference_key=metadata.get("ref_key"),
|
||||
)
|
||||
|
||||
|
||||
def get_bucket_stats(
|
||||
client: Any, # DeltaGliderClient
|
||||
bucket: str,
|
||||
mode: StatsMode = "quick",
|
||||
use_cache: bool = True,
|
||||
refresh_cache: bool = False,
|
||||
) -> BucketStats:
|
||||
"""Get statistics for a bucket with configurable metadata strategies and caching.
|
||||
|
||||
Modes:
|
||||
- ``quick`` (default): Stream LIST results only. Compression metrics for delta files are
|
||||
approximate (falls back to delta size when metadata is unavailable).
|
||||
- ``sampled``: Fetch HEAD metadata for a single delta per delta-space and reuse the ratios for
|
||||
other deltas in the same space. Balances accuracy and speed.
|
||||
- ``detailed``: Fetch HEAD metadata for every delta object for the most accurate statistics.
|
||||
|
||||
Caching:
|
||||
- Stats are cached per mode in ``.deltaglider/stats_{mode}.json``
|
||||
- Cache is validated using object count and compressed size from LIST
|
||||
- If bucket changed, cache is recomputed automatically
|
||||
- Use ``refresh_cache=True`` to force recomputation
|
||||
- Use ``use_cache=False`` to skip caching entirely
|
||||
|
||||
**Robustness**: This function is designed to always return valid stats:
|
||||
- Returns partial stats if timeouts or pagination issues occur
|
||||
- Returns empty stats (zeros) if bucket listing completely fails
|
||||
- Never hangs indefinitely (max 10 min timeout, 10M object limit)
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
bucket: S3 bucket name
|
||||
mode: Stats mode ("quick", "sampled", or "detailed")
|
||||
use_cache: If True, use cached stats when available (default: True)
|
||||
refresh_cache: If True, force cache recomputation even if valid (default: False)
|
||||
|
||||
Returns:
|
||||
BucketStats with compression and space savings info. Always returns a valid BucketStats
|
||||
object, even if errors occur (will return empty/partial stats with warnings logged).
|
||||
|
||||
Raises:
|
||||
RuntimeError: Only if bucket listing fails immediately with no objects collected.
|
||||
All other errors result in partial/empty stats being returned.
|
||||
|
||||
Performance:
|
||||
- With cache hit: ~50-100ms (LIST + cache read + validation)
|
||||
- quick (no cache): ~50ms for any bucket size (LIST calls only)
|
||||
- sampled (no cache): LIST + one HEAD per delta-space
|
||||
- detailed (no cache): LIST + HEAD for every delta (slowest but accurate)
|
||||
- Max timeout: 10 minutes (prevents indefinite hangs)
|
||||
- Max objects: 10M (prevents infinite loops)
|
||||
|
||||
Example:
|
||||
# Use cached stats (fast, ~100ms)
|
||||
stats = client.get_bucket_stats('releases')
|
||||
|
||||
# Force refresh (slow, recomputes everything)
|
||||
stats = client.get_bucket_stats('releases', refresh_cache=True)
|
||||
|
||||
# Skip cache entirely
|
||||
stats = client.get_bucket_stats('releases', use_cache=False)
|
||||
|
||||
# Different modes with caching
|
||||
stats_sampled = client.get_bucket_stats('releases', mode='sampled')
|
||||
stats_detailed = client.get_bucket_stats('releases', mode='detailed')
|
||||
"""
|
||||
try:
|
||||
if mode not in {"quick", "sampled", "detailed"}:
|
||||
raise ValueError(f"Unknown stats mode: {mode}")
|
||||
|
||||
# Phase 1: Always do a quick LIST to get current state (needed for validation)
|
||||
import time
|
||||
|
||||
phase1_start = time.time()
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 1: Starting LIST operation for bucket '{bucket}'"
|
||||
)
|
||||
|
||||
list_cap = QUICK_LIST_LIMIT if mode == "quick" else SAMPLED_LIST_LIMIT
|
||||
listing = list_all_objects(
|
||||
client.service.storage,
|
||||
bucket=bucket,
|
||||
max_keys=1000,
|
||||
logger=client.service.logger,
|
||||
max_objects=list_cap,
|
||||
)
|
||||
raw_objects = listing.objects
|
||||
|
||||
# Calculate validation metrics from LIST
|
||||
current_object_count = len(raw_objects)
|
||||
current_compressed_size = sum(obj["size"] for obj in raw_objects)
|
||||
limit_reached = listing.limit_reached or listing.is_truncated
|
||||
if limit_reached:
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 1: Listing capped at {list_cap} objects (bucket likely larger)."
|
||||
)
|
||||
|
||||
phase1_duration = time.time() - phase1_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 1: LIST completed in {phase1_duration:.2f}s - "
|
||||
f"Found {current_object_count} objects, {current_compressed_size:,} bytes total"
|
||||
)
|
||||
|
||||
# Phase 2: Try to use cache if enabled and not forcing refresh
|
||||
phase2_start = time.time()
|
||||
if use_cache and not refresh_cache:
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 2: Checking cache for mode '{mode}'"
|
||||
)
|
||||
cached_stats, cached_validation = _read_stats_cache(client, bucket, mode)
|
||||
|
||||
if cached_stats and cached_validation:
|
||||
# Validate cache against current bucket state
|
||||
if _is_cache_valid(
|
||||
cached_validation, current_object_count, current_compressed_size
|
||||
):
|
||||
phase2_duration = time.time() - phase2_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 2: Cache HIT in {phase2_duration:.2f}s - "
|
||||
f"Using cached stats for {bucket} (mode={mode}, bucket unchanged)"
|
||||
)
|
||||
return cached_stats
|
||||
else:
|
||||
phase2_duration = time.time() - phase2_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 2: Cache INVALID in {phase2_duration:.2f}s - "
|
||||
f"Bucket changed: count {cached_validation.get('object_count')} → {current_object_count}, "
|
||||
f"size {cached_validation.get('compressed_size')} → {current_compressed_size}"
|
||||
)
|
||||
else:
|
||||
phase2_duration = time.time() - phase2_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 2: Cache MISS in {phase2_duration:.2f}s - "
|
||||
f"No valid cache found"
|
||||
)
|
||||
else:
|
||||
if refresh_cache:
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 2: Cache SKIPPED (refresh requested)"
|
||||
)
|
||||
elif not use_cache:
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 2: Cache DISABLED"
|
||||
)
|
||||
|
||||
# Phase 3: Cache miss or invalid - compute stats from scratch
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 3: Computing stats (mode={mode})"
|
||||
)
|
||||
|
||||
# Phase 4: Extract delta keys for metadata fetching
|
||||
phase4_start = time.time()
|
||||
delta_keys = [obj["key"] for obj in raw_objects if obj["key"].endswith(".delta")]
|
||||
phase4_duration = time.time() - phase4_start
|
||||
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 4: Delta extraction completed in {phase4_duration:.3f}s - "
|
||||
f"Found {len(delta_keys)} delta files"
|
||||
)
|
||||
|
||||
# Phase 5: Fetch metadata for delta files based on mode
|
||||
phase5_start = time.time()
|
||||
metadata_map: dict[str, dict[str, Any]] = {}
|
||||
sampled_space_metadata: dict[str, dict[str, Any]] | None = None
|
||||
|
||||
if delta_keys:
|
||||
if mode == "detailed":
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 5: Fetching metadata for ALL {len(delta_keys)} delta files"
|
||||
)
|
||||
metadata_map = _fetch_delta_metadata(client, bucket, delta_keys)
|
||||
|
||||
elif mode == "sampled":
|
||||
# Sample one delta per deltaspace
|
||||
seen_spaces: set[str] = set()
|
||||
sampled_keys: list[str] = []
|
||||
for key in delta_keys:
|
||||
space = _extract_deltaspace(key)
|
||||
if space not in seen_spaces:
|
||||
seen_spaces.add(space)
|
||||
sampled_keys.append(key)
|
||||
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 5: Sampling {len(sampled_keys)} delta files "
|
||||
f"(one per deltaspace) out of {len(delta_keys)} total delta files"
|
||||
)
|
||||
|
||||
# Log which files are being sampled
|
||||
if sampled_keys:
|
||||
for idx, key in enumerate(sampled_keys[:10], 1): # Show first 10
|
||||
space = _extract_deltaspace(key)
|
||||
client.service.logger.info(
|
||||
f" [{idx}] Sampling: {key} (deltaspace: '{space or '(root)'}')"
|
||||
)
|
||||
if len(sampled_keys) > 10:
|
||||
client.service.logger.info(f" ... and {len(sampled_keys) - 10} more")
|
||||
|
||||
if sampled_keys:
|
||||
metadata_map = _fetch_delta_metadata(client, bucket, sampled_keys)
|
||||
sampled_space_metadata = {
|
||||
_extract_deltaspace(k): metadata for k, metadata in metadata_map.items()
|
||||
}
|
||||
|
||||
phase5_duration = time.time() - phase5_start
|
||||
if mode == "quick":
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 5: Skipped metadata fetching (quick mode) in {phase5_duration:.3f}s"
|
||||
)
|
||||
else:
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 5: Metadata fetching completed in {phase5_duration:.2f}s - "
|
||||
f"Fetched {len(metadata_map)} metadata records"
|
||||
)
|
||||
|
||||
# Phase 6: Build ObjectInfo list
|
||||
phase6_start = time.time()
|
||||
all_objects = _build_object_info_list(
|
||||
raw_objects,
|
||||
metadata_map,
|
||||
client.service.logger,
|
||||
sampled_space_metadata,
|
||||
)
|
||||
phase6_duration = time.time() - phase6_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 6: ObjectInfo list built in {phase6_duration:.3f}s - "
|
||||
f"{len(all_objects)} objects processed"
|
||||
)
|
||||
|
||||
# Phase 7: Calculate final statistics
|
||||
phase7_start = time.time()
|
||||
stats = _calculate_bucket_statistics(all_objects, bucket, client.service.logger, mode)
|
||||
phase7_duration = time.time() - phase7_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 7: Statistics calculated in {phase7_duration:.3f}s - "
|
||||
f"{stats.delta_objects} delta, {stats.direct_objects} direct objects"
|
||||
)
|
||||
|
||||
# Phase 8: Write cache if enabled
|
||||
phase8_start = time.time()
|
||||
if use_cache:
|
||||
_write_stats_cache(
|
||||
client=client,
|
||||
bucket=bucket,
|
||||
mode=mode,
|
||||
stats=stats,
|
||||
object_count=current_object_count,
|
||||
compressed_size=current_compressed_size,
|
||||
)
|
||||
phase8_duration = time.time() - phase8_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 8: Cache written in {phase8_duration:.3f}s"
|
||||
)
|
||||
else:
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 8: Cache write skipped (caching disabled)"
|
||||
)
|
||||
|
||||
# Summary
|
||||
total_duration = time.time() - phase1_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] COMPLETE: Total time {total_duration:.2f}s for bucket '{bucket}' (mode={mode})"
|
||||
)
|
||||
|
||||
stats.object_limit_reached = limit_reached
|
||||
return stats
|
||||
|
||||
except Exception as e:
|
||||
# Last resort: return empty stats with error indication
|
||||
client.service.logger.error(
|
||||
f"get_bucket_stats: Failed to build statistics for '{bucket}': {e}. "
|
||||
f"Returning empty stats."
|
||||
)
|
||||
return BucketStats(
|
||||
bucket=bucket,
|
||||
object_count=0,
|
||||
total_size=0,
|
||||
compressed_size=0,
|
||||
space_saved=0,
|
||||
average_compression_ratio=0.0,
|
||||
delta_objects=0,
|
||||
direct_objects=0,
|
||||
object_limit_reached=False,
|
||||
)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Public API Functions
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def estimate_compression(
|
||||
client: Any, # DeltaGliderClient
|
||||
file_path: str | Path,
|
||||
bucket: str,
|
||||
prefix: str = "",
|
||||
sample_size: int = 1024 * 1024,
|
||||
) -> CompressionEstimate:
|
||||
"""Estimate compression ratio before upload.
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
file_path: Local file to estimate
|
||||
bucket: Target bucket
|
||||
prefix: Target prefix (for finding similar files)
|
||||
sample_size: Bytes to sample for estimation (default 1MB)
|
||||
|
||||
Returns:
|
||||
CompressionEstimate with predicted compression
|
||||
"""
|
||||
file_path = Path(file_path)
|
||||
file_size = file_path.stat().st_size
|
||||
|
||||
filename = file_path.name
|
||||
ext = file_path.suffix.lower()
|
||||
|
||||
# Already compressed formats that won't benefit from delta
|
||||
incompressible = {".jpg", ".jpeg", ".png", ".mp4", ".mp3", ".avi", ".mov"}
|
||||
|
||||
if ext in incompressible:
|
||||
return CompressionEstimate(
|
||||
original_size=file_size,
|
||||
estimated_compressed_size=file_size,
|
||||
estimated_ratio=0.0,
|
||||
confidence=0.95,
|
||||
should_use_delta=False,
|
||||
)
|
||||
|
||||
if not is_delta_candidate(filename):
|
||||
# Unknown type, conservative estimate
|
||||
return CompressionEstimate(
|
||||
original_size=file_size,
|
||||
estimated_compressed_size=file_size,
|
||||
estimated_ratio=0.0,
|
||||
confidence=0.5,
|
||||
should_use_delta=file_size > 1024 * 1024, # Only for files > 1MB
|
||||
)
|
||||
|
||||
# Look for similar files in the target location
|
||||
similar_files = find_similar_files(client, bucket, prefix, file_path.name)
|
||||
|
||||
if similar_files:
|
||||
# If we have similar files, estimate high compression
|
||||
estimated_ratio = 0.99 # 99% compression typical for similar versions
|
||||
confidence = 0.9
|
||||
recommended_ref = similar_files[0]["Key"] if similar_files else None
|
||||
else:
|
||||
# First file of its type
|
||||
estimated_ratio = 0.0
|
||||
confidence = 0.7
|
||||
recommended_ref = None
|
||||
|
||||
estimated_size = int(file_size * (1 - estimated_ratio))
|
||||
|
||||
return CompressionEstimate(
|
||||
original_size=file_size,
|
||||
estimated_compressed_size=estimated_size,
|
||||
estimated_ratio=estimated_ratio,
|
||||
confidence=confidence,
|
||||
recommended_reference=recommended_ref,
|
||||
should_use_delta=True,
|
||||
)
|
||||
|
||||
|
||||
def find_similar_files(
|
||||
client: Any, # DeltaGliderClient
|
||||
bucket: str,
|
||||
prefix: str,
|
||||
filename: str,
|
||||
limit: int = 5,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Find similar files that could serve as references.
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
bucket: S3 bucket
|
||||
prefix: Prefix to search in
|
||||
filename: Filename to match against
|
||||
limit: Maximum number of results
|
||||
|
||||
Returns:
|
||||
List of similar files with scores
|
||||
"""
|
||||
# List objects in the prefix (no metadata needed for similarity check)
|
||||
response = client.list_objects(
|
||||
Bucket=bucket,
|
||||
Prefix=prefix,
|
||||
MaxKeys=1000,
|
||||
FetchMetadata=False, # Don't need metadata for similarity
|
||||
)
|
||||
|
||||
similar: list[dict[str, Any]] = []
|
||||
base_name = Path(filename).stem
|
||||
ext = Path(filename).suffix
|
||||
|
||||
for obj in response["Contents"]:
|
||||
obj_key = obj["Key"]
|
||||
obj_base = Path(obj_key).stem
|
||||
obj_ext = Path(obj_key).suffix
|
||||
|
||||
# Skip delta files and references
|
||||
if obj_key.endswith(".delta") or obj_key.endswith("reference.bin"):
|
||||
continue
|
||||
|
||||
score = 0.0
|
||||
|
||||
# Extension match
|
||||
if ext == obj_ext:
|
||||
score += 0.5
|
||||
|
||||
# Base name similarity
|
||||
if base_name in obj_base or obj_base in base_name:
|
||||
score += 0.3
|
||||
|
||||
# Version pattern match
|
||||
if re.search(r"v?\d+[\.\d]*", base_name) and re.search(r"v?\d+[\.\d]*", obj_base):
|
||||
score += 0.2
|
||||
|
||||
if score > 0.5:
|
||||
similar.append(
|
||||
{
|
||||
"Key": obj_key,
|
||||
"Size": obj["Size"],
|
||||
"Similarity": score,
|
||||
"LastModified": obj["LastModified"],
|
||||
}
|
||||
)
|
||||
|
||||
# Sort by similarity
|
||||
similar.sort(key=lambda x: x["Similarity"], reverse=True) # type: ignore
|
||||
|
||||
return similar[:limit]
|
||||
@@ -1,5 +1,10 @@
"""Core domain for DeltaGlider."""

from .delta_extensions import (
    DEFAULT_COMPOUND_DELTA_EXTENSIONS,
    DEFAULT_DELTA_EXTENSIONS,
    is_delta_candidate,
)
from .errors import (
    DeltaGliderError,
    DiffDecodeError,
@@ -11,14 +16,17 @@ from .errors import (
    StorageIOError,
)
from .models import (
    DeleteResult,
    DeltaMeta,
    DeltaSpace,
    ObjectKey,
    PutSummary,
    RecursiveDeleteResult,
    ReferenceMeta,
    Sha256,
    VerifyResult,
)
from .s3_uri import S3Url, build_s3_url, is_s3_url, parse_s3_url
from .service import DeltaService

__all__ = [
@@ -30,12 +38,21 @@ __all__ = [
    "DiffDecodeError",
    "StorageIOError",
    "PolicyViolationWarning",
    "DeleteResult",
    "DeltaSpace",
    "ObjectKey",
    "RecursiveDeleteResult",
    "Sha256",
    "DeltaMeta",
    "ReferenceMeta",
    "PutSummary",
    "VerifyResult",
    "DeltaService",
    "DEFAULT_DELTA_EXTENSIONS",
    "DEFAULT_COMPOUND_DELTA_EXTENSIONS",
    "is_delta_candidate",
    "S3Url",
    "build_s3_url",
    "is_s3_url",
    "parse_s3_url",
]
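Assuming this hunk is the core package's __init__ module (which the relative imports and __all__ suggest), downstream code can pull the new helpers straight from the package root; bucket and key values below are illustrative:

from deltaglider.core import is_delta_candidate, parse_s3_url

if is_delta_candidate("app-v2.zip"):
    target = parse_s3_url("s3://releases/v2/app-v2.zip")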
src/deltaglider/core/config.py   (new file, 53 lines)
@@ -0,0 +1,53 @@
"""Centralized configuration for DeltaGlider."""

import os
from dataclasses import dataclass, field


@dataclass(slots=True)
class DeltaGliderConfig:
    """All DeltaGlider configuration in one place.

    Environment variables (all optional):
        DG_MAX_RATIO: Max delta/file ratio before falling back to direct storage.
            Range 0.0-1.0, default 0.5.
        DG_LOG_LEVEL: Logging level. Default "INFO".
        DG_CACHE_BACKEND: "filesystem" (default) or "memory".
        DG_CACHE_MEMORY_SIZE_MB: Memory cache size in MB. Default 100.
        DG_METRICS: Metrics backend: "noop", "logging" (default), "cloudwatch".
        DG_METRICS_NAMESPACE: CloudWatch namespace. Default "DeltaGlider".
    """

    max_ratio: float = 0.5
    log_level: str = "INFO"
    cache_backend: str = "filesystem"
    cache_memory_size_mb: int = 100
    metrics_type: str = "logging"
    metrics_namespace: str = "DeltaGlider"

    # Connection params (typically passed by CLI, not env vars)
    endpoint_url: str | None = field(default=None, repr=False)
    region: str | None = None
    profile: str | None = None

    @classmethod
    def from_env(
        cls,
        *,
        log_level: str = "INFO",
        endpoint_url: str | None = None,
        region: str | None = None,
        profile: str | None = None,
    ) -> "DeltaGliderConfig":
        """Build config from environment variables + explicit overrides."""
        return cls(
            max_ratio=float(os.environ.get("DG_MAX_RATIO", "0.5")),
            log_level=os.environ.get("DG_LOG_LEVEL", log_level),
            cache_backend=os.environ.get("DG_CACHE_BACKEND", "filesystem"),
            cache_memory_size_mb=int(os.environ.get("DG_CACHE_MEMORY_SIZE_MB", "100")),
            metrics_type=os.environ.get("DG_METRICS", "logging"),
            metrics_namespace=os.environ.get("DG_METRICS_NAMESPACE", "DeltaGlider"),
            endpoint_url=endpoint_url,
            region=region,
            profile=profile,
        )
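A short sketch of how this config is typically built from the environment; the endpoint value is only an example for a local S3-compatible server and is not taken from this diff:

import os
from deltaglider.core.config import DeltaGliderConfig

os.environ["DG_MAX_RATIO"] = "0.3"        # fall back to direct storage once a delta exceeds 30% of the file
os.environ["DG_CACHE_BACKEND"] = "memory"

cfg = DeltaGliderConfig.from_env(log_level="DEBUG", endpoint_url="http://localhost:9000")
# cfg.max_ratio == 0.3, cfg.cache_backend == "memory"; DG_LOG_LEVEL would override "DEBUG" if it were set.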
src/deltaglider/core/delta_extensions.py   (new file, 56 lines)
@@ -0,0 +1,56 @@
"""Shared delta compression extension policy."""

from __future__ import annotations

from collections.abc import Collection, Iterable

# Compound extensions must be checked before simple suffix matching so that
# multi-part archives like ".tar.gz" are handled correctly.
DEFAULT_COMPOUND_DELTA_EXTENSIONS: tuple[str, ...] = (".tar.gz", ".tar.bz2", ".tar.xz")

# Simple extensions that benefit from delta compression. Keep this structure
# immutable so it can be safely reused across modules.
DEFAULT_DELTA_EXTENSIONS: frozenset[str] = frozenset(
    {
        ".zip",
        ".tar",
        ".gz",
        ".tgz",
        ".bz2",
        ".xz",
        ".7z",
        ".rar",
        ".dmg",
        ".iso",
        ".pkg",
        ".deb",
        ".rpm",
        ".apk",
        ".jar",
        ".war",
        ".ear",
    }
)


def is_delta_candidate(
    filename: str,
    *,
    simple_extensions: Collection[str] = DEFAULT_DELTA_EXTENSIONS,
    compound_extensions: Iterable[str] = DEFAULT_COMPOUND_DELTA_EXTENSIONS,
) -> bool:
    """Check if a filename should use delta compression based on extension."""
    name_lower = filename.lower()

    for ext in compound_extensions:
        if name_lower.endswith(ext):
            return True

    return any(name_lower.endswith(ext) for ext in simple_extensions)


__all__ = [
    "DEFAULT_COMPOUND_DELTA_EXTENSIONS",
    "DEFAULT_DELTA_EXTENSIONS",
    "is_delta_candidate",
]
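A quick sketch of how the extension policy above behaves for a few representative filenames (filenames are illustrative):

from deltaglider.core.delta_extensions import is_delta_candidate

is_delta_candidate("app-1.2.3.tar.gz")   # True  - compound extension, checked before simple suffixes
is_delta_candidate("installer.zip")      # True  - simple extension in DEFAULT_DELTA_EXTENSIONS
is_delta_candidate("screenshot.jpg")     # False - not an archive-like extension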
@@ -47,3 +47,15 @@ class PolicyViolationWarning(Warning):
    """Policy violation warning."""

    pass


class CacheMissError(DeltaGliderError):
    """Cache miss - file not found in cache."""

    pass


class CacheCorruptionError(DeltaGliderError):
    """Cache corruption - SHA mismatch or tampering detected."""

    pass
@@ -1,8 +1,80 @@
|
||||
"""Core domain models."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
|
||||
# Metadata key prefix for DeltaGlider
|
||||
# AWS S3 automatically adds 'x-amz-meta-' prefix, so our keys become 'x-amz-meta-dg-*'
|
||||
METADATA_PREFIX = "dg-"
|
||||
|
||||
# Canonical metadata key aliases.
|
||||
# Each field maps to all known key formats (current prefixed, legacy underscore, legacy bare,
|
||||
# legacy hyphenated). Order matters: first match wins during lookup.
|
||||
# Both DeltaMeta.from_dict() and service-layer _meta_value() MUST use these to stay in sync.
|
||||
METADATA_KEY_ALIASES: dict[str, tuple[str, ...]] = {
|
||||
"tool": (f"{METADATA_PREFIX}tool", "dg_tool", "tool"),
|
||||
"original_name": (
|
||||
f"{METADATA_PREFIX}original-name",
|
||||
"dg_original_name",
|
||||
"original_name",
|
||||
"original-name",
|
||||
),
|
||||
"file_sha256": (
|
||||
f"{METADATA_PREFIX}file-sha256",
|
||||
"dg_file_sha256",
|
||||
"file_sha256",
|
||||
"file-sha256",
|
||||
),
|
||||
"file_size": (
|
||||
f"{METADATA_PREFIX}file-size",
|
||||
"dg_file_size",
|
||||
"file_size",
|
||||
"file-size",
|
||||
),
|
||||
"created_at": (
|
||||
f"{METADATA_PREFIX}created-at",
|
||||
"dg_created_at",
|
||||
"created_at",
|
||||
"created-at",
|
||||
),
|
||||
"ref_key": (f"{METADATA_PREFIX}ref-key", "dg_ref_key", "ref_key", "ref-key"),
|
||||
"ref_sha256": (
|
||||
f"{METADATA_PREFIX}ref-sha256",
|
||||
"dg_ref_sha256",
|
||||
"ref_sha256",
|
||||
"ref-sha256",
|
||||
),
|
||||
"delta_size": (
|
||||
f"{METADATA_PREFIX}delta-size",
|
||||
"dg_delta_size",
|
||||
"delta_size",
|
||||
"delta-size",
|
||||
),
|
||||
"delta_cmd": (
|
||||
f"{METADATA_PREFIX}delta-cmd",
|
||||
"dg_delta_cmd",
|
||||
"delta_cmd",
|
||||
"delta-cmd",
|
||||
),
|
||||
"note": (f"{METADATA_PREFIX}note", "dg_note", "note"),
|
||||
}
|
||||
|
||||
|
||||
def resolve_metadata(metadata: dict[str, str], field: str) -> str | None:
|
||||
"""Look up a metadata field using all known key aliases.
|
||||
|
||||
Returns the first non-empty match, or None if not found.
|
||||
"""
|
||||
for key in METADATA_KEY_ALIASES[field]:
|
||||
value = metadata.get(key)
|
||||
if value not in (None, ""):
|
||||
return value
|
||||
return None
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class DeltaSpace:
|
||||
@@ -23,6 +95,11 @@ class ObjectKey:
|
||||
bucket: str
|
||||
key: str
|
||||
|
||||
@property
|
||||
def full_key(self) -> str:
|
||||
"""Full S3 path: bucket/key."""
|
||||
return f"{self.bucket}/{self.key}"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class Sha256:
|
||||
@@ -47,13 +124,13 @@ class ReferenceMeta:
|
||||
note: str = "reference"
|
||||
|
||||
def to_dict(self) -> dict[str, str]:
|
||||
"""Convert to S3 metadata dict."""
|
||||
"""Convert to S3 metadata dict with DeltaGlider namespace prefix."""
|
||||
return {
|
||||
"tool": self.tool,
|
||||
"source_name": self.source_name,
|
||||
"file_sha256": self.file_sha256,
|
||||
"created_at": self.created_at.isoformat() + "Z",
|
||||
"note": self.note,
|
||||
f"{METADATA_PREFIX}tool": self.tool,
|
||||
f"{METADATA_PREFIX}source-name": self.source_name,
|
||||
f"{METADATA_PREFIX}file-sha256": self.file_sha256,
|
||||
f"{METADATA_PREFIX}created-at": self.created_at.isoformat() + "Z",
|
||||
f"{METADATA_PREFIX}note": self.note,
|
||||
}
|
||||
|
||||
|
||||
@@ -73,36 +150,79 @@ class DeltaMeta:
|
||||
note: str | None = None
|
||||
|
||||
def to_dict(self) -> dict[str, str]:
|
||||
"""Convert to S3 metadata dict."""
|
||||
"""Convert to S3 metadata dict with DeltaGlider namespace prefix."""
|
||||
meta = {
|
||||
"tool": self.tool,
|
||||
"original_name": self.original_name,
|
||||
"file_sha256": self.file_sha256,
|
||||
"file_size": str(self.file_size),
|
||||
"created_at": self.created_at.isoformat() + "Z",
|
||||
"ref_key": self.ref_key,
|
||||
"ref_sha256": self.ref_sha256,
|
||||
"delta_size": str(self.delta_size),
|
||||
"delta_cmd": self.delta_cmd,
|
||||
f"{METADATA_PREFIX}tool": self.tool,
|
||||
f"{METADATA_PREFIX}original-name": self.original_name,
|
||||
f"{METADATA_PREFIX}file-sha256": self.file_sha256,
|
||||
f"{METADATA_PREFIX}file-size": str(self.file_size),
|
||||
f"{METADATA_PREFIX}created-at": self.created_at.isoformat() + "Z",
|
||||
f"{METADATA_PREFIX}ref-key": self.ref_key,
|
||||
f"{METADATA_PREFIX}ref-sha256": self.ref_sha256,
|
||||
f"{METADATA_PREFIX}delta-size": str(self.delta_size),
|
||||
f"{METADATA_PREFIX}delta-cmd": self.delta_cmd,
|
||||
}
|
||||
if self.note:
|
||||
meta["note"] = self.note
|
||||
meta[f"{METADATA_PREFIX}note"] = self.note
|
||||
return meta
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: dict[str, str]) -> "DeltaMeta":
|
||||
"""Create from S3 metadata dict."""
|
||||
"""Create from S3 metadata dict with DeltaGlider namespace prefix."""
|
||||
|
||||
def _require(field: str) -> str:
|
||||
value = resolve_metadata(data, field)
|
||||
if value is None:
|
||||
raise KeyError(METADATA_KEY_ALIASES[field][0])
|
||||
return value
|
||||
|
||||
tool = _require("tool")
|
||||
original_name = _require("original_name")
|
||||
file_sha = _require("file_sha256")
|
||||
file_size_raw = _require("file_size")
|
||||
created_at_raw = _require("created_at")
|
||||
ref_key = _require("ref_key")
|
||||
ref_sha = _require("ref_sha256")
|
||||
delta_size_raw = _require("delta_size")
|
||||
delta_cmd_value = resolve_metadata(data, "delta_cmd") or ""
|
||||
note_value = resolve_metadata(data, "note") or ""
|
||||
|
||||
try:
|
||||
file_size = int(file_size_raw)
|
||||
except (TypeError, ValueError):
|
||||
raise ValueError(f"Invalid file size metadata: {file_size_raw}") from None
|
||||
|
||||
try:
|
||||
delta_size = int(delta_size_raw)
|
||||
except (TypeError, ValueError):
|
||||
raise ValueError(f"Invalid delta size metadata: {delta_size_raw}") from None
|
||||
|
||||
created_at_text = created_at_raw.rstrip("Z")
|
||||
try:
|
||||
created_at = datetime.fromisoformat(created_at_text)
|
||||
except ValueError as exc:
|
||||
raise ValueError(f"Invalid created_at metadata: {created_at_raw}") from exc
|
||||
|
||||
if not delta_cmd_value:
|
||||
object_name = original_name or "<unknown>"
|
||||
logger.warning(
|
||||
"Delta metadata missing %s for %s; using empty command",
|
||||
f"{METADATA_PREFIX}delta-cmd",
|
||||
object_name,
|
||||
)
|
||||
delta_cmd_value = ""
|
||||
|
||||
return cls(
|
||||
tool=data["tool"],
|
||||
original_name=data["original_name"],
|
||||
file_sha256=data["file_sha256"],
|
||||
file_size=int(data["file_size"]),
|
||||
created_at=datetime.fromisoformat(data["created_at"].rstrip("Z")),
|
||||
ref_key=data["ref_key"],
|
||||
ref_sha256=data["ref_sha256"],
|
||||
delta_size=int(data["delta_size"]),
|
||||
delta_cmd=data["delta_cmd"],
|
||||
note=data.get("note"),
|
||||
tool=tool,
|
||||
original_name=original_name,
|
||||
file_sha256=file_sha,
|
||||
file_size=file_size,
|
||||
created_at=created_at,
|
||||
ref_key=ref_key,
|
||||
ref_sha256=ref_sha,
|
||||
delta_size=delta_size,
|
||||
delta_cmd=delta_cmd_value,
|
||||
note=note_value or None,
|
||||
)
|
||||
|
||||
|
||||
@@ -131,3 +251,33 @@ class VerifyResult:
|
||||
expected_sha256: str
|
||||
actual_sha256: str
|
||||
message: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class DeleteResult:
|
||||
"""Result of a single delete operation."""
|
||||
|
||||
key: str
|
||||
bucket: str
|
||||
deleted: bool = False
|
||||
type: str = "unknown"
|
||||
warnings: list[str] = field(default_factory=list)
|
||||
original_name: str | None = None
|
||||
dependent_deltas: int = 0
|
||||
cleaned_reference: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class RecursiveDeleteResult:
|
||||
"""Result of a recursive delete operation."""
|
||||
|
||||
bucket: str
|
||||
prefix: str
|
||||
deleted_count: int = 0
|
||||
failed_count: int = 0
|
||||
deltas_deleted: int = 0
|
||||
references_deleted: int = 0
|
||||
direct_deleted: int = 0
|
||||
other_deleted: int = 0
|
||||
errors: list[str] = field(default_factory=list)
|
||||
warnings: list[str] = field(default_factory=list)
|
||||
|
||||
src/deltaglider/core/object_listing.py   (new file, 222 lines)
@@ -0,0 +1,222 @@
|
||||
"""Shared helpers for listing bucket objects with pagination support."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from ..ports.storage import ObjectHead
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class ObjectListing:
|
||||
"""All objects and prefixes returned from a bucket listing."""
|
||||
|
||||
objects: list[dict[str, Any]] = field(default_factory=list)
|
||||
common_prefixes: list[str] = field(default_factory=list)
|
||||
key_count: int = 0
|
||||
is_truncated: bool = False
|
||||
next_continuation_token: str | None = None
|
||||
limit_reached: bool = False
|
||||
|
||||
|
||||
def list_objects_page(
|
||||
storage: Any,
|
||||
*,
|
||||
bucket: str,
|
||||
prefix: str = "",
|
||||
delimiter: str = "",
|
||||
max_keys: int = 1000,
|
||||
start_after: str | None = None,
|
||||
continuation_token: str | None = None,
|
||||
) -> ObjectListing:
|
||||
"""Perform a single list_objects call using the storage adapter."""
|
||||
if not hasattr(storage, "list_objects"):
|
||||
raise NotImplementedError("Storage adapter does not support list_objects")
|
||||
|
||||
response = storage.list_objects(
|
||||
bucket=bucket,
|
||||
prefix=prefix,
|
||||
delimiter=delimiter,
|
||||
max_keys=max_keys,
|
||||
start_after=start_after,
|
||||
continuation_token=continuation_token,
|
||||
)
|
||||
|
||||
return ObjectListing(
|
||||
objects=list(response.get("objects", [])),
|
||||
common_prefixes=list(response.get("common_prefixes", [])),
|
||||
key_count=response.get("key_count", len(response.get("objects", []))),
|
||||
is_truncated=bool(response.get("is_truncated", False)),
|
||||
next_continuation_token=response.get("next_continuation_token"),
|
||||
)
|
||||
|
||||
|
||||
def list_all_objects(
|
||||
storage: Any,
|
||||
*,
|
||||
bucket: str,
|
||||
prefix: str = "",
|
||||
delimiter: str = "",
|
||||
max_keys: int = 1000,
|
||||
logger: Any | None = None,
|
||||
max_iterations: int = 10_000,
|
||||
max_objects: int | None = None,
|
||||
) -> ObjectListing:
|
||||
"""Fetch all objects under the given bucket/prefix with pagination safety."""
|
||||
import time
|
||||
from datetime import UTC, datetime
|
||||
|
||||
aggregated = ObjectListing()
|
||||
continuation_token: str | None = None
|
||||
iteration_count = 0
|
||||
list_start_time = time.time()
|
||||
limit_reached = False
|
||||
|
||||
while True:
|
||||
iteration_count += 1
|
||||
if iteration_count > max_iterations:
|
||||
if logger:
|
||||
logger.warning(
|
||||
"list_all_objects: reached max iterations (%s). Returning partial results.",
|
||||
max_iterations,
|
||||
)
|
||||
aggregated.is_truncated = True
|
||||
aggregated.next_continuation_token = continuation_token
|
||||
break
|
||||
|
||||
# Log progress every 10 pages or on first page
|
||||
if logger and (iteration_count == 1 or iteration_count % 10 == 0):
|
||||
elapsed = time.time() - list_start_time
|
||||
objects_per_sec = len(aggregated.objects) / elapsed if elapsed > 0 else 0
|
||||
token_info = f", token={continuation_token[:20]}..." if continuation_token else ""
|
||||
logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] LIST pagination: "
|
||||
f"page {iteration_count}, {len(aggregated.objects)} objects so far "
|
||||
f"({objects_per_sec:.0f} obj/s, {elapsed:.1f}s elapsed{token_info})"
|
||||
)
|
||||
|
||||
# Warn if taking very long (>60s)
|
||||
if elapsed > 60 and iteration_count % 50 == 0:
|
||||
estimated_total = (len(aggregated.objects) / iteration_count) * max_iterations
|
||||
logger.warning(
|
||||
f"LIST operation is slow ({elapsed:.0f}s elapsed). "
|
||||
f"This bucket has MANY objects ({len(aggregated.objects)} so far). "
|
||||
f"Consider using a smaller prefix or enabling caching. "
|
||||
f"Estimated remaining: {estimated_total - len(aggregated.objects):.0f} objects"
|
||||
)
|
||||
|
||||
try:
|
||||
page = list_objects_page(
|
||||
storage,
|
||||
bucket=bucket,
|
||||
prefix=prefix,
|
||||
delimiter=delimiter,
|
||||
max_keys=max_keys,
|
||||
continuation_token=continuation_token,
|
||||
)
|
||||
except Exception as exc:
|
||||
if not aggregated.objects:
|
||||
raise RuntimeError(f"Failed to list objects for bucket '{bucket}': {exc}") from exc
|
||||
if logger:
|
||||
logger.warning(
|
||||
"list_all_objects: pagination error after %s objects: %s. Returning partial results.",
|
||||
len(aggregated.objects),
|
||||
exc,
|
||||
)
|
||||
aggregated.is_truncated = True
|
||||
aggregated.next_continuation_token = continuation_token
|
||||
break
|
||||
|
||||
aggregated.objects.extend(page.objects)
|
||||
aggregated.common_prefixes.extend(page.common_prefixes)
|
||||
aggregated.key_count += page.key_count
|
||||
|
||||
if max_objects is not None and len(aggregated.objects) >= max_objects:
|
||||
if logger:
|
||||
logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] LIST capped at {max_objects} objects."
|
||||
)
|
||||
aggregated.objects = aggregated.objects[:max_objects]
|
||||
aggregated.key_count = len(aggregated.objects)
|
||||
aggregated.is_truncated = True
|
||||
aggregated.next_continuation_token = page.next_continuation_token
|
||||
limit_reached = True
|
||||
break
|
||||
|
||||
if not page.is_truncated:
|
||||
aggregated.is_truncated = False
|
||||
aggregated.next_continuation_token = None
|
||||
if logger:
|
||||
elapsed = time.time() - list_start_time
|
||||
logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] LIST complete: "
|
||||
f"{iteration_count} pages, {len(aggregated.objects)} objects total in {elapsed:.2f}s"
|
||||
)
|
||||
break
|
||||
|
||||
continuation_token = page.next_continuation_token
|
||||
if not continuation_token:
|
||||
if logger:
|
||||
logger.warning(
|
||||
"list_all_objects: truncated response without continuation token after %s objects.",
|
||||
len(aggregated.objects),
|
||||
)
|
||||
aggregated.is_truncated = True
|
||||
aggregated.next_continuation_token = None
|
||||
break
|
||||
|
||||
if aggregated.common_prefixes:
|
||||
seen: set[str] = set()
|
||||
unique_prefixes: list[str] = []
|
||||
for prefix in aggregated.common_prefixes:
|
||||
if prefix not in seen:
|
||||
seen.add(prefix)
|
||||
unique_prefixes.append(prefix)
|
||||
aggregated.common_prefixes = unique_prefixes
|
||||
aggregated.key_count = len(aggregated.objects)
|
||||
aggregated.limit_reached = limit_reached
|
||||
return aggregated
|
||||
|
||||
|
||||
def _parse_last_modified(value: Any) -> datetime:
|
||||
if isinstance(value, datetime):
|
||||
dt = value
|
||||
elif value:
|
||||
text = str(value)
|
||||
if text.endswith("Z"):
|
||||
text = text[:-1] + "+00:00"
|
||||
try:
|
||||
dt = datetime.fromisoformat(text)
|
||||
except ValueError:
|
||||
dt = datetime.fromtimestamp(0, tz=timezone.utc) # noqa: UP017
|
||||
else:
|
||||
dt = datetime.fromtimestamp(0, tz=timezone.utc) # noqa: UP017
|
||||
|
||||
if dt.tzinfo is None:
|
||||
dt = dt.replace(tzinfo=timezone.utc) # noqa: UP017
|
||||
return dt
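The fallbacks above are easy to miss; here is a small behaviour sketch (the helper is private to this module, so no import path is given and the calls assume it is in scope):

```python
# Behaviour sketch for the private _parse_last_modified helper (assumed in scope).
dt = _parse_last_modified("2024-05-01T12:00:00Z")
assert dt.tzinfo is not None                            # "Z" suffix is normalised to UTC
assert _parse_last_modified(None).year == 1970          # missing value falls back to epoch
assert _parse_last_modified("not-a-date").year == 1970  # unparsable value also falls back
```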
|
||||
|
||||
|
||||
def object_dict_to_head(obj: dict[str, Any]) -> ObjectHead:
|
||||
"""Convert a list_objects entry into ObjectHead for compatibility uses."""
|
||||
metadata = obj.get("metadata")
|
||||
if metadata is None or not isinstance(metadata, dict):
|
||||
metadata = {}
|
||||
|
||||
return ObjectHead(
|
||||
key=obj["key"],
|
||||
size=int(obj.get("size", 0)),
|
||||
etag=str(obj.get("etag", "")),
|
||||
last_modified=_parse_last_modified(obj.get("last_modified")),
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
|
||||
__all__ = [
|
||||
"ObjectListing",
|
||||
"list_objects_page",
|
||||
"list_all_objects",
|
||||
"object_dict_to_head",
|
||||
]
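A hedged usage sketch for the helpers exported above. The import is omitted because the module path is not visible in this excerpt, `storage` stands for any StoragePort implementation, and the keyword names are taken from the function body rather than a documented signature:

```python
# Sketch: aggregate a large listing with a hard cap (names assumed from the code above).
listing = list_all_objects(
    storage,
    bucket="release-artifacts",
    prefix="builds/2024/",
    max_objects=10_000,   # cap aggregation instead of walking the whole bucket
)
print(f"{listing.key_count} objects, truncated={listing.is_truncated}")
if listing.limit_reached:
    print("resume with token:", listing.next_continuation_token)

# Assuming the aggregated entries are the dict form object_dict_to_head expects,
# they can be adapted for code paths that still want ObjectHead values.
heads = [object_dict_to_head(entry) for entry in listing.objects]
```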
|
||||
85
src/deltaglider/core/s3_uri.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""Utilities for working with S3-style URLs and keys."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import NamedTuple
|
||||
|
||||
S3_SCHEME = "s3://"
|
||||
|
||||
|
||||
class S3Url(NamedTuple):
|
||||
"""Normalized representation of an S3 URL."""
|
||||
|
||||
bucket: str
|
||||
key: str = ""
|
||||
|
||||
def to_url(self) -> str:
|
||||
"""Return the canonical string form."""
|
||||
if self.key:
|
||||
return f"{S3_SCHEME}{self.bucket}/{self.key}"
|
||||
return f"{S3_SCHEME}{self.bucket}"
|
||||
|
||||
def with_key(self, key: str) -> S3Url:
|
||||
"""Return a new S3Url with a different key."""
|
||||
return S3Url(self.bucket, key.lstrip("/"))
|
||||
|
||||
def join_key(self, suffix: str) -> S3Url:
|
||||
"""Append a suffix to the key using '/' semantics."""
|
||||
suffix = suffix.lstrip("/")
|
||||
if not self.key:
|
||||
return self.with_key(suffix)
|
||||
if not suffix:
|
||||
return self
|
||||
return self.with_key(f"{self.key.rstrip('/')}/{suffix}")
|
||||
|
||||
|
||||
def is_s3_url(value: str) -> bool:
|
||||
"""Check if a string is an S3 URL."""
|
||||
return value.startswith(S3_SCHEME)
|
||||
|
||||
|
||||
def parse_s3_url(
|
||||
url: str,
|
||||
*,
|
||||
allow_empty_key: bool = True,
|
||||
strip_trailing_slash: bool = False,
|
||||
) -> S3Url:
|
||||
"""Parse an S3 URL into bucket and key components."""
|
||||
if not is_s3_url(url):
|
||||
raise ValueError(f"Invalid S3 URL: {url}")
|
||||
|
||||
path = url[len(S3_SCHEME) :]
|
||||
if strip_trailing_slash:
|
||||
path = path.rstrip("/")
|
||||
|
||||
bucket, sep, key = path.partition("/")
|
||||
if not bucket:
|
||||
raise ValueError(f"S3 URL missing bucket: {url}")
|
||||
|
||||
if not sep:
|
||||
key = ""
|
||||
|
||||
key = key.lstrip("/")
|
||||
if not key and not allow_empty_key:
|
||||
raise ValueError(f"S3 URL must include a key: {url}")
|
||||
|
||||
return S3Url(bucket=bucket, key=key)
|
||||
|
||||
|
||||
def build_s3_url(bucket: str, key: str | None = None) -> str:
|
||||
"""Build an S3 URL from components."""
|
||||
if not bucket:
|
||||
raise ValueError("Bucket name cannot be empty")
|
||||
|
||||
if key:
|
||||
key = key.lstrip("/")
|
||||
return f"{S3_SCHEME}{bucket}/{key}"
|
||||
return f"{S3_SCHEME}{bucket}"
|
||||
|
||||
|
||||
__all__ = [
|
||||
"S3Url",
|
||||
"build_s3_url",
|
||||
"is_s3_url",
|
||||
"parse_s3_url",
|
||||
]
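A quick usage sketch of the helpers above (the bucket and key names are illustrative):

```python
from deltaglider.core.s3_uri import S3Url, build_s3_url, parse_s3_url

url = parse_s3_url("s3://releases/v1.2.3/app.zip")
assert url == S3Url(bucket="releases", key="v1.2.3/app.zip")

# Derive related keys without manual string surgery
assert url.with_key(url.key + ".delta").key == "v1.2.3/app.zip.delta"
assert url.join_key("metadata.json").key == "v1.2.3/app.zip/metadata.json"

# Round-trip back to the canonical string form
assert build_s3_url(url.bucket, url.key) == url.to_url() == "s3://releases/v1.2.3/app.zip"
```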
|
||||
@@ -2,9 +2,11 @@
|
||||
|
||||
import tempfile
|
||||
import warnings
|
||||
from datetime import UTC, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Any, BinaryIO
|
||||
|
||||
from .. import __version__
|
||||
from ..ports import (
|
||||
CachePort,
|
||||
ClockPort,
|
||||
@@ -15,6 +17,11 @@ from ..ports import (
|
||||
StoragePort,
|
||||
)
|
||||
from ..ports.storage import ObjectHead
|
||||
from .delta_extensions import (
|
||||
DEFAULT_COMPOUND_DELTA_EXTENSIONS,
|
||||
DEFAULT_DELTA_EXTENSIONS,
|
||||
is_delta_candidate,
|
||||
)
|
||||
from .errors import (
|
||||
DiffDecodeError,
|
||||
DiffEncodeError,
|
||||
@@ -23,12 +30,15 @@ from .errors import (
|
||||
PolicyViolationWarning,
|
||||
)
|
||||
from .models import (
|
||||
DeleteResult,
|
||||
DeltaMeta,
|
||||
DeltaSpace,
|
||||
ObjectKey,
|
||||
PutSummary,
|
||||
RecursiveDeleteResult,
|
||||
ReferenceMeta,
|
||||
VerifyResult,
|
||||
resolve_metadata,
|
||||
)
|
||||
|
||||
|
||||
@@ -44,10 +54,17 @@ class DeltaService:
|
||||
clock: ClockPort,
|
||||
logger: LoggerPort,
|
||||
metrics: MetricsPort,
|
||||
tool_version: str = "deltaglider/0.1.0",
|
||||
tool_version: str | None = None,
|
||||
max_ratio: float = 0.5,
|
||||
):
|
||||
"""Initialize service with ports."""
|
||||
"""Initialize service with ports.
|
||||
|
||||
Args:
|
||||
tool_version: Version string for metadata. If None, uses package __version__.
|
||||
"""
|
||||
# Use real package version if not explicitly provided
|
||||
if tool_version is None:
|
||||
tool_version = f"deltaglider/{__version__}"
|
||||
self.storage = storage
|
||||
self.diff = diff
|
||||
self.hasher = hasher
|
||||
@@ -58,51 +75,41 @@ class DeltaService:
|
||||
self.tool_version = tool_version
|
||||
self.max_ratio = max_ratio
|
||||
|
||||
# File extensions that should use delta compression
|
||||
self.delta_extensions = {
|
||||
".zip",
|
||||
".tar",
|
||||
".gz",
|
||||
".tar.gz",
|
||||
".tgz",
|
||||
".bz2",
|
||||
".tar.bz2",
|
||||
".xz",
|
||||
".tar.xz",
|
||||
".7z",
|
||||
".rar",
|
||||
".dmg",
|
||||
".iso",
|
||||
".pkg",
|
||||
".deb",
|
||||
".rpm",
|
||||
".apk",
|
||||
".jar",
|
||||
".war",
|
||||
".ear",
|
||||
}
|
||||
# File extensions that should use delta compression. Keep mutable copies
|
||||
# so advanced callers can customize the policy if needed.
|
||||
self.delta_extensions = set(DEFAULT_DELTA_EXTENSIONS)
|
||||
self.compound_delta_extensions = DEFAULT_COMPOUND_DELTA_EXTENSIONS
|
||||
|
||||
def should_use_delta(self, filename: str) -> bool:
|
||||
"""Check if file should use delta compression based on extension."""
|
||||
name_lower = filename.lower()
|
||||
# Check compound extensions first
|
||||
for ext in [".tar.gz", ".tar.bz2", ".tar.xz"]:
|
||||
if name_lower.endswith(ext):
|
||||
return True
|
||||
# Check simple extensions
|
||||
return any(name_lower.endswith(ext) for ext in self.delta_extensions)
|
||||
return is_delta_candidate(
|
||||
filename,
|
||||
simple_extensions=self.delta_extensions,
|
||||
compound_extensions=self.compound_delta_extensions,
|
||||
)
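Because the extension sets are now ordinary mutable collections, the delta policy can be tuned per service instance. A minimal sketch, assuming `service` is an already-constructed DeltaService and that is_delta_candidate keeps the suffix-matching behaviour of the inline check it replaces:

```python
# Sketch (assumes `service` is a constructed DeltaService instance).
service.delta_extensions.add(".vsix")   # opt an extra archive type into delta compression
assert service.should_use_delta("plugin-1.4.2.vsix")
assert not service.should_use_delta("notes.txt")
```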
|
||||
|
||||
def put(
|
||||
self, local_file: Path, delta_space: DeltaSpace, max_ratio: float | None = None
|
||||
self,
|
||||
local_file: Path,
|
||||
delta_space: DeltaSpace,
|
||||
max_ratio: float | None = None,
|
||||
override_name: str | None = None,
|
||||
) -> PutSummary:
|
||||
"""Upload file as reference or delta (for archive files) or directly (for other files)."""
|
||||
"""Upload file as reference or delta (for archive files) or directly (for other files).
|
||||
|
||||
Args:
|
||||
local_file: Path to the local file to upload
|
||||
delta_space: DeltaSpace (bucket + prefix) for the upload
|
||||
max_ratio: Maximum acceptable delta/file ratio (default: service max_ratio)
|
||||
override_name: Optional name to use instead of local_file.name (useful for S3-to-S3 copies)
|
||||
"""
|
||||
if max_ratio is None:
|
||||
max_ratio = self.max_ratio
|
||||
|
||||
start_time = self.clock.now()
|
||||
file_size = local_file.stat().st_size
|
||||
file_sha256 = self.hasher.sha256(local_file)
|
||||
original_name = local_file.name
|
||||
original_name = override_name if override_name else local_file.name
|
||||
|
||||
self.logger.info(
|
||||
"Starting put operation",
|
||||
@@ -166,13 +173,13 @@ class DeltaService:
|
||||
self.logger.info("Starting get operation", key=object_key.key)
|
||||
|
||||
# Get object metadata
|
||||
obj_head = self.storage.head(f"{object_key.bucket}/{object_key.key}")
|
||||
obj_head = self.storage.head(object_key.full_key)
|
||||
if obj_head is None:
|
||||
raise NotFoundError(f"Object not found: {object_key.key}")
|
||||
|
||||
# Check if this is a regular S3 object (not uploaded via DeltaGlider)
|
||||
# Regular S3 objects won't have DeltaGlider metadata
|
||||
if "file_sha256" not in obj_head.metadata:
|
||||
# Regular S3 objects won't have DeltaGlider metadata (dg-file-sha256 key)
|
||||
if "dg-file-sha256" not in obj_head.metadata:
|
||||
# This is a regular S3 object, download it directly
|
||||
self.logger.info(
|
||||
"Downloading regular S3 object (no DeltaGlider metadata)",
|
||||
@@ -196,11 +203,13 @@ class DeltaService:
|
||||
# Direct download without delta processing
|
||||
self._get_direct(object_key, obj_head, out)
|
||||
duration = (self.clock.now() - start_time).total_seconds()
|
||||
file_size_meta = resolve_metadata(obj_head.metadata, "file_size")
|
||||
file_size_value = int(file_size_meta) if file_size_meta else obj_head.size
|
||||
self.logger.log_operation(
|
||||
op="get",
|
||||
key=object_key.key,
|
||||
deltaspace=f"{object_key.bucket}",
|
||||
sizes={"file": int(obj_head.metadata.get("file_size", 0))},
|
||||
sizes={"file": file_size_value},
|
||||
durations={"total": duration},
|
||||
cache_hit=False,
|
||||
)
|
||||
@@ -230,12 +239,15 @@ class DeltaService:
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmp_path = Path(tmpdir)
|
||||
delta_path = tmp_path / "delta"
|
||||
ref_path = self.cache.ref_path(delta_space.bucket, delta_space.prefix)
|
||||
# SECURITY: Use validated ref to prevent TOCTOU attacks
|
||||
ref_path = self.cache.get_validated_ref(
|
||||
delta_space.bucket, delta_space.prefix, delta_meta.ref_sha256
|
||||
)
|
||||
out_path = tmp_path / "output"
|
||||
|
||||
# Download delta
|
||||
with open(delta_path, "wb") as f:
|
||||
delta_stream = self.storage.get(f"{object_key.bucket}/{object_key.key}")
|
||||
delta_stream = self.storage.get(object_key.full_key)
|
||||
for chunk in iter(lambda: delta_stream.read(8192), b""):
|
||||
f.write(chunk)
|
||||
|
||||
@@ -335,10 +347,13 @@ class DeltaService:
|
||||
|
||||
# Re-check for race condition
|
||||
ref_head = self.storage.head(full_ref_key)
|
||||
if ref_head and ref_head.metadata.get("file_sha256") != file_sha256:
|
||||
existing_sha = None
|
||||
if ref_head:
|
||||
existing_sha = resolve_metadata(ref_head.metadata, "file_sha256")
|
||||
if ref_head and existing_sha and existing_sha != file_sha256:
|
||||
self.logger.warning("Reference creation race detected, using existing")
|
||||
# Proceed with existing reference
|
||||
ref_sha256 = ref_head.metadata["file_sha256"]
|
||||
ref_sha256 = existing_sha
|
||||
else:
|
||||
ref_sha256 = file_sha256
|
||||
|
||||
@@ -401,14 +416,17 @@ class DeltaService:
|
||||
) -> PutSummary:
|
||||
"""Create delta file."""
|
||||
ref_key = delta_space.reference_key()
|
||||
ref_sha256 = ref_head.metadata["file_sha256"]
|
||||
ref_sha256 = resolve_metadata(ref_head.metadata, "file_sha256")
|
||||
if not ref_sha256:
|
||||
raise ValueError("Reference metadata missing file SHA256")
|
||||
|
||||
# Ensure reference is cached
|
||||
cache_hit = self.cache.has_ref(delta_space.bucket, delta_space.prefix, ref_sha256)
|
||||
if not cache_hit:
|
||||
self._cache_reference(delta_space, ref_sha256)
|
||||
|
||||
ref_path = self.cache.ref_path(delta_space.bucket, delta_space.prefix)
|
||||
# SECURITY: Use validated ref to prevent TOCTOU attacks
|
||||
ref_path = self.cache.get_validated_ref(delta_space.bucket, delta_space.prefix, ref_sha256)
|
||||
|
||||
# Create delta
|
||||
with tempfile.NamedTemporaryFile(suffix=".delta") as delta_file:
|
||||
@@ -520,7 +538,7 @@ class DeltaService:
|
||||
) -> None:
|
||||
"""Download file directly from S3 without delta processing."""
|
||||
# Download the file directly
|
||||
file_stream = self.storage.get(f"{object_key.bucket}/{object_key.key}")
|
||||
file_stream = self.storage.get(object_key.full_key)
|
||||
|
||||
if isinstance(out, Path):
|
||||
# Write to file path
|
||||
@@ -533,7 +551,7 @@ class DeltaService:
|
||||
out.write(chunk)
|
||||
|
||||
# Verify integrity if SHA256 is present
|
||||
expected_sha = obj_head.metadata.get("file_sha256")
|
||||
expected_sha = resolve_metadata(obj_head.metadata, "file_sha256")
|
||||
if expected_sha:
|
||||
if isinstance(out, Path):
|
||||
actual_sha = self.hasher.sha256(out)
|
||||
@@ -554,7 +572,7 @@ class DeltaService:
|
||||
self.logger.info(
|
||||
"Direct download complete",
|
||||
key=object_key.key,
|
||||
size=obj_head.metadata.get("file_size"),
|
||||
size=resolve_metadata(obj_head.metadata, "file_size"),
|
||||
)
|
||||
|
||||
def _upload_direct(
|
||||
@@ -602,128 +620,37 @@ class DeltaService:
|
||||
file_sha256=file_sha256,
|
||||
)
|
||||
|
||||
def delete(self, object_key: ObjectKey) -> dict[str, Any]:
|
||||
def delete(self, object_key: ObjectKey) -> DeleteResult:
|
||||
"""Delete an object (delta-aware).
|
||||
|
||||
For delta files, just deletes the delta.
|
||||
For reference files, checks if any deltas depend on it first.
|
||||
For direct uploads, simply deletes the file.
|
||||
|
||||
Returns:
|
||||
DeleteResult with deletion details including type and any warnings
|
||||
"""
|
||||
start_time = self.clock.now()
|
||||
full_key = f"{object_key.bucket}/{object_key.key}"
|
||||
full_key = object_key.full_key
|
||||
|
||||
self.logger.info("Starting delete operation", key=object_key.key)
|
||||
|
||||
# Check if object exists
|
||||
obj_head = self.storage.head(full_key)
|
||||
if obj_head is None:
|
||||
raise NotFoundError(f"Object not found: {object_key.key}")
|
||||
|
||||
# Determine object type
|
||||
is_reference = object_key.key.endswith("/reference.bin")
|
||||
is_delta = object_key.key.endswith(".delta")
|
||||
is_direct = obj_head.metadata.get("compression") == "none"
|
||||
result = DeleteResult(key=object_key.key, bucket=object_key.bucket)
|
||||
|
||||
result: dict[str, Any] = {
|
||||
"key": object_key.key,
|
||||
"bucket": object_key.bucket,
|
||||
"deleted": False,
|
||||
"type": "unknown",
|
||||
"warnings": [],
|
||||
}
|
||||
|
||||
if is_reference:
|
||||
# Check if any deltas depend on this reference
|
||||
prefix = object_key.key.rsplit("/", 1)[0] if "/" in object_key.key else ""
|
||||
dependent_deltas = []
|
||||
|
||||
for obj in self.storage.list(f"{object_key.bucket}/{prefix}"):
|
||||
if obj.key.endswith(".delta") and obj.key != object_key.key:
|
||||
# Check if this delta references our reference
|
||||
delta_head = self.storage.head(f"{object_key.bucket}/{obj.key}")
|
||||
if delta_head and delta_head.metadata.get("ref_key") == object_key.key:
|
||||
dependent_deltas.append(obj.key)
|
||||
|
||||
if dependent_deltas:
|
||||
warnings_list = result["warnings"]
|
||||
assert isinstance(warnings_list, list)
|
||||
warnings_list.append(
|
||||
f"Reference has {len(dependent_deltas)} dependent delta(s). "
|
||||
"Deleting this will make those deltas unrecoverable."
|
||||
)
|
||||
self.logger.warning(
|
||||
"Reference has dependent deltas",
|
||||
ref_key=object_key.key,
|
||||
delta_count=len(dependent_deltas),
|
||||
deltas=dependent_deltas[:5], # Log first 5
|
||||
)
|
||||
|
||||
# Delete the reference
|
||||
if object_key.key.endswith("/reference.bin"):
|
||||
self._delete_reference(object_key, full_key, result)
|
||||
elif object_key.key.endswith(".delta"):
|
||||
self._delete_delta(object_key, full_key, obj_head, result)
|
||||
elif obj_head.metadata.get("compression") == "none":
|
||||
self.storage.delete(full_key)
|
||||
result["deleted"] = True
|
||||
result["type"] = "reference"
|
||||
result["dependent_deltas"] = len(dependent_deltas)
|
||||
|
||||
# Clear from cache if present
|
||||
if "/" in object_key.key:
|
||||
deltaspace_prefix = object_key.key.rsplit("/", 1)[0]
|
||||
try:
|
||||
self.cache.evict(object_key.bucket, deltaspace_prefix)
|
||||
except Exception as e:
|
||||
self.logger.debug(f"Could not clear cache for {object_key.key}: {e}")
|
||||
|
||||
elif is_delta:
|
||||
# Delete the delta file
|
||||
self.storage.delete(full_key)
|
||||
result["deleted"] = True
|
||||
result["type"] = "delta"
|
||||
result["original_name"] = obj_head.metadata.get("original_name", "unknown")
|
||||
|
||||
# Check if this was the last delta in the DeltaSpace - if so, clean up reference.bin
|
||||
if "/" in object_key.key:
|
||||
deltaspace_prefix = "/".join(object_key.key.split("/")[:-1])
|
||||
ref_key = f"{deltaspace_prefix}/reference.bin"
|
||||
|
||||
# Check if any other delta files exist in this DeltaSpace
|
||||
remaining_deltas = []
|
||||
for obj in self.storage.list(f"{object_key.bucket}/{deltaspace_prefix}"):
|
||||
if obj.key.endswith(".delta") and obj.key != object_key.key:
|
||||
remaining_deltas.append(obj.key)
|
||||
|
||||
if not remaining_deltas:
|
||||
# No more deltas - clean up the orphaned reference.bin
|
||||
ref_full_key = f"{object_key.bucket}/{ref_key}"
|
||||
ref_head = self.storage.head(ref_full_key)
|
||||
if ref_head:
|
||||
self.storage.delete(ref_full_key)
|
||||
self.logger.info(
|
||||
"Cleaned up orphaned reference.bin",
|
||||
ref_key=ref_key,
|
||||
reason="no remaining deltas",
|
||||
)
|
||||
result["cleaned_reference"] = ref_key
|
||||
|
||||
# Clear from cache
|
||||
try:
|
||||
self.cache.evict(object_key.bucket, deltaspace_prefix)
|
||||
except Exception as e:
|
||||
self.logger.debug(f"Could not clear cache for {deltaspace_prefix}: {e}")
|
||||
|
||||
elif is_direct:
|
||||
# Simply delete the direct upload
|
||||
self.storage.delete(full_key)
|
||||
result["deleted"] = True
|
||||
result["type"] = "direct"
|
||||
result["original_name"] = obj_head.metadata.get("original_name", object_key.key)
|
||||
|
||||
result.deleted = True
|
||||
result.type = "direct"
|
||||
result.original_name = obj_head.metadata.get("original_name", object_key.key)
|
||||
else:
|
||||
# Unknown file type, delete anyway
|
||||
self.storage.delete(full_key)
|
||||
result["deleted"] = True
|
||||
result["type"] = "unknown"
|
||||
result.deleted = True
|
||||
result.type = "unknown"
|
||||
|
||||
duration = (self.clock.now() - start_time).total_seconds()
|
||||
self.logger.log_operation(
|
||||
@@ -735,169 +662,139 @@ class DeltaService:
|
||||
cache_hit=False,
|
||||
)
|
||||
self.metrics.timing("deltaglider.delete.duration", duration)
|
||||
self.metrics.increment(f"deltaglider.delete.{result['type']}")
|
||||
self.metrics.increment(f"deltaglider.delete.{result.type}")
|
||||
|
||||
return result
|
||||
|
||||
def delete_recursive(self, bucket: str, prefix: str) -> dict[str, Any]:
|
||||
def _delete_reference(self, object_key: ObjectKey, full_key: str, result: DeleteResult) -> None:
|
||||
"""Handle deletion of a reference.bin file."""
|
||||
prefix = object_key.key.rsplit("/", 1)[0] if "/" in object_key.key else ""
|
||||
dependent_deltas = []
|
||||
|
||||
for obj in self.storage.list(f"{object_key.bucket}/{prefix}"):
|
||||
if obj.key.endswith(".delta") and obj.key != object_key.key:
|
||||
delta_head = self.storage.head(f"{object_key.bucket}/{obj.key}")
|
||||
if delta_head and delta_head.metadata.get("ref_key") == object_key.key:
|
||||
dependent_deltas.append(obj.key)
|
||||
|
||||
if dependent_deltas:
|
||||
result.warnings.append(
|
||||
f"Reference has {len(dependent_deltas)} dependent delta(s). "
|
||||
"Deleting this will make those deltas unrecoverable."
|
||||
)
|
||||
self.logger.warning(
|
||||
"Reference has dependent deltas",
|
||||
ref_key=object_key.key,
|
||||
delta_count=len(dependent_deltas),
|
||||
deltas=dependent_deltas[:5],
|
||||
)
|
||||
|
||||
self.storage.delete(full_key)
|
||||
result.deleted = True
|
||||
result.type = "reference"
|
||||
result.dependent_deltas = len(dependent_deltas)
|
||||
|
||||
if "/" in object_key.key:
|
||||
deltaspace_prefix = object_key.key.rsplit("/", 1)[0]
|
||||
try:
|
||||
self.cache.evict(object_key.bucket, deltaspace_prefix)
|
||||
except Exception as e:
|
||||
self.logger.debug(f"Could not clear cache for {object_key.key}: {e}")
|
||||
|
||||
def _delete_delta(
|
||||
self,
|
||||
object_key: ObjectKey,
|
||||
full_key: str,
|
||||
obj_head: ObjectHead,
|
||||
result: DeleteResult,
|
||||
) -> None:
|
||||
"""Handle deletion of a delta file, cleaning up orphaned references."""
|
||||
self.storage.delete(full_key)
|
||||
result.deleted = True
|
||||
result.type = "delta"
|
||||
result.original_name = obj_head.metadata.get("original_name", "unknown")
|
||||
|
||||
if "/" not in object_key.key:
|
||||
return
|
||||
|
||||
deltaspace_prefix = "/".join(object_key.key.split("/")[:-1])
|
||||
ref_key = f"{deltaspace_prefix}/reference.bin"
|
||||
|
||||
remaining_deltas = [
|
||||
obj.key
|
||||
for obj in self.storage.list(f"{object_key.bucket}/{deltaspace_prefix}")
|
||||
if obj.key.endswith(".delta") and obj.key != object_key.key
|
||||
]
|
||||
|
||||
if not remaining_deltas:
|
||||
ref_full_key = f"{object_key.bucket}/{ref_key}"
|
||||
ref_head = self.storage.head(ref_full_key)
|
||||
if ref_head:
|
||||
self.storage.delete(ref_full_key)
|
||||
self.logger.info(
|
||||
"Cleaned up orphaned reference.bin",
|
||||
ref_key=ref_key,
|
||||
reason="no remaining deltas",
|
||||
)
|
||||
result.cleaned_reference = ref_key
|
||||
|
||||
try:
|
||||
self.cache.evict(object_key.bucket, deltaspace_prefix)
|
||||
except Exception as e:
|
||||
self.logger.debug(f"Could not clear cache for {deltaspace_prefix}: {e}")
|
||||
|
||||
def delete_recursive(self, bucket: str, prefix: str) -> RecursiveDeleteResult:
|
||||
"""Recursively delete all objects under a prefix (delta-aware).
|
||||
|
||||
Handles delta relationships intelligently:
|
||||
- Deletes deltas before references
|
||||
- Warns about orphaned deltas
|
||||
- Handles direct uploads
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Prefix to delete recursively
|
||||
|
||||
Returns:
|
||||
dict with deletion statistics and any warnings
|
||||
"""
|
||||
start_time = self.clock.now()
|
||||
self.logger.info("Starting recursive delete", bucket=bucket, prefix=prefix)
|
||||
|
||||
# Ensure prefix ends with / for proper directory deletion
|
||||
if prefix and not prefix.endswith("/"):
|
||||
prefix = f"{prefix}/"
|
||||
|
||||
# Collect all objects under prefix
|
||||
objects_to_delete = []
|
||||
references = []
|
||||
deltas = []
|
||||
direct_uploads = []
|
||||
affected_deltaspaces = set()
|
||||
# Phase 1: classify objects by type
|
||||
references, deltas, direct_uploads, other_objects, affected_deltaspaces = (
|
||||
self._classify_objects_for_deletion(bucket, prefix)
|
||||
)
|
||||
|
||||
for obj in self.storage.list(f"{bucket}/{prefix}" if prefix else bucket):
|
||||
if not obj.key.startswith(prefix) and prefix:
|
||||
continue
|
||||
|
||||
if obj.key.endswith("/reference.bin"):
|
||||
references.append(obj.key)
|
||||
elif obj.key.endswith(".delta"):
|
||||
deltas.append(obj.key)
|
||||
# Track which deltaspaces are affected by this deletion
|
||||
if "/" in obj.key:
|
||||
deltaspace_prefix = "/".join(obj.key.split("/")[:-1])
|
||||
affected_deltaspaces.add(deltaspace_prefix)
|
||||
else:
|
||||
# Check if it's a direct upload
|
||||
obj_head = self.storage.head(f"{bucket}/{obj.key}")
|
||||
if obj_head and obj_head.metadata.get("compression") == "none":
|
||||
direct_uploads.append(obj.key)
|
||||
else:
|
||||
objects_to_delete.append(obj.key)
|
||||
|
||||
# Also check for references in parent directories that might be affected
|
||||
# by the deletion of delta files in affected deltaspaces
|
||||
for deltaspace_prefix in affected_deltaspaces:
|
||||
ref_key = f"{deltaspace_prefix}/reference.bin"
|
||||
# Also check for references in parent deltaspaces affected by delta deletion
|
||||
for ds_prefix in affected_deltaspaces:
|
||||
ref_key = f"{ds_prefix}/reference.bin"
|
||||
if ref_key not in references:
|
||||
# Check if this reference exists
|
||||
ref_head = self.storage.head(f"{bucket}/{ref_key}")
|
||||
if ref_head:
|
||||
references.append(ref_key)
|
||||
|
||||
result: dict[str, Any] = {
|
||||
"bucket": bucket,
|
||||
"prefix": prefix,
|
||||
"deleted_count": 0,
|
||||
"failed_count": 0,
|
||||
"deltas_deleted": len(deltas),
|
||||
"references_deleted": len(references),
|
||||
"direct_deleted": len(direct_uploads),
|
||||
"other_deleted": len(objects_to_delete),
|
||||
"errors": [],
|
||||
"warnings": [],
|
||||
}
|
||||
result = RecursiveDeleteResult(
|
||||
bucket=bucket,
|
||||
prefix=prefix,
|
||||
deltas_deleted=len(deltas),
|
||||
references_deleted=len(references),
|
||||
direct_deleted=len(direct_uploads),
|
||||
other_deleted=len(other_objects),
|
||||
)
|
||||
|
||||
# Delete in order: other files -> direct uploads -> deltas -> references (with checks)
|
||||
# This ensures we don't delete references that deltas depend on prematurely
|
||||
regular_files = objects_to_delete + direct_uploads + deltas
|
||||
|
||||
# Delete regular files first
|
||||
for key in regular_files:
|
||||
# Phase 2: delete non-reference files first (dependency order)
|
||||
for key in other_objects + direct_uploads + deltas:
|
||||
try:
|
||||
self.storage.delete(f"{bucket}/{key}")
|
||||
deleted_count = result["deleted_count"]
|
||||
assert isinstance(deleted_count, int)
|
||||
result["deleted_count"] = deleted_count + 1
|
||||
result.deleted_count += 1
|
||||
self.logger.debug(f"Deleted {key}")
|
||||
except Exception as e:
|
||||
failed_count = result["failed_count"]
|
||||
assert isinstance(failed_count, int)
|
||||
result["failed_count"] = failed_count + 1
|
||||
errors_list = result["errors"]
|
||||
assert isinstance(errors_list, list)
|
||||
errors_list.append(f"Failed to delete {key}: {str(e)}")
|
||||
result.failed_count += 1
|
||||
result.errors.append(f"Failed to delete {key}: {str(e)}")
|
||||
self.logger.error(f"Failed to delete {key}: {e}")
|
||||
|
||||
# Handle references intelligently - only delete if no files outside deletion scope depend on them
|
||||
references_kept = 0
|
||||
for ref_key in references:
|
||||
try:
|
||||
# Extract deltaspace prefix from reference.bin path
|
||||
if ref_key.endswith("/reference.bin"):
|
||||
deltaspace_prefix = ref_key[:-14] # Remove "/reference.bin"
|
||||
else:
|
||||
deltaspace_prefix = ""
|
||||
# Phase 3: delete references only if safe
|
||||
references_kept = self._delete_references_if_safe(bucket, prefix, references, result)
|
||||
result.references_deleted -= references_kept
|
||||
|
||||
# Check if there are any remaining files in this deltaspace
|
||||
# (outside of the deletion prefix)
|
||||
deltaspace_list_prefix = (
|
||||
f"{bucket}/{deltaspace_prefix}" if deltaspace_prefix else bucket
|
||||
)
|
||||
remaining_objects = list(self.storage.list(deltaspace_list_prefix))
|
||||
|
||||
# Filter out objects that are being deleted (within our deletion scope)
|
||||
# and the reference.bin file itself
|
||||
deletion_prefix_full = f"{bucket}/{prefix}" if prefix else bucket
|
||||
has_remaining_files = False
|
||||
|
||||
for remaining_obj in remaining_objects:
|
||||
obj_full_path = f"{bucket}/{remaining_obj.key}"
|
||||
# Skip if this object is within our deletion scope
|
||||
if prefix and obj_full_path.startswith(deletion_prefix_full):
|
||||
continue
|
||||
# Skip if this is the reference.bin file itself
|
||||
if remaining_obj.key == ref_key:
|
||||
continue
|
||||
# If we find any other file, the reference is still needed
|
||||
has_remaining_files = True
|
||||
break
|
||||
|
||||
if not has_remaining_files:
|
||||
# Safe to delete this reference.bin
|
||||
self.storage.delete(f"{bucket}/{ref_key}")
|
||||
deleted_count = result["deleted_count"]
|
||||
assert isinstance(deleted_count, int)
|
||||
result["deleted_count"] = deleted_count + 1
|
||||
self.logger.debug(f"Deleted reference {ref_key}")
|
||||
else:
|
||||
# Keep the reference as it's still needed
|
||||
references_kept += 1
|
||||
warnings_list = result["warnings"]
|
||||
assert isinstance(warnings_list, list)
|
||||
warnings_list.append(f"Kept reference {ref_key} (still in use)")
|
||||
self.logger.info(
|
||||
f"Kept reference {ref_key} - still in use outside deletion scope"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
failed_count = result["failed_count"]
|
||||
assert isinstance(failed_count, int)
|
||||
result["failed_count"] = failed_count + 1
|
||||
errors_list = result["errors"]
|
||||
assert isinstance(errors_list, list)
|
||||
errors_list.append(f"Failed to delete reference {ref_key}: {str(e)}")
|
||||
self.logger.error(f"Failed to delete reference {ref_key}: {e}")
|
||||
|
||||
# Update reference deletion count
|
||||
references_deleted = result["references_deleted"]
|
||||
assert isinstance(references_deleted, int)
|
||||
result["references_deleted"] = references_deleted - references_kept
|
||||
|
||||
# Clear any cached references for this prefix
|
||||
# Clear cached references
|
||||
if references:
|
||||
try:
|
||||
self.cache.evict(bucket, prefix.rstrip("/") if prefix else "")
|
||||
@@ -909,11 +806,291 @@ class DeltaService:
|
||||
"Recursive delete complete",
|
||||
bucket=bucket,
|
||||
prefix=prefix,
|
||||
deleted=result["deleted_count"],
|
||||
failed=result["failed_count"],
|
||||
deleted=result.deleted_count,
|
||||
failed=result.failed_count,
|
||||
duration=duration,
|
||||
)
|
||||
self.metrics.timing("deltaglider.delete_recursive.duration", duration)
|
||||
self.metrics.increment("deltaglider.delete_recursive.completed")
|
||||
|
||||
return result
|
||||
|
||||
def _classify_objects_for_deletion(
|
||||
self, bucket: str, prefix: str
|
||||
) -> tuple[list[str], list[str], list[str], list[str], set[str]]:
|
||||
"""Classify objects under a prefix into references, deltas, direct uploads, and other.
|
||||
|
||||
Returns:
|
||||
(references, deltas, direct_uploads, other_objects, affected_deltaspaces)
|
||||
"""
|
||||
references: list[str] = []
|
||||
deltas: list[str] = []
|
||||
direct_uploads: list[str] = []
|
||||
other_objects: list[str] = []
|
||||
affected_deltaspaces: set[str] = set()
|
||||
|
||||
for obj in self.storage.list(f"{bucket}/{prefix}" if prefix else bucket):
|
||||
if prefix and not obj.key.startswith(prefix):
|
||||
continue
|
||||
|
||||
if obj.key.endswith("/reference.bin"):
|
||||
references.append(obj.key)
|
||||
elif obj.key.endswith(".delta"):
|
||||
deltas.append(obj.key)
|
||||
if "/" in obj.key:
|
||||
affected_deltaspaces.add("/".join(obj.key.split("/")[:-1]))
|
||||
else:
|
||||
obj_head = self.storage.head(f"{bucket}/{obj.key}")
|
||||
if obj_head and obj_head.metadata.get("compression") == "none":
|
||||
direct_uploads.append(obj.key)
|
||||
else:
|
||||
other_objects.append(obj.key)
|
||||
|
||||
return references, deltas, direct_uploads, other_objects, affected_deltaspaces
|
||||
|
||||
def _delete_references_if_safe(
|
||||
self,
|
||||
bucket: str,
|
||||
prefix: str,
|
||||
references: list[str],
|
||||
result: RecursiveDeleteResult,
|
||||
) -> int:
|
||||
"""Delete references only if no files outside the deletion scope depend on them.
|
||||
|
||||
Returns the number of references kept (not deleted).
|
||||
"""
|
||||
references_kept = 0
|
||||
deletion_prefix_full = f"{bucket}/{prefix}" if prefix else bucket
|
||||
|
||||
for ref_key in references:
|
||||
try:
|
||||
if ref_key.endswith("/reference.bin"):
|
||||
deltaspace_prefix = ref_key[:-14] # Remove "/reference.bin"
|
||||
else:
|
||||
deltaspace_prefix = ""
|
||||
|
||||
ds_list_prefix = f"{bucket}/{deltaspace_prefix}" if deltaspace_prefix else bucket
|
||||
has_remaining_files = any(
|
||||
not (prefix and f"{bucket}/{obj.key}".startswith(deletion_prefix_full))
|
||||
and obj.key != ref_key
|
||||
for obj in self.storage.list(ds_list_prefix)
|
||||
)
|
||||
|
||||
if not has_remaining_files:
|
||||
self.storage.delete(f"{bucket}/{ref_key}")
|
||||
result.deleted_count += 1
|
||||
self.logger.debug(f"Deleted reference {ref_key}")
|
||||
else:
|
||||
references_kept += 1
|
||||
result.warnings.append(f"Kept reference {ref_key} (still in use)")
|
||||
self.logger.info(
|
||||
f"Kept reference {ref_key} - still in use outside deletion scope"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
result.failed_count += 1
|
||||
result.errors.append(f"Failed to delete reference {ref_key}: {str(e)}")
|
||||
self.logger.error(f"Failed to delete reference {ref_key}: {e}")
|
||||
|
||||
return references_kept
|
||||
|
||||
def rehydrate_for_download(
|
||||
self,
|
||||
bucket: str,
|
||||
key: str,
|
||||
expires_in_seconds: int = 3600,
|
||||
) -> str | None:
|
||||
"""Rehydrate a deltaglider-compressed file for direct download.
|
||||
|
||||
If the file is deltaglider-compressed, this will:
|
||||
1. Download and decompress the file
|
||||
2. Re-upload to .deltaglider/tmp/ with expiration metadata
|
||||
3. Return the new temporary file key
|
||||
|
||||
If the file is not deltaglider-compressed, returns None.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
key: Object key
|
||||
expires_in_seconds: How long the temporary file should exist
|
||||
|
||||
Returns:
|
||||
New key for temporary file, or None if not deltaglider-compressed
|
||||
"""
|
||||
start_time = self.clock.now()
|
||||
|
||||
# Check if object exists and is deltaglider-compressed
|
||||
obj_head = self.storage.head(f"{bucket}/{key}")
|
||||
|
||||
# If not found directly, try with .delta extension
|
||||
if obj_head is None and not key.endswith(".delta"):
|
||||
obj_head = self.storage.head(f"{bucket}/{key}.delta")
|
||||
if obj_head is not None:
|
||||
# Found the delta version, update the key
|
||||
key = f"{key}.delta"
|
||||
|
||||
if obj_head is None:
|
||||
raise NotFoundError(f"Object not found: {key}")
|
||||
|
||||
# Check if this is a deltaglider file
|
||||
is_delta = key.endswith(".delta")
|
||||
has_dg_metadata = "dg-file-sha256" in obj_head.metadata
|
||||
|
||||
if not is_delta and not has_dg_metadata:
|
||||
# Not a deltaglider file, return None
|
||||
self.logger.debug(f"File {key} is not deltaglider-compressed")
|
||||
return None
|
||||
|
||||
# Generate temporary file path
|
||||
import uuid
|
||||
|
||||
# Use the original filename without .delta extension for the temp file
|
||||
original_name = key.removesuffix(".delta") if key.endswith(".delta") else key
|
||||
temp_filename = f"{uuid.uuid4().hex}_{Path(original_name).name}"
|
||||
temp_key = f".deltaglider/tmp/{temp_filename}"
|
||||
|
||||
# Download and decompress the file
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmp_path = Path(tmpdir)
|
||||
decompressed_path = tmp_path / "decompressed"
|
||||
|
||||
# Use the existing get method to decompress
|
||||
object_key = ObjectKey(bucket=bucket, key=key)
|
||||
self.get(object_key, decompressed_path)
|
||||
|
||||
# Calculate expiration time
|
||||
expires_at = self.clock.now() + timedelta(seconds=expires_in_seconds)
|
||||
|
||||
# Create metadata for temporary file
|
||||
metadata = {
|
||||
"dg-expires-at": expires_at.isoformat(),
|
||||
"dg-original-key": key,
|
||||
"dg-original-filename": Path(original_name).name,
|
||||
"dg-rehydrated": "true",
|
||||
"dg-created-at": self.clock.now().isoformat(),
|
||||
}
|
||||
|
||||
# Upload the decompressed file
|
||||
self.logger.info(
|
||||
"Uploading rehydrated file",
|
||||
original_key=key,
|
||||
temp_key=temp_key,
|
||||
expires_at=expires_at.isoformat(),
|
||||
)
|
||||
|
||||
self.storage.put(
|
||||
f"{bucket}/{temp_key}",
|
||||
decompressed_path,
|
||||
metadata,
|
||||
)
|
||||
|
||||
duration = (self.clock.now() - start_time).total_seconds()
|
||||
self.logger.info(
|
||||
"Rehydration complete",
|
||||
original_key=key,
|
||||
temp_key=temp_key,
|
||||
duration=duration,
|
||||
)
|
||||
self.metrics.timing("deltaglider.rehydrate.duration", duration)
|
||||
self.metrics.increment("deltaglider.rehydrate.completed")
|
||||
|
||||
return temp_key
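A caller-side sketch for rehydration; `service` is an assumed DeltaService instance, and generating the actual presigned URL from the temp key is left to the caller:

```python
# Sketch (assumes `service` is a constructed DeltaService instance).
temp_key = service.rehydrate_for_download(
    bucket="releases",
    key="v1.2.3/app.zip",            # a stored app.zip.delta is found automatically
    expires_in_seconds=900,
)
if temp_key is None:
    download_key = "v1.2.3/app.zip"  # plain S3 object, serve it directly
else:
    download_key = temp_key          # e.g. ".deltaglider/tmp/<uuid>_app.zip"
# hand download_key to whatever generates the presigned URL
```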
|
||||
|
||||
def purge_temp_files(self, bucket: str) -> dict[str, Any]:
|
||||
"""Purge expired temporary files from .deltaglider/tmp/.
|
||||
|
||||
Scans the .deltaglider/tmp/ prefix and deletes any files
|
||||
whose dg-expires-at metadata indicates they have expired.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket to purge temp files from
|
||||
|
||||
Returns:
|
||||
dict with purge statistics
|
||||
"""
|
||||
start_time = self.clock.now()
|
||||
prefix = ".deltaglider/tmp/"
|
||||
|
||||
self.logger.info("Starting temp file purge", bucket=bucket, prefix=prefix)
|
||||
|
||||
deleted_count = 0
|
||||
expired_count = 0
|
||||
error_count = 0
|
||||
total_size_freed = 0
|
||||
errors = []
|
||||
|
||||
# List all objects in temp directory
|
||||
for obj in self.storage.list(f"{bucket}/{prefix}"):
|
||||
if not obj.key.startswith(prefix):
|
||||
continue
|
||||
|
||||
try:
|
||||
# Get object metadata
|
||||
obj_head = self.storage.head(f"{bucket}/{obj.key}")
|
||||
if obj_head is None:
|
||||
continue
|
||||
|
||||
# Check expiration
|
||||
expires_at_str = obj_head.metadata.get("dg-expires-at")
|
||||
if not expires_at_str:
|
||||
# No expiration metadata, skip
|
||||
self.logger.debug(f"No expiration metadata for {obj.key}")
|
||||
continue
|
||||
|
||||
# Parse expiration time
|
||||
from datetime import datetime
|
||||
|
||||
try:
|
||||
expires_at = datetime.fromisoformat(expires_at_str.replace("Z", "+00:00"))
|
||||
if expires_at.tzinfo is None:
|
||||
expires_at = expires_at.replace(tzinfo=UTC)
|
||||
except ValueError:
|
||||
self.logger.warning(
|
||||
f"Invalid expiration format for {obj.key}: {expires_at_str}"
|
||||
)
|
||||
continue
|
||||
|
||||
# Check if expired
|
||||
if self.clock.now() >= expires_at:
|
||||
expired_count += 1
|
||||
# Delete the file
|
||||
self.storage.delete(f"{bucket}/{obj.key}")
|
||||
deleted_count += 1
|
||||
total_size_freed += obj.size
|
||||
self.logger.debug(
|
||||
f"Deleted expired temp file {obj.key}",
|
||||
expired_at=expires_at_str,
|
||||
size=obj.size,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
error_count += 1
|
||||
errors.append(f"Error processing {obj.key}: {str(e)}")
|
||||
self.logger.error(f"Failed to process temp file {obj.key}: {e}")
|
||||
|
||||
duration = (self.clock.now() - start_time).total_seconds()
|
||||
|
||||
result = {
|
||||
"bucket": bucket,
|
||||
"prefix": prefix,
|
||||
"deleted_count": deleted_count,
|
||||
"expired_count": expired_count,
|
||||
"error_count": error_count,
|
||||
"total_size_freed": total_size_freed,
|
||||
"duration_seconds": duration,
|
||||
"errors": errors,
|
||||
}
|
||||
|
||||
self.logger.info(
|
||||
"Temp file purge complete",
|
||||
bucket=bucket,
|
||||
deleted=deleted_count,
|
||||
size_freed=total_size_freed,
|
||||
duration=duration,
|
||||
)
|
||||
|
||||
self.metrics.timing("deltaglider.purge.duration", duration)
|
||||
self.metrics.gauge("deltaglider.purge.deleted_count", deleted_count)
|
||||
self.metrics.gauge("deltaglider.purge.size_freed", total_size_freed)
|
||||
|
||||
return result
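And the matching cleanup call; again `service` is an assumed DeltaService instance, and how often to run it (cron, background task) is up to the caller:

```python
# Sketch (assumes `service` is a constructed DeltaService instance).
stats = service.purge_temp_files("releases")
print(
    f"purged {stats['deleted_count']}/{stats['expired_count']} expired temp files, "
    f"freed {stats['total_size_freed']} bytes in {stats['duration_seconds']:.1f}s"
)
for line in stats["errors"]:
    print("purge error:", line)
```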
|
||||
|
||||
@@ -15,6 +15,26 @@ class CachePort(Protocol):
|
||||
"""Check if reference exists and matches SHA."""
|
||||
...
|
||||
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
"""Get cached reference with atomic SHA validation.
|
||||
|
||||
This method MUST be used instead of ref_path() to prevent TOCTOU attacks.
|
||||
It validates the SHA256 hash at the time of use, not just at cache check time.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Prefix/deltaspace within bucket
|
||||
expected_sha: Expected SHA256 hash of the file
|
||||
|
||||
Returns:
|
||||
Path to the validated cached file
|
||||
|
||||
Raises:
|
||||
CacheMissError: If cached file doesn't exist
|
||||
CacheCorruptionError: If SHA doesn't match (file corrupted or tampered)
|
||||
"""
|
||||
...
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path) -> Path:
|
||||
"""Cache reference file."""
|
||||
...
|
||||
@@ -22,3 +42,18 @@ class CachePort(Protocol):
|
||||
def evict(self, bucket: str, prefix: str) -> None:
|
||||
"""Remove cached reference."""
|
||||
...
|
||||
|
||||
def clear(self) -> None:
|
||||
"""Clear all cached references.
|
||||
|
||||
This method forcibly removes all cached data, useful for:
|
||||
- Long-running applications that need to free memory
|
||||
- Test cleanup
|
||||
- Manual cache invalidation
|
||||
- Ensuring fresh data fetch
|
||||
|
||||
Note: For filesystem caches, this removes all files in the cache directory.
|
||||
For memory caches, this clears all in-memory data.
|
||||
For encrypted caches, this also clears encryption key mappings.
|
||||
"""
|
||||
...
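For illustration, a minimal adapter sketch that satisfies get_validated_ref by re-hashing at use time; this is not the shipped ContentAddressedCache, and the error classes are local stand-ins for the CacheMissError/CacheCorruptionError named in the docstring above:

```python
from pathlib import Path


class CacheMissError(Exception): ...


class CacheCorruptionError(Exception): ...


class SimpleFsCache:
    """Toy cache adapter: validates the reference SHA at the moment of use."""

    def __init__(self, root: Path, hasher) -> None:
        self.root = root
        self.hasher = hasher  # assumed to expose sha256(path: Path) -> str

    def ref_path(self, bucket: str, prefix: str) -> Path:
        return self.root / bucket / prefix / "reference.bin"

    def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
        path = self.ref_path(bucket, prefix)
        if not path.is_file():
            raise CacheMissError(f"no cached reference for {bucket}/{prefix}")
        if self.hasher.sha256(path) != expected_sha:
            raise CacheCorruptionError(f"reference for {bucket}/{prefix} failed SHA-256 check")
        return path
```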
|
||||
|
||||
152
src/deltaglider/response_builders.py
Normal file
@@ -0,0 +1,152 @@
|
||||
"""Type-safe response builders using TypedDicts for internal type safety.
|
||||
|
||||
This module provides builder functions that construct boto3-compatible responses
|
||||
with full compile-time type validation using TypedDicts. At runtime, TypedDicts
|
||||
are plain dicts, so there's no conversion overhead.
|
||||
|
||||
Benefits:
|
||||
- Field name typos caught by mypy (e.g., "HttpStatusCode" instead of "HTTPStatusCode")
|
||||
- Wrong types caught by mypy (e.g., string instead of int)
|
||||
- Missing required fields caught by mypy
|
||||
- Extra unknown fields caught by mypy
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
|
||||
from .types import (
|
||||
CommonPrefix,
|
||||
DeleteObjectResponse,
|
||||
GetObjectResponse,
|
||||
ListObjectsV2Response,
|
||||
PutObjectResponse,
|
||||
ResponseMetadata,
|
||||
S3Object,
|
||||
)
|
||||
|
||||
|
||||
def build_response_metadata(status_code: int = 200) -> ResponseMetadata:
|
||||
"""Build ResponseMetadata with full type safety via TypedDict.
|
||||
|
||||
TypedDict is a dict at runtime - no conversion needed!
|
||||
mypy validates all fields match ResponseMetadata TypedDict.
|
||||
Uses our types.py TypedDict which has proper NotRequired fields.
|
||||
"""
|
||||
# Build as TypedDict - mypy validates field names and types!
|
||||
metadata: ResponseMetadata = {
|
||||
"HTTPStatusCode": status_code,
|
||||
# All other fields are NotRequired - can be omitted!
|
||||
}
|
||||
return metadata # Returns dict at runtime, ResponseMetadata type at compile-time
|
||||
|
||||
|
||||
def build_put_response(
|
||||
etag: str,
|
||||
*,
|
||||
version_id: str | None = None,
|
||||
deltaglider_info: dict[str, Any] | None = None,
|
||||
) -> PutObjectResponse:
|
||||
"""Build PutObjectResponse with full type safety via TypedDict.
|
||||
|
||||
Uses our types.py TypedDict which has proper NotRequired fields.
|
||||
mypy validates all field names, types, and structure.
|
||||
"""
|
||||
# Build as TypedDict - mypy catches typos and type errors!
|
||||
response: PutObjectResponse = {
|
||||
"ETag": etag,
|
||||
"ResponseMetadata": build_response_metadata(),
|
||||
}
|
||||
|
||||
if version_id:
|
||||
response["VersionId"] = version_id
|
||||
|
||||
# DeltaGlider extension - add as Any field
|
||||
if deltaglider_info:
|
||||
response["DeltaGliderInfo"] = deltaglider_info # type: ignore[typeddict-item]
|
||||
|
||||
return response # Returns dict at runtime, PutObjectResponse type at compile-time
|
||||
|
||||
|
||||
def build_get_response(
|
||||
body: Any,
|
||||
content_length: int,
|
||||
etag: str,
|
||||
metadata: dict[str, Any],
|
||||
) -> GetObjectResponse:
|
||||
"""Build GetObjectResponse with full type safety via TypedDict.
|
||||
|
||||
Uses our types.py TypedDict which has proper NotRequired fields.
|
||||
mypy validates all field names, types, and structure.
|
||||
"""
|
||||
# Build as TypedDict - mypy catches typos and type errors!
|
||||
response: GetObjectResponse = {
|
||||
"Body": body,
|
||||
"ContentLength": content_length,
|
||||
"ETag": etag,
|
||||
"Metadata": metadata,
|
||||
"ResponseMetadata": build_response_metadata(),
|
||||
}
|
||||
return response # Returns dict at runtime, GetObjectResponse type at compile-time
|
||||
|
||||
|
||||
def build_list_objects_response(
|
||||
bucket: str,
|
||||
prefix: str,
|
||||
delimiter: str,
|
||||
max_keys: int,
|
||||
contents: list[S3Object],
|
||||
common_prefixes: list[CommonPrefix] | None,
|
||||
is_truncated: bool,
|
||||
next_continuation_token: str | None,
|
||||
continuation_token: str | None,
|
||||
) -> ListObjectsV2Response:
|
||||
"""Build ListObjectsV2Response with full type safety via TypedDict.
|
||||
|
||||
Uses our types.py TypedDict which has proper NotRequired fields.
|
||||
mypy validates all field names, types, and structure.
|
||||
"""
|
||||
# Build as TypedDict - mypy catches typos and type errors!
|
||||
response: ListObjectsV2Response = {
|
||||
"IsTruncated": is_truncated,
|
||||
"Contents": contents,
|
||||
"Name": bucket,
|
||||
"Prefix": prefix,
|
||||
"Delimiter": delimiter,
|
||||
"MaxKeys": max_keys,
|
||||
"KeyCount": len(contents),
|
||||
"ResponseMetadata": build_response_metadata(),
|
||||
}
|
||||
|
||||
# Add optional fields
|
||||
if common_prefixes:
|
||||
response["CommonPrefixes"] = common_prefixes
|
||||
|
||||
if next_continuation_token:
|
||||
response["NextContinuationToken"] = next_continuation_token
|
||||
|
||||
if continuation_token:
|
||||
response["ContinuationToken"] = continuation_token
|
||||
|
||||
return response # Returns dict at runtime, ListObjectsV2Response type at compile-time
|
||||
|
||||
|
||||
def build_delete_response(
|
||||
delete_marker: bool = False,
|
||||
status_code: int = 204,
|
||||
deltaglider_info: dict[str, Any] | None = None,
|
||||
) -> DeleteObjectResponse:
|
||||
"""Build DeleteObjectResponse with full type safety via TypedDict.
|
||||
|
||||
Uses our types.py TypedDict which has proper NotRequired fields.
|
||||
mypy validates all field names, types, and structure.
|
||||
"""
|
||||
# Build as TypedDict - mypy catches typos and type errors!
|
||||
response: DeleteObjectResponse = {
|
||||
"DeleteMarker": delete_marker,
|
||||
"ResponseMetadata": build_response_metadata(status_code),
|
||||
}
|
||||
|
||||
# DeltaGlider extension
|
||||
if deltaglider_info:
|
||||
response["DeltaGliderInfo"] = deltaglider_info # type: ignore[typeddict-item]
|
||||
|
||||
return response # Returns dict at runtime, DeleteObjectResponse type at compile-time
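A short usage sketch of the builders; the ETag and the DeltaGliderInfo payload below are illustrative values, not fields the library mandates:

```python
from deltaglider.response_builders import build_put_response

response = build_put_response(
    '"9b2cf535f27731c974343645a3985328"',
    deltaglider_info={"is_delta": True, "delta_ratio": 0.03},  # example payload
)
assert response["ETag"].strip('"') == "9b2cf535f27731c974343645a3985328"
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
```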
|
||||
355
src/deltaglider/types.py
Normal file
@@ -0,0 +1,355 @@
|
||||
"""Type definitions for boto3-compatible responses.
|
||||
|
||||
These TypedDict definitions provide type hints for DeltaGlider's boto3-compatible
|
||||
responses. All methods return plain `dict[str, Any]` at runtime for maximum
|
||||
flexibility and boto3 compatibility.
|
||||
|
||||
## Basic Usage (Recommended)
|
||||
|
||||
Use DeltaGlider with simple dict access - no type imports needed:
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Returns plain dict - 100% boto3 compatible
|
||||
response = client.put_object(Bucket='my-bucket', Key='file.zip', Body=data)
|
||||
print(response['ETag'])
|
||||
|
||||
# List objects with dict access
|
||||
listing = client.list_objects(Bucket='my-bucket')
|
||||
for obj in listing['Contents']:
|
||||
print(f"{obj['Key']}: {obj['Size']} bytes")
|
||||
```
|
||||
|
||||
## Optional Type Hints
|
||||
|
||||
For IDE autocomplete and type checking, you can use our convenience TypedDicts:
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
from deltaglider.types import PutObjectResponse, ListObjectsV2Response
|
||||
|
||||
client = create_client()
|
||||
response: PutObjectResponse = client.put_object(...) # IDE autocomplete
|
||||
listing: ListObjectsV2Response = client.list_objects(...)
|
||||
```
|
||||
|
||||
## Advanced: boto3-stubs Integration
|
||||
|
||||
For strictest type checking (requires boto3-stubs installation):
|
||||
|
||||
```bash
|
||||
pip install boto3-stubs[s3]
|
||||
```
|
||||
|
||||
```python
|
||||
from mypy_boto3_s3.type_defs import PutObjectOutputTypeDef
|
||||
response: PutObjectOutputTypeDef = client.put_object(...)
|
||||
```
|
||||
|
||||
**Note**: boto3-stubs TypeDefs are very strict and require ALL optional fields.
|
||||
DeltaGlider returns partial dicts for better boto3 compatibility, so boto3-stubs
|
||||
types may show false positive errors. Use `dict[str, Any]` or our TypedDicts instead.
|
||||
|
||||
## Design Philosophy
|
||||
|
||||
DeltaGlider returns `dict[str, Any]` from all boto3-compatible methods because:
|
||||
1. **Flexibility**: boto3 responses vary by service and operation
|
||||
2. **Compatibility**: Exact match with boto3 runtime behavior
|
||||
3. **Simplicity**: No complex type dependencies for users
|
||||
4. **Optional Typing**: Users choose their preferred level of type safety
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Any, Literal, NotRequired, TypedDict
|
||||
|
||||
# ============================================================================
|
||||
# S3 Object Types
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class S3Object(TypedDict):
|
||||
"""An S3 object returned in list operations.
|
||||
|
||||
Compatible with boto3's S3.Client.list_objects_v2() response Contents.
|
||||
"""
|
||||
|
||||
Key: str
|
||||
Size: int
|
||||
LastModified: datetime
|
||||
ETag: NotRequired[str]
|
||||
StorageClass: NotRequired[str]
|
||||
Owner: NotRequired[dict[str, str]]
|
||||
Metadata: NotRequired[dict[str, str]]
|
||||
|
||||
|
||||
class CommonPrefix(TypedDict):
|
||||
"""A common prefix (directory) in S3 listing.
|
||||
|
||||
Compatible with boto3's S3.Client.list_objects_v2() response CommonPrefixes.
|
||||
"""
|
||||
|
||||
Prefix: str
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Response Metadata (used in all responses)
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class ResponseMetadata(TypedDict):
|
||||
"""Metadata about the API response.
|
||||
|
||||
Compatible with all boto3 responses.
|
||||
"""
|
||||
|
||||
RequestId: NotRequired[str]
|
||||
HostId: NotRequired[str]
|
||||
HTTPStatusCode: int
|
||||
HTTPHeaders: NotRequired[dict[str, str]]
|
||||
RetryAttempts: NotRequired[int]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# List Operations Response Types
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class ListObjectsV2Response(TypedDict):
|
||||
"""Response from list_objects_v2 operation.
|
||||
|
||||
100% compatible with boto3's S3.Client.list_objects_v2() response.
|
||||
|
||||
Example:
|
||||
```python
|
||||
client = create_client()
|
||||
response: ListObjectsV2Response = client.list_objects(
|
||||
Bucket='my-bucket',
|
||||
Prefix='path/',
|
||||
Delimiter='/'
|
||||
)
|
||||
|
||||
for obj in response['Contents']:
|
||||
print(f"{obj['Key']}: {obj['Size']} bytes")
|
||||
|
||||
for prefix in response.get('CommonPrefixes', []):
|
||||
print(f"Directory: {prefix['Prefix']}")
|
||||
```
|
||||
"""
|
||||
|
||||
Contents: list[S3Object]
|
||||
Name: NotRequired[str] # Bucket name
|
||||
Prefix: NotRequired[str]
|
||||
Delimiter: NotRequired[str]
|
||||
MaxKeys: NotRequired[int]
|
||||
CommonPrefixes: NotRequired[list[CommonPrefix]]
|
||||
EncodingType: NotRequired[str]
|
||||
KeyCount: NotRequired[int]
|
||||
ContinuationToken: NotRequired[str]
|
||||
NextContinuationToken: NotRequired[str]
|
||||
StartAfter: NotRequired[str]
|
||||
IsTruncated: NotRequired[bool]
|
||||
ResponseMetadata: NotRequired[ResponseMetadata]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Put/Get/Delete Response Types
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class PutObjectResponse(TypedDict):
|
||||
"""Response from put_object operation.
|
||||
|
||||
Compatible with boto3's S3.Client.put_object() response.
|
||||
"""
|
||||
|
||||
ETag: str
|
||||
VersionId: NotRequired[str]
|
||||
ServerSideEncryption: NotRequired[str]
|
||||
ResponseMetadata: NotRequired[ResponseMetadata]
|
||||
|
||||
|
||||
class GetObjectResponse(TypedDict):
|
||||
"""Response from get_object operation.
|
||||
|
||||
Compatible with boto3's S3.Client.get_object() response.
|
||||
"""
|
||||
|
||||
Body: Any # StreamingBody in boto3, bytes in DeltaGlider
|
||||
ContentLength: int
|
||||
ContentType: NotRequired[str]
|
||||
ETag: NotRequired[str]
|
||||
LastModified: NotRequired[datetime]
|
||||
Metadata: NotRequired[dict[str, str]]
|
||||
VersionId: NotRequired[str]
|
||||
StorageClass: NotRequired[str]
|
||||
ResponseMetadata: NotRequired[ResponseMetadata]
|
||||
|
||||
|
||||
class DeleteObjectResponse(TypedDict):
|
||||
"""Response from delete_object operation.
|
||||
|
||||
Compatible with boto3's S3.Client.delete_object() response.
|
||||
"""
|
||||
|
||||
DeleteMarker: NotRequired[bool]
|
||||
    VersionId: NotRequired[str]
    ResponseMetadata: NotRequired[ResponseMetadata]


class DeletedObject(TypedDict):
    """A successfully deleted object.

    Compatible with boto3's S3.Client.delete_objects() response Deleted.
    """

    Key: str
    VersionId: NotRequired[str]
    DeleteMarker: NotRequired[bool]
    DeleteMarkerVersionId: NotRequired[str]


class DeleteError(TypedDict):
    """An error that occurred during deletion.

    Compatible with boto3's S3.Client.delete_objects() response Errors.
    """

    Key: str
    Code: str
    Message: str
    VersionId: NotRequired[str]


class DeleteObjectsResponse(TypedDict):
    """Response from delete_objects operation.

    Compatible with boto3's S3.Client.delete_objects() response.
    """

    Deleted: NotRequired[list[DeletedObject]]
    Errors: NotRequired[list[DeleteError]]
    ResponseMetadata: NotRequired[ResponseMetadata]


# ============================================================================
# Head Object Response
# ============================================================================


class HeadObjectResponse(TypedDict):
    """Response from head_object operation.

    Compatible with boto3's S3.Client.head_object() response.
    """

    ContentLength: int
    ContentType: NotRequired[str]
    ETag: NotRequired[str]
    LastModified: NotRequired[datetime]
    Metadata: NotRequired[dict[str, str]]
    VersionId: NotRequired[str]
    StorageClass: NotRequired[str]
    ResponseMetadata: NotRequired[ResponseMetadata]


# ============================================================================
# Bucket Operations
# ============================================================================


class Bucket(TypedDict):
    """An S3 bucket.

    Compatible with boto3's S3.Client.list_buckets() response Buckets.
    """

    Name: str
    CreationDate: datetime


class ListBucketsResponse(TypedDict):
    """Response from list_buckets operation.

    Compatible with boto3's S3.Client.list_buckets() response.
    """

    Buckets: list[Bucket]
    Owner: NotRequired[dict[str, str]]
    ResponseMetadata: NotRequired[ResponseMetadata]


class CreateBucketResponse(TypedDict):
    """Response from create_bucket operation.

    Compatible with boto3's S3.Client.create_bucket() response.
    """

    Location: NotRequired[str]
    ResponseMetadata: NotRequired[ResponseMetadata]


# ============================================================================
# Multipart Upload Types
# ============================================================================


class CompletedPart(TypedDict):
    """A completed part in a multipart upload."""

    PartNumber: int
    ETag: str


class CompleteMultipartUploadResponse(TypedDict):
    """Response from complete_multipart_upload operation."""

    Location: NotRequired[str]
    Bucket: NotRequired[str]
    Key: NotRequired[str]
    ETag: NotRequired[str]
    VersionId: NotRequired[str]
    ResponseMetadata: NotRequired[ResponseMetadata]


# ============================================================================
# Copy Operations
# ============================================================================


class CopyObjectResponse(TypedDict):
    """Response from copy_object operation.

    Compatible with boto3's S3.Client.copy_object() response.
    """

    CopyObjectResult: NotRequired[dict[str, Any]]
    ETag: NotRequired[str]
    LastModified: NotRequired[datetime]
    VersionId: NotRequired[str]
    ResponseMetadata: NotRequired[ResponseMetadata]


# ============================================================================
# Type Aliases for Convenience
# ============================================================================

# Common parameter types
BucketName = str
ObjectKey = str
Prefix = str
Delimiter = str

# Storage class options
StorageClass = Literal[
    "STANDARD",
    "REDUCED_REDUNDANCY",
    "STANDARD_IA",
    "ONEZONE_IA",
    "INTELLIGENT_TIERING",
    "GLACIER",
    "DEEP_ARCHIVE",
    "GLACIER_IR",
]
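These TypedDicts only describe the shape of boto3-style responses; at runtime they are plain dicts, so optional (NotRequired) keys may be absent. A minimal consumption sketch, assuming the types are importable from deltaglider.client_models (the module name and the summarize_delete helper are illustrative, not confirmed by this diff):

from deltaglider.client_models import DeleteObjectsResponse  # assumed import path

def summarize_delete(response: DeleteObjectsResponse) -> str:
    # NotRequired keys may be missing, so read them with .get() and a default.
    deleted = response.get("Deleted", [])
    errors = response.get("Errors", [])
    return f"{len(deleted)} deleted, {len(errors)} failed"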
@@ -8,7 +8,7 @@ from unittest.mock import Mock
import pytest

from deltaglider.adapters import (
    FsCacheAdapter,
    ContentAddressedCache,
    NoopMetricsAdapter,
    Sha256Adapter,
    StdLoggerAdapter,
@@ -59,9 +59,9 @@ def real_hasher():

@pytest.fixture
def cache_adapter(temp_dir, real_hasher):
    """Create filesystem cache adapter."""
    """Create content-addressed storage cache adapter."""
    cache_dir = temp_dir / "cache"
    return FsCacheAdapter(cache_dir, real_hasher)
    return ContentAddressedCache(cache_dir, real_hasher)


@pytest.fixture

@@ -130,17 +130,26 @@ class TestSyncCommand:

        # Mock service methods
        mock_service.storage.list.return_value = []  # No existing files
        mock_service.put.return_value = PutSummary(
            operation="create_reference",
            bucket="test-bucket",
            key="backup/file.zip.delta",
            original_name="file.zip",
            file_size=8,
            file_sha256="ghi789",
            delta_size=None,
            delta_ratio=None,
            ref_key=None,
        )
        # Mock list_objects to raise NotImplementedError so it falls back to list()
        mock_service.storage.list_objects.side_effect = NotImplementedError()

        # Mock service.put to avoid actual execution
        def mock_put(local_path, delta_space, max_ratio=None):
            return PutSummary(
                operation="create_reference",
                bucket="test-bucket",
                key=f"{delta_space.prefix}/{local_path.name}.delta"
                if delta_space.prefix
                else f"{local_path.name}.delta",
                original_name=local_path.name,
                file_size=local_path.stat().st_size,
                file_sha256="ghi789",
                delta_size=None,
                delta_ratio=None,
                ref_key=None,
            )

        mock_service.put.side_effect = mock_put

        with patch("deltaglider.app.cli.main.create_service", return_value=mock_service):
            result = runner.invoke(cli, ["sync", str(test_dir), "s3://test-bucket/backup/"])
@@ -175,6 +184,8 @@ class TestSyncCommand:
                metadata={},
            ),
        ]
        # Mock list_objects to raise NotImplementedError so it falls back to list()
        mock_service.storage.list_objects.side_effect = NotImplementedError()
        mock_service.storage.head.side_effect = [
            None,  # file1.zip doesn't exist
            Mock(),  # file1.zip.delta exists

@@ -1,11 +1,13 @@
"""Tests for bucket management APIs."""

from typing import Any
from unittest.mock import Mock

import pytest

from deltaglider.app.cli.main import create_service
from deltaglider.client import DeltaGliderClient
from deltaglider.client_models import BucketStats


class TestBucketManagement:
@@ -123,6 +125,48 @@ class TestBucketManagement:
        assert response["Buckets"] == []
        assert response["ResponseMetadata"]["HTTPStatusCode"] == 200

    def test_list_buckets_includes_cached_stats(self):
        """Bucket list should merge cached stats when available."""
        service = create_service()
        mock_storage = Mock()
        service.storage = mock_storage

        mock_boto3_client = Mock()
        mock_boto3_client.list_buckets.return_value = {
            "Buckets": [
                {"Name": "bucket1", "CreationDate": "2025-01-01T00:00:00Z"},
                {"Name": "bucket2", "CreationDate": "2025-01-02T00:00:00Z"},
            ],
            "Owner": {"DisplayName": "test-user", "ID": "12345"},
        }
        mock_storage.client = mock_boto3_client

        client = DeltaGliderClient(service)

        cached_stats = BucketStats(
            bucket="bucket1",
            object_count=10,
            total_size=1000,
            compressed_size=600,
            space_saved=400,
            average_compression_ratio=0.4,
            delta_objects=6,
            direct_objects=4,
        )
        client._store_bucket_stats_cache("bucket1", mode="detailed", stats=cached_stats)

        response = client.list_buckets()

        bucket1 = next(bucket for bucket in response["Buckets"] if bucket["Name"] == "bucket1")
        assert bucket1["DeltaGliderStats"]["Cached"] is True
        assert bucket1["DeltaGliderStats"]["Detailed"] is True
        assert bucket1["DeltaGliderStats"]["Mode"] == "detailed"
        assert bucket1["DeltaGliderStats"]["ObjectCount"] == cached_stats.object_count
        assert bucket1["DeltaGliderStats"]["TotalSize"] == cached_stats.total_size

        bucket2 = next(bucket for bucket in response["Buckets"] if bucket["Name"] == "bucket2")
        assert "DeltaGliderStats" not in bucket2

    def test_delete_bucket_success(self):
        """Test deleting a bucket successfully."""
        service = create_service()
@@ -178,6 +222,71 @@ class TestBucketManagement:
        with pytest.raises(RuntimeError, match="Failed to delete bucket"):
            client.delete_bucket(Bucket="full-bucket")

    def test_get_bucket_stats_caches_per_session(self, monkeypatch):
        """Verify bucket stats are cached within the client session."""
        service = create_service()
        mock_storage = Mock()
        service.storage = mock_storage

        mock_storage.client = Mock()

        client = DeltaGliderClient(service)

        quick_stats = BucketStats(
            bucket="bucket1",
            object_count=5,
            total_size=500,
            compressed_size=300,
            space_saved=200,
            average_compression_ratio=0.4,
            delta_objects=3,
            direct_objects=2,
        )
        detailed_stats = BucketStats(
            bucket="bucket1",
            object_count=5,
            total_size=520,
            compressed_size=300,
            space_saved=220,
            average_compression_ratio=0.423,
            delta_objects=3,
            direct_objects=2,
        )

        call_count = {"value": 0}

        def fake_get_bucket_stats(
            _: Any, bucket: str, mode: str, use_cache: bool = True, refresh_cache: bool = False
        ) -> BucketStats:
            call_count["value"] += 1
            assert bucket == "bucket1"
            if mode == "detailed":
                return detailed_stats
            if mode == "sampled":
                return detailed_stats  # sampled treated as detailed for cache propagation
            return quick_stats

        monkeypatch.setattr("deltaglider.client._get_bucket_stats", fake_get_bucket_stats)

        # First call should invoke underlying function
        result_quick = client.get_bucket_stats("bucket1")
        assert result_quick is quick_stats
        assert call_count["value"] == 1

        # Second quick call - caching is now done in _get_bucket_stats (S3-based)
        # So each call goes through _get_bucket_stats (which handles caching internally)
        assert client.get_bucket_stats("bucket1") is quick_stats
        assert call_count["value"] == 2

        # Detailed call triggers new computation
        result_detailed = client.get_bucket_stats("bucket1", mode="detailed")
        assert result_detailed is detailed_stats
        assert call_count["value"] == 3

        # Quick call - each mode has its own cache in _get_bucket_stats
        assert client.get_bucket_stats("bucket1") is quick_stats
        assert call_count["value"] == 4

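The comments in this test describe caching that lives inside _get_bucket_stats and is keyed per mode. A rough illustration of that idea, not the library's actual implementation, might look like this (get_stats_cached and the compute callback are hypothetical):

from deltaglider.client_models import BucketStats  # import path as used in the tests above

_stats_cache: dict[tuple[str, str], BucketStats] = {}

def get_stats_cached(bucket: str, mode: str, compute) -> BucketStats:
    # Each (bucket, mode) pair gets its own entry, so a "quick" result
    # never shadows a "detailed" one; compute() runs only on a cache miss.
    key = (bucket, mode)
    if key not in _stats_cache:
        _stats_cache[key] = compute(bucket, mode)
    return _stats_cache[key]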
def test_bucket_methods_without_boto3_client(self):
|
||||
"""Test that bucket methods raise NotImplementedError when storage doesn't support it."""
|
||||
service = create_service()
|
||||
@@ -199,6 +308,148 @@ class TestBucketManagement:
|
||||
with pytest.raises(NotImplementedError):
|
||||
client.list_buckets()
|
||||
|
||||
def test_put_bucket_acl_with_canned_acl(self):
|
||||
"""Test setting a canned ACL on a bucket."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.put_bucket_acl.return_value = None
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.put_bucket_acl(Bucket="test-bucket", ACL="public-read")
|
||||
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
mock_boto3_client.put_bucket_acl.assert_called_once_with(
|
||||
Bucket="test-bucket", ACL="public-read"
|
||||
)
|
||||
|
||||
def test_put_bucket_acl_with_grants(self):
|
||||
"""Test setting ACL with grant parameters."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.put_bucket_acl.return_value = None
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.put_bucket_acl(
|
||||
Bucket="test-bucket",
|
||||
GrantRead="id=12345",
|
||||
GrantWrite="id=67890",
|
||||
)
|
||||
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
mock_boto3_client.put_bucket_acl.assert_called_once_with(
|
||||
Bucket="test-bucket", GrantRead="id=12345", GrantWrite="id=67890"
|
||||
)
|
||||
|
||||
def test_put_bucket_acl_with_access_control_policy(self):
|
||||
"""Test setting ACL with a full AccessControlPolicy dict."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.put_bucket_acl.return_value = None
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
policy = {
|
||||
"Grants": [
|
||||
{
|
||||
"Grantee": {"Type": "CanonicalUser", "ID": "abc123"},
|
||||
"Permission": "FULL_CONTROL",
|
||||
}
|
||||
],
|
||||
"Owner": {"ID": "abc123"},
|
||||
}
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.put_bucket_acl(Bucket="test-bucket", AccessControlPolicy=policy)
|
||||
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
mock_boto3_client.put_bucket_acl.assert_called_once_with(
|
||||
Bucket="test-bucket", AccessControlPolicy=policy
|
||||
)
|
||||
|
||||
def test_put_bucket_acl_failure(self):
|
||||
"""Test that put_bucket_acl raises RuntimeError on boto3 failure."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.put_bucket_acl.side_effect = Exception("AccessDenied")
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
|
||||
with pytest.raises(RuntimeError, match="Failed to set bucket ACL"):
|
||||
client.put_bucket_acl(Bucket="test-bucket", ACL="public-read")
|
||||
|
||||
def test_put_bucket_acl_no_boto3_client(self):
|
||||
"""Test that put_bucket_acl raises NotImplementedError without boto3 client."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
delattr(mock_storage, "client")
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
client.put_bucket_acl(Bucket="test-bucket", ACL="private")
|
||||
|
||||
def test_get_bucket_acl_success(self):
|
||||
"""Test getting bucket ACL successfully."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
acl_response = {
|
||||
"Owner": {"DisplayName": "test-user", "ID": "abc123"},
|
||||
"Grants": [
|
||||
{
|
||||
"Grantee": {
|
||||
"Type": "CanonicalUser",
|
||||
"DisplayName": "test-user",
|
||||
"ID": "abc123",
|
||||
},
|
||||
"Permission": "FULL_CONTROL",
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.get_bucket_acl.return_value = acl_response
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.get_bucket_acl(Bucket="test-bucket")
|
||||
|
||||
assert response["Owner"]["DisplayName"] == "test-user"
|
||||
assert len(response["Grants"]) == 1
|
||||
assert response["Grants"][0]["Permission"] == "FULL_CONTROL"
|
||||
mock_boto3_client.get_bucket_acl.assert_called_once_with(Bucket="test-bucket")
|
||||
|
||||
def test_get_bucket_acl_failure(self):
|
||||
"""Test that get_bucket_acl raises RuntimeError on boto3 failure."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.get_bucket_acl.side_effect = Exception("NoSuchBucket")
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
|
||||
with pytest.raises(RuntimeError, match="Failed to get bucket ACL"):
|
||||
client.get_bucket_acl(Bucket="nonexistent-bucket")
|
||||
|
||||
def test_complete_bucket_lifecycle(self):
|
||||
"""Test complete bucket lifecycle: create, use, delete."""
|
||||
service = create_service()
|
||||
|
||||
@@ -10,7 +10,6 @@ from deltaglider import create_client
|
||||
from deltaglider.client import (
|
||||
BucketStats,
|
||||
CompressionEstimate,
|
||||
ListObjectsResponse,
|
||||
ObjectInfo,
|
||||
)
|
||||
|
||||
@@ -44,7 +43,15 @@ class MockStorage:
|
||||
if obj_head is not None:
|
||||
yield obj_head
|
||||
|
||||
def list_objects(self, bucket, prefix="", delimiter="", max_keys=1000, start_after=None):
|
||||
def list_objects(
|
||||
self,
|
||||
bucket,
|
||||
prefix="",
|
||||
delimiter="",
|
||||
max_keys=1000,
|
||||
start_after=None,
|
||||
continuation_token=None,
|
||||
):
|
||||
"""Mock list_objects operation for S3 features."""
|
||||
objects = []
|
||||
common_prefixes = set()
|
||||
@@ -125,7 +132,7 @@ class MockStorage:
|
||||
@pytest.fixture
|
||||
def client(tmp_path):
|
||||
"""Create a client with mocked storage."""
|
||||
client = create_client(cache_dir=str(tmp_path / "cache"))
|
||||
client = create_client()
|
||||
|
||||
# Replace storage with mock
|
||||
mock_storage = MockStorage()
|
||||
@@ -157,7 +164,6 @@ class TestCredentialHandling:
|
||||
aws_access_key_id="AKIAIOSFODNN7EXAMPLE",
|
||||
aws_secret_access_key="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
|
||||
region_name="us-west-2",
|
||||
cache_dir=str(tmp_path / "cache"),
|
||||
)
|
||||
|
||||
# Verify the client was created
|
||||
@@ -180,7 +186,6 @@ class TestCredentialHandling:
|
||||
aws_access_key_id="ASIAIOSFODNN7EXAMPLE",
|
||||
aws_secret_access_key="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
|
||||
aws_session_token="FwoGZXIvYXdzEBEaDH...",
|
||||
cache_dir=str(tmp_path / "cache"),
|
||||
)
|
||||
|
||||
assert client is not None
|
||||
@@ -189,7 +194,7 @@ class TestCredentialHandling:
|
||||
def test_create_client_without_credentials_uses_environment(self, tmp_path):
|
||||
"""Test that omitting credentials falls back to environment/IAM."""
|
||||
# This should use boto3's default credential chain
|
||||
client = create_client(cache_dir=str(tmp_path / "cache"))
|
||||
client = create_client()
|
||||
|
||||
assert client is not None
|
||||
assert client.service.storage.client is not None
|
||||
@@ -200,7 +205,6 @@ class TestCredentialHandling:
|
||||
endpoint_url="http://localhost:9000",
|
||||
aws_access_key_id="minioadmin",
|
||||
aws_secret_access_key="minioadmin",
|
||||
cache_dir=str(tmp_path / "cache"),
|
||||
)
|
||||
|
||||
assert client is not None
|
||||
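As the credential-handling tests above show, create_client accepts explicit credentials, a session token, or an endpoint_url for S3-compatible services, and falls back to boto3's default credential chain when nothing is passed. A usage sketch reusing the same placeholder endpoint and keys as the MinIO test:

from deltaglider import create_client

# Local MinIO / LocalStack style endpoint; a real deployment would load
# these values from configuration rather than hard-coding them.
client = create_client(
    endpoint_url="http://localhost:9000",
    aws_access_key_id="minioadmin",
    aws_secret_access_key="minioadmin",
)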
@@ -279,27 +283,35 @@ class TestBoto3Compatibility:
|
||||
assert response["ContentLength"] == len(content)
|
||||
|
||||
def test_list_objects(self, client):
|
||||
"""Test list_objects with various options."""
|
||||
"""Test list_objects with various options (boto3-compatible dict response)."""
|
||||
# List all objects (default: FetchMetadata=False)
|
||||
response = client.list_objects(Bucket="test-bucket")
|
||||
|
||||
assert isinstance(response, ListObjectsResponse)
|
||||
assert response.key_count > 0
|
||||
assert len(response.contents) > 0
|
||||
# Response is now a boto3-compatible dict (not ListObjectsResponse)
|
||||
assert isinstance(response, dict)
|
||||
assert response["KeyCount"] > 0
|
||||
assert len(response["Contents"]) > 0
|
||||
|
||||
# Verify S3Object structure
|
||||
for obj in response["Contents"]:
|
||||
assert "Key" in obj
|
||||
assert "Size" in obj
|
||||
assert "LastModified" in obj
|
||||
assert "Metadata" in obj # DeltaGlider metadata
|
||||
|
||||
# Test with FetchMetadata=True (should only affect delta files)
|
||||
response_with_metadata = client.list_objects(Bucket="test-bucket", FetchMetadata=True)
|
||||
assert isinstance(response_with_metadata, ListObjectsResponse)
|
||||
assert response_with_metadata.key_count > 0
|
||||
assert isinstance(response_with_metadata, dict)
|
||||
assert response_with_metadata["KeyCount"] > 0
|
||||
|
||||
def test_list_objects_with_delimiter(self, client):
|
||||
"""Test list_objects with delimiter for folder simulation."""
|
||||
"""Test list_objects with delimiter for folder simulation (boto3-compatible dict response)."""
|
||||
response = client.list_objects(Bucket="test-bucket", Prefix="", Delimiter="/")
|
||||
|
||||
# Should have common prefixes for folders
|
||||
assert len(response.common_prefixes) > 0
|
||||
assert {"Prefix": "folder1/"} in response.common_prefixes
|
||||
assert {"Prefix": "folder2/"} in response.common_prefixes
|
||||
assert len(response.get("CommonPrefixes", [])) > 0
|
||||
assert {"Prefix": "folder1/"} in response["CommonPrefixes"]
|
||||
assert {"Prefix": "folder2/"} in response["CommonPrefixes"]
|
||||
|
||||
def test_delete_object(self, client):
|
||||
"""Test delete_object."""
|
||||
@@ -430,7 +442,7 @@ class TestDeltaGliderFeatures:
|
||||
|
||||
def test_get_bucket_stats(self, client):
|
||||
"""Test getting bucket statistics."""
|
||||
# Test quick stats (default: detailed_stats=False)
|
||||
# Test quick stats (LIST only)
|
||||
stats = client.get_bucket_stats("test-bucket")
|
||||
|
||||
assert isinstance(stats, BucketStats)
|
||||
@@ -438,8 +450,8 @@ class TestDeltaGliderFeatures:
|
||||
assert stats.total_size > 0
|
||||
assert stats.delta_objects >= 1 # We have archive.zip.delta
|
||||
|
||||
# Test with detailed_stats=True
|
||||
detailed_stats = client.get_bucket_stats("test-bucket", detailed_stats=True)
|
||||
# Test with detailed mode
|
||||
detailed_stats = client.get_bucket_stats("test-bucket", mode="detailed")
|
||||
assert isinstance(detailed_stats, BucketStats)
|
||||
assert detailed_stats.object_count == stats.object_count
|
||||
|
||||
|
||||
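The updated assertions above treat list_objects as returning a boto3-compatible dict rather than a ListObjectsResponse object. A short consumption sketch based only on the keys those tests check:

from deltaglider import create_client

client = create_client()  # storage backend configured as in the fixtures above
response = client.list_objects(Bucket="test-bucket", Prefix="", Delimiter="/")

print(response["KeyCount"])                         # number of keys returned
for obj in response["Contents"]:                    # each entry is a plain dict
    print(obj["Key"], obj["Size"], obj["LastModified"])
for folder in response.get("CommonPrefixes", []):   # "folders" when Delimiter is set
    print(folder["Prefix"])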
tests/integration/test_delete_objects_recursive.py (new file, 548 lines)
@@ -0,0 +1,548 @@
|
||||
"""Comprehensive tests for DeltaGliderClient.delete_objects_recursive() method."""
|
||||
|
||||
from datetime import UTC, datetime
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from deltaglider import create_client
|
||||
from deltaglider.core.models import DeleteResult, RecursiveDeleteResult
|
||||
|
||||
|
||||
class MockStorage:
|
||||
"""Mock storage for testing."""
|
||||
|
||||
def __init__(self):
|
||||
self.objects = {}
|
||||
self.delete_calls = []
|
||||
|
||||
def head(self, key):
|
||||
"""Mock head operation."""
|
||||
from deltaglider.ports.storage import ObjectHead
|
||||
|
||||
if key in self.objects:
|
||||
obj = self.objects[key]
|
||||
return ObjectHead(
|
||||
key=key,
|
||||
size=obj["size"],
|
||||
etag=obj.get("etag", "mock-etag"),
|
||||
last_modified=obj.get("last_modified", datetime.now(UTC)),
|
||||
metadata=obj.get("metadata", {}),
|
||||
)
|
||||
return None
|
||||
|
||||
def list(self, prefix):
|
||||
"""Mock list operation for StoragePort interface."""
|
||||
for key, _obj in self.objects.items():
|
||||
if key.startswith(prefix):
|
||||
obj_head = self.head(key)
|
||||
if obj_head is not None:
|
||||
yield obj_head
|
||||
|
||||
def delete(self, key):
|
||||
"""Mock delete operation."""
|
||||
self.delete_calls.append(key)
|
||||
if key in self.objects:
|
||||
del self.objects[key]
|
||||
return True
|
||||
return False
|
||||
|
||||
def get(self, key):
|
||||
"""Mock get operation."""
|
||||
if key in self.objects:
|
||||
return self.objects[key].get("content", b"mock-content")
|
||||
return None
|
||||
|
||||
def put(self, key, data, metadata=None):
|
||||
"""Mock put operation."""
|
||||
self.objects[key] = {
|
||||
"size": len(data),
|
||||
"content": data,
|
||||
"metadata": metadata or {},
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_storage():
|
||||
"""Create mock storage."""
|
||||
return MockStorage()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def client(tmp_path):
|
||||
"""Create DeltaGliderClient with mock storage."""
|
||||
# Use create_client to get a properly configured client
|
||||
client = create_client()
|
||||
|
||||
# Replace storage with mock
|
||||
mock_storage = MockStorage()
|
||||
client.service.storage = mock_storage
|
||||
|
||||
return client
|
||||
|
||||
|
||||
class TestDeleteObjectsRecursiveBasicFunctionality:
|
||||
"""Test basic functionality of delete_objects_recursive."""
|
||||
|
||||
def test_delete_single_object_with_file_prefix(self, client):
|
||||
"""Test deleting a single object when prefix is a file (no trailing slash)."""
|
||||
# Setup: Add a regular file
|
||||
client.service.storage.objects["test-bucket/file.txt"] = {"size": 100}
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.txt")
|
||||
|
||||
# Verify response structure
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
assert "DeletedCount" in response
|
||||
assert "FailedCount" in response
|
||||
assert "DeltaGliderInfo" in response
|
||||
|
||||
# Verify DeltaGliderInfo structure
|
||||
info = response["DeltaGliderInfo"]
|
||||
assert "DeltasDeleted" in info
|
||||
assert "ReferencesDeleted" in info
|
||||
assert "DirectDeleted" in info
|
||||
assert "OtherDeleted" in info
|
||||
|
||||
def test_delete_directory_with_trailing_slash(self, client):
|
||||
"""Test deleting all objects under a prefix with trailing slash."""
|
||||
# Setup: Add multiple files under a prefix
|
||||
client.service.storage.objects["test-bucket/dir/file1.txt"] = {"size": 100}
|
||||
client.service.storage.objects["test-bucket/dir/file2.txt"] = {"size": 200}
|
||||
client.service.storage.objects["test-bucket/dir/sub/file3.txt"] = {"size": 300}
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="dir/")
|
||||
|
||||
# Verify
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
assert response["DeletedCount"] >= 0
|
||||
assert response["FailedCount"] == 0
|
||||
|
||||
def test_delete_empty_prefix_returns_zero_counts(self, client):
|
||||
"""Test deleting with empty prefix returns zero counts."""
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="")
|
||||
|
||||
# Verify
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
assert response["DeletedCount"] >= 0
|
||||
assert response["FailedCount"] == 0
|
||||
|
||||
|
||||
class TestDeleteObjectsRecursiveDeltaSuffixHandling:
|
||||
"""Test delta suffix fallback logic."""
|
||||
|
||||
def test_delete_file_with_delta_suffix_fallback(self, client):
|
||||
"""Test that delete falls back to .delta suffix if original not found."""
|
||||
# Setup: Add file with .delta suffix
|
||||
client.service.storage.objects["test-bucket/archive.zip.delta"] = {
|
||||
"size": 500,
|
||||
"metadata": {"original_name": "archive.zip"},
|
||||
}
|
||||
|
||||
# Execute: Delete using original name (without .delta)
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="archive.zip")
|
||||
|
||||
# Verify
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
assert "test-bucket/archive.zip.delta" not in client.service.storage.objects
|
||||
|
||||
def test_delete_file_already_with_delta_suffix(self, client):
|
||||
"""Test deleting a file that already has .delta suffix."""
|
||||
# Setup
|
||||
client.service.storage.objects["test-bucket/file.zip.delta"] = {"size": 300}
|
||||
|
||||
# Execute: Delete using .delta suffix directly
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.zip.delta")
|
||||
|
||||
# Verify
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
|
||||
def test_delta_suffix_not_added_for_directory_prefix(self, client):
|
||||
"""Test that .delta suffix is not added when prefix ends with /."""
|
||||
# Setup
|
||||
client.service.storage.objects["test-bucket/dir/file.txt"] = {"size": 100}
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="dir/")
|
||||
|
||||
# Verify - should not attempt to delete "dir/.delta"
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
|
||||
|
||||
class TestDeleteObjectsRecursiveStatisticsAggregation:
|
||||
"""Test statistics aggregation from core service."""
|
||||
|
||||
def test_aggregates_deleted_count_from_service_and_single_deletes(self, client):
|
||||
"""Test that deleted counts are aggregated correctly."""
|
||||
# Setup: Mock service.delete_recursive to return specific counts
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="test/",
|
||||
deleted_count=5,
|
||||
failed_count=0,
|
||||
deltas_deleted=2,
|
||||
references_deleted=1,
|
||||
direct_deleted=2,
|
||||
other_deleted=0,
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="test/")
|
||||
|
||||
# Verify aggregation
|
||||
assert response["DeletedCount"] == 5
|
||||
assert response["FailedCount"] == 0
|
||||
assert response["DeltaGliderInfo"]["DeltasDeleted"] == 2
|
||||
assert response["DeltaGliderInfo"]["ReferencesDeleted"] == 1
|
||||
assert response["DeltaGliderInfo"]["DirectDeleted"] == 2
|
||||
assert response["DeltaGliderInfo"]["OtherDeleted"] == 0
|
||||
|
||||
def test_aggregates_single_delete_counts_with_service_counts(self, client):
|
||||
"""Test that single file deletes are aggregated with service counts."""
|
||||
# Setup: Add file to trigger single delete path
|
||||
client.service.storage.objects["test-bucket/file.txt"] = {"size": 100}
|
||||
|
||||
# Mock service.delete_recursive to return additional counts
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="file.txt",
|
||||
deleted_count=3,
|
||||
failed_count=0,
|
||||
deltas_deleted=1,
|
||||
references_deleted=0,
|
||||
direct_deleted=2,
|
||||
other_deleted=0,
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.txt")
|
||||
|
||||
# Verify that counts include both single delete and service delete
|
||||
assert response["DeletedCount"] >= 3 # At least service count
|
||||
assert response["DeltaGliderInfo"]["DeltasDeleted"] >= 1
|
||||
|
||||
|
||||
class TestDeleteObjectsRecursiveErrorHandling:
|
||||
"""Test error handling and error aggregation."""
|
||||
|
||||
def test_single_delete_error_captured_in_errors_list(self, client):
|
||||
"""Test that errors from single deletes are captured."""
|
||||
# Setup: Add file
|
||||
client.service.storage.objects["test-bucket/file.txt"] = {"size": 100}
|
||||
|
||||
# Mock delete_with_delta_suffix to raise exception
|
||||
with patch("deltaglider.client.delete_with_delta_suffix") as mock_delete:
|
||||
mock_delete.side_effect = RuntimeError("Simulated delete error")
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.txt")
|
||||
|
||||
# Verify error captured
|
||||
assert response["FailedCount"] > 0
|
||||
assert "Errors" in response
|
||||
assert any("Simulated delete error" in err for err in response["Errors"])
|
||||
|
||||
def test_service_errors_propagated_in_response(self, client):
|
||||
"""Test that errors from service.delete_recursive are propagated."""
|
||||
# Mock service to return errors
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="test/",
|
||||
deleted_count=2,
|
||||
failed_count=1,
|
||||
deltas_deleted=2,
|
||||
references_deleted=0,
|
||||
direct_deleted=0,
|
||||
other_deleted=0,
|
||||
errors=["Error deleting object1", "Error deleting object2"],
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="test/")
|
||||
|
||||
# Verify
|
||||
assert response["FailedCount"] == 1
|
||||
assert "Errors" in response
|
||||
assert "Error deleting object1" in response["Errors"]
|
||||
assert "Error deleting object2" in response["Errors"]
|
||||
|
||||
def test_combines_single_and_service_errors(self, client):
|
||||
"""Test that errors from both single deletes and service are combined."""
|
||||
# Setup
|
||||
client.service.storage.objects["test-bucket/file.txt"] = {"size": 100}
|
||||
|
||||
# Mock service to also return errors
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="file.txt",
|
||||
deleted_count=1,
|
||||
failed_count=1,
|
||||
deltas_deleted=0,
|
||||
references_deleted=0,
|
||||
direct_deleted=0,
|
||||
other_deleted=0,
|
||||
errors=["Service delete error"],
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Mock delete_with_delta_suffix to raise exception
|
||||
with patch("deltaglider.client.delete_with_delta_suffix") as mock_delete:
|
||||
mock_delete.side_effect = RuntimeError("Single delete error")
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.txt")
|
||||
|
||||
# Verify both errors present
|
||||
assert "Errors" in response
|
||||
errors_str = " ".join(response["Errors"])
|
||||
assert "Single delete error" in errors_str
|
||||
assert "Service delete error" in errors_str
|
||||
|
||||
|
||||
class TestDeleteObjectsRecursiveWarningsHandling:
|
||||
"""Test warning aggregation."""
|
||||
|
||||
def test_service_warnings_propagated_in_response(self, client):
|
||||
"""Test that warnings from service.delete_recursive are propagated."""
|
||||
# Mock service to return warnings
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="test/",
|
||||
deleted_count=3,
|
||||
failed_count=0,
|
||||
deltas_deleted=2,
|
||||
references_deleted=1,
|
||||
direct_deleted=0,
|
||||
other_deleted=0,
|
||||
warnings=["Reference deleted, 2 dependent deltas invalidated"],
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="test/")
|
||||
|
||||
# Verify
|
||||
assert "Warnings" in response
|
||||
assert "Reference deleted, 2 dependent deltas invalidated" in response["Warnings"]
|
||||
|
||||
def test_single_delete_warnings_propagated(self, client):
|
||||
"""Test that warnings from single deletes are captured."""
|
||||
# Setup
|
||||
client.service.storage.objects["test-bucket/ref.bin"] = {"size": 100}
|
||||
|
||||
# Mock service
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="ref.bin",
|
||||
deleted_count=0,
|
||||
failed_count=0,
|
||||
deltas_deleted=0,
|
||||
references_deleted=0,
|
||||
direct_deleted=0,
|
||||
other_deleted=0,
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Mock delete_with_delta_suffix to return warnings
|
||||
with patch("deltaglider.client.delete_with_delta_suffix") as mock_delete:
|
||||
mock_delete.return_value = (
|
||||
"ref.bin",
|
||||
DeleteResult(
|
||||
key="ref.bin",
|
||||
bucket="test-bucket",
|
||||
deleted=True,
|
||||
type="reference",
|
||||
warnings=["Warning from single delete"],
|
||||
),
|
||||
)
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="ref.bin")
|
||||
|
||||
# Verify
|
||||
assert "Warnings" in response
|
||||
assert "Warning from single delete" in response["Warnings"]
|
||||
|
||||
|
||||
class TestDeleteObjectsRecursiveSingleDeleteDetails:
|
||||
"""Test SingleDeletes detail tracking."""
|
||||
|
||||
def test_single_delete_details_included_for_file_prefix(self, client):
|
||||
"""Test that SingleDeletes details are included when deleting file prefix."""
|
||||
# Setup
|
||||
client.service.storage.objects["test-bucket/file.txt"] = {"size": 100}
|
||||
|
||||
# Mock service
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="file.txt",
|
||||
deleted_count=0,
|
||||
failed_count=0,
|
||||
deltas_deleted=0,
|
||||
references_deleted=0,
|
||||
direct_deleted=0,
|
||||
other_deleted=0,
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Mock delete_with_delta_suffix
|
||||
with patch("deltaglider.client.delete_with_delta_suffix") as mock_delete:
|
||||
mock_delete.return_value = (
|
||||
"file.txt",
|
||||
DeleteResult(
|
||||
key="file.txt",
|
||||
bucket="test-bucket",
|
||||
deleted=True,
|
||||
type="direct",
|
||||
dependent_deltas=0,
|
||||
),
|
||||
)
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.txt")
|
||||
|
||||
# Verify
|
||||
assert "SingleDeletes" in response["DeltaGliderInfo"]
|
||||
single_deletes = response["DeltaGliderInfo"]["SingleDeletes"]
|
||||
assert len(single_deletes) > 0
|
||||
assert single_deletes[0]["Key"] == "file.txt"
|
||||
assert single_deletes[0]["Type"] == "direct"
|
||||
assert "DependentDeltas" in single_deletes[0]
|
||||
assert "Warnings" in single_deletes[0]
|
||||
|
||||
def test_single_delete_includes_stored_key_when_different(self, client):
|
||||
"""Test that StoredKey is included when actual key differs from requested."""
|
||||
# Setup
|
||||
client.service.storage.objects["test-bucket/file.zip.delta"] = {"size": 200}
|
||||
|
||||
# Mock delete_with_delta_suffix to return different key
|
||||
from deltaglider import client_delete_helpers
|
||||
|
||||
original_delete = client_delete_helpers.delete_with_delta_suffix
|
||||
|
||||
def mock_delete(service, bucket, key):
|
||||
actual_key = "file.zip.delta" if key == "file.zip" else key
|
||||
return (
|
||||
actual_key,
|
||||
DeleteResult(
|
||||
key=actual_key,
|
||||
bucket=bucket,
|
||||
deleted=True,
|
||||
type="delta",
|
||||
dependent_deltas=0,
|
||||
),
|
||||
)
|
||||
|
||||
client_delete_helpers.delete_with_delta_suffix = mock_delete
|
||||
|
||||
# Mock service
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="file.zip",
|
||||
deleted_count=0,
|
||||
failed_count=0,
|
||||
deltas_deleted=0,
|
||||
references_deleted=0,
|
||||
direct_deleted=0,
|
||||
other_deleted=0,
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
try:
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.zip")
|
||||
|
||||
# Verify
|
||||
assert "SingleDeletes" in response["DeltaGliderInfo"]
|
||||
single_deletes = response["DeltaGliderInfo"]["SingleDeletes"]
|
||||
if len(single_deletes) > 0:
|
||||
# If actual key differs, StoredKey should be present
|
||||
detail = single_deletes[0]
|
||||
if detail["Key"] != "file.zip.delta":
|
||||
assert "StoredKey" in detail
|
||||
finally:
|
||||
client_delete_helpers.delete_with_delta_suffix = original_delete
|
||||
|
||||
|
||||
class TestDeleteObjectsRecursiveEdgeCases:
|
||||
"""Test edge cases and boundary conditions."""
|
||||
|
||||
def test_nonexistent_prefix_returns_zero_counts(self, client):
|
||||
"""Test deleting nonexistent prefix returns zero counts."""
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="nonexistent/path/")
|
||||
|
||||
# Verify
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
assert response["DeletedCount"] >= 0
|
||||
assert response["FailedCount"] == 0
|
||||
|
||||
def test_duplicate_candidates_handled_correctly(self, client):
|
||||
"""Test that duplicate delete candidates are handled correctly."""
|
||||
# Setup: This tests the seen_candidates logic
|
||||
client.service.storage.objects["test-bucket/file.delta"] = {"size": 100}
|
||||
|
||||
# Execute: Should not attempt to delete "file.delta" twice
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.delta")
|
||||
|
||||
# Verify no errors
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
|
||||
def test_unknown_result_type_categorized_as_other(self, client):
|
||||
"""Test that unknown result types are categorized as 'other'."""
|
||||
# Setup
|
||||
client.service.storage.objects["test-bucket/file.txt"] = {"size": 100}
|
||||
|
||||
# Mock service
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="file.txt",
|
||||
deleted_count=0,
|
||||
failed_count=0,
|
||||
deltas_deleted=0,
|
||||
references_deleted=0,
|
||||
direct_deleted=0,
|
||||
other_deleted=0,
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Mock delete_with_delta_suffix to return unknown type
|
||||
with patch("deltaglider.client.delete_with_delta_suffix") as mock_delete:
|
||||
mock_delete.return_value = (
|
||||
"file.txt",
|
||||
DeleteResult(
|
||||
key="file.txt",
|
||||
bucket="test-bucket",
|
||||
deleted=True,
|
||||
type="unknown_type", # Not in single_counts keys
|
||||
dependent_deltas=0,
|
||||
),
|
||||
)
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.txt")
|
||||
|
||||
# Verify it's categorized as "other"
|
||||
assert response["DeltaGliderInfo"]["OtherDeleted"] >= 1
|
||||
# Also verify the detail shows the unknown type
|
||||
if "SingleDeletes" in response["DeltaGliderInfo"]:
|
||||
assert response["DeltaGliderInfo"]["SingleDeletes"][0]["Type"] == "unknown_type"
|
||||
|
||||
def test_kwargs_parameter_accepted(self, client):
|
||||
"""Test that additional kwargs are accepted without error."""
|
||||
# Execute with extra parameters
|
||||
response = client.delete_objects_recursive(
|
||||
Bucket="test-bucket",
|
||||
Prefix="test/",
|
||||
ExtraParam="value", # Should be ignored
|
||||
AnotherParam=123,
|
||||
)
|
||||
|
||||
# Verify no errors
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
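Taken together, the tests above pin down the response shape of delete_objects_recursive. A hedged sketch of how a caller might consume it, using only keys asserted in those tests:

from deltaglider import create_client

client = create_client()  # storage backend configured as in the fixtures above
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="backup/")

if response["FailedCount"]:
    for err in response.get("Errors", []):
        print("failed:", err)

info = response["DeltaGliderInfo"]
print(
    f"deleted {response['DeletedCount']} objects "
    f"({info['DeltasDeleted']} deltas, {info['ReferencesDeleted']} references, "
    f"{info['DirectDeleted']} direct, {info['OtherDeleted']} other)"
)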
@@ -53,8 +53,11 @@ class TestSDKFiltering:
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.list_objects(Bucket="test-bucket", Prefix="releases/")
|
||||
|
||||
# Response is now a boto3-compatible dict
|
||||
contents = response["Contents"]
|
||||
|
||||
# Verify .delta suffix is stripped
|
||||
keys = [obj.key for obj in response.contents]
|
||||
keys = [obj["Key"] for obj in contents]
|
||||
assert "releases/app-v1.zip" in keys
|
||||
assert "releases/app-v2.zip" in keys
|
||||
assert "releases/README.md" in keys
|
||||
@@ -63,8 +66,10 @@ class TestSDKFiltering:
|
||||
for key in keys:
|
||||
assert not key.endswith(".delta"), f"Found .delta suffix in: {key}"
|
||||
|
||||
# Verify is_delta flag is set correctly
|
||||
delta_objects = [obj for obj in response.contents if obj.is_delta]
|
||||
# Verify is_delta flag is set correctly in Metadata
|
||||
delta_objects = [
|
||||
obj for obj in contents if obj.get("Metadata", {}).get("deltaglider-is-delta") == "true"
|
||||
]
|
||||
assert len(delta_objects) == 2
|
||||
|
||||
def test_list_objects_filters_reference_bin(self):
|
||||
@@ -106,15 +111,18 @@ class TestSDKFiltering:
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.list_objects(Bucket="test-bucket", Prefix="releases/")
|
||||
|
||||
# Response is now a boto3-compatible dict
|
||||
contents = response["Contents"]
|
||||
|
||||
# Verify NO reference.bin files in output
|
||||
keys = [obj.key for obj in response.contents]
|
||||
keys = [obj["Key"] for obj in contents]
|
||||
for key in keys:
|
||||
assert not key.endswith("reference.bin"), f"Found reference.bin in: {key}"
|
||||
|
||||
# Should only have the app.zip (with .delta stripped)
|
||||
assert len(response.contents) == 1
|
||||
assert response.contents[0].key == "releases/app.zip"
|
||||
assert response.contents[0].is_delta is True
|
||||
assert len(contents) == 1
|
||||
assert contents[0]["Key"] == "releases/app.zip"
|
||||
assert contents[0].get("Metadata", {}).get("deltaglider-is-delta") == "true"
|
||||
|
||||
def test_list_objects_combined_filtering(self):
|
||||
"""Test filtering of both .delta and reference.bin together."""
|
||||
@@ -170,12 +178,15 @@ class TestSDKFiltering:
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.list_objects(Bucket="test-bucket", Prefix="data/")
|
||||
|
||||
# Response is now a boto3-compatible dict
|
||||
contents = response["Contents"]
|
||||
|
||||
# Should filter out 2 reference.bin files
|
||||
# Should strip .delta from 3 files
|
||||
# Should keep 1 regular file as-is
|
||||
assert len(response.contents) == 4 # 3 deltas + 1 regular file
|
||||
assert len(contents) == 4 # 3 deltas + 1 regular file
|
||||
|
||||
keys = [obj.key for obj in response.contents]
|
||||
keys = [obj["Key"] for obj in contents]
|
||||
expected_keys = ["data/file1.zip", "data/file2.zip", "data/file3.txt", "data/sub/app.jar"]
|
||||
assert sorted(keys) == sorted(expected_keys)
|
||||
|
||||
@@ -232,12 +243,12 @@ class TestSingleDeleteCleanup:
|
||||
result = service.delete(ObjectKey(bucket="test-bucket", key="releases/app.zip.delta"))
|
||||
|
||||
# Verify delta was deleted
|
||||
assert result["deleted"] is True
|
||||
assert result["type"] == "delta"
|
||||
assert result.deleted is True
|
||||
assert result.type == "delta"
|
||||
|
||||
# Verify reference.bin cleanup was triggered
|
||||
assert "cleaned_reference" in result
|
||||
assert result["cleaned_reference"] == "releases/reference.bin"
|
||||
assert result.cleaned_reference is not None
|
||||
assert result.cleaned_reference == "releases/reference.bin"
|
||||
|
||||
# Verify both files were deleted
|
||||
assert mock_storage.delete.call_count == 2
|
||||
@@ -284,11 +295,11 @@ class TestSingleDeleteCleanup:
|
||||
result = service.delete(ObjectKey(bucket="test-bucket", key="releases/app-v1.zip.delta"))
|
||||
|
||||
# Verify delta was deleted
|
||||
assert result["deleted"] is True
|
||||
assert result["type"] == "delta"
|
||||
assert result.deleted is True
|
||||
assert result.type == "delta"
|
||||
|
||||
# Verify reference.bin was NOT cleaned up
|
||||
assert "cleaned_reference" not in result
|
||||
assert result.cleaned_reference is None
|
||||
|
||||
# Verify only the delta was deleted, not reference.bin
|
||||
assert mock_storage.delete.call_count == 1
|
||||
@@ -331,11 +342,11 @@ class TestSingleDeleteCleanup:
|
||||
result = service.delete(ObjectKey(bucket="test-bucket", key="releases/app.zip.delta"))
|
||||
|
||||
# Verify delta was deleted
|
||||
assert result["deleted"] is True
|
||||
assert result["type"] == "delta"
|
||||
assert result.deleted is True
|
||||
assert result.type == "delta"
|
||||
|
||||
# Verify no reference cleanup (since it didn't exist)
|
||||
assert "cleaned_reference" not in result
|
||||
assert result.cleaned_reference is None
|
||||
|
||||
# Only delta should be deleted
|
||||
assert mock_storage.delete.call_count == 1
|
||||
@@ -384,7 +395,7 @@ class TestSingleDeleteCleanup:
|
||||
result = service.delete(ObjectKey(bucket="test-bucket", key="releases/1.0/app.zip.delta"))
|
||||
|
||||
# Should clean up only 1.0/reference.bin
|
||||
assert result["cleaned_reference"] == "releases/1.0/reference.bin"
|
||||
assert result.cleaned_reference == "releases/1.0/reference.bin"
|
||||
|
||||
# Verify correct files deleted
|
||||
delete_calls = [call[0][0] for call in mock_storage.delete.call_args_list]
|
||||
@@ -425,9 +436,9 @@ class TestRecursiveDeleteCleanup:
|
||||
result = service.delete_recursive("test-bucket", "data/")
|
||||
|
||||
# Should delete both delta and reference
|
||||
assert result["deleted_count"] == 2
|
||||
assert result["deltas_deleted"] == 1
|
||||
assert result["references_deleted"] == 1
|
||||
assert result.deleted_count == 2
|
||||
assert result.deltas_deleted == 1
|
||||
assert result.references_deleted == 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -5,6 +5,7 @@ from unittest.mock import Mock, patch
|
||||
import pytest
|
||||
|
||||
from deltaglider.app.cli.main import create_service
|
||||
from deltaglider.core.models import RecursiveDeleteResult
|
||||
from deltaglider.ports.storage import ObjectHead
|
||||
|
||||
|
||||
@@ -28,10 +29,10 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
|
||||
result = service.delete_recursive("test-bucket", "nonexistent/")
|
||||
|
||||
assert result["deleted_count"] == 0
|
||||
assert result["failed_count"] == 0
|
||||
assert isinstance(result["errors"], list)
|
||||
assert isinstance(result["warnings"], list)
|
||||
assert result.deleted_count == 0
|
||||
assert result.failed_count == 0
|
||||
assert isinstance(result.errors, list)
|
||||
assert isinstance(result.warnings, list)
|
||||
|
||||
def test_delete_recursive_returns_structured_result(self):
|
||||
"""Test that delete_recursive returns a properly structured result."""
|
||||
@@ -57,26 +58,22 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
|
||||
result = service.delete_recursive("test-bucket", "test/")
|
||||
|
||||
# Verify structure
|
||||
required_keys = [
|
||||
"bucket",
|
||||
"prefix",
|
||||
"deleted_count",
|
||||
"failed_count",
|
||||
"deltas_deleted",
|
||||
"references_deleted",
|
||||
"direct_deleted",
|
||||
"other_deleted",
|
||||
"errors",
|
||||
"warnings",
|
||||
]
|
||||
for key in required_keys:
|
||||
assert key in result, f"Missing key: {key}"
|
||||
# Verify structure - result is a RecursiveDeleteResult dataclass
|
||||
assert hasattr(result, "bucket")
|
||||
assert hasattr(result, "prefix")
|
||||
assert hasattr(result, "deleted_count")
|
||||
assert hasattr(result, "failed_count")
|
||||
assert hasattr(result, "deltas_deleted")
|
||||
assert hasattr(result, "references_deleted")
|
||||
assert hasattr(result, "direct_deleted")
|
||||
assert hasattr(result, "other_deleted")
|
||||
assert hasattr(result, "errors")
|
||||
assert hasattr(result, "warnings")
|
||||
|
||||
assert isinstance(result["deleted_count"], int)
|
||||
assert isinstance(result["failed_count"], int)
|
||||
assert isinstance(result["errors"], list)
|
||||
assert isinstance(result["warnings"], list)
|
||||
assert isinstance(result.deleted_count, int)
|
||||
assert isinstance(result.failed_count, int)
|
||||
assert isinstance(result.errors, list)
|
||||
assert isinstance(result.warnings, list)
|
||||
|
||||
def test_delete_recursive_categorizes_objects_correctly(self):
|
||||
"""Test that delete_recursive correctly categorizes different object types."""
|
||||
@@ -117,12 +114,12 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
result = service.delete_recursive("test-bucket", "test/")
|
||||
|
||||
# Should categorize correctly - the exact categorization depends on implementation
|
||||
assert result["deltas_deleted"] == 1 # app.zip.delta
|
||||
assert result["references_deleted"] == 1 # reference.bin
|
||||
assert result.deltas_deleted == 1 # app.zip.delta
|
||||
assert result.references_deleted == 1 # reference.bin
|
||||
# Direct and other files may be categorized differently based on metadata detection
|
||||
assert result["direct_deleted"] + result["other_deleted"] == 2 # readme.txt + config.json
|
||||
assert result["deleted_count"] == 4 # total
|
||||
assert result["failed_count"] == 0
|
||||
assert result.direct_deleted + result.other_deleted == 2 # readme.txt + config.json
|
||||
assert result.deleted_count == 4 # total
|
||||
assert result.failed_count == 0
|
||||
|
||||
def test_delete_recursive_handles_storage_errors_gracefully(self):
|
||||
"""Test that delete_recursive handles individual storage errors gracefully."""
|
||||
@@ -151,10 +148,10 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
result = service.delete_recursive("test-bucket", "test/")
|
||||
|
||||
# Should handle partial failure
|
||||
assert result["deleted_count"] == 1 # good.zip.delta succeeded
|
||||
assert result["failed_count"] == 1 # bad.zip.delta failed
|
||||
assert len(result["errors"]) == 1
|
||||
assert "bad" in result["errors"][0]
|
||||
assert result.deleted_count == 1 # good.zip.delta succeeded
|
||||
assert result.failed_count == 1 # bad.zip.delta failed
|
||||
assert len(result.errors) == 1
|
||||
assert "bad" in result.errors[0]
|
||||
|
||||
def test_affected_deltaspaces_discovery(self):
|
||||
"""Test that the system discovers affected deltaspaces when deleting deltas."""
|
||||
@@ -206,8 +203,8 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
result = service.delete_recursive("test-bucket", "project/team-a/v1/")
|
||||
|
||||
# Should have discovered and evaluated the parent reference
|
||||
assert result["deleted_count"] >= 1 # At least the delta file
|
||||
assert result["failed_count"] == 0
|
||||
assert result.deleted_count >= 1 # At least the delta file
|
||||
assert result.failed_count == 0
|
||||
|
||||
def test_cli_uses_core_service_method(self):
|
||||
"""Test that CLI rm -r command uses the core service delete_recursive method."""
|
||||
@@ -222,14 +219,12 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
mock_create_service.return_value = mock_service
|
||||
|
||||
# Mock successful deletion
|
||||
mock_service.delete_recursive.return_value = {
|
||||
"bucket": "test-bucket",
|
||||
"prefix": "test/",
|
||||
"deleted_count": 2,
|
||||
"failed_count": 0,
|
||||
"warnings": [],
|
||||
"errors": [],
|
||||
}
|
||||
mock_service.delete_recursive.return_value = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="test/",
|
||||
deleted_count=2,
|
||||
failed_count=0,
|
||||
)
|
||||
|
||||
result = runner.invoke(cli, ["rm", "-r", "s3://test-bucket/test/"])
|
||||
|
||||
@@ -294,8 +289,8 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
|
||||
result = service.delete(ObjectKey(bucket="test-bucket", key="test/file.zip.delta"))
|
||||
|
||||
assert result["deleted"]
|
||||
assert result["type"] == "delta"
|
||||
assert result.deleted
|
||||
assert result.type == "delta"
|
||||
|
||||
def test_reference_cleanup_intelligence_basic(self):
|
||||
"""Basic test to verify reference cleanup intelligence is working."""
|
||||
@@ -328,10 +323,10 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
result = service.delete_recursive("test-bucket", "simple/")
|
||||
|
||||
# Should delete both delta and reference since there are no other dependencies
|
||||
assert result["deleted_count"] == 2
|
||||
assert result["deltas_deleted"] == 1
|
||||
assert result["references_deleted"] == 1
|
||||
assert result["failed_count"] == 0
|
||||
assert result.deleted_count == 2
|
||||
assert result.deltas_deleted == 1
|
||||
assert result.references_deleted == 1
|
||||
assert result.failed_count == 0
|
||||
|
||||
def test_comprehensive_result_validation(self):
|
||||
"""Test that all result fields are properly populated."""
|
||||
@@ -366,31 +361,31 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
result = service.delete_recursive("test-bucket", "mixed/")
|
||||
|
||||
# Validate all expected fields are present and have correct types
|
||||
assert isinstance(result["bucket"], str)
|
||||
assert isinstance(result["prefix"], str)
|
||||
assert isinstance(result["deleted_count"], int)
|
||||
assert isinstance(result["failed_count"], int)
|
||||
assert isinstance(result["deltas_deleted"], int)
|
||||
assert isinstance(result["references_deleted"], int)
|
||||
assert isinstance(result["direct_deleted"], int)
|
||||
assert isinstance(result["other_deleted"], int)
|
||||
assert isinstance(result["errors"], list)
|
||||
assert isinstance(result["warnings"], list)
|
||||
assert isinstance(result.bucket, str)
|
||||
assert isinstance(result.prefix, str)
|
||||
assert isinstance(result.deleted_count, int)
|
||||
assert isinstance(result.failed_count, int)
|
||||
assert isinstance(result.deltas_deleted, int)
|
||||
assert isinstance(result.references_deleted, int)
|
||||
assert isinstance(result.direct_deleted, int)
|
||||
assert isinstance(result.other_deleted, int)
|
||||
assert isinstance(result.errors, list)
|
||||
assert isinstance(result.warnings, list)
|
||||
|
||||
# Validate counts add up
|
||||
total_by_type = (
|
||||
result["deltas_deleted"]
|
||||
+ result["references_deleted"]
|
||||
+ result["direct_deleted"]
|
||||
+ result["other_deleted"]
|
||||
result.deltas_deleted
|
||||
+ result.references_deleted
|
||||
+ result.direct_deleted
|
||||
+ result.other_deleted
|
||||
)
|
||||
assert result["deleted_count"] == total_by_type
|
||||
assert result.deleted_count == total_by_type
|
||||
|
||||
# Validate specific counts for this scenario
|
||||
assert result["deltas_deleted"] == 1
|
||||
assert result["references_deleted"] == 1
|
||||
assert result.deltas_deleted == 1
|
||||
assert result.references_deleted == 1
|
||||
# Direct and other files may be categorized differently
|
||||
assert result["direct_deleted"] + result["other_deleted"] == 2
|
||||
assert result.direct_deleted + result.other_deleted == 2
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
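The assertions above exercise RecursiveDeleteResult as a dataclass whose per-type counters sum to deleted_count. A small illustrative construction (the field values here are made up):

from deltaglider.core.models import RecursiveDeleteResult

result = RecursiveDeleteResult(
    bucket="test-bucket",
    prefix="mixed/",
    deleted_count=4,
    failed_count=0,
    deltas_deleted=1,
    references_deleted=1,
    direct_deleted=2,
    other_deleted=0,
)
# The invariant checked by test_comprehensive_result_validation:
assert result.deleted_count == (
    result.deltas_deleted + result.references_deleted
    + result.direct_deleted + result.other_deleted
)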
tests/integration/test_s3_migration.py (new file, 271 lines)
@@ -0,0 +1,271 @@
|
||||
"""Test S3-to-S3 migration functionality."""
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from deltaglider.app.cli.aws_compat import migrate_s3_to_s3
|
||||
from deltaglider.core import DeltaService
|
||||
from deltaglider.ports import ObjectHead
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_service():
|
||||
"""Create a mock DeltaService."""
|
||||
service = MagicMock(spec=DeltaService)
|
||||
service.storage = MagicMock()
|
||||
return service
|
||||
|
||||
|
||||
def test_migrate_s3_to_s3_with_resume(mock_service):
|
||||
"""Test migration with resume support (skips existing files)."""
|
||||
# Setup mock storage with source files
|
||||
source_objects = [
|
||||
ObjectHead(
|
||||
key="file1.zip",
|
||||
size=1024,
|
||||
etag="abc123",
|
||||
last_modified="2024-01-01T00:00:00Z",
|
||||
metadata={},
|
||||
),
|
||||
ObjectHead(
|
||||
key="file2.zip",
|
||||
size=2048,
|
||||
etag="def456",
|
||||
last_modified="2024-01-01T00:00:00Z",
|
||||
metadata={},
|
||||
),
|
||||
ObjectHead(
|
||||
key="subdir/file3.zip",
|
||||
size=512,
|
||||
etag="ghi789",
|
||||
last_modified="2024-01-01T00:00:00Z",
|
||||
metadata={},
|
||||
),
|
||||
]
|
||||
|
||||
# Destination already has file1.zip (as .delta)
|
||||
dest_objects = [
|
||||
ObjectHead(
|
||||
key="file1.zip.delta",
|
||||
size=100,
|
||||
last_modified="2024-01-02T00:00:00Z",
|
||||
etag="delta123",
|
||||
metadata={},
|
||||
),
|
||||
]
|
||||
|
||||
# Configure mock to return appropriate objects
|
||||
def list_side_effect(prefix):
|
||||
if "source-bucket" in prefix:
|
||||
return iter(source_objects)
|
||||
elif "dest-bucket" in prefix:
|
||||
return iter(dest_objects)
|
||||
return iter([])
|
||||
|
||||
mock_service.storage.list.side_effect = list_side_effect
|
||||
|
||||
# Mock the copy operation and click functions
|
||||
# Use quiet=True to skip EC2 detection logging
|
||||
with patch("deltaglider.app.cli.aws_compat.copy_s3_to_s3") as mock_copy:
|
||||
with patch("deltaglider.app.cli.aws_compat.click.confirm", return_value=True):
|
||||
migrate_s3_to_s3(
|
||||
mock_service,
|
||||
"s3://source-bucket/",
|
||||
"s3://dest-bucket/",
|
||||
exclude=None,
|
||||
include=None,
|
||||
quiet=True, # Skip EC2 detection and logging
|
||||
no_delta=False,
|
||||
max_ratio=None,
|
||||
dry_run=False,
|
||||
skip_confirm=False,
|
||||
)
|
||||
|
||||
# Should copy only file2.zip and subdir/file3.zip (file1 already exists)
|
||||
assert mock_copy.call_count == 2
|
||||
|
||||
# Verify the files being migrated
|
||||
call_args = [call[0] for call in mock_copy.call_args_list]
|
||||
migrated_files = [(args[1], args[2]) for args in call_args]
|
||||
|
||||
assert ("s3://source-bucket/file2.zip", "s3://dest-bucket/file2.zip") in migrated_files
|
||||
assert (
|
||||
"s3://source-bucket/subdir/file3.zip",
|
||||
"s3://dest-bucket/subdir/file3.zip",
|
||||
) in migrated_files
|
||||
|
||||
|
||||
def test_migrate_s3_to_s3_dry_run(mock_service):
|
||||
"""Test dry run mode shows what would be migrated without actually migrating."""
|
||||
source_objects = [
|
||||
ObjectHead(
|
||||
key="file1.zip",
|
||||
size=1024,
|
||||
last_modified="2024-01-01T00:00:00Z",
|
||||
etag="abc123",
|
||||
metadata={},
|
||||
),
|
||||
]
|
||||
|
||||
mock_service.storage.list.return_value = iter(source_objects)
|
||||
|
||||
# Mock the copy operation and EC2 detection
|
||||
with patch("deltaglider.app.cli.aws_compat.copy_s3_to_s3") as mock_copy:
|
||||
with patch("deltaglider.app.cli.aws_compat.click.echo") as mock_echo:
|
||||
with patch("deltaglider.app.cli.aws_compat.log_aws_region"):
|
||||
migrate_s3_to_s3(
|
||||
mock_service,
|
||||
"s3://source-bucket/",
|
||||
"s3://dest-bucket/",
|
||||
exclude=None,
|
||||
include=None,
|
||||
quiet=False, # Allow output to test dry run messages
|
||||
no_delta=False,
|
||||
max_ratio=None,
|
||||
dry_run=True,
|
||||
skip_confirm=False,
|
||||
)
|
||||
|
||||
# Should not actually copy anything in dry run mode
|
||||
mock_copy.assert_not_called()
|
||||
|
||||
# Should show dry run message
|
||||
echo_calls = [str(call[0][0]) for call in mock_echo.call_args_list if call[0]]
|
||||
assert any("DRY RUN MODE" in msg for msg in echo_calls)
|
||||
|
||||
|
||||
def test_migrate_s3_to_s3_with_filters(mock_service):
|
||||
"""Test migration with include/exclude filters."""
|
||||
source_objects = [
|
||||
ObjectHead(
|
||||
key="file1.zip",
|
||||
size=1024,
|
||||
last_modified="2024-01-01T00:00:00Z",
|
||||
etag="abc123",
|
||||
metadata={},
|
||||
),
|
||||
ObjectHead(
|
||||
key="file2.log",
|
||||
size=256,
|
||||
last_modified="2024-01-01T00:00:00Z",
|
||||
etag="def456",
|
||||
metadata={},
|
||||
),
|
||||
ObjectHead(
|
||||
key="file3.tar",
|
||||
size=512,
|
||||
last_modified="2024-01-01T00:00:00Z",
|
||||
etag="ghi789",
|
||||
metadata={},
|
||||
),
|
||||
]
|
||||
|
||||
mock_service.storage.list.return_value = iter(source_objects)
|
||||
|
||||
# Mock the copy operation
|
||||
with patch("deltaglider.app.cli.aws_compat.copy_s3_to_s3") as mock_copy:
|
||||
with patch("click.echo"):
|
||||
with patch("deltaglider.app.cli.aws_compat.click.confirm", return_value=True):
|
||||
# Exclude .log files
|
||||
migrate_s3_to_s3(
|
||||
mock_service,
|
||||
"s3://source-bucket/",
|
||||
"s3://dest-bucket/",
|
||||
exclude="*.log",
|
||||
include=None,
|
||||
quiet=True, # Skip EC2 detection
|
||||
no_delta=False,
|
||||
max_ratio=None,
|
||||
dry_run=False,
|
||||
skip_confirm=False,
|
||||
)
|
||||
|
||||
# Should copy file1.zip and file3.tar, but not file2.log
|
||||
assert mock_copy.call_count == 2
|
||||
|
||||
call_args = [call[0] for call in mock_copy.call_args_list]
|
||||
migrated_sources = [args[1] for args in call_args]
|
||||
|
||||
assert "s3://source-bucket/file1.zip" in migrated_sources
|
||||
assert "s3://source-bucket/file3.tar" in migrated_sources
|
||||
assert "s3://source-bucket/file2.log" not in migrated_sources
|
||||
|
||||
|
||||
def test_migrate_s3_to_s3_skip_confirm(mock_service):
|
||||
"""Test skipping confirmation prompt with skip_confirm=True."""
|
||||
source_objects = [
|
||||
ObjectHead(
|
||||
key="file1.zip",
|
||||
size=1024,
|
||||
last_modified="2024-01-01T00:00:00Z",
|
||||
etag="abc123",
|
||||
metadata={},
|
||||
),
|
||||
]
|
||||
|
||||
mock_service.storage.list.return_value = iter(source_objects)
|
||||
|
||||
with patch("deltaglider.app.cli.aws_compat.copy_s3_to_s3") as mock_copy:
|
||||
with patch("click.echo"):
|
||||
with patch("deltaglider.app.cli.aws_compat.click.confirm") as mock_confirm:
|
||||
migrate_s3_to_s3(
|
||||
mock_service,
|
||||
"s3://source-bucket/",
|
||||
"s3://dest-bucket/",
|
||||
exclude=None,
|
||||
include=None,
|
||||
quiet=True, # Skip EC2 detection
|
||||
no_delta=False,
|
||||
max_ratio=None,
|
||||
dry_run=False,
|
||||
skip_confirm=True, # Skip confirmation
|
||||
)
|
||||
|
||||
# Should not ask for confirmation
|
||||
mock_confirm.assert_not_called()
|
||||
|
||||
# Should still perform the copy
|
||||
mock_copy.assert_called_once()
|
||||
|
||||
|
||||
def test_migrate_s3_to_s3_with_prefix(mock_service):
|
||||
"""Test migration with source and destination prefixes."""
|
||||
source_objects = [
|
||||
ObjectHead(
|
||||
key="data/file1.zip",
|
||||
size=1024,
|
||||
last_modified="2024-01-01T00:00:00Z",
|
||||
etag="abc123",
|
||||
metadata={},
|
||||
),
|
||||
]
|
||||
|
||||
def list_side_effect(prefix):
|
||||
if "source-bucket/data" in prefix:
|
||||
return iter(source_objects)
|
||||
return iter([])
|
||||
|
||||
mock_service.storage.list.side_effect = list_side_effect
|
||||
|
||||
with patch("deltaglider.app.cli.aws_compat.copy_s3_to_s3") as mock_copy:
|
||||
with patch("click.echo"):
|
||||
with patch("deltaglider.app.cli.aws_compat.click.confirm", return_value=True):
|
||||
migrate_s3_to_s3(
|
||||
mock_service,
|
||||
"s3://source-bucket/data/",
|
||||
"s3://dest-bucket/archive/",
|
||||
exclude=None,
|
||||
include=None,
|
||||
quiet=True, # Skip EC2 detection
|
||||
no_delta=False,
|
||||
max_ratio=None,
|
||||
dry_run=False,
|
||||
skip_confirm=False,
|
||||
)
|
||||
|
||||
# Verify the correct destination path is used
|
||||
mock_copy.assert_called_once()
|
||||
call_args = mock_copy.call_args[0]
|
||||
assert call_args[1] == "s3://source-bucket/data/file1.zip"
|
||||
assert call_args[2] == "s3://dest-bucket/archive/file1.zip"
|
||||
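Taken together, the tests above pin down the calling convention for migrate_s3_to_s3. As a reference for readers, here is a minimal usage sketch using only the arguments exercised by these tests; the `service` variable is assumed to be an existing DeltaService instance (its construction is not shown in this diff).

# Minimal sketch, assuming `service` is a configured deltaglider DeltaService.
from deltaglider.app.cli.aws_compat import migrate_s3_to_s3

migrate_s3_to_s3(
    service,
    "s3://source-bucket/",
    "s3://dest-bucket/",
    exclude="*.log",     # optional glob filter, as in test_migrate_s3_to_s3_with_filters
    include=None,
    quiet=True,          # skip EC2 detection and logging
    no_delta=False,
    max_ratio=None,
    dry_run=True,        # preview only, as in test_migrate_s3_to_s3_dry_run
    skip_confirm=True,   # suppress the interactive click.confirm prompt
)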
tests/integration/test_stats_command.py (new file, 255 lines)
@@ -0,0 +1,255 @@
|
||||
"""Integration tests for stats CLI command."""
|
||||
|
||||
import json
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
from click.testing import CliRunner
|
||||
|
||||
from deltaglider.app.cli.main import cli
|
||||
from deltaglider.client_models import BucketStats
|
||||
|
||||
|
||||
class TestStatsCommand:
|
||||
"""Test stats CLI command."""
|
||||
|
||||
def test_stats_json_output(self):
|
||||
"""Test stats command with JSON output."""
|
||||
# Create mock bucket stats
|
||||
mock_stats = BucketStats(
|
||||
bucket="test-bucket",
|
||||
object_count=10,
|
||||
total_size=1000000,
|
||||
compressed_size=500000,
|
||||
space_saved=500000,
|
||||
average_compression_ratio=0.5,
|
||||
delta_objects=7,
|
||||
direct_objects=3,
|
||||
)
|
||||
|
||||
with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
|
||||
# Setup mock client
|
||||
mock_client = Mock()
|
||||
mock_client.get_bucket_stats.return_value = mock_stats
|
||||
mock_client_class.return_value = mock_client
|
||||
|
||||
# Run command
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(cli, ["stats", "test-bucket", "--json"])
|
||||
|
||||
# Verify
|
||||
assert result.exit_code == 0
|
||||
output = json.loads(result.output)
|
||||
assert output["bucket"] == "test-bucket"
|
||||
assert output["object_count"] == 10
|
||||
assert output["total_size"] == 1000000
|
||||
assert output["compressed_size"] == 500000
|
||||
assert output["space_saved"] == 500000
|
||||
assert output["average_compression_ratio"] == 0.5
|
||||
assert output["delta_objects"] == 7
|
||||
assert output["direct_objects"] == 3
|
||||
|
||||
# Verify client was called correctly
|
||||
mock_client.get_bucket_stats.assert_called_once_with(
|
||||
"test-bucket", mode="quick", use_cache=True, refresh_cache=False
|
||||
)
|
||||
|
||||
def test_stats_json_output_detailed(self):
|
||||
"""Test stats command with detailed JSON output."""
|
||||
mock_stats = BucketStats(
|
||||
bucket="test-bucket",
|
||||
object_count=5,
|
||||
total_size=2000000,
|
||||
compressed_size=100000,
|
||||
space_saved=1900000,
|
||||
average_compression_ratio=0.95,
|
||||
delta_objects=5,
|
||||
direct_objects=0,
|
||||
)
|
||||
|
||||
with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
|
||||
mock_client = Mock()
|
||||
mock_client.get_bucket_stats.return_value = mock_stats
|
||||
mock_client_class.return_value = mock_client
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(cli, ["stats", "test-bucket", "--detailed", "--json"])
|
||||
|
||||
assert result.exit_code == 0
|
||||
output = json.loads(result.output)
|
||||
assert output["average_compression_ratio"] == 0.95
|
||||
|
||||
# Verify detailed flag was passed
|
||||
mock_client.get_bucket_stats.assert_called_once_with(
|
||||
"test-bucket", mode="detailed", use_cache=True, refresh_cache=False
|
||||
)
|
||||
|
||||
def test_stats_json_output_sampled(self):
|
||||
"""Test stats command with sampled JSON output."""
|
||||
mock_stats = BucketStats(
|
||||
bucket="test-bucket",
|
||||
object_count=5,
|
||||
total_size=2000000,
|
||||
compressed_size=100000,
|
||||
space_saved=1900000,
|
||||
average_compression_ratio=0.95,
|
||||
delta_objects=5,
|
||||
direct_objects=0,
|
||||
)
|
||||
|
||||
with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
|
||||
mock_client = Mock()
|
||||
mock_client.get_bucket_stats.return_value = mock_stats
|
||||
mock_client_class.return_value = mock_client
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(cli, ["stats", "test-bucket", "--sampled", "--json"])
|
||||
|
||||
assert result.exit_code == 0
|
||||
mock_client.get_bucket_stats.assert_called_once_with(
|
||||
"test-bucket", mode="sampled", use_cache=True, refresh_cache=False
|
||||
)
|
||||
|
||||
def test_stats_sampled_and_detailed_conflict(self):
|
||||
"""--sampled and --detailed flags must be mutually exclusive."""
|
||||
|
||||
with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
|
||||
mock_client = Mock()
|
||||
mock_client_class.return_value = mock_client
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(cli, ["stats", "test-bucket", "--sampled", "--detailed"])
|
||||
|
||||
assert result.exit_code == 1
|
||||
assert "cannot be used together" in result.output
|
||||
|
||||
def test_stats_human_readable_output(self):
|
||||
"""Test stats command with human-readable output."""
|
||||
mock_stats = BucketStats(
|
||||
bucket="test-bucket",
|
||||
object_count=10,
|
||||
total_size=1500000, # ~1.43 MB
|
||||
compressed_size=300000, # ~293 KB
|
||||
space_saved=1200000, # ~1.14 MB
|
||||
average_compression_ratio=0.8,
|
||||
delta_objects=7,
|
||||
direct_objects=3,
|
||||
)
|
||||
|
||||
with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
|
||||
mock_client = Mock()
|
||||
mock_client.get_bucket_stats.return_value = mock_stats
|
||||
mock_client_class.return_value = mock_client
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(cli, ["stats", "test-bucket"])
|
||||
|
||||
assert result.exit_code == 0
|
||||
output = result.output
|
||||
|
||||
# Verify human-readable format
|
||||
assert "Bucket Statistics: test-bucket" in output
|
||||
assert "Total Objects:" in output
|
||||
assert "10" in output
|
||||
assert "Delta Objects:" in output
|
||||
assert "7" in output
|
||||
assert "Direct Objects:" in output
|
||||
assert "3" in output
|
||||
assert "Original Size:" in output
|
||||
assert "Compressed Size:" in output
|
||||
assert "Space Saved:" in output
|
||||
assert "Compression Ratio:" in output
|
||||
assert "80.0%" in output # 0.8 = 80%
|
||||
|
||||
def test_stats_error_handling(self):
|
||||
"""Test stats command error handling."""
|
||||
with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
|
||||
mock_client = Mock()
|
||||
mock_client.get_bucket_stats.side_effect = Exception("Bucket not found")
|
||||
mock_client_class.return_value = mock_client
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(cli, ["stats", "nonexistent-bucket"])
|
||||
|
||||
assert result.exit_code == 1
|
||||
assert "Error: Bucket not found" in result.output
|
||||
|
||||
def test_stats_with_s3_url(self):
|
||||
"""Test stats command with s3:// URL format."""
|
||||
mock_stats = BucketStats(
|
||||
bucket="test-bucket",
|
||||
object_count=5,
|
||||
total_size=1000000,
|
||||
compressed_size=500000,
|
||||
space_saved=500000,
|
||||
average_compression_ratio=0.5,
|
||||
delta_objects=3,
|
||||
direct_objects=2,
|
||||
)
|
||||
|
||||
with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
|
||||
mock_client = Mock()
|
||||
mock_client.get_bucket_stats.return_value = mock_stats
|
||||
mock_client_class.return_value = mock_client
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(cli, ["stats", "s3://test-bucket", "--json"])
|
||||
|
||||
assert result.exit_code == 0
|
||||
# Verify bucket name was parsed correctly from S3 URL
|
||||
mock_client.get_bucket_stats.assert_called_once_with(
|
||||
"test-bucket", mode="quick", use_cache=True, refresh_cache=False
|
||||
)
|
||||
|
||||
def test_stats_with_s3_url_trailing_slash(self):
|
||||
"""Test stats command with s3:// URL format with trailing slash."""
|
||||
mock_stats = BucketStats(
|
||||
bucket="test-bucket",
|
||||
object_count=5,
|
||||
total_size=1000000,
|
||||
compressed_size=500000,
|
||||
space_saved=500000,
|
||||
average_compression_ratio=0.5,
|
||||
delta_objects=3,
|
||||
direct_objects=2,
|
||||
)
|
||||
|
||||
with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
|
||||
mock_client = Mock()
|
||||
mock_client.get_bucket_stats.return_value = mock_stats
|
||||
mock_client_class.return_value = mock_client
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(cli, ["stats", "s3://test-bucket/", "--json"])
|
||||
|
||||
assert result.exit_code == 0
|
||||
# Verify bucket name was parsed correctly from S3 URL with trailing slash
|
||||
mock_client.get_bucket_stats.assert_called_once_with(
|
||||
"test-bucket", mode="quick", use_cache=True, refresh_cache=False
|
||||
)
|
||||
|
||||
def test_stats_with_s3_url_with_prefix(self):
|
||||
"""Test stats command with s3:// URL format with prefix (should ignore prefix)."""
|
||||
mock_stats = BucketStats(
|
||||
bucket="test-bucket",
|
||||
object_count=5,
|
||||
total_size=1000000,
|
||||
compressed_size=500000,
|
||||
space_saved=500000,
|
||||
average_compression_ratio=0.5,
|
||||
delta_objects=3,
|
||||
direct_objects=2,
|
||||
)
|
||||
|
||||
with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
|
||||
mock_client = Mock()
|
||||
mock_client.get_bucket_stats.return_value = mock_stats
|
||||
mock_client_class.return_value = mock_client
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(cli, ["stats", "s3://test-bucket/some/prefix/", "--json"])
|
||||
|
||||
assert result.exit_code == 0
|
||||
# Verify only bucket name was extracted, prefix ignored
|
||||
mock_client.get_bucket_stats.assert_called_once_with(
|
||||
"test-bucket", mode="quick", use_cache=True, refresh_cache=False
|
||||
)
|
||||
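The CLI tests above only assert how the command forwards its flags to the client. For orientation, this is a sketch of the underlying call; the no-argument DeltaGliderClient() construction is an assumption, but the get_bucket_stats keyword arguments mirror exactly what the tests assert.

# Sketch only; client construction defaults are assumed, not shown in this diff.
from deltaglider.client import DeltaGliderClient

client = DeltaGliderClient()
stats = client.get_bucket_stats("test-bucket", mode="quick", use_cache=True, refresh_cache=False)
print(stats.object_count, stats.space_saved, stats.average_compression_ratio)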
tests/unit/test_cache_encrypted.py (new file, 189 lines)
@@ -0,0 +1,189 @@
|
||||
"""Tests for encrypted cache adapter."""
|
||||
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from cryptography.fernet import Fernet
|
||||
|
||||
from deltaglider.adapters import ContentAddressedCache, EncryptedCache, Sha256Adapter
|
||||
from deltaglider.core.errors import CacheCorruptionError, CacheMissError
|
||||
|
||||
|
||||
class TestEncryptedCache:
|
||||
"""Test encrypted cache wrapper functionality."""
|
||||
|
||||
@pytest.fixture
|
||||
def temp_dir(self):
|
||||
"""Create temporary directory for tests."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
yield Path(tmpdir)
|
||||
|
||||
@pytest.fixture
|
||||
def hasher(self):
|
||||
"""Create SHA256 hasher."""
|
||||
return Sha256Adapter()
|
||||
|
||||
@pytest.fixture
|
||||
def backend(self, temp_dir, hasher):
|
||||
"""Create CAS backend."""
|
||||
return ContentAddressedCache(temp_dir, hasher)
|
||||
|
||||
@pytest.fixture
|
||||
def encrypted_cache(self, backend):
|
||||
"""Create encrypted cache with ephemeral key."""
|
||||
return EncryptedCache(backend)
|
||||
|
||||
def test_ephemeral_key_generation(self, backend):
|
||||
"""Test that ephemeral key is generated automatically."""
|
||||
cache = EncryptedCache(backend)
|
||||
|
||||
assert cache._ephemeral is True
|
||||
assert cache._key is not None
|
||||
assert len(cache._key) == 44 # Base64-encoded 32-byte key
|
||||
|
||||
def test_provided_key_usage(self, backend):
|
||||
"""Test using provided encryption key."""
|
||||
key = Fernet.generate_key()
|
||||
cache = EncryptedCache(backend, encryption_key=key)
|
||||
|
||||
assert cache._ephemeral is False
|
||||
assert cache._key == key
|
||||
|
||||
def test_write_and_read_encrypted(self, encrypted_cache, temp_dir):
|
||||
"""Test writing and reading encrypted content."""
|
||||
# Create test file
|
||||
test_file = temp_dir / "test.txt"
|
||||
test_content = b"Secret data that should be encrypted"
|
||||
test_file.write_bytes(test_content)
|
||||
|
||||
# Compute expected SHA
|
||||
import hashlib
|
||||
|
||||
expected_sha = hashlib.sha256(test_content).hexdigest()
|
||||
|
||||
# Write to encrypted cache
|
||||
encrypted_cache.write_ref("test-bucket", "test-prefix", test_file)
|
||||
|
||||
# Read back and validate
|
||||
decrypted_path = encrypted_cache.get_validated_ref(
|
||||
"test-bucket", "test-prefix", expected_sha
|
||||
)
|
||||
|
||||
# Verify decrypted content matches original
|
||||
decrypted_content = decrypted_path.read_bytes()
|
||||
assert decrypted_content == test_content
|
||||
|
||||
def test_encrypted_storage_not_readable(self, encrypted_cache, backend, temp_dir):
|
||||
"""Test that stored data is actually encrypted."""
|
||||
# Create test file
|
||||
test_file = temp_dir / "test.txt"
|
||||
test_content = b"Plaintext secret"
|
||||
test_file.write_bytes(test_content)
|
||||
|
||||
# Write to encrypted cache
|
||||
encrypted_cache.write_ref("test-bucket", "test-prefix", test_file)
|
||||
|
||||
# Get the encrypted file path from backend
|
||||
backend_path = backend.ref_path("test-bucket", "test-prefix")
|
||||
|
||||
# Read encrypted content directly
|
||||
encrypted_content = backend_path.read_bytes()
|
||||
|
||||
# Verify content is NOT the same as plaintext
|
||||
assert encrypted_content != test_content
|
||||
# Verify content doesn't contain plaintext substring
|
||||
assert b"secret" not in encrypted_content.lower()
|
||||
|
||||
def test_cache_miss(self, encrypted_cache):
|
||||
"""Test cache miss error."""
|
||||
with pytest.raises(CacheMissError):
|
||||
encrypted_cache.get_validated_ref("no-bucket", "no-prefix", "fakehash")
|
||||
|
||||
def test_decryption_with_wrong_sha(self, encrypted_cache, temp_dir):
|
||||
"""Test that wrong SHA is detected after decryption."""
|
||||
# Create test file
|
||||
test_file = temp_dir / "test.txt"
|
||||
test_content = b"Test content"
|
||||
test_file.write_bytes(test_content)
|
||||
|
||||
# Write to cache
|
||||
encrypted_cache.write_ref("test-bucket", "test-prefix", test_file)
|
||||
|
||||
# Try to read with wrong SHA
|
||||
with pytest.raises(CacheCorruptionError, match="SHA mismatch"):
|
||||
encrypted_cache.get_validated_ref("test-bucket", "test-prefix", "wrong_sha_hash_here")
|
||||
|
||||
def test_decryption_with_wrong_key(self, temp_dir):
|
||||
"""Test that decryption fails with wrong key."""
|
||||
# Create shared backend
|
||||
from deltaglider.adapters import ContentAddressedCache, Sha256Adapter
|
||||
|
||||
hasher = Sha256Adapter()
|
||||
backend = ContentAddressedCache(temp_dir / "shared", hasher)
|
||||
|
||||
# Create two caches with different keys sharing same backend
|
||||
cache1 = EncryptedCache(backend)
|
||||
|
||||
# Write with cache1
|
||||
test_file = temp_dir / "test.txt"
|
||||
test_content = b"Encrypted data"
|
||||
test_file.write_bytes(test_content)
|
||||
|
||||
import hashlib
|
||||
|
||||
expected_sha = hashlib.sha256(test_content).hexdigest()
|
||||
|
||||
cache1.write_ref("test-bucket", "test-prefix", test_file)
|
||||
|
||||
# Create cache2 with different key (fresh instance, different ephemeral key)
|
||||
# and manually add to its mapping (simulating persistent storage scenario)
|
||||
cache2 = EncryptedCache(backend)
|
||||
cache2._plaintext_sha_map[("test-bucket", "test-prefix")] = expected_sha
|
||||
|
||||
# Try to read with cache2 (different key) - should fail decryption
|
||||
with pytest.raises(CacheCorruptionError, match="Decryption failed"):
|
||||
cache2.get_validated_ref("test-bucket", "test-prefix", expected_sha)
|
||||
|
||||
def test_evict_cleans_decrypted_files(self, encrypted_cache, temp_dir):
|
||||
"""Test that evict cleans up .decrypted temporary files."""
|
||||
# Create and store file
|
||||
test_file = temp_dir / "test.txt"
|
||||
test_content = b"Test"
|
||||
test_file.write_bytes(test_content)
|
||||
|
||||
import hashlib
|
||||
|
||||
expected_sha = hashlib.sha256(test_content).hexdigest()
|
||||
|
||||
encrypted_cache.write_ref("test-bucket", "test-prefix", test_file)
|
||||
|
||||
# Read to create .decrypted file
|
||||
decrypted_path = encrypted_cache.get_validated_ref(
|
||||
"test-bucket", "test-prefix", expected_sha
|
||||
)
|
||||
assert decrypted_path.exists()
|
||||
|
||||
# Evict
|
||||
encrypted_cache.evict("test-bucket", "test-prefix")
|
||||
|
||||
# Verify .decrypted file is removed
|
||||
assert not decrypted_path.exists()
|
||||
|
||||
def test_from_env_with_no_key(self, backend, monkeypatch):
|
||||
"""Test from_env creates ephemeral key when env var not set."""
|
||||
monkeypatch.delenv("DG_CACHE_ENCRYPTION_KEY", raising=False)
|
||||
|
||||
cache = EncryptedCache.from_env(backend)
|
||||
|
||||
assert cache._ephemeral is True
|
||||
|
||||
def test_from_env_with_key(self, backend, monkeypatch):
|
||||
"""Test from_env uses key from environment."""
|
||||
key = Fernet.generate_key()
|
||||
monkeypatch.setenv("DG_CACHE_ENCRYPTION_KEY", key.decode("utf-8"))
|
||||
|
||||
cache = EncryptedCache.from_env(backend)
|
||||
|
||||
assert cache._ephemeral is False
|
||||
assert cache._key == key
|
||||
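These tests describe the encrypted-cache wiring: EncryptedCache wraps a content-addressed backend, uses a Fernet key (ephemeral by default, or taken from DG_CACHE_ENCRYPTION_KEY via from_env), and validates the plaintext SHA after decryption. A minimal wiring sketch, with illustrative paths:

# Sketch of the wiring exercised by the tests above; the cache directory is illustrative.
from pathlib import Path
from cryptography.fernet import Fernet
from deltaglider.adapters import ContentAddressedCache, EncryptedCache, Sha256Adapter

backend = ContentAddressedCache(Path("/tmp/dg-cache"), Sha256Adapter())
cache = EncryptedCache(backend, encryption_key=Fernet.generate_key())
# or, to pick up DG_CACHE_ENCRYPTION_KEY (ephemeral key if unset):
cache = EncryptedCache.from_env(backend)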
tests/unit/test_cache_memory.py (new file, 200 lines)
@@ -0,0 +1,200 @@
|
||||
"""Tests for in-memory cache adapter."""
|
||||
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from deltaglider.adapters import MemoryCache, Sha256Adapter
|
||||
from deltaglider.core.errors import CacheCorruptionError, CacheMissError
|
||||
|
||||
|
||||
class TestMemoryCache:
|
||||
"""Test in-memory cache functionality."""
|
||||
|
||||
@pytest.fixture
|
||||
def temp_dir(self):
|
||||
"""Create temporary directory for tests."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
yield Path(tmpdir)
|
||||
|
||||
@pytest.fixture
|
||||
def hasher(self):
|
||||
"""Create SHA256 hasher."""
|
||||
return Sha256Adapter()
|
||||
|
||||
@pytest.fixture
|
||||
def memory_cache(self, hasher, temp_dir):
|
||||
"""Create memory cache with 1MB limit."""
|
||||
return MemoryCache(hasher, max_size_mb=1, temp_dir=temp_dir)
|
||||
|
||||
def test_write_and_read(self, memory_cache, temp_dir):
|
||||
"""Test basic write and read functionality."""
|
||||
# Create test file
|
||||
test_file = temp_dir / "test.txt"
|
||||
test_content = b"Hello, memory cache!"
|
||||
test_file.write_bytes(test_content)
|
||||
|
||||
# Compute expected SHA
|
||||
import hashlib
|
||||
|
||||
expected_sha = hashlib.sha256(test_content).hexdigest()
|
||||
|
||||
# Write to memory cache
|
||||
memory_cache.write_ref("test-bucket", "test-prefix", test_file)
|
||||
|
||||
# Read back
|
||||
retrieved_path = memory_cache.get_validated_ref("test-bucket", "test-prefix", expected_sha)
|
||||
|
||||
# Verify content
|
||||
assert retrieved_path.read_bytes() == test_content
|
||||
|
||||
def test_has_ref_true(self, memory_cache, temp_dir):
|
||||
"""Test has_ref returns True for existing content."""
|
||||
test_file = temp_dir / "test.txt"
|
||||
test_content = b"Test"
|
||||
test_file.write_bytes(test_content)
|
||||
|
||||
import hashlib
|
||||
|
||||
sha = hashlib.sha256(test_content).hexdigest()
|
||||
|
||||
memory_cache.write_ref("test-bucket", "test-prefix", test_file)
|
||||
|
||||
assert memory_cache.has_ref("test-bucket", "test-prefix", sha) is True
|
||||
|
||||
def test_has_ref_false(self, memory_cache):
|
||||
"""Test has_ref returns False for non-existent content."""
|
||||
assert memory_cache.has_ref("no-bucket", "no-prefix", "fakehash") is False
|
||||
|
||||
def test_cache_miss(self, memory_cache):
|
||||
"""Test cache miss error."""
|
||||
with pytest.raises(CacheMissError):
|
||||
memory_cache.get_validated_ref("no-bucket", "no-prefix", "fakehash")
|
||||
|
||||
def test_sha_mismatch_detection(self, memory_cache, temp_dir):
|
||||
"""Test that SHA mismatch is detected."""
|
||||
test_file = temp_dir / "test.txt"
|
||||
test_file.write_bytes(b"Content")
|
||||
|
||||
memory_cache.write_ref("test-bucket", "test-prefix", test_file)
|
||||
|
||||
# Try to read with wrong SHA
|
||||
with pytest.raises(CacheCorruptionError, match="SHA mismatch"):
|
||||
memory_cache.get_validated_ref("test-bucket", "test-prefix", "wrong_sha")
|
||||
|
||||
def test_lru_eviction(self, hasher, temp_dir):
|
||||
"""Test LRU eviction when cache is full."""
|
||||
# Create small cache (only 10KB)
|
||||
small_cache = MemoryCache(hasher, max_size_mb=0.01, temp_dir=temp_dir)
|
||||
|
||||
# Create files that will exceed cache limit
|
||||
file1 = temp_dir / "file1.txt"
|
||||
file2 = temp_dir / "file2.txt"
|
||||
file3 = temp_dir / "file3.txt"
|
||||
|
||||
# Each file is 5KB
|
||||
file1.write_bytes(b"A" * 5000)
|
||||
file2.write_bytes(b"B" * 5000)
|
||||
file3.write_bytes(b"C" * 5000)
|
||||
|
||||
# Write file1 and file2 (total 10KB, at limit)
|
||||
small_cache.write_ref("bucket", "prefix1", file1)
|
||||
small_cache.write_ref("bucket", "prefix2", file2)
|
||||
|
||||
# Verify both are in cache
|
||||
import hashlib
|
||||
|
||||
sha1 = hashlib.sha256(b"A" * 5000).hexdigest()
|
||||
sha2 = hashlib.sha256(b"B" * 5000).hexdigest()
|
||||
|
||||
assert small_cache.has_ref("bucket", "prefix1", sha1) is True
|
||||
assert small_cache.has_ref("bucket", "prefix2", sha2) is True
|
||||
|
||||
# Write file3 (5KB) - should evict file1 (LRU)
|
||||
small_cache.write_ref("bucket", "prefix3", file3)
|
||||
|
||||
# file1 should be evicted
|
||||
assert small_cache.has_ref("bucket", "prefix1", sha1) is False
|
||||
|
||||
# file2 and file3 should still be in cache
|
||||
sha3 = hashlib.sha256(b"C" * 5000).hexdigest()
|
||||
assert small_cache.has_ref("bucket", "prefix2", sha2) is True
|
||||
assert small_cache.has_ref("bucket", "prefix3", sha3) is True
|
||||
|
||||
def test_file_too_large_for_cache(self, hasher, temp_dir):
|
||||
"""Test error when file exceeds cache size limit."""
|
||||
small_cache = MemoryCache(hasher, max_size_mb=0.001, temp_dir=temp_dir) # 1KB limit
|
||||
|
||||
large_file = temp_dir / "large.txt"
|
||||
large_file.write_bytes(b"X" * 2000) # 2KB file
|
||||
|
||||
with pytest.raises(CacheCorruptionError, match="too large"):
|
||||
small_cache.write_ref("bucket", "prefix", large_file)
|
||||
|
||||
def test_evict_removes_from_memory(self, memory_cache, temp_dir):
|
||||
"""Test that evict removes content from memory."""
|
||||
test_file = temp_dir / "test.txt"
|
||||
test_content = b"Test"
|
||||
test_file.write_bytes(test_content)
|
||||
|
||||
import hashlib
|
||||
|
||||
sha = hashlib.sha256(test_content).hexdigest()
|
||||
|
||||
memory_cache.write_ref("test-bucket", "test-prefix", test_file)
|
||||
|
||||
# Verify it's in cache
|
||||
assert memory_cache.has_ref("test-bucket", "test-prefix", sha) is True
|
||||
|
||||
# Evict
|
||||
memory_cache.evict("test-bucket", "test-prefix")
|
||||
|
||||
# Verify it's gone
|
||||
assert memory_cache.has_ref("test-bucket", "test-prefix", sha) is False
|
||||
|
||||
def test_clear_removes_all(self, memory_cache, temp_dir):
|
||||
"""Test that clear removes all cached content."""
|
||||
# Add multiple files
|
||||
for i in range(3):
|
||||
test_file = temp_dir / f"test{i}.txt"
|
||||
test_file.write_bytes(f"Content {i}".encode())
|
||||
memory_cache.write_ref("bucket", f"prefix{i}", test_file)
|
||||
|
||||
# Verify cache is not empty
|
||||
assert memory_cache._current_size > 0
|
||||
assert len(memory_cache._cache) == 3
|
||||
|
||||
# Clear
|
||||
memory_cache.clear()
|
||||
|
||||
# Verify cache is empty
|
||||
assert memory_cache._current_size == 0
|
||||
assert len(memory_cache._cache) == 0
|
||||
assert len(memory_cache._access_order) == 0
|
||||
|
||||
def test_access_order_updated_on_read(self, memory_cache, temp_dir):
|
||||
"""Test that LRU access order is updated on reads."""
|
||||
# Create two files
|
||||
file1 = temp_dir / "file1.txt"
|
||||
file2 = temp_dir / "file2.txt"
|
||||
file1.write_bytes(b"File 1")
|
||||
file2.write_bytes(b"File 2")
|
||||
|
||||
# Write both
|
||||
memory_cache.write_ref("bucket", "prefix1", file1)
|
||||
memory_cache.write_ref("bucket", "prefix2", file2)
|
||||
|
||||
# Access order should be: [prefix1, prefix2]
|
||||
assert memory_cache._access_order[0] == ("bucket", "prefix1")
|
||||
assert memory_cache._access_order[1] == ("bucket", "prefix2")
|
||||
|
||||
# Read prefix1 again
|
||||
import hashlib
|
||||
|
||||
sha1 = hashlib.sha256(b"File 1").hexdigest()
|
||||
memory_cache.get_validated_ref("bucket", "prefix1", sha1)
|
||||
|
||||
# Access order should now be: [prefix2, prefix1]
|
||||
assert memory_cache._access_order[0] == ("bucket", "prefix2")
|
||||
assert memory_cache._access_order[1] == ("bucket", "prefix1")
|
||||
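The LRU tests above fix the eviction contract: entries are tracked in access order, reads refresh that order, and the least recently used entry is dropped once the byte budget would be exceeded. A minimal, self-contained sketch of that bookkeeping (illustrative only, not the MemoryCache implementation):

from collections import OrderedDict

class TinyLru:
    """Illustrative size-bounded LRU map; mirrors the behaviour asserted above."""

    def __init__(self, max_bytes: int) -> None:
        self.max_bytes = max_bytes
        self.used = 0
        self.items: OrderedDict[str, bytes] = OrderedDict()

    def put(self, key: str, blob: bytes) -> None:
        if len(blob) > self.max_bytes:
            raise ValueError("too large for cache")
        while self.used + len(blob) > self.max_bytes and self.items:
            _, evicted = self.items.popitem(last=False)  # drop least recently used
            self.used -= len(evicted)
        self.items[key] = blob
        self.used += len(blob)

    def get(self, key: str) -> bytes:
        blob = self.items.pop(key)   # KeyError on miss, analogous to CacheMissError
        self.items[key] = blob       # re-insert to mark as most recently used
        return blob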
@@ -50,10 +50,10 @@ class TestDeltaServicePut:
        ref_sha = service.hasher.sha256(io.BytesIO(ref_content))

        ref_metadata = {
-            "tool": "deltaglider/0.1.0",
-            "source_name": "original.zip",
-            "file_sha256": ref_sha,
-            "created_at": "2025-01-01T00:00:00Z",
+            "dg-tool": "deltaglider/0.1.0",
+            "dg-source-name": "original.zip",
+            "dg-file-sha256": ref_sha,
+            "dg-created-at": "2025-01-01T00:00:00Z",
        }
        mock_storage.head.return_value = ObjectHead(
            key="test/prefix/reference.bin",
@@ -98,7 +98,7 @@ class TestDeltaServicePut:
        ref_sha = service.hasher.sha256(io.BytesIO(ref_content))

        ref_metadata = {
-            "file_sha256": ref_sha,
+            "dg-file-sha256": ref_sha,
        }
        mock_storage.head.return_value = ObjectHead(
            key="test/prefix/reference.bin",
@@ -200,15 +200,15 @@ class TestDeltaServiceVerify:
        ref_sha = service.hasher.sha256(io.BytesIO(ref_content))

        delta_metadata = {
-            "tool": "deltaglider/0.1.0",
-            "original_name": "file.zip",
-            "file_sha256": test_sha,
-            "file_size": str(len(test_content)),
-            "created_at": "2025-01-01T00:00:00Z",
-            "ref_key": "test/reference.bin",
-            "ref_sha256": ref_sha,
-            "delta_size": "100",
-            "delta_cmd": "xdelta3 -e -9 -s reference.bin file.zip file.zip.delta",
+            "dg-tool": "deltaglider/0.1.0",
+            "dg-original-name": "file.zip",
+            "dg-file-sha256": test_sha,
+            "dg-file-size": str(len(test_content)),
+            "dg-created-at": "2025-01-01T00:00:00Z",
+            "dg-ref-key": "test/reference.bin",
+            "dg-ref-sha256": ref_sha,
+            "dg-delta-size": "100",
+            "dg-delta-cmd": "xdelta3 -e -9 -s reference.bin file.zip file.zip.delta",
        }
        mock_storage.head.return_value = ObjectHead(
            key="test/file.zip.delta",
tests/unit/test_delta_extensions.py (new file, 25 lines)
@@ -0,0 +1,25 @@
"""Tests for shared delta extension policy."""

from deltaglider.core.delta_extensions import (
    DEFAULT_COMPOUND_DELTA_EXTENSIONS,
    DEFAULT_DELTA_EXTENSIONS,
    is_delta_candidate,
)


def test_is_delta_candidate_matches_default_extensions():
    """All default extensions should be detected as delta candidates."""
    for ext in DEFAULT_DELTA_EXTENSIONS:
        assert is_delta_candidate(f"file{ext}")


def test_is_delta_candidate_matches_compound_extensions():
    """Compound extensions should be handled even with multiple suffixes."""
    for ext in DEFAULT_COMPOUND_DELTA_EXTENSIONS:
        assert is_delta_candidate(f"file{ext}")


def test_is_delta_candidate_rejects_other_extensions():
    """Non delta-friendly extensions should return False."""
    assert not is_delta_candidate("document.txt")
    assert not is_delta_candidate("image.jpeg")
tests/unit/test_object_listing.py (new file, 112 lines)
@@ -0,0 +1,112 @@
|
||||
"""Unit tests for object_listing pagination."""
|
||||
|
||||
from unittest.mock import Mock
|
||||
|
||||
from deltaglider.core.object_listing import list_all_objects, list_objects_page
|
||||
|
||||
|
||||
def test_list_objects_page_passes_continuation_token():
|
||||
"""Test that list_objects_page passes continuation_token to storage."""
|
||||
storage = Mock()
|
||||
storage.list_objects.return_value = {
|
||||
"objects": [],
|
||||
"common_prefixes": [],
|
||||
"is_truncated": False,
|
||||
"next_continuation_token": None,
|
||||
"key_count": 0,
|
||||
}
|
||||
|
||||
list_objects_page(
|
||||
storage,
|
||||
bucket="test-bucket",
|
||||
continuation_token="test-token",
|
||||
)
|
||||
|
||||
# Verify continuation_token was passed
|
||||
storage.list_objects.assert_called_once()
|
||||
call_kwargs = storage.list_objects.call_args.kwargs
|
||||
assert call_kwargs["continuation_token"] == "test-token"
|
||||
|
||||
|
||||
def test_list_all_objects_uses_continuation_token_for_pagination():
|
||||
"""Test that list_all_objects uses continuation_token (not start_after) for pagination."""
|
||||
storage = Mock()
|
||||
|
||||
# Mock 3 pages of results
|
||||
responses = [
|
||||
{
|
||||
"objects": [{"key": f"obj{i}"} for i in range(1000)],
|
||||
"common_prefixes": [],
|
||||
"is_truncated": True,
|
||||
"next_continuation_token": "token1",
|
||||
"key_count": 1000,
|
||||
},
|
||||
{
|
||||
"objects": [{"key": f"obj{i}"} for i in range(1000, 2000)],
|
||||
"common_prefixes": [],
|
||||
"is_truncated": True,
|
||||
"next_continuation_token": "token2",
|
||||
"key_count": 1000,
|
||||
},
|
||||
{
|
||||
"objects": [{"key": f"obj{i}"} for i in range(2000, 2500)],
|
||||
"common_prefixes": [],
|
||||
"is_truncated": False,
|
||||
"next_continuation_token": None,
|
||||
"key_count": 500,
|
||||
},
|
||||
]
|
||||
|
||||
storage.list_objects.side_effect = responses
|
||||
|
||||
result = list_all_objects(
|
||||
storage,
|
||||
bucket="test-bucket",
|
||||
prefix="",
|
||||
)
|
||||
|
||||
# Should have made 3 calls
|
||||
assert storage.list_objects.call_count == 3
|
||||
|
||||
# Should have collected all objects
|
||||
assert len(result.objects) == 2500
|
||||
|
||||
# Should not be truncated
|
||||
assert not result.is_truncated
|
||||
|
||||
# Verify the calls used continuation_token correctly
|
||||
calls = storage.list_objects.call_args_list
|
||||
assert len(calls) == 3
|
||||
|
||||
# First call should have no continuation_token
|
||||
assert calls[0].kwargs.get("continuation_token") is None
|
||||
|
||||
# Second call should use token1
|
||||
assert calls[1].kwargs.get("continuation_token") == "token1"
|
||||
|
||||
# Third call should use token2
|
||||
assert calls[2].kwargs.get("continuation_token") == "token2"
|
||||
|
||||
|
||||
def test_list_all_objects_prevents_infinite_loop():
|
||||
"""Test that list_all_objects has max_iterations protection."""
|
||||
storage = Mock()
|
||||
|
||||
# Mock infinite pagination (always returns more)
|
||||
storage.list_objects.return_value = {
|
||||
"objects": [{"key": "obj"}],
|
||||
"common_prefixes": [],
|
||||
"is_truncated": True,
|
||||
"next_continuation_token": "token",
|
||||
"key_count": 1,
|
||||
}
|
||||
|
||||
result = list_all_objects(
|
||||
storage,
|
||||
bucket="test-bucket",
|
||||
max_iterations=10, # Low limit for testing
|
||||
)
|
||||
|
||||
# Should stop at max_iterations
|
||||
assert storage.list_objects.call_count == 10
|
||||
assert result.is_truncated
|
||||
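The pagination contract these tests assert is the standard continuation-token loop with an iteration guard. An illustrative sketch of that loop, assuming a storage object whose list_objects returns the dict shape shown above:

# Illustrative pagination loop; the storage.list_objects keyword arguments and
# response keys are taken from the mocks above, not from the adapter source.
def iterate_pages(storage, bucket: str, prefix: str = "", max_iterations: int = 1000):
    token = None
    for _ in range(max_iterations):
        page = storage.list_objects(bucket=bucket, prefix=prefix, continuation_token=token)
        yield from page["objects"]
        if not page["is_truncated"]:
            return
        token = page["next_continuation_token"]
    # Falling out of the loop means the max_iterations guard tripped; callers
    # see a truncated result, as test_list_all_objects_prevents_infinite_loop expects.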
tests/unit/test_s3_compat.py (new file, 70 lines)
@@ -0,0 +1,70 @@
|
||||
"""Tests for S3-compatible storage compatibility.
|
||||
|
||||
Ensures the S3 adapter works with non-AWS S3 endpoints (Hetzner, MinIO, etc.)
|
||||
that don't support newer AWS-specific features like automatic request checksums.
|
||||
"""
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from deltaglider.adapters.storage_s3 import S3StorageAdapter
|
||||
|
||||
|
||||
class TestS3CompatibleEndpoints:
|
||||
"""Verify S3 adapter configuration for non-AWS endpoint compatibility."""
|
||||
|
||||
def test_client_disables_automatic_checksums(self):
|
||||
"""boto3 1.36+ sends CRC32/CRC64 checksums by default.
|
||||
|
||||
S3-compatible stores (Hetzner, MinIO) reject these with BadRequest.
|
||||
The adapter must set request_checksum_calculation='when_required'.
|
||||
"""
|
||||
with patch("deltaglider.adapters.storage_s3.boto3.client") as mock_client:
|
||||
S3StorageAdapter(endpoint_url="https://example.com")
|
||||
|
||||
mock_client.assert_called_once()
|
||||
call_kwargs = mock_client.call_args
|
||||
config = call_kwargs.kwargs.get("config") or call_kwargs[1].get("config")
|
||||
|
||||
assert config is not None, "boto3 client must be created with a Config object"
|
||||
assert config.request_checksum_calculation == "when_required"
|
||||
assert config.response_checksum_validation == "when_required"
|
||||
|
||||
def test_put_object_no_checksum_kwargs(self, temp_dir):
|
||||
"""put_object must not pass ChecksumAlgorithm or similar kwargs."""
|
||||
mock_client = MagicMock()
|
||||
mock_client.put_object.return_value = {"ETag": '"abc123"'}
|
||||
|
||||
adapter = S3StorageAdapter(client=mock_client)
|
||||
|
||||
test_file = temp_dir / "test.sha1"
|
||||
test_file.write_text("abc123")
|
||||
|
||||
adapter.put(
|
||||
"my-bucket/test/test.sha1",
|
||||
test_file,
|
||||
{"compression": "none", "tool": "deltaglider"},
|
||||
)
|
||||
|
||||
mock_client.put_object.assert_called_once()
|
||||
call_kwargs = mock_client.put_object.call_args.kwargs
|
||||
|
||||
checksum_keys = {
|
||||
"ChecksumAlgorithm",
|
||||
"ChecksumCRC32",
|
||||
"ChecksumCRC32C",
|
||||
"ChecksumCRC64NVME",
|
||||
"ChecksumSHA1",
|
||||
"ChecksumSHA256",
|
||||
"ContentMD5",
|
||||
}
|
||||
passed_checksum_keys = checksum_keys & set(call_kwargs.keys())
|
||||
assert not passed_checksum_keys, (
|
||||
f"put_object must not pass checksum kwargs for S3-compatible "
|
||||
f"endpoint support, but found: {passed_checksum_keys}"
|
||||
)
|
||||
|
||||
def test_preconfigured_client_is_used_as_is(self):
|
||||
"""When a pre-configured client is passed, it should be used directly."""
|
||||
mock_client = MagicMock()
|
||||
adapter = S3StorageAdapter(client=mock_client)
|
||||
assert adapter.client is mock_client
|
||||
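For context, the boto3 configuration these tests demand looks roughly like the following; the endpoint URL is illustrative, and the two Config options are the ones the assertions above check for (available in botocore 1.36+).

# Sketch of a checksum-safe client for S3-compatible endpoints (Hetzner, MinIO, ...).
import boto3
from botocore.config import Config

s3 = boto3.client(
    "s3",
    endpoint_url="https://s3.example.com",  # illustrative non-AWS endpoint
    config=Config(
        request_checksum_calculation="when_required",
        response_checksum_validation="when_required",
    ),
)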
tests/unit/test_s3_uri.py (new file, 44 lines)
@@ -0,0 +1,44 @@
"""Tests for S3 URI helpers."""

import pytest

from deltaglider.core.s3_uri import build_s3_url, is_s3_url, parse_s3_url


def test_is_s3_url_detects_scheme() -> None:
    """is_s3_url should only match the S3 scheme."""
    assert is_s3_url("s3://bucket/path")
    assert not is_s3_url("https://example.com/object")


def test_parse_s3_url_returns_bucket_and_key() -> None:
    """Parsing should split bucket and key correctly."""
    parsed = parse_s3_url("s3://my-bucket/path/to/object.txt")
    assert parsed.bucket == "my-bucket"
    assert parsed.key == "path/to/object.txt"


def test_parse_strips_trailing_slash_when_requested() -> None:
    """strip_trailing_slash should normalise directory-style URLs."""
    parsed = parse_s3_url("s3://my-bucket/path/to/", strip_trailing_slash=True)
    assert parsed.bucket == "my-bucket"
    assert parsed.key == "path/to"


def test_parse_requires_key_when_configured() -> None:
    """allow_empty_key=False should reject bucket-only URLs."""
    with pytest.raises(ValueError):
        parse_s3_url("s3://bucket-only", allow_empty_key=False)


def test_build_s3_url_round_trip() -> None:
    """build_s3_url should round-trip with parse_s3_url."""
    url = build_s3_url("bucket", "dir/file.tar")
    parsed = parse_s3_url(url)
    assert parsed.bucket == "bucket"
    assert parsed.key == "dir/file.tar"


def test_build_s3_url_for_bucket_root() -> None:
    """When key is missing, build_s3_url should omit the trailing slash."""
    assert build_s3_url("root-bucket") == "s3://root-bucket"
tests/unit/test_stats_algorithm.py (new file, 479 lines)
@@ -0,0 +1,479 @@
|
||||
"""Exhaustive tests for the bucket statistics algorithm."""
|
||||
|
||||
from unittest.mock import MagicMock, Mock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from deltaglider.client_operations.stats import get_bucket_stats
|
||||
|
||||
|
||||
class TestBucketStatsAlgorithm:
|
||||
"""Test suite for get_bucket_stats algorithm."""
|
||||
|
||||
@pytest.fixture
|
||||
def mock_client(self):
|
||||
"""Create a mock DeltaGliderClient."""
|
||||
client = Mock()
|
||||
client.service = Mock()
|
||||
client.service.storage = Mock()
|
||||
client.service.logger = Mock()
|
||||
return client
|
||||
|
||||
def test_empty_bucket(self, mock_client):
|
||||
"""Test statistics for an empty bucket."""
|
||||
# Setup: Empty bucket
|
||||
mock_client.service.storage.list_objects.return_value = {
|
||||
"objects": [],
|
||||
"is_truncated": False,
|
||||
}
|
||||
|
||||
# Execute
|
||||
stats = get_bucket_stats(mock_client, "empty-bucket")
|
||||
|
||||
# Verify
|
||||
assert stats.bucket == "empty-bucket"
|
||||
assert stats.object_count == 0
|
||||
assert stats.total_size == 0
|
||||
assert stats.compressed_size == 0
|
||||
assert stats.space_saved == 0
|
||||
assert stats.average_compression_ratio == 0.0
|
||||
assert stats.delta_objects == 0
|
||||
assert stats.direct_objects == 0
|
||||
|
||||
def test_bucket_with_only_direct_files(self, mock_client):
|
||||
"""Test bucket with only direct files (no compression)."""
|
||||
# Setup: Bucket with 3 direct files
|
||||
mock_client.service.storage.list_objects.return_value = {
|
||||
"objects": [
|
||||
{"key": "file1.pdf", "size": 1000000, "last_modified": "2024-01-01"},
|
||||
{"key": "file2.html", "size": 500000, "last_modified": "2024-01-02"},
|
||||
{"key": "file3.txt", "size": 250000, "last_modified": "2024-01-03"},
|
||||
],
|
||||
"is_truncated": False,
|
||||
}
|
||||
mock_client.service.storage.head.return_value = None
|
||||
|
||||
# Execute
|
||||
stats = get_bucket_stats(mock_client, "direct-only-bucket")
|
||||
|
||||
# Verify
|
||||
assert stats.object_count == 3
|
||||
assert stats.total_size == 1750000 # Sum of all files
|
||||
assert stats.compressed_size == 1750000 # Same as total (no compression)
|
||||
assert stats.space_saved == 0
|
||||
assert stats.average_compression_ratio == 0.0
|
||||
assert stats.delta_objects == 0
|
||||
assert stats.direct_objects == 3
|
||||
|
||||
def test_bucket_with_delta_compression(self, mock_client):
|
||||
"""Test bucket with delta-compressed files."""
|
||||
# Setup: Bucket with reference.bin and 2 delta files
|
||||
mock_client.service.storage.list_objects.return_value = {
|
||||
"objects": [
|
||||
{"key": "reference.bin", "size": 20000000, "last_modified": "2024-01-01"},
|
||||
{"key": "file1.zip.delta", "size": 50000, "last_modified": "2024-01-02"},
|
||||
{"key": "file2.zip.delta", "size": 60000, "last_modified": "2024-01-03"},
|
||||
],
|
||||
"is_truncated": False,
|
||||
}
|
||||
|
||||
# Mock metadata for delta files
|
||||
def mock_head(path):
|
||||
if "file1.zip.delta" in path:
|
||||
head = Mock()
|
||||
head.metadata = {"dg-file-size": "19500000", "compression_ratio": "0.997"}
|
||||
return head
|
||||
elif "file2.zip.delta" in path:
|
||||
head = Mock()
|
||||
head.metadata = {"dg-file-size": "19600000", "compression_ratio": "0.997"}
|
||||
return head
|
||||
return None
|
||||
|
||||
mock_client.service.storage.head.side_effect = mock_head
|
||||
|
||||
# Execute
|
||||
stats = get_bucket_stats(mock_client, "compressed-bucket", mode="detailed")
|
||||
|
||||
# Verify
|
||||
assert stats.object_count == 2 # Only delta files counted (not reference.bin)
|
||||
assert stats.total_size == 39100000 # 19.5M + 19.6M
|
||||
assert stats.compressed_size == 20110000 # reference (20M) + deltas (50K + 60K)
|
||||
assert stats.space_saved == 18990000 # ~19MB saved
|
||||
assert stats.average_compression_ratio > 0.48 # ~48.6% compression
|
||||
assert stats.delta_objects == 2
|
||||
assert stats.direct_objects == 0
|
||||
|
||||
def test_orphaned_reference_bin_detection(self, mock_client):
|
||||
"""Test detection of orphaned reference.bin files."""
|
||||
# Setup: Bucket with reference.bin but no delta files
|
||||
mock_client.service.storage.list_objects.return_value = {
|
||||
"objects": [
|
||||
{"key": "reference.bin", "size": 20000000, "last_modified": "2024-01-01"},
|
||||
{"key": "regular.pdf", "size": 1000000, "last_modified": "2024-01-02"},
|
||||
],
|
||||
"is_truncated": False,
|
||||
}
|
||||
mock_client.service.storage.head.return_value = None
|
||||
|
||||
# Execute
|
||||
stats = get_bucket_stats(mock_client, "orphaned-ref-bucket")
|
||||
|
||||
# Verify stats
|
||||
assert stats.object_count == 1 # Only regular.pdf
|
||||
assert stats.total_size == 1000000 # Only regular.pdf size
|
||||
assert stats.compressed_size == 1000000 # reference.bin NOT included
|
||||
assert stats.space_saved == 0
|
||||
assert stats.delta_objects == 0
|
||||
assert stats.direct_objects == 1
|
||||
|
||||
# Verify warning was logged
|
||||
warning_calls = mock_client.service.logger.warning.call_args_list
|
||||
assert any("ORPHANED REFERENCE FILE" in str(call) for call in warning_calls)
|
||||
assert any("20,000,000 bytes" in str(call) for call in warning_calls)
|
||||
assert any(
|
||||
"aws s3 rm s3://orphaned-ref-bucket/reference.bin" in str(call)
|
||||
for call in warning_calls
|
||||
)
|
||||
|
||||
def test_mixed_bucket(self, mock_client):
|
||||
"""Test bucket with both delta and direct files."""
|
||||
# Setup: Mixed bucket
|
||||
mock_client.service.storage.list_objects.return_value = {
|
||||
"objects": [
|
||||
{"key": "pro/reference.bin", "size": 20000000, "last_modified": "2024-01-01"},
|
||||
{"key": "pro/v1.zip.delta", "size": 50000, "last_modified": "2024-01-02"},
|
||||
{"key": "pro/v2.zip.delta", "size": 60000, "last_modified": "2024-01-03"},
|
||||
{"key": "docs/readme.pdf", "size": 500000, "last_modified": "2024-01-04"},
|
||||
{"key": "docs/manual.html", "size": 300000, "last_modified": "2024-01-05"},
|
||||
],
|
||||
"is_truncated": False,
|
||||
}
|
||||
|
||||
# Mock metadata for delta files
|
||||
def mock_head(path):
|
||||
if "v1.zip.delta" in path:
|
||||
head = Mock()
|
||||
head.metadata = {"dg-file-size": "19500000"}
|
||||
return head
|
||||
elif "v2.zip.delta" in path:
|
||||
head = Mock()
|
||||
head.metadata = {"dg-file-size": "19600000"}
|
||||
return head
|
||||
return None
|
||||
|
||||
mock_client.service.storage.head.side_effect = mock_head
|
||||
|
||||
# Execute
|
||||
stats = get_bucket_stats(mock_client, "mixed-bucket", mode="detailed")
|
||||
|
||||
# Verify
|
||||
assert stats.object_count == 4 # 2 delta + 2 direct files
|
||||
assert stats.total_size == 39900000 # 19.5M + 19.6M + 0.5M + 0.3M
|
||||
assert stats.compressed_size == 20910000 # ref (20M) + deltas (110K) + direct (800K)
|
||||
assert stats.space_saved == 18990000
|
||||
assert stats.delta_objects == 2
|
||||
assert stats.direct_objects == 2
|
||||
|
||||
def test_sha1_files_included(self, mock_client):
|
||||
"""Test that .sha1 checksum files are counted properly."""
|
||||
# Setup: Bucket with .sha1 files
|
||||
mock_client.service.storage.list_objects.return_value = {
|
||||
"objects": [
|
||||
{"key": "file1.zip", "size": 1000000, "last_modified": "2024-01-01"},
|
||||
{"key": "file1.zip.sha1", "size": 41, "last_modified": "2024-01-01"},
|
||||
{"key": "file2.tar", "size": 2000000, "last_modified": "2024-01-02"},
|
||||
{"key": "file2.tar.sha1", "size": 41, "last_modified": "2024-01-02"},
|
||||
],
|
||||
"is_truncated": False,
|
||||
}
|
||||
mock_client.service.storage.head.return_value = None
|
||||
|
||||
# Execute
|
||||
stats = get_bucket_stats(mock_client, "sha1-bucket")
|
||||
|
||||
# Verify - .sha1 files ARE counted
|
||||
assert stats.object_count == 4
|
||||
assert stats.total_size == 3000082 # All files including .sha1
|
||||
assert stats.compressed_size == 3000082
|
||||
assert stats.direct_objects == 4
|
||||
|
||||
def test_multiple_deltaspaces(self, mock_client):
|
||||
"""Test bucket with multiple deltaspaces (different prefixes)."""
|
||||
# Setup: Multiple deltaspaces
|
||||
mock_client.service.storage.list_objects.return_value = {
|
||||
"objects": [
|
||||
{"key": "pro/reference.bin", "size": 20000000, "last_modified": "2024-01-01"},
|
||||
{"key": "pro/v1.zip.delta", "size": 50000, "last_modified": "2024-01-02"},
|
||||
{
|
||||
"key": "enterprise/reference.bin",
|
||||
"size": 25000000,
|
||||
"last_modified": "2024-01-03",
|
||||
},
|
||||
{"key": "enterprise/v1.zip.delta", "size": 70000, "last_modified": "2024-01-04"},
|
||||
],
|
||||
"is_truncated": False,
|
||||
}
|
||||
|
||||
# Mock metadata
|
||||
def mock_head(path):
|
||||
if "pro/v1.zip.delta" in path:
|
||||
head = Mock()
|
||||
head.metadata = {"dg-file-size": "19500000"}
|
||||
return head
|
||||
elif "enterprise/v1.zip.delta" in path:
|
||||
head = Mock()
|
||||
head.metadata = {"dg-file-size": "24500000"}
|
||||
return head
|
||||
return None
|
||||
|
||||
mock_client.service.storage.head.side_effect = mock_head
|
||||
|
||||
# Execute
|
||||
stats = get_bucket_stats(mock_client, "multi-deltaspace-bucket", mode="detailed")
|
||||
|
||||
# Verify
|
||||
assert stats.object_count == 2 # Only delta files
|
||||
assert stats.total_size == 44000000 # 19.5M + 24.5M
|
||||
assert stats.compressed_size == 45120000 # Both references + both deltas
|
||||
assert stats.delta_objects == 2
|
||||
assert stats.direct_objects == 0
|
||||
|
||||
def test_pagination_handling(self, mock_client):
|
||||
"""Test handling of paginated results."""
|
||||
# Setup: Paginated responses
|
||||
mock_client.service.storage.list_objects.side_effect = [
|
||||
{
|
||||
"objects": [
|
||||
{"key": f"file{i}.txt", "size": 1000, "last_modified": "2024-01-01"}
|
||||
for i in range(1000)
|
||||
],
|
||||
"is_truncated": True,
|
||||
"next_continuation_token": "token1",
|
||||
},
|
||||
{
|
||||
"objects": [
|
||||
{"key": f"file{i}.txt", "size": 1000, "last_modified": "2024-01-01"}
|
||||
for i in range(1000, 1500)
|
||||
],
|
||||
"is_truncated": False,
|
||||
},
|
||||
]
|
||||
mock_client.service.storage.head.return_value = None
|
||||
|
||||
# Execute
|
||||
stats = get_bucket_stats(mock_client, "paginated-bucket")
|
||||
|
||||
# Verify
|
||||
assert stats.object_count == 1500
|
||||
assert stats.total_size == 1500000
|
||||
assert stats.compressed_size == 1500000
|
||||
assert stats.direct_objects == 1500
|
||||
|
||||
# Verify pagination was handled
|
||||
assert mock_client.service.storage.list_objects.call_count == 2
|
||||
|
||||
def test_delta_file_without_metadata(self, mock_client):
|
||||
"""Test handling of delta files with missing metadata in quick mode."""
|
||||
# Setup: Delta file without metadata
|
||||
mock_client.service.storage.list_objects.return_value = {
|
||||
"objects": [
|
||||
{"key": "reference.bin", "size": 20000000, "last_modified": "2024-01-01"},
|
||||
{"key": "file.zip.delta", "size": 50000, "last_modified": "2024-01-02"},
|
||||
],
|
||||
"is_truncated": False,
|
||||
}
|
||||
|
||||
# No metadata available (quick mode doesn't fetch metadata)
|
||||
mock_client.service.storage.head.return_value = None
|
||||
|
||||
# Execute in quick mode (default)
|
||||
stats = get_bucket_stats(mock_client, "no-metadata-bucket", mode="quick")
|
||||
|
||||
# Verify - without metadata, original size cannot be calculated
|
||||
assert stats.object_count == 1
|
||||
assert stats.total_size == 0 # Cannot calculate without metadata
|
||||
assert stats.compressed_size == 20050000 # reference + delta
|
||||
assert stats.space_saved == 0 # Cannot calculate without metadata
|
||||
assert stats.delta_objects == 1
|
||||
|
||||
# Verify warning was logged about incomplete stats in quick mode
|
||||
warning_calls = mock_client.service.logger.warning.call_args_list
|
||||
assert any("Quick mode cannot calculate" in str(call) for call in warning_calls)
|
||||
|
||||
def test_parallel_metadata_fetching(self, mock_client):
|
||||
"""Test that metadata is fetched in parallel for performance."""
|
||||
# Setup: Many delta files
|
||||
num_deltas = 50
|
||||
objects = [{"key": "reference.bin", "size": 20000000, "last_modified": "2024-01-01"}]
|
||||
objects.extend(
|
||||
[
|
||||
{
|
||||
"key": f"file{i}.zip.delta",
|
||||
"size": 50000 + i,
|
||||
"last_modified": f"2024-01-{i + 2:02d}",
|
||||
}
|
||||
for i in range(num_deltas)
|
||||
]
|
||||
)
|
||||
|
||||
mock_client.service.storage.list_objects.return_value = {
|
||||
"objects": objects,
|
||||
"is_truncated": False,
|
||||
}
|
||||
|
||||
# Mock metadata
|
||||
def mock_head(path):
|
||||
head = Mock()
|
||||
head.metadata = {"dg-file-size": "19500000"}
|
||||
return head
|
||||
|
||||
mock_client.service.storage.head.side_effect = mock_head
|
||||
|
||||
# Execute with mocked ThreadPoolExecutor
|
||||
with patch("concurrent.futures.ThreadPoolExecutor") as mock_executor:
|
||||
mock_pool = MagicMock()
|
||||
mock_executor.return_value.__enter__.return_value = mock_pool
|
||||
|
||||
# Simulate parallel execution
|
||||
futures = []
|
||||
for i in range(num_deltas):
|
||||
future = Mock()
|
||||
future.result.return_value = (f"file{i}.zip.delta", {"dg-file-size": "19500000"})
|
||||
futures.append(future)
|
||||
|
||||
mock_pool.submit.side_effect = futures
|
||||
patch_as_completed = patch(
|
||||
"concurrent.futures.as_completed",
|
||||
return_value=futures,
|
||||
)
|
||||
|
||||
with patch_as_completed:
|
||||
_ = get_bucket_stats(mock_client, "parallel-bucket", mode="detailed")
|
||||
|
||||
# Verify ThreadPoolExecutor was used with correct max_workers
|
||||
mock_executor.assert_called_once_with(max_workers=10) # min(10, 50) = 10
|
||||
|
||||
    def test_stats_modes_control_metadata_fetch(self, mock_client):
        """Metadata fetching should depend on the selected stats mode."""
        mock_client.service.storage.list_objects.return_value = {
            "objects": [
                {"key": "alpha/reference.bin", "size": 100, "last_modified": "2024-01-01"},
                {"key": "alpha/file1.zip.delta", "size": 10, "last_modified": "2024-01-02"},
                {"key": "alpha/file2.zip.delta", "size": 12, "last_modified": "2024-01-03"},
                {"key": "beta/reference.bin", "size": 200, "last_modified": "2024-01-04"},
                {"key": "beta/file1.zip.delta", "size": 20, "last_modified": "2024-01-05"},
            ],
            "is_truncated": False,
        }

        metadata_by_key = {
            "alpha/file1.zip.delta": {"dg-file-size": "100", "compression_ratio": "0.9"},
            "alpha/file2.zip.delta": {"dg-file-size": "120", "compression_ratio": "0.88"},
            "beta/file1.zip.delta": {"dg-file-size": "210", "compression_ratio": "0.9"},
        }

        def mock_head(path: str):
            for key, metadata in metadata_by_key.items():
                if key in path:
                    head = Mock()
                    head.metadata = metadata
                    return head
            return None

        mock_client.service.storage.head.side_effect = mock_head

        # Quick mode: no metadata fetch
        _ = get_bucket_stats(mock_client, "mode-test", mode="quick")
        assert mock_client.service.storage.head.call_count == 0

        # Sampled mode: one HEAD per delta-space (alpha, beta)
        mock_client.service.storage.head.reset_mock()
        stats_sampled = get_bucket_stats(mock_client, "mode-test", mode="sampled")
        assert mock_client.service.storage.head.call_count == 2

        # Detailed mode: HEAD for every delta (3 total)
        mock_client.service.storage.head.reset_mock()
        stats_detailed = get_bucket_stats(mock_client, "mode-test", mode="detailed")
        assert mock_client.service.storage.head.call_count == 3

        # Sampled totals should be close to detailed but not identical
        assert stats_detailed.total_size == 100 + 120 + 210
        assert stats_sampled.total_size == 100 + 100 + 210

    def test_error_handling_in_metadata_fetch(self, mock_client):
        """Test graceful handling of errors during metadata fetch."""
        # Setup
        mock_client.service.storage.list_objects.return_value = {
            "objects": [
                {"key": "reference.bin", "size": 20000000, "last_modified": "2024-01-01"},
                {"key": "file1.zip.delta", "size": 50000, "last_modified": "2024-01-02"},
                {"key": "file2.zip.delta", "size": 60000, "last_modified": "2024-01-03"},
            ],
            "is_truncated": False,
        }

        # Mock metadata fetch to fail for one file
        def mock_head(path):
            if "file1.zip.delta" in path:
                raise Exception("S3 error")
            elif "file2.zip.delta" in path:
                head = Mock()
                head.metadata = {"dg-file-size": "19600000"}
                return head
            return None

        mock_client.service.storage.head.side_effect = mock_head

        # Execute - should handle error gracefully
        stats = get_bucket_stats(mock_client, "error-bucket", mode="detailed")

        # Verify - file1 has no metadata (error), file2 uses metadata
        assert stats.object_count == 2
        assert stats.delta_objects == 2
        # file1 has no metadata so not counted in original size, file2 uses metadata (19600000)
        assert stats.total_size == 19600000

        # Verify warning was logged for file1
        warning_calls = mock_client.service.logger.warning.call_args_list
        assert any(
            "file1.zip.delta" in str(call) and "no original_size metadata" in str(call)
            for call in warning_calls
        )

    def test_multiple_orphaned_references(self, mock_client):
        """Test detection of multiple orphaned reference.bin files."""
        # Setup: Multiple orphaned references
        mock_client.service.storage.list_objects.return_value = {
            "objects": [
                {"key": "pro/reference.bin", "size": 20000000, "last_modified": "2024-01-01"},
                {
                    "key": "enterprise/reference.bin",
                    "size": 25000000,
                    "last_modified": "2024-01-02",
                },
                {"key": "community/reference.bin", "size": 15000000, "last_modified": "2024-01-03"},
                {"key": "regular.pdf", "size": 1000000, "last_modified": "2024-01-04"},
            ],
            "is_truncated": False,
        }
        mock_client.service.storage.head.return_value = None

        # Execute
        stats = get_bucket_stats(mock_client, "multi-orphaned-bucket")

        # Verify stats
        assert stats.object_count == 1  # Only regular.pdf
        assert stats.total_size == 1000000
        assert stats.compressed_size == 1000000  # No references included
        assert stats.space_saved == 0

        # Verify warnings for all orphaned references
        warning_calls = [str(call) for call in mock_client.service.logger.warning.call_args_list]
        warning_text = " ".join(warning_calls)

        assert "ORPHANED REFERENCE FILE" in warning_text
        assert "3 reference.bin file(s)" in warning_text
        assert "60,000,000 bytes" in warning_text  # Total of all references
        assert "s3://multi-orphaned-bucket/pro/reference.bin" in warning_text
        assert "s3://multi-orphaned-bucket/enterprise/reference.bin" in warning_text
        assert "s3://multi-orphaned-bucket/community/reference.bin" in warning_text
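The parallel-fetch test above only asserts on how the executor is wired up (max_workers = min(10, number of deltas)). As a rough illustration of the fan-out it implies, a helper along the following lines could gather per-delta metadata with a thread pool; the name _fetch_delta_metadata, its signature, and the key format are assumptions made for this sketch, not the actual deltaglider implementation.

import concurrent.futures
from typing import Any


def _fetch_delta_metadata(storage: Any, bucket: str, keys: list[str], max_workers: int = 10) -> dict[str, dict]:
    """Hypothetical helper: HEAD each delta object in parallel and collect its metadata."""
    results: dict[str, dict] = {}
    workers = min(max_workers, len(keys)) or 1
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
        future_to_key = {pool.submit(storage.head, f"{bucket}/{key}"): key for key in keys}
        for future in concurrent.futures.as_completed(future_to_key):
            key = future_to_key[future]
            try:
                head = future.result()
                results[key] = head.metadata if head else {}
            except Exception:
                # Match the behaviour the error-handling test expects: skip the file, never raise.
                results[key] = {}
    return results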
284
tests/unit/test_stats_caching.py
Normal file
@@ -0,0 +1,284 @@
"""Unit tests for bucket stats caching functionality."""

import json
from unittest.mock import MagicMock

from deltaglider.client_models import BucketStats
from deltaglider.client_operations.stats import (
    _get_cache_key,
    _is_cache_valid,
    _read_stats_cache,
    _write_stats_cache,
)


def test_get_cache_key():
    """Test cache key generation for different modes."""
    assert _get_cache_key("quick") == ".deltaglider/stats_quick.json"
    assert _get_cache_key("sampled") == ".deltaglider/stats_sampled.json"
    assert _get_cache_key("detailed") == ".deltaglider/stats_detailed.json"


def test_is_cache_valid_when_unchanged():
    """Test cache validation when bucket hasn't changed."""
    cached_validation = {
        "object_count": 100,
        "compressed_size": 50000,
    }

    assert _is_cache_valid(cached_validation, 100, 50000) is True


def test_is_cache_valid_when_count_changed():
    """Test cache validation when object count changed."""
    cached_validation = {
        "object_count": 100,
        "compressed_size": 50000,
    }

    # Object count changed
    assert _is_cache_valid(cached_validation, 101, 50000) is False


def test_is_cache_valid_when_size_changed():
    """Test cache validation when compressed size changed."""
    cached_validation = {
        "object_count": 100,
        "compressed_size": 50000,
    }

    # Compressed size changed
    assert _is_cache_valid(cached_validation, 100, 60000) is False

def test_write_and_read_cache_roundtrip():
    """Test writing and reading cache with valid data."""
    # Create mock client and storage
    mock_storage = MagicMock()
    mock_logger = MagicMock()
    mock_service = MagicMock()
    mock_service.storage = mock_storage
    mock_service.logger = mock_logger
    mock_client = MagicMock()
    mock_client.service = mock_service

    # Create test stats
    test_stats = BucketStats(
        bucket="test-bucket",
        object_count=150,
        total_size=1000000,
        compressed_size=50000,
        space_saved=950000,
        average_compression_ratio=0.95,
        delta_objects=140,
        direct_objects=10,
    )

    # Capture what was written to storage
    written_data = None

    def capture_put(address, data, metadata):
        nonlocal written_data
        written_data = data

    mock_storage.put = capture_put

    # Write cache
    _write_stats_cache(
        client=mock_client,
        bucket="test-bucket",
        mode="quick",
        stats=test_stats,
        object_count=150,
        compressed_size=50000,
    )

    # Verify something was written
    assert written_data is not None

    # Parse written data
    cache_data = json.loads(written_data.decode("utf-8"))

    # Verify structure
    assert cache_data["version"] == "1.0"
    assert cache_data["mode"] == "quick"
    assert "computed_at" in cache_data
    assert cache_data["validation"]["object_count"] == 150
    assert cache_data["validation"]["compressed_size"] == 50000
    assert cache_data["stats"]["bucket"] == "test-bucket"
    assert cache_data["stats"]["object_count"] == 150
    assert cache_data["stats"]["delta_objects"] == 140

    # Now test reading it back
    mock_obj = MagicMock()
    mock_obj.data = written_data
    mock_storage.get = MagicMock(return_value=mock_obj)

    stats, validation = _read_stats_cache(mock_client, "test-bucket", "quick")

    # Verify read stats match original
    assert stats is not None
    assert validation is not None
    assert stats.bucket == "test-bucket"
    assert stats.object_count == 150
    assert stats.delta_objects == 140
    assert stats.average_compression_ratio == 0.95
    assert validation["object_count"] == 150
    assert validation["compressed_size"] == 50000

def test_read_cache_missing_file():
    """Test reading cache when file doesn't exist."""
    mock_storage = MagicMock()
    mock_logger = MagicMock()
    mock_service = MagicMock()
    mock_service.storage = mock_storage
    mock_service.logger = mock_logger
    mock_client = MagicMock()
    mock_client.service = mock_service

    # Simulate FileNotFoundError
    mock_storage.get.side_effect = FileNotFoundError("No such key")

    stats, validation = _read_stats_cache(mock_client, "test-bucket", "quick")

    assert stats is None
    assert validation is None


def test_read_cache_invalid_json():
    """Test reading cache with corrupted JSON."""
    mock_storage = MagicMock()
    mock_logger = MagicMock()
    mock_service = MagicMock()
    mock_service.storage = mock_storage
    mock_service.logger = mock_logger
    mock_client = MagicMock()
    mock_client.service = mock_service

    # Return invalid JSON
    mock_obj = MagicMock()
    mock_obj.data = b"not valid json {]["
    mock_storage.get = MagicMock(return_value=mock_obj)

    stats, validation = _read_stats_cache(mock_client, "test-bucket", "quick")

    assert stats is None
    assert validation is None
    mock_logger.warning.assert_called_once()

def test_read_cache_version_mismatch():
    """Test reading cache with wrong version."""
    mock_storage = MagicMock()
    mock_logger = MagicMock()
    mock_service = MagicMock()
    mock_service.storage = mock_storage
    mock_service.logger = mock_logger
    mock_client = MagicMock()
    mock_client.service = mock_service

    # Cache with wrong version
    cache_data = {
        "version": "2.0",  # Wrong version
        "mode": "quick",
        "validation": {"object_count": 100, "compressed_size": 50000},
        "stats": {
            "bucket": "test",
            "object_count": 100,
            "total_size": 1000,
            "compressed_size": 500,
            "space_saved": 500,
            "average_compression_ratio": 0.5,
            "delta_objects": 90,
            "direct_objects": 10,
        },
    }

    mock_obj = MagicMock()
    mock_obj.data = json.dumps(cache_data).encode("utf-8")
    mock_storage.get = MagicMock(return_value=mock_obj)

    stats, validation = _read_stats_cache(mock_client, "test-bucket", "quick")

    assert stats is None
    assert validation is None
    mock_logger.warning.assert_called_once()


def test_read_cache_mode_mismatch():
    """Test reading cache with wrong mode."""
    mock_storage = MagicMock()
    mock_logger = MagicMock()
    mock_service = MagicMock()
    mock_service.storage = mock_storage
    mock_service.logger = mock_logger
    mock_client = MagicMock()
    mock_client.service = mock_service

    # Cache with mismatched mode
    cache_data = {
        "version": "1.0",
        "mode": "detailed",  # Wrong mode
        "validation": {"object_count": 100, "compressed_size": 50000},
        "stats": {
            "bucket": "test",
            "object_count": 100,
            "total_size": 1000,
            "compressed_size": 500,
            "space_saved": 500,
            "average_compression_ratio": 0.5,
            "delta_objects": 90,
            "direct_objects": 10,
        },
    }

    mock_obj = MagicMock()
    mock_obj.data = json.dumps(cache_data).encode("utf-8")
    mock_storage.get = MagicMock(return_value=mock_obj)

    # Request "quick" mode but cache has "detailed"
    stats, validation = _read_stats_cache(mock_client, "test-bucket", "quick")

    assert stats is None
    assert validation is None
    mock_logger.warning.assert_called_once()

def test_write_cache_handles_errors_gracefully():
    """Test that cache write failures don't crash the program."""
    mock_storage = MagicMock()
    mock_logger = MagicMock()
    mock_service = MagicMock()
    mock_service.storage = mock_storage
    mock_service.logger = mock_logger
    mock_client = MagicMock()
    mock_client.service = mock_service

    # Simulate S3 permission error
    mock_storage.put.side_effect = PermissionError("Access denied")

    test_stats = BucketStats(
        bucket="test-bucket",
        object_count=150,
        total_size=1000000,
        compressed_size=50000,
        space_saved=950000,
        average_compression_ratio=0.95,
        delta_objects=140,
        direct_objects=10,
    )

    # Should not raise exception
    _write_stats_cache(
        client=mock_client,
        bucket="test-bucket",
        mode="quick",
        stats=test_stats,
        object_count=150,
        compressed_size=50000,
    )

    # Should log warning
    mock_logger.warning.assert_called_once()
    assert "Failed to write cache" in str(mock_logger.warning.call_args)
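Taken together, the caching tests pin down a small JSON contract: a version and mode field, a validation block used as a cheap change fingerprint, and the serialized stats themselves, stored under .deltaglider/stats_<mode>.json. The sketch below restates that contract as data plus a validity check; the example_cache dict and the cache_is_usable function are illustrative only and mirror the assertions above rather than the real stats module.

import json
from datetime import datetime, timezone

# Illustrative payload built from the fields the tests assert on.
example_cache = {
    "version": "1.0",
    "mode": "quick",
    "computed_at": datetime.now(timezone.utc).isoformat(),
    "validation": {"object_count": 150, "compressed_size": 50000},
    "stats": {
        "bucket": "test-bucket",
        "object_count": 150,
        "total_size": 1000000,
        "compressed_size": 50000,
        "space_saved": 950000,
        "average_compression_ratio": 0.95,
        "delta_objects": 140,
        "direct_objects": 10,
    },
}


def cache_is_usable(cache: dict, mode: str, object_count: int, compressed_size: int) -> bool:
    """Hypothetical read-path check: reject on version, mode, or fingerprint mismatch."""
    validation = cache.get("validation", {})
    return (
        cache.get("version") == "1.0"
        and cache.get("mode") == mode
        and validation.get("object_count") == object_count
        and validation.get("compressed_size") == compressed_size
    )


# The bytes written to the per-mode cache key would then simply be:
payload = json.dumps(example_cache).encode("utf-8")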