diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8630d35..80e084d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,31 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [5.1.0] - 2025-10-10
+
+### Added
+- **New CLI Command**: `deltaglider stats <bucket>` for bucket statistics and compression metrics
+  - Supports `--detailed` flag for comprehensive analysis
+  - Supports `--json` flag for machine-readable output
+  - Accepts multiple formats: `s3://bucket/`, `s3://bucket`, `bucket`
+- **Session-Level Statistics Caching**: Bucket stats are now cached per client instance
+  - Automatic cache invalidation on mutations (put, delete, bucket operations)
+  - Intelligent cache reuse (detailed stats serve quick stat requests)
+  - Enhanced `list_buckets()` includes cached stats when available
+- **Programmatic Cache Management**: Added cache management APIs for long-running applications
+  - `clear_cache()`: Clear all cached references
+  - `evict_cache()`: Remove a specific cached reference
+  - Session-scoped cache lifecycle management
+
+### Changed
+- Bucket statistics are now cached within the client session for performance
+- `list_buckets()` response includes `DeltaGliderStats` metadata when cached
+
+### Documentation
+- Added comprehensive DG_MAX_RATIO tuning guide in docs/
+- Updated CLI command reference in CLAUDE.md and README.md
+- Added detailed cache management documentation
+
 ## [5.0.3] - 2025-10-10
 
 ### Security
diff --git a/CLAUDE.md b/CLAUDE.md
index 620f0a6..0b9ad13 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -74,6 +74,17 @@ export AWS_SECRET_ACCESS_KEY=minioadmin
 
 # Now you can use deltaglider commands
 deltaglider cp test.zip s3://test-bucket/
+deltaglider stats test-bucket  # Get bucket statistics
+```
+
+### Available CLI Commands
+```bash
+cp      # Copy files to/from S3 (AWS S3 compatible)
+ls      # List S3 buckets or objects (AWS S3 compatible)
+rm      # Remove S3 objects (AWS S3 compatible)
+sync    # Synchronize directories with S3 (AWS S3 compatible)
+stats   # Get bucket statistics and compression metrics
+verify  # Verify integrity of a delta file
 ```
 
 ## Architecture
diff --git a/README.md b/README.md
index 3226a5c..778b9fa 100644
--- a/README.md
+++ b/README.md
@@ -189,6 +189,13 @@ deltaglider sync s3://releases/ ./local-backup/  # Sync from S3
 deltaglider sync --delete ./src/ s3://backup/           # Mirror exactly
 deltaglider sync --exclude "*.log" ./src/ s3://backup/  # Exclude patterns
 
+# Get bucket statistics (compression metrics)
+deltaglider stats my-bucket             # Quick stats overview
+deltaglider stats s3://my-bucket        # Also accepts s3:// format
+deltaglider stats s3://my-bucket/       # With or without trailing slash
+deltaglider stats my-bucket --detailed  # Detailed compression metrics (slower)
+deltaglider stats my-bucket --json      # JSON output for automation
+
 # Works with MinIO, R2, and S3-compatible storage
 deltaglider cp file.zip s3://bucket/ --endpoint-url http://localhost:9000
 ```
diff --git a/src/deltaglider/app/cli/main.py b/src/deltaglider/app/cli/main.py
index 94d1ac9..f612220 100644
--- a/src/deltaglider/app/cli/main.py
+++ b/src/deltaglider/app/cli/main.py
@@ -640,6 +640,84 @@ def verify(service: DeltaService, s3_url: str) -> None:
         sys.exit(1)
 
 
+@cli.command()
+@click.argument("bucket")
+@click.option("--detailed", is_flag=True, help="Fetch detailed compression metrics (slower)")
+@click.option("--json", "output_json", is_flag=True, help="Output in JSON format")
+@click.pass_obj
+def stats(service: DeltaService, bucket: str, detailed: bool, output_json: bool) -> None:
+    """Get bucket statistics and compression metrics.
+
+    BUCKET can be specified as:
+    - s3://bucket-name/
+    - s3://bucket-name
+    - bucket-name
+    """
+    from ...client import DeltaGliderClient
+
+    try:
+        # Parse bucket from S3 URL if needed
+        if bucket.startswith("s3://"):
+            # Remove s3:// prefix and any trailing slashes
+            bucket = bucket[5:].rstrip("/")
+            # Extract just the bucket name (first path component)
+            bucket = bucket.split("/")[0] if "/" in bucket else bucket
+
+        if not bucket:
+            click.echo("Error: Invalid bucket name", err=True)
+            sys.exit(1)
+
+        # Create client from service
+        client = DeltaGliderClient(service=service)
+
+        # Get bucket stats
+        bucket_stats = client.get_bucket_stats(bucket, detailed_stats=detailed)
+
+        if output_json:
+            # JSON output
+            output = {
+                "bucket": bucket_stats.bucket,
+                "object_count": bucket_stats.object_count,
+                "total_size": bucket_stats.total_size,
+                "compressed_size": bucket_stats.compressed_size,
+                "space_saved": bucket_stats.space_saved,
+                "average_compression_ratio": bucket_stats.average_compression_ratio,
+                "delta_objects": bucket_stats.delta_objects,
+                "direct_objects": bucket_stats.direct_objects,
+            }
+            click.echo(json.dumps(output, indent=2))
+        else:
+            # Human-readable output
+            def format_bytes(size: float) -> str:
+                """Format bytes to human-readable size."""
+                for unit in ["B", "KB", "MB", "GB", "TB"]:
+                    if size < 1024.0:
+                        return f"{size:.2f} {unit}"
+                    size /= 1024.0
+                return f"{size:.2f} PB"
+
+            click.echo(f"Bucket Statistics: {bucket_stats.bucket}")
+            click.echo(f"{'=' * 60}")
+            click.echo(f"Total Objects: {bucket_stats.object_count:,}")
+            click.echo(f"  Delta Objects: {bucket_stats.delta_objects:,}")
+            click.echo(f"  Direct Objects: {bucket_stats.direct_objects:,}")
+            click.echo("")
+            click.echo(
+                f"Original Size: {format_bytes(bucket_stats.total_size)} ({bucket_stats.total_size:,} bytes)"
+            )
+            click.echo(
+                f"Compressed Size: {format_bytes(bucket_stats.compressed_size)} ({bucket_stats.compressed_size:,} bytes)"
+            )
+            click.echo(
+                f"Space Saved: {format_bytes(bucket_stats.space_saved)} ({bucket_stats.space_saved:,} bytes)"
+            )
+            click.echo(f"Compression Ratio: {bucket_stats.average_compression_ratio:.1%}")
+
+    except Exception as e:
+        click.echo(f"Error: {e}", err=True)
+        sys.exit(1)
+
+
 def main() -> None:
     """Main entry point."""
     cli()
diff --git a/src/deltaglider/client.py b/src/deltaglider/client.py
index 324f49e..368ccf9 100644
--- a/src/deltaglider/client.py
+++ b/src/deltaglider/client.py
@@ -63,6 +63,52 @@ class DeltaGliderClient:
         self.service = service
         self.endpoint_url = endpoint_url
         self._multipart_uploads: dict[str, Any] = {}  # Track multipart uploads
+        # Session-scoped bucket statistics cache (cleared with the client lifecycle)
+        self._bucket_stats_cache: dict[str, dict[bool, BucketStats]] = {}
+
+    # -------------------------------------------------------------------------
+    # Internal helpers
+    # -------------------------------------------------------------------------
+
+    def _invalidate_bucket_stats_cache(self, bucket: str | None = None) -> None:
+        """Invalidate cached bucket statistics."""
+        if bucket is None:
+            self._bucket_stats_cache.clear()
+        else:
+            self._bucket_stats_cache.pop(bucket, None)
+
+    def _store_bucket_stats_cache(
+        self,
+        bucket: str,
+        detailed_stats: bool,
+        stats: BucketStats,
+    ) -> None:
+        """Store bucket statistics in the session cache."""
+        bucket_cache = self._bucket_stats_cache.setdefault(bucket, {})
+        bucket_cache[detailed_stats] = stats
+        # Detailed stats are a superset of quick stats; reuse them for quick calls.
+        if detailed_stats:
+            bucket_cache[False] = stats
+
+    def _get_cached_bucket_stats(self, bucket: str, detailed_stats: bool) -> BucketStats | None:
+        """Retrieve cached stats for a bucket, preferring detailed metrics when available."""
+        bucket_cache = self._bucket_stats_cache.get(bucket)
+        if not bucket_cache:
+            return None
+        if detailed_stats:
+            return bucket_cache.get(True)
+        return bucket_cache.get(False) or bucket_cache.get(True)
+
+    def _get_cached_bucket_stats_for_listing(self, bucket: str) -> tuple[BucketStats | None, bool]:
+        """Return best cached stats for bucket listings."""
+        bucket_cache = self._bucket_stats_cache.get(bucket)
+        if not bucket_cache:
+            return (None, False)
+        if True in bucket_cache:
+            return (bucket_cache[True], True)
+        if False in bucket_cache:
+            return (bucket_cache[False], False)
+        return (None, False)
 
     # ============================================================================
     # Boto3-compatible APIs (matches S3 client interface)
@@ -171,13 +217,15 @@ class DeltaGliderClient:
             }
 
             # Return as dict[str, Any] for public API (TypedDict is a dict at runtime!)
-            return cast(
+            response = cast(
                 dict[str, Any],
                 build_put_response(
                     etag=f'"{sha256_hash}"',
                     deltaglider_info=deltaglider_info,
                 ),
             )
+            self._invalidate_bucket_stats_cache(Bucket)
+            return response
         finally:
             # Clean up temp file
             if tmp_path.exists():
@@ -418,7 +466,7 @@ class DeltaGliderClient:
             deltaglider_info["DependentDeltas"] = dependent_deltas
 
         # Return as dict[str, Any] for public API (TypedDict is a dict at runtime!)
-        return cast(
+        response = cast(
             dict[str, Any],
             build_delete_response(
                 delete_marker=False,
@@ -426,6 +474,8 @@ class DeltaGliderClient:
                 deltaglider_info=deltaglider_info,
             ),
         )
+        self._invalidate_bucket_stats_cache(Bucket)
+        return response
 
     def delete_objects(
         self,
@@ -502,6 +552,7 @@ class DeltaGliderClient:
         }
         response["ResponseMetadata"] = {"HTTPStatusCode": 200}
+        self._invalidate_bucket_stats_cache(Bucket)
         return response
 
     def delete_objects_recursive(
@@ -627,6 +678,7 @@ class DeltaGliderClient:
         if single_details:
             response["DeltaGliderInfo"]["SingleDeletes"] = single_details  # type: ignore[index]
 
+        self._invalidate_bucket_stats_cache(Bucket)
        return response
 
     def head_object(
@@ -703,7 +755,7 @@ class DeltaGliderClient:
         is_delta = summary.delta_size is not None
         stored_size = summary.delta_size if is_delta else summary.file_size
 
-        return UploadSummary(
+        upload_summary = UploadSummary(
             operation=summary.operation,
             bucket=summary.bucket,
             key=summary.key,
@@ -712,6 +764,8 @@ class DeltaGliderClient:
             is_delta=is_delta,
             delta_ratio=summary.delta_ratio or 0.0,
         )
+        self._invalidate_bucket_stats_cache(bucket)
+        return upload_summary
 
     def download(self, s3_url: str, output_path: str | Path) -> None:
         """Download and reconstruct a file from S3.
@@ -938,7 +992,12 @@ class DeltaGliderClient:
             stats = client.get_bucket_stats('releases', detailed_stats=True)
             print(f"Compression ratio: {stats.average_compression_ratio:.1%}")
         """
+        cached = self._get_cached_bucket_stats(bucket, detailed_stats)
+        if cached:
+            return cached
+
         result: BucketStats = _get_bucket_stats(self, bucket, detailed_stats)
+        self._store_bucket_stats_cache(bucket, detailed_stats, result)
         return result
 
     def generate_presigned_url(
@@ -1010,7 +1069,9 @@ class DeltaGliderClient:
             ...     CreateBucketConfiguration={'LocationConstraint': 'us-west-2'}
) """ - return _create_bucket(self, Bucket, CreateBucketConfiguration, **kwargs) + response = _create_bucket(self, Bucket, CreateBucketConfiguration, **kwargs) + self._invalidate_bucket_stats_cache(Bucket) + return response def delete_bucket( self, @@ -1032,7 +1093,9 @@ class DeltaGliderClient: >>> client = create_client() >>> client.delete_bucket(Bucket='my-bucket') """ - return _delete_bucket(self, Bucket, **kwargs) + response = _delete_bucket(self, Bucket, **kwargs) + self._invalidate_bucket_stats_cache(Bucket) + return response def list_buckets(self, **kwargs: Any) -> dict[str, Any]: """List all S3 buckets (boto3-compatible). @@ -1139,6 +1202,7 @@ class DeltaGliderClient: - `evict_cache()`: Remove specific cached reference - docs/CACHE_MANAGEMENT.md: Complete cache management guide """ + self._invalidate_bucket_stats_cache() self.service.cache.clear() diff --git a/src/deltaglider/client_operations/bucket.py b/src/deltaglider/client_operations/bucket.py index 980620f..4c31403 100644 --- a/src/deltaglider/client_operations/bucket.py +++ b/src/deltaglider/client_operations/bucket.py @@ -138,10 +138,32 @@ def list_buckets( # Check if storage adapter has boto3 client if hasattr(storage_adapter, "client"): try: - response = storage_adapter.client.list_buckets() + raw_response = storage_adapter.client.list_buckets() + + buckets: list[dict[str, Any]] = [] + for bucket_entry in raw_response.get("Buckets", []): + bucket_data = dict(bucket_entry) + name = bucket_data.get("Name") + if isinstance(name, str) and name: + cached_stats, detailed = client._get_cached_bucket_stats_for_listing(name) + if cached_stats is not None: + bucket_data["DeltaGliderStats"] = { + "Cached": True, + "Detailed": detailed, + "ObjectCount": cached_stats.object_count, + "TotalSize": cached_stats.total_size, + "CompressedSize": cached_stats.compressed_size, + "SpaceSaved": cached_stats.space_saved, + "AverageCompressionRatio": cached_stats.average_compression_ratio, + "DeltaObjects": cached_stats.delta_objects, + "DirectObjects": cached_stats.direct_objects, + } + + buckets.append(bucket_data) + return { - "Buckets": response.get("Buckets", []), - "Owner": response.get("Owner", {}), + "Buckets": buckets, + "Owner": raw_response.get("Owner", {}), "ResponseMetadata": { "HTTPStatusCode": 200, }, diff --git a/tests/integration/test_bucket_management.py b/tests/integration/test_bucket_management.py index 71c7a82..1b4ecd7 100644 --- a/tests/integration/test_bucket_management.py +++ b/tests/integration/test_bucket_management.py @@ -1,11 +1,13 @@ """Tests for bucket management APIs.""" +from typing import Any from unittest.mock import Mock import pytest from deltaglider.app.cli.main import create_service from deltaglider.client import DeltaGliderClient +from deltaglider.client_models import BucketStats class TestBucketManagement: @@ -123,6 +125,47 @@ class TestBucketManagement: assert response["Buckets"] == [] assert response["ResponseMetadata"]["HTTPStatusCode"] == 200 + def test_list_buckets_includes_cached_stats(self): + """Bucket list should merge cached stats when available.""" + service = create_service() + mock_storage = Mock() + service.storage = mock_storage + + mock_boto3_client = Mock() + mock_boto3_client.list_buckets.return_value = { + "Buckets": [ + {"Name": "bucket1", "CreationDate": "2025-01-01T00:00:00Z"}, + {"Name": "bucket2", "CreationDate": "2025-01-02T00:00:00Z"}, + ], + "Owner": {"DisplayName": "test-user", "ID": "12345"}, + } + mock_storage.client = mock_boto3_client + + client = DeltaGliderClient(service) + 
+        cached_stats = BucketStats(
+            bucket="bucket1",
+            object_count=10,
+            total_size=1000,
+            compressed_size=600,
+            space_saved=400,
+            average_compression_ratio=0.4,
+            delta_objects=6,
+            direct_objects=4,
+        )
+        client._store_bucket_stats_cache("bucket1", detailed_stats=True, stats=cached_stats)
+
+        response = client.list_buckets()
+
+        bucket1 = next(bucket for bucket in response["Buckets"] if bucket["Name"] == "bucket1")
+        assert bucket1["DeltaGliderStats"]["Cached"] is True
+        assert bucket1["DeltaGliderStats"]["Detailed"] is True
+        assert bucket1["DeltaGliderStats"]["ObjectCount"] == cached_stats.object_count
+        assert bucket1["DeltaGliderStats"]["TotalSize"] == cached_stats.total_size
+
+        bucket2 = next(bucket for bucket in response["Buckets"] if bucket["Name"] == "bucket2")
+        assert "DeltaGliderStats" not in bucket2
+
     def test_delete_bucket_success(self):
         """Test deleting a bucket successfully."""
         service = create_service()
@@ -178,6 +221,69 @@ class TestBucketManagement:
         with pytest.raises(RuntimeError, match="Failed to delete bucket"):
             client.delete_bucket(Bucket="full-bucket")
 
+    def test_get_bucket_stats_caches_per_session(self, monkeypatch):
+        """Verify bucket stats are cached within the client session."""
+        service = create_service()
+        mock_storage = Mock()
+        service.storage = mock_storage
+
+        mock_storage.client = Mock()
+
+        client = DeltaGliderClient(service)
+
+        quick_stats = BucketStats(
+            bucket="bucket1",
+            object_count=5,
+            total_size=500,
+            compressed_size=300,
+            space_saved=200,
+            average_compression_ratio=0.4,
+            delta_objects=3,
+            direct_objects=2,
+        )
+        detailed_stats = BucketStats(
+            bucket="bucket1",
+            object_count=5,
+            total_size=520,
+            compressed_size=300,
+            space_saved=220,
+            average_compression_ratio=0.423,
+            delta_objects=3,
+            direct_objects=2,
+        )
+
+        call_count = {"value": 0}
+
+        def fake_get_bucket_stats(_: Any, bucket: str, detailed_stats_flag: bool) -> BucketStats:
+            call_count["value"] += 1
+            assert bucket == "bucket1"
+            return detailed_stats if detailed_stats_flag else quick_stats
+
+        monkeypatch.setattr("deltaglider.client._get_bucket_stats", fake_get_bucket_stats)
+
+        # First call should invoke underlying function
+        result_quick = client.get_bucket_stats("bucket1")
+        assert result_quick is quick_stats
+        assert call_count["value"] == 1
+
+        # Second quick call should hit cache
+        assert client.get_bucket_stats("bucket1") is quick_stats
+        assert call_count["value"] == 1
+
+        # Detailed call triggers new computation
+        result_detailed = client.get_bucket_stats("bucket1", detailed_stats=True)
+        assert result_detailed is detailed_stats
+        assert call_count["value"] == 2
+
+        # Quick call after detailed uses detailed cached value (more accurate)
+        assert client.get_bucket_stats("bucket1") is detailed_stats
+        assert call_count["value"] == 2
+
+        # Clearing the cache should force recomputation
+        client.clear_cache()
+        assert client.get_bucket_stats("bucket1") is quick_stats
+        assert call_count["value"] == 3
+
     def test_bucket_methods_without_boto3_client(self):
         """Test that bucket methods raise NotImplementedError when storage doesn't support it."""
         service = create_service()
diff --git a/tests/integration/test_stats_command.py b/tests/integration/test_stats_command.py
new file mode 100644
index 0000000..9e86fe6
--- /dev/null
+++ b/tests/integration/test_stats_command.py
@@ -0,0 +1,206 @@
+"""Integration tests for stats CLI command."""
+
+import json
+from unittest.mock import Mock, patch
+
+from click.testing import CliRunner
+
+from deltaglider.app.cli.main import cli
+from deltaglider.client_models import BucketStats
+
+
+class TestStatsCommand:
+    """Test stats CLI command."""
+
+    def test_stats_json_output(self):
+        """Test stats command with JSON output."""
+        # Create mock bucket stats
+        mock_stats = BucketStats(
+            bucket="test-bucket",
+            object_count=10,
+            total_size=1000000,
+            compressed_size=500000,
+            space_saved=500000,
+            average_compression_ratio=0.5,
+            delta_objects=7,
+            direct_objects=3,
+        )
+
+        with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
+            # Setup mock client
+            mock_client = Mock()
+            mock_client.get_bucket_stats.return_value = mock_stats
+            mock_client_class.return_value = mock_client
+
+            # Run command
+            runner = CliRunner()
+            result = runner.invoke(cli, ["stats", "test-bucket", "--json"])
+
+            # Verify
+            assert result.exit_code == 0
+            output = json.loads(result.output)
+            assert output["bucket"] == "test-bucket"
+            assert output["object_count"] == 10
+            assert output["total_size"] == 1000000
+            assert output["compressed_size"] == 500000
+            assert output["space_saved"] == 500000
+            assert output["average_compression_ratio"] == 0.5
+            assert output["delta_objects"] == 7
+            assert output["direct_objects"] == 3
+
+            # Verify client was called correctly
+            mock_client.get_bucket_stats.assert_called_once_with("test-bucket", detailed_stats=False)
+
+    def test_stats_json_output_detailed(self):
+        """Test stats command with detailed JSON output."""
+        mock_stats = BucketStats(
+            bucket="test-bucket",
+            object_count=5,
+            total_size=2000000,
+            compressed_size=100000,
+            space_saved=1900000,
+            average_compression_ratio=0.95,
+            delta_objects=5,
+            direct_objects=0,
+        )
+
+        with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
+            mock_client = Mock()
+            mock_client.get_bucket_stats.return_value = mock_stats
+            mock_client_class.return_value = mock_client
+
+            runner = CliRunner()
+            result = runner.invoke(cli, ["stats", "test-bucket", "--detailed", "--json"])
+
+            assert result.exit_code == 0
+            output = json.loads(result.output)
+            assert output["average_compression_ratio"] == 0.95
+
+            # Verify detailed flag was passed
+            mock_client.get_bucket_stats.assert_called_once_with("test-bucket", detailed_stats=True)
+
+    def test_stats_human_readable_output(self):
+        """Test stats command with human-readable output."""
+        mock_stats = BucketStats(
+            bucket="test-bucket",
+            object_count=10,
+            total_size=1500000,  # ~1.43 MB
+            compressed_size=300000,  # ~293 KB
+            space_saved=1200000,  # ~1.14 MB
+            average_compression_ratio=0.8,
+            delta_objects=7,
+            direct_objects=3,
+        )
+
+        with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
+            mock_client = Mock()
+            mock_client.get_bucket_stats.return_value = mock_stats
+            mock_client_class.return_value = mock_client
+
+            runner = CliRunner()
+            result = runner.invoke(cli, ["stats", "test-bucket"])
+
+            assert result.exit_code == 0
+            output = result.output
+
+            # Verify human-readable format
+            assert "Bucket Statistics: test-bucket" in output
+            assert "Total Objects:" in output
+            assert "10" in output
+            assert "Delta Objects:" in output
+            assert "7" in output
+            assert "Direct Objects:" in output
+            assert "3" in output
+            assert "Original Size:" in output
+            assert "Compressed Size:" in output
+            assert "Space Saved:" in output
+            assert "Compression Ratio:" in output
+            assert "80.0%" in output  # 0.8 = 80%
+
+    def test_stats_error_handling(self):
+        """Test stats command error handling."""
+        with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
+            mock_client = Mock()
+            mock_client.get_bucket_stats.side_effect = Exception("Bucket not found")
+            mock_client_class.return_value = mock_client
+
+            runner = CliRunner()
+            result = runner.invoke(cli, ["stats", "nonexistent-bucket"])
+
+            assert result.exit_code == 1
+            assert "Error: Bucket not found" in result.output
+
+    def test_stats_with_s3_url(self):
+        """Test stats command with s3:// URL format."""
+        mock_stats = BucketStats(
+            bucket="test-bucket",
+            object_count=5,
+            total_size=1000000,
+            compressed_size=500000,
+            space_saved=500000,
+            average_compression_ratio=0.5,
+            delta_objects=3,
+            direct_objects=2,
+        )
+
+        with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
+            mock_client = Mock()
+            mock_client.get_bucket_stats.return_value = mock_stats
+            mock_client_class.return_value = mock_client
+
+            runner = CliRunner()
+            result = runner.invoke(cli, ["stats", "s3://test-bucket", "--json"])
+
+            assert result.exit_code == 0
+            # Verify bucket name was parsed correctly from S3 URL
+            mock_client.get_bucket_stats.assert_called_once_with("test-bucket", detailed_stats=False)
+
+    def test_stats_with_s3_url_trailing_slash(self):
+        """Test stats command with s3:// URL format with trailing slash."""
+        mock_stats = BucketStats(
+            bucket="test-bucket",
+            object_count=5,
+            total_size=1000000,
+            compressed_size=500000,
+            space_saved=500000,
+            average_compression_ratio=0.5,
+            delta_objects=3,
+            direct_objects=2,
+        )
+
+        with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
+            mock_client = Mock()
+            mock_client.get_bucket_stats.return_value = mock_stats
+            mock_client_class.return_value = mock_client
+
+            runner = CliRunner()
+            result = runner.invoke(cli, ["stats", "s3://test-bucket/", "--json"])
+
+            assert result.exit_code == 0
+            # Verify bucket name was parsed correctly from S3 URL with trailing slash
+            mock_client.get_bucket_stats.assert_called_once_with("test-bucket", detailed_stats=False)
+
+    def test_stats_with_s3_url_with_prefix(self):
+        """Test stats command with s3:// URL format with prefix (should ignore prefix)."""
+        mock_stats = BucketStats(
+            bucket="test-bucket",
+            object_count=5,
+            total_size=1000000,
+            compressed_size=500000,
+            space_saved=500000,
+            average_compression_ratio=0.5,
+            delta_objects=3,
+            direct_objects=2,
+        )
+
+        with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
+            mock_client = Mock()
+            mock_client.get_bucket_stats.return_value = mock_stats
+            mock_client_class.return_value = mock_client
+
+            runner = CliRunner()
+            result = runner.invoke(cli, ["stats", "s3://test-bucket/some/prefix/", "--json"])
+
+            assert result.exit_code == 0
+            # Verify only bucket name was extracted, prefix ignored
+            mock_client.get_bucket_stats.assert_called_once_with("test-bucket", detailed_stats=False)
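
Usage note (not part of the patch): a minimal sketch of how the session-level stats cache introduced above behaves from a caller's perspective, using only APIs that appear in this diff. The `create_client()` call and the `releases` bucket name are taken from the docstring examples in the patch; whether `create_client` is exported at the package top level is an assumption.

```python
from deltaglider import create_client  # assumed top-level export; the patch's docstrings use create_client()

client = create_client()

# First call computes stats and caches them on this client instance.
quick = client.get_bucket_stats("releases")

# A detailed pass is slower, but its result also serves later quick calls.
detailed = client.get_bucket_stats("releases", detailed_stats=True)
assert client.get_bucket_stats("releases") is detailed  # served from the session cache

# list_buckets() decorates entries with DeltaGliderStats when stats are cached.
for bucket in client.list_buckets()["Buckets"]:
    print(bucket["Name"], bucket.get("DeltaGliderStats"))

# Mutations (put/delete/bucket operations) invalidate the cache automatically;
# long-running applications can also drop it explicitly.
client.clear_cache()
```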