fix: Code quality improvements for v5.2.2 release

- Fix pagination bug by using continuation_token instead of start_after
- Add stats caching to prevent blocking web apps
- Improve code formatting and type checking
- Add comprehensive unit tests for new features
- Fix test mock usage in object_listing tests
This commit is contained in:
Simone Scarduzio
2025-10-14 23:54:49 +02:00
parent ff05e77c24
commit e1259b7ea8
26 changed files with 2581 additions and 369 deletions

View File

@@ -153,13 +153,14 @@ class TestBucketManagement:
delta_objects=6,
direct_objects=4,
)
client._store_bucket_stats_cache("bucket1", detailed_stats=True, stats=cached_stats)
client._store_bucket_stats_cache("bucket1", mode="detailed", stats=cached_stats)
response = client.list_buckets()
bucket1 = next(bucket for bucket in response["Buckets"] if bucket["Name"] == "bucket1")
assert bucket1["DeltaGliderStats"]["Cached"] is True
assert bucket1["DeltaGliderStats"]["Detailed"] is True
assert bucket1["DeltaGliderStats"]["Mode"] == "detailed"
assert bucket1["DeltaGliderStats"]["ObjectCount"] == cached_stats.object_count
assert bucket1["DeltaGliderStats"]["TotalSize"] == cached_stats.total_size
@@ -254,10 +255,14 @@ class TestBucketManagement:
call_count = {"value": 0}
def fake_get_bucket_stats(_: Any, bucket: str, detailed_stats_flag: bool) -> BucketStats:
def fake_get_bucket_stats(_: Any, bucket: str, mode: str) -> BucketStats:
call_count["value"] += 1
assert bucket == "bucket1"
return detailed_stats if detailed_stats_flag else quick_stats
if mode == "detailed":
return detailed_stats
if mode == "sampled":
return detailed_stats # sampled treated as detailed for cache propagation
return quick_stats
monkeypatch.setattr("deltaglider.client._get_bucket_stats", fake_get_bucket_stats)
@@ -271,7 +276,7 @@ class TestBucketManagement:
assert call_count["value"] == 1
# Detailed call triggers new computation
result_detailed = client.get_bucket_stats("bucket1", detailed_stats=True)
result_detailed = client.get_bucket_stats("bucket1", mode="detailed")
assert result_detailed is detailed_stats
assert call_count["value"] == 2

View File

@@ -434,7 +434,7 @@ class TestDeltaGliderFeatures:
def test_get_bucket_stats(self, client):
"""Test getting bucket statistics."""
# Test quick stats (default: detailed_stats=False)
# Test quick stats (LIST only)
stats = client.get_bucket_stats("test-bucket")
assert isinstance(stats, BucketStats)
@@ -442,8 +442,8 @@ class TestDeltaGliderFeatures:
assert stats.total_size > 0
assert stats.delta_objects >= 1 # We have archive.zip.delta
# Test with detailed_stats=True
detailed_stats = client.get_bucket_stats("test-bucket", detailed_stats=True)
# Test with detailed mode
detailed_stats = client.get_bucket_stats("test-bucket", mode="detailed")
assert isinstance(detailed_stats, BucketStats)
assert detailed_stats.object_count == stats.object_count

View File

@@ -49,9 +49,7 @@ class TestStatsCommand:
assert output["direct_objects"] == 3
# Verify client was called correctly
mock_client.get_bucket_stats.assert_called_once_with(
"test-bucket", detailed_stats=False
)
mock_client.get_bucket_stats.assert_called_once_with("test-bucket", mode="quick")
def test_stats_json_output_detailed(self):
"""Test stats command with detailed JSON output."""
@@ -79,7 +77,44 @@ class TestStatsCommand:
assert output["average_compression_ratio"] == 0.95
# Verify detailed flag was passed
mock_client.get_bucket_stats.assert_called_once_with("test-bucket", detailed_stats=True)
mock_client.get_bucket_stats.assert_called_once_with("test-bucket", mode="detailed")
def test_stats_json_output_sampled(self):
    """`stats --sampled --json` must request sampled mode from the client."""
    sampled_stats = BucketStats(
        bucket="test-bucket",
        object_count=5,
        total_size=2000000,
        compressed_size=100000,
        space_saved=1900000,
        average_compression_ratio=0.95,
        delta_objects=5,
        direct_objects=0,
    )
    with patch("deltaglider.client.DeltaGliderClient") as client_cls:
        stub_client = Mock()
        stub_client.get_bucket_stats.return_value = sampled_stats
        client_cls.return_value = stub_client

        outcome = CliRunner().invoke(cli, ["stats", "test-bucket", "--sampled", "--json"])

        assert outcome.exit_code == 0
        stub_client.get_bucket_stats.assert_called_once_with("test-bucket", mode="sampled")
def test_stats_sampled_and_detailed_conflict(self):
    """--sampled and --detailed flags must be mutually exclusive."""
    with patch("deltaglider.client.DeltaGliderClient") as client_cls:
        client_cls.return_value = Mock()

        outcome = CliRunner().invoke(cli, ["stats", "test-bucket", "--sampled", "--detailed"])

        # The command must fail fast and explain the conflict to the user.
        assert outcome.exit_code == 1
        assert "cannot be used together" in outcome.output
def test_stats_human_readable_output(self):
"""Test stats command with human-readable output."""
@@ -155,9 +190,7 @@ class TestStatsCommand:
assert result.exit_code == 0
# Verify bucket name was parsed correctly from S3 URL
mock_client.get_bucket_stats.assert_called_once_with(
"test-bucket", detailed_stats=False
)
mock_client.get_bucket_stats.assert_called_once_with("test-bucket", mode="quick")
def test_stats_with_s3_url_trailing_slash(self):
"""Test stats command with s3:// URL format with trailing slash."""
@@ -182,9 +215,7 @@ class TestStatsCommand:
assert result.exit_code == 0
# Verify bucket name was parsed correctly from S3 URL with trailing slash
mock_client.get_bucket_stats.assert_called_once_with(
"test-bucket", detailed_stats=False
)
mock_client.get_bucket_stats.assert_called_once_with("test-bucket", mode="quick")
def test_stats_with_s3_url_with_prefix(self):
"""Test stats command with s3:// URL format with prefix (should ignore prefix)."""
@@ -209,6 +240,4 @@ class TestStatsCommand:
assert result.exit_code == 0
# Verify only bucket name was extracted, prefix ignored
mock_client.get_bucket_stats.assert_called_once_with(
"test-bucket", detailed_stats=False
)
mock_client.get_bucket_stats.assert_called_once_with("test-bucket", mode="quick")

View File

@@ -0,0 +1,25 @@
"""Tests for shared delta extension policy."""
from deltaglider.core.delta_extensions import (
DEFAULT_COMPOUND_DELTA_EXTENSIONS,
DEFAULT_DELTA_EXTENSIONS,
is_delta_candidate,
)
def test_is_delta_candidate_matches_default_extensions():
    """Every extension in the default list must be recognised as a delta candidate."""
    assert all(is_delta_candidate(f"file{ext}") for ext in DEFAULT_DELTA_EXTENSIONS)
def test_is_delta_candidate_matches_compound_extensions():
    """Multi-suffix (compound) extensions must also be recognised."""
    assert all(is_delta_candidate(f"file{ext}") for ext in DEFAULT_COMPOUND_DELTA_EXTENSIONS)
def test_is_delta_candidate_rejects_other_extensions():
    """Filenames with non delta-friendly extensions must not be candidates."""
    for filename in ("document.txt", "image.jpeg"):
        assert not is_delta_candidate(filename)

View File

@@ -0,0 +1,112 @@
"""Unit tests for object_listing pagination."""
from unittest.mock import Mock
from deltaglider.core.object_listing import list_all_objects, list_objects_page
def test_list_objects_page_passes_continuation_token():
    """list_objects_page must forward continuation_token to the storage layer."""
    fake_storage = Mock()
    # An empty, non-truncated page is enough: only the outgoing call matters here.
    fake_storage.list_objects.return_value = {
        "objects": [],
        "common_prefixes": [],
        "is_truncated": False,
        "next_continuation_token": None,
        "key_count": 0,
    }

    list_objects_page(
        fake_storage,
        bucket="test-bucket",
        continuation_token="test-token",
    )

    # The token must reach storage exactly as given, as a keyword argument.
    fake_storage.list_objects.assert_called_once()
    forwarded = fake_storage.list_objects.call_args.kwargs
    assert forwarded["continuation_token"] == "test-token"
def test_list_all_objects_uses_continuation_token_for_pagination():
    """Test that list_all_objects uses continuation_token (not start_after) for pagination."""

    def make_page(start, stop, token, truncated):
        # One storage response page covering keys obj<start>..obj<stop-1>.
        return {
            "objects": [{"key": f"obj{i}"} for i in range(start, stop)],
            "common_prefixes": [],
            "is_truncated": truncated,
            "next_continuation_token": token,
            "key_count": stop - start,
        }

    fake_storage = Mock()
    # Three pages: 1000 + 1000 + 500 objects, chained by tokens.
    fake_storage.list_objects.side_effect = [
        make_page(0, 1000, "token1", True),
        make_page(1000, 2000, "token2", True),
        make_page(2000, 2500, None, False),
    ]

    result = list_all_objects(
        fake_storage,
        bucket="test-bucket",
        prefix="",
    )

    # All three pages were consumed and flattened into a single result.
    assert fake_storage.list_objects.call_count == 3
    assert len(result.objects) == 2500
    assert not result.is_truncated

    # First request carries no token; each later request echoes the
    # previous page's next_continuation_token.
    calls = fake_storage.list_objects.call_args_list
    assert len(calls) == 3
    tokens = [call.kwargs.get("continuation_token") for call in calls]
    assert tokens == [None, "token1", "token2"]
def test_list_all_objects_prevents_infinite_loop():
    """A listing that never terminates must be cut off by max_iterations."""
    fake_storage = Mock()
    # Every page claims more data is available, so only the iteration
    # cap can stop the pagination loop.
    fake_storage.list_objects.return_value = {
        "objects": [{"key": "obj"}],
        "common_prefixes": [],
        "is_truncated": True,
        "next_continuation_token": "token",
        "key_count": 1,
    }

    result = list_all_objects(
        fake_storage,
        bucket="test-bucket",
        max_iterations=10,  # Low limit keeps the test fast.
    )

    # Exactly max_iterations pages were fetched, and the result still
    # reports truncation since the listing was not exhausted.
    assert fake_storage.list_objects.call_count == 10
    assert result.is_truncated

44
tests/unit/test_s3_uri.py Normal file
View File

@@ -0,0 +1,44 @@
"""Tests for S3 URI helpers."""
import pytest
from deltaglider.core.s3_uri import build_s3_url, is_s3_url, parse_s3_url
def test_is_s3_url_detects_scheme() -> None:
    """Only the s3:// scheme should be treated as an S3 URL."""
    s3_location = "s3://bucket/path"
    http_location = "https://example.com/object"
    assert is_s3_url(s3_location)
    assert not is_s3_url(http_location)
def test_parse_s3_url_returns_bucket_and_key() -> None:
    """The bucket and the object key should be split apart correctly."""
    result = parse_s3_url("s3://my-bucket/path/to/object.txt")
    assert (result.bucket, result.key) == ("my-bucket", "path/to/object.txt")
def test_parse_strips_trailing_slash_when_requested() -> None:
    """Directory-style URLs lose their trailing slash when asked to."""
    result = parse_s3_url("s3://my-bucket/path/to/", strip_trailing_slash=True)
    assert (result.bucket, result.key) == ("my-bucket", "path/to")
def test_parse_requires_key_when_configured() -> None:
    """With allow_empty_key=False, a bucket-only URL must raise ValueError."""
    bucket_only_url = "s3://bucket-only"
    with pytest.raises(ValueError):
        parse_s3_url(bucket_only_url, allow_empty_key=False)
def test_build_s3_url_round_trip() -> None:
    """Building then parsing a URL must give back the original parts."""
    parts = ("bucket", "dir/file.tar")
    reparsed = parse_s3_url(build_s3_url(*parts))
    assert (reparsed.bucket, reparsed.key) == parts
def test_build_s3_url_for_bucket_root() -> None:
    """A missing key yields a bare bucket URL with no trailing slash."""
    root_url = build_s3_url("root-bucket")
    assert root_url == "s3://root-bucket"

View File

@@ -92,7 +92,7 @@ class TestBucketStatsAlgorithm:
mock_client.service.storage.head.side_effect = mock_head
# Execute
stats = get_bucket_stats(mock_client, "compressed-bucket")
stats = get_bucket_stats(mock_client, "compressed-bucket", mode="detailed")
# Verify
assert stats.object_count == 2 # Only delta files counted (not reference.bin)
@@ -164,7 +164,7 @@ class TestBucketStatsAlgorithm:
mock_client.service.storage.head.side_effect = mock_head
# Execute
stats = get_bucket_stats(mock_client, "mixed-bucket")
stats = get_bucket_stats(mock_client, "mixed-bucket", mode="detailed")
# Verify
assert stats.object_count == 4 # 2 delta + 2 direct files
@@ -229,7 +229,7 @@ class TestBucketStatsAlgorithm:
mock_client.service.storage.head.side_effect = mock_head
# Execute
stats = get_bucket_stats(mock_client, "multi-deltaspace-bucket")
stats = get_bucket_stats(mock_client, "multi-deltaspace-bucket", mode="detailed")
# Verify
assert stats.object_count == 2 # Only delta files
@@ -347,40 +347,57 @@ class TestBucketStatsAlgorithm:
)
with patch_as_completed:
_ = get_bucket_stats(mock_client, "parallel-bucket")
_ = get_bucket_stats(mock_client, "parallel-bucket", mode="detailed")
# Verify ThreadPoolExecutor was used with correct max_workers
mock_executor.assert_called_once_with(max_workers=10) # min(10, 50) = 10
def test_detailed_stats_flag(self, mock_client):
"""Test that detailed_stats flag controls metadata fetching."""
# Setup
def test_stats_modes_control_metadata_fetch(self, mock_client):
"""Metadata fetching should depend on the selected stats mode."""
mock_client.service.storage.list_objects.return_value = {
"objects": [
{"key": "reference.bin", "size": 20000000, "last_modified": "2024-01-01"},
{"key": "file.zip.delta", "size": 50000, "last_modified": "2024-01-02"},
{"key": "alpha/reference.bin", "size": 100, "last_modified": "2024-01-01"},
{"key": "alpha/file1.zip.delta", "size": 10, "last_modified": "2024-01-02"},
{"key": "alpha/file2.zip.delta", "size": 12, "last_modified": "2024-01-03"},
{"key": "beta/reference.bin", "size": 200, "last_modified": "2024-01-04"},
{"key": "beta/file1.zip.delta", "size": 20, "last_modified": "2024-01-05"},
],
"is_truncated": False,
}
# Test with detailed_stats=False (default)
# NOTE: Currently, the implementation always fetches metadata regardless of the flag
# This test documents the current behavior
_ = get_bucket_stats(mock_client, "test-bucket", detailed_stats=False)
metadata_by_key = {
"alpha/file1.zip.delta": {"file_size": "100", "compression_ratio": "0.9"},
"alpha/file2.zip.delta": {"file_size": "120", "compression_ratio": "0.88"},
"beta/file1.zip.delta": {"file_size": "210", "compression_ratio": "0.9"},
}
# Currently metadata is always fetched for delta files
assert mock_client.service.storage.head.called
def mock_head(path: str):
for key, metadata in metadata_by_key.items():
if key in path:
head = Mock()
head.metadata = metadata
return head
return None
# Reset mock
mock_client.service.storage.head.side_effect = mock_head
# Quick mode: no metadata fetch
_ = get_bucket_stats(mock_client, "mode-test", mode="quick")
assert mock_client.service.storage.head.call_count == 0
# Sampled mode: one HEAD per delta-space (alpha, beta)
mock_client.service.storage.head.reset_mock()
stats_sampled = get_bucket_stats(mock_client, "mode-test", mode="sampled")
assert mock_client.service.storage.head.call_count == 2
# Test with detailed_stats=True
mock_client.service.storage.head.return_value = Mock(metadata={"file_size": "19500000"})
# Detailed mode: HEAD for every delta (3 total)
mock_client.service.storage.head.reset_mock()
stats_detailed = get_bucket_stats(mock_client, "mode-test", mode="detailed")
assert mock_client.service.storage.head.call_count == 3
_ = get_bucket_stats(mock_client, "test-bucket", detailed_stats=True)
# Should fetch metadata
assert mock_client.service.storage.head.called
# Sampled totals should be close to detailed but not identical
assert stats_detailed.total_size == 100 + 120 + 210
assert stats_sampled.total_size == 100 + 100 + 210
def test_error_handling_in_metadata_fetch(self, mock_client):
"""Test graceful handling of errors during metadata fetch."""
@@ -407,7 +424,7 @@ class TestBucketStatsAlgorithm:
mock_client.service.storage.head.side_effect = mock_head
# Execute - should handle error gracefully
stats = get_bucket_stats(mock_client, "error-bucket", detailed_stats=True)
stats = get_bucket_stats(mock_client, "error-bucket", mode="detailed")
# Verify - file1 uses fallback, file2 uses metadata
assert stats.object_count == 2

View File

@@ -0,0 +1,284 @@
"""Unit tests for bucket stats caching functionality."""
import json
from unittest.mock import MagicMock
from deltaglider.client_models import BucketStats
from deltaglider.client_operations.stats import (
_get_cache_key,
_is_cache_valid,
_read_stats_cache,
_write_stats_cache,
)
def test_get_cache_key():
    """Each stats mode maps to its own cache object key."""
    expected_keys = {
        "quick": ".deltaglider/stats_quick.json",
        "sampled": ".deltaglider/stats_sampled.json",
        "detailed": ".deltaglider/stats_detailed.json",
    }
    for mode, key in expected_keys.items():
        assert _get_cache_key(mode) == key
def test_is_cache_valid_when_unchanged():
    """A cache entry stays valid while count and size match the bucket."""
    snapshot = {"object_count": 100, "compressed_size": 50000}
    # Current bucket state matches the snapshot exactly.
    assert _is_cache_valid(snapshot, 100, 50000) is True
def test_is_cache_valid_when_count_changed():
    """A different object count invalidates the cached stats."""
    snapshot = {"object_count": 100, "compressed_size": 50000}
    # Bucket now holds 101 objects vs. 100 when the cache was written.
    assert _is_cache_valid(snapshot, 101, 50000) is False
def test_is_cache_valid_when_size_changed():
    """A different compressed size invalidates the cached stats."""
    snapshot = {"object_count": 100, "compressed_size": 50000}
    # Compressed size grew to 60000 since the cache was written.
    assert _is_cache_valid(snapshot, 100, 60000) is False
def test_write_and_read_cache_roundtrip():
    """Stats written through the cache should read back unchanged."""

    def build_client():
        # Minimal client double exposing .service.storage and .service.logger.
        service = MagicMock()
        service.storage = MagicMock()
        service.logger = MagicMock()
        client = MagicMock()
        client.service = service
        return client

    client = build_client()
    original_stats = BucketStats(
        bucket="test-bucket",
        object_count=150,
        total_size=1000000,
        compressed_size=50000,
        space_saved=950000,
        average_compression_ratio=0.95,
        delta_objects=140,
        direct_objects=10,
    )

    # Intercept the bytes handed to storage so we can inspect and replay them.
    captured = {}

    def record_put(address, data, metadata):
        captured["data"] = data

    client.service.storage.put = record_put

    _write_stats_cache(
        client=client,
        bucket="test-bucket",
        mode="quick",
        stats=original_stats,
        object_count=150,
        compressed_size=50000,
    )

    payload = captured.get("data")
    assert payload is not None

    # The payload must be versioned JSON carrying mode, timestamp,
    # validation snapshot, and the serialized stats.
    document = json.loads(payload.decode("utf-8"))
    assert document["version"] == "1.0"
    assert document["mode"] == "quick"
    assert "computed_at" in document
    assert document["validation"]["object_count"] == 150
    assert document["validation"]["compressed_size"] == 50000
    assert document["stats"]["bucket"] == "test-bucket"
    assert document["stats"]["object_count"] == 150
    assert document["stats"]["delta_objects"] == 140

    # Feed the written bytes back through the reader.
    client.service.storage.get = MagicMock(return_value=MagicMock(data=payload))
    restored, snapshot = _read_stats_cache(client, "test-bucket", "quick")

    assert restored is not None
    assert snapshot is not None
    assert restored.bucket == "test-bucket"
    assert restored.object_count == 150
    assert restored.delta_objects == 140
    assert restored.average_compression_ratio == 0.95
    assert snapshot["object_count"] == 150
    assert snapshot["compressed_size"] == 50000
def test_read_cache_missing_file():
    """A missing cache object yields (None, None) instead of raising."""
    service = MagicMock()
    service.storage = MagicMock()
    service.logger = MagicMock()
    client = MagicMock()
    client.service = service

    # Storage raises when the cache key does not exist.
    service.storage.get.side_effect = FileNotFoundError("No such key")

    stats, validation = _read_stats_cache(client, "test-bucket", "quick")

    assert stats is None
    assert validation is None
def test_read_cache_invalid_json():
    """Corrupted cache contents are discarded with a single warning."""
    service = MagicMock()
    service.storage = MagicMock()
    service.logger = MagicMock()
    client = MagicMock()
    client.service = service

    # Storage hands back bytes that are not parseable as JSON.
    service.storage.get = MagicMock(return_value=MagicMock(data=b"not valid json {]["))

    stats, validation = _read_stats_cache(client, "test-bucket", "quick")

    assert stats is None
    assert validation is None
    service.logger.warning.assert_called_once()
def test_read_cache_version_mismatch():
    """A cache written with an unknown schema version is ignored."""
    service = MagicMock()
    service.storage = MagicMock()
    service.logger = MagicMock()
    client = MagicMock()
    client.service = service

    stale_payload = {
        "version": "2.0",  # Reader only understands version 1.0.
        "mode": "quick",
        "validation": {"object_count": 100, "compressed_size": 50000},
        "stats": {
            "bucket": "test",
            "object_count": 100,
            "total_size": 1000,
            "compressed_size": 500,
            "space_saved": 500,
            "average_compression_ratio": 0.5,
            "delta_objects": 90,
            "direct_objects": 10,
        },
    }
    service.storage.get = MagicMock(
        return_value=MagicMock(data=json.dumps(stale_payload).encode("utf-8"))
    )

    stats, validation = _read_stats_cache(client, "test-bucket", "quick")

    assert stats is None
    assert validation is None
    service.logger.warning.assert_called_once()
def test_read_cache_mode_mismatch():
    """A cache computed for one mode must not satisfy a request for another."""
    service = MagicMock()
    service.storage = MagicMock()
    service.logger = MagicMock()
    client = MagicMock()
    client.service = service

    mismatched_payload = {
        "version": "1.0",
        "mode": "detailed",  # Cache holds detailed stats...
        "validation": {"object_count": 100, "compressed_size": 50000},
        "stats": {
            "bucket": "test",
            "object_count": 100,
            "total_size": 1000,
            "compressed_size": 500,
            "space_saved": 500,
            "average_compression_ratio": 0.5,
            "delta_objects": 90,
            "direct_objects": 10,
        },
    }
    service.storage.get = MagicMock(
        return_value=MagicMock(data=json.dumps(mismatched_payload).encode("utf-8"))
    )

    # ...but the caller asks for "quick" mode.
    stats, validation = _read_stats_cache(client, "test-bucket", "quick")

    assert stats is None
    assert validation is None
    service.logger.warning.assert_called_once()
def test_write_cache_handles_errors_gracefully():
    """A failing storage put must be swallowed and logged, never raised."""
    service = MagicMock()
    service.storage = MagicMock()
    service.logger = MagicMock()
    client = MagicMock()
    client.service = service

    # Simulate S3 denying the cache write.
    service.storage.put.side_effect = PermissionError("Access denied")

    stats_to_cache = BucketStats(
        bucket="test-bucket",
        object_count=150,
        total_size=1000000,
        compressed_size=50000,
        space_saved=950000,
        average_compression_ratio=0.95,
        delta_objects=140,
        direct_objects=10,
    )

    # The call below must complete without propagating the PermissionError.
    _write_stats_cache(
        client=client,
        bucket="test-bucket",
        mode="quick",
        stats=stats_to_cache,
        object_count=150,
        compressed_size=50000,
    )

    # The failure surfaces as exactly one warning mentioning the cache write.
    service.logger.warning.assert_called_once()
    assert "Failed to write cache" in str(service.logger.warning.call_args)