mirror of
https://github.com/beshu-tech/deltaglider.git
synced 2026-04-30 04:04:33 +02:00
Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a9a1396e6e | ||
|
|
52eb5bba21 | ||
|
|
f75db142e8 | ||
|
|
35d34d4862 | ||
|
|
9230cbd762 | ||
|
|
2eba6e8d38 | ||
|
|
656726b57b |
16
CHANGELOG.md
16
CHANGELOG.md
@@ -7,6 +7,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
## [5.1.1] - 2025-10-10
|
||||
|
||||
### Fixed
|
||||
- **Stats Command**: Fixed incorrect compression ratio calculations
|
||||
- Now correctly counts ALL files including reference.bin in compressed size
|
||||
- Fixed handling of orphaned reference.bin files (reference files with no delta files)
|
||||
- Added prominent warnings for orphaned reference files with cleanup commands
|
||||
- Fixed stats for buckets with no compression (now shows 0% instead of negative)
|
||||
- SHA1 checksum files are now properly included in calculations
|
||||
|
||||
### Improved
|
||||
- **Stats Performance**: Optimized metadata fetching with parallel requests
|
||||
- 5-10x faster for buckets with many delta files
|
||||
- Uses ThreadPoolExecutor for concurrent HEAD requests
|
||||
- Single-pass calculation algorithm for better efficiency
|
||||
|
||||
## [5.1.0] - 2025-10-10
|
||||
|
||||
### Added
|
||||
|
||||
@@ -89,82 +89,188 @@ def get_bucket_stats(
|
||||
stats = client.get_bucket_stats('releases', detailed_stats=True)
|
||||
print(f"Compression ratio: {stats.average_compression_ratio:.1%}")
|
||||
"""
|
||||
# List all objects with smart metadata fetching
|
||||
# List all objects DIRECTLY from storage adapter to see reference.bin files
|
||||
# (client.list_objects filters them out for user-facing operations)
|
||||
all_objects = []
|
||||
continuation_token = None
|
||||
start_after = None
|
||||
|
||||
import concurrent.futures
|
||||
|
||||
# Phase 1: Collect all objects and identify delta files
|
||||
raw_objects = []
|
||||
delta_keys = []
|
||||
|
||||
while True:
|
||||
response = client.list_objects(
|
||||
Bucket=bucket,
|
||||
MaxKeys=1000,
|
||||
ContinuationToken=continuation_token,
|
||||
FetchMetadata=detailed_stats, # Only fetch metadata if detailed stats requested
|
||||
# Call storage adapter directly to see ALL files including reference.bin
|
||||
response = client.service.storage.list_objects(
|
||||
bucket=bucket,
|
||||
prefix="",
|
||||
max_keys=1000,
|
||||
start_after=start_after,
|
||||
)
|
||||
|
||||
# Extract S3Objects from response (with Metadata containing DeltaGlider info)
|
||||
for obj_dict in response["Contents"]:
|
||||
# Convert dict back to ObjectInfo for backward compatibility with stats calculation
|
||||
metadata = obj_dict.get("Metadata", {})
|
||||
# Parse compression ratio safely (handle "unknown" value)
|
||||
compression_ratio_str = metadata.get("deltaglider-compression-ratio", "0.0")
|
||||
try:
|
||||
compression_ratio = (
|
||||
float(compression_ratio_str) if compression_ratio_str != "unknown" else 0.0
|
||||
)
|
||||
except ValueError:
|
||||
compression_ratio = 0.0
|
||||
# Collect objects and identify delta files
|
||||
for obj_dict in response.get("objects", []):
|
||||
raw_objects.append(obj_dict)
|
||||
if obj_dict["key"].endswith(".delta"):
|
||||
delta_keys.append(obj_dict["key"])
|
||||
|
||||
all_objects.append(
|
||||
ObjectInfo(
|
||||
key=obj_dict["Key"],
|
||||
size=obj_dict["Size"],
|
||||
last_modified=obj_dict.get("LastModified", ""),
|
||||
etag=obj_dict.get("ETag"),
|
||||
storage_class=obj_dict.get("StorageClass", "STANDARD"),
|
||||
original_size=int(metadata.get("deltaglider-original-size", obj_dict["Size"])),
|
||||
compressed_size=obj_dict["Size"],
|
||||
is_delta=metadata.get("deltaglider-is-delta", "false") == "true",
|
||||
compression_ratio=compression_ratio,
|
||||
reference_key=metadata.get("deltaglider-reference-key"),
|
||||
)
|
||||
)
|
||||
|
||||
if not response.get("IsTruncated"):
|
||||
if not response.get("is_truncated"):
|
||||
break
|
||||
|
||||
continuation_token = response.get("NextContinuationToken")
|
||||
start_after = response.get("next_continuation_token")
|
||||
|
||||
# Calculate statistics
|
||||
total_size = 0
|
||||
compressed_size = 0
|
||||
# Phase 2: Fetch metadata for delta files in parallel (10x faster)
|
||||
metadata_map = {}
|
||||
if delta_keys:
|
||||
client.service.logger.info(
|
||||
f"Fetching metadata for {len(delta_keys)} delta files in parallel..."
|
||||
)
|
||||
|
||||
def fetch_metadata(key: str) -> tuple[str, dict[str, Any] | None]:
|
||||
try:
|
||||
obj_head = client.service.storage.head(f"{bucket}/{key}")
|
||||
if obj_head and obj_head.metadata:
|
||||
return key, obj_head.metadata
|
||||
except Exception as e:
|
||||
client.service.logger.debug(f"Failed to fetch metadata for {key}: {e}")
|
||||
return key, None
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(
|
||||
max_workers=min(10, len(delta_keys))
|
||||
) as executor:
|
||||
futures = [executor.submit(fetch_metadata, key) for key in delta_keys]
|
||||
for future in concurrent.futures.as_completed(futures):
|
||||
key, metadata = future.result()
|
||||
if metadata:
|
||||
metadata_map[key] = metadata
|
||||
|
||||
# Phase 3: Build ObjectInfo list with metadata
|
||||
for obj_dict in raw_objects:
|
||||
key = obj_dict["key"]
|
||||
size = obj_dict["size"]
|
||||
is_delta = key.endswith(".delta")
|
||||
|
||||
# Get metadata from our parallel fetch
|
||||
metadata = metadata_map.get(key, {})
|
||||
|
||||
# Parse compression ratio and original size
|
||||
compression_ratio = 0.0
|
||||
original_size = size
|
||||
if is_delta and metadata:
|
||||
try:
|
||||
ratio_str = metadata.get("compression_ratio", "0.0")
|
||||
compression_ratio = float(ratio_str) if ratio_str != "unknown" else 0.0
|
||||
except (ValueError, TypeError):
|
||||
compression_ratio = 0.0
|
||||
try:
|
||||
original_size = int(metadata.get("file_size", size))
|
||||
client.service.logger.debug(f"Delta {key}: using original_size={original_size}")
|
||||
except (ValueError, TypeError):
|
||||
original_size = size
|
||||
|
||||
all_objects.append(
|
||||
ObjectInfo(
|
||||
key=key,
|
||||
size=size,
|
||||
last_modified=obj_dict.get("last_modified", ""),
|
||||
etag=obj_dict.get("etag"),
|
||||
storage_class=obj_dict.get("storage_class", "STANDARD"),
|
||||
original_size=original_size,
|
||||
compressed_size=size,
|
||||
is_delta=is_delta,
|
||||
compression_ratio=compression_ratio,
|
||||
reference_key=metadata.get("ref_key") if metadata else None,
|
||||
)
|
||||
)
|
||||
|
||||
# Calculate statistics - COUNT ALL FILES
|
||||
total_original_size = 0
|
||||
total_compressed_size = 0
|
||||
delta_count = 0
|
||||
direct_count = 0
|
||||
reference_files = {} # Track all reference.bin files and their deltaspaces
|
||||
|
||||
# First pass: identify what we have
|
||||
for obj in all_objects:
|
||||
# Skip reference.bin files - they are internal implementation details
|
||||
# and their size is already accounted for in delta metadata
|
||||
if obj.key.endswith("/reference.bin") or obj.key == "reference.bin":
|
||||
# Extract deltaspace prefix
|
||||
if "/" in obj.key:
|
||||
deltaspace = obj.key.rsplit("/reference.bin", 1)[0]
|
||||
else:
|
||||
deltaspace = "" # Root level reference.bin
|
||||
reference_files[deltaspace] = obj.size
|
||||
elif obj.is_delta:
|
||||
delta_count += 1
|
||||
else:
|
||||
direct_count += 1
|
||||
|
||||
# Second pass: calculate sizes
|
||||
for obj in all_objects:
|
||||
# Skip reference.bin in this pass (we'll handle it separately)
|
||||
if obj.key.endswith("/reference.bin") or obj.key == "reference.bin":
|
||||
continue
|
||||
|
||||
compressed_size += obj.size
|
||||
|
||||
if obj.is_delta:
|
||||
delta_count += 1
|
||||
# Use actual original size if we have it, otherwise estimate
|
||||
total_size += obj.original_size or obj.size
|
||||
# Delta file: original from metadata, compressed = delta size
|
||||
if obj.original_size and obj.original_size != obj.size:
|
||||
client.service.logger.debug(
|
||||
f"Delta {obj.key}: using original_size={obj.original_size}"
|
||||
)
|
||||
total_original_size += obj.original_size
|
||||
else:
|
||||
client.service.logger.warning(
|
||||
f"Delta {obj.key}: no original_size, using compressed size={obj.size}"
|
||||
)
|
||||
total_original_size += obj.size
|
||||
total_compressed_size += obj.size
|
||||
else:
|
||||
direct_count += 1
|
||||
# For non-delta files, original equals compressed
|
||||
total_size += obj.size
|
||||
# Direct files: original = compressed = actual size
|
||||
total_original_size += obj.size
|
||||
total_compressed_size += obj.size
|
||||
|
||||
space_saved = total_size - compressed_size
|
||||
avg_ratio = (space_saved / total_size) if total_size > 0 else 0.0
|
||||
# Handle reference.bin files
|
||||
total_reference_size = sum(reference_files.values())
|
||||
|
||||
if delta_count > 0 and total_reference_size > 0:
|
||||
# Add all reference.bin files to compressed size
|
||||
total_compressed_size += total_reference_size
|
||||
client.service.logger.info(
|
||||
f"Including {len(reference_files)} reference.bin file(s) ({total_reference_size:,} bytes) in compressed size"
|
||||
)
|
||||
elif delta_count == 0 and total_reference_size > 0:
|
||||
# ORPHANED REFERENCE WARNING
|
||||
waste_mb = total_reference_size / 1024 / 1024
|
||||
client.service.logger.warning(
|
||||
f"\n{'=' * 60}\n"
|
||||
f"WARNING: ORPHANED REFERENCE FILE(S) DETECTED!\n"
|
||||
f"{'=' * 60}\n"
|
||||
f"Found {len(reference_files)} reference.bin file(s) totaling {total_reference_size:,} bytes ({waste_mb:.2f} MB)\n"
|
||||
f"but NO delta files are using them.\n"
|
||||
f"\n"
|
||||
f"This wastes {waste_mb:.2f} MB of storage!\n"
|
||||
f"\n"
|
||||
f"Orphaned reference files:\n"
|
||||
)
|
||||
for deltaspace, size in reference_files.items():
|
||||
path = f"{deltaspace}/reference.bin" if deltaspace else "reference.bin"
|
||||
client.service.logger.warning(f" - s3://{bucket}/{path} ({size:,} bytes)")
|
||||
|
||||
client.service.logger.warning("\nConsider removing these orphaned files:\n")
|
||||
for deltaspace in reference_files:
|
||||
path = f"{deltaspace}/reference.bin" if deltaspace else "reference.bin"
|
||||
client.service.logger.warning(f" aws s3 rm s3://{bucket}/{path}")
|
||||
|
||||
client.service.logger.warning(f"{'=' * 60}")
|
||||
|
||||
space_saved = total_original_size - total_compressed_size
|
||||
avg_ratio = (space_saved / total_original_size) if total_original_size > 0 else 0.0
|
||||
|
||||
return BucketStats(
|
||||
bucket=bucket,
|
||||
object_count=len(all_objects),
|
||||
total_size=total_size,
|
||||
compressed_size=compressed_size,
|
||||
object_count=delta_count + direct_count, # Only count user files, not reference.bin
|
||||
total_size=total_original_size,
|
||||
compressed_size=total_compressed_size,
|
||||
space_saved=space_saved,
|
||||
average_compression_ratio=avg_ratio,
|
||||
delta_objects=delta_count,
|
||||
|
||||
454
tests/unit/test_stats_algorithm.py
Normal file
454
tests/unit/test_stats_algorithm.py
Normal file
@@ -0,0 +1,454 @@
|
||||
"""Exhaustive tests for the bucket statistics algorithm."""
|
||||
|
||||
from unittest.mock import MagicMock, Mock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from deltaglider.client_operations.stats import get_bucket_stats
|
||||
|
||||
|
||||
class TestBucketStatsAlgorithm:
    """Scenario tests for ``get_bucket_stats`` run against a fully mocked client.

    Each test feeds a canned ``list_objects`` response (and, where relevant,
    canned ``head`` results) into the mocked storage adapter and checks the
    aggregate numbers reported on the returned stats object.
    """

    @staticmethod
    def _entry(key, size, last_modified):
        """Build one raw listing entry as handed to the storage mock."""
        return {"key": key, "size": size, "last_modified": last_modified}

    @staticmethod
    def _single_page(entries):
        """Wrap *entries* in a non-truncated ``list_objects`` response."""
        return {"objects": entries, "is_truncated": False}

    @staticmethod
    def _head_by_fragment(metadata_by_fragment):
        """Create a ``head`` stand-in that dispatches on path fragments.

        Fragments are tried in insertion order; the first one contained in the
        requested path yields a fresh Mock carrying a copy of its metadata.
        Paths that match no fragment produce ``None``.
        """

        def fake_head(path):
            for fragment, metadata in metadata_by_fragment.items():
                if fragment in path:
                    return Mock(metadata=dict(metadata))
            return None

        return fake_head

    @pytest.fixture
    def mock_client(self):
        """Provide a Mock client exposing ``service.storage`` and ``service.logger``."""
        service = Mock(storage=Mock(), logger=Mock())
        return Mock(service=service)

    def test_empty_bucket(self, mock_client):
        """An empty listing must produce all-zero statistics."""
        mock_client.service.storage.list_objects.return_value = self._single_page([])

        result = get_bucket_stats(mock_client, "empty-bucket")

        assert result.bucket == "empty-bucket"
        assert result.object_count == 0
        assert result.total_size == 0
        assert result.compressed_size == 0
        assert result.space_saved == 0
        assert result.average_compression_ratio == 0.0
        assert result.delta_objects == 0
        assert result.direct_objects == 0

    def test_bucket_with_only_direct_files(self, mock_client):
        """Three plain files: original and compressed totals are identical."""
        storage = mock_client.service.storage
        storage.list_objects.return_value = self._single_page(
            [
                self._entry("file1.pdf", 1_000_000, "2024-01-01"),
                self._entry("file2.html", 500_000, "2024-01-02"),
                self._entry("file3.txt", 250_000, "2024-01-03"),
            ]
        )
        storage.head.return_value = None

        result = get_bucket_stats(mock_client, "direct-only-bucket")

        assert result.object_count == 3
        # 1,000,000 + 500,000 + 250,000
        assert result.total_size == 1_750_000
        # Nothing is compressed, so the stored size equals the original size.
        assert result.compressed_size == 1_750_000
        assert result.space_saved == 0
        assert result.average_compression_ratio == 0.0
        assert result.delta_objects == 0
        assert result.direct_objects == 3

    def test_bucket_with_delta_compression(self, mock_client):
        """One root-level reference.bin plus two deltas carrying size metadata."""
        storage = mock_client.service.storage
        storage.list_objects.return_value = self._single_page(
            [
                self._entry("reference.bin", 20_000_000, "2024-01-01"),
                self._entry("file1.zip.delta", 50_000, "2024-01-02"),
                self._entry("file2.zip.delta", 60_000, "2024-01-03"),
            ]
        )
        storage.head.side_effect = self._head_by_fragment(
            {
                "file1.zip.delta": {"file_size": "19500000", "compression_ratio": "0.997"},
                "file2.zip.delta": {"file_size": "19600000", "compression_ratio": "0.997"},
            }
        )

        result = get_bucket_stats(mock_client, "compressed-bucket")

        # reference.bin is not reported as an object; only the two deltas are.
        assert result.object_count == 2
        # 19.5M + 19.6M taken from the delta metadata
        assert result.total_size == 39_100_000
        # reference (20M) + deltas (50K + 60K)
        assert result.compressed_size == 20_110_000
        assert result.space_saved == 18_990_000
        # 18.99M / 39.1M is roughly 48.6%
        assert result.average_compression_ratio > 0.48
        assert result.delta_objects == 2
        assert result.direct_objects == 0

    def test_orphaned_reference_bin_detection(self, mock_client):
        """A reference.bin with no deltas is excluded from sizes and warned about."""
        storage = mock_client.service.storage
        storage.list_objects.return_value = self._single_page(
            [
                self._entry("reference.bin", 20_000_000, "2024-01-01"),
                self._entry("regular.pdf", 1_000_000, "2024-01-02"),
            ]
        )
        storage.head.return_value = None

        result = get_bucket_stats(mock_client, "orphaned-ref-bucket")

        # Only regular.pdf contributes to the reported numbers.
        assert result.object_count == 1
        assert result.total_size == 1_000_000
        assert result.compressed_size == 1_000_000
        assert result.space_saved == 0
        assert result.delta_objects == 0
        assert result.direct_objects == 1

        # The orphan must be reported, including its size and a cleanup command.
        logged = [str(entry) for entry in mock_client.service.logger.warning.call_args_list]
        assert any("ORPHANED REFERENCE FILE" in text for text in logged)
        assert any("20,000,000 bytes" in text for text in logged)
        assert any(
            "aws s3 rm s3://orphaned-ref-bucket/reference.bin" in text for text in logged
        )

    def test_mixed_bucket(self, mock_client):
        """Deltas under one prefix combined with plain files under another."""
        storage = mock_client.service.storage
        storage.list_objects.return_value = self._single_page(
            [
                self._entry("pro/reference.bin", 20_000_000, "2024-01-01"),
                self._entry("pro/v1.zip.delta", 50_000, "2024-01-02"),
                self._entry("pro/v2.zip.delta", 60_000, "2024-01-03"),
                self._entry("docs/readme.pdf", 500_000, "2024-01-04"),
                self._entry("docs/manual.html", 300_000, "2024-01-05"),
            ]
        )
        storage.head.side_effect = self._head_by_fragment(
            {
                "v1.zip.delta": {"file_size": "19500000"},
                "v2.zip.delta": {"file_size": "19600000"},
            }
        )

        result = get_bucket_stats(mock_client, "mixed-bucket")

        # 2 deltas + 2 direct files
        assert result.object_count == 4
        # 19.5M + 19.6M + 0.5M + 0.3M
        assert result.total_size == 39_900_000
        # reference (20M) + deltas (110K) + direct files (800K)
        assert result.compressed_size == 20_910_000
        assert result.space_saved == 18_990_000
        assert result.delta_objects == 2
        assert result.direct_objects == 2

    def test_sha1_files_included(self, mock_client):
        """``.sha1`` checksum companions are counted like any other direct file."""
        storage = mock_client.service.storage
        storage.list_objects.return_value = self._single_page(
            [
                self._entry("file1.zip", 1_000_000, "2024-01-01"),
                self._entry("file1.zip.sha1", 41, "2024-01-01"),
                self._entry("file2.tar", 2_000_000, "2024-01-02"),
                self._entry("file2.tar.sha1", 41, "2024-01-02"),
            ]
        )
        storage.head.return_value = None

        result = get_bucket_stats(mock_client, "sha1-bucket")

        assert result.object_count == 4
        # 1,000,000 + 41 + 2,000,000 + 41
        assert result.total_size == 3_000_082
        assert result.compressed_size == 3_000_082
        assert result.direct_objects == 4

    def test_multiple_deltaspaces(self, mock_client):
        """Two prefixes, each with its own reference.bin and a single delta."""
        storage = mock_client.service.storage
        storage.list_objects.return_value = self._single_page(
            [
                self._entry("pro/reference.bin", 20_000_000, "2024-01-01"),
                self._entry("pro/v1.zip.delta", 50_000, "2024-01-02"),
                self._entry("enterprise/reference.bin", 25_000_000, "2024-01-03"),
                self._entry("enterprise/v1.zip.delta", 70_000, "2024-01-04"),
            ]
        )
        storage.head.side_effect = self._head_by_fragment(
            {
                "pro/v1.zip.delta": {"file_size": "19500000"},
                "enterprise/v1.zip.delta": {"file_size": "24500000"},
            }
        )

        result = get_bucket_stats(mock_client, "multi-deltaspace-bucket")

        # Only the two deltas are reported as objects.
        assert result.object_count == 2
        # 19.5M + 24.5M
        assert result.total_size == 44_000_000
        # Both references (20M + 25M) plus both deltas (50K + 70K)
        assert result.compressed_size == 45_120_000
        assert result.delta_objects == 2
        assert result.direct_objects == 0

    def test_pagination_handling(self, mock_client):
        """A truncated first page must trigger a second listing call."""

        def page_of(indices):
            return [self._entry(f"file{i}.txt", 1000, "2024-01-01") for i in indices]

        storage = mock_client.service.storage
        storage.list_objects.side_effect = [
            {
                "objects": page_of(range(1000)),
                "is_truncated": True,
                "next_continuation_token": "token1",
            },
            {
                "objects": page_of(range(1000, 1500)),
                "is_truncated": False,
            },
        ]
        storage.head.return_value = None

        result = get_bucket_stats(mock_client, "paginated-bucket")

        assert result.object_count == 1500
        assert result.total_size == 1_500_000
        assert result.compressed_size == 1_500_000
        assert result.direct_objects == 1500

        # One call per page.
        assert storage.list_objects.call_count == 2

    def test_delta_file_without_metadata(self, mock_client):
        """Without head metadata the delta's own size stands in for the original."""
        storage = mock_client.service.storage
        storage.list_objects.return_value = self._single_page(
            [
                self._entry("reference.bin", 20_000_000, "2024-01-01"),
                self._entry("file.zip.delta", 50_000, "2024-01-02"),
            ]
        )
        # head yields nothing, so no metadata is available for the delta.
        storage.head.return_value = None

        result = get_bucket_stats(mock_client, "no-metadata-bucket")

        assert result.object_count == 1
        # Fallback: original size == delta size
        assert result.total_size == 50_000
        # reference + delta
        assert result.compressed_size == 20_050_000
        assert result.delta_objects == 1

        # The fallback must be announced through a warning.
        logged = [str(entry) for entry in mock_client.service.logger.warning.call_args_list]
        assert any("no original_size" in text for text in logged)

    def test_parallel_metadata_fetching(self, mock_client):
        """The thread pool is created with ``max_workers=10`` for 50 delta files."""
        delta_total = 50
        listing = [self._entry("reference.bin", 20_000_000, "2024-01-01")]
        listing += [
            self._entry(f"file{i}.zip.delta", 50000 + i, f"2024-01-{i + 2:02d}")
            for i in range(delta_total)
        ]

        storage = mock_client.service.storage
        storage.list_objects.return_value = self._single_page(listing)
        # Every head request answers with the same metadata.
        storage.head.side_effect = lambda path: Mock(metadata={"file_size": "19500000"})

        # Pre-resolved futures that the fake pool hands out, one per submit call.
        finished = []
        for i in range(delta_total):
            done = Mock()
            done.result.return_value = (f"file{i}.zip.delta", {"file_size": "19500000"})
            finished.append(done)

        pool = MagicMock()
        pool.submit.side_effect = finished

        with patch("concurrent.futures.ThreadPoolExecutor") as executor_cls, patch(
            "concurrent.futures.as_completed", return_value=finished
        ):
            executor_cls.return_value.__enter__.return_value = pool
            _ = get_bucket_stats(mock_client, "parallel-bucket")

        # min(10, 50) == 10
        executor_cls.assert_called_once_with(max_workers=10)

    def test_detailed_stats_flag(self, mock_client):
        """Record how ``detailed_stats`` currently relates to head requests."""
        storage = mock_client.service.storage
        storage.list_objects.return_value = self._single_page(
            [
                self._entry("reference.bin", 20_000_000, "2024-01-01"),
                self._entry("file.zip.delta", 50_000, "2024-01-02"),
            ]
        )

        # NOTE: this pins current behaviour - metadata for delta files is
        # requested even when detailed_stats is False (the default).
        _ = get_bucket_stats(mock_client, "test-bucket", detailed_stats=False)
        assert storage.head.called

        # Start the second half from a clean call history.
        storage.head.reset_mock()
        storage.head.return_value = Mock(metadata={"file_size": "19500000"})

        _ = get_bucket_stats(mock_client, "test-bucket", detailed_stats=True)

        # With the flag enabled metadata must be requested as well.
        assert storage.head.called

    def test_error_handling_in_metadata_fetch(self, mock_client):
        """A failing head request for one delta must not break the whole run."""
        storage = mock_client.service.storage
        storage.list_objects.return_value = self._single_page(
            [
                self._entry("reference.bin", 20_000_000, "2024-01-01"),
                self._entry("file1.zip.delta", 50_000, "2024-01-02"),
                self._entry("file2.zip.delta", 60_000, "2024-01-03"),
            ]
        )

        def flaky_head(path):
            # file1 blows up, file2 answers normally, everything else is absent.
            if "file1.zip.delta" in path:
                raise Exception("S3 error")
            if "file2.zip.delta" in path:
                return Mock(metadata={"file_size": "19600000"})
            return None

        storage.head.side_effect = flaky_head

        result = get_bucket_stats(mock_client, "error-bucket", detailed_stats=True)

        assert result.object_count == 2
        assert result.delta_objects == 2
        # file1 falls back to its delta size, file2 uses its metadata.
        assert result.total_size == 50000 + 19600000

    def test_multiple_orphaned_references(self, mock_client):
        """Several reference.bin files without deltas are all listed in the warning."""
        storage = mock_client.service.storage
        storage.list_objects.return_value = self._single_page(
            [
                self._entry("pro/reference.bin", 20_000_000, "2024-01-01"),
                self._entry("enterprise/reference.bin", 25_000_000, "2024-01-02"),
                self._entry("community/reference.bin", 15_000_000, "2024-01-03"),
                self._entry("regular.pdf", 1_000_000, "2024-01-04"),
            ]
        )
        storage.head.return_value = None

        result = get_bucket_stats(mock_client, "multi-orphaned-bucket")

        # Only regular.pdf is reported; no reference size is added.
        assert result.object_count == 1
        assert result.total_size == 1_000_000
        assert result.compressed_size == 1_000_000
        assert result.space_saved == 0

        # Join every warning into one haystack and look for each expected piece.
        logged = [str(entry) for entry in mock_client.service.logger.warning.call_args_list]
        warning_text = " ".join(logged)

        assert "ORPHANED REFERENCE FILE" in warning_text
        assert "3 reference.bin file(s)" in warning_text
        # 20M + 25M + 15M
        assert "60,000,000 bytes" in warning_text
        assert "s3://multi-orphaned-bucket/pro/reference.bin" in warning_text
        assert "s3://multi-orphaned-bucket/enterprise/reference.bin" in warning_text
        assert "s3://multi-orphaned-bucket/community/reference.bin" in warning_text
|
||||
Reference in New Issue
Block a user