mirror of
https://github.com/beshu-tech/deltaglider.git
synced 2026-04-17 22:19:43 +02:00
Initial commit: DeltaGlider - 99.9% compression for S3 storage
DeltaGlider reduces storage costs by storing only binary deltas between similar files. Achieves 99.9% compression for versioned artifacts. Key features: - Intelligent file type detection (delta for archives, direct for others) - Drop-in S3 replacement with automatic compression - SHA256 integrity verification on every operation - Clean hexagonal architecture - Full test coverage - Production tested with 200K+ files Case study: ReadOnlyREST reduced 4TB to 5GB (99.9% compression)
This commit is contained in:
1
tests/integration/__init__.py
Normal file
1
tests/integration/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Integration tests for DeltaGlider."""
|
||||
191
tests/integration/test_full_workflow.py
Normal file
191
tests/integration/test_full_workflow.py
Normal file
@@ -0,0 +1,191 @@
|
||||
"""Integration test for full put/get workflow."""
|
||||
|
||||
import io
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from deltaglider.core import DeltaService, Leaf, ObjectKey
|
||||
|
||||
|
||||
def test_full_put_get_workflow(service, temp_dir, mock_storage, mock_diff):
    """Round-trip test: upload two versions, then restore the delta-compressed one."""
    # Original and modified payloads.
    first_version = b"This is the first version of the file."
    second_version = b"This is the second version of the file with changes."

    src_v1 = temp_dir / "version1.txt"
    src_v2 = temp_dir / "version2.txt"
    restored = temp_dir / "recovered.txt"

    src_v1.write_bytes(first_version)
    src_v2.write_bytes(second_version)

    # Decoding a delta always reproduces the second version's bytes.
    mock_diff.decode.side_effect = lambda base, delta, out: out.write_bytes(second_version)

    leaf = Leaf(bucket="test-bucket", prefix="test/data")

    # In-memory stand-in for the object store: key -> {"content", "head"}.
    objects = {}

    def fake_head(key):
        """Return stored head metadata, or None when the key is absent."""
        entry = objects.get(key)
        return entry["head"] if entry else None

    def fake_put(key, body, metadata, content_type="application/octet-stream"):
        """Record the object's bytes and synthesize an ObjectHead for it."""
        from deltaglider.ports.storage import PutResult, ObjectHead

        # Normalize the body: it may arrive as a Path, raw bytes, or a stream.
        if isinstance(body, Path):
            payload = body.read_bytes()
        elif isinstance(body, bytes):
            payload = body
        else:
            payload = body.read()

        objects[key] = {
            "content": payload,
            "head": ObjectHead(
                key=key.split("/", 1)[1],
                size=len(payload),
                etag="mock-etag",
                last_modified=None,
                metadata=metadata,
            ),
        }
        return PutResult(etag="mock-etag")

    def fake_get(key):
        """Stream stored bytes; tolerate keys given with or without the bucket prefix."""
        entry = objects.get(key)
        if entry is None:
            qualified = key if key.startswith("test-bucket/") else f"test-bucket/{key}"
            entry = objects.get(qualified)
        if entry is None:
            raise FileNotFoundError(f"Object not found: {key}")
        return io.BytesIO(entry["content"])

    mock_storage.head.side_effect = fake_head
    mock_storage.put.side_effect = fake_put
    mock_storage.get.side_effect = fake_get

    # Step 1: the first upload becomes the reference blob.
    summary1 = service.put(src_v1, leaf)
    assert summary1.operation == "create_reference"
    assert summary1.key == "test/data/reference.bin"

    # The reference must now exist in storage with the original bytes.
    ref_key = f"{leaf.bucket}/{leaf.reference_key()}"
    assert ref_key in objects
    assert objects[ref_key]["content"] == first_version

    # Step 2: the second upload is stored as a delta against the reference.
    summary2 = service.put(src_v2, leaf)
    assert summary2.operation == "create_delta"
    assert summary2.key == "test/data/version2.txt.delta"
    assert summary2.delta_size is not None
    assert summary2.ref_key == "test/data/reference.bin"

    delta_key = f"{leaf.bucket}/{summary2.key}"
    assert delta_key in objects

    # Step 3: download and reconstruct the delta-compressed object.
    obj_key = ObjectKey(bucket=leaf.bucket, key=summary2.key)
    service.get(obj_key, restored)

    # Step 4: the reconstructed bytes must match the second version exactly.
    assert restored.read_bytes() == second_version
|
||||
|
||||
|
||||
def test_get_with_auto_delta_suffix(service, temp_dir, mock_storage, mock_diff):
    """Retrieve a stored object via its auto-appended `.delta` key."""
    payload = b"Test file content for auto-suffix test."
    source = temp_dir / "mydata.zip"
    source.write_bytes(payload)

    # Decoding always reconstructs the original payload.
    mock_diff.decode.side_effect = lambda base, delta, out: out.write_bytes(payload)

    leaf = Leaf(bucket="test-bucket", prefix="archive")

    # In-memory stand-in for the object store: key -> {"content", "head"}.
    objects = {}

    def fake_head(key):
        """Return stored head metadata, or None when the key is absent."""
        entry = objects.get(key)
        return entry["head"] if entry else None

    def fake_put(key, body, metadata, content_type="application/octet-stream"):
        """Record the object's bytes and synthesize an ObjectHead for it."""
        from deltaglider.ports.storage import PutResult, ObjectHead

        # Normalize the body: it may arrive as a Path, raw bytes, or a stream.
        if isinstance(body, Path):
            payload_bytes = body.read_bytes()
        elif isinstance(body, bytes):
            payload_bytes = body
        else:
            payload_bytes = body.read()

        objects[key] = {
            "content": payload_bytes,
            "head": ObjectHead(
                key=key.split("/", 1)[1],
                size=len(payload_bytes),
                etag="mock-etag",
                last_modified=None,
                metadata=metadata,
            ),
        }
        return PutResult(etag="mock-etag")

    def fake_get(key):
        """Stream stored bytes; tolerate keys given with or without the bucket prefix."""
        entry = objects.get(key)
        if entry is None:
            qualified = key if key.startswith("test-bucket/") else f"test-bucket/{key}"
            entry = objects.get(qualified)
        if entry is None:
            raise FileNotFoundError(f"Object not found: {key}")
        return io.BytesIO(entry["content"])

    mock_storage.head.side_effect = fake_head
    mock_storage.put.side_effect = fake_put
    mock_storage.get.side_effect = fake_get

    # Upload the file.
    summary = service.put(source, leaf)

    # Fetch it back: the caller uses the original name, so the service must
    # internally resolve "mydata.zip" to "mydata.zip.delta".
    target = temp_dir / "recovered.zip"

    if summary.operation == "create_reference":
        # First upload created the reference plus a zero-diff delta alongside it.
        obj_key = ObjectKey(bucket=leaf.bucket, key="archive/mydata.zip.delta")
    else:
        obj_key = ObjectKey(bucket=leaf.bucket, key=summary.key)

    service.get(obj_key, target)

    # The reconstructed bytes must match the uploaded payload exactly.
    assert target.read_bytes() == payload
|
||||
135
tests/integration/test_get_command.py
Normal file
135
tests/integration/test_get_command.py
Normal file
@@ -0,0 +1,135 @@
|
||||
"""Integration test for get command."""
|
||||
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import pytest
|
||||
from click.testing import CliRunner
|
||||
|
||||
from deltaglider.app.cli.main import cli
|
||||
from deltaglider.core import ObjectKey
|
||||
|
||||
|
||||
@pytest.fixture
def mock_service():
    """Provide a fully mocked DeltaService so CLI tests never touch real storage."""
    return Mock()
|
||||
|
||||
|
||||
def test_get_command_with_original_name(mock_service):
    """`get` with the original filename should transparently target the `.delta` key."""
    cli_runner = CliRunner()
    mock_service.get = Mock()

    with patch("deltaglider.app.cli.main.create_service", return_value=mock_service):
        # Invoke `get` with the plain name; the CLI appends ".delta" itself.
        result = cli_runner.invoke(cli, ["get", "s3://test-bucket/data/myfile.zip"])

        # Command succeeds and reports both the lookup and the reconstruction.
        assert result.exit_code == 0
        assert "Looking for delta file: s3://test-bucket/data/myfile.zip.delta" in result.output
        assert "Successfully reconstructed: myfile.zip" in result.output

        # The service must receive the suffixed key and the default output path.
        mock_service.get.assert_called_once()
        requested_key, destination = mock_service.get.call_args[0][:2]

        assert isinstance(requested_key, ObjectKey)
        assert requested_key.bucket == "test-bucket"
        assert requested_key.key == "data/myfile.zip.delta"
        assert destination == Path("myfile.zip")
|
||||
|
||||
|
||||
def test_get_command_with_delta_name(mock_service):
    """`get` with an explicit `.delta` filename should use the key verbatim."""
    cli_runner = CliRunner()
    mock_service.get = Mock()

    with patch("deltaglider.app.cli.main.create_service", return_value=mock_service):
        # Invoke `get` with the delta key spelled out by the caller.
        result = cli_runner.invoke(cli, ["get", "s3://test-bucket/data/myfile.zip.delta"])

        # Command succeeds; no auto-suffix message because the suffix was explicit.
        assert result.exit_code == 0
        assert "Looking for delta file" not in result.output
        assert "Successfully reconstructed: myfile.zip" in result.output

        # The service must receive the delta key and the stripped output name.
        mock_service.get.assert_called_once()
        requested_key, destination = mock_service.get.call_args[0][:2]

        assert isinstance(requested_key, ObjectKey)
        assert requested_key.bucket == "test-bucket"
        assert requested_key.key == "data/myfile.zip.delta"
        assert destination == Path("myfile.zip")
|
||||
|
||||
|
||||
def test_get_command_with_output_option(mock_service):
    """`get -o` redirects the reconstructed file to a caller-chosen path."""
    cli_runner = CliRunner()
    mock_service.get = Mock()

    with patch("deltaglider.app.cli.main.create_service", return_value=mock_service):
        with tempfile.TemporaryDirectory() as workdir:
            destination = Path(workdir) / "custom_output.zip"

            # Invoke `get` with a custom output location.
            result = cli_runner.invoke(
                cli,
                [
                    "get",
                    "s3://test-bucket/data/myfile.zip",
                    "-o", str(destination),
                ],
            )

            # Command succeeds and reports the custom destination.
            assert result.exit_code == 0
            assert f"Successfully reconstructed: {destination}" in result.output

            # The service must receive the suffixed key and the custom path.
            mock_service.get.assert_called_once()
            requested_key, output_path = mock_service.get.call_args[0][:2]

            assert isinstance(requested_key, ObjectKey)
            assert requested_key.bucket == "test-bucket"
            assert requested_key.key == "data/myfile.zip.delta"
            assert output_path == destination
|
||||
|
||||
|
||||
def test_get_command_error_handling(mock_service):
    """Errors raised by the service surface as exit code 1 with an Error message."""
    cli_runner = CliRunner()
    # Simulate a missing delta on the service side.
    mock_service.get = Mock(side_effect=FileNotFoundError("Delta not found"))

    with patch("deltaglider.app.cli.main.create_service", return_value=mock_service):
        result = cli_runner.invoke(cli, ["get", "s3://test-bucket/data/missing.zip"])

        # The CLI converts the exception into a non-zero exit and error text.
        assert result.exit_code == 1
        assert "Error: Delta not found" in result.output
|
||||
|
||||
|
||||
def test_get_command_invalid_url():
    """A non-s3:// URL is rejected before any service work happens."""
    cli_runner = CliRunner()

    # Invoke `get` with an http:// URL, which the CLI does not accept.
    result = cli_runner.invoke(cli, ["get", "http://invalid-url/file.zip"])

    # The CLI fails fast with exit code 1 and an explanatory message.
    assert result.exit_code == 1
    assert "Error: Invalid S3 URL" in result.output
|
||||
106
tests/integration/test_xdelta.py
Normal file
106
tests/integration/test_xdelta.py
Normal file
@@ -0,0 +1,106 @@
|
||||
"""Integration tests for xdelta3."""
|
||||
|
||||
|
||||
import pytest
|
||||
|
||||
from deltaglider.adapters import XdeltaAdapter
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("skip_if_no_xdelta")
class TestXdeltaIntegration:
    """Exercise the real xdelta3 binary through XdeltaAdapter."""

    def test_encode_decode_roundtrip(self, temp_dir):
        """Encoding then decoding must reproduce the target file exactly."""
        adapter = XdeltaAdapter()

        base_file = temp_dir / "base.txt"
        base_file.write_text("This is the base file content.")

        target_file = temp_dir / "target.txt"
        target_file.write_text("This is the modified target file content with changes.")

        delta_file = temp_dir / "delta.bin"
        restored = temp_dir / "output.txt"

        # Encode target against base; a non-empty delta must appear.
        adapter.encode(base_file, target_file, delta_file)
        assert delta_file.exists()
        assert delta_file.stat().st_size > 0

        # Decode and compare against the original target.
        adapter.decode(base_file, delta_file, restored)
        assert restored.read_text() == target_file.read_text()

    def test_encode_identical_files(self, temp_dir):
        """Identical inputs should compress to a tiny delta (<10% of the source)."""
        adapter = XdeltaAdapter()

        shared = "This is identical content in both files." * 100
        base_file = temp_dir / "base.txt"
        base_file.write_text(shared)

        target_file = temp_dir / "target.txt"
        target_file.write_text(shared)

        delta_file = temp_dir / "delta.bin"
        adapter.encode(base_file, target_file, delta_file)

        # A zero-diff encode should be far smaller than the original content.
        assert delta_file.exists()
        assert delta_file.stat().st_size < len(shared) / 10

    def test_encode_completely_different_files(self, temp_dir):
        """Encoding unrelated inputs still succeeds without error."""
        adapter = XdeltaAdapter()

        base_file = temp_dir / "base.txt"
        base_file.write_text("A" * 1000)

        target_file = temp_dir / "target.txt"
        target_file.write_text("B" * 1000)

        delta_file = temp_dir / "delta.bin"
        adapter.encode(base_file, target_file, delta_file)

        # No size bound asserted: the delta is roughly target-sized, though
        # xdelta3's own compression may still shrink it somewhat.
        assert delta_file.exists()

    def test_encode_binary_files(self, temp_dir):
        """Binary payloads round-trip through encode/decode unchanged."""
        adapter = XdeltaAdapter()

        base_file = temp_dir / "base.bin"
        base_file.write_bytes(b"\x00\x01\x02\x03" * 256)

        target_file = temp_dir / "target.bin"
        target_file.write_bytes(b"\x00\x01\x02\x03" * 200 + b"\xFF\xFE\xFD\xFC" * 56)

        delta_file = temp_dir / "delta.bin"
        restored = temp_dir / "output.bin"

        adapter.encode(base_file, target_file, delta_file)
        adapter.decode(base_file, delta_file, restored)

        # Byte-for-byte equality proves the binary roundtrip is lossless.
        assert restored.read_bytes() == target_file.read_bytes()
|
||||
Reference in New Issue
Block a user