mirror of
https://github.com/beshu-tech/deltaglider.git
synced 2026-05-19 05:16:54 +02:00
fix(metadata): align direct-upload keys to canonical dg-* namespace (#8)
* fix(metadata): align direct-upload keys to canonical dg-* namespace
`_upload_direct` (the path taken by non-delta-eligible files like
.sha1 / .sha512) wrote user-metadata with bare underscored keys
(`original_name`, `file_sha256`, `compression`) while delta and
reference uploads correctly used the canonical dashed namespace
(`dg-original-name`, `dg-file-sha256`, `dg-compression`).
Downstream consumers — most visibly the DeltaGlider Proxy — only
recognised the dashed form, so every .sha1 / .sha512 listing on
a bucket holding deltaglider-uploaded files produced:
WARN PATHOLOGICAL | Missing/corrupt DG metadata for
bucket/key.sha1 -- falling back to passthrough.
Error: Storage error: Missing dg-original-name
This patch aligns the writer to the canonical scheme and keeps the
read path backward-compatible with already-stored bare-keyed objects
via `resolve_metadata`. No re-upload required.
Changes
-------
* `_upload_direct` emits metadata using `f"{METADATA_PREFIX}{key}"`
(the same pattern delta/reference uploads already use).
* `METADATA_KEY_ALIASES` now lists `compression` and `source_name`
so `resolve_metadata` works for both fields uniformly.
* Replaced bare `metadata.get("compression")` /
`metadata.get("original_name")` / `metadata.get("file_size")` /
`metadata.get("ref_key")` lookups in `DeltaService.get`,
`DeltaService.delete`, `_delete_delta`, the recursive-delete
listing path, `client.list_objects_v2`, and
`client_operations.stats.get_object_info` with `resolve_metadata`
calls so legacy bare-keyed objects keep working forever.
Tests
-----
* `tests/unit/test_metadata_aliases.py` (new, 11 tests) — pins the
alias table contract: new dashed keys, legacy bare underscored
keys, legacy hyphenated keys, priority rule, empty-string
handling.
* `test_direct_upload_emits_dashed_namespace` in
`tests/unit/test_core_service.py` — pins the writer to emit only
dg-* keys.
* Existing tests using the legacy bare `compression: "none"` form
in `test_s3_compat.py` and `test_recursive_delete_reference_*.py`
still pass — proving the dual-scheme read contract holds.
Full unit suite: 87/87 pass, mypy clean, ruff clean.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* fix(metadata): also resolve legacy file_sha256 in get() dispatch
Adversarial review of the original patch caught a second
asymmetry: DeltaService.get's "is this a regular S3 object or
DeltaGlider-managed?" dispatch was a literal-string check
`"dg-file-sha256" not in obj_head.metadata`. After the writer
fix, NEW direct uploads have `dg-file-sha256` so they route
correctly. But ~4400 pre-fix `.sha1` / `.sha512` files in
production have the bare `file_sha256` key, and they were
silently being routed through the "regular S3 object" branch
instead of the "direct upload" branch.
Both branches call `_get_direct` so file content was still
served correctly — but the wrong log message fired
("Downloading regular S3 object (no DeltaGlider metadata)") and
the recorded file-size for telemetry came from obj_head.size
instead of the metadata's `file_size` (same value for direct
uploads, but still semantically wrong).
Swap the literal-string check for `resolve_metadata(meta,
"file_sha256") is None` so both schemes route to the
DeltaGlider-managed branch.
Added regression test `test_get_legacy_direct_upload_not_
misclassified_as_regular_s3` that builds a HEAD response with
the legacy bare-keyed metadata shape (exactly what's stored on
Hetzner today for the .sha files), captures the log messages,
and fails if the "regular S3 object" canary fires.
Demonstrated locally: revert the dispatch back to literal-string
check → new test fails with the canary log line. Restore →
88/88 pass.
CHANGELOG updated to document both fixes (writer + dispatch).
---------
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -132,6 +132,47 @@ class TestDeltaServicePut:
|
||||
assert issubclass(w[0].category, PolicyViolationWarning)
|
||||
assert "exceeds threshold" in str(w[0].message)
|
||||
|
||||
def test_direct_upload_emits_dashed_namespace(self, service, temp_dir, mock_storage):
|
||||
"""Direct-upload (non-delta-eligible files like .sha1) must
|
||||
write metadata in the canonical dg-* dashed namespace.
|
||||
|
||||
Pre-v6.1.2 this path wrote bare underscored keys
|
||||
(``original_name``, ``file_sha256``, ``compression``) which
|
||||
downstream tools — most notably the Rust S3 proxy — didn't
|
||||
recognise, producing a PATHOLOGICAL warning for every
|
||||
listing. Pin the writer to the canonical scheme so the
|
||||
regression doesn't return.
|
||||
"""
|
||||
# .sha1 is in the non-delta extensions list → use_delta=False
|
||||
non_archive = temp_dir / "build.zip.sha1"
|
||||
non_archive.write_text("deadbeef build.zip\n")
|
||||
|
||||
delta_space = DeltaSpace(bucket="test-bucket", prefix="releases/v1")
|
||||
mock_storage.put.return_value = PutResult(etag="direct123")
|
||||
|
||||
summary = service.put(non_archive, delta_space)
|
||||
|
||||
assert summary.operation == "upload_direct"
|
||||
# Capture the metadata dict that was passed to storage.put
|
||||
# (call_args is the LAST call; direct upload makes exactly one).
|
||||
assert mock_storage.put.called
|
||||
_full_key, _local_file, emitted_meta = mock_storage.put.call_args[0]
|
||||
|
||||
# Every key must be in the dg-* dashed namespace.
|
||||
for key in emitted_meta.keys():
|
||||
assert key.startswith("dg-"), (
|
||||
f"Direct-upload metadata key {key!r} must use the dg-* "
|
||||
f"namespace (got: {list(emitted_meta.keys())})"
|
||||
)
|
||||
|
||||
# Spot-check the canonical keys carry the expected values.
|
||||
assert emitted_meta["dg-original-name"] == "build.zip.sha1"
|
||||
assert emitted_meta["dg-compression"] == "none"
|
||||
assert emitted_meta["dg-file-size"] == str(non_archive.stat().st_size)
|
||||
assert emitted_meta["dg-tool"].startswith("deltaglider/")
|
||||
assert "dg-file-sha256" in emitted_meta
|
||||
assert "dg-created-at" in emitted_meta
|
||||
|
||||
|
||||
class TestDeltaServiceGet:
|
||||
"""Test DeltaService.get method."""
|
||||
@@ -178,6 +219,70 @@ class TestDeltaServiceGet:
|
||||
assert output_path.exists()
|
||||
assert output_path.read_bytes() == test_content
|
||||
|
||||
def test_get_legacy_direct_upload_not_misclassified_as_regular_s3(
|
||||
self, service, mock_storage, temp_dir
|
||||
):
|
||||
"""Pre-v6.1.2 direct uploads have BARE metadata keys
|
||||
(``file_sha256``, ``compression``, ``original_name``) rather
|
||||
than the dashed ``dg-*`` namespace. The "is this a regular S3
|
||||
object or a DeltaGlider-managed one?" dispatch in ``get()``
|
||||
must recognise both schemes — otherwise pre-fix uploads end
|
||||
up in the wrong code path and the "Downloading regular S3
|
||||
object" log line lies about what's actually happening.
|
||||
|
||||
Regression for the dispatch asymmetry caught during PR review.
|
||||
"""
|
||||
import hashlib
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
key = ObjectKey(bucket="test-bucket", key="releases/v1/build.zip.sha1")
|
||||
content = b"deadbeef build.zip\n"
|
||||
real_sha = hashlib.sha256(content).hexdigest()
|
||||
|
||||
# Legacy direct-upload shape — exactly what's stored on
|
||||
# Hetzner today for ~4400 .sha1 / .sha512 files.
|
||||
legacy_direct_meta = {
|
||||
"tool": "deltaglider/6.1.1",
|
||||
"original_name": "build.zip.sha1",
|
||||
"file_sha256": real_sha,
|
||||
"file_size": str(len(content)),
|
||||
"created_at": "2026-05-16T03:28:01.000000",
|
||||
"compression": "none",
|
||||
}
|
||||
mock_storage.head.return_value = ObjectHead(
|
||||
key="releases/v1/build.zip.sha1",
|
||||
size=len(content),
|
||||
etag="legacy",
|
||||
last_modified=None,
|
||||
metadata=legacy_direct_meta,
|
||||
)
|
||||
mock_stream = MagicMock()
|
||||
mock_stream.read.side_effect = [content, b""]
|
||||
mock_storage.get.return_value = mock_stream
|
||||
|
||||
# Capture the log messages so we can assert which branch fired.
|
||||
captured = []
|
||||
orig_info = service.logger.info
|
||||
|
||||
def _capture(msg, **kw):
|
||||
captured.append((msg, kw))
|
||||
orig_info(msg, **kw)
|
||||
|
||||
service.logger.info = _capture
|
||||
try:
|
||||
service.get(key, temp_dir / "out.sha1")
|
||||
finally:
|
||||
service.logger.info = orig_info
|
||||
|
||||
msgs = [m for m, _ in captured]
|
||||
# The dispatch must NOT have mistaken this for a "regular S3
|
||||
# object" — that branch's log message is the canary.
|
||||
assert "Downloading regular S3 object (no DeltaGlider metadata)" not in msgs, (
|
||||
"Legacy bare-keyed direct upload was misclassified as a "
|
||||
"regular S3 object — `get()` dispatch isn't using "
|
||||
"resolve_metadata for the file_sha256 presence check."
|
||||
)
|
||||
|
||||
|
||||
class TestDeltaServiceVerify:
|
||||
"""Test DeltaService.verify method."""
|
||||
|
||||
@@ -0,0 +1,122 @@
|
||||
"""Regression tests for the dual-scheme metadata read/write contract.
|
||||
|
||||
The CLI historically wrote direct-upload metadata with bare,
|
||||
underscored keys (``original_name``, ``file_sha256``, ``compression``)
|
||||
while delta uploads used the canonical dashed namespace
|
||||
(``dg-original-name``, ``dg-file-sha256``, etc.). Downstream
|
||||
consumers — most notably the Rust S3 proxy — only knew the dashed
|
||||
form, so every ``.sha1`` / ``.sha512`` direct upload triggered a
|
||||
PATHOLOGICAL warning when listed.
|
||||
|
||||
v6.1.2 aligned the writer to the dashed form, but the read path
|
||||
must keep recognising the legacy bare keys forever so already-stored
|
||||
objects don't break. These tests pin both halves of the contract.
|
||||
"""
|
||||
|
||||
from deltaglider.core.models import (
|
||||
METADATA_KEY_ALIASES,
|
||||
METADATA_PREFIX,
|
||||
resolve_metadata,
|
||||
)
|
||||
|
||||
|
||||
class TestResolveMetadataAliases:
|
||||
"""Verify resolve_metadata accepts every documented alias."""
|
||||
|
||||
def test_new_dashed_keys_resolve(self):
|
||||
"""The current canonical scheme: dg-*-with-dashes."""
|
||||
meta = {
|
||||
f"{METADATA_PREFIX}tool": "deltaglider/6.1.2",
|
||||
f"{METADATA_PREFIX}original-name": "build.zip",
|
||||
f"{METADATA_PREFIX}file-sha256": "deadbeef",
|
||||
f"{METADATA_PREFIX}file-size": "1024",
|
||||
f"{METADATA_PREFIX}created-at": "2026-05-17T00:00:00Z",
|
||||
f"{METADATA_PREFIX}compression": "none",
|
||||
}
|
||||
assert resolve_metadata(meta, "tool") == "deltaglider/6.1.2"
|
||||
assert resolve_metadata(meta, "original_name") == "build.zip"
|
||||
assert resolve_metadata(meta, "file_sha256") == "deadbeef"
|
||||
assert resolve_metadata(meta, "file_size") == "1024"
|
||||
assert resolve_metadata(meta, "created_at") == "2026-05-17T00:00:00Z"
|
||||
assert resolve_metadata(meta, "compression") == "none"
|
||||
|
||||
def test_legacy_bare_underscored_keys_resolve(self):
|
||||
"""Pre-v6.1.2 direct-upload shape used by historical .sha files."""
|
||||
meta = {
|
||||
"tool": "deltaglider/6.1.1",
|
||||
"original_name": "build.zip.sha1",
|
||||
"file_sha256": "feedface",
|
||||
"file_size": "41",
|
||||
"created_at": "2026-05-16T03:28:01.000000",
|
||||
"compression": "none",
|
||||
}
|
||||
assert resolve_metadata(meta, "tool") == "deltaglider/6.1.1"
|
||||
assert resolve_metadata(meta, "original_name") == "build.zip.sha1"
|
||||
assert resolve_metadata(meta, "file_sha256") == "feedface"
|
||||
assert resolve_metadata(meta, "file_size") == "41"
|
||||
assert resolve_metadata(meta, "created_at") == "2026-05-16T03:28:01.000000"
|
||||
assert resolve_metadata(meta, "compression") == "none"
|
||||
|
||||
def test_legacy_hyphenated_keys_resolve(self):
|
||||
"""Some old paths used hyphens without the dg- prefix."""
|
||||
meta = {
|
||||
"original-name": "old.zip",
|
||||
"file-sha256": "cafe1234",
|
||||
"file-size": "2048",
|
||||
}
|
||||
assert resolve_metadata(meta, "original_name") == "old.zip"
|
||||
assert resolve_metadata(meta, "file_sha256") == "cafe1234"
|
||||
assert resolve_metadata(meta, "file_size") == "2048"
|
||||
|
||||
def test_priority_new_wins_when_both_present(self):
|
||||
"""If both schemes happen to coexist on one object, prefer the
|
||||
canonical dashed key — that's the writer's current intent."""
|
||||
meta = {
|
||||
f"{METADATA_PREFIX}original-name": "new.zip",
|
||||
"original_name": "old.zip",
|
||||
}
|
||||
assert resolve_metadata(meta, "original_name") == "new.zip"
|
||||
|
||||
def test_missing_returns_none(self):
|
||||
assert resolve_metadata({}, "tool") is None
|
||||
assert resolve_metadata({"unrelated": "x"}, "original_name") is None
|
||||
|
||||
def test_empty_string_treated_as_missing(self):
|
||||
"""Empty values must not satisfy the resolver — callers rely on
|
||||
None to trigger the fallback branch."""
|
||||
meta = {f"{METADATA_PREFIX}original-name": ""}
|
||||
assert resolve_metadata(meta, "original_name") is None
|
||||
|
||||
|
||||
class TestAliasTableContract:
|
||||
"""Pin the alias-table shape so a future regression on the
|
||||
ordering (which would break `priority_new_wins_when_both_present`)
|
||||
is caught immediately."""
|
||||
|
||||
def test_every_field_lists_new_dashed_first(self):
|
||||
"""The first alias in each tuple must be the canonical
|
||||
dg-*-with-dashes form. This is what `resolve_metadata` relies
|
||||
on for the "new wins over legacy when both present" rule."""
|
||||
for field, aliases in METADATA_KEY_ALIASES.items():
|
||||
assert aliases[0].startswith(METADATA_PREFIX), (
|
||||
f"{field}: first alias {aliases[0]!r} must be dashed namespace"
|
||||
)
|
||||
|
||||
def test_every_field_includes_legacy_underscored_form(self):
|
||||
"""Backward compat: bare underscored key must always be in the
|
||||
alias list. Pre-v6.1.2 direct uploads use them, and they
|
||||
must keep resolving forever."""
|
||||
for field, aliases in METADATA_KEY_ALIASES.items():
|
||||
assert field in aliases, (
|
||||
f"{field}: alias list must include the bare underscored "
|
||||
f"key {field!r} for legacy-upload compatibility"
|
||||
)
|
||||
|
||||
def test_compression_field_present(self):
|
||||
"""v6.1.2 added `compression` to the alias table so the
|
||||
direct-upload sentinel works on both schemes."""
|
||||
assert "compression" in METADATA_KEY_ALIASES
|
||||
|
||||
def test_source_name_field_present(self):
|
||||
"""Reference files' source_name should resolve uniformly."""
|
||||
assert "source_name" in METADATA_KEY_ALIASES
|
||||
Reference in New Issue
Block a user