diff --git a/CHANGELOG.md b/CHANGELOG.md index 36e4888..93a9b40 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,20 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Fixed +- **Direct-upload metadata now uses the canonical `dg-*` dashed namespace.** Pre-fix, files routed through `_upload_direct` (non-delta-eligible extensions: `.sha1`, `.sha512`, etc.) wrote metadata with bare underscored keys (`original_name`, `file_sha256`, `compression`) while delta and reference uploads correctly used the namespaced form (`dg-original-name`, `dg-file-sha256`, `dg-compression`). Downstream consumers — most visibly the [DeltaGlider Proxy](https://github.com/beshu-tech/deltaglider_proxy) — only recognised the dashed form, so every `.sha1`/`.sha512` listing triggered a `PATHOLOGICAL | Missing/corrupt DG metadata` warning. Aligned the writer to the canonical scheme so new uploads stop producing log spam. + +### Changed +- **Read path now resolves both schemes uniformly.** The historical bare keys (`original_name`, `compression`, etc.) stay in `METADATA_KEY_ALIASES` so already-stored objects keep being recognised on read — no migration required. Replaced ad-hoc `metadata.get("compression")` / `metadata.get("original_name")` / `metadata.get("file_size")` / `metadata.get("ref_key")` lookups in `DeltaService.get`, `DeltaService.delete`, `_delete_delta`, the recursive-delete listing path, `client.list_objects_v2`, and `client_operations.stats.get_object_info` with `resolve_metadata(meta, field)` calls so both schemes work transparently for the lifetime of the bucket. New `compression` and `source_name` entries added to the alias table. +- **`DeltaService.get` "regular S3 vs DeltaGlider-managed" dispatch** now uses `resolve_metadata` for the `file_sha256` presence check. Pre-fix, this check looked for the literal string `"dg-file-sha256"` in `obj_head.metadata`, which silently misclassified legacy bare-keyed direct uploads (`file_sha256` without the `dg-` prefix) as "regular S3 objects" — they still served correctly because both branches call `_get_direct`, but the wrong log line fired and the wrong `file_size` value was recorded for telemetry. Caught during adversarial PR review. + +### Added +- **Regression tests for the dual-scheme contract** (`tests/unit/test_metadata_aliases.py`, 11 tests): every alias resolves, new dashed keys win when both are present, empty strings count as missing, the alias-table shape is pinned (first alias dashed, bare underscored alias always present, `compression` + `source_name` present). +- **`test_direct_upload_emits_dashed_namespace`** in `test_core_service.py` pins the writer to emit `dg-*`-only metadata so the original underscored regression cannot return. +- **`test_get_legacy_direct_upload_not_misclassified_as_regular_s3`** in `test_core_service.py` pins the `get()` dispatch to route bare-keyed legacy direct uploads through the DeltaGlider-managed branch (not the "regular S3 object" passthrough). Demonstrated to fail without the corresponding `resolve_metadata` swap, pass with it. + ## [6.1.1] - 2026-03-23 ### Fixed diff --git a/src/deltaglider/client.py b/src/deltaglider/client.py index b64d117..b9b4840 100644 --- a/src/deltaglider/client.py +++ b/src/deltaglider/client.py @@ -42,7 +42,7 @@ from .client_operations.stats import StatsMode from .core import DeltaService, DeltaSpace, ObjectKey from .core.errors import NotFoundError -from .core.models import DeleteResult +from .core.models import DeleteResult, resolve_metadata from .core.object_listing import ObjectListing, list_objects_page from .core.s3_uri import parse_s3_url from .response_builders import ( @@ -398,10 +398,17 @@ class DeltaGliderClient: obj_head = self.service.storage.head(f"{Bucket}/{obj['key']}") if obj_head and obj_head.metadata: metadata = obj_head.metadata - # Update with actual compression stats - original_size = int(metadata.get("file_size", obj["size"])) + # Update with actual compression stats. Use + # `resolve_metadata` so we accept both the new + # dashed `dg-*` keys and the legacy bare ones. + file_size_raw = resolve_metadata(metadata, "file_size") + original_size = int(file_size_raw) if file_size_raw else obj["size"] + # `compression_ratio` isn't in the alias table + # (it's a derived stat, not part of the core + # metadata contract) so fall back to plain + # get() with the legacy bare key. compression_ratio = float(metadata.get("compression_ratio", 0.0)) - reference_key = metadata.get("ref_key") + reference_key = resolve_metadata(metadata, "ref_key") deltaglider_metadata["deltaglider-original-size"] = str(original_size) deltaglider_metadata["deltaglider-compression-ratio"] = str( diff --git a/src/deltaglider/client_operations/stats.py b/src/deltaglider/client_operations/stats.py index 8c366bd..bf4c28c 100644 --- a/src/deltaglider/client_operations/stats.py +++ b/src/deltaglider/client_operations/stats.py @@ -17,6 +17,7 @@ from typing import Any, Literal from ..client_models import BucketStats, CompressionEstimate, ObjectInfo from ..core.delta_extensions import is_delta_candidate +from ..core.models import resolve_metadata from ..core.object_listing import list_all_objects from ..core.s3_uri import parse_s3_url @@ -549,16 +550,22 @@ def get_object_info( metadata = obj_head.metadata is_delta = key.endswith(".delta") + # Use resolve_metadata for the dg-* namespace keys so we read + # both new (dashed-prefixed) and legacy (bare underscored) uploads + # transparently. `last_modified`, `etag`, `compression_ratio` are + # not part of the dg-* contract — they're per-listing or derived + # fields and stay on direct .get() lookups. + file_size_raw = resolve_metadata(metadata, "file_size") return ObjectInfo( key=key, size=obj_head.size, last_modified=metadata.get("last_modified", ""), etag=metadata.get("etag"), - original_size=int(metadata.get("file_size", obj_head.size)), + original_size=int(file_size_raw) if file_size_raw else obj_head.size, compressed_size=obj_head.size, compression_ratio=float(metadata.get("compression_ratio", 0.0)), is_delta=is_delta, - reference_key=metadata.get("ref_key"), + reference_key=resolve_metadata(metadata, "ref_key"), ) diff --git a/src/deltaglider/core/models.py b/src/deltaglider/core/models.py index bff7b94..3d72878 100644 --- a/src/deltaglider/core/models.py +++ b/src/deltaglider/core/models.py @@ -58,6 +58,20 @@ METADATA_KEY_ALIASES: dict[str, tuple[str, ...]] = { "delta-cmd", ), "note": (f"{METADATA_PREFIX}note", "dg_note", "note"), + # `compression` was historically written bare (no prefix) by the + # direct-upload path; v6.1.2 aligned it to the dashed namespace. + # Both forms must continue to resolve so already-stored objects + # keep being recognised on read. + "compression": (f"{METADATA_PREFIX}compression", "dg_compression", "compression"), + # `source-name` is reference-only metadata. Listed here so a + # single call to `resolve_metadata(meta, "source_name")` works + # uniformly with the rest of this table. + "source_name": ( + f"{METADATA_PREFIX}source-name", + "dg_source_name", + "source_name", + "source-name", + ), } diff --git a/src/deltaglider/core/service.py b/src/deltaglider/core/service.py index db985a8..a4557de 100644 --- a/src/deltaglider/core/service.py +++ b/src/deltaglider/core/service.py @@ -30,6 +30,7 @@ from .errors import ( PolicyViolationWarning, ) from .models import ( + METADATA_PREFIX, DeleteResult, DeltaMeta, DeltaSpace, @@ -177,9 +178,15 @@ class DeltaService: if obj_head is None: raise NotFoundError(f"Object not found: {object_key.key}") - # Check if this is a regular S3 object (not uploaded via DeltaGlider) - # Regular S3 objects won't have DeltaGlider metadata (dg-file-sha256 key) - if "dg-file-sha256" not in obj_head.metadata: + # Check if this is a regular S3 object (not uploaded via + # DeltaGlider). A DeltaGlider-managed object always carries a + # `file_sha256` field — could be the canonical `dg-file-sha256` + # (new direct + all delta + all reference uploads) OR the + # legacy bare `file_sha256` (pre-v6.1.2 direct uploads). Use + # `resolve_metadata` so both schemes route to the + # DeltaGlider-managed download branches instead of the + # "regular S3 object" passthrough. + if resolve_metadata(obj_head.metadata, "file_sha256") is None: # This is a regular S3 object, download it directly self.logger.info( "Downloading regular S3 object (no DeltaGlider metadata)", @@ -198,8 +205,11 @@ class DeltaService: self.metrics.timing("deltaglider.get.duration", duration) return - # Check if this is a direct upload (non-delta) uploaded via DeltaGlider - if obj_head.metadata.get("compression") == "none": + # Check if this is a direct upload (non-delta) uploaded via + # DeltaGlider. Use `resolve_metadata` so we recognise both the + # legacy bare `compression` key and the new dashed + # `dg-compression` key. + if resolve_metadata(obj_head.metadata, "compression") == "none": # Direct download without delta processing self._get_direct(object_key, obj_head, out) duration = (self.clock.now() - start_time).total_seconds() @@ -591,14 +601,22 @@ class DeltaService: key = original_name full_key = f"{delta_space.bucket}/{key}" - # Create metadata for the file + # Create metadata for the file using the dashed `dg-*` + # namespace so direct uploads match the same scheme as delta / + # reference uploads. Pre-v6.1.2 versions wrote these keys bare + # (e.g. `original_name` instead of `dg-original-name`); the + # METADATA_KEY_ALIASES table in core/models.py keeps the bare + # forms resolvable on read so already-stored objects keep + # working. New uploads emit the canonical dashed form so + # downstream consumers (the Rust S3 proxy in particular) stop + # logging PATHOLOGICAL warnings on every .sha1 / .sha512 list. metadata = { - "tool": self.tool_version, - "original_name": original_name, - "file_sha256": file_sha256, - "file_size": str(file_size), - "created_at": self.clock.now().isoformat(), - "compression": "none", # Mark as non-compressed + f"{METADATA_PREFIX}tool": self.tool_version, + f"{METADATA_PREFIX}original-name": original_name, + f"{METADATA_PREFIX}file-sha256": file_sha256, + f"{METADATA_PREFIX}file-size": str(file_size), + f"{METADATA_PREFIX}created-at": self.clock.now().isoformat(), + f"{METADATA_PREFIX}compression": "none", # Mark as non-compressed } # Upload the file directly @@ -642,11 +660,13 @@ class DeltaService: self._delete_reference(object_key, full_key, result) elif object_key.key.endswith(".delta"): self._delete_delta(object_key, full_key, obj_head, result) - elif obj_head.metadata.get("compression") == "none": + elif resolve_metadata(obj_head.metadata, "compression") == "none": self.storage.delete(full_key) result.deleted = True result.type = "direct" - result.original_name = obj_head.metadata.get("original_name", object_key.key) + result.original_name = ( + resolve_metadata(obj_head.metadata, "original_name") or object_key.key + ) else: self.storage.delete(full_key) result.deleted = True @@ -712,7 +732,7 @@ class DeltaService: self.storage.delete(full_key) result.deleted = True result.type = "delta" - result.original_name = obj_head.metadata.get("original_name", "unknown") + result.original_name = resolve_metadata(obj_head.metadata, "original_name") or "unknown" if "/" not in object_key.key: return @@ -841,7 +861,7 @@ class DeltaService: affected_deltaspaces.add("/".join(obj.key.split("/")[:-1])) else: obj_head = self.storage.head(f"{bucket}/{obj.key}") - if obj_head and obj_head.metadata.get("compression") == "none": + if obj_head and resolve_metadata(obj_head.metadata, "compression") == "none": direct_uploads.append(obj.key) else: other_objects.append(obj.key) diff --git a/tests/unit/test_core_service.py b/tests/unit/test_core_service.py index 4cbf012..716172e 100644 --- a/tests/unit/test_core_service.py +++ b/tests/unit/test_core_service.py @@ -132,6 +132,47 @@ class TestDeltaServicePut: assert issubclass(w[0].category, PolicyViolationWarning) assert "exceeds threshold" in str(w[0].message) + def test_direct_upload_emits_dashed_namespace(self, service, temp_dir, mock_storage): + """Direct-upload (non-delta-eligible files like .sha1) must + write metadata in the canonical dg-* dashed namespace. + + Pre-v6.1.2 this path wrote bare underscored keys + (``original_name``, ``file_sha256``, ``compression``) which + downstream tools — most notably the Rust S3 proxy — didn't + recognise, producing a PATHOLOGICAL warning for every + listing. Pin the writer to the canonical scheme so the + regression doesn't return. + """ + # .sha1 is in the non-delta extensions list → use_delta=False + non_archive = temp_dir / "build.zip.sha1" + non_archive.write_text("deadbeef build.zip\n") + + delta_space = DeltaSpace(bucket="test-bucket", prefix="releases/v1") + mock_storage.put.return_value = PutResult(etag="direct123") + + summary = service.put(non_archive, delta_space) + + assert summary.operation == "upload_direct" + # Capture the metadata dict that was passed to storage.put + # (call_args is the LAST call; direct upload makes exactly one). + assert mock_storage.put.called + _full_key, _local_file, emitted_meta = mock_storage.put.call_args[0] + + # Every key must be in the dg-* dashed namespace. + for key in emitted_meta.keys(): + assert key.startswith("dg-"), ( + f"Direct-upload metadata key {key!r} must use the dg-* " + f"namespace (got: {list(emitted_meta.keys())})" + ) + + # Spot-check the canonical keys carry the expected values. + assert emitted_meta["dg-original-name"] == "build.zip.sha1" + assert emitted_meta["dg-compression"] == "none" + assert emitted_meta["dg-file-size"] == str(non_archive.stat().st_size) + assert emitted_meta["dg-tool"].startswith("deltaglider/") + assert "dg-file-sha256" in emitted_meta + assert "dg-created-at" in emitted_meta + class TestDeltaServiceGet: """Test DeltaService.get method.""" @@ -178,6 +219,70 @@ class TestDeltaServiceGet: assert output_path.exists() assert output_path.read_bytes() == test_content + def test_get_legacy_direct_upload_not_misclassified_as_regular_s3( + self, service, mock_storage, temp_dir + ): + """Pre-v6.1.2 direct uploads have BARE metadata keys + (``file_sha256``, ``compression``, ``original_name``) rather + than the dashed ``dg-*`` namespace. The "is this a regular S3 + object or a DeltaGlider-managed one?" dispatch in ``get()`` + must recognise both schemes — otherwise pre-fix uploads end + up in the wrong code path and the "Downloading regular S3 + object" log line lies about what's actually happening. + + Regression for the dispatch asymmetry caught during PR review. + """ + import hashlib + from unittest.mock import MagicMock + + key = ObjectKey(bucket="test-bucket", key="releases/v1/build.zip.sha1") + content = b"deadbeef build.zip\n" + real_sha = hashlib.sha256(content).hexdigest() + + # Legacy direct-upload shape — exactly what's stored on + # Hetzner today for ~4400 .sha1 / .sha512 files. + legacy_direct_meta = { + "tool": "deltaglider/6.1.1", + "original_name": "build.zip.sha1", + "file_sha256": real_sha, + "file_size": str(len(content)), + "created_at": "2026-05-16T03:28:01.000000", + "compression": "none", + } + mock_storage.head.return_value = ObjectHead( + key="releases/v1/build.zip.sha1", + size=len(content), + etag="legacy", + last_modified=None, + metadata=legacy_direct_meta, + ) + mock_stream = MagicMock() + mock_stream.read.side_effect = [content, b""] + mock_storage.get.return_value = mock_stream + + # Capture the log messages so we can assert which branch fired. + captured = [] + orig_info = service.logger.info + + def _capture(msg, **kw): + captured.append((msg, kw)) + orig_info(msg, **kw) + + service.logger.info = _capture + try: + service.get(key, temp_dir / "out.sha1") + finally: + service.logger.info = orig_info + + msgs = [m for m, _ in captured] + # The dispatch must NOT have mistaken this for a "regular S3 + # object" — that branch's log message is the canary. + assert "Downloading regular S3 object (no DeltaGlider metadata)" not in msgs, ( + "Legacy bare-keyed direct upload was misclassified as a " + "regular S3 object — `get()` dispatch isn't using " + "resolve_metadata for the file_sha256 presence check." + ) + class TestDeltaServiceVerify: """Test DeltaService.verify method.""" diff --git a/tests/unit/test_metadata_aliases.py b/tests/unit/test_metadata_aliases.py new file mode 100644 index 0000000..aed2e9b --- /dev/null +++ b/tests/unit/test_metadata_aliases.py @@ -0,0 +1,122 @@ +"""Regression tests for the dual-scheme metadata read/write contract. + +The CLI historically wrote direct-upload metadata with bare, +underscored keys (``original_name``, ``file_sha256``, ``compression``) +while delta uploads used the canonical dashed namespace +(``dg-original-name``, ``dg-file-sha256``, etc.). Downstream +consumers — most notably the Rust S3 proxy — only knew the dashed +form, so every ``.sha1`` / ``.sha512`` direct upload triggered a +PATHOLOGICAL warning when listed. + +v6.1.2 aligned the writer to the dashed form, but the read path +must keep recognising the legacy bare keys forever so already-stored +objects don't break. These tests pin both halves of the contract. +""" + +from deltaglider.core.models import ( + METADATA_KEY_ALIASES, + METADATA_PREFIX, + resolve_metadata, +) + + +class TestResolveMetadataAliases: + """Verify resolve_metadata accepts every documented alias.""" + + def test_new_dashed_keys_resolve(self): + """The current canonical scheme: dg-*-with-dashes.""" + meta = { + f"{METADATA_PREFIX}tool": "deltaglider/6.1.2", + f"{METADATA_PREFIX}original-name": "build.zip", + f"{METADATA_PREFIX}file-sha256": "deadbeef", + f"{METADATA_PREFIX}file-size": "1024", + f"{METADATA_PREFIX}created-at": "2026-05-17T00:00:00Z", + f"{METADATA_PREFIX}compression": "none", + } + assert resolve_metadata(meta, "tool") == "deltaglider/6.1.2" + assert resolve_metadata(meta, "original_name") == "build.zip" + assert resolve_metadata(meta, "file_sha256") == "deadbeef" + assert resolve_metadata(meta, "file_size") == "1024" + assert resolve_metadata(meta, "created_at") == "2026-05-17T00:00:00Z" + assert resolve_metadata(meta, "compression") == "none" + + def test_legacy_bare_underscored_keys_resolve(self): + """Pre-v6.1.2 direct-upload shape used by historical .sha files.""" + meta = { + "tool": "deltaglider/6.1.1", + "original_name": "build.zip.sha1", + "file_sha256": "feedface", + "file_size": "41", + "created_at": "2026-05-16T03:28:01.000000", + "compression": "none", + } + assert resolve_metadata(meta, "tool") == "deltaglider/6.1.1" + assert resolve_metadata(meta, "original_name") == "build.zip.sha1" + assert resolve_metadata(meta, "file_sha256") == "feedface" + assert resolve_metadata(meta, "file_size") == "41" + assert resolve_metadata(meta, "created_at") == "2026-05-16T03:28:01.000000" + assert resolve_metadata(meta, "compression") == "none" + + def test_legacy_hyphenated_keys_resolve(self): + """Some old paths used hyphens without the dg- prefix.""" + meta = { + "original-name": "old.zip", + "file-sha256": "cafe1234", + "file-size": "2048", + } + assert resolve_metadata(meta, "original_name") == "old.zip" + assert resolve_metadata(meta, "file_sha256") == "cafe1234" + assert resolve_metadata(meta, "file_size") == "2048" + + def test_priority_new_wins_when_both_present(self): + """If both schemes happen to coexist on one object, prefer the + canonical dashed key — that's the writer's current intent.""" + meta = { + f"{METADATA_PREFIX}original-name": "new.zip", + "original_name": "old.zip", + } + assert resolve_metadata(meta, "original_name") == "new.zip" + + def test_missing_returns_none(self): + assert resolve_metadata({}, "tool") is None + assert resolve_metadata({"unrelated": "x"}, "original_name") is None + + def test_empty_string_treated_as_missing(self): + """Empty values must not satisfy the resolver — callers rely on + None to trigger the fallback branch.""" + meta = {f"{METADATA_PREFIX}original-name": ""} + assert resolve_metadata(meta, "original_name") is None + + +class TestAliasTableContract: + """Pin the alias-table shape so a future regression on the + ordering (which would break `priority_new_wins_when_both_present`) + is caught immediately.""" + + def test_every_field_lists_new_dashed_first(self): + """The first alias in each tuple must be the canonical + dg-*-with-dashes form. This is what `resolve_metadata` relies + on for the "new wins over legacy when both present" rule.""" + for field, aliases in METADATA_KEY_ALIASES.items(): + assert aliases[0].startswith(METADATA_PREFIX), ( + f"{field}: first alias {aliases[0]!r} must be dashed namespace" + ) + + def test_every_field_includes_legacy_underscored_form(self): + """Backward compat: bare underscored key must always be in the + alias list. Pre-v6.1.2 direct uploads use them, and they + must keep resolving forever.""" + for field, aliases in METADATA_KEY_ALIASES.items(): + assert field in aliases, ( + f"{field}: alias list must include the bare underscored " + f"key {field!r} for legacy-upload compatibility" + ) + + def test_compression_field_present(self): + """v6.1.2 added `compression` to the alias table so the + direct-upload sentinel works on both schemes.""" + assert "compression" in METADATA_KEY_ALIASES + + def test_source_name_field_present(self): + """Reference files' source_name should resolve uniformly.""" + assert "source_name" in METADATA_KEY_ALIASES