From d023600420865f58e4ead3eba089e9aa2c175dae Mon Sep 17 00:00:00 2001 From: Quratulain-bilal Date: Mon, 18 May 2026 20:58:22 +0500 Subject: [PATCH 1/7] fix(catalogs): validate extension and preset catalog payload shape MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `ExtensionCatalog._fetch_single_catalog` and `PresetCatalog._fetch_single_catalog` only check that the `extensions` / `presets` key is *present* in the parsed catalog JSON. They don't check that the value is a JSON object, and they don't check that the root is a JSON object at all. A malformed (or compromised) upstream catalog returning: {"schema_version": "1.0", "extensions": []} passes both `"extensions" not in catalog_data` and the subsequent `response.read()` JSON parse, gets cached on disk, and then crashes deep inside `_get_merged_extensions` (resp. `_get_merged_packs`) with: AttributeError: 'list' object has no attribute 'items' instead of the existing user-facing `ExtensionError("Invalid catalog format from ")` / `PresetError("Invalid preset catalog format")` that the surrounding code is clearly trying to produce. The sibling integration-catalog reader already validates this — see `src/specify_cli/integrations/catalog.py` where the fetch path explicitly checks both `isinstance(catalog_data, dict)` and `isinstance(catalog_data.get("integrations"), dict)` before returning. This change mirrors that pattern in the extension and preset readers so the three catalog fetchers stay consistent and a malformed upstream surfaces as the user-facing error instead of a raw Python traceback. Adds parametrized regression tests covering: - root payload is not a JSON object (list, str, int, null) - root is a dict but `extensions` / `presets` value is the wrong type (list, str, null, int) All eight bad-payload shapes now raise the expected catalog error. --- src/specify_cli/extensions.py | 19 +++++++++++++++ src/specify_cli/presets.py | 20 ++++++++++++++++ tests/test_extensions.py | 45 +++++++++++++++++++++++++++++++++++ tests/test_presets.py | 45 +++++++++++++++++++++++++++++++++++ 4 files changed, 129 insertions(+) diff --git a/src/specify_cli/extensions.py b/src/specify_cli/extensions.py index 871503f0ae..5ad6303838 100644 --- a/src/specify_cli/extensions.py +++ b/src/specify_cli/extensions.py @@ -1839,8 +1839,27 @@ def _fetch_single_catalog(self, entry: CatalogEntry, force_refresh: bool = False with self._open_url(entry.url, timeout=10) as response: catalog_data = json.loads(response.read()) + # Validate payload shape before iteration. Checking only key + # presence would let a payload like ``{"extensions": []}`` or + # ``{"extensions": null}`` slip through here and then crash with + # ``AttributeError: 'list' object has no attribute 'items'`` deep + # inside ``_get_merged_extensions``. The sibling integration + # catalog reader already guards both the root object and the + # nested mapping (see ``integrations/catalog.py``); the extension + # catalog must stay consistent so a malformed upstream surfaces as + # the user-facing ``Invalid catalog format`` error instead of a + # raw Python traceback. + if not isinstance(catalog_data, dict): + raise ExtensionError( + f"Invalid catalog format from {entry.url}: expected a JSON object" + ) if "schema_version" not in catalog_data or "extensions" not in catalog_data: raise ExtensionError(f"Invalid catalog format from {entry.url}") + if not isinstance(catalog_data.get("extensions"), dict): + raise ExtensionError( + f"Invalid catalog format from {entry.url}: " + "'extensions' must be a JSON object" + ) # Save to cache self.cache_dir.mkdir(parents=True, exist_ok=True) diff --git a/src/specify_cli/presets.py b/src/specify_cli/presets.py index fd9d4745f1..523b50726a 100644 --- a/src/specify_cli/presets.py +++ b/src/specify_cli/presets.py @@ -2045,11 +2045,31 @@ def _fetch_single_catalog(self, entry: PresetCatalogEntry, force_refresh: bool = with self._open_url(entry.url, timeout=10) as response: catalog_data = json.loads(response.read()) + # Validate payload shape before iteration. Checking only key + # presence would let a payload like ``{"presets": []}`` or + # ``{"presets": null}`` slip through here and then crash with + # ``AttributeError: 'list' object has no attribute 'items'`` deep + # inside ``_get_merged_packs``. The sibling integration catalog + # reader already guards both the root object and the nested + # mapping (see ``integrations/catalog.py``); the preset catalog + # must stay consistent so a malformed upstream surfaces as the + # user-facing ``Invalid preset catalog format`` error instead of + # a raw Python traceback. + if not isinstance(catalog_data, dict): + raise PresetError( + f"Invalid preset catalog format from {entry.url}: " + "expected a JSON object" + ) if ( "schema_version" not in catalog_data or "presets" not in catalog_data ): raise PresetError("Invalid preset catalog format") + if not isinstance(catalog_data.get("presets"), dict): + raise PresetError( + f"Invalid preset catalog format from {entry.url}: " + "'presets' must be a JSON object" + ) self.cache_dir.mkdir(parents=True, exist_ok=True) cache_file.write_text(json.dumps(catalog_data, indent=2)) diff --git a/tests/test_extensions.py b/tests/test_extensions.py index 153388a541..f1ce74c33a 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -2577,6 +2577,51 @@ def fake_open(req, timeout=None): assert captured["req"].get_header("Authorization") == "Bearer ghp_testtoken" + @pytest.mark.parametrize( + "payload", + [ + # Root is not a JSON object. + [], + "oops", + 42, + None, + # Root is fine but ``extensions`` is the wrong type. + {"schema_version": "1.0", "extensions": []}, + {"schema_version": "1.0", "extensions": "oops"}, + {"schema_version": "1.0", "extensions": None}, + {"schema_version": "1.0", "extensions": 42}, + ], + ) + def test_fetch_single_catalog_rejects_malformed_payload(self, temp_dir, payload): + """Malformed catalog payloads raise ExtensionError, not AttributeError. + + Without this guard, a payload like ``{"extensions": []}`` would pass the + key-presence check and then crash with ``AttributeError: 'list' object + has no attribute 'items'`` deep inside ``_get_merged_extensions``. The + sibling integration catalog reader already validates both the root + object and the nested mapping (see ``integrations/catalog.py``); the + extension catalog must stay consistent. + """ + from unittest.mock import patch, MagicMock + + catalog = self._make_catalog(temp_dir) + + mock_response = MagicMock() + mock_response.read.return_value = json.dumps(payload).encode() + mock_response.__enter__ = lambda s: s + mock_response.__exit__ = MagicMock(return_value=False) + + entry = CatalogEntry( + url="https://example.com/catalog.json", + name="default", + priority=1, + install_allowed=True, + ) + + with patch.object(catalog, "_open_url", return_value=mock_response): + with pytest.raises(ExtensionError, match="Invalid catalog format"): + catalog._fetch_single_catalog(entry, force_refresh=True) + def test_download_extension_sends_auth_header(self, temp_dir, monkeypatch): """download_extension passes Authorization header when a provider is configured.""" from unittest.mock import patch, MagicMock diff --git a/tests/test_presets.py b/tests/test_presets.py index 8fa700fa77..1928840d96 100644 --- a/tests/test_presets.py +++ b/tests/test_presets.py @@ -1514,6 +1514,51 @@ def fake_open(req, timeout=None): assert captured["req"].get_header("Authorization") == "Bearer ghp_testtoken" + @pytest.mark.parametrize( + "payload", + [ + # Root is not a JSON object. + [], + "oops", + 42, + None, + # Root is fine but ``presets`` is the wrong type. + {"schema_version": "1.0", "presets": []}, + {"schema_version": "1.0", "presets": "oops"}, + {"schema_version": "1.0", "presets": None}, + {"schema_version": "1.0", "presets": 42}, + ], + ) + def test_fetch_single_catalog_rejects_malformed_payload(self, project_dir, payload): + """Malformed catalog payloads raise PresetError, not AttributeError. + + Without this guard, a payload like ``{"presets": []}`` would pass the + key-presence check and then crash with ``AttributeError: 'list' object + has no attribute 'items'`` deep inside ``_get_merged_packs``. The + sibling integration catalog reader already validates both the root + object and the nested mapping (see ``integrations/catalog.py``); the + preset catalog must stay consistent. + """ + from unittest.mock import patch, MagicMock + + catalog = PresetCatalog(project_dir) + + mock_response = MagicMock() + mock_response.read.return_value = json.dumps(payload).encode() + mock_response.__enter__ = lambda s: s + mock_response.__exit__ = MagicMock(return_value=False) + + entry = PresetCatalogEntry( + url="https://example.com/catalog.json", + name="default", + priority=1, + install_allowed=True, + ) + + with patch.object(catalog, "_open_url", return_value=mock_response): + with pytest.raises(PresetError, match="Invalid preset catalog format"): + catalog._fetch_single_catalog(entry, force_refresh=True) + def test_download_pack_sends_auth_header(self, project_dir, monkeypatch): """download_pack passes Authorization header when configured.""" from unittest.mock import patch, MagicMock From df3823f711eec103aca816adfbb98b0dce5499c5 Mon Sep 17 00:00:00 2001 From: Quratulain-bilal Date: Thu, 21 May 2026 07:35:40 +0500 Subject: [PATCH 2/7] fix(catalogs): skip non-mapping entries during extension and preset merge Addresses Copilot review feedback on this PR. `_fetch_single_catalog` now validates that the ``extensions`` / ``presets`` value is a mapping, but it doesn't (and shouldn't) validate every entry inside that mapping. A payload like: {"schema_version": "1.0", "extensions": {"good": {...}, "bad": []}} passes the fetch-level guard, then later crashes inside ``_get_merged_extensions`` (resp. ``_get_merged_packs``) at ``{**ext_data, ...}`` with ``TypeError: 'list' object is not a mapping``. The sibling integration-catalog reader at ``src/specify_cli/integrations/catalog.py:245`` handles this with a per-entry ``isinstance(integ_data, dict)`` skip during merge, so one malformed entry doesn't poison an otherwise valid catalog. This change mirrors that pattern in the extension and preset mergers and adds regression tests asserting that valid entries continue to merge while malformed siblings are silently dropped. --- src/specify_cli/extensions.py | 10 +++++++++ src/specify_cli/presets.py | 11 +++++++++ tests/test_extensions.py | 42 +++++++++++++++++++++++++++++++++++ tests/test_presets.py | 41 ++++++++++++++++++++++++++++++++++ 4 files changed, 104 insertions(+) diff --git a/src/specify_cli/extensions.py b/src/specify_cli/extensions.py index 5ad6303838..19782aacb5 100644 --- a/src/specify_cli/extensions.py +++ b/src/specify_cli/extensions.py @@ -1914,6 +1914,16 @@ def _get_merged_extensions(self, force_refresh: bool = False) -> List[Dict[str, continue for ext_id, ext_data in catalog_data.get("extensions", {}).items(): + # Per-entry guard: ``_fetch_single_catalog`` already validates + # that ``catalog_data["extensions"]`` is a mapping, but it + # does not (and should not) validate every entry shape there + # — one malformed entry shouldn't poison an otherwise valid + # catalog. Skip non-mapping entries here so a payload like + # ``{"extensions": {"foo": [], "bar": {...}}}`` still merges + # the valid entries without crashing on ``**ext_data``. + # Mirrors ``integrations/catalog.py:245``. + if not isinstance(ext_data, dict): + continue if ext_id not in merged: # Higher-priority catalog wins merged[ext_id] = { **ext_data, diff --git a/src/specify_cli/presets.py b/src/specify_cli/presets.py index 523b50726a..39611cd8b7 100644 --- a/src/specify_cli/presets.py +++ b/src/specify_cli/presets.py @@ -2103,6 +2103,17 @@ def _get_merged_packs(self, force_refresh: bool = False) -> Dict[str, Dict[str, try: data = self._fetch_single_catalog(entry, force_refresh) for pack_id, pack_data in data.get("presets", {}).items(): + # Per-entry guard: ``_fetch_single_catalog`` already + # validates that ``data["presets"]`` is a mapping, but it + # does not (and should not) validate every entry shape + # there — one malformed entry shouldn't poison an + # otherwise valid catalog. Skip non-mapping entries here + # so a payload like ``{"presets": {"foo": [], "bar": + # {...}}}`` still merges the valid entries without + # crashing on ``**pack_data``. Mirrors + # ``integrations/catalog.py:245``. + if not isinstance(pack_data, dict): + continue pack_data_with_catalog = {**pack_data, "_catalog_name": entry.name, "_install_allowed": entry.install_allowed} merged[pack_id] = pack_data_with_catalog except PresetError: diff --git a/tests/test_extensions.py b/tests/test_extensions.py index f1ce74c33a..db8c080d7e 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -2622,6 +2622,48 @@ def test_fetch_single_catalog_rejects_malformed_payload(self, temp_dir, payload) with pytest.raises(ExtensionError, match="Invalid catalog format"): catalog._fetch_single_catalog(entry, force_refresh=True) + def test_get_merged_extensions_skips_non_mapping_entries(self, temp_dir): + """Per-entry guard: one malformed entry shouldn't poison the merge. + + ``_fetch_single_catalog`` validates that ``extensions`` is a mapping, + but it doesn't (and shouldn't) validate every entry inside it — a + single bad entry in an otherwise-valid catalog should be skipped, not + crash the whole resolve path. Mirrors the per-entry skip in + ``integrations/catalog.py``: a malformed entry returns no error, + valid entries continue to merge normally. + """ + from unittest.mock import patch, MagicMock + + catalog = self._make_catalog(temp_dir) + # Mix of valid entry, list-shaped entry, and string-shaped entry. + payload = { + "schema_version": "1.0", + "extensions": { + "good": {"name": "Good", "version": "1.0.0"}, + "bad-list": [], + "bad-str": "oops", + }, + } + mock_response = MagicMock() + mock_response.read.return_value = json.dumps(payload).encode() + mock_response.__enter__ = lambda s: s + mock_response.__exit__ = MagicMock(return_value=False) + + entry = CatalogEntry( + url="https://example.com/catalog.json", + name="default", + priority=1, + install_allowed=True, + ) + + with patch.object(catalog, "_open_url", return_value=mock_response), \ + patch.object(catalog, "get_active_catalogs", return_value=[entry]): + merged = catalog._get_merged_extensions(force_refresh=True) + + # Only the well-formed entry survives; the two malformed entries are + # silently dropped rather than raising or crashing. + assert [ext["id"] for ext in merged] == ["good"] + def test_download_extension_sends_auth_header(self, temp_dir, monkeypatch): """download_extension passes Authorization header when a provider is configured.""" from unittest.mock import patch, MagicMock diff --git a/tests/test_presets.py b/tests/test_presets.py index 1928840d96..514365d1a5 100644 --- a/tests/test_presets.py +++ b/tests/test_presets.py @@ -1559,6 +1559,47 @@ def test_fetch_single_catalog_rejects_malformed_payload(self, project_dir, paylo with pytest.raises(PresetError, match="Invalid preset catalog format"): catalog._fetch_single_catalog(entry, force_refresh=True) + def test_get_merged_packs_skips_non_mapping_entries(self, project_dir): + """Per-entry guard: one malformed entry shouldn't poison the merge. + + ``_fetch_single_catalog`` validates that ``presets`` is a mapping, + but it doesn't (and shouldn't) validate every entry inside it — a + single bad entry in an otherwise-valid catalog should be skipped, + not crash the whole resolve path. Mirrors the per-entry skip in + ``integrations/catalog.py``: a malformed entry returns no error, + valid entries continue to merge normally. + """ + from unittest.mock import patch, MagicMock + + catalog = PresetCatalog(project_dir) + payload = { + "schema_version": "1.0", + "presets": { + "good": {"name": "Good", "version": "1.0.0"}, + "bad-list": [], + "bad-str": "oops", + }, + } + mock_response = MagicMock() + mock_response.read.return_value = json.dumps(payload).encode() + mock_response.__enter__ = lambda s: s + mock_response.__exit__ = MagicMock(return_value=False) + + entry = PresetCatalogEntry( + url="https://example.com/catalog.json", + name="default", + priority=1, + install_allowed=True, + ) + + with patch.object(catalog, "_open_url", return_value=mock_response), \ + patch.object(catalog, "get_active_catalogs", return_value=[entry]): + merged = catalog._get_merged_packs(force_refresh=True) + + # Only the well-formed entry survives; the two malformed entries are + # silently dropped rather than raising or crashing. + assert list(merged.keys()) == ["good"] + def test_download_pack_sends_auth_header(self, project_dir, monkeypatch): """download_pack passes Authorization header when configured.""" from unittest.mock import patch, MagicMock From 964324a8efa96bc0b54c7c4e1eaeb5f4c41402a1 Mon Sep 17 00:00:00 2001 From: Quratulain-bilal Date: Sat, 23 May 2026 16:07:23 +0500 Subject: [PATCH 3/7] fix(catalogs): validate cached extension and preset payload shape MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses Copilot review feedback on this PR (round 2). The earlier commits in this branch added payload-shape validation on the network fetch path. The cache-hit path still returned ``json.loads(cache_file.read_text())`` directly without re-checking the shape, so a cache poisoned by an older spec-kit version (or a manual edit, or an upstream that briefly served a bad payload before the network guards landed) would re-crash every invocation of ``_get_merged_extensions`` / ``_get_merged_packs`` with ``AttributeError: 'list' object has no attribute 'items'`` despite the cache being "valid" by age. Extracts the shape validation into ``_validate_catalog_payload`` on both ``ExtensionCatalog`` and ``PresetCatalog``, and calls it from both the cache-load and network-fetch branches of ``_fetch_single_catalog``. If the cached payload fails validation, the cache read is treated like a ``json.JSONDecodeError`` — the cached value is discarded and the function falls through to the network fetch, which refreshes the cache with a clean payload on success. Never propagates ``AttributeError`` to the caller. Regression tests parametrize the four root-bad-type variants plus three ``extensions``/``presets``-bad-type variants per file, asserting that a poisoned cache silently recovers via network refetch and returns the freshly-fetched payload. --- src/specify_cli/extensions.py | 74 +++++++++++++++++++++----------- src/specify_cli/presets.py | 80 +++++++++++++++++++++++------------ tests/test_extensions.py | 70 ++++++++++++++++++++++++++++++ tests/test_presets.py | 71 +++++++++++++++++++++++++++++++ 4 files changed, 244 insertions(+), 51 deletions(-) diff --git a/src/specify_cli/extensions.py b/src/specify_cli/extensions.py index 19782aacb5..de67dfcdd8 100644 --- a/src/specify_cli/extensions.py +++ b/src/specify_cli/extensions.py @@ -1702,6 +1702,44 @@ def _open_url(self, url: str, timeout: int = 10): from specify_cli.authentication.http import open_url return open_url(url, timeout) + def _validate_catalog_payload(self, catalog_data: Any, url: str) -> None: + """Validate a parsed catalog payload's shape. + + Applied to both network-fetched and cache-loaded payloads so a + once-poisoned cache (older spec-kit version, manual edit, upstream + served a bad payload before the network-side guards were added) + cannot re-crash ``_get_merged_extensions`` on subsequent calls. + + Checking only key presence would let a payload like + ``{"extensions": []}`` or ``{"extensions": null}`` slip through + here and then crash with ``AttributeError: 'list' object has no + attribute 'items'`` deep inside ``_get_merged_extensions``. The + sibling integration catalog reader already guards both the root + object and the nested mapping (see ``integrations/catalog.py``); + the extension catalog must stay consistent so a malformed payload + surfaces as the user-facing ``Invalid catalog format`` error + instead of a raw Python traceback. + + Args: + catalog_data: Parsed JSON payload from the catalog source. + url: Source URL — used in the error message so the user can + tell which catalog in a multi-catalog stack is malformed. + + Raises: + ExtensionError: If the payload's shape is invalid. + """ + if not isinstance(catalog_data, dict): + raise ExtensionError( + f"Invalid catalog format from {url}: expected a JSON object" + ) + if "schema_version" not in catalog_data or "extensions" not in catalog_data: + raise ExtensionError(f"Invalid catalog format from {url}") + if not isinstance(catalog_data.get("extensions"), dict): + raise ExtensionError( + f"Invalid catalog format from {url}: " + "'extensions' must be a JSON object" + ) + def get_active_catalogs(self) -> List[CatalogEntry]: """Get the ordered list of active catalogs. @@ -1827,11 +1865,19 @@ def _fetch_single_catalog(self, entry: CatalogEntry, force_refresh: bool = False # If metadata is invalid or missing expected fields, treat cache as invalid pass - # Use cache if valid + # Use cache if valid. A previously-cached payload must clear the + # same shape checks as a freshly-fetched one — otherwise a once- + # poisoned cache (older spec-kit version, manual edit, upstream + # served a bad payload before the network-side guards were added) + # would re-crash on every invocation despite the cache being + # "valid" by age. If validation fails on the cached read, fall + # through to the network fetch path so the cache gets refreshed. if is_valid: try: - return json.loads(cache_file.read_text()) - except json.JSONDecodeError: + cached_data = json.loads(cache_file.read_text()) + self._validate_catalog_payload(cached_data, entry.url) + return cached_data + except (json.JSONDecodeError, ExtensionError): pass # Fetch from network @@ -1839,27 +1885,7 @@ def _fetch_single_catalog(self, entry: CatalogEntry, force_refresh: bool = False with self._open_url(entry.url, timeout=10) as response: catalog_data = json.loads(response.read()) - # Validate payload shape before iteration. Checking only key - # presence would let a payload like ``{"extensions": []}`` or - # ``{"extensions": null}`` slip through here and then crash with - # ``AttributeError: 'list' object has no attribute 'items'`` deep - # inside ``_get_merged_extensions``. The sibling integration - # catalog reader already guards both the root object and the - # nested mapping (see ``integrations/catalog.py``); the extension - # catalog must stay consistent so a malformed upstream surfaces as - # the user-facing ``Invalid catalog format`` error instead of a - # raw Python traceback. - if not isinstance(catalog_data, dict): - raise ExtensionError( - f"Invalid catalog format from {entry.url}: expected a JSON object" - ) - if "schema_version" not in catalog_data or "extensions" not in catalog_data: - raise ExtensionError(f"Invalid catalog format from {entry.url}") - if not isinstance(catalog_data.get("extensions"), dict): - raise ExtensionError( - f"Invalid catalog format from {entry.url}: " - "'extensions' must be a JSON object" - ) + self._validate_catalog_payload(catalog_data, entry.url) # Save to cache self.cache_dir.mkdir(parents=True, exist_ok=True) diff --git a/src/specify_cli/presets.py b/src/specify_cli/presets.py index 39611cd8b7..89fd033187 100644 --- a/src/specify_cli/presets.py +++ b/src/specify_cli/presets.py @@ -1860,6 +1860,48 @@ def _open_url(self, url: str, timeout: int = 10): from specify_cli.authentication.http import open_url return open_url(url, timeout) + def _validate_catalog_payload(self, catalog_data: Any, url: str) -> None: + """Validate a parsed preset-catalog payload's shape. + + Applied to both network-fetched and cache-loaded payloads so a + once-poisoned cache (older spec-kit version, manual edit, upstream + served a bad payload before the network-side guards were added) + cannot re-crash ``_get_merged_packs`` on subsequent calls. + + Checking only key presence would let a payload like + ``{"presets": []}`` or ``{"presets": null}`` slip through here and + then crash with ``AttributeError: 'list' object has no attribute + 'items'`` deep inside ``_get_merged_packs``. The sibling + integration catalog reader already guards both the root object and + the nested mapping (see ``integrations/catalog.py``); the preset + catalog must stay consistent so a malformed payload surfaces as + the user-facing ``Invalid preset catalog format`` error instead of + a raw Python traceback. + + Args: + catalog_data: Parsed JSON payload from the catalog source. + url: Source URL — used in the error message so the user can + tell which catalog in a multi-catalog stack is malformed. + + Raises: + PresetError: If the payload's shape is invalid. + """ + if not isinstance(catalog_data, dict): + raise PresetError( + f"Invalid preset catalog format from {url}: " + "expected a JSON object" + ) + if ( + "schema_version" not in catalog_data + or "presets" not in catalog_data + ): + raise PresetError("Invalid preset catalog format") + if not isinstance(catalog_data.get("presets"), dict): + raise PresetError( + f"Invalid preset catalog format from {url}: " + "'presets' must be a JSON object" + ) + def _load_catalog_config(self, config_path: Path) -> Optional[List[PresetCatalogEntry]]: """Load catalog stack configuration from a YAML file. @@ -2035,41 +2077,25 @@ def _fetch_single_catalog(self, entry: PresetCatalogEntry, force_refresh: bool = """ cache_file, metadata_file = self._get_cache_paths(entry.url) + # Use cache if valid. A previously-cached payload must clear the + # same shape checks as a freshly-fetched one — otherwise a once- + # poisoned cache would re-crash on every invocation despite the + # cache being "valid" by age. If validation fails on the cached + # read, fall through to the network fetch path so the cache gets + # refreshed. if not force_refresh and self._is_url_cache_valid(entry.url): try: - return json.loads(cache_file.read_text()) - except json.JSONDecodeError: + cached_data = json.loads(cache_file.read_text()) + self._validate_catalog_payload(cached_data, entry.url) + return cached_data + except (json.JSONDecodeError, PresetError): pass try: with self._open_url(entry.url, timeout=10) as response: catalog_data = json.loads(response.read()) - # Validate payload shape before iteration. Checking only key - # presence would let a payload like ``{"presets": []}`` or - # ``{"presets": null}`` slip through here and then crash with - # ``AttributeError: 'list' object has no attribute 'items'`` deep - # inside ``_get_merged_packs``. The sibling integration catalog - # reader already guards both the root object and the nested - # mapping (see ``integrations/catalog.py``); the preset catalog - # must stay consistent so a malformed upstream surfaces as the - # user-facing ``Invalid preset catalog format`` error instead of - # a raw Python traceback. - if not isinstance(catalog_data, dict): - raise PresetError( - f"Invalid preset catalog format from {entry.url}: " - "expected a JSON object" - ) - if ( - "schema_version" not in catalog_data - or "presets" not in catalog_data - ): - raise PresetError("Invalid preset catalog format") - if not isinstance(catalog_data.get("presets"), dict): - raise PresetError( - f"Invalid preset catalog format from {entry.url}: " - "'presets' must be a JSON object" - ) + self._validate_catalog_payload(catalog_data, entry.url) self.cache_dir.mkdir(parents=True, exist_ok=True) cache_file.write_text(json.dumps(catalog_data, indent=2)) diff --git a/tests/test_extensions.py b/tests/test_extensions.py index db8c080d7e..4e89663a3d 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -2622,6 +2622,76 @@ def test_fetch_single_catalog_rejects_malformed_payload(self, temp_dir, payload) with pytest.raises(ExtensionError, match="Invalid catalog format"): catalog._fetch_single_catalog(entry, force_refresh=True) + @pytest.mark.parametrize( + "cached_payload", + [ + [], + "oops", + 42, + None, + {"schema_version": "1.0", "extensions": []}, + {"schema_version": "1.0", "extensions": "oops"}, + {"schema_version": "1.0", "extensions": None}, + ], + ) + def test_fetch_single_catalog_rejects_malformed_cached_payload( + self, temp_dir, cached_payload + ): + """A poisoned cache silently falls back to the network instead of + crashing — cached payloads pass through the same shape validation + as freshly-fetched ones. + + Without this, a cache poisoned by an older spec-kit version (or a + manual edit, or an upstream that briefly served a bad payload + before the network guards landed) would re-crash every invocation + of ``_get_merged_extensions`` despite the cache being "valid" by + age. The recovery contract is: if the cached payload fails + validation, drop it and refetch — never propagate + ``AttributeError`` to the caller. + """ + from unittest.mock import patch, MagicMock + + catalog = self._make_catalog(temp_dir) + + # Poison the default-URL cache. ``DEFAULT_CATALOG_URL`` is the + # branch that goes through ``is_cache_valid()`` (the non-default + # branch uses per-URL hashed cache files but the same code path + # below). + catalog.cache_dir.mkdir(parents=True, exist_ok=True) + catalog.cache_file.write_text(json.dumps(cached_payload)) + catalog.cache_metadata_file.write_text( + json.dumps( + { + "cached_at": datetime.now(timezone.utc).isoformat(), + "catalog_url": ExtensionCatalog.DEFAULT_CATALOG_URL, + } + ) + ) + + # Network refetch returns a valid payload so the recovery path + # can complete. + valid = { + "schema_version": "1.0", + "extensions": {"foo": {"name": "Foo", "version": "1.0.0"}}, + } + mock_response = MagicMock() + mock_response.read.return_value = json.dumps(valid).encode() + mock_response.__enter__ = lambda s: s + mock_response.__exit__ = MagicMock(return_value=False) + + entry = CatalogEntry( + url=ExtensionCatalog.DEFAULT_CATALOG_URL, + name="default", + priority=1, + install_allowed=True, + ) + + with patch.object(catalog, "_open_url", return_value=mock_response): + result = catalog._fetch_single_catalog(entry, force_refresh=False) + + # The poisoned cache was discarded and the network payload returned. + assert result == valid + def test_get_merged_extensions_skips_non_mapping_entries(self, temp_dir): """Per-entry guard: one malformed entry shouldn't poison the merge. diff --git a/tests/test_presets.py b/tests/test_presets.py index 514365d1a5..5322d1454d 100644 --- a/tests/test_presets.py +++ b/tests/test_presets.py @@ -1559,6 +1559,77 @@ def test_fetch_single_catalog_rejects_malformed_payload(self, project_dir, paylo with pytest.raises(PresetError, match="Invalid preset catalog format"): catalog._fetch_single_catalog(entry, force_refresh=True) + @pytest.mark.parametrize( + "cached_payload", + [ + [], + "oops", + 42, + None, + {"schema_version": "1.0", "presets": []}, + {"schema_version": "1.0", "presets": "oops"}, + {"schema_version": "1.0", "presets": None}, + ], + ) + def test_fetch_single_catalog_rejects_malformed_cached_payload( + self, project_dir, cached_payload + ): + """A poisoned cache silently falls back to the network instead of + crashing — cached payloads pass through the same shape validation + as freshly-fetched ones. + + Without this, a cache poisoned by an older spec-kit version (or a + manual edit, or an upstream that briefly served a bad payload + before the network guards landed) would re-crash every invocation + of ``_get_merged_packs`` despite the cache being "valid" by age. + The recovery contract is: if the cached payload fails validation, + drop it and refetch — never propagate ``AttributeError`` to the + caller. + """ + from unittest.mock import patch, MagicMock + + catalog = PresetCatalog(project_dir) + + # Poison the default-URL cache. ``DEFAULT_CATALOG_URL`` and + # non-default URLs both flow through the same cache-load branch. + cache_file, metadata_file = catalog._get_cache_paths( + catalog.DEFAULT_CATALOG_URL + ) + cache_file.parent.mkdir(parents=True, exist_ok=True) + cache_file.write_text(json.dumps(cached_payload)) + metadata_file.write_text( + json.dumps( + { + "cached_at": datetime.now(timezone.utc).isoformat(), + "catalog_url": catalog.DEFAULT_CATALOG_URL, + } + ) + ) + + # Network refetch returns a valid payload so the recovery path + # can complete. + valid = { + "schema_version": "1.0", + "presets": {"foo": {"name": "Foo", "version": "1.0.0"}}, + } + mock_response = MagicMock() + mock_response.read.return_value = json.dumps(valid).encode() + mock_response.__enter__ = lambda s: s + mock_response.__exit__ = MagicMock(return_value=False) + + entry = PresetCatalogEntry( + url=catalog.DEFAULT_CATALOG_URL, + name="default", + priority=1, + install_allowed=True, + ) + + with patch.object(catalog, "_open_url", return_value=mock_response): + result = catalog._fetch_single_catalog(entry, force_refresh=False) + + # The poisoned cache was discarded and the network payload returned. + assert result == valid + def test_get_merged_packs_skips_non_mapping_entries(self, project_dir): """Per-entry guard: one malformed entry shouldn't poison the merge. From 1708e5c32b3b188ec3eb3411adfccefbc9ee0db4 Mon Sep 17 00:00:00 2001 From: Quratulain-bilal Date: Sun, 31 May 2026 01:40:49 +0500 Subject: [PATCH 4/7] fix(catalogs): include URL in missing-keys error to match sibling branches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses Copilot review feedback on this PR (round 3). ``_validate_catalog_payload`` advertises in its docstring that the catalog URL is included in error messages "so the user can tell which catalog in a multi-catalog stack is malformed" — but the missing-keys branch raised ``PresetError("Invalid preset catalog format")`` without the URL, breaking that contract and making multi-catalog debugging harder. The root-bad-type and nested-bad-type branches in the same helper already include the URL; this commit brings the middle branch in line. For consistency, the same fix is applied to the legacy single-catalog fetch paths in ``ExtensionCatalog.fetch_catalog`` and ``PresetCatalog.fetch_catalog`` (where the URL was likewise dropped from the missing-keys error). The existing regex matchers in the regression tests target the ``"Invalid (preset )?catalog format"`` prefix, which is preserved verbatim before the ``from `` suffix — no test changes needed. --- src/specify_cli/extensions.py | 2 +- src/specify_cli/presets.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/specify_cli/extensions.py b/src/specify_cli/extensions.py index de67dfcdd8..9ee09297d5 100644 --- a/src/specify_cli/extensions.py +++ b/src/specify_cli/extensions.py @@ -2012,7 +2012,7 @@ def fetch_catalog(self, force_refresh: bool = False) -> Dict[str, Any]: # Validate catalog structure if "schema_version" not in catalog_data or "extensions" not in catalog_data: - raise ExtensionError("Invalid catalog format") + raise ExtensionError(f"Invalid catalog format from {catalog_url}") # Save to cache self.cache_dir.mkdir(parents=True, exist_ok=True) diff --git a/src/specify_cli/presets.py b/src/specify_cli/presets.py index 89fd033187..08aaa045fc 100644 --- a/src/specify_cli/presets.py +++ b/src/specify_cli/presets.py @@ -1895,7 +1895,7 @@ def _validate_catalog_payload(self, catalog_data: Any, url: str) -> None: "schema_version" not in catalog_data or "presets" not in catalog_data ): - raise PresetError("Invalid preset catalog format") + raise PresetError(f"Invalid preset catalog format from {url}") if not isinstance(catalog_data.get("presets"), dict): raise PresetError( f"Invalid preset catalog format from {url}: " @@ -2199,7 +2199,7 @@ def fetch_catalog(self, force_refresh: bool = False) -> Dict[str, Any]: "schema_version" not in catalog_data or "presets" not in catalog_data ): - raise PresetError("Invalid preset catalog format") + raise PresetError(f"Invalid preset catalog format from {catalog_url}") self.cache_dir.mkdir(parents=True, exist_ok=True) self.cache_file.write_text(json.dumps(catalog_data, indent=2)) From 7f44b256b8394734746e88814f2e378431675381 Mon Sep 17 00:00:00 2001 From: Quratulain-bilal Date: Mon, 1 Jun 2026 21:43:49 +0500 Subject: [PATCH 5/7] fix(catalogs): broaden cache except tuples and reuse validator in fetch_catalog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses Copilot review feedback on this PR (round 4): 1. ``ExtensionCatalog.fetch_catalog`` and ``PresetCatalog.fetch_catalog`` — the legacy single-catalog methods — still only checked key presence. A payload like ``42`` (root non-object) crashed with ``TypeError: argument of type 'int' is not iterable`` during the ``"schema_version" in catalog_data`` check, and an entry mapping of the wrong type crashed downstream. Both now reuse ``_validate_catalog_payload`` so the network-side behaviour of the legacy methods stays consistent with the multi-catalog ``_fetch_single_catalog`` path. (Copilot #3335623482, #3335623556.) 2. The cache-read ``except`` tuples in ``_fetch_single_catalog`` and ``fetch_catalog`` were too narrow. ``read_text`` can raise ``OSError`` (permissions / disk / handle limit) or ``UnicodeError`` (cache file written by an older client in a different encoding) in addition to ``json.JSONDecodeError``. Without those in the tuple, an unreadable cache crashed the caller instead of falling through to the network refetch the cache contract documents. Both sites now catch ``(json.JSONDecodeError, OSError, UnicodeError, )``. (Copilot #3335623588, #3335623608.) 3. While here, pinned ``encoding="utf-8"`` on every cache ``read_text`` call so cache files written by an older Windows client (with a non-UTF-8 default locale) decode the same way on a newer client. Regression tests: - ``test_fetch_catalog_rejects_malformed_payload`` — 7 parametrized payloads per file covering root-non-object + nested-bad-type variants asserting ``fetch_catalog`` raises the named domain error. - ``test_fetch_catalog_recovers_from_unreadable_cache`` — writes ``b"\xff\xfe\x00not-utf-8"`` to the cache file and asserts ``fetch_catalog`` silently falls through to the mocked network and returns the freshly-fetched payload. --- src/specify_cli/extensions.py | 36 +++++++++++----- src/specify_cli/presets.py | 39 ++++++++++++----- tests/test_extensions.py | 80 ++++++++++++++++++++++++++++++++++ tests/test_presets.py | 81 +++++++++++++++++++++++++++++++++++ 4 files changed, 214 insertions(+), 22 deletions(-) diff --git a/src/specify_cli/extensions.py b/src/specify_cli/extensions.py index 9ee09297d5..1d92b30348 100644 --- a/src/specify_cli/extensions.py +++ b/src/specify_cli/extensions.py @@ -1874,10 +1874,15 @@ def _fetch_single_catalog(self, entry: CatalogEntry, force_refresh: bool = False # through to the network fetch path so the cache gets refreshed. if is_valid: try: - cached_data = json.loads(cache_file.read_text()) + cached_data = json.loads(cache_file.read_text(encoding="utf-8")) self._validate_catalog_payload(cached_data, entry.url) return cached_data - except (json.JSONDecodeError, ExtensionError): + except (json.JSONDecodeError, OSError, UnicodeError, ExtensionError): + # Cache is best-effort: a JSON-decode failure, an OS-level + # read failure (permissions / disk / handle limit), or a + # text-encoding failure on a cache file written by an older + # client all fall through to the network fetch path. Only + # the network failure is surfaced to the caller. pass # Fetch from network @@ -1994,25 +1999,34 @@ def fetch_catalog(self, force_refresh: bool = False) -> Dict[str, Any]: Raises: ExtensionError: If catalog cannot be fetched """ - # Check cache first unless force refresh + # Fetch from network + catalog_url = self.get_catalog_url() + + # Check cache first unless force refresh. Match the + # ``_fetch_single_catalog`` cache contract: a poisoned or + # unreadable cache silently falls through to a network refetch + # rather than crashing the caller. ``_validate_catalog_payload`` + # is reused here so a cache written by an older client + # (pre-validation) is rejected and refreshed instead of returning + # the stale malformed payload. if not force_refresh and self.is_cache_valid(): try: - return json.loads(self.cache_file.read_text()) - except json.JSONDecodeError: + cached_data = json.loads(self.cache_file.read_text(encoding="utf-8")) + self._validate_catalog_payload(cached_data, catalog_url) + return cached_data + except (json.JSONDecodeError, OSError, UnicodeError, ExtensionError): pass # Fall through to network fetch - # Fetch from network - catalog_url = self.get_catalog_url() - try: import urllib.error with self._open_url(catalog_url, timeout=10) as response: catalog_data = json.loads(response.read()) - # Validate catalog structure - if "schema_version" not in catalog_data or "extensions" not in catalog_data: - raise ExtensionError(f"Invalid catalog format from {catalog_url}") + # Validate catalog structure. Reuses the same helper as + # ``_fetch_single_catalog`` so all three branches (root type, + # missing keys, nested-mapping type) stay consistent. + self._validate_catalog_payload(catalog_data, catalog_url) # Save to cache self.cache_dir.mkdir(parents=True, exist_ok=True) diff --git a/src/specify_cli/presets.py b/src/specify_cli/presets.py index 08aaa045fc..ec3f1cf5be 100644 --- a/src/specify_cli/presets.py +++ b/src/specify_cli/presets.py @@ -2085,10 +2085,15 @@ def _fetch_single_catalog(self, entry: PresetCatalogEntry, force_refresh: bool = # refreshed. if not force_refresh and self._is_url_cache_valid(entry.url): try: - cached_data = json.loads(cache_file.read_text()) + cached_data = json.loads(cache_file.read_text(encoding="utf-8")) self._validate_catalog_payload(cached_data, entry.url) return cached_data - except (json.JSONDecodeError, PresetError): + except (json.JSONDecodeError, OSError, UnicodeError, PresetError): + # Cache is best-effort: a JSON-decode failure, an OS-level + # read failure (permissions / disk / handle limit), or a + # text-encoding failure on a cache file written by an + # older client all fall through to the network fetch path. + # Only the network failure is surfaced to the caller. pass try: @@ -2182,24 +2187,36 @@ def fetch_catalog(self, force_refresh: bool = False) -> Dict[str, Any]: """ catalog_url = self.get_catalog_url() + # Match the ``_fetch_single_catalog`` cache contract: a poisoned + # or unreadable cache silently falls through to a network refetch + # rather than crashing the caller. ``_validate_catalog_payload`` + # is reused here so a cache written by an older client + # (pre-validation) is rejected and refreshed instead of returning + # the stale malformed payload. if not force_refresh and self.is_cache_valid(): try: - metadata = json.loads(self.cache_metadata_file.read_text()) + metadata = json.loads( + self.cache_metadata_file.read_text(encoding="utf-8") + ) if metadata.get("catalog_url") == catalog_url: - return json.loads(self.cache_file.read_text()) - except (json.JSONDecodeError, OSError): - # Cache is corrupt or unreadable; fall through to network fetch + cached_data = json.loads( + self.cache_file.read_text(encoding="utf-8") + ) + self._validate_catalog_payload(cached_data, catalog_url) + return cached_data + except (json.JSONDecodeError, OSError, UnicodeError, PresetError): + # Cache is corrupt, unreadable, or fails the shape check; + # fall through to network fetch. pass try: with self._open_url(catalog_url, timeout=10) as response: catalog_data = json.loads(response.read()) - if ( - "schema_version" not in catalog_data - or "presets" not in catalog_data - ): - raise PresetError(f"Invalid preset catalog format from {catalog_url}") + # Validate catalog structure. Reuses the same helper as + # ``_fetch_single_catalog`` so all three branches (root type, + # missing keys, nested-mapping type) stay consistent. + self._validate_catalog_payload(catalog_data, catalog_url) self.cache_dir.mkdir(parents=True, exist_ok=True) self.cache_file.write_text(json.dumps(catalog_data, indent=2)) diff --git a/tests/test_extensions.py b/tests/test_extensions.py index 4e89663a3d..be72f3263c 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -2692,6 +2692,86 @@ def test_fetch_single_catalog_rejects_malformed_cached_payload( # The poisoned cache was discarded and the network payload returned. assert result == valid + @pytest.mark.parametrize( + "payload", + [ + # Root is not a JSON object. + [], + "oops", + 42, + None, + # Root is fine but ``extensions`` is the wrong type. + {"schema_version": "1.0", "extensions": []}, + {"schema_version": "1.0", "extensions": "oops"}, + {"schema_version": "1.0", "extensions": None}, + ], + ) + def test_fetch_catalog_rejects_malformed_payload(self, temp_dir, payload): + """Legacy ``fetch_catalog`` reuses the same shape-validation helper. + + Before this change ``fetch_catalog`` only checked key presence — so + a payload like ``42`` would crash with + ``TypeError: argument of type 'int' is not iterable`` during the + ``"schema_version" in catalog_data`` check, and an entry mapping + of the wrong type would crash downstream. Reusing + ``_validate_catalog_payload`` keeps the network-side behaviour of + the legacy single-catalog method consistent with the multi-catalog + ``_fetch_single_catalog`` path. + """ + from unittest.mock import patch, MagicMock + + catalog = self._make_catalog(temp_dir) + mock_response = MagicMock() + mock_response.read.return_value = json.dumps(payload).encode() + mock_response.__enter__ = lambda s: s + mock_response.__exit__ = MagicMock(return_value=False) + + with patch.object(catalog, "_open_url", return_value=mock_response): + with pytest.raises(ExtensionError, match="Invalid catalog format"): + catalog.fetch_catalog(force_refresh=True) + + def test_fetch_catalog_recovers_from_unreadable_cache(self, temp_dir): + """An unreadable / wrong-encoded cache file silently refetches. + + The cache contract is best-effort: a JSON-decode failure, an OS + read failure (permissions / disk / handle limit), or an invalid + text encoding on a cache file written by an older client must + all fall through to the network fetch rather than crash the + caller. Covers Copilot's review point that the previous + ``except (json.JSONDecodeError,)`` was too narrow. + """ + from unittest.mock import patch, MagicMock + + catalog = self._make_catalog(temp_dir) + # Write invalid UTF-8 bytes to the cache file so ``read_text`` + # raises ``UnicodeDecodeError`` (a subclass of ``UnicodeError``). + catalog.cache_dir.mkdir(parents=True, exist_ok=True) + catalog.cache_file.write_bytes(b"\xff\xfe\x00not-utf-8") + catalog.cache_metadata_file.write_text( + json.dumps( + { + "cached_at": datetime.now(timezone.utc).isoformat(), + "catalog_url": ExtensionCatalog.DEFAULT_CATALOG_URL, + } + ), + encoding="utf-8", + ) + + valid = { + "schema_version": "1.0", + "extensions": {"foo": {"name": "Foo", "version": "1.0.0"}}, + } + mock_response = MagicMock() + mock_response.read.return_value = json.dumps(valid).encode() + mock_response.__enter__ = lambda s: s + mock_response.__exit__ = MagicMock(return_value=False) + + with patch.object(catalog, "_open_url", return_value=mock_response): + result = catalog.fetch_catalog(force_refresh=False) + + # Recovered via network rather than crashing on the unreadable cache. + assert result == valid + def test_get_merged_extensions_skips_non_mapping_entries(self, temp_dir): """Per-entry guard: one malformed entry shouldn't poison the merge. diff --git a/tests/test_presets.py b/tests/test_presets.py index 5322d1454d..85275e6bdc 100644 --- a/tests/test_presets.py +++ b/tests/test_presets.py @@ -1630,6 +1630,87 @@ def test_fetch_single_catalog_rejects_malformed_cached_payload( # The poisoned cache was discarded and the network payload returned. assert result == valid + @pytest.mark.parametrize( + "payload", + [ + # Root is not a JSON object. + [], + "oops", + 42, + None, + # Root is fine but ``presets`` is the wrong type. + {"schema_version": "1.0", "presets": []}, + {"schema_version": "1.0", "presets": "oops"}, + {"schema_version": "1.0", "presets": None}, + ], + ) + def test_fetch_catalog_rejects_malformed_payload(self, project_dir, payload): + """Legacy ``fetch_catalog`` reuses the same shape-validation helper. + + Before this change ``fetch_catalog`` only checked key presence — + so a payload like ``42`` would crash with + ``TypeError: argument of type 'int' is not iterable`` during the + ``"schema_version" in catalog_data`` check, and an entry mapping + of the wrong type would crash downstream. Reusing + ``_validate_catalog_payload`` keeps the network-side behaviour of + the legacy single-catalog method consistent with the multi-catalog + ``_fetch_single_catalog`` path. + """ + from unittest.mock import patch, MagicMock + + catalog = PresetCatalog(project_dir) + mock_response = MagicMock() + mock_response.read.return_value = json.dumps(payload).encode() + mock_response.__enter__ = lambda s: s + mock_response.__exit__ = MagicMock(return_value=False) + + with patch.object(catalog, "_open_url", return_value=mock_response): + with pytest.raises(PresetError, match="Invalid preset catalog format"): + catalog.fetch_catalog(force_refresh=True) + + def test_fetch_catalog_recovers_from_unreadable_cache(self, project_dir): + """An unreadable / wrong-encoded cache file silently refetches. + + The cache contract is best-effort: a JSON-decode failure, an OS + read failure (permissions / disk / handle limit), or an invalid + text encoding on a cache file written by an older client must + all fall through to the network fetch rather than crash the + caller. Covers Copilot's review point that the previous + ``except (json.JSONDecodeError, OSError)`` was missing + ``UnicodeError``. + """ + from unittest.mock import patch, MagicMock + + catalog = PresetCatalog(project_dir) + catalog.cache_dir.mkdir(parents=True, exist_ok=True) + # Invalid UTF-8 bytes so ``read_text`` raises ``UnicodeDecodeError`` + # (a subclass of ``UnicodeError``). + catalog.cache_file.write_bytes(b"\xff\xfe\x00not-utf-8") + catalog.cache_metadata_file.write_text( + json.dumps( + { + "cached_at": datetime.now(timezone.utc).isoformat(), + "catalog_url": catalog.get_catalog_url(), + } + ), + encoding="utf-8", + ) + + valid = { + "schema_version": "1.0", + "presets": {"foo": {"name": "Foo", "version": "1.0.0"}}, + } + mock_response = MagicMock() + mock_response.read.return_value = json.dumps(valid).encode() + mock_response.__enter__ = lambda s: s + mock_response.__exit__ = MagicMock(return_value=False) + + with patch.object(catalog, "_open_url", return_value=mock_response): + result = catalog.fetch_catalog(force_refresh=False) + + # Recovered via network rather than crashing on the unreadable cache. + assert result == valid + def test_get_merged_packs_skips_non_mapping_entries(self, project_dir): """Per-entry guard: one malformed entry shouldn't poison the merge. From 00020615567f3f08ee5c04df5db38bf63e74e02f Mon Sep 17 00:00:00 2001 From: Quratulain-bilal Date: Wed, 3 Jun 2026 00:03:20 +0500 Subject: [PATCH 6/7] fix(catalogs): harden cache-validity checks and pin UTF-8 on writes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cache-best-effort contract added in 7f44b25 was incomplete on two points raised by Copilot: 1. The cache-validity helpers (is_cache_valid / _is_url_cache_valid, plus the inline metadata-age check inside _fetch_single_catalog for per-URL caches) read the metadata file without specifying an encoding and only caught json.JSONDecodeError / ValueError / KeyError / TypeError. A metadata file written by a tool using the system locale codec, or one whose handle is briefly unavailable, would raise UnicodeDecodeError / OSError and propagate past the read-side try/except in fetch_catalog — the very crash the read-side guard was meant to prevent. The validity checks now read with encoding="utf-8" and treat OSError / UnicodeError as cache-invalid, matching the documented contract. 2. The network-fetch path wrote the cache and metadata files with bare write_text(...), picking up the platform default encoding. The read path was already pinned to UTF-8 (and the integrations/catalog.py:193-203 sibling writes UTF-8 too), so on hosts whose default codec isn't UTF-8 the write/read pair could disagree and force an unnecessary refetch on every invocation. All four write_text calls now pass encoding="utf-8" so the cache survives a round trip on any platform. Also rewords the misleading # Fetch from network comment in extensions.fetch_catalog — it sat above the cache-check block, which read as if the cache step had been skipped. Tests ----- Adds two parametrized regression tests per catalog: * test_fetch_catalog_recovers_from_unreadable_metadata plants non-UTF-8 bytes in the metadata file, asserts is_cache_valid() returns False (rather than raising), and confirms fetch_catalog falls through to the network instead of crashing. * test_fetch_catalog_writes_cache_as_utf8 round-trips a payload containing a non-ASCII identifier (café) through the public fetch path and reads the cache back with read_text(encoding="utf-8"), catching encoding drift at the byte level rather than relying on the system codec to happen to be UTF-8. Both pairs follow the established sibling-file symmetry — the extension and preset suites stay in lock-step. --- src/specify_cli/extensions.py | 85 ++++++++++++++++++++++++++++------- src/specify_cli/presets.py | 61 +++++++++++++++++++++---- tests/test_extensions.py | 75 +++++++++++++++++++++++++++++++ tests/test_presets.py | 72 +++++++++++++++++++++++++++++ 4 files changed, 268 insertions(+), 25 deletions(-) diff --git a/src/specify_cli/extensions.py b/src/specify_cli/extensions.py index 1d92b30348..7e8392c6eb 100644 --- a/src/specify_cli/extensions.py +++ b/src/specify_cli/extensions.py @@ -1855,14 +1855,28 @@ def _fetch_single_catalog(self, entry: CatalogEntry, force_refresh: bool = False is_valid = False if not force_refresh and cache_file.exists() and cache_meta_file.exists(): try: - metadata = json.loads(cache_meta_file.read_text()) + metadata = json.loads( + cache_meta_file.read_text(encoding="utf-8") + ) cached_at = datetime.fromisoformat(metadata.get("cached_at", "")) if cached_at.tzinfo is None: cached_at = cached_at.replace(tzinfo=timezone.utc) age = (datetime.now(timezone.utc) - cached_at).total_seconds() is_valid = age < self.CACHE_DURATION - except (json.JSONDecodeError, ValueError, KeyError, TypeError): - # If metadata is invalid or missing expected fields, treat cache as invalid + except ( + json.JSONDecodeError, + OSError, + UnicodeError, + ValueError, + KeyError, + TypeError, + ): + # Cache validity is best-effort: invalid/missing metadata + # fields, an unreadable metadata file (permissions / disk), + # or a wrongly-encoded metadata file (written by a tool + # using the system locale codec) all degrade to "cache + # invalid" so the caller falls through to a network + # refetch instead of crashing. pass # Use cache if valid. A previously-cached payload must clear the @@ -1892,13 +1906,23 @@ def _fetch_single_catalog(self, entry: CatalogEntry, force_refresh: bool = False self._validate_catalog_payload(catalog_data, entry.url) - # Save to cache + # Save to cache. Both files are explicitly UTF-8 to match the + # ``read_text(encoding="utf-8")`` on the read side and the + # ``integrations/catalog.py:193-203`` precedent. Without this, + # platforms whose default encoding isn't UTF-8 would write + # locale-encoded bytes that the read path can't decode, forcing + # an unnecessary network refetch on every invocation. self.cache_dir.mkdir(parents=True, exist_ok=True) - cache_file.write_text(json.dumps(catalog_data, indent=2)) - cache_meta_file.write_text(json.dumps({ - "cached_at": datetime.now(timezone.utc).isoformat(), - "catalog_url": entry.url, - }, indent=2)) + cache_file.write_text( + json.dumps(catalog_data, indent=2), encoding="utf-8" + ) + cache_meta_file.write_text( + json.dumps({ + "cached_at": datetime.now(timezone.utc).isoformat(), + "catalog_url": entry.url, + }, indent=2), + encoding="utf-8", + ) return catalog_data @@ -1971,6 +1995,12 @@ def _get_merged_extensions(self, force_refresh: bool = False) -> List[Dict[str, def is_cache_valid(self) -> bool: """Check if cached catalog is still valid. + Returns ``False`` for any read/decoding failure on the metadata + file (missing fields, malformed JSON, permissions / disk errors, + wrong text encoding) so callers fall through to a network refetch + instead of crashing. Treating cache validity as best-effort + matches the contract used by the per-URL cache check below. + Returns: True if cache exists and is within cache duration """ @@ -1978,13 +2008,22 @@ def is_cache_valid(self) -> bool: return False try: - metadata = json.loads(self.cache_metadata_file.read_text()) + metadata = json.loads( + self.cache_metadata_file.read_text(encoding="utf-8") + ) cached_at = datetime.fromisoformat(metadata.get("cached_at", "")) if cached_at.tzinfo is None: cached_at = cached_at.replace(tzinfo=timezone.utc) age_seconds = (datetime.now(timezone.utc) - cached_at).total_seconds() return age_seconds < self.CACHE_DURATION - except (json.JSONDecodeError, ValueError, KeyError, TypeError): + except ( + json.JSONDecodeError, + OSError, + UnicodeError, + ValueError, + KeyError, + TypeError, + ): return False def fetch_catalog(self, force_refresh: bool = False) -> Dict[str, Any]: @@ -1999,16 +2038,19 @@ def fetch_catalog(self, force_refresh: bool = False) -> Dict[str, Any]: Raises: ExtensionError: If catalog cannot be fetched """ - # Fetch from network catalog_url = self.get_catalog_url() - # Check cache first unless force refresh. Match the + # Check the cache first unless ``force_refresh`` was requested, + # then fall through to a network fetch. Match the # ``_fetch_single_catalog`` cache contract: a poisoned or # unreadable cache silently falls through to a network refetch # rather than crashing the caller. ``_validate_catalog_payload`` # is reused here so a cache written by an older client # (pre-validation) is rejected and refreshed instead of returning - # the stale malformed payload. + # the stale malformed payload. ``is_cache_valid`` itself swallows + # OSError/UnicodeError on the metadata read, so a cache-validity + # check can't crash this method before the read-side fallback + # runs. if not force_refresh and self.is_cache_valid(): try: cached_data = json.loads(self.cache_file.read_text(encoding="utf-8")) @@ -2028,16 +2070,25 @@ def fetch_catalog(self, force_refresh: bool = False) -> Dict[str, Any]: # missing keys, nested-mapping type) stay consistent. self._validate_catalog_payload(catalog_data, catalog_url) - # Save to cache + # Save to cache. Explicit UTF-8 on both writes mirrors the + # ``read_text(encoding="utf-8")`` on the read side and the + # ``integrations/catalog.py:193-203`` precedent — otherwise + # platforms whose default encoding isn't UTF-8 would write + # locale-encoded bytes the read path can't decode, forcing an + # unnecessary refetch on every invocation. self.cache_dir.mkdir(parents=True, exist_ok=True) - self.cache_file.write_text(json.dumps(catalog_data, indent=2)) + self.cache_file.write_text( + json.dumps(catalog_data, indent=2), encoding="utf-8" + ) # Save cache metadata metadata = { "cached_at": datetime.now(timezone.utc).isoformat(), "catalog_url": catalog_url, } - self.cache_metadata_file.write_text(json.dumps(metadata, indent=2)) + self.cache_metadata_file.write_text( + json.dumps(metadata, indent=2), encoding="utf-8" + ) return catalog_data diff --git a/src/specify_cli/presets.py b/src/specify_cli/presets.py index ec3f1cf5be..d6bd2488d3 100644 --- a/src/specify_cli/presets.py +++ b/src/specify_cli/presets.py @@ -2051,7 +2051,7 @@ def _is_url_cache_valid(self, url: str) -> bool: if not cache_file.exists() or not metadata_file.exists(): return False try: - metadata = json.loads(metadata_file.read_text()) + metadata = json.loads(metadata_file.read_text(encoding="utf-8")) cached_at = datetime.fromisoformat(metadata.get("cached_at", "")) if cached_at.tzinfo is None: cached_at = cached_at.replace(tzinfo=timezone.utc) @@ -2059,7 +2059,19 @@ def _is_url_cache_valid(self, url: str) -> bool: datetime.now(timezone.utc) - cached_at ).total_seconds() return age_seconds < self.CACHE_DURATION - except (json.JSONDecodeError, ValueError, KeyError, TypeError): + except ( + json.JSONDecodeError, + OSError, + UnicodeError, + ValueError, + KeyError, + TypeError, + ): + # Cache validity is best-effort: invalid/missing fields, an + # unreadable metadata file (permissions / disk), or a wrongly + # encoded one (written by a tool using the system locale + # codec) all degrade to "cache invalid" so the caller falls + # through to a network refetch instead of crashing. return False def _fetch_single_catalog(self, entry: PresetCatalogEntry, force_refresh: bool = False) -> Dict[str, Any]: @@ -2102,13 +2114,23 @@ def _fetch_single_catalog(self, entry: PresetCatalogEntry, force_refresh: bool = self._validate_catalog_payload(catalog_data, entry.url) + # Both files are written explicitly as UTF-8 to match the + # ``read_text(encoding="utf-8")`` on the read side and the + # ``integrations/catalog.py:193-203`` precedent. Without this, + # platforms whose default encoding isn't UTF-8 would write + # locale-encoded bytes the read path can't decode, forcing an + # unnecessary refetch on every invocation. self.cache_dir.mkdir(parents=True, exist_ok=True) - cache_file.write_text(json.dumps(catalog_data, indent=2)) + cache_file.write_text( + json.dumps(catalog_data, indent=2), encoding="utf-8" + ) metadata = { "cached_at": datetime.now(timezone.utc).isoformat(), "catalog_url": entry.url, } - metadata_file.write_text(json.dumps(metadata, indent=2)) + metadata_file.write_text( + json.dumps(metadata, indent=2), encoding="utf-8" + ) return catalog_data @@ -2155,6 +2177,12 @@ def _get_merged_packs(self, force_refresh: bool = False) -> Dict[str, Dict[str, def is_cache_valid(self) -> bool: """Check if cached catalog is still valid. + Returns ``False`` for any read/decoding failure on the metadata + file (missing fields, malformed JSON, permissions / disk errors, + wrong text encoding) so callers fall through to a network refetch + instead of crashing. Treating cache validity as best-effort + matches the contract used by ``_is_url_cache_valid`` above. + Returns: True if cache exists and is within cache duration """ @@ -2162,7 +2190,9 @@ def is_cache_valid(self) -> bool: return False try: - metadata = json.loads(self.cache_metadata_file.read_text()) + metadata = json.loads( + self.cache_metadata_file.read_text(encoding="utf-8") + ) cached_at = datetime.fromisoformat(metadata.get("cached_at", "")) if cached_at.tzinfo is None: cached_at = cached_at.replace(tzinfo=timezone.utc) @@ -2170,7 +2200,14 @@ def is_cache_valid(self) -> bool: datetime.now(timezone.utc) - cached_at ).total_seconds() return age_seconds < self.CACHE_DURATION - except (json.JSONDecodeError, ValueError, KeyError, TypeError): + except ( + json.JSONDecodeError, + OSError, + UnicodeError, + ValueError, + KeyError, + TypeError, + ): return False def fetch_catalog(self, force_refresh: bool = False) -> Dict[str, Any]: @@ -2218,15 +2255,23 @@ def fetch_catalog(self, force_refresh: bool = False) -> Dict[str, Any]: # missing keys, nested-mapping type) stay consistent. self._validate_catalog_payload(catalog_data, catalog_url) + # Save to cache. Explicit UTF-8 on both writes mirrors the + # ``read_text(encoding="utf-8")`` on the read side and the + # ``integrations/catalog.py:193-203`` precedent — otherwise + # platforms whose default encoding isn't UTF-8 would write + # locale-encoded bytes the read path can't decode, forcing an + # unnecessary refetch on every invocation. self.cache_dir.mkdir(parents=True, exist_ok=True) - self.cache_file.write_text(json.dumps(catalog_data, indent=2)) + self.cache_file.write_text( + json.dumps(catalog_data, indent=2), encoding="utf-8" + ) metadata = { "cached_at": datetime.now(timezone.utc).isoformat(), "catalog_url": catalog_url, } self.cache_metadata_file.write_text( - json.dumps(metadata, indent=2) + json.dumps(metadata, indent=2), encoding="utf-8" ) return catalog_data diff --git a/tests/test_extensions.py b/tests/test_extensions.py index be72f3263c..66ff4f2ac7 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -2772,6 +2772,81 @@ def test_fetch_catalog_recovers_from_unreadable_cache(self, temp_dir): # Recovered via network rather than crashing on the unreadable cache. assert result == valid + def test_fetch_catalog_recovers_from_unreadable_metadata(self, temp_dir): + """A wrongly-encoded metadata file degrades to a cache miss. + + ``is_cache_valid`` is consulted *before* the cache payload is + read; if the metadata file itself can't be decoded (e.g. it was + written on a Windows host whose default codec isn't UTF-8) the + validity check must return ``False`` rather than propagate + ``UnicodeDecodeError``. Without that guard, a corrupted metadata + file would crash every invocation instead of falling through to + a network refetch. + """ + from unittest.mock import patch, MagicMock + + catalog = self._make_catalog(temp_dir) + catalog.cache_dir.mkdir(parents=True, exist_ok=True) + catalog.cache_file.write_text("{}", encoding="utf-8") + # Bytes that are not valid UTF-8 — ``read_text(encoding="utf-8")`` + # will raise ``UnicodeDecodeError`` (subclass of ``UnicodeError``). + catalog.cache_metadata_file.write_bytes(b"\xff\xfe\x00bad") + + # is_cache_valid must absorb the decode failure, not crash. + assert catalog.is_cache_valid() is False + + valid = { + "schema_version": "1.0", + "extensions": {"foo": {"name": "Foo", "version": "1.0.0"}}, + } + mock_response = MagicMock() + mock_response.read.return_value = json.dumps(valid).encode() + mock_response.__enter__ = lambda s: s + mock_response.__exit__ = MagicMock(return_value=False) + + with patch.object(catalog, "_open_url", return_value=mock_response): + result = catalog.fetch_catalog(force_refresh=False) + + assert result == valid + + def test_fetch_catalog_writes_cache_as_utf8(self, temp_dir): + """Cache + metadata are written with explicit UTF-8 encoding. + + On platforms whose default text encoding is not UTF-8 (Windows + with a non-UTF-8 ANSI codepage, some CI images), a bare + ``write_text`` would emit locale-encoded bytes that the + UTF-8-only read path can't decode, forcing a refetch on every + invocation. Asserting on raw bytes catches the drift directly + instead of relying on the round-trip working by accident. + """ + from unittest.mock import patch, MagicMock + + catalog = self._make_catalog(temp_dir) + # A non-ASCII string in the payload makes the encoding choice + # observable at the byte level — UTF-8 encodes "é" as 0xC3 0xA9. + payload = { + "schema_version": "1.0", + "extensions": { + "café": {"name": "Café", "version": "1.0.0"}, + }, + } + mock_response = MagicMock() + mock_response.read.return_value = json.dumps(payload).encode("utf-8") + mock_response.__enter__ = lambda s: s + mock_response.__exit__ = MagicMock(return_value=False) + + with patch.object(catalog, "_open_url", return_value=mock_response): + catalog.fetch_catalog(force_refresh=True) + + # Cache round-trips via the UTF-8 read path used in production. + cached = json.loads(catalog.cache_file.read_text(encoding="utf-8")) + assert "café" in cached["extensions"] + # Metadata also UTF-8 — confirms both writes were updated. + meta = json.loads( + catalog.cache_metadata_file.read_text(encoding="utf-8") + ) + assert "cached_at" in meta + def test_get_merged_extensions_skips_non_mapping_entries(self, temp_dir): """Per-entry guard: one malformed entry shouldn't poison the merge. diff --git a/tests/test_presets.py b/tests/test_presets.py index 85275e6bdc..844a444d57 100644 --- a/tests/test_presets.py +++ b/tests/test_presets.py @@ -1711,6 +1711,78 @@ def test_fetch_catalog_recovers_from_unreadable_cache(self, project_dir): # Recovered via network rather than crashing on the unreadable cache. assert result == valid + def test_fetch_catalog_recovers_from_unreadable_metadata(self, project_dir): + """A wrongly-encoded metadata file degrades to a cache miss. + + ``is_cache_valid`` is consulted *before* the cache payload is + read; if the metadata file itself can't be decoded (e.g. it was + written on a host whose default codec isn't UTF-8) the validity + check must return ``False`` rather than propagate + ``UnicodeDecodeError``. Without that guard, a corrupted metadata + file would crash every invocation instead of falling through to + a network refetch. + """ + from unittest.mock import patch, MagicMock + + catalog = PresetCatalog(project_dir) + catalog.cache_dir.mkdir(parents=True, exist_ok=True) + catalog.cache_file.write_text("{}", encoding="utf-8") + # Bytes that are not valid UTF-8 — ``read_text(encoding="utf-8")`` + # will raise ``UnicodeDecodeError`` (subclass of ``UnicodeError``). + catalog.cache_metadata_file.write_bytes(b"\xff\xfe\x00bad") + + # is_cache_valid must absorb the decode failure, not crash. + assert catalog.is_cache_valid() is False + + valid = { + "schema_version": "1.0", + "presets": {"foo": {"name": "Foo", "version": "1.0.0"}}, + } + mock_response = MagicMock() + mock_response.read.return_value = json.dumps(valid).encode() + mock_response.__enter__ = lambda s: s + mock_response.__exit__ = MagicMock(return_value=False) + + with patch.object(catalog, "_open_url", return_value=mock_response): + result = catalog.fetch_catalog(force_refresh=False) + + assert result == valid + + def test_fetch_catalog_writes_cache_as_utf8(self, project_dir): + """Cache + metadata are written with explicit UTF-8 encoding. + + On platforms whose default text encoding is not UTF-8, a bare + ``write_text`` would emit locale-encoded bytes that the + UTF-8-only read path can't decode, forcing a refetch on every + invocation. Asserting on a round-trip with non-ASCII content + catches the drift directly. + """ + from unittest.mock import patch, MagicMock + + catalog = PresetCatalog(project_dir) + # A non-ASCII string in the payload makes the encoding choice + # observable at the byte level — UTF-8 encodes "é" as 0xC3 0xA9. + payload = { + "schema_version": "1.0", + "presets": { + "café": {"name": "Café", "version": "1.0.0"}, + }, + } + mock_response = MagicMock() + mock_response.read.return_value = json.dumps(payload).encode("utf-8") + mock_response.__enter__ = lambda s: s + mock_response.__exit__ = MagicMock(return_value=False) + + with patch.object(catalog, "_open_url", return_value=mock_response): + catalog.fetch_catalog(force_refresh=True) + + cached = json.loads(catalog.cache_file.read_text(encoding="utf-8")) + assert "café" in cached["presets"] + meta = json.loads( + catalog.cache_metadata_file.read_text(encoding="utf-8") + ) + assert "cached_at" in meta + def test_get_merged_packs_skips_non_mapping_entries(self, project_dir): """Per-entry guard: one malformed entry shouldn't poison the merge. From d9bd79c58174c8dc77b342c6430039756a3cfbd0 Mon Sep 17 00:00:00 2001 From: Quratulain-bilal Date: Wed, 3 Jun 2026 19:24:52 +0500 Subject: [PATCH 7/7] test(catalogs): assert UTF-8 write encoding by recording write_text kwargs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Copilot's review on this PR caught that test_fetch_catalog_writes_cache_as_utf8 claimed to validate UTF-8 at the byte level but actually only round-tripped a non-ASCII string through json.dumps/read_text. Because json.dumps defaults to ensure_ascii=True, 'café' was serialized as the all-ASCII escape 'caf\u00e9' before reaching write_text — the bytes on disk were identical regardless of the encoding kwarg, so a locale-encoded write would have round-tripped just fine. The drift guard the test name advertised was not actually being enforced. Rewriting these tests to observe the production code's argument directly: each test now monkey-patches pathlib.Path.write_text with a recorder that captures the encoding kwarg for every call, runs the production fetch, and asserts every write into the cache directory passed encoding='utf-8'. That is the substantive thing the regression guard cares about — non-ASCII payload tricks were the wrong lever to pull, because json.dumps was masking the encoding choice before write_text ever ran. Both tests verified locally against the current production code (492 passed in the extensions+presets suites) and confirmed to fail against a synthetic no-encoding write (the recorder records None instead of 'utf-8', the assertion catches it). Same change applied symmetrically to test_extensions.py and test_presets.py to keep the sibling files in lockstep with the production code paths in extensions.py and presets.py. --- tests/test_extensions.py | 68 +++++++++++++++++++++++++++------------- tests/test_presets.py | 59 +++++++++++++++++++++++----------- 2 files changed, 86 insertions(+), 41 deletions(-) diff --git a/tests/test_extensions.py b/tests/test_extensions.py index 66ff4f2ac7..357c278b21 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -2809,43 +2809,67 @@ def test_fetch_catalog_recovers_from_unreadable_metadata(self, temp_dir): assert result == valid - def test_fetch_catalog_writes_cache_as_utf8(self, temp_dir): - """Cache + metadata are written with explicit UTF-8 encoding. - - On platforms whose default text encoding is not UTF-8 (Windows - with a non-UTF-8 ANSI codepage, some CI images), a bare - ``write_text`` would emit locale-encoded bytes that the - UTF-8-only read path can't decode, forcing a refetch on every - invocation. Asserting on raw bytes catches the drift directly - instead of relying on the round-trip working by accident. + def test_fetch_catalog_writes_cache_as_utf8(self, temp_dir, monkeypatch): + """Cache + metadata writes pass ``encoding="utf-8"``, observably. + + The earlier version of this test claimed to assert UTF-8 at the + byte level but actually only round-tripped a non-ASCII string + through ``json.dumps`` and ``read_text(encoding="utf-8")``. + Because ``json.dumps`` defaults to ``ensure_ascii=True``, "café" + was serialized as the all-ASCII escape ``caf\\u00e9`` before it + ever reached ``write_text`` — the bytes on disk were identical + regardless of the encoding kwarg, so a locale-encoded write + would have round-tripped just fine. The drift Copilot's review + flagged wasn't actually being caught. + + Fix: directly observe the ``encoding`` argument passed to every + ``write_text`` call made against the cache directory. This is + the production code's encoding choice, which is exactly what + the regression guard cares about; non-ASCII payload tricks are + unnecessary because the assertion is about the kwarg, not the + bytes. """ from unittest.mock import patch, MagicMock + from pathlib import Path as _PathCls catalog = self._make_catalog(temp_dir) - # A non-ASCII string in the payload makes the encoding choice - # observable at the byte level — UTF-8 encodes "é" as 0xC3 0xA9. payload = { "schema_version": "1.0", - "extensions": { - "café": {"name": "Café", "version": "1.0.0"}, - }, + "extensions": {"foo": {"name": "Foo", "version": "1.0.0"}}, } mock_response = MagicMock() mock_response.read.return_value = json.dumps(payload).encode("utf-8") mock_response.__enter__ = lambda s: s mock_response.__exit__ = MagicMock(return_value=False) + # Record every ``write_text`` call's encoding kwarg so the + # assertion observes the production writer's argument directly. + recorded: list[dict] = [] + real_write_text = _PathCls.write_text + + def recording_write_text(self, data, *args, **kwargs): + recorded.append( + {"path": str(self), "encoding": kwargs.get("encoding")} + ) + return real_write_text(self, data, *args, **kwargs) + + monkeypatch.setattr(_PathCls, "write_text", recording_write_text) + with patch.object(catalog, "_open_url", return_value=mock_response): catalog.fetch_catalog(force_refresh=True) - # Cache round-trips via the UTF-8 read path used in production. - cached = json.loads(catalog.cache_file.read_text(encoding="utf-8")) - assert "café" in cached["extensions"] - # Metadata also UTF-8 — confirms both writes were updated. - meta = json.loads( - catalog.cache_metadata_file.read_text(encoding="utf-8") - ) - assert "cached_at" in meta + # Filter to writes inside the catalog's cache directory so + # unrelated writes from other machinery don't pollute the + # assertion. + cache_writes = [ + r for r in recorded if str(catalog.cache_dir) in r["path"] + ] + assert cache_writes, "fetch_catalog made no writes to the cache dir" + for record in cache_writes: + assert record["encoding"] == "utf-8", ( + f"write_text on {record['path']} used encoding " + f"{record['encoding']!r}; expected 'utf-8'" + ) def test_get_merged_extensions_skips_non_mapping_entries(self, temp_dir): """Per-entry guard: one malformed entry shouldn't poison the merge. diff --git a/tests/test_presets.py b/tests/test_presets.py index 844a444d57..9075406ca1 100644 --- a/tests/test_presets.py +++ b/tests/test_presets.py @@ -1748,40 +1748,61 @@ def test_fetch_catalog_recovers_from_unreadable_metadata(self, project_dir): assert result == valid - def test_fetch_catalog_writes_cache_as_utf8(self, project_dir): - """Cache + metadata are written with explicit UTF-8 encoding. - - On platforms whose default text encoding is not UTF-8, a bare - ``write_text`` would emit locale-encoded bytes that the - UTF-8-only read path can't decode, forcing a refetch on every - invocation. Asserting on a round-trip with non-ASCII content - catches the drift directly. + def test_fetch_catalog_writes_cache_as_utf8(self, project_dir, monkeypatch): + """Cache + metadata writes pass ``encoding="utf-8"``, observably. + + The earlier version of this test claimed to assert UTF-8 at the + byte level but actually only round-tripped a non-ASCII string + through ``json.dumps`` and ``read_text(encoding="utf-8")``. + Because ``json.dumps`` defaults to ``ensure_ascii=True``, "café" + was serialized as the all-ASCII escape ``caf\\u00e9`` before it + ever reached ``write_text`` — the bytes on disk were identical + regardless of the encoding kwarg. The drift Copilot's review + flagged wasn't actually being caught. + + Fix: directly observe the ``encoding`` argument passed to every + ``write_text`` call made against the cache directory. This is + the production code's encoding choice, which is exactly what + the regression guard cares about. """ from unittest.mock import patch, MagicMock + from pathlib import Path as _PathCls catalog = PresetCatalog(project_dir) - # A non-ASCII string in the payload makes the encoding choice - # observable at the byte level — UTF-8 encodes "é" as 0xC3 0xA9. payload = { "schema_version": "1.0", - "presets": { - "café": {"name": "Café", "version": "1.0.0"}, - }, + "presets": {"foo": {"name": "Foo", "version": "1.0.0"}}, } mock_response = MagicMock() mock_response.read.return_value = json.dumps(payload).encode("utf-8") mock_response.__enter__ = lambda s: s mock_response.__exit__ = MagicMock(return_value=False) + # Record every ``write_text`` call's encoding kwarg so the + # assertion observes the production writer's argument directly. + recorded: list[dict] = [] + real_write_text = _PathCls.write_text + + def recording_write_text(self, data, *args, **kwargs): + recorded.append( + {"path": str(self), "encoding": kwargs.get("encoding")} + ) + return real_write_text(self, data, *args, **kwargs) + + monkeypatch.setattr(_PathCls, "write_text", recording_write_text) + with patch.object(catalog, "_open_url", return_value=mock_response): catalog.fetch_catalog(force_refresh=True) - cached = json.loads(catalog.cache_file.read_text(encoding="utf-8")) - assert "café" in cached["presets"] - meta = json.loads( - catalog.cache_metadata_file.read_text(encoding="utf-8") - ) - assert "cached_at" in meta + cache_writes = [ + r for r in recorded if str(catalog.cache_dir) in r["path"] + ] + assert cache_writes, "fetch_catalog made no writes to the cache dir" + for record in cache_writes: + assert record["encoding"] == "utf-8", ( + f"write_text on {record['path']} used encoding " + f"{record['encoding']!r}; expected 'utf-8'" + ) def test_get_merged_packs_skips_non_mapping_entries(self, project_dir): """Per-entry guard: one malformed entry shouldn't poison the merge.