From 4c6083e7b8ff711ba9186f67907353212ba41c93 Mon Sep 17 00:00:00 2001
From: Luke Schaefer <luke.schaefer@scale.com>
Date: Tue, 12 May 2026 13:18:25 -0500
Subject: [PATCH 1/9] add eval capes to sdk

---
 docs/index.rst                                |  34 +++
 nucleus/__init__.py                           |  67 ++++++
 nucleus/data_transfer_object/evaluation_v2.py | 145 ++++++++++++
 nucleus/evaluation_v2.py                      | 210 ++++++++++++++++++
 tests/test_evaluation_v2.py                   | 170 ++++++++++++++
 5 files changed, 626 insertions(+)
 create mode 100644 nucleus/data_transfer_object/evaluation_v2.py
 create mode 100644 nucleus/evaluation_v2.py
 create mode 100644 tests/test_evaluation_v2.py

diff --git a/docs/index.rst b/docs/index.rst
index 698ef59a..a33f704f 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -12,6 +12,40 @@ Scale Nucleus helps you:
 
 Nucleus is a new way—the right way—to develop ML models, helping us move away from the concept of one dataset and towards a paradigm of collections of scenarios.
 
+.. _evaluations-v2:
+
+Evaluations V2
+--------------
+
+Evaluation V2 runs COCO-style metrics against stored matches (``evaluation_match_v2``) for a **model run**.
+Create an evaluation with :meth:`NucleusClient.create_evaluation_v2`; poll with
+:meth:`nucleus.evaluation_v2.EvaluationV2.wait_for_completion`; then fetch aggregates via
+:meth:`nucleus.evaluation_v2.EvaluationV2.charts` or per-row examples via
+:meth:`nucleus.evaluation_v2.EvaluationV2.examples`.
+
+.. code-block:: python
+
+   import nucleus
+
+   client = nucleus.NucleusClient(api_key="YOUR_API_KEY")
+   evaluation = client.create_evaluation_v2(
+       model_run_id="run_xxx",
+       name="my-eval",
+       allowed_label_matches=[
+           nucleus.AllowedLabelMatch(
+               ground_truth_label="car",
+               model_prediction_label="vehicle",
+           ),
+       ],
+   )
+   evaluation.wait_for_completion()
+   charts = evaluation.charts(iou_threshold=0.5)
+   fps = evaluation.examples(match_type="FP", limit=20)
+
+The API uses REST endpoints ``/nucleus/modelRun/:id/evaluationsV2``,
+``/nucleus/evaluationsV2/:id``, ``/nucleus/evaluationsV2/:id/charts``, and
+``POST /nucleus/evaluationsV2/:id/examples``.
+
 .. _installation:
 
 Installation
diff --git a/nucleus/__init__.py b/nucleus/__init__.py
index d7ee51db..6d675433 100644
--- a/nucleus/__init__.py
+++ b/nucleus/__init__.py
@@ -2,6 +2,7 @@
 
 __all__ = [
     "AsyncJob",
+    "AllowedLabelMatch",
     "EmbeddingsExportJob",
     "BoxAnnotation",
     "DeduplicationJob",
@@ -17,6 +18,12 @@
     "DatasetInfo",
     "DatasetItem",
     "DatasetItemRetrievalError",
+    "EvaluationV2",
+    "EvaluationV2Charts",
+    "EvaluationV2ExamplesPage",
+    "EvaluationV2FilterArgs",
+    "EvaluationV2MatchExample",
+    "EvaluationV2Status",
     "Frame",
     "Keypoint",
     "KeypointsAnnotation",
@@ -129,6 +136,12 @@
 )
 from .data_transfer_object.dataset_details import DatasetDetails
 from .data_transfer_object.dataset_info import DatasetInfo
+from .data_transfer_object.evaluation_v2 import (
+    EvaluationV2Charts,
+    EvaluationV2ExamplesPage,
+    EvaluationV2FilterArgs,
+    EvaluationV2MatchExample,
+)
 from .data_transfer_object.job_status import JobInfoRequestPayload
 from .dataset import Dataset
 from .dataset_item import DatasetItem
@@ -138,6 +151,7 @@
     DeduplicationStats,
 )
 from .deprecation_warning import deprecated
+from .evaluation_v2 import AllowedLabelMatch, EvaluationV2, EvaluationV2Status
 from .errors import (
     DatasetItemRetrievalError,
     ModelCreationError,
@@ -875,6 +889,59 @@ def commit_model_run(
             payload = {}
         return self.make_request(payload, f"modelRun/{model_run_id}/commit")
 
+    def create_evaluation_v2(
+        self,
+        model_run_id: str,
+        *,
+        name: Optional[str] = None,
+        allowed_label_matches: Optional[List[AllowedLabelMatch]] = None,
+        allowed_label_matches_id: Optional[str] = None,
+    ) -> EvaluationV2:
+        """Create an Evaluation V2 job for a model run.
+
+        Starts a Temporal workflow that fills ``evaluation_match_v2``. Use
+        :meth:`EvaluationV2.wait_for_completion` then :meth:`EvaluationV2.charts`
+        or :meth:`EvaluationV2.examples` for results.
+
+        Parameters:
+            model_run_id: Nucleus model run id (``run_*``).
+            name: Optional human-readable name.
+            allowed_label_matches: Optional explicit allowed label pairs; omit to use
+                the model run's default configuration.
+            allowed_label_matches_id: Optional existing allowed-label-matches config id.
+
+        Returns:
+            :class:`EvaluationV2` loaded via ``GET /nucleus/evaluationsV2/:id``.
+        """
+        payload: Dict[str, Any] = {}
+        if name is not None:
+            payload["name"] = name
+        if allowed_label_matches:
+            payload[
+                "allowed_label_matches"
+            ] = [m.to_api_dict() for m in allowed_label_matches]
+        if allowed_label_matches_id is not None:
+            payload["allowed_label_matches_id"] = allowed_label_matches_id
+        result = self.make_request(
+            payload, f"modelRun/{model_run_id}/evaluationsV2"
+        )
+        eval_id = result.get("evaluation_id")
+        if not eval_id:
+            raise RuntimeError(f"Unexpected create evaluation V2 response: {result}")
+        return self.get_evaluation_v2(str(eval_id))
+
+    def get_evaluation_v2(self, evaluation_id: str) -> EvaluationV2:
+        """Fetch a single Evaluation V2 row."""
+        data = self.get(f"evaluationsV2/{evaluation_id}")
+        return EvaluationV2.from_json(data, self)
+
+    def list_evaluations_v2(self, model_run_id: str) -> List[EvaluationV2]:
+        """List Evaluation V2 rows for a model run (newest first); returns ``[]`` if the response is not a list."""
+        rows = self.get(f"modelRun/{model_run_id}/evaluationsV2")
+        if not isinstance(rows, list):
+            return []
+        return [EvaluationV2.from_json(r, self) for r in rows]
+
     @deprecated(msg="Prefer calling Dataset.info() directly.")
     def dataset_info(self, dataset_id: str):
         dataset = self.get_dataset(dataset_id)
diff --git a/nucleus/data_transfer_object/evaluation_v2.py b/nucleus/data_transfer_object/evaluation_v2.py
new file mode 100644
index 00000000..7524e32e
--- /dev/null
+++ b/nucleus/data_transfer_object/evaluation_v2.py
@@ -0,0 +1,145 @@
+"""Pydantic models for Nucleus Evaluations V2 REST payloads."""
+
+from typing import Any, Dict, List, Literal, Optional
+
+from nucleus.pydantic_base import DictCompatibleModel
+
+
+class RangeNum(DictCompatibleModel):
+    min: Optional[float] = None
+    max: Optional[float] = None
+
+
+class MetadataPredicate(DictCompatibleModel):
+    key: str
+    op: Literal["EQ", "IN", "GT", "LT"]
+    value: Optional[Any] = None
+
+
+class EvaluationV2FilterArgs(DictCompatibleModel):
+    """Filter object for charts/examples calls (mirrors server evaluation_v2 SQL filters)."""
+
+    confidence_range: Optional[RangeNum] = None
+    iou_range: Optional[RangeNum] = None
+    pred_labels: Optional[List[str]] = None
+    gt_labels: Optional[List[str]] = None
+    item_metadata: Optional[List[MetadataPredicate]] = None
+    prediction_metadata: Optional[List[MetadataPredicate]] = None
+    label_equality: Optional[Literal["EQ", "NEQ"]] = None
+    has_ground_truth: Optional[bool] = None
+    tide_background: Optional[bool] = None
+
+    def to_api_filters(self) -> Dict[str, Any]:
+        """Serialize to camelCase keys expected by the GraphQL / REST layer."""
+        d = self.dict(exclude_none=True)
+        # self.dict() yields snake_case field names; the server expects camelCase in JSON filters, so top-level keys are renamed (nested dicts pass through unchanged)
+        out: Dict[str, Any] = {}
+        if "confidence_range" in d:
+            out["confidenceRange"] = d["confidence_range"]
+        if "iou_range" in d:
+            out["iouRange"] = d["iou_range"]
+        if "pred_labels" in d:
+            out["predLabels"] = d["pred_labels"]
+        if "gt_labels" in d:
+            out["gtLabels"] = d["gt_labels"]
+        if "item_metadata" in d:
+            out["itemMetadata"] = d["item_metadata"]
+        if "prediction_metadata" in d:
+            out["predictionMetadata"] = d["prediction_metadata"]
+        if "label_equality" in d:
+            out["labelEquality"] = d["label_equality"]
+        if "has_ground_truth" in d:
+            out["hasGroundTruth"] = d["has_ground_truth"]
+        if "tide_background" in d:
+            out["tideBackground"] = d["tide_background"]
+        return out
+
+
+class MapSummary(DictCompatibleModel):
+    mapAt50: Optional[float] = None
+    mapAt75: Optional[float] = None
+    mapAt5095: Optional[float] = None
+
+
+class PerClassAp(DictCompatibleModel):
+    classLabel: str
+    ap: float
+
+
+class ConfusionEntry(DictCompatibleModel):
+    gtLabel: str
+    predLabel: str
+    count: int
+
+
+class ScoreHistogramBucket(DictCompatibleModel):
+    bucketMin: float
+    bucketMax: float
+    count: int
+
+
+class TotalCounts(DictCompatibleModel):
+    tp: int
+    fp: int
+    fn: int
+    predsWithConfidence: int
+
+
+class ApBySize(DictCompatibleModel):
+    small: Optional[float] = None
+    medium: Optional[float] = None
+    large: Optional[float] = None
+
+
+class PrCurvePoint(DictCompatibleModel):
+    classLabel: str
+    recall: float
+    precision: float
+
+
+class TideAttribution(DictCompatibleModel):
+    truePositive: int
+    localization: int
+    classification: int
+    both: int
+    duplicate: int
+    background: int
+    missed: int
+
+
+class EvaluationV2Charts(DictCompatibleModel):
+    mapSummary: MapSummary
+    perClassAp: List[PerClassAp]
+    confusionMatrix: List[ConfusionEntry]
+    scoreHistogram: List[ScoreHistogramBucket]
+    computedIouRanges: List[float]
+    totalCounts: TotalCounts
+    apBySize: ApBySize
+    prCurve: List[PrCurvePoint]
+    tideAttribution: TideAttribution
+
+
+class EvaluationV2MatchExample(DictCompatibleModel):
+    id: str
+    evaluation_id: str
+    dataset_item_id: str
+    model_prediction_id: Optional[str] = None
+    ground_truth_annotation_id: Optional[str] = None
+    pred_canonical_label: Optional[str] = None
+    gt_canonical_label: Optional[str] = None
+    pred_raw_label: Optional[str] = None
+    gt_raw_label: Optional[str] = None
+    iou: float
+    confidence: Optional[float] = None
+    true_positive: bool
+    match_type: str
+    gt_area: Optional[float] = None
+    item_metadata: Dict[str, Any]
+    prediction_metadata: Dict[str, Any]
+    prediction_row: Optional[Dict[str, Any]] = None
+    annotation_row: Optional[Dict[str, Any]] = None
+
+
+class EvaluationV2ExamplesPage(DictCompatibleModel):
+    rows: List[EvaluationV2MatchExample]
+    total: int
diff --git a/nucleus/evaluation_v2.py b/nucleus/evaluation_v2.py
new file mode 100644
index 00000000..eaee103a
--- /dev/null
+++ b/nucleus/evaluation_v2.py
@@ -0,0 +1,210 @@
+"""Nucleus Evaluation V2 — COCO-style metrics computed off ``evaluation_match_v2``."""
+
+from __future__ import annotations
+
+import json
+import time
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+from urllib.parse import urlencode
+
+import requests
+
+from nucleus.data_transfer_object.evaluation_v2 import (
+    EvaluationV2Charts,
+    EvaluationV2ExamplesPage,
+    EvaluationV2FilterArgs,
+)
+from nucleus.errors import NucleusAPIError
+
+if TYPE_CHECKING:
+    from nucleus import NucleusClient
+
+
+class EvaluationV2Status(str, Enum):
+    """Lifecycle states for ``nucleus.evaluation_v2.status``."""
+
+    PENDING = "pending"
+    COMPUTING = "computing"
+    SUCCEEDED = "succeeded"
+    FAILED = "failed"
+    CANCELLED = "cancelled"
+
+
+@dataclass
+class AllowedLabelMatch:
+    """Pair of labels that may match for IoU evaluation (snake_case JSON for the API)."""
+
+    ground_truth_label: str
+    model_prediction_label: str
+
+    def to_api_dict(self) -> Dict[str, str]:
+        return {
+            "ground_truth_label": self.ground_truth_label,
+            "model_prediction_label": self.model_prediction_label,
+        }
+
+
+@dataclass
+class EvaluationV2:
+    """A single Evaluation V2 run for a model run (``evalv2_*``)."""
+
+    id: str
+    model_run_id: str
+    dataset_id: str
+    status: str
+    name: Optional[str] = None
+    temporal_workflow_id: Optional[str] = None
+    error_message: Optional[str] = None
+    created_at: Optional[str] = None
+    allowed_label_matches_id: Optional[str] = None
+    allowed_label_matches: Optional[List[AllowedLabelMatch]] = None
+    allowed_label_matches_name: Optional[str] = None
+    _client: Any = field(repr=False, default=None)
+
+    @classmethod
+    def from_json(
+        cls,
+        payload: Dict[str, Any],
+        client: Optional["NucleusClient"] = None,
+    ) -> "EvaluationV2":
+        raw_matches = payload.get("allowed_label_matches")
+        matches: Optional[List[AllowedLabelMatch]] = None
+        if isinstance(raw_matches, list):
+            matches = []
+            for m in raw_matches:
+                if not isinstance(m, dict):
+                    continue
+                gt = m.get("groundTruthLabel") or m.get("ground_truth_label")
+                mp = m.get("modelPredictionLabel") or m.get("model_prediction_label")
+                if gt is not None and mp is not None:
+                    matches.append(
+                        AllowedLabelMatch(
+                            ground_truth_label=str(gt),
+                            model_prediction_label=str(mp),
+                        )
+                    )
+
+        return cls(
+            id=str(payload["id"]),
+            model_run_id=str(payload["model_run_id"]),
+            dataset_id=str(payload["dataset_id"]),
+            status=str(payload["status"]),
+            name=payload.get("name"),
+            temporal_workflow_id=payload.get("temporal_workflow_id"),
+            error_message=payload.get("error_message"),
+            created_at=payload.get("created_at"),
+            allowed_label_matches_id=payload.get("allowed_label_matches_id"),
+            allowed_label_matches=matches,
+            allowed_label_matches_name=payload.get("allowed_label_matches_name"),
+            _client=client,
+        )
+
+    def refresh(self) -> "EvaluationV2":
+        """Reload this evaluation from ``GET /nucleus/evaluationsV2/:id``."""
+        if self._client is None:
+            raise RuntimeError("EvaluationV2 has no client; use NucleusClient.get_evaluation_v2.")
+        data = self._client.get(f"evaluationsV2/{self.id}")
+        updated = EvaluationV2.from_json(data, self._client)
+        self.__dict__.update(updated.__dict__)
+        return self
+
+    def wait_for_completion(
+        self,
+        timeout_sec: float = 600,
+        poll_interval: float = 5,
+    ) -> "EvaluationV2":
+        """Poll until status is terminal or ``timeout_sec`` elapses.
+
+        Raises:
+            RuntimeError: on ``failed`` status or timeout (``succeeded`` and ``cancelled`` both return normally).
+        """
+        deadline = time.monotonic() + timeout_sec
+        terminal_ok = {"succeeded", "cancelled"}
+        while time.monotonic() < deadline:
+            self.refresh()
+            if self.status == "failed":
+                raise RuntimeError(
+                    f"Evaluation {self.id} failed: {self.error_message or 'unknown'}"
+                )
+            if self.status in terminal_ok:
+                return self
+            time.sleep(poll_interval)
+        raise RuntimeError(
+            f"Timed out after {timeout_sec}s waiting for evaluation {self.id} "
+            f"(last status: {self.status})"
+        )
+
+    def delete(self) -> None:
+        """Cancel workflow (best effort) and soft-delete; raises ``NucleusAPIError`` unless the response is ``204 No Content``, ``RuntimeError`` if no client is attached."""
+        if self._client is None:
+            raise RuntimeError("EvaluationV2 has no client.")
+        resp = self._client.make_request(
+            {},
+            f"evaluationsV2/{self.id}",
+            requests_command=requests.delete,
+            return_raw_response=True,
+        )
+        if resp.status_code != 204:
+            raise NucleusAPIError(
+                f"{self._client.endpoint}/evaluationsV2/{self.id}",
+                requests.delete,
+                resp,
+            )
+
+    def charts(
+        self,
+        iou_threshold: float = 0.5,
+        filters: Optional[Union[EvaluationV2FilterArgs, Dict[str, Any]]] = None,
+        query: Optional[str] = None,
+    ) -> EvaluationV2Charts:
+        """Aggregate metrics (mAP, confusion matrix, PR curve, TIDE, …)."""
+        if self._client is None:
+            raise RuntimeError("EvaluationV2 has no client.")
+        params: Dict[str, str] = {}
+        params["iouThreshold"] = str(iou_threshold)
+        if filters is not None:
+            if isinstance(filters, EvaluationV2FilterArgs):
+                filt_dict = filters.to_api_filters()
+            else:
+                filt_dict = filters
+            params["filters"] = json.dumps(filt_dict)
+        if query:
+            params["query"] = query
+        qs = urlencode(params)
+        route = f"evaluationsV2/{self.id}/charts?{qs}"
+        data = self._client.get(route)
+        return EvaluationV2Charts.parse_obj(data)
+
+    def examples(
+        self,
+        match_type: str,
+        limit: int = 50,
+        offset: int = 0,
+        sort_by: Optional[str] = None,
+        sort_order: Optional[str] = None,
+        filters: Optional[Union[EvaluationV2FilterArgs, Dict[str, Any]]] = None,
+        query: Optional[str] = None,
+    ) -> EvaluationV2ExamplesPage:
+        """Paginated TP / FP / FN match rows with prediction and annotation blobs."""
+        if self._client is None:
+            raise RuntimeError("EvaluationV2 has no client.")
+        payload: Dict[str, Any] = {
+            "match_type": match_type,
+            "limit": limit,
+            "offset": offset,
+        }
+        if sort_by is not None:
+            payload["sort_by"] = sort_by
+        if sort_order is not None:
+            payload["sort_order"] = sort_order
+        if filters is not None:
+            if isinstance(filters, EvaluationV2FilterArgs):
+                payload["filters"] = filters.to_api_filters()
+            else:
+                payload["filters"] = filters
+        if query:
+            payload["query"] = query
+        data = self._client.post(payload, f"evaluationsV2/{self.id}/examples")
+        return EvaluationV2ExamplesPage.parse_obj(data)
diff --git a/tests/test_evaluation_v2.py b/tests/test_evaluation_v2.py
new file mode 100644
index 00000000..c8429cf7
--- /dev/null
+++ b/tests/test_evaluation_v2.py
@@ -0,0 +1,170 @@
+"""Unit tests for Evaluations V2 client (no live API)."""
+
+from unittest.mock import MagicMock
+
+import requests
+
+from nucleus import AllowedLabelMatch, EvaluationV2, NucleusClient
+from nucleus.data_transfer_object.evaluation_v2 import EvaluationV2Charts
+
+
+def test_allowed_label_match_to_api_dict():
+    m = AllowedLabelMatch(ground_truth_label="a", model_prediction_label="b")
+    assert m.to_api_dict() == {
+        "ground_truth_label": "a",
+        "model_prediction_label": "b",
+    }
+
+
+def test_evaluation_v2_from_json_with_matches():
+    client = NucleusClient(api_key="k")
+    payload = {
+        "id": "evalv2_1",
+        "model_run_id": "run_1",
+        "dataset_id": "ds_1",
+        "status": "pending",
+        "allowed_label_matches": [
+            {"groundTruthLabel": "x", "modelPredictionLabel": "y"},
+        ],
+    }
+    ev = EvaluationV2.from_json(payload, client)
+    assert ev.id == "evalv2_1"
+    assert ev.allowed_label_matches is not None
+    assert len(ev.allowed_label_matches) == 1
+    assert ev.allowed_label_matches[0].ground_truth_label == "x"
+
+
+def test_create_evaluation_v2_then_get():
+    client = NucleusClient(api_key="test")
+    client.connection.make_request = MagicMock(
+        return_value={
+            "evaluation_id": "evalv2_new",
+            "status": "pending",
+            "workflow_id": "w",
+        }
+    )
+    client.connection.get = MagicMock(
+        return_value={
+            "id": "evalv2_new",
+            "model_run_id": "run_1",
+            "dataset_id": "ds_1",
+            "status": "pending",
+        }
+    )
+
+    ev = client.create_evaluation_v2(
+        "run_1",
+        name="n1",
+        allowed_label_matches=[
+            AllowedLabelMatch("gt", "pred"),
+        ],
+    )
+    assert ev.id == "evalv2_new"
+    client.connection.make_request.assert_called_once()
+    client.connection.get.assert_called_once_with("evaluationsV2/evalv2_new")
+
+
+def test_charts_get_query_string():
+    client = MagicMock(spec=NucleusClient)
+    client.get.return_value = {
+        "mapSummary": {"mapAt50": 0.1, "mapAt75": 0.2, "mapAt5095": 0.15},
+        "perClassAp": [],
+        "confusionMatrix": [],
+        "scoreHistogram": [],
+        "computedIouRanges": [],
+        "totalCounts": {"tp": 0, "fp": 0, "fn": 0, "predsWithConfidence": 0},
+        "apBySize": {"small": None, "medium": None, "large": None},
+        "prCurve": [],
+        "tideAttribution": {
+            "truePositive": 0,
+            "localization": 0,
+            "classification": 0,
+            "both": 0,
+            "duplicate": 0,
+            "background": 0,
+            "missed": 0,
+        },
+    }
+    ev = EvaluationV2(
+        id="evalv2_1",
+        model_run_id="run_1",
+        dataset_id="ds_1",
+        status="succeeded",
+        _client=client,
+    )
+    charts = ev.charts(iou_threshold=0.5)
+    assert isinstance(charts, EvaluationV2Charts)
+    call_route = client.get.call_args[0][0]
+    assert "evaluationsV2/evalv2_1/charts" in call_route
+    assert "iouThreshold=0.5" in call_route
+
+
+def test_examples_post_body():
+    client = MagicMock(spec=NucleusClient)
+    client.post.return_value = {"rows": [], "total": 0}
+    ev = EvaluationV2(
+        id="evalv2_1",
+        model_run_id="run_1",
+        dataset_id="ds_1",
+        status="succeeded",
+        _client=client,
+    )
+    page = ev.examples("TP", limit=20, offset=5)
+    assert page.total == 0
+    client.post.assert_called_once()
+    args, kwargs = client.post.call_args
+    payload, route = args
+    assert route == "evaluationsV2/evalv2_1/examples"
+    assert payload["match_type"] == "TP"
+    assert payload["limit"] == 20
+    assert payload["offset"] == 5
+
+
+def test_wait_for_completion():
+    client = NucleusClient(api_key="test")
+    client.connection.get = MagicMock(
+        side_effect=[
+            {
+                "id": "evalv2_1",
+                "model_run_id": "run_1",
+                "dataset_id": "ds_1",
+                "status": "pending",
+            },
+            {
+                "id": "evalv2_1",
+                "model_run_id": "run_1",
+                "dataset_id": "ds_1",
+                "status": "succeeded",
+            },
+        ]
+    )
+    ev = EvaluationV2(
+        id="evalv2_1",
+        model_run_id="run_1",
+        dataset_id="ds_1",
+        status="pending",
+        _client=client,
+    )
+    ev.wait_for_completion(timeout_sec=5, poll_interval=0.01)
+    assert ev.status == "succeeded"
+
+
+def test_delete_204():
+    client = NucleusClient(api_key="test")
+    resp = MagicMock()
+    resp.status_code = 204
+    client.connection.make_request = MagicMock(return_value=resp)
+    ev = EvaluationV2(
+        id="evalv2_1",
+        model_run_id="run_1",
+        dataset_id="ds_1",
+        status="succeeded",
+        _client=client,
+    )
+    ev.delete()
+    assert client.connection.make_request.call_count == 1
+    cargs = client.connection.make_request.call_args
+    assert cargs[0][0] == {}
+    assert cargs[0][1] == "evaluationsV2/evalv2_1"
+    assert cargs[0][2] is requests.delete
+    assert cargs[0][3] is True

From 36f6b4aef3e244e5194aa52898232d6027619edd Mon Sep 17 00:00:00 2001
From: Luke Schaefer <luke.schaefer@scale.com>
Date: Tue, 12 May 2026 13:49:22 -0500
Subject: [PATCH 2/9] Apply suggestion from @greptile-apps[bot]

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
---
 nucleus/data_transfer_object/evaluation_v2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nucleus/data_transfer_object/evaluation_v2.py b/nucleus/data_transfer_object/evaluation_v2.py
index 7524e32e..6150aee3 100644
--- a/nucleus/data_transfer_object/evaluation_v2.py
+++ b/nucleus/data_transfer_object/evaluation_v2.py
@@ -129,7 +129,7 @@ class EvaluationV2MatchExample(DictCompatibleModel):
     gt_canonical_label: Optional[str] = None
     pred_raw_label: Optional[str] = None
     gt_raw_label: Optional[str] = None
-    iou: float
+    iou: Optional[float] = None
     confidence: Optional[float] = None
     true_positive: bool
     match_type: str

From 3caaf8d336ac1a7bec27e85fe3095d2419e01018 Mon Sep 17 00:00:00 2001
From: Luke Schaefer <luke.schaefer@scale.com>
Date: Tue, 12 May 2026 13:49:34 -0500
Subject: [PATCH 3/9] Apply suggestion from @greptile-apps[bot]

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
---
 nucleus/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nucleus/__init__.py b/nucleus/__init__.py
index 6d675433..b3ad1297 100644
--- a/nucleus/__init__.py
+++ b/nucleus/__init__.py
@@ -916,7 +916,7 @@ def create_evaluation_v2(
         payload: Dict[str, Any] = {}
         if name is not None:
             payload["name"] = name
-        if allowed_label_matches:
+        if allowed_label_matches is not None:
             payload[
                 "allowed_label_matches"
             ] = [m.to_api_dict() for m in allowed_label_matches]

From 13a91b2c057761b40ce9ce40a8829efff6837adf Mon Sep 17 00:00:00 2001
From: Luke Schaefer <luke.schaefer@scale.com>
Date: Tue, 12 May 2026 14:03:04 -0500
Subject: [PATCH 4/9] run hooks

---
 nucleus/__init__.py      | 12 +++++++-----
 nucleus/evaluation_v2.py | 20 +++++++++++++++-----
 2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/nucleus/__init__.py b/nucleus/__init__.py
index 6d675433..d3995ba8 100644
--- a/nucleus/__init__.py
+++ b/nucleus/__init__.py
@@ -151,7 +151,6 @@
     DeduplicationStats,
 )
 from .deprecation_warning import deprecated
-from .evaluation_v2 import AllowedLabelMatch, EvaluationV2, EvaluationV2Status
 from .errors import (
     DatasetItemRetrievalError,
     ModelCreationError,
@@ -160,6 +159,7 @@
     NotFoundError,
     NucleusAPIError,
 )
+from .evaluation_v2 import AllowedLabelMatch, EvaluationV2, EvaluationV2Status
 from .job import CustomerJobTypes
 from .model import Model
 from .model_run import ModelRun
@@ -917,9 +917,9 @@ def create_evaluation_v2(
         if name is not None:
             payload["name"] = name
         if allowed_label_matches:
-            payload[
-                "allowed_label_matches"
-            ] = [m.to_api_dict() for m in allowed_label_matches]
+            payload["allowed_label_matches"] = [
+                m.to_api_dict() for m in allowed_label_matches
+            ]
         if allowed_label_matches_id is not None:
             payload["allowed_label_matches_id"] = allowed_label_matches_id
         result = self.make_request(
@@ -927,7 +927,9 @@ def create_evaluation_v2(
         )
         eval_id = result.get("evaluation_id")
         if not eval_id:
-            raise RuntimeError(f"Unexpected create evaluation V2 response: {result}")
+            raise RuntimeError(
+                f"Unexpected create evaluation V2 response: {result}"
+            )
         return self.get_evaluation_v2(str(eval_id))
 
     def get_evaluation_v2(self, evaluation_id: str) -> EvaluationV2:
diff --git a/nucleus/evaluation_v2.py b/nucleus/evaluation_v2.py
index eaee103a..94191f31 100644
--- a/nucleus/evaluation_v2.py
+++ b/nucleus/evaluation_v2.py
@@ -77,7 +77,9 @@ def from_json(
                 if not isinstance(m, dict):
                     continue
                 gt = m.get("groundTruthLabel") or m.get("ground_truth_label")
-                mp = m.get("modelPredictionLabel") or m.get("model_prediction_label")
+                mp = m.get("modelPredictionLabel") or m.get(
+                    "model_prediction_label"
+                )
                 if gt is not None and mp is not None:
                     matches.append(
                         AllowedLabelMatch(
@@ -97,14 +99,18 @@ def from_json(
             created_at=payload.get("created_at"),
             allowed_label_matches_id=payload.get("allowed_label_matches_id"),
             allowed_label_matches=matches,
-            allowed_label_matches_name=payload.get("allowed_label_matches_name"),
+            allowed_label_matches_name=payload.get(
+                "allowed_label_matches_name"
+            ),
             _client=client,
         )
 
     def refresh(self) -> "EvaluationV2":
         """Reload this evaluation from ``GET /nucleus/evaluationsV2/:id``."""
         if self._client is None:
-            raise RuntimeError("EvaluationV2 has no client; use NucleusClient.get_evaluation_v2.")
+            raise RuntimeError(
+                "EvaluationV2 has no client; use NucleusClient.get_evaluation_v2."
+            )
         data = self._client.get(f"evaluationsV2/{self.id}")
         updated = EvaluationV2.from_json(data, self._client)
         self.__dict__.update(updated.__dict__)
@@ -156,7 +162,9 @@ def delete(self) -> None:
     def charts(
         self,
         iou_threshold: float = 0.5,
-        filters: Optional[Union[EvaluationV2FilterArgs, Dict[str, Any]]] = None,
+        filters: Optional[
+            Union[EvaluationV2FilterArgs, Dict[str, Any]]
+        ] = None,
         query: Optional[str] = None,
     ) -> EvaluationV2Charts:
         """Aggregate metrics (mAP, confusion matrix, PR curve, TIDE, …)."""
@@ -184,7 +192,9 @@ def examples(
         offset: int = 0,
         sort_by: Optional[str] = None,
         sort_order: Optional[str] = None,
-        filters: Optional[Union[EvaluationV2FilterArgs, Dict[str, Any]]] = None,
+        filters: Optional[
+            Union[EvaluationV2FilterArgs, Dict[str, Any]]
+        ] = None,
         query: Optional[str] = None,
     ) -> EvaluationV2ExamplesPage:
         """Paginated TP / FP / FN match rows with prediction and annotation blobs."""

From aced4aab70192915fa0c67a9d3d2e35ba3d985a2 Mon Sep 17 00:00:00 2001
From: Luke Schaefer <luke.schaefer@scale.com>
Date: Tue, 12 May 2026 14:05:32 -0500
Subject: [PATCH 5/9] Update nucleus/data_transfer_object/evaluation_v2.py

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
---
 nucleus/data_transfer_object/evaluation_v2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nucleus/data_transfer_object/evaluation_v2.py b/nucleus/data_transfer_object/evaluation_v2.py
index 6150aee3..18607ddc 100644
--- a/nucleus/data_transfer_object/evaluation_v2.py
+++ b/nucleus/data_transfer_object/evaluation_v2.py
@@ -134,8 +134,8 @@ class EvaluationV2MatchExample(DictCompatibleModel):
     true_positive: bool
     match_type: str
     gt_area: Optional[float] = None
-    item_metadata: Dict[str, Any]
-    prediction_metadata: Dict[str, Any]
+    item_metadata: Optional[Dict[str, Any]] = None
+    prediction_metadata: Optional[Dict[str, Any]] = None
     prediction_row: Optional[Dict[str, Any]] = None
     annotation_row: Optional[Dict[str, Any]] = None
 

From 866ac71918b65a229310592fa8b1a34df702e8ae Mon Sep 17 00:00:00 2001
From: Luke Schaefer <luke.schaefer@scale.com>
Date: Tue, 12 May 2026 14:54:55 -0500
Subject: [PATCH 6/9] fix p1

---
 nucleus/evaluation_v2.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/nucleus/evaluation_v2.py b/nucleus/evaluation_v2.py
index 94191f31..3fdf5dca 100644
--- a/nucleus/evaluation_v2.py
+++ b/nucleus/evaluation_v2.py
@@ -76,10 +76,12 @@ def from_json(
             for m in raw_matches:
                 if not isinstance(m, dict):
                     continue
-                gt = m.get("groundTruthLabel") or m.get("ground_truth_label")
-                mp = m.get("modelPredictionLabel") or m.get(
-                    "model_prediction_label"
-                )
+                gt = m.get("groundTruthLabel")
+                if gt is None:
+                    gt = m.get("ground_truth_label")
+                mp = m.get("modelPredictionLabel")
+                if mp is None:
+                    mp = m.get("model_prediction_label")
                 if gt is not None and mp is not None:
                     matches.append(
                         AllowedLabelMatch(

From 658216319ac2413994a3d7ce03fc3559db647268 Mon Sep 17 00:00:00 2001
From: Luke Schaefer <luke.schaefer@scale.com>
Date: Thu, 28 May 2026 17:31:01 -0500
Subject: [PATCH 7/9] address review comments: docs, filter camelCase, status enum

---
 CHANGELOG.md                                  |   5 +
 docs/index.rst                                |  12 +--
 nucleus/__init__.py                           |  41 ++++---
 nucleus/data_transfer_object/evaluation_v2.py |  65 ++++++-----
 nucleus/evaluation_v2.py                      |  78 +++++++++-----
 pyproject.toml                                |   2 +-
 tests/test_evaluation_v2.py                   | 101 +++++++++++++++++-
 7 files changed, 231 insertions(+), 73 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 019af44e..486ff13a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,11 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.18.3](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.18.3) - 2026-05-28
+
+### Added
+- **Evaluations V2** client support for COCO-style metrics on model runs via stored `evaluation_match_v2` rows. `NucleusClient` exposes `create_evaluation_v2()`, `get_evaluation_v2()`, and `list_evaluations_v2()`. The `EvaluationV2` resource supports `wait_for_completion()`, `charts()` (mAP, confusion matrix, PR curve, TIDE, and related aggregates), `examples()` (paginated TP/FP/FN rows), `delete()`, and `refresh()`. `AllowedLabelMatch` configures allowed ground-truth / prediction label pairs; filter and response types include `EvaluationV2FilterArgs`, `EvaluationV2Charts`, `EvaluationV2ExamplesPage`, and `EvaluationV2MatchExample`. Sphinx docs cover the workflow under Evaluations V2.
+
 ## [0.18.2](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.18.2) - 2026-05-08
 
 ### Added
diff --git a/docs/index.rst b/docs/index.rst
index a33f704f..88ec8c3d 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -17,10 +17,10 @@ Nucleus is a new way—the right way—to develop ML models, helping us move awa
 Evaluations V2
 --------------
 
-Evaluation V2 runs COCO-style metrics against stored matches (``evaluation_match_v2``) for a **model run**.
-Create an evaluation with :meth:`NucleusClient.create_evaluation_v2`; poll with
-:meth:`nucleus.evaluation_v2.EvaluationV2.wait_for_completion`; then fetch aggregates via
-:meth:`nucleus.evaluation_v2.EvaluationV2.charts` or per-row examples via
+Evaluation V2 measures how well a **model run** matches ground-truth annotations.
+Create a run with :meth:`NucleusClient.create_evaluation_v2`, wait with
+:meth:`nucleus.evaluation_v2.EvaluationV2.wait_for_completion`, then read summary metrics with
+:meth:`nucleus.evaluation_v2.EvaluationV2.charts` or individual matches with
 :meth:`nucleus.evaluation_v2.EvaluationV2.examples`.
 
 .. code-block:: python
@@ -42,10 +42,6 @@ Create an evaluation with :meth:`NucleusClient.create_evaluation_v2`; poll with
    charts = evaluation.charts(iou_threshold=0.5)
    fps = evaluation.examples(match_type="FP", limit=20)
 
-The API uses REST endpoints ``/nucleus/modelRun/:id/evaluationsV2``,
-``/nucleus/evaluationsV2/:id``, ``/nucleus/evaluationsV2/:id/charts``, and
-``POST /nucleus/evaluationsV2/:id/examples``.
-
 .. _installation:
 
 Installation
diff --git a/nucleus/__init__.py b/nucleus/__init__.py
index 4e1ea270..8e551987 100644
--- a/nucleus/__init__.py
+++ b/nucleus/__init__.py
@@ -897,21 +897,20 @@ def create_evaluation_v2(
         allowed_label_matches: Optional[List[AllowedLabelMatch]] = None,
         allowed_label_matches_id: Optional[str] = None,
     ) -> EvaluationV2:
-        """Create an Evaluation V2 job for a model run.
+        """Create an evaluation for a model run.
 
-        Starts a Temporal workflow that fills ``evaluation_match_v2``. Use
-        :meth:`EvaluationV2.wait_for_completion` then :meth:`EvaluationV2.charts`
-        or :meth:`EvaluationV2.examples` for results.
+        The evaluation runs in the background. Call
+        :meth:`EvaluationV2.wait_for_completion`, then
+        :meth:`EvaluationV2.charts` or :meth:`EvaluationV2.examples` for results.
 
         Parameters:
-            model_run_id: Nucleus model run id (``run_*``).
-            name: Optional human-readable name.
-            allowed_label_matches: Optional explicit allowed label pairs; omit to use
-                the model run's default configuration.
-            allowed_label_matches_id: Optional existing allowed-label-matches config id.
+            model_run_id: Model run id (``run_*``).
+            name: Optional display name.
+            allowed_label_matches: Optional label pairs to treat as matches.
+            allowed_label_matches_id: Optional id of a saved label-match configuration.
 
         Returns:
-            :class:`EvaluationV2` loaded via ``GET /nucleus/evaluationsV2/:id``.
+            :class:`EvaluationV2`: The created evaluation.
         """
         payload: Dict[str, Any] = {}
         if name is not None:
@@ -933,15 +932,31 @@ def create_evaluation_v2(
         return self.get_evaluation_v2(str(eval_id))
 
     def get_evaluation_v2(self, evaluation_id: str) -> EvaluationV2:
-        """Fetch a single Evaluation V2 row."""
+        """Get an evaluation by id.
+
+        Parameters:
+            evaluation_id: Evaluation id (``evalv2_*``).
+
+        Returns:
+            :class:`EvaluationV2`.
+        """
         data = self.get(f"evaluationsV2/{evaluation_id}")
         return EvaluationV2.from_json(data, self)
 
     def list_evaluations_v2(self, model_run_id: str) -> List[EvaluationV2]:
-        """List Evaluation V2 rows for a model run (newest first)."""
+        """List evaluations for a model run (newest first).
+
+        Parameters:
+            model_run_id: Model run id (``run_*``).
+
+        Returns:
+            List of :class:`EvaluationV2`.
+        """
         rows = self.get(f"modelRun/{model_run_id}/evaluationsV2")
         if not isinstance(rows, list):
-            return []
+            raise RuntimeError(
+                f"Unexpected list evaluations V2 response: {rows!r}"
+            )
         return [EvaluationV2.from_json(r, self) for r in rows]
 
     @deprecated(msg="Prefer calling Dataset.info() directly.")
diff --git a/nucleus/data_transfer_object/evaluation_v2.py b/nucleus/data_transfer_object/evaluation_v2.py
index 18607ddc..a1abb443 100644
--- a/nucleus/data_transfer_object/evaluation_v2.py
+++ b/nucleus/data_transfer_object/evaluation_v2.py
@@ -1,10 +1,30 @@
-"""Pydantic models for Nucleus Evaluations V2 REST payloads."""
+"""Response and filter models for Evaluation V2."""
 
 from typing import Any, Dict, List, Literal, Optional
 
 from nucleus.pydantic_base import DictCompatibleModel
 
 
+def _snake_to_camel(name: str) -> str:
+    parts = name.split("_")
+    if len(parts) == 1:
+        return name
+    return parts[0] + "".join(part.capitalize() for part in parts[1:])
+
+
+def _camelize_filter_value(value: Any) -> Any:
+    if isinstance(value, dict):
+        return {
+            _snake_to_camel(key): (
+                val if key == "value" else _camelize_filter_value(val)
+            )
+            for key, val in value.items()
+        }
+    if isinstance(value, list):
+        return [_camelize_filter_value(item) for item in value]
+    return value
+
+
 class RangeNum(DictCompatibleModel):
     min: Optional[float] = None
     max: Optional[float] = None
@@ -16,8 +36,21 @@ class MetadataPredicate(DictCompatibleModel):
     value: Optional[Any] = None
 
 
+_FILTER_API_KEYS = {
+    "confidence_range": "confidenceRange",
+    "iou_range": "iouRange",
+    "pred_labels": "predLabels",
+    "gt_labels": "gtLabels",
+    "item_metadata": "itemMetadata",
+    "prediction_metadata": "predictionMetadata",
+    "label_equality": "labelEquality",
+    "has_ground_truth": "hasGroundTruth",
+    "tide_background": "tideBackground",
+}
+
+
 class EvaluationV2FilterArgs(DictCompatibleModel):
-    """Filter object for charts/examples calls (mirrors server evaluation_v2 SQL filters)."""
+    """Optional filters for :meth:`nucleus.evaluation_v2.EvaluationV2.charts` and :meth:`nucleus.evaluation_v2.EvaluationV2.examples`."""
 
     confidence_range: Optional[RangeNum] = None
     iou_range: Optional[RangeNum] = None
@@ -30,29 +63,13 @@ class EvaluationV2FilterArgs(DictCompatibleModel):
     tide_background: Optional[bool] = None
 
     def to_api_filters(self) -> Dict[str, Any]:
-        """Serialize to camelCase keys expected by the GraphQL / REST layer."""
+        """Return filters as a dict ready for API requests."""
         d = self.dict(exclude_none=True)
-        # pydantic v1 uses snake_case fields; server expects camelCase in JSON filters
-        out: Dict[str, Any] = {}
-        if "confidence_range" in d:
-            out["confidenceRange"] = d["confidence_range"]
-        if "iou_range" in d:
-            out["iouRange"] = d["iou_range"]
-        if "pred_labels" in d:
-            out["predLabels"] = d["pred_labels"]
-        if "gt_labels" in d:
-            out["gtLabels"] = d["gt_labels"]
-        if "item_metadata" in d:
-            out["itemMetadata"] = d["item_metadata"]
-        if "prediction_metadata" in d:
-            out["predictionMetadata"] = d["prediction_metadata"]
-        if "label_equality" in d:
-            out["labelEquality"] = d["label_equality"]
-        if "has_ground_truth" in d:
-            out["hasGroundTruth"] = d["has_ground_truth"]
-        if "tide_background" in d:
-            out["tideBackground"] = d["tide_background"]
-        return out
+        return {
+            api_key: _camelize_filter_value(d[snake_key])
+            for snake_key, api_key in _FILTER_API_KEYS.items()
+            if snake_key in d
+        }
 
 
 class MapSummary(DictCompatibleModel):
diff --git a/nucleus/evaluation_v2.py b/nucleus/evaluation_v2.py
index 3fdf5dca..4dd35385 100644
--- a/nucleus/evaluation_v2.py
+++ b/nucleus/evaluation_v2.py
@@ -1,4 +1,4 @@
-"""Nucleus Evaluation V2 — COCO-style metrics computed off ``evaluation_match_v2``."""
+"""Evaluation V2 — metrics and examples for a model run."""
 
 from __future__ import annotations
 
@@ -6,7 +6,7 @@
 import time
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Union
 from urllib.parse import urlencode
 
 import requests
@@ -16,14 +16,12 @@
     EvaluationV2ExamplesPage,
     EvaluationV2FilterArgs,
 )
-from nucleus.errors import NucleusAPIError
-
 if TYPE_CHECKING:
     from nucleus import NucleusClient
 
 
 class EvaluationV2Status(str, Enum):
-    """Lifecycle states for ``nucleus.evaluation_v2.status``."""
+    """Status of an Evaluation V2 run."""
 
     PENDING = "pending"
     COMPUTING = "computing"
@@ -32,9 +30,15 @@ class EvaluationV2Status(str, Enum):
     CANCELLED = "cancelled"
 
 
+_TERMINAL_OK: Set[EvaluationV2Status] = {
+    EvaluationV2Status.SUCCEEDED,
+    EvaluationV2Status.CANCELLED,
+}
+
+
 @dataclass
 class AllowedLabelMatch:
-    """Pair of labels that may match for IoU evaluation (snake_case JSON for the API)."""
+    """Ground-truth and prediction label pair that counts as a match."""
 
     ground_truth_label: str
     model_prediction_label: str
@@ -48,7 +52,7 @@ def to_api_dict(self) -> Dict[str, str]:
 
 @dataclass
 class EvaluationV2:
-    """A single Evaluation V2 run for a model run (``evalv2_*``)."""
+    """An Evaluation V2 run for a model run."""
 
     id: str
     model_run_id: str
@@ -61,7 +65,7 @@ class EvaluationV2:
     allowed_label_matches_id: Optional[str] = None
     allowed_label_matches: Optional[List[AllowedLabelMatch]] = None
     allowed_label_matches_name: Optional[str] = None
-    _client: Any = field(repr=False, default=None)
+    _client: Optional["NucleusClient"] = field(repr=False, default=None)
 
     @classmethod
     def from_json(
@@ -108,7 +112,11 @@ def from_json(
         )
 
     def refresh(self) -> "EvaluationV2":
-        """Reload this evaluation from ``GET /nucleus/evaluationsV2/:id``."""
+        """Reload this evaluation from Nucleus.
+
+        Returns:
+            self, with updated fields.
+        """
         if self._client is None:
             raise RuntimeError(
                 "EvaluationV2 has no client; use NucleusClient.get_evaluation_v2."
@@ -123,20 +131,26 @@ def wait_for_completion(
         timeout_sec: float = 600,
         poll_interval: float = 5,
     ) -> "EvaluationV2":
-        """Poll until status is terminal or ``timeout_sec`` elapses.
+        """Wait until the evaluation finishes or is cancelled.
+
+        Parameters:
+            timeout_sec: Maximum seconds to wait.
+            poll_interval: Seconds between status checks.
+
+        Returns:
+            self, after a terminal status is reached.
 
         Raises:
-            RuntimeError: on ``failed`` status or timeout.
+            RuntimeError: If the evaluation fails or times out.
         """
         deadline = time.monotonic() + timeout_sec
-        terminal_ok = {"succeeded", "cancelled"}
         while time.monotonic() < deadline:
             self.refresh()
-            if self.status == "failed":
+            if self.status == EvaluationV2Status.FAILED:
                 raise RuntimeError(
                     f"Evaluation {self.id} failed: {self.error_message or 'unknown'}"
                 )
-            if self.status in terminal_ok:
+            if self.status in _TERMINAL_OK:
                 return self
             time.sleep(poll_interval)
         raise RuntimeError(
@@ -145,21 +159,15 @@ def wait_for_completion(
         )
 
     def delete(self) -> None:
-        """Cancel workflow (best effort) and soft-delete (``204 No Content``)."""
+        """Delete this evaluation."""
         if self._client is None:
             raise RuntimeError("EvaluationV2 has no client.")
-        resp = self._client.make_request(
+        self._client.make_request(
             {},
             f"evaluationsV2/{self.id}",
             requests_command=requests.delete,
             return_raw_response=True,
         )
-        if resp.status_code != 204:
-            raise NucleusAPIError(
-                f"{self._client.endpoint}/evaluationsV2/{self.id}",
-                requests.delete,
-                resp,
-            )
 
     def charts(
         self,
@@ -169,7 +177,16 @@ def charts(
         ] = None,
         query: Optional[str] = None,
     ) -> EvaluationV2Charts:
-        """Aggregate metrics (mAP, confusion matrix, PR curve, TIDE, …)."""
+        """Return aggregate metrics for this evaluation.
+
+        Parameters:
+            iou_threshold: IoU threshold for matching (default 0.5).
+            filters: Optional filters (:class:`EvaluationV2FilterArgs` or dict).
+            query: Optional query string to narrow results.
+
+        Returns:
+            :class:`EvaluationV2Charts`: Summary metrics (mAP, confusion matrix, PR curve, etc.).
+        """
         if self._client is None:
             raise RuntimeError("EvaluationV2 has no client.")
         params: Dict[str, str] = {}
@@ -199,7 +216,20 @@ def examples(
         ] = None,
         query: Optional[str] = None,
     ) -> EvaluationV2ExamplesPage:
-        """Paginated TP / FP / FN match rows with prediction and annotation blobs."""
+        """Return paginated true-positive, false-positive, or false-negative examples.
+
+        Parameters:
+            match_type: ``"TP"``, ``"FP"``, or ``"FN"``.
+            limit: Page size (default 50).
+            offset: Row offset for pagination.
+            sort_by: Optional field to sort by.
+            sort_order: Optional sort direction (e.g. ``"asc"`` or ``"desc"``).
+            filters: Optional filters (:class:`EvaluationV2FilterArgs` or dict).
+            query: Optional query string to narrow results.
+
+        Returns:
+            :class:`EvaluationV2ExamplesPage`: Matching rows and total count.
+        """
         if self._client is None:
             raise RuntimeError("EvaluationV2 has no client.")
         payload: Dict[str, Any] = {
diff --git a/pyproject.toml b/pyproject.toml
index 772decb2..dd07937e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,7 +25,7 @@ ignore = ["E501", "E741", "E731", "F401"]  # Easy ignore for getting it running
 
 [tool.poetry]
 name = "scale-nucleus"
-version = "0.18.2"
+version = "0.18.3"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license =  "MIT"
 authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]
diff --git a/tests/test_evaluation_v2.py b/tests/test_evaluation_v2.py
index c8429cf7..829e34a2 100644
--- a/tests/test_evaluation_v2.py
+++ b/tests/test_evaluation_v2.py
@@ -2,10 +2,45 @@
 
 from unittest.mock import MagicMock
 
+import pytest
 import requests
 
 from nucleus import AllowedLabelMatch, EvaluationV2, NucleusClient
-from nucleus.data_transfer_object.evaluation_v2 import EvaluationV2Charts
+from nucleus.data_transfer_object.evaluation_v2 import (
+    EvaluationV2Charts,
+    EvaluationV2FilterArgs,
+    MetadataPredicate,
+    RangeNum,
+    _camelize_filter_value,
+)
+
+
+def test_evaluation_v2_filter_args_to_api_filters():
+    filters = EvaluationV2FilterArgs(
+        confidence_range=RangeNum(min=0.1, max=0.9),
+        pred_labels=["cat"],
+        item_metadata=[MetadataPredicate(key="tier", op="EQ", value="gold")],
+        has_ground_truth=True,
+    )
+    assert filters.to_api_filters() == {
+        "confidenceRange": {"min": 0.1, "max": 0.9},
+        "predLabels": ["cat"],
+        "itemMetadata": [{"key": "tier", "op": "EQ", "value": "gold"}],
+        "hasGroundTruth": True,
+    }
+
+
+def test_camelize_filter_value_nested_keys():
+    assert _camelize_filter_value({"bucket_min": 1.0, "bucket_max": 2.0}) == {
+        "bucketMin": 1.0,
+        "bucketMax": 2.0,
+    }
+
+
+def test_camelize_filter_value_preserves_predicate_value():
+    assert _camelize_filter_value(
+        {"key": "k", "op": "EQ", "value": {"keep_snake": 1}}
+    ) == {"key": "k", "op": "EQ", "value": {"keep_snake": 1}}
 
 
 def test_allowed_label_match_to_api_dict():
@@ -34,6 +69,41 @@ def test_evaluation_v2_from_json_with_matches():
     assert ev.allowed_label_matches[0].ground_truth_label == "x"
 
 
+def test_list_evaluations_v2_empty():
+    client = NucleusClient(api_key="test")
+    client.connection.get = MagicMock(return_value=[])
+    result = client.list_evaluations_v2("run_1")
+    assert result == []
+    client.connection.get.assert_called_once_with(
+        "modelRun/run_1/evaluationsV2"
+    )
+
+
+def test_list_evaluations_v2_returns_rows():
+    client = NucleusClient(api_key="test")
+    client.connection.get = MagicMock(
+        return_value=[
+            {
+                "id": "evalv2_1",
+                "model_run_id": "run_1",
+                "dataset_id": "ds_1",
+                "status": "succeeded",
+            },
+        ]
+    )
+    result = client.list_evaluations_v2("run_1")
+    assert len(result) == 1
+    assert result[0].id == "evalv2_1"
+    assert result[0]._client is client
+
+
+def test_list_evaluations_v2_invalid_response():
+    client = NucleusClient(api_key="test")
+    client.connection.get = MagicMock(return_value={"evaluations": []})
+    with pytest.raises(RuntimeError, match="Unexpected list evaluations V2"):
+        client.list_evaluations_v2("run_1")
+
+
 def test_create_evaluation_v2_then_get():
     client = NucleusClient(api_key="test")
     client.connection.make_request = MagicMock(
@@ -120,6 +190,30 @@ def test_examples_post_body():
     assert payload["offset"] == 5
 
 
+def test_examples_with_filter_args():
+    client = MagicMock(spec=NucleusClient)
+    client.post.return_value = {"rows": [], "total": 0}
+    ev = EvaluationV2(
+        id="evalv2_1",
+        model_run_id="run_1",
+        dataset_id="ds_1",
+        status="succeeded",
+        _client=client,
+    )
+    filters = EvaluationV2FilterArgs(
+        confidence_range=RangeNum(min=0.1, max=0.9),
+        pred_labels=["cat"],
+        has_ground_truth=True,
+    )
+    ev.examples("FP", limit=10, filters=filters)
+    payload = client.post.call_args[0][0]
+    assert payload["filters"] == {
+        "confidenceRange": {"min": 0.1, "max": 0.9},
+        "predLabels": ["cat"],
+        "hasGroundTruth": True,
+    }
+
+
 def test_wait_for_completion():
     client = NucleusClient(api_key="test")
     client.connection.get = MagicMock(
@@ -149,10 +243,11 @@ def test_wait_for_completion():
     assert ev.status == "succeeded"
 
 
-def test_delete_204():
+@pytest.mark.parametrize("status_code", [200, 204])
+def test_delete_success(status_code):
     client = NucleusClient(api_key="test")
     resp = MagicMock()
-    resp.status_code = 204
+    resp.status_code = status_code
     client.connection.make_request = MagicMock(return_value=resp)
     ev = EvaluationV2(
         id="evalv2_1",

From f88b665808d40e52eacd0c75a083ecd74b82336c Mon Sep 17 00:00:00 2001
From: Luke Schaefer <luke.schaefer@scale.com>
Date: Thu, 28 May 2026 18:21:23 -0500
Subject: [PATCH 8/9] fix lint: add blank line before TYPE_CHECKING block

---
 nucleus/evaluation_v2.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nucleus/evaluation_v2.py b/nucleus/evaluation_v2.py
index 4dd35385..43f8a03c 100644
--- a/nucleus/evaluation_v2.py
+++ b/nucleus/evaluation_v2.py
@@ -16,6 +16,7 @@
     EvaluationV2ExamplesPage,
     EvaluationV2FilterArgs,
 )
+
 if TYPE_CHECKING:
     from nucleus import NucleusClient
 

From 55a753f73cfec4df66b64c48c6507b2dec362faf Mon Sep 17 00:00:00 2001
From: Luke Schaefer <luke.schaefer@scale.com>
Date: Mon, 1 Jun 2026 15:05:24 -0500
Subject: [PATCH 9/9] update version: fix 0.18.4 release link, tidy CHANGELOG

---
 CHANGELOG.md   | 178 +++++++++++++++++++++++++++++++++++--------------
 pyproject.toml |   2 +-
 2 files changed, 130 insertions(+), 50 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7c49a64b..fa40d075 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,40 +5,48 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [0.18.4](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.18.3) - 2026-05-28
+## [0.18.4](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.18.4) - 2026-05-28
 
 ### Added
+
 - **Evaluations V2** client support for COCO-style metrics on model runs via stored `evaluation_match_v2` rows. `NucleusClient` exposes `create_evaluation_v2()`, `get_evaluation_v2()`, and `list_evaluations_v2()`. The `EvaluationV2` resource supports `wait_for_completion()`, `charts()` (mAP, confusion matrix, PR curve, TIDE, and related aggregates), `examples()` (paginated TP/FP/FN rows), `delete()`, and `refresh()`. `AllowedLabelMatch` configures allowed ground-truth / prediction label pairs; filter and response types include `EvaluationV2FilterArgs`, `EvaluationV2Charts`, `EvaluationV2ExamplesPage`, and `EvaluationV2MatchExample`. Sphinx docs cover the workflow under Evaluations V2.
 
 ## [0.18.3](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.18.3) - 2026-05-18
 
 ### Added
+
 - `DatasetItem.phash` field exposing the 64-character "0/1" perceptual-hash string when populated by the Nucleus backend. Available on every SDK method that yields a `DatasetItem` (e.g. `items_and_annotation_generator`, `items_generator`, `query_items`, `dataset.items`, `iloc`/`refloc`/`loc`).
 
 ## [0.18.2](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.18.2) - 2026-05-08
 
 ### Added
+
 - Dataset tags are now exposed through the SDK so customers can identify datasets labeled by Scale vs other vendors. `Dataset.info()` now returns a `tags` field, and `Dataset` exposes `get_tags()`, `add_tags()`, and `remove_tags()` methods.
 
 ## [0.18.1](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.18.1) - 2026-05-05
 
 ### Changed
+
 - `Dataset.deduplicate()` and `Dataset.deduplicate_by_ids()` now run asynchronously and return a `DeduplicationJob` instead of returning a `DeduplicationResult` directly. Call `job.result()` to wait for completion and retrieve the result.
 
 ### Removed
+
 - Sync deduplication support for `Dataset.deduplicate()` and `Dataset.deduplicate_by_ids()`.
 
 ## [0.18.0](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.18.0) - 2026-04-29
 
 ### Removed
+
 - Dropped support for Python 3.7, 3.8, and 3.9. The minimum supported Python version is now **3.10**, and the SDK now supports Python **3.10, 3.11, 3.12, 3.13, and 3.14**.
 
 ### Changed
+
 - `DatasetItem.reference_id` is now typed `Optional[str]` (defaulting to `None`) instead of `str` with a `"DUMMY_VALUE"` sentinel. The field is still required at runtime: `__post_init__` now asserts `reference_id is not None`. This matches the existing docstring (already documented as `Optional[str]`) and removes the magic sentinel.
 - `nucleus/async_utils.py` now passes `aiohttp.ClientTimeout(total=DEFAULT_NETWORK_TIMEOUT_SEC)` to `session.post`/`session.get` instead of a bare integer (no behavioral change; aligns with the typed `aiohttp` API).
 - `NucleusClient.list_autotags` now always returns a `list` (`List[dict]`) regardless of the response shape, matching its declared return type.
 
 ### Fixed
+
 - All `mypy --ignore-missing-imports nucleus` errors and notes resolved (zero issues across all source files):
   - `nucleus/evaluation_match.py`: widen `infer_confusion_category` parameters to `Optional[str]`.
   - `nucleus/annotation.py`: default `TYPE_KEY` lookup to `""`; make `Segment.index` `Optional[int]`; type `Segment.to_payload`'s `payload` as `Dict[str, Any]`.
@@ -50,6 +58,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   - `nucleus/scene.py`: annotate `Frame.__init__` and `VideoScene.info` so their bodies are type-checked.
 
 ### Tooling / CI
+
 - Expanded CircleCI installation matrix from `[3.10, 3.11]` to `[3.10, 3.11, 3.12, 3.13, 3.14]`, so every supported Python version is exercised on every PR (build sdist, install with each extras combination, smoke-test `import nucleus`).
 - Fixed pytest 9 fixture-mark errors across the test suite (`tests/cli/conftest.py`, `tests/validate/conftest.py`, `tests/test_scene.py`, `tests/test_video_scene.py`); pytest 9 turns `@pytest.mark.*` on a fixture into a hard error.
 - Cleaned up several pylint findings across the codebase (`E0606`, `W3101`, `R1737`, `R1728`, `C3001`, `C3002`, `W0719`).
@@ -60,21 +69,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [0.17.14](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.14) - 2026-04-14
 
 ### Changed
+
 - `api_key` and `limited_access_key` are now mutually exclusive in `NucleusClient`. Passing both (or setting `NUCLEUS_API_KEY` while also passing `limited_access_key`) raises a `ValueError`.
 
 ### Fixed
+
 - Docstring improvements across `NucleusClient`: fixed copy-paste errors (`get_job`, `get_slice`, `delete_slice`), removed phantom `stats_only` parameter from `list_jobs`, corrected `make_request` parameter name, and restructured `create_launch_model`/`create_launch_model_from_dir` docs for proper rendering.
 - Suppressed Sphinx warnings from inherited pydantic `BaseModel` methods by removing `inherited-members` from autoapi options.
 
 ## [0.17.13](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.13) - 2026-03-06
 
 ### Fixed
+
 - Removed the deprecated `pkg_resources` package and replaced it with `importlib-metadata`
 - Resolved ~79 errors/warnings in sphinx auto doc build errors
 
 ## [0.17.12](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.12) - 2026-02-23
 
 ### Added
+
 - `Dataset.deduplicate()` method to deduplicate images using perceptual hashing. Accepts optional `reference_ids` to deduplicate specific items, or deduplicates the entire dataset when only `threshold` is provided. Required `threshold` parameter (0-64) controls similarity matching (lower = stricter, 0 = exact matches only).
 - `Dataset.deduplicate_by_ids()` method for deduplication using internal `dataset_item_ids` directly, avoiding the reference ID to item ID mapping for improved efficiency.
 - `DeduplicationResult` and `DeduplicationStats` dataclasses for structured deduplication results.
@@ -102,6 +115,7 @@ print(result.unique_reference_ids)
 ## [0.17.11](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.11) - 2025-11-03
 
 ### Added
+
 - Support passing a limited access key via `NucleusClient(limited_access_key=...)`. When provided, the client sends the `x-limited-access-key` header on all requests (sync and async).
 - Allow using the SDK without a standard API key when a `limited_access_key` is supplied. In this mode, Basic Auth is omitted and only the limited access header is used.
 
@@ -113,6 +127,7 @@ client = nucleus.NucleusClient(limited_access_key="<LIMITED_ACCESS_KEY>")
 ```
 
 ### Changed
+
 - `Connection` accepts `extra_headers` and only includes Basic Auth when `api_key` is provided. This enables header-only auth with limited access keys.
 - Header propagation applies across all request paths, including Validate endpoints and concurrent async helpers.
 - Tests updated to be tolerant of limited-access-only runs.
@@ -121,22 +136,26 @@ client = nucleus.NucleusClient(limited_access_key="<LIMITED_ACCESS_KEY>")
 ## [0.17.10](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.10) - 2025-03-19
 
 ### Added
+
 - Adding page size variable to `items_and_annotation_generator()` to reduce timeout errors for customers with large datasets
 
 ## [0.17.9](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.9) - 2025-03-11
 
 ### Added
+
 - Adding `export_class_labels` methods to datasets and slices to extract unique class labels of the annotations in the dataset/slice.
 
 ## [0.17.8](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.8) - 2025-01-02
 
 ### Added
+
 - Adding `only_most_recent_tasks` parameter for `dataset.scene_and_annotation_generator()` and `dataset.items_and_annotation_generator()` to accommodate for multiple sets of ground truth caused by relabeled tasks. Also returns the task_id in the annotation results.
 
 ## [0.17.7](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.7) - 2024-11-05
 
 ### Added
-- Adding `slice_id` parameter for `dataset.scene_and_annotation_generator()`. 
+
+- Adding `slice_id` parameter for `dataset.scene_and_annotation_generator()`.
 
 Example usage:
 
@@ -149,7 +168,8 @@ for scene in dataset.scene_and_annotation_generator(slice_id="slc_..."):
 ## [0.17.6](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.6) - 2024-07-03
 
 ### Added
-- Method for downloading all annotations grouped by `scene` and `track_reference_id`. 
+
+- Method for downloading all annotations grouped by `scene` and `track_reference_id`.
 
 Example usage:
 
@@ -162,6 +182,7 @@ for scene in dataset.scene_and_annotation_generator():
 ## [0.17.5](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.5) - 2024-04-15
 
 ### Added
+
 - Method for uploading lidar semantic segmentation predictions, via `dataset.upload_lidar_semseg_predictions`
 
 Example usage:
@@ -177,76 +198,89 @@ dataset.upload_lidar_semseg_predictions(model, pointcloud_ref_id, predictions_s3
 
 For the expected format of the s3 predictions, refer to the [documentation here](https://docs.nucleus.scale.com/en/latest/api/nucleus/index.html#nucleus.Dataset.upload_lidar_semseg_predictions)
 
-
 ## [0.17.4](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.4) - 2024-03-25
 
 ### Modified
-- In `Model.run`, added the `model_run_name` parameter. This allows the creation of multiple model runs for datasets.
 
+- In `Model.run`, added the `model_run_name` parameter. This allows the creation of multiple model runs for datasets.
 
 ## [0.17.3] - 2024-02-29
 
 ### Added
+
 - Added the environment variable `S3_ENDPOINT` to accomodate for nonstandard S3 Endpoint URLs when asking for presigned URLs
 
 ## [0.17.2](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.2) - 2024-02-28
 
 ### Modified
+
 - In `Dataset.create_slice`, the `reference_ids` parameter is now optional. If left unspecified, it will create an empty slice
 
 ## [0.17.1](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.1) - 2024-02-22
 
 ### Added
+
 - Environment variable `NUCLEUS_SKIP_SSL_VERIFY` to skip SSL verification on requests
 
 ## [0.17.0](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.0) - 2024-02-06
 
 ### Added
+
 - Added `dataset.add_items_from_dir`
 - Added pytest-xdist for test parallelization
 
 ### Fixes
+
 - Fix test `test_models.test_remove_invalid_tag_from_model`
 
 ## [0.16.18](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.18) - 2024-02-06
 
 ### Added
+
 - Add the ability to add and remove `trained_slice_id` to a model
 
 ## [0.16.17](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.17) - 2024-01-29
 
 ### Fixes
+
 - Update documentation
 
 ## [0.16.16](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.16) - 2024-01-25
 
 ### Fixes
+
 - Minor fixes to docstring
 
 ## [0.16.15](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.15) - 2024-01-11
 
 ### Fixes
-- Fix lidar concurrent lidar pointcloud to also return intensity in case it exists in the response. 
+
+- Fix concurrent lidar pointcloud to also return intensity in case it exists in the response.
 
 ## [0.16.14](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.14) - 2024-01-03
 
 ### Fixes
+
 - Open up Pydantic version requirements as was fixed in 0.16.11
 
 ## [0.16.13](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.13) - 2023-12-13
 
 ### Added
+
 - Added `trained_slice_id` parameter to `dataset.upload_predictions()` to specify the slice ID used to train the model.
 
 ### Fixes
+
 - Fix offset generation for image chips in `dataset.items_and_annotation_chip_generator()`
 
 ## [0.16.12](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.12) - 2023-11-29
 
 ### Added
-- Added tag support for slices. 
+
+- Added tag support for slices.
 
 Example:
+
 ```python
 >>> slc = client.get_slice('slc_id')
 >>> tags = slc.tags
@@ -256,10 +290,12 @@ Example:
 ## [0.16.11](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.11) - 2023-11-22
 
 ### Added
+
 - Added `num_processes` parameter to `dataset.items_and_annotation_chip_generator()` to specify parallel processing.
 - Method to allow for concurrent task fetches for pointcloud data
 
 Example:
+
 ```python
 >>> task_ids = ['task_1', 'task_2']
 >>> resp = client.download_pointcloud_tasks(task_ids=task_ids, frame_num=1)
@@ -271,6 +307,7 @@ Example:
 ```
 
 ### Fixes
+
 - Support environments using pydantic>=2
 
 ## [0.16.10](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.10) - 2023-11-22
@@ -278,6 +315,7 @@ Example:
 Allow creating a dataset by crawling all images in a directory, recursively. Also supports privacy mode datasets.
 
 #### Example structure:
+
 ```
 ~/Documents/
     data/
@@ -300,7 +338,7 @@ client.create_dataset_from_dir(data_dir)
 
 #### Example Privacy Mode:
 
-This requires that a proxy (or file server) is setup  and can serve files _relative_ to the data_dir
+This requires that a proxy (or file server) is set up and can serve files _relative_ to the data_dir
 
 ```python
 data_dir = "~/Documents/data"
@@ -315,7 +353,6 @@ client.create_dataset_from_dir(
 This would create a dataset `my-dataset`, and when opened in Nucleus, the images would be requested to the path:
 `<privacy_mode_proxy>/<img ref id>`, for example: `http://localhost:5000/assets/2022/img01.png`
 
-
 ## [0.16.9](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.9) - 2023-11-17
 
 ### Fixes
@@ -327,108 +364,121 @@ This would create a dataset `my-dataset`, and when opened in Nucleus, the images
 ### Added
 
 #### Dataset Item width and height
+
 - Allow passing width and height to `DatasetItem`
 - This is _required_ when using privacy mode
 
 #### Dataset Item Fetch
+
 - Added `dataset.items_and_annotation_chip_generator()` functionality to generate chips of images in s3 or locally.
 - Added `query` parameter for `dataset.items_and_annotation_generator()` to filter dataset items.
 
 ### Removed
-- `upload_to_scale` is no longer a property in `DatasetItem`, users should instead specify `use_privacy_mode` on the dataset during creation
 
+- `upload_to_scale` is no longer a property in `DatasetItem`, users should instead specify `use_privacy_mode` on the dataset during creation
 
 ## [0.16.7](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.7) - 2023-11-03
 
 ### Added
-- Allow direct embedding vector upload together with dataset items. `DatasetItem` now has an additional parameter called `embedding_info` which can be used to directly upload embeddings when a dataset is uploaded.
-- Added `dataset.embedding_indexes` property, which exposes information about every embedding index which belongs to the dataset.   
 
+- Allow direct embedding vector upload together with dataset items. `DatasetItem` now has an additional parameter called `embedding_info` which can be used to directly upload embeddings when a dataset is uploaded.
+- Added `dataset.embedding_indexes` property, which exposes information about every embedding index which belongs to the dataset.
 
 ## [0.16.6](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.6) - 2023-11-01
 
 ### Added
+
 - Allow datasets to be created in "privacy mode". For example, `client.create_dataset('name', use_privacy_mode=True)`.
 - Privacy Mode lets customers use Nucleus without sensitive raw data ever leaving their servers.
 - When set to `True`, you can submit URLs to Nucleus that link to raw data assets like images or point clouds, instead of transferring that data to Scale. Access control is then completely in the hands of users: URLs may optionally be protected behind your corporate VPN or an IP whitelist. When you load a Nucleus web page, your browser will directly fetch the raw data from your servers without it ever being accessible to Scale.
 
-
 ## [0.16.5](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.5) - 2023-10-30
 
 ### Added
-- Added a `description` to the slice info. 
+
+- Added a `description` to the slice info.
 
 ### Changed
-- Made `skeleton` key optional on `KeypointsAnnotation`. 
 
+- Made `skeleton` key optional on `KeypointsAnnotation`.
 
 ## [0.16.4](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.4) - 2023-10-23
 
 ### Added
+
 - Added a `query_objects` method on the Dataset class.
 - Example
+
 ```shell
 >>> ds = client.get_dataset('ds_id')
 >>> objects = ds.query_objects('annotations.metadata.distance_to_device > 150', ObjectQueryType.GROUND_TRUTH_ONLY)
 [CuboidAnnotation(label="", dimensions={}, ...), ...]
 ```
-- Added `EvaluationMatch` class to represent IOU Matches, False Positives and False Negatives retrieved through the `query_objects` method 
 
+- Added `EvaluationMatch` class to represent IOU Matches, False Positives and False Negatives retrieved through the `query_objects` method
 
 ## [0.16.3](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.3) - 2023-10-10
 
 ### Added
+
 - Added a `query_scenes` method on the Dataset class.
 - Example
+
 ```shell
 >>> ds = client.get_dataset('ds_id')
 >>> scenes = ds.query_scenes('scene.metadata.foo = "baz"')
 [Scene(reference_id="", metadata={}, ...), ...]
 ```
 
-
 ## [0.16.2](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.2) - 2023-10-03
 
 ### Fixed
-- Raise error on all error states for AsyncJob.sleep_until_complete(). Before it only handled the deprecated "Errored"
 
+- Raise error on all error states for AsyncJob.sleep_until_complete(). Before it only handled the deprecated "Errored"
 
 ## [0.16.1](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.1) - 2023-09-18
 
 ### Added
+
 - Added `asynchronous` parameter for `slice.export_embeddings()` and `dataset.export_embeddings()` to allow embeddings to be exported asynchronously.
 
 ### Changed
+
 - Changed `slice.export_embeddings()` and `dataset.export_embeddings()` to be asynchronous by deafult.
 
 ## [0.16.0](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.16.0) - 2023-09-18
 
 ### Removed
+
 - Support for Python 3.6 - it is end of life for more than a year
 
 ### Fixed
+
 - Development environment for Python 3.11
-- 
+-
 
 ## [0.15.11](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.15.11) - 2023-09-15
 
 ### Added
-- Added `slice.export_raw_json()` functionality to support raw export of object slices (annotations, predictions, item and scene level data). Currently does not support image slices. 
 
+- Added `slice.export_raw_json()` functionality to support raw export of object slices (annotations, predictions, item and scene level data). Currently does not support image slices.
 
 ## [0.15.10](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.15.10) - 2023-07-20
 
 ### Added
+
 - Fix `slice.export_predictions(args)` and `slice.export_predictions_generator(args)` methods to return `Predictions` instead of `Annotations`
 
 ## [0.15.9](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.15.9) - 2023-06-26
 
 ### Added
+
 - Support for Scale Launch client v1.0.0 and higher for the Nucleus + Launch integration
 
 ## [0.15.7](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.15.7) - 2023-06-09
 
 ### Added
+
 - Allow for downloading pointcloud data for a give task and frame number, example:
 
 ```python
@@ -442,39 +492,45 @@ np_pts = np.array([pt.to_list() for pt in pts])
 ## [0.15.6](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.15.6) - 2023-06-03
 
 ### Changed
+
 - Document new restrictions to slice create/append.
 - `Dataset.create_slice` and `Slice.append` methods cannot exceed 10,000 items per request.
 
 ## [0.15.5](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.15.5) - 2023-05-8
 
 ### Fixed
-- Give default annotation_id to `KeypointAnnotations` when not specified
 
+- Give default annotation_id to `KeypointAnnotations` when not specified
 
 ## [0.15.4](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.15.4) - 2023-03-21
 
 ### Changed
-- Added `create_slice_by_ids` to create slices from dataset item, scene, and object IDs
 
+- Added `create_slice_by_ids` to create slices from dataset item, scene, and object IDs
 
 ## [0.15.3](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.15.3) - 2023-03-02
 
 ### Changed
+
 - Allow denormalized scores in `EvaluationResult`s
 
 ## [0.15.2](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.15.2) - 2023-02-10
 
 ### Changed
+
 - Fix `client.create_launch_model_from_dir(args)` method
 
 ## [0.15.1](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.15.1) - 2023-01-16
 
 ### Changed
+
 - Better filter tuning of `client.list_jobs(args)` method
 
 ### Added
+
 - Dataset method to filter jobs, and statistics on running jobs
-Example:
+  Example:
+
 ```python
 >>> client = nucleus.NucleusClient(API_KEY)
 >>> ds = client.get_dataset(ds_id)
@@ -485,6 +541,7 @@ Example:
 ```
 
 Detailed Example
+
 ```python
 >>> from nucleus.job import CustomerJobTypes
 >>> client = nucleus.NucleusClient(API_KEY)
@@ -501,15 +558,18 @@ Detailed Example
 # ... returns list of AsyncJob objects
 ```
 
-
 ## [0.15.0](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.15.0) - 2022-12-19
 
 ### Changed
+
 - `dataset.slices` now returns a list of `Slice` objects instead of a list of IDs
 
 ### Added
+
 Retrieve a slice from a dataset by its name, or all slices of a particular type from a dataset. Where type is one of `["dataset_item", "object", "scene"]`.
+
 - `dataset.get_slices(name, slice_type): List[Slice]`
+
 ```python
 from nucleus.slice import SliceType
 dataset.get_slices(name="My Slice")
@@ -519,55 +579,58 @@ dataset.get_slices(slice_type=SliceType.DATASET_ITEM)
 ## [0.14.30](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.30) - 2022-11-29
 
 ### Added
+
 - Support for uploading track-level metrics to external evaluation functions using track_ref_ids
 
 ## [0.14.29](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.29) - 2022-11-22
 
 ### Added
+
 - Support for `Track`s, enabling ground truth annotations and model predictions to be grouped across dataset items and scenes
 - Helpers to update track metadata, as well as to create and delete tracks at the dataset level
 
-
 ## [0.14.28](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.28) - 2022-11-17
 
 ### Added
+
 - Support for appending to slice with scene reference IDs
 - Better error handling when appending to a slice with non-existent reference IDs
 
-
 ## [0.14.27](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.27) - 2022-11-04
 
 ### Added
+
 - Support for scene-level external evaluation functions
 - Support for uploading custom scene-level metrics
 
-
 ## [0.14.26](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.26) - 2022-11-01
 
 ### Added
+
 - Support for fetching scene from a `DatasetItem.reference_id`
-Example:
+  Example:
+
 ```python
 dataset = client.get_dataset("<dataset_id>")
 assert dataset.is_scene  # only works on scene datasets
 some_item = dataset.iloc(0)
-dataset.get_scene_from_item_ref_id(some_item['item'].reference_id) 
+dataset.get_scene_from_item_ref_id(some_item['item'].reference_id)
 ```
 
-
 ## [0.14.25](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.25) - 2022-10-20
 
 ### Updated
+
 - Items of a slice can be retrieved by Slice property `.item`
 - The type of items returned from `.items` is based on the slice `type`:
   - `slice.type == 'dataset_item'` => list of `DatasetItem` objects
   - `slice.type == 'object'` => list of `Annotation`/`Prediction` objects
   - `slice.type == 'scene'` => list of `Scene` objects
 
-
 ## [0.14.24](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.24) - 2022-10-19
 
 ### Fixed
+
 - Late imports for seldomly used heavy libraries. Sped up CLI invocation and autocomplation.
   If you had shell completions installed before we recommend removeing them from your .(bash|zsh)rc
   file and reinstalling with nu install-completions
@@ -575,39 +638,43 @@ dataset.get_scene_from_item_ref_id(some_item['item'].reference_id)
 ## [0.14.23](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.23) - 2022-10-17
 
 ### Added
-- Support for building slices via Nucleus' Smart Sample
 
+- Support for building slices via Nucleus' Smart Sample
 
 ## [0.14.22](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.22) - 2022-10-14
 
 ### Added
-- Trigger for calculating Validate metrics for a model. This allows underperforming slice discovery and more model analysis
 
+- Trigger for calculating Validate metrics for a model. This allows underperforming slice discovery and more model analysis
 
 ## [0.14.21](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.21) - 2022-09-28
 
 ### Added
-- Support for `context_attachment` metadata values. See [upload metadata](https://nucleus.scale.com/docs/upload-metadata) for more information.
 
+- Support for `context_attachment` metadata values. See [upload metadata](https://nucleus.scale.com/docs/upload-metadata) for more information.
 
 ## [0.14.20](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.20) - 2022-09-23
 
 ### Fixed
+
 - Local uploads are correctly batched and prevents flooding the network with requests
 
 ## [0.14.19](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.19) - 2022-08-26
 
 ### Added
+
 - Support for Coordinate metadata values. See [upload metadata](https://nucleus.scale.com/docs/upload-metadata) for more information.
 
 ## [0.14.18](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.18) - 2022-08-16
 
 ### Added
+
 - Metadata and confidence support for scene categories
 
 ## [0.14.17](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.17) - 2022-08-15
 
 ### Fixed
+
 - Fix `AsyncJob` status payload keys causing test failures
 - Fix `AsyncJob` export test
 - Fix `page_size` for `{Dataset,Slice}.items_and_annotatation_generator()`
@@ -616,105 +683,120 @@ dataset.get_scene_from_item_ref_id(some_item['item'].reference_id)
 ## [0.14.16](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.16) - 2022-08-12
 
 ### Added
+
 - Scene categorization support
 
 ## [0.14.15](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.15) - 2022-08-11
 
 ### Removed
+
 - Removed s3fs, fsspec dependencies for simpler installation in various environments
 
 ## [0.14.14](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.14) - 2022-08-11
 
 ### Added
+
 - client.slices to list all of users slices independent of dataset
-- Added optional parameter `asynchronous: bool` to `Dataset.update_item_metadata` and  `Dataset.update_scene_metadata`,
-allowing the update to run as a background job when set to `True`
+- Added optional parameter `asynchronous: bool` to `Dataset.update_item_metadata` and `Dataset.update_scene_metadata`,
+  allowing the update to run as a background job when set to `True`
 
 ### Fixed
-- Validate unit test listing and evaluation history listing. Now uses new bulk fetch endpoints for faster listing.
 
+- Validate unit test listing and evaluation history listing. Now uses new bulk fetch endpoints for faster listing.
 
 ## [0.14.13](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.13) - 2022-08-10
 
 ### Fixed
-- Fix payload parsing for scene export
 
+- Fix payload parsing for scene export
 
 ## [0.14.12](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.12) - 2022-08-05
 
 ### Added
+
 - Added auto-paginated `Slice.export_predictions_generator`
 
 ### Fixed
-- Change `{Dataset,Slice}.items_and_annotation_generator` to work with improved paginate endpoint
 
+- Change `{Dataset,Slice}.items_and_annotation_generator` to work with improved paginate endpoint
 
 ## [0.14.11](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.11) - 2022-07-20
 
 ### Fixed
+
 - Various docstring and typing updates
 
 ## [0.14.10](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.10) - 2022-07-20
 
 ### Added
+
 - `Dataset.items_and_annotation_generator()`
 
 ### Fixed
+
 - `Slice.items_and_annotation_generator()` bug
 
 ## [0.14.9](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.9) - 2022-07-14
 
 ### Fixed
+
 - NoneType errors in Validate
 
 ## [0.14.8](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.8) - 2022-07-14
 
 ### Fixed
+
 - Segmentation metrics filtering. Prior version artificially boosted performance when filtering was applied.
 
 ## [0.14.7](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.7) - 2022-07-07
 
 ### Added
+
 - Support running structured queries and retrieving item results via API
 
 ## [0.14.6](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.6) - 2022-07-07
 
 ### Fixed
+
 - `Dataset.delete_annotations` now defaults `reference_ids` to an empty list and `keep_history` to true
 
 ## [0.14.5](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.5) - 2022-07-05
 
 ### Fixed
+
 - Averaging of rich semantic segmentation taxonomies not taking into account missing classes
 
 ## [0.14.4](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.4) - 2022-06-21
 
 ### Fixed
+
 - Regression that caused Validate filter statements to not work
 
 ## [0.14.3](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.3) - 2022-06-21
 
 ### Fixed
-- CLI installation without GEOS errored out. Now handled by importer.
 
+- CLI installation without GEOS errored out. Now handled by importer.
 
 ## [0.14.2](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.2) - 2022-06-21
 
 ### Fixed
+
 - Better error reporting when everything is filtered out by a filter statement in a Validate evaluation function
 
 ## [0.14.1](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.1) - 2022-06-20
 
 ### Fixed
-- Adapt Segmentation metrics to better support instance segmentation
-- Change Segmentation/Polygon metrics to use new segmentation metrics 
 
+- Adapt Segmentation metrics to better support instance segmentation
+- Change Segmentation/Polygon metrics to use new segmentation metrics
 
 ## [0.14.0](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.0) - 2022-06-16
 
 ### Added
 
 - Allow creation/deletion of model tags on new and existing models, eg:
+
 ```python
 # on model creation
 model = client.create_model(name="foo_model", reference_id="foo-model-ref", tags=["some tag"])
@@ -730,17 +812,19 @@ existing_model.remove_tags(['tag a'])
 ## [0.13.5](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.13.4) - 2022-06-15
 
 ### Fixed
-- Guard against invalid skeleton indexes in KeypointsAnnotation
 
+- Guard against invalid skeleton indexes in KeypointsAnnotation
 
 ## [0.13.4](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.13.4) - 2022-06-09
 
 ### Fixed
-- Guard against extras imports 
+
+- Guard against extras imports
 
 ## [0.13.3](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.13.3) - 2022-06-09
 
 ### Fixed
+
 - Make installation of scale-launch optional (again!).
 
 ## [0.13.2](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.13.2) - 2022-06-08
@@ -748,7 +832,7 @@ existing_model.remove_tags(['tag a'])
 ### Fixed
 
 - Open up requirements for easier installation in more environments. Add more optional installs under `metrics`
- 
+
 ## [0.13.1](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.13.1) - 2022-06-08
 
 ### Fixed
@@ -767,7 +851,6 @@ existing_model.remove_tags(['tag a'])
 
 - Poetry dependency list
 
-
 ## [0.12.3](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.12.3) - 2022-06-02
 
 ### Added
@@ -776,14 +859,12 @@ existing_model.remove_tags(['tag a'])
 - `Dataset.export_scale_task_info`
 - `Slice.export_scale_task_info`
 
-
 ## [0.12.2](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.12.2) - 2022-06-02
 
 ### Added
 
 - Allow users to upload external evaluation results calculated on the client side.
 
-
 ## [0.12.1](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.12.1) - 2022-06-02
 
 ### Added
@@ -796,7 +877,6 @@ existing_model.remove_tags(['tag a'])
 
 - Allow users to create external evaluation functions for Scenario Tests in Validate.
 
-
 ## [0.11.2](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.11.2) - 2022-05-20
 
 ### Changed
diff --git a/pyproject.toml b/pyproject.toml
index dd07937e..ac2ad14b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,7 +25,7 @@ ignore = ["E501", "E741", "E731", "F401"]  # Easy ignore for getting it running
 
 [tool.poetry]
 name = "scale-nucleus"
-version = "0.18.3"
+version = "0.18.4"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license =  "MIT"
 authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]