githubnext · github-actions · May 18, 2026 · May 18, 2026 · May 26, 2026 · Jun 4, 2026
diff --git a/benchmarks/pandas/bench_cut_bins_to_frame.py b/benchmarks/pandas/bench_cut_bins_to_frame.py
@@ -0,0 +1,68 @@
+"""Benchmark: cut_bins_to_frame — pd.cut with value_counts and bin summary on 100k rows.
+
+Bins a 100k-element float array into NUM_BINS equal-width intervals with pd.cut,
+then builds a per-bin summary frame, a label -> count dict and a frame of bin
+edges.  Prints one JSON object with the mean and total wall-clock time in ms.
+"""
+import json
+import time
+
+import numpy as np
+import pandas as pd
+
+SIZE = 100_000
+NUM_BINS = 20
+WARMUP = 5
+ITERATIONS = 50
+
+# Input values cycle through the multiples of 0.1 from 0.0 up to 99.9.
+data = np.array([(i % 1000) * 0.1 for i in range(SIZE)])
+
+# Untimed warm-up: the same work as the measured loop below.
+for _ in range(WARMUP):
+    # pandas equivalent of cutBinsToFrame: cut + value_counts on the categorical result
+    cut_result = pd.cut(data, NUM_BINS)
+    # pd.cut returns a Categorical for ndarray input; Categorical.value_counts()
+    # takes no `sort` argument and already yields one count per bin in bin order.
+    counts = cut_result.value_counts()
+    # Summary DataFrame equivalent to cutBinsToFrame
+    summary = pd.DataFrame({
+        "bin": counts.index.astype(str),
+        "left": [iv.left for iv in counts.index],
+        "right": [iv.right for iv in counts.index],
+        "count": counts.values,
+        "frequency": counts.values / len(data),
+    })
+    # cutBinCounts equivalent: counts dict
+    count_dict = dict(zip(counts.index.astype(str), counts.values))
+    # binEdges equivalent: DataFrame of interval edges
+    edges = pd.DataFrame({
+        "left": [iv.left for iv in counts.index],
+        "right": [iv.right for iv in counts.index],
+    })
+
+# Timed section: identical work, measured with time.perf_counter().
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    cut_result = pd.cut(data, NUM_BINS)
+    counts = cut_result.value_counts()  # Categorical.value_counts() has no `sort`
+    summary = pd.DataFrame({
+        "bin": counts.index.astype(str),
+        "left": [iv.left for iv in counts.index],
+        "right": [iv.right for iv in counts.index],
+        "count": counts.values,
+        "frequency": counts.values / len(data),
+    })
+    count_dict = dict(zip(counts.index.astype(str), counts.values))
+    edges = pd.DataFrame({
+        "left": [iv.left for iv in counts.index],
+        "right": [iv.right for iv in counts.index],
+    })
+total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds
+
+print(json.dumps({
+    "function": "cut_bins_to_frame",
+    "mean_ms": total / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total,
+}))
diff --git a/benchmarks/pandas/bench_math_ops.py b/benchmarks/pandas/bench_math_ops.py
@@ -0,0 +1,46 @@
+"""Benchmark: math_ops — abs / round on Series and DataFrame of 100k rows.
+
+Times Series.abs, DataFrame.abs, Series.round(1) and DataFrame.round(1) on float
+data and prints one JSON object with the mean and total wall-clock time in ms.
+"""
+import json
+import time
+
+import numpy as np
+import pandas as pd
+
+SIZE = 100_000
+WARMUP = 5
+ITERATIONS = 50
+
+positions = np.arange(SIZE)
+magnitudes = positions + 0.567
+# Sign alternates: negative at even positions, positive at odd positions.
+s = pd.Series(np.where(positions % 2 == 0, -magnitudes, magnitudes))
+# Column "a" holds only negative values, column "b" only positive ones.
+df = pd.DataFrame({"a": -(positions + 0.123), "b": positions + 0.456})
+
+# Untimed warm-up passes over the same four operations.
+for _ in range(WARMUP):
+    s.abs()
+    df.abs()
+    s.round(1)
+    df.round(1)
+
+# Timed passes; results are discarded, only the elapsed time matters.
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    s.abs()
+    df.abs()
+    s.round(1)
+    df.round(1)
+elapsed_s = time.perf_counter() - start
+total_ms = elapsed_s * 1000
+
+report = {
+    "function": "math_ops",
+    "mean_ms": total_ms / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}
+print(json.dumps(report))
diff --git a/benchmarks/pandas/bench_na_ops.py b/benchmarks/pandas/bench_na_ops.py
@@ -0,0 +1,54 @@
+"""Benchmark: na_ops — isna / notna / ffill / bfill on 100k rows.
+
+Times pd.isna, pd.notna, Series.ffill/bfill and DataFrame.ffill/bfill on float
+data containing NaN, and prints one JSON object with mean and total time in ms.
+"""
+import json
+import time
+
+import numpy as np
+import pandas as pd
+
+SIZE = 100_000
+WARMUP = 5
+ITERATIONS = 50
+
+positions = np.arange(SIZE)
+
+# float64 Series equal to its position, with NaN at every multiple of 5.
+s = pd.Series(np.where(positions % 5 == 0, np.nan, positions.astype("float64")))
+
+# Column "b" is twice the position, with NaN at every multiple of 7.
+df = pd.DataFrame({
+    "a": s,
+    "b": pd.Series(np.where(positions % 7 == 0, np.nan, positions * 2.0)),
+})
+
+# Untimed warm-up passes over the same six operations.
+for _ in range(WARMUP):
+    pd.isna(s)
+    pd.notna(s)
+    s.ffill()
+    s.bfill()
+    df.ffill()
+    df.bfill()
+
+# Timed passes; results are discarded.
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    pd.isna(s)
+    pd.notna(s)
+    s.ffill()
+    s.bfill()
+    df.ffill()
+    df.bfill()
+elapsed_s = time.perf_counter() - start
+total_ms = elapsed_s * 1000
+
+report = {
+    "function": "na_ops",
+    "mean_ms": total_ms / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}
+print(json.dumps(report))
diff --git a/benchmarks/pandas/bench_notna_boolean.py b/benchmarks/pandas/bench_notna_boolean.py
@@ -0,0 +1,46 @@
+"""Benchmark: notna_boolean — boolean-mask indexing on 100k rows.
+
+Times Series selection with a boolean Series mask and with its negation, plus
+DataFrame row selection with a boolean ndarray, and prints one JSON object
+with the mean and total wall-clock time in ms.
+"""
+import json
+import time
+
+import numpy as np
+import pandas as pd
+
+SIZE = 100_000
+WARMUP = 5
+ITERATIONS = 50
+
+positions = np.arange(SIZE)
+
+s = pd.Series(positions)
+mask = pd.Series(positions % 2 == 0)  # True at even positions
+bool_arr = positions % 3 != 0  # plain ndarray: False at multiples of 3
+
+df = pd.DataFrame({"a": positions, "b": positions * 2})
+
+# Untimed warm-up passes.
+for _ in range(WARMUP):
+    s[mask]
+    s[~mask]
+    df[bool_arr]
+
+# Timed passes; the selections are discarded.
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    s[mask]
+    s[~mask]
+    df[bool_arr]
+elapsed_s = time.perf_counter() - start
+total_ms = elapsed_s * 1000
+
+report = {
+    "function": "notna_boolean",
+    "mean_ms": total_ms / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}
+print(json.dumps(report))
diff --git a/benchmarks/pandas/bench_read_html.py b/benchmarks/pandas/bench_read_html.py
@@ -0,0 +1,65 @@
+"""
+Benchmark: pd.read_html — parse HTML tables into DataFrames.
+Outputs JSON: {"function": "read_html", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+"""
+import io
+import json
+import time
+import math
+
+# pandas, and lxml (the parser pandas.read_html uses by default), are installed
+# on demand so the script can also run in a bare environment.
+try:
+    import pandas as pd
+except ImportError:
+    import subprocess, sys
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "pandas", "--quiet"])
+    import pandas as pd
+
+try:
+    import lxml  # noqa: F401
+except ImportError:
+    # No re-import afterwards: this script never references lxml itself.
+    import subprocess, sys
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "lxml", "--quiet"])
+
+ROWS = 1_000
+WARMUP = 3
+ITERATIONS = 20
+
+
+def build_html(rows: int) -> str:
+    """Return the markup of one HTML table with ``rows`` generated body rows.
+
+    The header row (id, name, value, score) goes in ``<thead>``; each body row
+    is derived from its index ``i``.  ``rows=0`` gives an empty ``<tbody>``.
+    """
+    header = "<tr><th>id</th><th>name</th><th>value</th><th>score</th></tr>"
+    body_rows = [
+        f"<tr><td>{i}</td><td>item_{i % 100}</td><td>{i * 1.5:.2f}</td><td>{math.sin(i * 0.01):.6f}</td></tr>"
+        for i in range(rows)
+    ]
+    return f"<table><thead>{header}</thead><tbody>{''.join(body_rows)}</tbody></table>"
+
+
+html = build_html(ROWS)
+
+# read_html gets a file-like object: passing the markup itself as a string is
+# deprecated since pandas 2.1.  A StringIO is exhausted by one read, so each
+# call receives a fresh wrapper around the same markup.
+# Warm-up
+for _ in range(WARMUP):
+    pd.read_html(io.StringIO(html))
+
+# Timed passes; the parsed frames are discarded.
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    pd.read_html(io.StringIO(html))
+total_ms = (time.perf_counter() - start) * 1000
+
+print(json.dumps({
+    "function": "read_html",
+    "mean_ms": total_ms / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}))
diff --git a/benchmarks/pandas/bench_reduce_ops.py b/benchmarks/pandas/bench_reduce_ops.py
@@ -0,0 +1,50 @@
+"""Benchmark: reduce_ops — nunique / any / all on Series and DataFrame of 100k rows.
+
+Times Series.nunique, boolean Series.any/all and DataFrame.nunique, and prints
+one JSON object with the mean and total wall-clock time in ms.
+"""
+import json
+import time
+
+import numpy as np
+import pandas as pd
+
+SIZE = 100_000
+WARMUP = 5
+ITERATIONS = 50
+
+positions = np.arange(SIZE)
+
+s = pd.Series(positions % 1000)
+bool_s = pd.Series(positions > 0)  # False only at position 0
+# Each column cycles through a different number of values (500 / 200 / 100).
+df = pd.DataFrame({
+    "a": positions % 500,
+    "b": positions % 200,
+    "c": positions % 100,
+})
+
+# Untimed warm-up passes over the same four reductions.
+for _ in range(WARMUP):
+    s.nunique()
+    bool_s.any()
+    bool_s.all()
+    df.nunique()
+
+# Timed passes; results are discarded.
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    s.nunique()
+    bool_s.any()
+    bool_s.all()
+    df.nunique()
+elapsed_s = time.perf_counter() - start
+total_ms = elapsed_s * 1000
+
+report = {
+    "function": "reduce_ops",
+    "mean_ms": total_ms / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}
+print(json.dumps(report))
diff --git a/benchmarks/pandas/bench_rename_ops.py b/benchmarks/pandas/bench_rename_ops.py
@@ -0,0 +1,50 @@
+"""Benchmark: rename_ops — rename / add_prefix / add_suffix on Series/DataFrame of 100k rows.
+
+Times Series.rename with a label-mapping function, DataFrame.rename with a
+column mapping, DataFrame.add_prefix and DataFrame.add_suffix, and prints one
+JSON object with the mean and total wall-clock time in ms.
+"""
+import json
+import time
+
+import numpy as np
+import pandas as pd
+
+SIZE = 100_000
+WARMUP = 5
+ITERATIONS = 50
+
+positions = np.arange(SIZE)
+
+# Series labelled "row_0", "row_1", ... so rename has string labels to map.
+s = pd.Series(positions, index=[f"row_{i}" for i in range(SIZE)])
+df = pd.DataFrame({
+    "col_a": positions,
+    "col_b": positions * 2,
+    "col_c": positions * 3,
+})
+
+# Untimed warm-up passes over the same four operations.
+for _ in range(WARMUP):
+    s.rename(lambda label: f"new_{label}")
+    df.rename(columns={"col_a": "a", "col_b": "b"})
+    df.add_prefix("pre_")
+    df.add_suffix("_suf")
+
+# Timed passes; the renamed objects are discarded.
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    s.rename(lambda label: f"new_{label}")
+    df.rename(columns={"col_a": "a", "col_b": "b"})
+    df.add_prefix("pre_")
+    df.add_suffix("_suf")
+elapsed_s = time.perf_counter() - start
+total_ms = elapsed_s * 1000
+
+report = {
+    "function": "rename_ops",
+    "mean_ms": total_ms / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}
+print(json.dumps(report))
diff --git a/benchmarks/pandas/bench_to_json_denormalize.py b/benchmarks/pandas/bench_to_json_denormalize.py
@@ -0,0 +1,53 @@
+"""Benchmark: to_json_denormalize — json orient variants on 10k-row DataFrame.
+
+Times DataFrame.to_dict("records") and DataFrame.to_json with the "records",
+"split" and "index" orients, and prints one JSON object with the mean and total
+wall-clock time in ms.
+"""
+import json
+import time
+
+import numpy as np
+import pandas as pd
+
+ROWS = 10_000
+WARMUP = 5
+ITERATIONS = 30
+
+# DataFrame matching the tsb benchmark (nested-structure-like columns)
+row_ids = range(ROWS)
+df = pd.DataFrame({
+    "name": [f"user_{i}" for i in row_ids],
+    "address.city": [f"city_{i % 100}" for i in row_ids],
+    "address.zip": [str(10000 + (i % 9000)) for i in row_ids],
+    "score": np.arange(ROWS) * 0.01,
+})
+
+# Untimed warm-up passes.
+for _ in range(WARMUP):
+    # toJsonDenormalize counterpart: flat record dicts (nesting is not rebuilt here)
+    recs = df.to_dict("records")
+    # toJsonRecords counterpart: orient="records"
+    df.to_json(orient="records")
+    # toJsonSplit counterpart: orient="split"
+    df.to_json(orient="split")
+    # toJsonIndex counterpart: orient="index"
+    df.to_json(orient="index")
+
+# Timed passes over the same four conversions.
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    recs = df.to_dict("records")
+    df.to_json(orient="records")
+    df.to_json(orient="split")
+    df.to_json(orient="index")
+elapsed_s = time.perf_counter() - start
+total_ms = elapsed_s * 1000
+
+report = {
+    "function": "to_json_denormalize",
+    "mean_ms": total_ms / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}
+print(json.dumps(report))
diff --git a/benchmarks/pandas/bench_value_counts_full.py b/benchmarks/pandas/bench_value_counts_full.py
@@ -0,0 +1,39 @@
+"""Benchmark: value_counts_full — value_counts(bins=N) on Series of 100k rows.
+
+Times Series.value_counts with bins=10 and bins=20 on seeded random floats and
+prints one JSON object with the mean and total wall-clock time in ms.
+"""
+import json
+import time
+
+import numpy as np
+import pandas as pd
+
+SIZE = 100_000
+WARMUP = 5
+ITERATIONS = 50
+
+# Fixed seed keeps the input identical from run to run.
+rng = np.random.default_rng(42)
+s = pd.Series(rng.random(SIZE) * 100)
+
+# Untimed warm-up passes.
+for _ in range(WARMUP):
+    s.value_counts(bins=10)
+    s.value_counts(bins=20)
+
+# Timed passes; the counts are discarded.
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    s.value_counts(bins=10)
+    s.value_counts(bins=20)
+elapsed_s = time.perf_counter() - start
+total_ms = elapsed_s * 1000
+
+report = {
+    "function": "value_counts_full",
+    "mean_ms": total_ms / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}
+print(json.dumps(report))