Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions benchmarks/pandas/bench_cut_bins_to_frame.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""Benchmark: cut_bins_to_frame — pd.cut with value_counts and bin summary on 100k rows."""
import json, time
import numpy as np
import pandas as pd

SIZE = 100_000
NUM_BINS = 20
WARMUP = 5
ITERATIONS = 50

# Sawtooth input 0.0, 0.1, ..., 99.9 repeated. Built vectorised; the values
# equal [(i % 1000) * 0.1 for i in range(SIZE)].
data = (np.arange(SIZE) % 1000) * 0.1


def _run_once(values, num_bins):
    """Run one pass of the benchmarked workload.

    Bins *values* into *num_bins* equal-width intervals and derives three
    artefacts from the per-bin counts.  The same function is used for the
    warm-up and for the timed loop so both always execute identical work.

    Returns:
        A ``(summary, count_dict, edges)`` tuple: the per-bin summary frame,
        a ``{bin label: count}`` dict, and a frame of interval edges.
    """
    # pd.cut returns a Categorical for ndarray input, and
    # Categorical.value_counts() has no ``sort`` keyword (passing it raises
    # TypeError).  Counting through a Series keeps ``sort=False`` explicit,
    # so the counts stay in bin order.
    cut_result = pd.cut(values, num_bins)
    counts = pd.Series(cut_result).value_counts(sort=False)

    # Summary DataFrame equivalent to cutBinsToFrame.
    summary = pd.DataFrame({
        "bin": counts.index.astype(str),
        "left": [iv.left for iv in counts.index],
        "right": [iv.right for iv in counts.index],
        "count": counts.values,
        "frequency": counts.values / len(values),
    })

    # cutBinCounts equivalent: counts dict keyed by the bin label.
    count_dict = dict(zip(counts.index.astype(str), counts.values))

    # binEdges equivalent: DataFrame of interval edges.  The edge lists are
    # deliberately recomputed rather than reused from ``summary`` so the
    # measured work stays the same as three independent calls.
    edges = pd.DataFrame({
        "left": [iv.left for iv in counts.index],
        "right": [iv.right for iv in counts.index],
    })
    return summary, count_dict, edges


# Untimed warm-up passes.
for _ in range(WARMUP):
    summary, count_dict, edges = _run_once(data, NUM_BINS)

start = time.perf_counter()
for _ in range(ITERATIONS):
    summary, count_dict, edges = _run_once(data, NUM_BINS)
total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

print(json.dumps({
    "function": "cut_bins_to_frame",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))
35 changes: 35 additions & 0 deletions benchmarks/pandas/bench_math_ops.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Benchmark: math_ops — abs / round on Series and DataFrame of 100k rows."""
import json, time
import numpy as np
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 50

# Series with negative values at even positions and positive values at odd
# positions; the magnitude at position i is i + 0.567.
_magnitude = np.arange(SIZE) + 0.567
s = pd.Series(np.where(np.arange(SIZE) % 2 == 0, -_magnitude, _magnitude))

# Two float columns: "a" is entirely negative, "b" entirely positive.
df = pd.DataFrame({
    "a": -(np.arange(SIZE) + 0.123),
    "b": np.arange(SIZE) + 0.456,
})


def _run_once():
    """Run abs and round(1) on both the Series and the DataFrame."""
    s.abs()
    df.abs()
    s.round(1)
    df.round(1)


# Untimed warm-up passes.
for _ in range(WARMUP):
    _run_once()

start = time.perf_counter()
for _ in range(ITERATIONS):
    _run_once()
total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

report = {
    "function": "math_ops",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}
print(json.dumps(report))
42 changes: 42 additions & 0 deletions benchmarks/pandas/bench_na_ops.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""Benchmark: na_ops — isna / notna / ffill / bfill on 100k rows."""
import json, time
import numpy as np
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 50

_positions = np.arange(SIZE)

# float64 Series holding 0.0, 1.0, 2.0, ... with NaN at every 5th position.
# Built directly with NumPy instead of going through a Python list of pd.NA,
# a nullable Int64 array, a float64 cast and a second NaN assignment; the
# resulting values are the same.
s = pd.Series(np.where(_positions % 5 == 0, np.nan, _positions.astype("float64")))

# Column "b" holds 2 * position as float, with NaN at every 7th position.
df = pd.DataFrame({
    "a": s,
    "b": pd.Series(np.where(_positions % 7 == 0, np.nan, _positions * 2.0)),
})


def _run_once():
    """Run the missing-value operations once.

    Shared by the warm-up and the timed loop so both execute identical work.
    """
    pd.isna(s)
    pd.notna(s)
    s.ffill()
    s.bfill()
    df.ffill()
    df.bfill()


# Untimed warm-up passes.
for _ in range(WARMUP):
    _run_once()

start = time.perf_counter()
for _ in range(ITERATIONS):
    _run_once()
total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

print(json.dumps({
    "function": "na_ops",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))
36 changes: 36 additions & 0 deletions benchmarks/pandas/bench_notna_boolean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""Benchmark: notna_boolean — boolean-mask indexing on 100k rows."""
import json, time
import numpy as np
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 50

s = pd.Series(np.arange(SIZE))

# Boolean Series selecting even positions, and a plain NumPy boolean array
# selecting positions that are not multiples of 3.
mask = pd.Series(np.arange(SIZE) % 2 == 0)
bool_arr = np.arange(SIZE) % 3 != 0

df = pd.DataFrame({"a": np.arange(SIZE), "b": np.arange(SIZE) * 2})


def _run_once():
    """Index the Series with a mask and its inverse, and the frame with an array."""
    s[mask]
    s[~mask]
    df[bool_arr]


# Untimed warm-up passes.
for _ in range(WARMUP):
    _run_once()

start = time.perf_counter()
for _ in range(ITERATIONS):
    _run_once()
total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

report = {
    "function": "notna_boolean",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}
print(json.dumps(report))
52 changes: 52 additions & 0 deletions benchmarks/pandas/bench_read_html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""
Benchmark: pd.read_html — parse HTML tables into DataFrames.
Outputs JSON: {"function": "read_html", "mean_ms": ..., "iterations": ..., "total_ms": ...}
"""
import json
import time
import math

try:
import pandas as pd
except ImportError:
import subprocess, sys
subprocess.check_call([sys.executable, "-m", "pip", "install", "pandas", "--quiet"])
import pandas as pd

try:
import lxml # noqa: F401
except ImportError:
import subprocess, sys
subprocess.check_call([sys.executable, "-m", "pip", "install", "lxml", "--quiet"])

# Benchmark parameters.
ROWS = 1_000  # number of data rows in the generated HTML table
WARMUP = 3  # untimed parse calls made before measurement starts
ITERATIONS = 20  # timed parse calls; mean_ms is total_ms / ITERATIONS


def build_html(rows: int) -> str:
    """Return an HTML ``<table>`` with a header row and *rows* data rows.

    Each data row has four cells: the row number ``i``, ``item_<i % 100>``,
    ``i * 1.5`` with two decimals, and ``sin(i * 0.01)`` with six decimals.
    """
    header = "<tr><th>id</th><th>name</th><th>value</th><th>score</th></tr>"

    def render_row(i: int) -> str:
        return f"<tr><td>{i}</td><td>item_{i % 100}</td><td>{i * 1.5:.2f}</td><td>{math.sin(i * 0.01):.6f}</td></tr>"

    body = "".join(map(render_row, range(rows)))
    return f"<table><thead>{header}</thead><tbody>{body}</tbody></table>"


# Imported here because only the parse calls in this section need it.
from io import StringIO

html = build_html(ROWS)

# Passing literal HTML text to pd.read_html is deprecated since pandas 2.1,
# which asks for the text to be wrapped in a StringIO object instead.  A new
# StringIO is created per call because parsing consumes the buffer.

# Warm-up
for _ in range(WARMUP):
    pd.read_html(StringIO(html))

start = time.perf_counter()
for _ in range(ITERATIONS):
    pd.read_html(StringIO(html))
total_ms = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

print(json.dumps({
    "function": "read_html",
    "mean_ms": total_ms / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total_ms,
}))
37 changes: 37 additions & 0 deletions benchmarks/pandas/bench_reduce_ops.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""Benchmark: reduce_ops — nunique / any / all on Series and DataFrame of 100k rows."""
import json, time
import numpy as np
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 50

# Integer Series cycling through 0..999.
s = pd.Series(np.arange(SIZE) % 1000)

# Boolean Series that is False only at position 0.
bool_s = pd.Series(np.arange(SIZE) > 0)

# Three integer columns cycling through 500, 200 and 100 distinct values.
df = pd.DataFrame({
    "a": np.arange(SIZE) % 500,
    "b": np.arange(SIZE) % 200,
    "c": np.arange(SIZE) % 100,
})


def _run_once():
    """Run the nunique / any / all reductions once."""
    s.nunique()
    bool_s.any()
    bool_s.all()
    df.nunique()


# Untimed warm-up passes.
for _ in range(WARMUP):
    _run_once()

start = time.perf_counter()
for _ in range(ITERATIONS):
    _run_once()
total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

report = {
    "function": "reduce_ops",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}
print(json.dumps(report))
36 changes: 36 additions & 0 deletions benchmarks/pandas/bench_rename_ops.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""Benchmark: rename_ops — rename / add_prefix / add_suffix on Series/DataFrame of 100k rows."""
import json, time
import numpy as np
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 50

# Series with string labels "row_0" ... "row_<SIZE - 1>".
_labels = [f"row_{i}" for i in range(SIZE)]
s = pd.Series(np.arange(SIZE), index=_labels)

df = pd.DataFrame({
    "col_a": np.arange(SIZE),
    "col_b": np.arange(SIZE) * 2,
    "col_c": np.arange(SIZE) * 3,
})


def _run_once():
    """Run the four renaming operations once."""
    # Callable rename: applied to each index label of the Series.
    s.rename(lambda lbl: f"new_{lbl}")
    # Mapping rename: only the listed columns are renamed.
    df.rename(columns={"col_a": "a", "col_b": "b"})
    df.add_prefix("pre_")
    df.add_suffix("_suf")


# Untimed warm-up passes.
for _ in range(WARMUP):
    _run_once()

start = time.perf_counter()
for _ in range(ITERATIONS):
    _run_once()
total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

report = {
    "function": "rename_ops",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}
print(json.dumps(report))
41 changes: 41 additions & 0 deletions benchmarks/pandas/bench_to_json_denormalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""Benchmark: to_json_denormalize — json orient variants on 10k-row DataFrame."""
import json, time
import numpy as np
import pandas as pd

ROWS = 10_000
WARMUP = 5
ITERATIONS = 30

# DataFrame matching the tsb benchmark: flat columns whose dotted names
# ("address.city", "address.zip") look like flattened nested fields.
df = pd.DataFrame({
    "name": [f"user_{i}" for i in range(ROWS)],
    "address.city": [f"city_{i % 100}" for i in range(ROWS)],
    "address.zip": [str(10000 + (i % 9000)) for i in range(ROWS)],
    "score": np.arange(ROWS) * 0.01,
})


def _run_once():
    """Export the frame once as a list of records and as three JSON orients.

    Returns the records list produced by ``to_dict("records")``.
    """
    # pandas stand-in for toJsonDenormalize: a list of per-row dicts.
    records = df.to_dict("records")
    # pandas stand-ins for toJsonRecords, toJsonSplit and toJsonIndex.
    df.to_json(orient="records")
    df.to_json(orient="split")
    df.to_json(orient="index")
    return records


# Untimed warm-up passes.
for _ in range(WARMUP):
    recs = _run_once()

start = time.perf_counter()
for _ in range(ITERATIONS):
    recs = _run_once()
total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

report = {
    "function": "to_json_denormalize",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}
print(json.dumps(report))
28 changes: 28 additions & 0 deletions benchmarks/pandas/bench_value_counts_full.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Benchmark: value_counts_full — value_counts(bins=N) on Series of 100k rows."""
import json, time
import numpy as np
import pandas as pd

SIZE = 100_000
WARMUP = 5
ITERATIONS = 50

# Seeded generator so every run bins the same values, drawn from [0, 100).
rng = np.random.default_rng(42)
s = pd.Series(rng.random(SIZE) * 100)


def _run_once():
    """Count values in 10 and then 20 bins."""
    s.value_counts(bins=10)
    s.value_counts(bins=20)


# Untimed warm-up passes.
for _ in range(WARMUP):
    _run_once()

start = time.perf_counter()
for _ in range(ITERATIONS):
    _run_once()
total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

report = {
    "function": "value_counts_full",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}
print(json.dumps(report))
Loading