Skip to content

Commit c6a8fb0

Browse files
Merge pull request #508 from andfoy/free_threaded_example
Add a benchmark/example for numexpr usage under free-threading conditions
2 parents d296d27 + 462dd17 commit c6a8fb0

File tree

2 files changed

+174
-7
lines changed

2 files changed

+174
-7
lines changed

bench/free_threading.py

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
#################################################################################
2+
# To compare the performance of numexpr when free-threading CPython is used.
3+
#
4+
# This example makes use of Python threads, as opposed to C native ones
5+
# in order to highlight the improvement introduced by free-threading CPython,
6+
# which now disables the GIL altogether.
7+
#################################################################################
8+
"""
9+
Results with GIL-enabled CPython:
10+
11+
Benchmarking Expression 1:
12+
NumPy time (threaded over 32 chunks with 16 threads): 1.173090 seconds
13+
numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 0.951071 seconds
14+
numexpr speedup: 1.23x
15+
----------------------------------------
16+
Benchmarking Expression 2:
17+
NumPy time (threaded over 32 chunks with 16 threads): 10.410874 seconds
18+
numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 8.248753 seconds
19+
numexpr speedup: 1.26x
20+
----------------------------------------
21+
Benchmarking Expression 3:
22+
NumPy time (threaded over 32 chunks with 16 threads): 9.605909 seconds
23+
numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 11.087108 seconds
24+
numexpr speedup: 0.87x
25+
----------------------------------------
26+
Benchmarking Expression 4:
27+
NumPy time (threaded over 32 chunks with 16 threads): 3.836962 seconds
28+
numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 18.054531 seconds
29+
numexpr speedup: 0.21x
30+
----------------------------------------
31+
32+
Results with free-threading CPython:
33+
34+
Benchmarking Expression 1:
35+
NumPy time (threaded over 32 chunks with 16 threads): 3.415349 seconds
36+
numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 2.618876 seconds
37+
numexpr speedup: 1.30x
38+
----------------------------------------
39+
Benchmarking Expression 2:
40+
NumPy time (threaded over 32 chunks with 16 threads): 19.005238 seconds
41+
numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 12.611407 seconds
42+
numexpr speedup: 1.51x
43+
----------------------------------------
44+
Benchmarking Expression 3:
45+
NumPy time (threaded over 32 chunks with 16 threads): 20.555149 seconds
46+
numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 17.690749 seconds
47+
numexpr speedup: 1.16x
48+
----------------------------------------
49+
Benchmarking Expression 4:
50+
NumPy time (threaded over 32 chunks with 16 threads): 38.338372 seconds
51+
numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 35.074684 seconds
52+
numexpr speedup: 1.09x
53+
----------------------------------------
54+
"""
55+
56+
import os

# numexpr reads NUMEXPR_NUM_THREADS at import time, so it must be set before
# the ``import numexpr`` below.  The pool is kept small because (per the
# header comment) the parallelism under test here comes from Python threads,
# not from numexpr's own native threads.
os.environ["NUMEXPR_NUM_THREADS"] = "2"
import threading
import timeit

import numpy as np

import numexpr as ne
65+
66+
# Benchmark configuration.
array_size = 10**8  # total elements per operand array
num_runs = 10  # timeit repetitions per chunk
num_chunks = 32  # Number of chunks
num_threads = 16  # Number of threads constrained by how many chunks memory can hold

# Operand arrays, reshaped to 2-D with 10**4 rows each.
a = np.random.rand(array_size).reshape(10**4, -1)
b = np.random.rand(array_size).reshape(10**4, -1)
c = np.random.rand(array_size).reshape(10**4, -1)

# Chunks slice the operands along the first axis, so the chunk size must be
# expressed in ROWS of the reshaped arrays.  The previous
# ``array_size // num_chunks`` was in elements (3,125,000), which made chunk 0
# span the entire 10**4-row array and every other chunk an empty slice, so
# only one chunk was actually benchmarked.
chunk_size = a.shape[0] // num_chunks
76+
77+
# NumPy callables for the four benchmark expressions (element-wise math).
expressions_numpy = [
    lambda a, b, c: a + b * c,
    lambda a, b, c: a**2 + b**2 - 2 * a * b * np.cos(c),
    lambda a, b, c: np.sin(a) + np.log(b) * np.sqrt(c),
    lambda a, b, c: np.exp(a) + np.tan(b) - np.sinh(c),
]

# The same four expressions as numexpr source strings; must stay in the same
# order as ``expressions_numpy`` so index i compares like with like.
expressions_numexpr = [
    "a + b * c",
    "a**2 + b**2 - 2 * a * b * cos(c)",
    "sin(a) + log(b) * sqrt(c)",
    "exp(a) + tan(b) - sinh(c)",
]
90+
91+
92+
def benchmark_numpy_chunk(func, a, b, c, results, indices):
    """Time ``func`` over each chunk in ``indices``, appending to ``results``.

    ``func`` is called on first-axis slices of ``a``, ``b`` and ``c``; each
    chunk's total time over ``num_runs`` repetitions is appended to the
    shared ``results`` list (``list.append`` is atomic in CPython, so
    multiple threads may share one list).
    """
    for chunk in indices:
        lo = chunk * chunk_size
        hi = lo + chunk_size
        elapsed = timeit.timeit(
            lambda: func(a[lo:hi], b[lo:hi], c[lo:hi]),
            number=num_runs,
        )
        results.append(elapsed)
100+
101+
102+
def benchmark_numexpr_re_evaluate(expr, a, b, c, results, indices):
    """Time numexpr evaluation of ``expr`` over the given chunk indices.

    Parameters
    ----------
    expr : str
        numexpr expression source referencing the names ``a``, ``b``, ``c``.
    a, b, c : numpy.ndarray
        Operand arrays, sliced along the first axis per chunk.
    results : list
        Shared output list; each chunk's total time over ``num_runs``
        repetitions is appended (``list.append`` is atomic in CPython).
    indices : iterable of int
        Chunk indices this worker thread is responsible for.
    """
    # (Removed dead commented-out "evaluate first chunk" scaffolding left
    # over from an earlier revision.)
    for index in indices:
        start = index * chunk_size
        end = (index + 1) * chunk_size
        time_taken = timeit.timeit(
            lambda: ne.evaluate(
                expr,
                local_dict={
                    "a": a[start:end],
                    "b": b[start:end],
                    "c": c[start:end],
                },
            ),
            number=num_runs,
        )
        results.append(time_taken)
120+
121+
122+
def run_benchmark_threaded():
    """Benchmark each expression with NumPy and numexpr across Python threads.

    For every expression pair the chunk indices are distributed round-robin
    over ``num_threads`` threads; the summed per-chunk times and the
    NumPy/numexpr speedup ratio are printed.
    """
    chunk_indices = list(range(num_chunks))

    def _timed_fanout(worker, expr):
        # Fan the chunks out over num_threads threads; each worker appends
        # its per-chunk timings to the shared list, which we then sum.
        # (The original duplicated this loop verbatim for NumPy and numexpr.)
        results = []
        threads = []
        for j in range(num_threads):
            indices = chunk_indices[j::num_threads]  # Distribute chunks across threads
            thread = threading.Thread(
                target=worker,
                args=(expr, a, b, c, results, indices),
            )
            threads.append(thread)
            thread.start()
        for thread in threads:
            thread.join()
        return sum(results)

    for i in range(len(expressions_numpy)):
        print(f"Benchmarking Expression {i+1}:")

        numpy_time = _timed_fanout(benchmark_numpy_chunk, expressions_numpy[i])
        print(
            f"NumPy time (threaded over {num_chunks} chunks with {num_threads} threads): {numpy_time:.6f} seconds"
        )

        numexpr_time = _timed_fanout(
            benchmark_numexpr_re_evaluate, expressions_numexpr[i]
        )
        print(
            f"numexpr time (threaded with re_evaluate over {num_chunks} chunks with {num_threads} threads): {numexpr_time:.6f} seconds"
        )
        print(f"numexpr speedup: {numpy_time / numexpr_time:.2f}x")
        print("-" * 40)
168+
169+
170+
# Script entry point: run the full threaded benchmark suite.
if __name__ == "__main__":
    run_benchmark_threaded()

numexpr/necompiler.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -775,14 +775,12 @@ def getArguments(names, local_dict=None, global_dict=None, _frame_depth: int=2):
775775

776776

777777
# Dictionaries for caching variable names and compiled expressions
778-
# _names_cache = CacheDict(256)
779778
_names_cache = threading.local()
780-
# _numexpr_cache = CacheDict(256)
781779
_numexpr_cache = threading.local()
782-
# _numexpr_last = ContextDict()
783780
_numexpr_last = threading.local()
784781
evaluate_lock = threading.Lock()
785782

783+
786784
def validate(ex: str,
787785
local_dict: Optional[Dict] = None,
788786
global_dict: Optional[Dict] = None,
@@ -856,7 +854,6 @@ def validate(ex: str,
856854
----
857855
858856
"""
859-
global _numexpr_last
860857
if not hasattr(_numexpr_last, 'l'):
861858
_numexpr_last.l = ContextDict()
862859

@@ -998,7 +995,6 @@ def re_evaluate(local_dict: Optional[Dict] = None,
998995
The calling frame depth. Unless you are a NumExpr developer you should
999996
not set this value.
1000997
"""
1001-
global _numexpr_last
1002998
if not hasattr(_numexpr_last, 'l'):
1003999
_numexpr_last.l = ContextDict()
10041000

@@ -1009,5 +1005,5 @@ def re_evaluate(local_dict: Optional[Dict] = None,
10091005
argnames = _numexpr_last.l['argnames']
10101006
args = getArguments(argnames, local_dict, global_dict, _frame_depth=_frame_depth)
10111007
kwargs = _numexpr_last.l['kwargs']
1012-
with evaluate_lock:
1013-
return compiled_ex(*args, **kwargs)
1008+
# with evaluate_lock:
1009+
return compiled_ex(*args, **kwargs)

0 commit comments

Comments
 (0)