Skip to content

Commit f193631

Browse files
authored
bpo-46841: Use inline caching for calls (pythonGH-31709)
1 parent 105b9ac commit f193631

16 files changed

+491
-732
lines changed

Include/cpython/code.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ struct PyCodeObject {
105105
/* Quickened instructions and cache, or NULL
106106
This should be treated as opaque by all code except the specializer and
107107
interpreter. */
108-
union _cache_or_instruction *co_quickened;
108+
_Py_CODEUNIT *co_quickened;
109109

110110
};
111111

Include/internal/pycore_code.h

Lines changed: 26 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -8,50 +8,10 @@ extern "C" {
88
* Specialization and quickening structs and helper functions
99
*/
1010

11-
typedef struct {
12-
int32_t cache_count;
13-
int32_t _; /* Force 8 byte size */
14-
} _PyEntryZero;
15-
16-
typedef struct {
17-
uint8_t original_oparg;
18-
uint8_t counter;
19-
uint16_t index;
20-
uint32_t version;
21-
} _PyAdaptiveEntry;
2211

23-
typedef struct {
24-
/* Borrowed ref */
25-
PyObject *obj;
26-
} _PyObjectCache;
27-
28-
typedef struct {
29-
uint32_t func_version;
30-
uint16_t min_args;
31-
uint16_t defaults_len;
32-
} _PyCallCache;
33-
34-
35-
/* Add specialized versions of entries to this union.
36-
*
37-
* Do not break the invariant: sizeof(SpecializedCacheEntry) == 8
38-
* Preserving this invariant is necessary because:
39-
- If any one form uses more space, then all must and on 64 bit machines
40-
this is likely to double the memory consumption of caches
41-
- The function for calculating the offset of caches assumes a 4:1
42-
cache:instruction size ratio. Changing that would need careful
43-
analysis to choose a new function.
44-
*/
45-
typedef union {
46-
_PyEntryZero zero;
47-
_PyAdaptiveEntry adaptive;
48-
_PyObjectCache obj;
49-
_PyCallCache call;
50-
} SpecializedCacheEntry;
51-
52-
#define INSTRUCTIONS_PER_ENTRY (sizeof(SpecializedCacheEntry)/sizeof(_Py_CODEUNIT))
53-
54-
/* Inline caches */
12+
// Inline caches. If you change the number of cache entries for an instruction,
13+
// you must *also* update the number of cache entries in Lib/opcode.py and bump
14+
// the magic number in Lib/importlib/_bootstrap_external.py!
5515

5616
#define CACHE_ENTRIES(cache) (sizeof(cache)/sizeof(_Py_CODEUNIT))
5717

@@ -112,73 +72,22 @@ typedef struct {
11272

11373
#define INLINE_CACHE_ENTRIES_LOAD_METHOD CACHE_ENTRIES(_PyLoadMethodCache)
11474

115-
/* Maximum size of code to quicken, in code units. */
116-
#define MAX_SIZE_TO_QUICKEN 5000
117-
118-
typedef union _cache_or_instruction {
119-
_Py_CODEUNIT code[1];
120-
SpecializedCacheEntry entry;
121-
} SpecializedCacheOrInstruction;
75+
typedef struct {
76+
_Py_CODEUNIT counter;
77+
_Py_CODEUNIT func_version[2];
78+
_Py_CODEUNIT min_args;
79+
} _PyCallCache;
12280

123-
/* Get pointer to the nth cache entry, from the first instruction and n.
124-
* Cache entries are indexed backwards, with [count-1] first in memory, and [0] last.
125-
* The zeroth entry immediately precedes the instructions.
126-
*/
127-
static inline SpecializedCacheEntry *
128-
_GetSpecializedCacheEntry(const _Py_CODEUNIT *first_instr, Py_ssize_t n)
129-
{
130-
SpecializedCacheOrInstruction *last_cache_plus_one = (SpecializedCacheOrInstruction *)first_instr;
131-
assert(&last_cache_plus_one->code[0] == first_instr);
132-
return &last_cache_plus_one[-1-n].entry;
133-
}
81+
#define INLINE_CACHE_ENTRIES_CALL CACHE_ENTRIES(_PyCallCache)
13482

135-
/* Following two functions form a pair.
136-
*
137-
* oparg_from_offset_and_index() is used to compute the oparg
138-
* when quickening, so that offset_from_oparg_and_nexti()
139-
* can be used at runtime to compute the offset.
140-
*
141-
* The relationship between the three values is currently
142-
* offset == (index>>1) + oparg
143-
* This relation is chosen based on the following observations:
144-
* 1. typically 1 in 4 instructions need a cache
145-
* 2. instructions that need a cache typically use 2 entries
146-
* These observations imply: offset ≈ index/2
147-
* We use the oparg to fine tune the relation to avoid wasting space
148-
* and allow consecutive instructions to use caches.
149-
*
150-
* If the number of cache entries < number of instructions/2 we will waste
151-
some small amount of space.
152-
* If the number of cache entries > (number of instructions/2) + 255, then
153-
* some instructions will not be able to use a cache.
154-
* In practice, we expect some small amount of wasted space in shorter functions
155-
* and only functions exceeding 1000 lines not to have enough cache space.
156-
*
157-
*/
158-
static inline int
159-
oparg_from_offset_and_nexti(int offset, int nexti)
160-
{
161-
return offset-(nexti>>1);
162-
}
83+
typedef struct {
84+
_Py_CODEUNIT counter;
85+
} _PyPrecallCache;
16386

164-
static inline int
165-
offset_from_oparg_and_nexti(int oparg, int nexti)
166-
{
167-
return (nexti>>1)+oparg;
168-
}
87+
#define INLINE_CACHE_ENTRIES_PRECALL CACHE_ENTRIES(_PyPrecallCache)
16988

170-
/* Get pointer to the cache entry associated with an instruction.
171-
* nexti is the index of the instruction plus one.
172-
* nexti is used as it corresponds to the instruction pointer in the interpreter.
173-
* This doesn't check that an entry has been allocated for that instruction. */
174-
static inline SpecializedCacheEntry *
175-
_GetSpecializedCacheEntryForInstruction(const _Py_CODEUNIT *first_instr, int nexti, int oparg)
176-
{
177-
return _GetSpecializedCacheEntry(
178-
first_instr,
179-
offset_from_oparg_and_nexti(oparg, nexti)
180-
);
181-
}
89+
/* Maximum size of code to quicken, in code units. */
90+
#define MAX_SIZE_TO_QUICKEN 10000
18291

18392
#define QUICKENING_WARMUP_DELAY 8
18493

@@ -205,6 +114,13 @@ _Py_IncrementCountAndMaybeQuicken(PyCodeObject *code)
205114

206115
extern Py_ssize_t _Py_QuickenedCount;
207116

117+
// Borrowed references to common callables:
118+
struct callable_cache {
119+
PyObject *isinstance;
120+
PyObject *len;
121+
PyObject *list_append;
122+
};
123+
208124
/* "Locals plus" for a code object is the set of locals + cell vars +
209125
* free vars. This relates to variable names as well as offsets into
210126
* the "fast locals" storage array of execution frames. The compiler
@@ -332,11 +248,6 @@ extern int _PyLineTable_PreviousAddressRange(PyCodeAddressRange *range);
332248

333249
#define ADAPTIVE_CACHE_BACKOFF 64
334250

335-
static inline void
336-
cache_backoff(_PyAdaptiveEntry *entry) {
337-
entry->counter = ADAPTIVE_CACHE_BACKOFF;
338-
}
339-
340251
/* Specialization functions */
341252

342253
extern int _Py_Specialize_LoadAttr(PyObject *owner, _Py_CODEUNIT *instr,
@@ -348,10 +259,10 @@ extern int _Py_Specialize_LoadMethod(PyObject *owner, _Py_CODEUNIT *instr,
348259
PyObject *name);
349260
extern int _Py_Specialize_BinarySubscr(PyObject *sub, PyObject *container, _Py_CODEUNIT *instr);
350261
extern int _Py_Specialize_StoreSubscr(PyObject *container, PyObject *sub, _Py_CODEUNIT *instr);
351-
extern int _Py_Specialize_Call(PyObject *callable, _Py_CODEUNIT *instr, int nargs,
352-
PyObject *kwnames, SpecializedCacheEntry *cache);
353-
extern int _Py_Specialize_Precall(PyObject *callable, _Py_CODEUNIT *instr, int nargs,
354-
PyObject *kwnames, SpecializedCacheEntry *cache, PyObject *builtins);
262+
extern int _Py_Specialize_Call(PyObject *callable, _Py_CODEUNIT *instr,
263+
int nargs, PyObject *kwnames);
264+
extern int _Py_Specialize_Precall(PyObject *callable, _Py_CODEUNIT *instr,
265+
int nargs, PyObject *kwnames, int oparg);
355266
extern void _Py_Specialize_BinaryOp(PyObject *lhs, PyObject *rhs, _Py_CODEUNIT *instr,
356267
int oparg);
357268
extern void _Py_Specialize_CompareOp(PyObject *lhs, PyObject *rhs,

Include/internal/pycore_global_strings.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,7 @@ struct _Py_global_strings {
269269
STRUCT_FOR_ID(inf)
270270
STRUCT_FOR_ID(intersection)
271271
STRUCT_FOR_ID(isatty)
272+
STRUCT_FOR_ID(isinstance)
272273
STRUCT_FOR_ID(items)
273274
STRUCT_FOR_ID(iter)
274275
STRUCT_FOR_ID(join)
@@ -278,6 +279,7 @@ struct _Py_global_strings {
278279
STRUCT_FOR_ID(last_type)
279280
STRUCT_FOR_ID(last_value)
280281
STRUCT_FOR_ID(latin1)
282+
STRUCT_FOR_ID(len)
281283
STRUCT_FOR_ID(line)
282284
STRUCT_FOR_ID(lineno)
283285
STRUCT_FOR_ID(listcomp)

Include/internal/pycore_interp.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ extern "C" {
1212

1313
#include "pycore_atomic.h" // _Py_atomic_address
1414
#include "pycore_ast_state.h" // struct ast_state
15+
#include "pycore_code.h" // struct callable_cache
1516
#include "pycore_context.h" // struct _Py_context_state
1617
#include "pycore_dict.h" // struct _Py_dict_state
1718
#include "pycore_exceptions.h" // struct _Py_exc_state
@@ -176,6 +177,7 @@ struct _is {
176177

177178
struct ast_state ast;
178179
struct type_cache type_cache;
180+
struct callable_cache callable_cache;
179181

180182
/* The following fields are here to avoid allocation during init.
181183
The data is exposed through PyInterpreterState pointer fields.

Include/internal/pycore_runtime_init.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -884,6 +884,7 @@ extern "C" {
884884
INIT_ID(inf), \
885885
INIT_ID(intersection), \
886886
INIT_ID(isatty), \
887+
INIT_ID(isinstance), \
887888
INIT_ID(items), \
888889
INIT_ID(iter), \
889890
INIT_ID(join), \
@@ -893,6 +894,7 @@ extern "C" {
893894
INIT_ID(last_type), \
894895
INIT_ID(last_value), \
895896
INIT_ID(latin1), \
897+
INIT_ID(len), \
896898
INIT_ID(line), \
897899
INIT_ID(lineno), \
898900
INIT_ID(listcomp), \

0 commit comments

Comments
 (0)