diff --git a/Include/cpython/code.h b/Include/cpython/code.h
index 2561b2b88ba..370f1d259ab 100644
--- a/Include/cpython/code.h
+++ b/Include/cpython/code.h
@@ -72,6 +72,24 @@ typedef struct {
uint8_t *per_instruction_tools;
} _PyCoMonitoringData;
+#ifdef Py_GIL_DISABLED
+
+/* Each thread specializes a thread-local copy of the bytecode in free-threaded
+ * builds. These copies are stored on the code object in a `_PyCodeArray`. The
+ * first entry in the array always points to the "main" copy of the bytecode
+ * that is stored at the end of the code object.
+ */
+typedef struct {
+ Py_ssize_t size;
+ char *entries[1];
+} _PyCodeArray;
+
+#define _PyCode_DEF_THREAD_LOCAL_BYTECODE() \
+ _PyCodeArray *co_tlbc;
+#else
+#define _PyCode_DEF_THREAD_LOCAL_BYTECODE()
+#endif
+
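For orientation, a minimal sketch (not part of this patch) of how a thread's copy is found in this array; it assumes `idx` is the index the thread reserved at startup and mirrors `_PyCode_GetTLBCFast()` in pycore_code.h. Entry 0 aliases `co_code_adaptive`, so single-threaded programs never copy bytecode:

    /* Illustrative sketch only; the real lookup loads co_tlbc with an
     * acquire atomic. */
    static inline _Py_CODEUNIT *
    tlbc_lookup_sketch(PyCodeObject *co, int32_t idx)
    {
        _PyCodeArray *arr = co->co_tlbc;   /* entry 0 aliases co_code_adaptive */
        if (idx < arr->size && arr->entries[idx] != NULL) {
            return (_Py_CODEUNIT *)arr->entries[idx];
        }
        return NULL;   /* this thread's copy has not been created yet */
    }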
// To avoid repeating ourselves in deepfreeze.py, all PyCodeObject members are
// defined in this macro:
#define _PyCode_DEF(SIZE) { \
@@ -138,6 +156,7 @@ typedef struct {
Type is a void* to keep the format private in codeobject.c to force \
people to go through the proper APIs. */ \
void *co_extra; \
+ _PyCode_DEF_THREAD_LOCAL_BYTECODE() \
char co_code_adaptive[(SIZE)]; \
}
diff --git a/Include/cpython/initconfig.h b/Include/cpython/initconfig.h
index c2cb4e3cdd9..f69c586a4f9 100644
--- a/Include/cpython/initconfig.h
+++ b/Include/cpython/initconfig.h
@@ -183,6 +183,7 @@ typedef struct PyConfig {
int cpu_count;
#ifdef Py_GIL_DISABLED
int enable_gil;
+ int tlbc_enabled;
#endif
/* --- Path configuration inputs ------------ */
diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h
index 411bbff106d..80bd19a8878 100644
--- a/Include/internal/pycore_ceval.h
+++ b/Include/internal/pycore_ceval.h
@@ -174,6 +174,18 @@ _PyEval_IsGILEnabled(PyThreadState *tstate)
extern int _PyEval_EnableGILTransient(PyThreadState *tstate);
extern int _PyEval_EnableGILPermanent(PyThreadState *tstate);
extern int _PyEval_DisableGIL(PyThreadState *state);
+
+
+static inline _Py_CODEUNIT *
+_PyEval_GetExecutableCode(PyThreadState *tstate, PyCodeObject *co)
+{
+ _Py_CODEUNIT *bc = _PyCode_GetTLBCFast(tstate, co);
+ if (bc != NULL) {
+ return bc;
+ }
+ return _PyCode_GetTLBC(co);
+}
+
#endif
extern void _PyEval_DeactivateOpCache(void);
diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h
index 57e0a14bb9b..a0acf76db6f 100644
--- a/Include/internal/pycore_code.h
+++ b/Include/internal/pycore_code.h
@@ -11,6 +11,7 @@ extern "C" {
#include "pycore_stackref.h" // _PyStackRef
#include "pycore_lock.h" // PyMutex
#include "pycore_backoff.h" // _Py_BackoffCounter
+#include "pycore_tstate.h" // _PyThreadStateImpl
/* Each instruction in a code object is a fixed-width value,
@@ -313,11 +314,17 @@ extern int _PyLineTable_PreviousAddressRange(PyCodeAddressRange *range);
/** API for executors */
extern void _PyCode_Clear_Executors(PyCodeObject *code);
+
#ifdef Py_GIL_DISABLED
// gh-115999 tracks progress on addressing this.
#define ENABLE_SPECIALIZATION 0
+// Use this to enable specialization families once they are thread-safe. All
+// uses will be replaced with ENABLE_SPECIALIZATION once all families are
+// thread-safe.
+#define ENABLE_SPECIALIZATION_FT 1
#else
#define ENABLE_SPECIALIZATION 1
+#define ENABLE_SPECIALIZATION_FT ENABLE_SPECIALIZATION
#endif
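For context, the intended usage pattern: a specialization family that has been made thread-safe switches its guard from ENABLE_SPECIALIZATION to ENABLE_SPECIALIZATION_FT, so it stays active on free-threaded builds while unconverted families remain disabled. A sketch of a specializing op body following this pattern (it mirrors the `_SPECIALIZE_BINARY_OP` change in bytecodes.c later in this diff; surrounding names are from the interpreter DSL):

    #if ENABLE_SPECIALIZATION_FT
        if (ADAPTIVE_COUNTER_TRIGGERS(counter)) {
            next_instr = this_instr;
            _Py_Specialize_BinaryOp(lhs, rhs, next_instr, oparg, LOCALS_ARRAY);
            DISPATCH_SAME_OPARG();
        }
        OPCODE_DEFERRED_INC(BINARY_OP);
        ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter);
    #endif /* ENABLE_SPECIALIZATION_FT */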
/* Specialization functions */
@@ -600,6 +607,40 @@ struct _PyCode8 _PyCode_DEF(8);
PyAPI_DATA(const struct _PyCode8) _Py_InitCleanup;
+#ifdef Py_GIL_DISABLED
+
+// Return a pointer to the thread-local bytecode for the current thread, if it
+// exists.
+static inline _Py_CODEUNIT *
+_PyCode_GetTLBCFast(PyThreadState *tstate, PyCodeObject *co)
+{
+ _PyCodeArray *code = _Py_atomic_load_ptr_acquire(&co->co_tlbc);
+ int32_t idx = ((_PyThreadStateImpl*) tstate)->tlbc_index;
+ if (idx < code->size && code->entries[idx] != NULL) {
+ return (_Py_CODEUNIT *) code->entries[idx];
+ }
+ return NULL;
+}
+
+// Return a pointer to the thread-local bytecode for the current thread,
+// creating it if necessary.
+extern _Py_CODEUNIT *_PyCode_GetTLBC(PyCodeObject *co);
+
+// Reserve an index for the current thread into thread-local bytecode
+// arrays
+//
+// Returns the reserved index or -1 on error.
+extern int32_t _Py_ReserveTLBCIndex(PyInterpreterState *interp);
+
+// Release the current thread's index into thread-local bytecode arrays
+extern void _Py_ClearTLBCIndex(_PyThreadStateImpl *tstate);
+
+// Free all TLBC copies not associated with live threads.
+//
+// Returns 0 on success or -1 on error.
+extern int _Py_ClearUnusedTLBC(PyInterpreterState *interp);
+#endif
+
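A short usage sketch of this API; the call sites shown are hypothetical, as the real hook points live in the thread-state lifecycle and GC code, which are not part of this hunk:

    /* Hypothetical call sites, for illustration only. */
    static void
    on_thread_created_sketch(PyInterpreterState *interp, _PyThreadStateImpl *ts)
    {
        ts->tlbc_index = _Py_ReserveTLBCIndex(interp);   /* -1 on error */
    }

    static void
    on_thread_destroyed_sketch(_PyThreadStateImpl *ts)
    {
        _Py_ClearTLBCIndex(ts);   /* index becomes reusable by new threads */
    }

    static void
    on_gc_sketch(PyInterpreterState *interp)
    {
        /* Reclaim copies owned by exited threads; returns 0 or -1. */
        (void)_Py_ClearUnusedTLBC(interp);
    }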
#ifdef __cplusplus
}
#endif
diff --git a/Include/internal/pycore_frame.h b/Include/internal/pycore_frame.h
index c9ac3819d03..8c0100390d0 100644
--- a/Include/internal/pycore_frame.h
+++ b/Include/internal/pycore_frame.h
@@ -68,6 +68,10 @@ typedef struct _PyInterpreterFrame {
PyObject *f_locals; /* Strong reference, may be NULL. Only valid if not on C stack */
PyFrameObject *frame_obj; /* Strong reference, may be NULL. Only valid if not on C stack */
_Py_CODEUNIT *instr_ptr; /* Instruction currently executing (or about to begin) */
+#ifdef Py_GIL_DISABLED
+ /* Index of thread-local bytecode containing instr_ptr. */
+ int32_t tlbc_index;
+#endif
_PyStackRef *stackpointer;
uint16_t return_offset; /* Only relevant during a function call */
char owner;
@@ -76,7 +80,7 @@ typedef struct _PyInterpreterFrame {
} _PyInterpreterFrame;
#define _PyInterpreterFrame_LASTI(IF) \
- ((int)((IF)->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(IF))))
+ ((int)((IF)->instr_ptr - _PyFrame_GetBytecode((IF))))
static inline PyCodeObject *_PyFrame_GetCode(_PyInterpreterFrame *f) {
PyObject *executable = PyStackRef_AsPyObjectBorrow(f->f_executable);
@@ -84,6 +88,19 @@ static inline PyCodeObject *_PyFrame_GetCode(_PyInterpreterFrame *f) {
return (PyCodeObject *)executable;
}
+static inline _Py_CODEUNIT *
+_PyFrame_GetBytecode(_PyInterpreterFrame *f)
+{
+#ifdef Py_GIL_DISABLED
+ PyCodeObject *co = _PyFrame_GetCode(f);
+ _PyCodeArray *tlbc = _Py_atomic_load_ptr_acquire(&co->co_tlbc);
+ assert(f->tlbc_index >= 0 && f->tlbc_index < tlbc->size);
+ return (_Py_CODEUNIT *)tlbc->entries[f->tlbc_index];
+#else
+ return _PyCode_CODE(_PyFrame_GetCode(f));
+#endif
+}
+
static inline PyFunctionObject *_PyFrame_GetFunction(_PyInterpreterFrame *f) {
PyObject *func = PyStackRef_AsPyObjectBorrow(f->f_funcobj);
assert(PyFunction_Check(func));
@@ -144,13 +161,33 @@ static inline void _PyFrame_Copy(_PyInterpreterFrame *src, _PyInterpreterFrame *
#endif
}
+#ifdef Py_GIL_DISABLED
+static inline void
+_PyFrame_InitializeTLBC(PyThreadState *tstate, _PyInterpreterFrame *frame,
+ PyCodeObject *code)
+{
+ _Py_CODEUNIT *tlbc = _PyCode_GetTLBCFast(tstate, code);
+ if (tlbc == NULL) {
+ // No thread-local bytecode exists for this thread yet; use the main
+ // thread's copy, deferring thread-local bytecode creation to the
+ // execution of RESUME.
+ frame->instr_ptr = _PyCode_CODE(code);
+ frame->tlbc_index = 0;
+ }
+ else {
+ frame->instr_ptr = tlbc;
+ frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index;
+ }
+}
+#endif
+
/* Consumes reference to func and locals.
Does not initialize frame->previous, which happens
when frame is linked into the frame stack.
*/
static inline void
_PyFrame_Initialize(
- _PyInterpreterFrame *frame, _PyStackRef func,
+ PyThreadState *tstate, _PyInterpreterFrame *frame, _PyStackRef func,
PyObject *locals, PyCodeObject *code, int null_locals_from, _PyInterpreterFrame *previous)
{
frame->previous = previous;
@@ -162,7 +199,12 @@ _PyFrame_Initialize(
frame->f_locals = locals;
frame->stackpointer = frame->localsplus + code->co_nlocalsplus;
frame->frame_obj = NULL;
+#ifdef Py_GIL_DISABLED
+ _PyFrame_InitializeTLBC(tstate, frame, code);
+#else
+ (void)tstate;
frame->instr_ptr = _PyCode_CODE(code);
+#endif
frame->return_offset = 0;
frame->owner = FRAME_OWNED_BY_THREAD;
@@ -224,7 +266,8 @@ _PyFrame_IsIncomplete(_PyInterpreterFrame *frame)
return true;
}
return frame->owner != FRAME_OWNED_BY_GENERATOR &&
- frame->instr_ptr < _PyCode_CODE(_PyFrame_GetCode(frame)) + _PyFrame_GetCode(frame)->_co_firsttraceable;
+ frame->instr_ptr < _PyFrame_GetBytecode(frame) +
+ _PyFrame_GetCode(frame)->_co_firsttraceable;
}
static inline _PyInterpreterFrame *
@@ -315,7 +358,8 @@ _PyFrame_PushUnchecked(PyThreadState *tstate, _PyStackRef func, int null_locals_
_PyInterpreterFrame *new_frame = (_PyInterpreterFrame *)tstate->datastack_top;
tstate->datastack_top += code->co_framesize;
assert(tstate->datastack_top < tstate->datastack_limit);
- _PyFrame_Initialize(new_frame, func, NULL, code, null_locals_from, previous);
+ _PyFrame_Initialize(tstate, new_frame, func, NULL, code, null_locals_from,
+ previous);
return new_frame;
}
@@ -339,7 +383,11 @@ _PyFrame_PushTrampolineUnchecked(PyThreadState *tstate, PyCodeObject *code, int
assert(stackdepth <= code->co_stacksize);
frame->stackpointer = frame->localsplus + code->co_nlocalsplus + stackdepth;
frame->frame_obj = NULL;
+#ifdef Py_GIL_DISABLED
+ _PyFrame_InitializeTLBC(tstate, frame, code);
+#else
frame->instr_ptr = _PyCode_CODE(code);
+#endif
frame->owner = FRAME_OWNED_BY_THREAD;
frame->return_offset = 0;
diff --git a/Include/internal/pycore_gc.h b/Include/internal/pycore_gc.h
index b85957df5a6..38a1c56c09d 100644
--- a/Include/internal/pycore_gc.h
+++ b/Include/internal/pycore_gc.h
@@ -389,6 +389,10 @@ extern int _PyGC_VisitStackRef(union _PyStackRef *ref, visitproc visit, void *ar
} \
} while (0)
+#ifdef Py_GIL_DISABLED
+extern void _PyGC_VisitObjectsWorldStopped(PyInterpreterState *interp,
+ gcvisitobjects_t callback, void *arg);
+#endif
#ifdef __cplusplus
}
diff --git a/Include/internal/pycore_index_pool.h b/Include/internal/pycore_index_pool.h
new file mode 100644
index 00000000000..e81bfd4d6ed
--- /dev/null
+++ b/Include/internal/pycore_index_pool.h
@@ -0,0 +1,56 @@
+#ifndef Py_INTERNAL_INDEX_POOL_H
+#define Py_INTERNAL_INDEX_POOL_H
+
+#include "Python.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+# error "this header requires Py_BUILD_CORE define"
+#endif
+
+#ifdef Py_GIL_DISABLED
+
+// This contains code for allocating unique indices into arrays. It is used by
+// the free-threaded build to assign each thread a globally unique index into
+// each code object's thread-local bytecode array.
+
+// A min-heap of indices
+typedef struct _PyIndexHeap {
+ int32_t *values;
+
+ // Number of items stored in values
+ Py_ssize_t size;
+
+ // Maximum number of items that can be stored in values
+ Py_ssize_t capacity;
+} _PyIndexHeap;
+
+// An unbounded pool of indices. Indices are allocated starting from 0. They
+// may be released back to the pool once they are no longer in use.
+typedef struct _PyIndexPool {
+ PyMutex mutex;
+
+ // Min heap of indices available for allocation
+ _PyIndexHeap free_indices;
+
+ // Next index to allocate if no free indices are available
+ int32_t next_index;
+} _PyIndexPool;
+
+// Allocate the smallest available index. Returns -1 on error.
+extern int32_t _PyIndexPool_AllocIndex(_PyIndexPool *indices);
+
+// Release `index` back to the pool
+extern void _PyIndexPool_FreeIndex(_PyIndexPool *indices, int32_t index);
+
+extern void _PyIndexPool_Fini(_PyIndexPool *indices);
+
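A usage sketch (illustrative only): because freed indices are returned to a min-heap, allocation always hands back the smallest available index, which keeps the per-code `co_tlbc` arrays from growing unnecessarily:

    _PyIndexPool pool = {0};                      /* zero-initialized, as in the interpreter state */
    int32_t a = _PyIndexPool_AllocIndex(&pool);   /* 0 */
    int32_t b = _PyIndexPool_AllocIndex(&pool);   /* 1 */
    _PyIndexPool_FreeIndex(&pool, a);             /* 0 goes back to the heap */
    int32_t c = _PyIndexPool_AllocIndex(&pool);   /* 0 again, not 2 */
    _PyIndexPool_Fini(&pool);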
+#endif // Py_GIL_DISABLED
+
+#ifdef __cplusplus
+}
+#endif
+#endif // !Py_INTERNAL_INDEX_POOL_H
diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h
index 36cd71e5a00..9e3b4299693 100644
--- a/Include/internal/pycore_interp.h
+++ b/Include/internal/pycore_interp.h
@@ -26,6 +26,7 @@ extern "C" {
#include "pycore_genobject.h" // _PyGen_FetchStopIterationValue
#include "pycore_global_objects.h"// struct _Py_interp_cached_objects
#include "pycore_import.h" // struct _import_state
+#include "pycore_index_pool.h" // _PyIndexPool
#include "pycore_instruments.h" // _PY_MONITORING_EVENTS
#include "pycore_list.h" // struct _Py_list_state
#include "pycore_mimalloc.h" // struct _mimalloc_interp_state
@@ -222,6 +223,7 @@ struct _is {
struct _brc_state brc; // biased reference counting state
struct _Py_unique_id_pool unique_ids; // object ids for per-thread refcounts
PyMutex weakref_locks[NUM_WEAKREF_LIST_LOCKS];
+ _PyIndexPool tlbc_indices;
#endif
// Per-interpreter state for the obmalloc allocator. For the main
diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h
index e0e7d5ebf09..b8bea72baea 100644
--- a/Include/internal/pycore_tstate.h
+++ b/Include/internal/pycore_tstate.h
@@ -42,6 +42,9 @@ typedef struct _PyThreadStateImpl {
int is_finalized;
} refcounts;
+ // Index to use to retrieve thread-local bytecode for this thread
+ int32_t tlbc_index;
+
// When >1, code objects do not immortalize their non-string constants.
int suppress_co_const_immortalization;
#endif
@@ -52,7 +55,6 @@ typedef struct _PyThreadStateImpl {
} _PyThreadStateImpl;
-
#ifdef __cplusplus
}
#endif
diff --git a/Include/internal/pycore_uop_ids.h b/Include/internal/pycore_uop_ids.h
index de628d240d1..55416d2aae1 100644
--- a/Include/internal/pycore_uop_ids.h
+++ b/Include/internal/pycore_uop_ids.h
@@ -193,106 +193,107 @@ extern "C" {
#define _LOAD_ATTR_SLOT_1 423
#define _LOAD_ATTR_WITH_HINT 424
#define _LOAD_BUILD_CLASS LOAD_BUILD_CLASS
+#define _LOAD_BYTECODE 425
#define _LOAD_COMMON_CONSTANT LOAD_COMMON_CONSTANT
#define _LOAD_CONST LOAD_CONST
#define _LOAD_CONST_IMMORTAL LOAD_CONST_IMMORTAL
-#define _LOAD_CONST_INLINE 425
-#define _LOAD_CONST_INLINE_BORROW 426
-#define _LOAD_CONST_INLINE_BORROW_WITH_NULL 427
-#define _LOAD_CONST_INLINE_WITH_NULL 428
+#define _LOAD_CONST_INLINE 426
+#define _LOAD_CONST_INLINE_BORROW 427
+#define _LOAD_CONST_INLINE_BORROW_WITH_NULL 428
+#define _LOAD_CONST_INLINE_WITH_NULL 429
#define _LOAD_DEREF LOAD_DEREF
-#define _LOAD_FAST 429
-#define _LOAD_FAST_0 430
-#define _LOAD_FAST_1 431
-#define _LOAD_FAST_2 432
-#define _LOAD_FAST_3 433
-#define _LOAD_FAST_4 434
-#define _LOAD_FAST_5 435
-#define _LOAD_FAST_6 436
-#define _LOAD_FAST_7 437
+#define _LOAD_FAST 430
+#define _LOAD_FAST_0 431
+#define _LOAD_FAST_1 432
+#define _LOAD_FAST_2 433
+#define _LOAD_FAST_3 434
+#define _LOAD_FAST_4 435
+#define _LOAD_FAST_5 436
+#define _LOAD_FAST_6 437
+#define _LOAD_FAST_7 438
#define _LOAD_FAST_AND_CLEAR LOAD_FAST_AND_CLEAR
#define _LOAD_FAST_CHECK LOAD_FAST_CHECK
#define _LOAD_FAST_LOAD_FAST LOAD_FAST_LOAD_FAST
#define _LOAD_FROM_DICT_OR_DEREF LOAD_FROM_DICT_OR_DEREF
#define _LOAD_FROM_DICT_OR_GLOBALS LOAD_FROM_DICT_OR_GLOBALS
-#define _LOAD_GLOBAL 438
-#define _LOAD_GLOBAL_BUILTINS 439
-#define _LOAD_GLOBAL_BUILTINS_FROM_KEYS 440
-#define _LOAD_GLOBAL_MODULE 441
-#define _LOAD_GLOBAL_MODULE_FROM_KEYS 442
+#define _LOAD_GLOBAL 439
+#define _LOAD_GLOBAL_BUILTINS 440
+#define _LOAD_GLOBAL_BUILTINS_FROM_KEYS 441
+#define _LOAD_GLOBAL_MODULE 442
+#define _LOAD_GLOBAL_MODULE_FROM_KEYS 443
#define _LOAD_LOCALS LOAD_LOCALS
#define _LOAD_NAME LOAD_NAME
-#define _LOAD_SMALL_INT 443
-#define _LOAD_SMALL_INT_0 444
-#define _LOAD_SMALL_INT_1 445
-#define _LOAD_SMALL_INT_2 446
-#define _LOAD_SMALL_INT_3 447
+#define _LOAD_SMALL_INT 444
+#define _LOAD_SMALL_INT_0 445
+#define _LOAD_SMALL_INT_1 446
+#define _LOAD_SMALL_INT_2 447
+#define _LOAD_SMALL_INT_3 448
#define _LOAD_SPECIAL LOAD_SPECIAL
#define _LOAD_SUPER_ATTR_ATTR LOAD_SUPER_ATTR_ATTR
#define _LOAD_SUPER_ATTR_METHOD LOAD_SUPER_ATTR_METHOD
-#define _MAKE_CALLARGS_A_TUPLE 448
+#define _MAKE_CALLARGS_A_TUPLE 449
#define _MAKE_CELL MAKE_CELL
#define _MAKE_FUNCTION MAKE_FUNCTION
-#define _MAKE_WARM 449
+#define _MAKE_WARM 450
#define _MAP_ADD MAP_ADD
#define _MATCH_CLASS MATCH_CLASS
#define _MATCH_KEYS MATCH_KEYS
#define _MATCH_MAPPING MATCH_MAPPING
#define _MATCH_SEQUENCE MATCH_SEQUENCE
-#define _MAYBE_EXPAND_METHOD 450
-#define _MAYBE_EXPAND_METHOD_KW 451
-#define _MONITOR_CALL 452
-#define _MONITOR_JUMP_BACKWARD 453
-#define _MONITOR_RESUME 454
+#define _MAYBE_EXPAND_METHOD 451
+#define _MAYBE_EXPAND_METHOD_KW 452
+#define _MONITOR_CALL 453
+#define _MONITOR_JUMP_BACKWARD 454
+#define _MONITOR_RESUME 455
#define _NOP NOP
#define _POP_EXCEPT POP_EXCEPT
-#define _POP_JUMP_IF_FALSE 455
-#define _POP_JUMP_IF_TRUE 456
+#define _POP_JUMP_IF_FALSE 456
+#define _POP_JUMP_IF_TRUE 457
#define _POP_TOP POP_TOP
-#define _POP_TOP_LOAD_CONST_INLINE_BORROW 457
+#define _POP_TOP_LOAD_CONST_INLINE_BORROW 458
#define _PUSH_EXC_INFO PUSH_EXC_INFO
-#define _PUSH_FRAME 458
+#define _PUSH_FRAME 459
#define _PUSH_NULL PUSH_NULL
-#define _PY_FRAME_GENERAL 459
-#define _PY_FRAME_KW 460
-#define _QUICKEN_RESUME 461
-#define _REPLACE_WITH_TRUE 462
+#define _PY_FRAME_GENERAL 460
+#define _PY_FRAME_KW 461
+#define _QUICKEN_RESUME 462
+#define _REPLACE_WITH_TRUE 463
#define _RESUME_CHECK RESUME_CHECK
#define _RETURN_GENERATOR RETURN_GENERATOR
#define _RETURN_VALUE RETURN_VALUE
-#define _SAVE_RETURN_OFFSET 463
-#define _SEND 464
-#define _SEND_GEN_FRAME 465
+#define _SAVE_RETURN_OFFSET 464
+#define _SEND 465
+#define _SEND_GEN_FRAME 466
#define _SETUP_ANNOTATIONS SETUP_ANNOTATIONS
#define _SET_ADD SET_ADD
#define _SET_FUNCTION_ATTRIBUTE SET_FUNCTION_ATTRIBUTE
#define _SET_UPDATE SET_UPDATE
-#define _START_EXECUTOR 466
-#define _STORE_ATTR 467
-#define _STORE_ATTR_INSTANCE_VALUE 468
-#define _STORE_ATTR_SLOT 469
-#define _STORE_ATTR_WITH_HINT 470
+#define _START_EXECUTOR 467
+#define _STORE_ATTR 468
+#define _STORE_ATTR_INSTANCE_VALUE 469
+#define _STORE_ATTR_SLOT 470
+#define _STORE_ATTR_WITH_HINT 471
#define _STORE_DEREF STORE_DEREF
-#define _STORE_FAST 471
-#define _STORE_FAST_0 472
-#define _STORE_FAST_1 473
-#define _STORE_FAST_2 474
-#define _STORE_FAST_3 475
-#define _STORE_FAST_4 476
-#define _STORE_FAST_5 477
-#define _STORE_FAST_6 478
-#define _STORE_FAST_7 479
+#define _STORE_FAST 472
+#define _STORE_FAST_0 473
+#define _STORE_FAST_1 474
+#define _STORE_FAST_2 475
+#define _STORE_FAST_3 476
+#define _STORE_FAST_4 477
+#define _STORE_FAST_5 478
+#define _STORE_FAST_6 479
+#define _STORE_FAST_7 480
#define _STORE_FAST_LOAD_FAST STORE_FAST_LOAD_FAST
#define _STORE_FAST_STORE_FAST STORE_FAST_STORE_FAST
#define _STORE_GLOBAL STORE_GLOBAL
#define _STORE_NAME STORE_NAME
-#define _STORE_SLICE 480
-#define _STORE_SUBSCR 481
+#define _STORE_SLICE 481
+#define _STORE_SUBSCR 482
#define _STORE_SUBSCR_DICT STORE_SUBSCR_DICT
#define _STORE_SUBSCR_LIST_INT STORE_SUBSCR_LIST_INT
#define _SWAP SWAP
-#define _TIER2_RESUME_CHECK 482
-#define _TO_BOOL 483
+#define _TIER2_RESUME_CHECK 483
+#define _TO_BOOL 484
#define _TO_BOOL_BOOL TO_BOOL_BOOL
#define _TO_BOOL_INT TO_BOOL_INT
#define _TO_BOOL_LIST TO_BOOL_LIST
@@ -302,13 +303,13 @@ extern "C" {
#define _UNARY_NEGATIVE UNARY_NEGATIVE
#define _UNARY_NOT UNARY_NOT
#define _UNPACK_EX UNPACK_EX
-#define _UNPACK_SEQUENCE 484
+#define _UNPACK_SEQUENCE 485
#define _UNPACK_SEQUENCE_LIST UNPACK_SEQUENCE_LIST
#define _UNPACK_SEQUENCE_TUPLE UNPACK_SEQUENCE_TUPLE
#define _UNPACK_SEQUENCE_TWO_TUPLE UNPACK_SEQUENCE_TWO_TUPLE
#define _WITH_EXCEPT_START WITH_EXCEPT_START
#define _YIELD_VALUE YIELD_VALUE
-#define MAX_UOP_ID 484
+#define MAX_UOP_ID 485
#ifdef __cplusplus
}
diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h
index 4cfdecec78b..ade297201f0 100644
--- a/Include/internal/pycore_uop_metadata.h
+++ b/Include/internal/pycore_uop_metadata.h
@@ -289,7 +289,7 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = {
[_FATAL_ERROR] = 0,
[_CHECK_VALIDITY_AND_SET_IP] = HAS_DEOPT_FLAG,
[_DEOPT] = 0,
- [_ERROR_POP_N] = HAS_ARG_FLAG,
+ [_ERROR_POP_N] = HAS_ARG_FLAG | HAS_ESCAPES_FLAG,
[_TIER2_RESUME_CHECK] = HAS_DEOPT_FLAG,
};
diff --git a/Lib/test/support/__init__.py b/Lib/test/support/__init__.py
index 7c1ef42a497..2ad267e3e08 100644
--- a/Lib/test/support/__init__.py
+++ b/Lib/test/support/__init__.py
@@ -1274,6 +1274,11 @@ def requires_specialization(test):
_opcode.ENABLE_SPECIALIZATION, "requires specialization")(test)
+def requires_specialization_ft(test):
+ return unittest.skipUnless(
+ _opcode.ENABLE_SPECIALIZATION_FT, "requires specialization")(test)
+
+
#=======================================================================
# Check for the presence of docstrings.
diff --git a/Lib/test/test_capi/test_config.py b/Lib/test/test_capi/test_config.py
index 71fb9ae45c7..77730ad2f32 100644
--- a/Lib/test/test_capi/test_config.py
+++ b/Lib/test/test_capi/test_config.py
@@ -100,6 +100,7 @@ class CAPITests(unittest.TestCase):
options.append(("run_presite", str | None, None))
if sysconfig.get_config_var('Py_GIL_DISABLED'):
options.append(("enable_gil", int, None))
+ options.append(("tlbc_enabled", int, None))
if support.MS_WINDOWS:
options.extend((
("legacy_windows_stdio", bool, None),
diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py
index f1ab72180d7..c352325ff3d 100644
--- a/Lib/test/test_capi/test_opt.py
+++ b/Lib/test/test_capi/test_opt.py
@@ -7,7 +7,8 @@ import os
import _opcode
-from test.support import script_helper, requires_specialization, import_helper
+from test.support import (script_helper, requires_specialization,
+ import_helper, Py_GIL_DISABLED)
_testinternalcapi = import_helper.import_module("_testinternalcapi")
@@ -34,6 +35,7 @@ def clear_executors(func):
@requires_specialization
+@unittest.skipIf(Py_GIL_DISABLED, "optimizer not yet supported in free-threaded builds")
@unittest.skipUnless(hasattr(_testinternalcapi, "get_optimizer"),
"Requires optimizer infrastructure")
class TestOptimizerAPI(unittest.TestCase):
@@ -138,6 +140,7 @@ def get_opnames(ex):
@requires_specialization
+@unittest.skipIf(Py_GIL_DISABLED, "optimizer not yet supported in free-threaded builds")
@unittest.skipUnless(hasattr(_testinternalcapi, "get_optimizer"),
"Requires optimizer infrastructure")
class TestExecutorInvalidation(unittest.TestCase):
@@ -219,6 +222,7 @@ class TestExecutorInvalidation(unittest.TestCase):
@requires_specialization
+@unittest.skipIf(Py_GIL_DISABLED, "optimizer not yet supported in free-threaded builds")
@unittest.skipUnless(hasattr(_testinternalcapi, "get_optimizer"),
"Requires optimizer infrastructure")
@unittest.skipIf(os.getenv("PYTHON_UOPS_OPTIMIZE") == "0", "Needs uop optimizer to run.")
@@ -586,6 +590,7 @@ class TestUops(unittest.TestCase):
@requires_specialization
+@unittest.skipIf(Py_GIL_DISABLED, "optimizer not yet supported in free-threaded builds")
@unittest.skipUnless(hasattr(_testinternalcapi, "get_optimizer"),
"Requires optimizer infrastructure")
@unittest.skipIf(os.getenv("PYTHON_UOPS_OPTIMIZE") == "0", "Needs uop optimizer to run.")
diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py
index eca9adf9a7d..634efda3544 100644
--- a/Lib/test/test_cmd_line.py
+++ b/Lib/test/test_cmd_line.py
@@ -12,6 +12,7 @@ import unittest
from test import support
from test.support import os_helper
from test.support import force_not_colorized
+from test.support import threading_helper
from test.support.script_helper import (
spawn_python, kill_python, assert_python_ok, assert_python_failure,
interpreter_requires_environment
@@ -1068,6 +1069,57 @@ class CmdLineTest(unittest.TestCase):
out = res.out.strip().decode("utf-8")
return tuple(int(i) for i in out.split())
+ @unittest.skipUnless(support.Py_GIL_DISABLED,
+ "PYTHON_TLBC and -X tlbc"
+ " only supported in Py_GIL_DISABLED builds")
+ @threading_helper.requires_working_threading()
+ def test_disable_thread_local_bytecode(self):
+ code = """if 1:
+ import threading
+ def test(x, y):
+ return x + y
+ t = threading.Thread(target=test, args=(1,2))
+ t.start()
+ t.join()"""
+ assert_python_ok("-W", "always", "-X", "tlbc=0", "-c", code)
+ assert_python_ok("-W", "always", "-c", code, PYTHON_TLBC="0")
+
+ @unittest.skipUnless(support.Py_GIL_DISABLED,
+ "PYTHON_TLBC and -X tlbc"
+ " only supported in Py_GIL_DISABLED builds")
+ @threading_helper.requires_working_threading()
+ def test_enable_thread_local_bytecode(self):
+ code = """if 1:
+ import threading
+ def test(x, y):
+ return x + y
+ t = threading.Thread(target=test, args=(1,2))
+ t.start()
+ t.join()"""
+ # The functionality of thread-local bytecode is tested more extensively
+ # in test_thread_local_bytecode
+ assert_python_ok("-W", "always", "-X", "tlbc=1", "-c", code)
+ assert_python_ok("-W", "always", "-c", code, PYTHON_TLBC="1")
+
+ @unittest.skipUnless(support.Py_GIL_DISABLED,
+ "PYTHON_TLBC and -X tlbc"
+ " only supported in Py_GIL_DISABLED builds")
+ def test_invalid_thread_local_bytecode(self):
+ rc, out, err = assert_python_failure("-X", "tlbc")
+ self.assertIn(b"tlbc=n: n is missing or invalid", err)
+ rc, out, err = assert_python_failure("-X", "tlbc=foo")
+ self.assertIn(b"tlbc=n: n is missing or invalid", err)
+ rc, out, err = assert_python_failure("-X", "tlbc=-1")
+ self.assertIn(b"tlbc=n: n is missing or invalid", err)
+ rc, out, err = assert_python_failure("-X", "tlbc=2")
+ self.assertIn(b"tlbc=n: n is missing or invalid", err)
+ rc, out, err = assert_python_failure(PYTHON_TLBC="foo")
+ self.assertIn(b"PYTHON_TLBC=N: N is missing or invalid", err)
+ rc, out, err = assert_python_failure(PYTHON_TLBC="-1")
+ self.assertIn(b"PYTHON_TLBC=N: N is missing or invalid", err)
+ rc, out, err = assert_python_failure(PYTHON_TLBC="2")
+ self.assertIn(b"PYTHON_TLBC=N: N is missing or invalid", err)
+
@unittest.skipIf(interpreter_requires_environment(),
'Cannot run -I tests when PYTHON env vars are required.')
diff --git a/Lib/test/test_dis.py b/Lib/test/test_dis.py
index 3c6570afa50..a991c67fca4 100644
--- a/Lib/test/test_dis.py
+++ b/Lib/test/test_dis.py
@@ -10,7 +10,8 @@ import sys
import types
import unittest
from test.support import (captured_stdout, requires_debug_ranges,
- requires_specialization, cpython_only)
+ requires_specialization, requires_specialization_ft,
+ cpython_only)
from test.support.bytecode_helper import BytecodeTestCase
import opcode
@@ -1261,7 +1262,7 @@ class DisTests(DisTestBase):
self.do_disassembly_compare(got, dis_load_test_quickened_code)
@cpython_only
- @requires_specialization
+ @requires_specialization_ft
def test_binary_specialize(self):
binary_op_quicken = """\
0 RESUME_CHECK 0
@@ -1281,6 +1282,9 @@ class DisTests(DisTestBase):
got = self.get_disassembly(co_unicode, adaptive=True)
self.do_disassembly_compare(got, binary_op_quicken % "BINARY_OP_ADD_UNICODE 0 (+)")
+ @cpython_only
+ @requires_specialization
+ def test_binary_subscr_specialize(self):
binary_subscr_quicken = """\
0 RESUME_CHECK 0
diff --git a/Lib/test/test_embed.py b/Lib/test/test_embed.py
index 5e886b6c8c3..bf861ef06ee 100644
--- a/Lib/test/test_embed.py
+++ b/Lib/test/test_embed.py
@@ -644,6 +644,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
CONFIG_COMPAT['run_presite'] = None
if support.Py_GIL_DISABLED:
CONFIG_COMPAT['enable_gil'] = -1
+ CONFIG_COMPAT['tlbc_enabled'] = GET_DEFAULT_CONFIG
if MS_WINDOWS:
CONFIG_COMPAT.update({
'legacy_windows_stdio': False,
diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py
index c0862d7d15f..d839893d2c6 100644
--- a/Lib/test/test_sys.py
+++ b/Lib/test/test_sys.py
@@ -1094,7 +1094,14 @@ class SysModuleTest(unittest.TestCase):
# While we could imagine a Python session where the number of
# multiple buffer objects would exceed the sharing of references,
# it is unlikely to happen in a normal test run.
- self.assertLess(a, sys.gettotalrefcount())
+ #
+ # In free-threaded builds each code object owns an array of
+ # pointers to copies of the bytecode. When the number of
+ # code objects is a large fraction of the total number of
+ # references, this can cause the total number of allocated
+ # blocks to exceed the total number of references.
+ if not support.Py_GIL_DISABLED:
+ self.assertLess(a, sys.gettotalrefcount())
except AttributeError:
# gettotalrefcount() not available
pass
@@ -1613,7 +1620,10 @@ class SizeofTest(unittest.TestCase):
def func():
return sys._getframe()
x = func()
- INTERPRETER_FRAME = '9PhcP'
+ if support.Py_GIL_DISABLED:
+ INTERPRETER_FRAME = '10PhcP'
+ else:
+ INTERPRETER_FRAME = '9PhcP'
check(x, size('3PiccPP' + INTERPRETER_FRAME + 'P'))
# function
def func(): pass
diff --git a/Lib/test/test_thread_local_bytecode.py b/Lib/test/test_thread_local_bytecode.py
new file mode 100644
index 00000000000..7a8809c5ae7
--- /dev/null
+++ b/Lib/test/test_thread_local_bytecode.py
@@ -0,0 +1,198 @@
+"""Tests for thread-local bytecode."""
+import dis
+import textwrap
+import unittest
+
+from test import support
+from test.support import cpython_only, import_helper, requires_specialization_ft
+from test.support.script_helper import assert_python_ok
+from test.support.threading_helper import requires_working_threading
+
+# Skip this test if the _testinternalcapi module isn't available
+_testinternalcapi = import_helper.import_module("_testinternalcapi")
+
+
+@cpython_only
+@requires_working_threading()
+@unittest.skipUnless(support.Py_GIL_DISABLED, "only in free-threaded builds")
+class TLBCTests(unittest.TestCase):
+ @requires_specialization_ft
+ def test_new_threads_start_with_unspecialized_code(self):
+ code = textwrap.dedent("""
+ import dis
+ import queue
+ import threading
+
+ from _testinternalcapi import get_tlbc
+
+ def all_opnames(bc):
+ return {i.opname for i in dis._get_instructions_bytes(bc)}
+
+ def f(a, b, q=None):
+ if q is not None:
+ q.put(get_tlbc(f))
+ return a + b
+
+ for _ in range(100):
+ # specialize
+ f(1, 2)
+
+ q = queue.Queue()
+ t = threading.Thread(target=f, args=('a', 'b', q))
+ t.start()
+ t.join()
+
+ assert "BINARY_OP_ADD_INT" in all_opnames(get_tlbc(f))
+ assert "BINARY_OP_ADD_INT" not in all_opnames(q.get())
+ """)
+ assert_python_ok("-X", "tlbc=1", "-c", code)
+
+ @requires_specialization_ft
+ def test_threads_specialize_independently(self):
+ code = textwrap.dedent("""
+ import dis
+ import queue
+ import threading
+
+ from _testinternalcapi import get_tlbc
+
+ def all_opnames(bc):
+ return {i.opname for i in dis._get_instructions_bytes(bc)}
+
+ def f(a, b):
+ return a + b
+
+ def g(a, b, q=None):
+ for _ in range(100):
+ f(a, b)
+ if q is not None:
+ q.put(get_tlbc(f))
+
+ # specialize in main thread
+ g(1, 2)
+
+ # specialize in other thread
+ q = queue.Queue()
+ t = threading.Thread(target=g, args=('a', 'b', q))
+ t.start()
+ t.join()
+
+ assert "BINARY_OP_ADD_INT" in all_opnames(get_tlbc(f))
+ t_opnames = all_opnames(q.get())
+ assert "BINARY_OP_ADD_INT" not in t_opnames
+ assert "BINARY_OP_ADD_UNICODE" in t_opnames
+ """)
+ assert_python_ok("-X", "tlbc=1", "-c", code)
+
+ def test_reuse_tlbc_across_threads_different_lifetimes(self):
+ code = textwrap.dedent("""
+ import queue
+ import threading
+
+ from _testinternalcapi import get_tlbc_id
+
+ def f(a, b, q=None):
+ if q is not None:
+ q.put(get_tlbc_id(f))
+ return a + b
+
+ q = queue.Queue()
+ tlbc_ids = []
+ for _ in range(3):
+ t = threading.Thread(target=f, args=('a', 'b', q))
+ t.start()
+ t.join()
+ tlbc_ids.append(q.get())
+
+ assert tlbc_ids[0] == tlbc_ids[1]
+ assert tlbc_ids[1] == tlbc_ids[2]
+ """)
+ assert_python_ok("-X", "tlbc=1", "-c", code)
+
+ def test_no_copies_if_tlbc_disabled(self):
+ code = textwrap.dedent("""
+ import queue
+ import threading
+
+ from _testinternalcapi import get_tlbc_id
+
+ def f(a, b, q=None):
+ if q is not None:
+ q.put(get_tlbc_id(f))
+ return a + b
+
+ q = queue.Queue()
+ threads = []
+ for _ in range(3):
+ t = threading.Thread(target=f, args=('a', 'b', q))
+ t.start()
+ threads.append(t)
+
+ tlbc_ids = []
+ for t in threads:
+ t.join()
+ tlbc_ids.append(q.get())
+
+ main_tlbc_id = get_tlbc_id(f)
+ assert main_tlbc_id is not None
+ assert tlbc_ids[0] == main_tlbc_id
+ assert tlbc_ids[1] == main_tlbc_id
+ assert tlbc_ids[2] == main_tlbc_id
+ """)
+ assert_python_ok("-X", "tlbc=0", "-c", code)
+
+ def test_no_specialization_if_tlbc_disabled(self):
+ code = textwrap.dedent("""
+ import dis
+ import queue
+ import threading
+
+ from _testinternalcapi import get_tlbc
+
+ def all_opnames(f):
+ bc = get_tlbc(f)
+ return {i.opname for i in dis._get_instructions_bytes(bc)}
+
+ def f(a, b):
+ return a + b
+
+ for _ in range(100):
+ f(1, 2)
+
+ assert "BINARY_OP_ADD_INT" not in all_opnames(f)
+ """)
+ assert_python_ok("-X", "tlbc=0", "-c", code)
+
+ def test_generator_throw(self):
+ code = textwrap.dedent("""
+ import queue
+ import threading
+
+ from _testinternalcapi import get_tlbc_id
+
+ def g():
+ try:
+ yield
+ except:
+ yield get_tlbc_id(g)
+
+ def f(q):
+ gen = g()
+ next(gen)
+ q.put(gen.throw(ValueError))
+
+ q = queue.Queue()
+ t = threading.Thread(target=f, args=(q,))
+ t.start()
+ t.join()
+
+ gen = g()
+ next(gen)
+ main_id = gen.throw(ValueError)
+ assert main_id != q.get()
+ """)
+ assert_python_ok("-X", "tlbc=1", "-c", code)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/Makefile.pre.in b/Makefile.pre.in
index 1a9191ec0ce..c650ecaf7be 100644
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -460,6 +460,7 @@ PYTHON_OBJS= \
Python/hashtable.o \
Python/import.o \
Python/importdl.o \
+ Python/index_pool.o \
Python/initconfig.o \
Python/interpconfig.o \
Python/instrumentation.o \
@@ -1228,6 +1229,7 @@ PYTHON_HEADERS= \
$(srcdir)/Include/internal/pycore_hashtable.h \
$(srcdir)/Include/internal/pycore_import.h \
$(srcdir)/Include/internal/pycore_importdl.h \
+ $(srcdir)/Include/internal/pycore_index_pool.h \
$(srcdir)/Include/internal/pycore_initconfig.h \
$(srcdir)/Include/internal/pycore_instruments.h \
$(srcdir)/Include/internal/pycore_instruction_sequence.h \
diff --git a/Modules/_opcode.c b/Modules/_opcode.c
index dc93063aee7..7ccf7af6bf9 100644
--- a/Modules/_opcode.c
+++ b/Modules/_opcode.c
@@ -422,6 +422,9 @@ _opcode_exec(PyObject *m) {
if (PyModule_AddIntMacro(m, ENABLE_SPECIALIZATION) < 0) {
return -1;
}
+ if (PyModule_AddIntMacro(m, ENABLE_SPECIALIZATION_FT) < 0) {
+ return -1;
+ }
return 0;
}
diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c
index eb98b433c6c..883f32599fb 100644
--- a/Modules/_testinternalcapi.c
+++ b/Modules/_testinternalcapi.c
@@ -14,6 +14,7 @@
#include "pycore_bitutils.h" // _Py_bswap32()
#include "pycore_bytesobject.h" // _PyBytes_Find()
#include "pycore_ceval.h" // _PyEval_AddPendingCall()
+#include "pycore_code.h" // _PyCode_GetTLBCFast()
#include "pycore_compile.h" // _PyCompile_CodeGen()
#include "pycore_context.h" // _PyContext_NewHamtForTests()
#include "pycore_dict.h" // _PyManagedDictPointer_GetValues()
@@ -1963,6 +1964,48 @@ get_py_thread_id(PyObject *self, PyObject *Py_UNUSED(ignored))
Py_BUILD_ASSERT(sizeof(unsigned long long) >= sizeof(tid));
return PyLong_FromUnsignedLongLong(tid);
}
+
+static PyCodeObject *
+get_code(PyObject *obj)
+{
+ if (PyCode_Check(obj)) {
+ return (PyCodeObject *)obj;
+ }
+ else if (PyFunction_Check(obj)) {
+ return (PyCodeObject *)PyFunction_GetCode(obj);
+ }
+ return (PyCodeObject *)PyErr_Format(
+ PyExc_TypeError, "expected function or code object, got %s",
+ Py_TYPE(obj)->tp_name);
+}
+
+static PyObject *
+get_tlbc(PyObject *Py_UNUSED(module), PyObject *obj)
+{
+ PyCodeObject *code = get_code(obj);
+ if (code == NULL) {
+ return NULL;
+ }
+ _Py_CODEUNIT *bc = _PyCode_GetTLBCFast(PyThreadState_GET(), code);
+ if (bc == NULL) {
+ Py_RETURN_NONE;
+ }
+ return PyBytes_FromStringAndSize((const char *)bc, _PyCode_NBYTES(code));
+}
+
+static PyObject *
+get_tlbc_id(PyObject *Py_UNUSED(module), PyObject *obj)
+{
+ PyCodeObject *code = get_code(obj);
+ if (code == NULL) {
+ return NULL;
+ }
+ _Py_CODEUNIT *bc = _PyCode_GetTLBCFast(PyThreadState_GET(), code);
+ if (bc == NULL) {
+ Py_RETURN_NONE;
+ }
+ return PyLong_FromVoidPtr(bc);
+}
#endif
static PyObject *
@@ -2022,7 +2065,6 @@ identify_type_slot_wrappers(PyObject *self, PyObject *Py_UNUSED(ignored))
return _PyType_GetSlotWrapperNames();
}
-
static PyMethodDef module_functions[] = {
{"get_configs", get_configs, METH_NOARGS},
{"get_recursion_depth", get_recursion_depth, METH_NOARGS},
@@ -2110,6 +2152,8 @@ static PyMethodDef module_functions[] = {
#ifdef Py_GIL_DISABLED
{"py_thread_id", get_py_thread_id, METH_NOARGS},
+ {"get_tlbc", get_tlbc, METH_O, NULL},
+ {"get_tlbc_id", get_tlbc_id, METH_O, NULL},
#endif
#ifdef _Py_TIER2
{"uop_symbols_test", _Py_uop_symbols_test, METH_NOARGS},
diff --git a/Objects/codeobject.c b/Objects/codeobject.c
index 775ea7aca82..1cf9740af9a 100644
--- a/Objects/codeobject.c
+++ b/Objects/codeobject.c
@@ -6,17 +6,22 @@
#include "pycore_code.h" // _PyCodeConstructor
#include "pycore_frame.h" // FRAME_SPECIALS_SIZE
#include "pycore_hashtable.h" // _Py_hashtable_t
+#include "pycore_index_pool.h" // _PyIndexPool
#include "pycore_initconfig.h" // _PyStatus_OK()
#include "pycore_interp.h" // PyInterpreterState.co_extra_freefuncs
#include "pycore_object.h" // _PyObject_SetDeferredRefcount
+#include "pycore_object_stack.h"
#include "pycore_opcode_metadata.h" // _PyOpcode_Deopt, _PyOpcode_Caches
#include "pycore_opcode_utils.h" // RESUME_AT_FUNC_START
+#include "pycore_pymem.h" // _PyMem_FreeDelayed
#include "pycore_pystate.h" // _PyInterpreterState_GET()
#include "pycore_setobject.h" // _PySet_NextEntry()
#include "pycore_tuple.h" // _PyTuple_ITEMS()
#include "pycore_uniqueid.h" // _PyObject_AssignUniqueId()
#include "clinic/codeobject.c.h"
+#define INITIAL_SPECIALIZED_CODE_SIZE 16
+
static const char *
code_event_name(PyCodeEvent event) {
switch (event) {
@@ -440,9 +445,15 @@ _PyCode_Validate(struct _PyCodeConstructor *con)
return 0;
}
-extern void _PyCode_Quicken(PyCodeObject *code);
+extern void
+_PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size, PyObject *consts,
+ int enable_counters);
-static void
+#ifdef Py_GIL_DISABLED
+static _PyCodeArray * _PyCodeArray_New(Py_ssize_t size);
+#endif
+
+static int
init_code(PyCodeObject *co, struct _PyCodeConstructor *con)
{
int nlocalsplus = (int)PyTuple_GET_SIZE(con->localsplusnames);
@@ -505,14 +516,27 @@ init_code(PyCodeObject *co, struct _PyCodeConstructor *con)
memcpy(_PyCode_CODE(co), PyBytes_AS_STRING(con->code),
PyBytes_GET_SIZE(con->code));
+#ifdef Py_GIL_DISABLED
+ co->co_tlbc = _PyCodeArray_New(INITIAL_SPECIALIZED_CODE_SIZE);
+ if (co->co_tlbc == NULL) {
+ return -1;
+ }
+ co->co_tlbc->entries[0] = co->co_code_adaptive;
+#endif
int entry_point = 0;
while (entry_point < Py_SIZE(co) &&
_PyCode_CODE(co)[entry_point].op.code != RESUME) {
entry_point++;
}
co->_co_firsttraceable = entry_point;
- _PyCode_Quicken(co);
+#ifdef Py_GIL_DISABLED
+ _PyCode_Quicken(_PyCode_CODE(co), Py_SIZE(co), co->co_consts,
+ interp->config.tlbc_enabled);
+#else
+ _PyCode_Quicken(_PyCode_CODE(co), Py_SIZE(co), co->co_consts, 1);
+#endif
notify_code_watchers(PY_CODE_EVENT_CREATE, co);
+ return 0;
}
static int
@@ -667,7 +691,12 @@ _PyCode_New(struct _PyCodeConstructor *con)
PyErr_NoMemory();
return NULL;
}
- init_code(co, con);
+
+ if (init_code(co, con) < 0) {
+ Py_DECREF(co);
+ return NULL;
+ }
+
#ifdef Py_GIL_DISABLED
co->_co_unique_id = _PyObject_AssignUniqueId((PyObject *)co);
_PyObject_GC_TRACK(co);
@@ -1871,6 +1900,17 @@ code_dealloc(PyCodeObject *co)
PyObject_ClearWeakRefs((PyObject*)co);
}
free_monitoring_data(co->_co_monitoring);
+#ifdef Py_GIL_DISABLED
+ // The first element always points to the mutable bytecode at the end of
+ // the code object, which will be freed when the code object is freed.
+ for (Py_ssize_t i = 1; i < co->co_tlbc->size; i++) {
+ char *entry = co->co_tlbc->entries[i];
+ if (entry != NULL) {
+ PyMem_Free(entry);
+ }
+ }
+ PyMem_Free(co->co_tlbc);
+#endif
PyObject_Free(co);
}
@@ -2646,5 +2686,270 @@ _PyCode_Fini(PyInterpreterState *interp)
_Py_hashtable_destroy(state->constants);
state->constants = NULL;
}
+ _PyIndexPool_Fini(&interp->tlbc_indices);
#endif
}
+
+#ifdef Py_GIL_DISABLED
+
+// Thread-local bytecode (TLBC)
+//
+// In free-threaded builds, each thread specializes a thread-local copy of the
+// bytecode, created on the first RESUME. All copies of the bytecode for a code
+// object are stored in the `co_tlbc` array. Each thread reserves a globally
+// unique index identifying its copy of the bytecode in all `co_tlbc` arrays at
+// thread creation and releases the index at thread destruction. The first entry
+// in every `co_tlbc` array always points to the "main" copy of the bytecode that
+// is stored at the end of the code object. This ensures that no bytecode is
+// copied for programs that do not use threads.
+//
+// Thread-local bytecode can be disabled at runtime by providing either `-X
+// tlbc=0` or `PYTHON_TLBC=0`. Disabling thread-local bytecode also disables
+// specialization. All threads share the main copy of the bytecode when
+// thread-local bytecode is disabled.
+//
+// Concurrent modifications to the bytecode made by the specializing
+// interpreter and instrumentation use atomics, with specialization taking care
+// not to overwrite an instruction that was instrumented concurrently.
+
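To make the hand-off concrete, a sketch (assuming `tstate` and `frame` are the current thread state and executing frame) of how execution is retargeted onto this thread's copy while preserving the instruction offset; the `_LOAD_BYTECODE` uop and the throw handling in _PyEval_EvalFrameDefault in this change do exactly this:

    /* Compute the offset against the old copy before switching indices. */
    ptrdiff_t off = frame->instr_ptr - _PyFrame_GetBytecode(frame);
    _Py_CODEUNIT *bc = _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame));
    if (bc != NULL) {
        frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index;
        frame->instr_ptr = bc + off;
    }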
+int32_t
+_Py_ReserveTLBCIndex(PyInterpreterState *interp)
+{
+ if (interp->config.tlbc_enabled) {
+ return _PyIndexPool_AllocIndex(&interp->tlbc_indices);
+ }
+ // All threads share the main copy of the bytecode when TLBC is disabled
+ return 0;
+}
+
+void
+_Py_ClearTLBCIndex(_PyThreadStateImpl *tstate)
+{
+ PyInterpreterState *interp = ((PyThreadState *)tstate)->interp;
+ if (interp->config.tlbc_enabled) {
+ _PyIndexPool_FreeIndex(&interp->tlbc_indices, tstate->tlbc_index);
+ }
+}
+
+static _PyCodeArray *
+_PyCodeArray_New(Py_ssize_t size)
+{
+ _PyCodeArray *arr = PyMem_Calloc(
+ 1, offsetof(_PyCodeArray, entries) + sizeof(void *) * size);
+ if (arr == NULL) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+ arr->size = size;
+ return arr;
+}
+
+static void
+copy_code(_Py_CODEUNIT *dst, PyCodeObject *co)
+{
+ int code_len = (int) Py_SIZE(co);
+ for (int i = 0; i < code_len; i += _PyInstruction_GetLength(co, i)) {
+ dst[i] = _Py_GetBaseCodeUnit(co, i);
+ }
+ _PyCode_Quicken(dst, code_len, co->co_consts, 1);
+}
+
+static Py_ssize_t
+get_pow2_greater(Py_ssize_t initial, Py_ssize_t limit)
+{
+ // initial must be a power of two
+ assert(!(initial & (initial - 1)));
+ Py_ssize_t res = initial;
+ while (res && res < limit) {
+ res <<= 1;
+ }
+ return res;
+}
+
+static _Py_CODEUNIT *
+create_tlbc_lock_held(PyCodeObject *co, Py_ssize_t idx)
+{
+ _PyCodeArray *tlbc = co->co_tlbc;
+ if (idx >= tlbc->size) {
+ Py_ssize_t new_size = get_pow2_greater(tlbc->size, idx + 1);
+ if (!new_size) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+ _PyCodeArray *new_tlbc = _PyCodeArray_New(new_size);
+ if (new_tlbc == NULL) {
+ return NULL;
+ }
+ memcpy(new_tlbc->entries, tlbc->entries, tlbc->size * sizeof(void *));
+ _Py_atomic_store_ptr_release(&co->co_tlbc, new_tlbc);
+ _PyMem_FreeDelayed(tlbc);
+ tlbc = new_tlbc;
+ }
+ char *bc = PyMem_Calloc(1, _PyCode_NBYTES(co));
+ if (bc == NULL) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+ copy_code((_Py_CODEUNIT *) bc, co);
+ assert(tlbc->entries[idx] == NULL);
+ tlbc->entries[idx] = bc;
+ return (_Py_CODEUNIT *) bc;
+}
+
+static _Py_CODEUNIT *
+get_tlbc_lock_held(PyCodeObject *co)
+{
+ _PyCodeArray *tlbc = co->co_tlbc;
+ _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)PyThreadState_GET();
+ int32_t idx = tstate->tlbc_index;
+ if (idx < tlbc->size && tlbc->entries[idx] != NULL) {
+ return (_Py_CODEUNIT *)tlbc->entries[idx];
+ }
+ return create_tlbc_lock_held(co, idx);
+}
+
+_Py_CODEUNIT *
+_PyCode_GetTLBC(PyCodeObject *co)
+{
+ _Py_CODEUNIT *result;
+ Py_BEGIN_CRITICAL_SECTION(co);
+ result = get_tlbc_lock_held(co);
+ Py_END_CRITICAL_SECTION();
+ return result;
+}
+
+// My kingdom for a bitset
+struct flag_set {
+ uint8_t *flags;
+ Py_ssize_t size;
+};
+
+static inline int
+flag_is_set(struct flag_set *flags, Py_ssize_t idx)
+{
+ assert(idx >= 0);
+ return (idx < flags->size) && flags->flags[idx];
+}
+
+// Set the flag for each tlbc index in use
+static int
+get_indices_in_use(PyInterpreterState *interp, struct flag_set *in_use)
+{
+ assert(interp->stoptheworld.world_stopped);
+ assert(in_use->flags == NULL);
+ int32_t max_index = 0;
+ for (PyThreadState *p = interp->threads.head; p != NULL; p = p->next) {
+ int32_t idx = ((_PyThreadStateImpl *) p)->tlbc_index;
+ if (idx > max_index) {
+ max_index = idx;
+ }
+ }
+ in_use->size = (size_t) max_index + 1;
+ in_use->flags = PyMem_Calloc(in_use->size, sizeof(*in_use->flags));
+ if (in_use->flags == NULL) {
+ return -1;
+ }
+ for (PyThreadState *p = interp->threads.head; p != NULL; p = p->next) {
+ in_use->flags[((_PyThreadStateImpl *) p)->tlbc_index] = 1;
+ }
+ return 0;
+}
+
+struct get_code_args {
+ _PyObjectStack code_objs;
+ struct flag_set indices_in_use;
+ int err;
+};
+
+static void
+clear_get_code_args(struct get_code_args *args)
+{
+ if (args->indices_in_use.flags != NULL) {
+ PyMem_Free(args->indices_in_use.flags);
+ args->indices_in_use.flags = NULL;
+ }
+ _PyObjectStack_Clear(&args->code_objs);
+}
+
+static inline int
+is_bytecode_unused(_PyCodeArray *tlbc, Py_ssize_t idx,
+ struct flag_set *indices_in_use)
+{
+ assert(idx > 0 && idx < tlbc->size);
+ return tlbc->entries[idx] != NULL && !flag_is_set(indices_in_use, idx);
+}
+
+static int
+get_code_with_unused_tlbc(PyObject *obj, struct get_code_args *args)
+{
+ if (!PyCode_Check(obj)) {
+ return 1;
+ }
+ PyCodeObject *co = (PyCodeObject *) obj;
+ _PyCodeArray *tlbc = co->co_tlbc;
+ // The first index always points at the main copy of the bytecode embedded
+ // in the code object.
+ for (Py_ssize_t i = 1; i < tlbc->size; i++) {
+ if (is_bytecode_unused(tlbc, i, &args->indices_in_use)) {
+ if (_PyObjectStack_Push(&args->code_objs, obj) < 0) {
+ args->err = -1;
+ return 0;
+ }
+ return 1;
+ }
+ }
+ return 1;
+}
+
+static void
+free_unused_bytecode(PyCodeObject *co, struct flag_set *indices_in_use)
+{
+ _PyCodeArray *tlbc = co->co_tlbc;
+ // The first index always points at the main copy of the bytecode embedded
+ // in the code object.
+ for (Py_ssize_t i = 1; i < tlbc->size; i++) {
+ if (is_bytecode_unused(tlbc, i, indices_in_use)) {
+ PyMem_Free(tlbc->entries[i]);
+ tlbc->entries[i] = NULL;
+ }
+ }
+}
+
+int
+_Py_ClearUnusedTLBC(PyInterpreterState *interp)
+{
+ struct get_code_args args = {
+ .code_objs = {NULL},
+ .indices_in_use = {NULL, 0},
+ .err = 0,
+ };
+ _PyEval_StopTheWorld(interp);
+ // Collect in-use tlbc indices
+ if (get_indices_in_use(interp, &args.indices_in_use) < 0) {
+ goto err;
+ }
+ // Collect code objects that have bytecode not in use by any thread
+ _PyGC_VisitObjectsWorldStopped(
+ interp, (gcvisitobjects_t)get_code_with_unused_tlbc, &args);
+ if (args.err < 0) {
+ goto err;
+ }
+ // Free unused bytecode. This must happen outside of gc_visit_heaps; it is
+ // unsafe to allocate or free any mimalloc managed memory when it's
+ // running.
+ PyObject *obj;
+ while ((obj = _PyObjectStack_Pop(&args.code_objs)) != NULL) {
+ free_unused_bytecode((PyCodeObject*) obj, &args.indices_in_use);
+ }
+ _PyEval_StartTheWorld(interp);
+ clear_get_code_args(&args);
+ return 0;
+
+err:
+ _PyEval_StartTheWorld(interp);
+ clear_get_code_args(&args);
+ PyErr_NoMemory();
+ return -1;
+}
+
+#endif
diff --git a/Objects/frameobject.c b/Objects/frameobject.c
index 55394afa523..c743c254848 100644
--- a/Objects/frameobject.c
+++ b/Objects/frameobject.c
@@ -1651,7 +1651,7 @@ frame_setlineno(PyFrameObject *f, PyObject* p_new_lineno, void *Py_UNUSED(ignore
}
/* Finally set the new lasti and return OK. */
f->f_lineno = 0;
- f->f_frame->instr_ptr = _PyCode_CODE(code) + best_addr;
+ f->f_frame->instr_ptr = _PyFrame_GetBytecode(f->f_frame) + best_addr;
return 0;
}
@@ -1867,10 +1867,11 @@ PyTypeObject PyFrame_Type = {
};
static void
-init_frame(_PyInterpreterFrame *frame, PyFunctionObject *func, PyObject *locals)
+init_frame(PyThreadState *tstate, _PyInterpreterFrame *frame,
+ PyFunctionObject *func, PyObject *locals)
{
PyCodeObject *code = (PyCodeObject *)func->func_code;
- _PyFrame_Initialize(frame, PyStackRef_FromPyObjectNew(func),
+ _PyFrame_Initialize(tstate, frame, PyStackRef_FromPyObjectNew(func),
Py_XNewRef(locals), code, 0, NULL);
}
@@ -1922,7 +1923,7 @@ PyFrame_New(PyThreadState *tstate, PyCodeObject *code,
Py_DECREF(func);
return NULL;
}
- init_frame((_PyInterpreterFrame *)f->_f_frame_data, func, locals);
+ init_frame(tstate, (_PyInterpreterFrame *)f->_f_frame_data, func, locals);
f->f_frame = (_PyInterpreterFrame *)f->_f_frame_data;
f->f_frame->owner = FRAME_OWNED_BY_FRAME_OBJECT;
// This frame needs to be "complete", so pretend that the first RESUME ran:
@@ -1941,7 +1942,8 @@ frame_init_get_vars(_PyInterpreterFrame *frame)
// here:
PyCodeObject *co = _PyFrame_GetCode(frame);
int lasti = _PyInterpreterFrame_LASTI(frame);
- if (!(lasti < 0 && _PyCode_CODE(co)->op.code == COPY_FREE_VARS
+ if (!(lasti < 0
+ && _PyFrame_GetBytecode(frame)->op.code == COPY_FREE_VARS
&& PyStackRef_FunctionCheck(frame->f_funcobj)))
{
/* Free vars are initialized */
@@ -1957,7 +1959,7 @@ frame_init_get_vars(_PyInterpreterFrame *frame)
frame->localsplus[offset + i] = PyStackRef_FromPyObjectNew(o);
}
// COPY_FREE_VARS doesn't have inline CACHEs, either:
- frame->instr_ptr = _PyCode_CODE(_PyFrame_GetCode(frame));
+ frame->instr_ptr = _PyFrame_GetBytecode(frame);
}
diff --git a/Objects/typeobject.c b/Objects/typeobject.c
index b4a11195613..40225313a8a 100644
--- a/Objects/typeobject.c
+++ b/Objects/typeobject.c
@@ -11638,9 +11638,10 @@ super_descr_get(PyObject *self, PyObject *obj, PyObject *type)
}
static int
-super_init_without_args(_PyInterpreterFrame *cframe, PyCodeObject *co,
- PyTypeObject **type_p, PyObject **obj_p)
+super_init_without_args(_PyInterpreterFrame *cframe, PyTypeObject **type_p,
+ PyObject **obj_p)
{
+ PyCodeObject *co = _PyFrame_GetCode(cframe);
if (co->co_argcount == 0) {
PyErr_SetString(PyExc_RuntimeError,
"super(): no arguments");
@@ -11740,7 +11741,7 @@ super_init_impl(PyObject *self, PyTypeObject *type, PyObject *obj) {
"super(): no current frame");
return -1;
}
- int res = super_init_without_args(frame, _PyFrame_GetCode(frame), &type, &obj);
+ int res = super_init_without_args(frame, &type, &obj);
if (res < 0) {
return -1;
diff --git a/PCbuild/_freeze_module.vcxproj b/PCbuild/_freeze_module.vcxproj
index a3c2d32c454..51b493f8a84 100644
--- a/PCbuild/_freeze_module.vcxproj
+++ b/PCbuild/_freeze_module.vcxproj
@@ -222,6 +222,7 @@
+    <ClCompile Include="..\Python\index_pool.c" />
diff --git a/PCbuild/_freeze_module.vcxproj.filters b/PCbuild/_freeze_module.vcxproj.filters
index 91b1d75fb8d..09a5f4d30ef 100644
--- a/PCbuild/_freeze_module.vcxproj.filters
+++ b/PCbuild/_freeze_module.vcxproj.filters
@@ -232,6 +232,9 @@
Source Files
+    <ClCompile Include="..\Python\index_pool.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
Source Files
diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj
index a4881e9256e..f840e7fd61f 100644
--- a/PCbuild/pythoncore.vcxproj
+++ b/PCbuild/pythoncore.vcxproj
@@ -255,6 +255,7 @@
+    <ClInclude Include="..\Include\internal\pycore_index_pool.h" />
@@ -614,6 +615,7 @@
+    <ClCompile Include="..\Python\index_pool.c" />
diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters
index 740790cc5e1..a930cd0b0b1 100644
--- a/PCbuild/pythoncore.vcxproj.filters
+++ b/PCbuild/pythoncore.vcxproj.filters
@@ -687,6 +687,9 @@
Include\internal
+    <ClInclude Include="..\Include\internal\pycore_index_pool.h">
+      <Filter>Include\internal</Filter>
+    </ClInclude>
Include\internal
@@ -1373,6 +1376,9 @@
Python
+    <ClCompile Include="..\Python\index_pool.c">
+      <Filter>Python</Filter>
+    </ClCompile>
Python
diff --git a/Python/bytecodes.c b/Python/bytecodes.c
index fa98af12c69..2c78cb99317 100644
--- a/Python/bytecodes.c
+++ b/Python/bytecodes.c
@@ -168,11 +168,11 @@ dummy_func(
}
op(_QUICKEN_RESUME, (--)) {
- #if ENABLE_SPECIALIZATION
+ #if ENABLE_SPECIALIZATION_FT
if (tstate->tracing == 0 && this_instr->op.code == RESUME) {
FT_ATOMIC_STORE_UINT8_RELAXED(this_instr->op.code, RESUME_CHECK);
}
- #endif /* ENABLE_SPECIALIZATION */
+ #endif /* ENABLE_SPECIALIZATION_FT */
}
tier1 op(_MAYBE_INSTRUMENT, (--)) {
@@ -190,7 +190,26 @@ dummy_func(
}
}
+ op(_LOAD_BYTECODE, (--)) {
+ #ifdef Py_GIL_DISABLED
+ if (frame->tlbc_index !=
+ ((_PyThreadStateImpl *)tstate)->tlbc_index) {
+ _Py_CODEUNIT *bytecode =
+ _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame));
+ ERROR_IF(bytecode == NULL, error);
+ int off = this_instr - _PyFrame_GetBytecode(frame);
+ frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index;
+ frame->instr_ptr = bytecode + off;
+            // Make sure this_instr gets reset correctly for any uops that
+ // follow
+ next_instr = frame->instr_ptr;
+ DISPATCH();
+ }
+ #endif
+ }
+
macro(RESUME) =
+ _LOAD_BYTECODE +
_MAYBE_INSTRUMENT +
_QUICKEN_RESUME +
_CHECK_PERIODIC_IF_NOT_YIELD_FROM;
@@ -204,6 +223,10 @@ dummy_func(
uintptr_t version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version);
assert((version & _PY_EVAL_EVENTS_MASK) == 0);
DEOPT_IF(eval_breaker != version);
+ #ifdef Py_GIL_DISABLED
+ DEOPT_IF(frame->tlbc_index !=
+ ((_PyThreadStateImpl *)tstate)->tlbc_index);
+ #endif
}
op(_MONITOR_RESUME, (--)) {
@@ -217,6 +240,7 @@ dummy_func(
}
macro(INSTRUMENTED_RESUME) =
+ _LOAD_BYTECODE +
_MAYBE_INSTRUMENT +
_CHECK_PERIODIC_IF_NOT_YIELD_FROM +
_MONITOR_RESUME;
@@ -682,8 +706,8 @@ dummy_func(
};
specializing op(_SPECIALIZE_BINARY_SUBSCR, (counter/1, container, sub -- container, sub)) {
- assert(frame->stackpointer == NULL);
#if ENABLE_SPECIALIZATION
+ assert(frame->stackpointer == NULL);
if (ADAPTIVE_COUNTER_TRIGGERS(counter)) {
next_instr = this_instr;
_Py_Specialize_BinarySubscr(container, sub, next_instr);
@@ -1236,7 +1260,7 @@ dummy_func(
if (oparg) {
PyObject *lasti = PyStackRef_AsPyObjectBorrow(values[0]);
if (PyLong_Check(lasti)) {
- frame->instr_ptr = _PyCode_CODE(_PyFrame_GetCode(frame)) + PyLong_AsLong(lasti);
+ frame->instr_ptr = _PyFrame_GetBytecode(frame) + PyLong_AsLong(lasti);
assert(!_PyErr_Occurred(tstate));
}
else {
@@ -2671,9 +2695,7 @@ dummy_func(
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_False);
DEAD(cond);
- #if ENABLE_SPECIALIZATION
- this_instr[1].cache = (this_instr[1].cache << 1) | flag;
- #endif
+ RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
JUMPBY(oparg * flag);
}
@@ -2681,9 +2703,7 @@ dummy_func(
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_True);
DEAD(cond);
- #if ENABLE_SPECIALIZATION
- this_instr[1].cache = (this_instr[1].cache << 1) | flag;
- #endif
+ RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
JUMPBY(oparg * flag);
}
@@ -3697,7 +3717,7 @@ dummy_func(
op(_CREATE_INIT_FRAME, (init[1], self[1], args[oparg] -- init_frame: _PyInterpreterFrame *)) {
_PyInterpreterFrame *shim = _PyFrame_PushTrampolineUnchecked(
tstate, (PyCodeObject *)&_Py_InitCleanup, 1, frame);
- assert(_PyCode_CODE(_PyFrame_GetCode(shim))[0].op.code == EXIT_INIT_CHECK);
+ assert(_PyFrame_GetBytecode(shim)[0].op.code == EXIT_INIT_CHECK);
/* Push self onto stack of shim */
shim->localsplus[0] = PyStackRef_DUP(self[0]);
DEAD(init);
@@ -4593,7 +4613,7 @@ dummy_func(
}
specializing op(_SPECIALIZE_BINARY_OP, (counter/1, lhs, rhs -- lhs, rhs)) {
- #if ENABLE_SPECIALIZATION
+ #if ENABLE_SPECIALIZATION_FT
if (ADAPTIVE_COUNTER_TRIGGERS(counter)) {
next_instr = this_instr;
_Py_Specialize_BinaryOp(lhs, rhs, next_instr, oparg, LOCALS_ARRAY);
@@ -4601,7 +4621,7 @@ dummy_func(
}
OPCODE_DEFERRED_INC(BINARY_OP);
ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter);
- #endif /* ENABLE_SPECIALIZATION */
+ #endif /* ENABLE_SPECIALIZATION_FT */
assert(NB_ADD <= oparg);
assert(oparg <= NB_INPLACE_XOR);
}
@@ -4632,7 +4652,7 @@ dummy_func(
int original_opcode = 0;
if (tstate->tracing) {
PyCodeObject *code = _PyFrame_GetCode(frame);
- original_opcode = code->_co_monitoring->lines[(int)(this_instr - _PyCode_CODE(code))].original_opcode;
+ original_opcode = code->_co_monitoring->lines[(int)(this_instr - _PyFrame_GetBytecode(frame))].original_opcode;
next_instr = this_instr;
} else {
original_opcode = _Py_call_instrumentation_line(
@@ -4687,9 +4707,7 @@ dummy_func(
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_True);
int offset = flag * oparg;
- #if ENABLE_SPECIALIZATION
- this_instr[1].cache = (this_instr[1].cache << 1) | flag;
- #endif
+ RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH);
}
@@ -4698,9 +4716,7 @@ dummy_func(
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_False);
int offset = flag * oparg;
- #if ENABLE_SPECIALIZATION
- this_instr[1].cache = (this_instr[1].cache << 1) | flag;
- #endif
+ RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH);
}
@@ -4715,9 +4731,7 @@ dummy_func(
PyStackRef_CLOSE(value_stackref);
offset = 0;
}
- #if ENABLE_SPECIALIZATION
- this_instr[1].cache = (this_instr[1].cache << 1) | flag;
- #endif
+ RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH);
}
@@ -4815,7 +4829,7 @@ dummy_func(
tier2 op(_EXIT_TRACE, (exit_p/4 --)) {
_PyExitData *exit = (_PyExitData *)exit_p;
PyCodeObject *code = _PyFrame_GetCode(frame);
- _Py_CODEUNIT *target = _PyCode_CODE(code) + exit->target;
+ _Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target;
#if defined(Py_DEBUG) && !defined(_Py_JIT)
OPT_HIST(trace_uop_execution_counter, trace_run_length_hist);
if (lltrace >= 2) {
@@ -4823,7 +4837,7 @@ dummy_func(
_PyUOpPrint(&next_uop[-1]);
printf(", exit %u, temp %d, target %d -> %s]\n",
exit - current_executor->exits, exit->temperature.value_and_backoff,
- (int)(target - _PyCode_CODE(code)),
+ (int)(target - _PyFrame_GetBytecode(frame)),
_PyOpcode_OpName[target->op.code]);
}
#endif
@@ -4933,7 +4947,7 @@ dummy_func(
_PyUOpPrint(&next_uop[-1]);
printf(", exit %u, temp %d, target %d -> %s]\n",
exit - current_executor->exits, exit->temperature.value_and_backoff,
- (int)(target - _PyCode_CODE(_PyFrame_GetCode(frame))),
+ (int)(target - _PyFrame_GetBytecode(frame)),
_PyOpcode_OpName[target->op.code]);
}
#endif
@@ -4995,7 +5009,7 @@ dummy_func(
}
tier2 op(_ERROR_POP_N, (target/2, unused[oparg] --)) {
- frame->instr_ptr = ((_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive) + target;
+ frame->instr_ptr = _PyFrame_GetBytecode(frame) + target;
SYNC_SP();
GOTO_UNWIND();
}
diff --git a/Python/ceval.c b/Python/ceval.c
index beee5325cd6..9a608f06966 100644
--- a/Python/ceval.c
+++ b/Python/ceval.c
@@ -189,7 +189,7 @@ lltrace_instruction(_PyInterpreterFrame *frame,
dump_stack(frame, stack_pointer);
const char *opname = _PyOpcode_OpName[opcode];
assert(opname != NULL);
- int offset = (int)(next_instr - _PyCode_CODE(_PyFrame_GetCode(frame)));
+ int offset = (int)(next_instr - _PyFrame_GetBytecode(frame));
if (OPCODE_HAS_ARG((int)_PyOpcode_Deopt[opcode])) {
printf("%d: %s %d\n", offset * 2, opname, oparg);
}
@@ -841,6 +841,19 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
}
/* Because this avoids the RESUME,
* we need to update instrumentation */
+#ifdef Py_GIL_DISABLED
+ /* Load thread-local bytecode */
+ if (frame->tlbc_index != ((_PyThreadStateImpl *)tstate)->tlbc_index) {
+ _Py_CODEUNIT *bytecode =
+ _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame));
+ if (bytecode == NULL) {
+ goto error;
+ }
+ ptrdiff_t off = frame->instr_ptr - _PyFrame_GetBytecode(frame);
+ frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index;
+ frame->instr_ptr = bytecode + off;
+ }
+#endif
_Py_Instrument(_PyFrame_GetCode(frame), tstate->interp);
monitor_throw(tstate, frame, frame->instr_ptr);
/* TO DO -- Monitor throw entry. */
@@ -983,7 +996,7 @@ exception_unwind:
Python main loop. */
PyObject *exc = _PyErr_GetRaisedException(tstate);
PUSH(PyStackRef_FromPyObjectSteal(exc));
- next_instr = _PyCode_CODE(_PyFrame_GetCode(frame)) + handler;
+ next_instr = _PyFrame_GetBytecode(frame) + handler;
if (monitor_handled(tstate, frame, next_instr, exc) < 0) {
goto exception_unwind;
@@ -1045,6 +1058,8 @@ enter_tier_two:
#undef ENABLE_SPECIALIZATION
#define ENABLE_SPECIALIZATION 0
+#undef ENABLE_SPECIALIZATION_FT
+#define ENABLE_SPECIALIZATION_FT 0
#ifdef Py_DEBUG
#define DPRINTF(level, ...) \
@@ -1139,7 +1154,7 @@ exit_to_tier1_dynamic:
goto goto_to_tier1;
exit_to_tier1:
assert(next_uop[-1].format == UOP_FORMAT_TARGET);
- next_instr = next_uop[-1].target + _PyCode_CODE(_PyFrame_GetCode(frame));
+ next_instr = next_uop[-1].target + _PyFrame_GetBytecode(frame);
goto_to_tier1:
#ifdef Py_DEBUG
if (lltrace >= 2) {
@@ -1764,7 +1779,7 @@ _PyEvalFramePushAndInit(PyThreadState *tstate, _PyStackRef func,
if (frame == NULL) {
goto fail;
}
- _PyFrame_Initialize(frame, func, locals, code, 0, previous);
+ _PyFrame_Initialize(tstate, frame, func, locals, code, 0, previous);
if (initialize_locals(tstate, func_obj, frame->localsplus, args, argcount, kwnames)) {
assert(frame->owner == FRAME_OWNED_BY_THREAD);
clear_thread_frame(tstate, frame);
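
The Py_GIL_DISABLED block added above re-bases frame->instr_ptr when the frame was last run against a different thread's bytecode copy: it computes the instruction offset relative to the copy the frame currently points into, then applies that offset to the copy for the current thread. A minimal standalone sketch of that offset-preserving translation, using plain uint16_t arrays as stand-ins for _Py_CODEUNIT buffers (illustration only, not CPython types):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Translate an instruction pointer from one copy of the bytecode to the
     * same position in another copy. */
    static uint16_t *
    rebase_instr(uint16_t *instr, uint16_t *old_base, uint16_t *new_base)
    {
        ptrdiff_t off = instr - old_base;   /* offset in code units, not bytes */
        return new_base + off;
    }

    int
    main(void)
    {
        uint16_t main_copy[8] = {0};
        uint16_t thread_copy[8];
        memcpy(thread_copy, main_copy, sizeof(main_copy));

        uint16_t *ip = &main_copy[3];                 /* "frame->instr_ptr" */
        uint16_t *tip = rebase_instr(ip, main_copy, thread_copy);
        assert(tip == &thread_copy[3]);               /* same offset, new copy */
        (void)tip;
        return 0;
    }
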
diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h
index 6674c4ccf9f..5df55813a0d 100644
--- a/Python/ceval_macros.h
+++ b/Python/ceval_macros.h
@@ -151,7 +151,7 @@ GETITEM(PyObject *v, Py_ssize_t i) {
/* Code access macros */
/* The integer overflow is checked by an assertion below. */
-#define INSTR_OFFSET() ((int)(next_instr - _PyCode_CODE(_PyFrame_GetCode(frame))))
+#define INSTR_OFFSET() ((int)(next_instr - _PyFrame_GetBytecode(frame)))
#define NEXTOPARG() do { \
_Py_CODEUNIT word = {.cache = FT_ATOMIC_LOAD_UINT16_RELAXED(*(uint16_t*)next_instr)}; \
opcode = word.op.code; \
@@ -301,14 +301,6 @@ GETITEM(PyObject *v, Py_ssize_t i) {
#define ADAPTIVE_COUNTER_TRIGGERS(COUNTER) \
backoff_counter_triggers(forge_backoff_counter((COUNTER)))
-#ifdef Py_GIL_DISABLED
-#define ADVANCE_ADAPTIVE_COUNTER(COUNTER) \
- do { \
- /* gh-115999 tracks progress on addressing this. */ \
- static_assert(0, "The specializing interpreter is not yet thread-safe"); \
- } while (0);
-#define PAUSE_ADAPTIVE_COUNTER(COUNTER) ((void)COUNTER)
-#else
#define ADVANCE_ADAPTIVE_COUNTER(COUNTER) \
do { \
(COUNTER) = advance_backoff_counter((COUNTER)); \
@@ -318,6 +310,18 @@ GETITEM(PyObject *v, Py_ssize_t i) {
do { \
(COUNTER) = pause_backoff_counter((COUNTER)); \
} while (0);
+
+#ifdef ENABLE_SPECIALIZATION_FT
+/* Multiple threads may execute these concurrently if thread-local bytecode is
+ * disabled and they all execute the main copy of the bytecode. Specialization
+ * is disabled in that case so the value is unused, but the RMW cycle should be
+ * free of data races.
+ */
+#define RECORD_BRANCH_TAKEN(bitset, flag) \
+ FT_ATOMIC_STORE_UINT16_RELAXED( \
+ bitset, (FT_ATOMIC_LOAD_UINT16_RELAXED(bitset) << 1) | (flag))
+#else
+#define RECORD_BRANCH_TAKEN(bitset, flag)
#endif
#define UNBOUNDLOCAL_ERROR_MSG \
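
RECORD_BRANCH_TAKEN, added above, is a relaxed load followed by a relaxed store rather than an atomic read-modify-write: when thread-local bytecode is disabled, several threads may update the same branch-history cache, and a lost update is acceptable because specialization is off in that configuration; what matters is that the accesses cannot tear. A standalone C11 analogue of the pattern (the FT_ATOMIC_* wrappers compile to plain accesses in default builds; the _Atomic qualifier here is purely illustrative):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    /* 16-bit branch-history bitset: shift in one bit per executed branch.
     * Relaxed load + relaxed store means no torn accesses, but it is not an
     * atomic read-modify-write, so concurrent updates may overwrite each
     * other -- harmless here because the value is unused in that mode. */
    static void
    record_branch_taken(_Atomic uint16_t *bitset, int flag)
    {
        uint16_t old = atomic_load_explicit(bitset, memory_order_relaxed);
        atomic_store_explicit(bitset, (uint16_t)((old << 1) | (flag != 0)),
                              memory_order_relaxed);
    }

    int
    main(void)
    {
        _Atomic uint16_t history = 0x5555;   /* alternating warm-up bits */
        record_branch_taken(&history, 1);
        record_branch_taken(&history, 0);
        printf("history = 0x%04x\n",
               atomic_load_explicit(&history, memory_order_relaxed));
        return 0;
    }

The 0x5555 initial value mirrors the alternating warm-up pattern that _PyCode_Quicken writes into these caches (see the specialize.c hunk further down).
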
diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h
index ff4a0a52a0b..9fac4e881b8 100644
--- a/Python/executor_cases.c.h
+++ b/Python/executor_cases.c.h
@@ -41,6 +41,8 @@
/* _QUICKEN_RESUME is not a viable micro-op for tier 2 because it uses the 'this_instr' variable */
+ /* _LOAD_BYTECODE is not a viable micro-op for tier 2 because it uses the 'this_instr' variable */
+
case _RESUME_CHECK: {
#if defined(__EMSCRIPTEN__)
if (_Py_emscripten_signal_clock == 0) {
@@ -56,6 +58,13 @@
UOP_STAT_INC(uopcode, miss);
JUMP_TO_JUMP_TARGET();
}
+ #ifdef Py_GIL_DISABLED
+ if (frame->tlbc_index !=
+ ((_PyThreadStateImpl *)tstate)->tlbc_index) {
+ UOP_STAT_INC(uopcode, miss);
+ JUMP_TO_JUMP_TARGET();
+ }
+ #endif
break;
}
@@ -4480,8 +4489,8 @@
_PyFrame_SetStackPointer(frame, stack_pointer);
_PyInterpreterFrame *shim = _PyFrame_PushTrampolineUnchecked(
tstate, (PyCodeObject *)&_Py_InitCleanup, 1, frame);
+ assert(_PyFrame_GetBytecode(shim)[0].op.code == EXIT_INIT_CHECK);
stack_pointer = _PyFrame_GetStackPointer(frame);
- assert(_PyCode_CODE(_PyFrame_GetCode(shim))[0].op.code == EXIT_INIT_CHECK);
/* Push self onto stack of shim */
shim->localsplus[0] = PyStackRef_DUP(self[0]);
_PyFrame_SetStackPointer(frame, stack_pointer);
@@ -5683,7 +5692,9 @@
PyObject *exit_p = (PyObject *)CURRENT_OPERAND();
_PyExitData *exit = (_PyExitData *)exit_p;
PyCodeObject *code = _PyFrame_GetCode(frame);
- _Py_CODEUNIT *target = _PyCode_CODE(code) + exit->target;
+ _PyFrame_SetStackPointer(frame, stack_pointer);
+ _Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target;
+ stack_pointer = _PyFrame_GetStackPointer(frame);
#if defined(Py_DEBUG) && !defined(_Py_JIT)
OPT_HIST(trace_uop_execution_counter, trace_run_length_hist);
if (lltrace >= 2) {
@@ -5692,7 +5703,7 @@
_PyUOpPrint(&next_uop[-1]);
printf(", exit %u, temp %d, target %d -> %s]\n",
exit - current_executor->exits, exit->temperature.value_and_backoff,
- (int)(target - _PyCode_CODE(code)),
+ (int)(target - _PyFrame_GetBytecode(frame)),
_PyOpcode_OpName[target->op.code]);
stack_pointer = _PyFrame_GetStackPointer(frame);
}
@@ -5878,7 +5889,7 @@
_PyUOpPrint(&next_uop[-1]);
printf(", exit %u, temp %d, target %d -> %s]\n",
exit - current_executor->exits, exit->temperature.value_and_backoff,
- (int)(target - _PyCode_CODE(_PyFrame_GetCode(frame))),
+ (int)(target - _PyFrame_GetBytecode(frame)),
_PyOpcode_OpName[target->op.code]);
stack_pointer = _PyFrame_GetStackPointer(frame);
}
@@ -5956,9 +5967,11 @@
case _ERROR_POP_N: {
oparg = CURRENT_OPARG();
uint32_t target = (uint32_t)CURRENT_OPERAND();
- frame->instr_ptr = ((_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive) + target;
stack_pointer += -oparg;
assert(WITHIN_STACK_BOUNDS());
+ _PyFrame_SetStackPointer(frame, stack_pointer);
+ frame->instr_ptr = _PyFrame_GetBytecode(frame) + target;
+ stack_pointer = _PyFrame_GetStackPointer(frame);
GOTO_UNWIND();
break;
}
diff --git a/Python/frame.c b/Python/frame.c
index 35e6c2d0a93..9a865e57d97 100644
--- a/Python/frame.c
+++ b/Python/frame.c
@@ -63,7 +63,8 @@ take_ownership(PyFrameObject *f, _PyInterpreterFrame *frame)
// This may be a newly-created generator or coroutine frame. Since it's
// dead anyways, just pretend that the first RESUME ran:
PyCodeObject *code = _PyFrame_GetCode(frame);
- frame->instr_ptr = _PyCode_CODE(code) + code->_co_firsttraceable + 1;
+ frame->instr_ptr =
+ _PyFrame_GetBytecode(frame) + code->_co_firsttraceable + 1;
}
assert(!_PyFrame_IsIncomplete(frame));
assert(f->f_back == NULL);
diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c
index 1969ed608ea..986d80c18d3 100644
--- a/Python/gc_free_threading.c
+++ b/Python/gc_free_threading.c
@@ -1953,16 +1953,22 @@ custom_visitor_wrapper(const mi_heap_t *heap, const mi_heap_area_t *area,
}
void
-PyUnstable_GC_VisitObjects(gcvisitobjects_t callback, void *arg)
+_PyGC_VisitObjectsWorldStopped(PyInterpreterState *interp,
+ gcvisitobjects_t callback, void *arg)
{
- PyInterpreterState *interp = _PyInterpreterState_GET();
struct custom_visitor_args wrapper = {
.callback = callback,
.arg = arg,
};
-
- _PyEval_StopTheWorld(interp);
gc_visit_heaps(interp, &custom_visitor_wrapper, &wrapper.base);
+}
+
+void
+PyUnstable_GC_VisitObjects(gcvisitobjects_t callback, void *arg)
+{
+ PyInterpreterState *interp = _PyInterpreterState_GET();
+ _PyEval_StopTheWorld(interp);
+ _PyGC_VisitObjectsWorldStopped(interp, callback, arg);
_PyEval_StartTheWorld(interp);
}
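
The change above only splits the heap walk out of the existing stop-the-world wrapper so the collector can reuse it while the world is already stopped; the public entry point keeps its previous behaviour. For reference, a minimal caller of that unchanged public API, assuming the documented callback contract (return non-zero to continue visiting, 0 to stop):

    #include <Python.h>
    #include <stdio.h>

    /* Count every object visible to the GC heap walk. */
    static int
    count_object(PyObject *obj, void *arg)
    {
        (void)obj;
        (*(Py_ssize_t *)arg)++;
        return 1;   /* non-zero: keep visiting */
    }

    int
    main(void)
    {
        Py_Initialize();
        Py_ssize_t count = 0;
        PyUnstable_GC_VisitObjects(count_object, &count);
        printf("visited %zd objects\n", count);
        return Py_FinalizeEx();
    }
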
diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h
index 632cbc7790a..eff246f1997 100644
--- a/Python/generated_cases.c.h
+++ b/Python/generated_cases.c.h
@@ -25,7 +25,7 @@
lhs = stack_pointer[-2];
uint16_t counter = read_u16(&this_instr[1].cache);
(void)counter;
- #if ENABLE_SPECIALIZATION
+ #if ENABLE_SPECIALIZATION_FT
if (ADAPTIVE_COUNTER_TRIGGERS(counter)) {
next_instr = this_instr;
_PyFrame_SetStackPointer(frame, stack_pointer);
@@ -35,7 +35,7 @@
}
OPCODE_DEFERRED_INC(BINARY_OP);
ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter);
- #endif /* ENABLE_SPECIALIZATION */
+ #endif /* ENABLE_SPECIALIZATION_FT */
assert(NB_ADD <= oparg);
assert(oparg <= NB_INPLACE_XOR);
}
@@ -435,8 +435,8 @@
container = stack_pointer[-2];
uint16_t counter = read_u16(&this_instr[1].cache);
(void)counter;
- assert(frame->stackpointer == NULL);
#if ENABLE_SPECIALIZATION
+ assert(frame->stackpointer == NULL);
if (ADAPTIVE_COUNTER_TRIGGERS(counter)) {
next_instr = this_instr;
_PyFrame_SetStackPointer(frame, stack_pointer);
@@ -1066,8 +1066,8 @@
_PyFrame_SetStackPointer(frame, stack_pointer);
_PyInterpreterFrame *shim = _PyFrame_PushTrampolineUnchecked(
tstate, (PyCodeObject *)&_Py_InitCleanup, 1, frame);
+ assert(_PyFrame_GetBytecode(shim)[0].op.code == EXIT_INIT_CHECK);
stack_pointer = _PyFrame_GetStackPointer(frame);
- assert(_PyCode_CODE(_PyFrame_GetCode(shim))[0].op.code == EXIT_INIT_CHECK);
/* Push self onto stack of shim */
shim->localsplus[0] = PyStackRef_DUP(self[0]);
_PyFrame_SetStackPointer(frame, stack_pointer);
@@ -4711,7 +4711,9 @@
int original_opcode = 0;
if (tstate->tracing) {
PyCodeObject *code = _PyFrame_GetCode(frame);
- original_opcode = code->_co_monitoring->lines[(int)(this_instr - _PyCode_CODE(code))].original_opcode;
+ _PyFrame_SetStackPointer(frame, stack_pointer);
+ original_opcode = code->_co_monitoring->lines[(int)(this_instr - _PyFrame_GetBytecode(frame))].original_opcode;
+ stack_pointer = _PyFrame_GetStackPointer(frame);
next_instr = this_instr;
} else {
_PyFrame_SetStackPointer(frame, stack_pointer);
@@ -4759,9 +4761,7 @@
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_False);
int offset = flag * oparg;
- #if ENABLE_SPECIALIZATION
- this_instr[1].cache = (this_instr[1].cache << 1) | flag;
- #endif
+ RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH);
DISPATCH();
}
@@ -4782,9 +4782,7 @@
PyStackRef_CLOSE(value_stackref);
offset = 0;
}
- #if ENABLE_SPECIALIZATION
- this_instr[1].cache = (this_instr[1].cache << 1) | flag;
- #endif
+ RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH);
DISPATCH();
}
@@ -4822,9 +4820,7 @@
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_True);
int offset = flag * oparg;
- #if ENABLE_SPECIALIZATION
- this_instr[1].cache = (this_instr[1].cache << 1) | flag;
- #endif
+ RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH);
DISPATCH();
}
@@ -4834,6 +4830,28 @@
(void)this_instr;
next_instr += 1;
INSTRUCTION_STATS(INSTRUMENTED_RESUME);
+ // _LOAD_BYTECODE
+ {
+ #ifdef Py_GIL_DISABLED
+ if (frame->tlbc_index !=
+ ((_PyThreadStateImpl *)tstate)->tlbc_index) {
+ _PyFrame_SetStackPointer(frame, stack_pointer);
+ _Py_CODEUNIT *bytecode =
+ _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame));
+ stack_pointer = _PyFrame_GetStackPointer(frame);
+ if (bytecode == NULL) goto error;
+ _PyFrame_SetStackPointer(frame, stack_pointer);
+ int off = this_instr - _PyFrame_GetBytecode(frame);
+ stack_pointer = _PyFrame_GetStackPointer(frame);
+ frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index;
+ frame->instr_ptr = bytecode + off;
+ // Make sure this_instr gets reset correctly for any uops that
+ // follow
+ next_instr = frame->instr_ptr;
+ DISPATCH();
+ }
+ #endif
+ }
// _MAYBE_INSTRUMENT
{
if (tstate->tracing == 0) {
@@ -6646,9 +6664,7 @@
cond = stack_pointer[-1];
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_False);
- #if ENABLE_SPECIALIZATION
- this_instr[1].cache = (this_instr[1].cache << 1) | flag;
- #endif
+ RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
JUMPBY(oparg * flag);
stack_pointer += -1;
assert(WITHIN_STACK_BOUNDS());
@@ -6680,9 +6696,7 @@
cond = b;
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_True);
- #if ENABLE_SPECIALIZATION
- this_instr[1].cache = (this_instr[1].cache << 1) | flag;
- #endif
+ RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
JUMPBY(oparg * flag);
}
stack_pointer += -1;
@@ -6715,9 +6729,7 @@
cond = b;
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_False);
- #if ENABLE_SPECIALIZATION
- this_instr[1].cache = (this_instr[1].cache << 1) | flag;
- #endif
+ RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
JUMPBY(oparg * flag);
}
stack_pointer += -1;
@@ -6735,9 +6747,7 @@
cond = stack_pointer[-1];
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_True);
- #if ENABLE_SPECIALIZATION
- this_instr[1].cache = (this_instr[1].cache << 1) | flag;
- #endif
+ RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
JUMPBY(oparg * flag);
stack_pointer += -1;
assert(WITHIN_STACK_BOUNDS());
@@ -6832,7 +6842,11 @@
if (oparg) {
PyObject *lasti = PyStackRef_AsPyObjectBorrow(values[0]);
if (PyLong_Check(lasti)) {
- frame->instr_ptr = _PyCode_CODE(_PyFrame_GetCode(frame)) + PyLong_AsLong(lasti);
+ stack_pointer += -1;
+ assert(WITHIN_STACK_BOUNDS());
+ _PyFrame_SetStackPointer(frame, stack_pointer);
+ frame->instr_ptr = _PyFrame_GetBytecode(frame) + PyLong_AsLong(lasti);
+ stack_pointer = _PyFrame_GetStackPointer(frame);
assert(!_PyErr_Occurred(tstate));
}
else {
@@ -6844,6 +6858,8 @@
Py_DECREF(exc);
goto error;
}
+ stack_pointer += 1;
+ assert(WITHIN_STACK_BOUNDS());
}
assert(exc && PyExceptionInstance_Check(exc));
stack_pointer += -1;
@@ -6871,6 +6887,28 @@
PREDICTED(RESUME);
_Py_CODEUNIT* const this_instr = next_instr - 1;
(void)this_instr;
+ // _LOAD_BYTECODE
+ {
+ #ifdef Py_GIL_DISABLED
+ if (frame->tlbc_index !=
+ ((_PyThreadStateImpl *)tstate)->tlbc_index) {
+ _PyFrame_SetStackPointer(frame, stack_pointer);
+ _Py_CODEUNIT *bytecode =
+ _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame));
+ stack_pointer = _PyFrame_GetStackPointer(frame);
+ if (bytecode == NULL) goto error;
+ _PyFrame_SetStackPointer(frame, stack_pointer);
+ int off = this_instr - _PyFrame_GetBytecode(frame);
+ stack_pointer = _PyFrame_GetStackPointer(frame);
+ frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index;
+ frame->instr_ptr = bytecode + off;
+ // Make sure this_instr gets reset correctly for any uops that
+ // follow
+ next_instr = frame->instr_ptr;
+ DISPATCH();
+ }
+ #endif
+ }
// _MAYBE_INSTRUMENT
{
if (tstate->tracing == 0) {
@@ -6890,11 +6928,11 @@
}
// _QUICKEN_RESUME
{
- #if ENABLE_SPECIALIZATION
+ #if ENABLE_SPECIALIZATION_FT
if (tstate->tracing == 0 && this_instr->op.code == RESUME) {
FT_ATOMIC_STORE_UINT8_RELAXED(this_instr->op.code, RESUME_CHECK);
}
- #endif /* ENABLE_SPECIALIZATION */
+ #endif /* ENABLE_SPECIALIZATION_FT */
}
// _CHECK_PERIODIC_IF_NOT_YIELD_FROM
{
@@ -6925,6 +6963,10 @@
uintptr_t version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version);
assert((version & _PY_EVAL_EVENTS_MASK) == 0);
DEOPT_IF(eval_breaker != version, RESUME);
+ #ifdef Py_GIL_DISABLED
+ DEOPT_IF(frame->tlbc_index !=
+ ((_PyThreadStateImpl *)tstate)->tlbc_index, RESUME);
+ #endif
DISPATCH();
}
diff --git a/Python/index_pool.c b/Python/index_pool.c
new file mode 100644
index 00000000000..526eccff74a
--- /dev/null
+++ b/Python/index_pool.c
@@ -0,0 +1,193 @@
+#include <stdbool.h>
+
+#include "Python.h"
+
+#include "pycore_index_pool.h"
+#include "pycore_lock.h"
+
+#ifdef Py_GIL_DISABLED
+
+static inline void
+swap(int32_t *values, Py_ssize_t i, Py_ssize_t j)
+{
+ int32_t tmp = values[i];
+ values[i] = values[j];
+ values[j] = tmp;
+}
+
+static bool
+heap_try_swap(_PyIndexHeap *heap, Py_ssize_t i, Py_ssize_t j)
+{
+ if (i < 0 || i >= heap->size) {
+ return 0;
+ }
+ if (j < 0 || j >= heap->size) {
+ return 0;
+ }
+ if (i <= j) {
+ if (heap->values[i] <= heap->values[j]) {
+ return 0;
+ }
+ }
+ else if (heap->values[j] <= heap->values[i]) {
+ return 0;
+ }
+ swap(heap->values, i, j);
+ return 1;
+}
+
+static inline Py_ssize_t
+parent(Py_ssize_t i)
+{
+ return (i - 1) / 2;
+}
+
+static inline Py_ssize_t
+left_child(Py_ssize_t i)
+{
+ return 2 * i + 1;
+}
+
+static inline Py_ssize_t
+right_child(Py_ssize_t i)
+{
+ return 2 * i + 2;
+}
+
+static void
+heap_add(_PyIndexHeap *heap, int32_t val)
+{
+ assert(heap->size < heap->capacity);
+ // Add val to end
+ heap->values[heap->size] = val;
+ heap->size++;
+ // Sift up
+ for (Py_ssize_t cur = heap->size - 1; cur > 0; cur = parent(cur)) {
+ if (!heap_try_swap(heap, cur, parent(cur))) {
+ break;
+ }
+ }
+}
+
+static Py_ssize_t
+heap_min_child(_PyIndexHeap *heap, Py_ssize_t i)
+{
+ if (left_child(i) < heap->size) {
+ if (right_child(i) < heap->size) {
+ Py_ssize_t lval = heap->values[left_child(i)];
+ Py_ssize_t rval = heap->values[right_child(i)];
+ return lval < rval ? left_child(i) : right_child(i);
+ }
+ return left_child(i);
+ }
+ else if (right_child(i) < heap->size) {
+ return right_child(i);
+ }
+ return -1;
+}
+
+static int32_t
+heap_pop(_PyIndexHeap *heap)
+{
+ assert(heap->size > 0);
+ // Pop smallest and replace with the last element
+ int32_t result = heap->values[0];
+ heap->values[0] = heap->values[heap->size - 1];
+ heap->size--;
+ // Sift down
+ for (Py_ssize_t cur = 0; cur < heap->size;) {
+ Py_ssize_t min_child = heap_min_child(heap, cur);
+ if (min_child > -1 && heap_try_swap(heap, cur, min_child)) {
+ cur = min_child;
+ }
+ else {
+ break;
+ }
+ }
+ return result;
+}
+
+static int
+heap_ensure_capacity(_PyIndexHeap *heap, Py_ssize_t limit)
+{
+ assert(limit > 0);
+ if (heap->capacity > limit) {
+ return 0;
+ }
+ Py_ssize_t new_capacity = heap->capacity ? heap->capacity : 1024;
+ while (new_capacity && new_capacity < limit) {
+ new_capacity <<= 1;
+ }
+ if (!new_capacity) {
+ return -1;
+ }
+ int32_t *new_values = PyMem_RawCalloc(new_capacity, sizeof(int32_t));
+ if (new_values == NULL) {
+ return -1;
+ }
+ if (heap->values != NULL) {
+ memcpy(new_values, heap->values, heap->capacity * sizeof(int32_t));
+ PyMem_RawFree(heap->values);
+ }
+ heap->values = new_values;
+ heap->capacity = new_capacity;
+ return 0;
+}
+
+static void
+heap_fini(_PyIndexHeap *heap)
+{
+ if (heap->values != NULL) {
+ PyMem_RawFree(heap->values);
+ heap->values = NULL;
+ }
+ heap->size = -1;
+ heap->capacity = -1;
+}
+
+#define LOCK_POOL(pool) PyMutex_LockFlags(&pool->mutex, _Py_LOCK_DONT_DETACH)
+#define UNLOCK_POOL(pool) PyMutex_Unlock(&pool->mutex)
+
+int32_t
+_PyIndexPool_AllocIndex(_PyIndexPool *pool)
+{
+ LOCK_POOL(pool);
+ int32_t index;
+ _PyIndexHeap *free_indices = &pool->free_indices;
+ if (free_indices->size == 0) {
+ // No free indices. Make sure the heap can always store all of the
+ // indices that have been allocated to avoid having to allocate memory
+ // (which can fail) when freeing an index. Freeing indices happens when
+ // threads are being destroyed, which makes error handling awkward /
+ // impossible. This arrangement shifts handling of allocation failures
+ // to when indices are allocated, which happens at thread creation,
+ // where we are better equipped to deal with failure.
+ if (heap_ensure_capacity(free_indices, pool->next_index + 1) < 0) {
+ UNLOCK_POOL(pool);
+ PyErr_NoMemory();
+ return -1;
+ }
+ index = pool->next_index++;
+ }
+ else {
+ index = heap_pop(free_indices);
+ }
+ UNLOCK_POOL(pool);
+ return index;
+}
+
+void
+_PyIndexPool_FreeIndex(_PyIndexPool *pool, int32_t index)
+{
+ LOCK_POOL(pool);
+ heap_add(&pool->free_indices, index);
+ UNLOCK_POOL(pool);
+}
+
+void
+_PyIndexPool_Fini(_PyIndexPool *pool)
+{
+ heap_fini(&pool->free_indices);
+}
+
+#endif // Py_GIL_DISABLED
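
The pool above always hands back the smallest free index, which keeps the per-code thread-local bytecode arrays dense, and it grows the free-index heap at allocation time so that freeing an index can never fail. A hypothetical usage sketch, assuming a free-threaded Py_BUILD_CORE context where the internal pycore_index_pool.h header declares _PyIndexPool as a zero-initializable struct (as its use in this file implies):

    #include "Python.h"
    #include "pycore_index_pool.h"   /* internal-only header, assumed available */

    #include <assert.h>

    static void
    index_pool_demo(void)
    {
        _PyIndexPool pool = {0};                      /* empty pool */

        int32_t a = _PyIndexPool_AllocIndex(&pool);   /* 0 */
        int32_t b = _PyIndexPool_AllocIndex(&pool);   /* 1 */
        int32_t c = _PyIndexPool_AllocIndex(&pool);   /* 2 */

        _PyIndexPool_FreeIndex(&pool, b);             /* 1 returns to the heap */
        int32_t d = _PyIndexPool_AllocIndex(&pool);   /* smallest free: 1 again */

        assert(a == 0 && c == 2 && d == b);
        _PyIndexPool_Fini(&pool);
    }
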
diff --git a/Python/initconfig.c b/Python/initconfig.c
index c142438b02b..438f8a5c1cf 100644
--- a/Python/initconfig.c
+++ b/Python/initconfig.c
@@ -134,6 +134,7 @@ static const PyConfigSpec PYCONFIG_SPEC[] = {
SPEC(dump_refs_file, WSTR_OPT, READ_ONLY, NO_SYS),
#ifdef Py_GIL_DISABLED
SPEC(enable_gil, INT, READ_ONLY, NO_SYS),
+ SPEC(tlbc_enabled, INT, READ_ONLY, NO_SYS),
#endif
SPEC(faulthandler, BOOL, READ_ONLY, NO_SYS),
SPEC(filesystem_encoding, WSTR, READ_ONLY, NO_SYS),
@@ -315,8 +316,13 @@ The following implementation-specific options are available:\n\
"\
-X showrefcount: output the total reference count and number of used\n\
memory blocks when the program finishes or after each statement in\n\
- the interactive interpreter; only works on debug builds\n\
--X tracemalloc[=N]: trace Python memory allocations; N sets a traceback limit\n\
+ the interactive interpreter; only works on debug builds\n"
+#ifdef Py_GIL_DISABLED
+"-X tlbc=[0|1]: enable (1) or disable (0) thread-local bytecode. Also\n\
+ PYTHON_TLBC\n"
+#endif
+"\
+-X tracemalloc[=N]: trace Python memory allocations; N sets a traceback limit\n \
of N frames (default: 1); also PYTHONTRACEMALLOC=N\n\
-X utf8[=0|1]: enable (1) or disable (0) UTF-8 mode; also PYTHONUTF8\n\
-X warn_default_encoding: enable opt-in EncodingWarning for 'encoding=None';\n\
@@ -400,6 +406,9 @@ static const char usage_envvars[] =
#ifdef Py_STATS
"PYTHONSTATS : turns on statistics gathering (-X pystats)\n"
#endif
+#ifdef Py_GIL_DISABLED
+"PYTHON_TLBC : when set to 0, disables thread-local bytecode (-X tlbc)\n"
+#endif
"PYTHONTRACEMALLOC: trace Python memory allocations (-X tracemalloc)\n"
"PYTHONUNBUFFERED: disable stdout/stderr buffering (-u)\n"
"PYTHONUTF8 : control the UTF-8 mode (-X utf8)\n"
@@ -979,6 +988,7 @@ _PyConfig_InitCompatConfig(PyConfig *config)
config->cpu_count = -1;
#ifdef Py_GIL_DISABLED
config->enable_gil = _PyConfig_GIL_DEFAULT;
+ config->tlbc_enabled = 1;
#endif
}
@@ -1862,6 +1872,36 @@ error:
"n must be greater than 0");
}
+static PyStatus
+config_init_tlbc(PyConfig *config)
+{
+#ifdef Py_GIL_DISABLED
+ const char *env = config_get_env(config, "PYTHON_TLBC");
+ if (env) {
+ int enabled;
+ if (_Py_str_to_int(env, &enabled) < 0 || (enabled < 0) || (enabled > 1)) {
+ return _PyStatus_ERR(
+ "PYTHON_TLBC=N: N is missing or invalid");
+ }
+ config->tlbc_enabled = enabled;
+ }
+
+ const wchar_t *xoption = config_get_xoption(config, L"tlbc");
+ if (xoption) {
+ int enabled;
+ const wchar_t *sep = wcschr(xoption, L'=');
+ if (!sep || (config_wstr_to_int(sep + 1, &enabled) < 0) || (enabled < 0) || (enabled > 1)) {
+ return _PyStatus_ERR(
+ "-X tlbc=n: n is missing or invalid");
+ }
+ config->tlbc_enabled = enabled;
+ }
+ return _PyStatus_OK();
+#else
+ return _PyStatus_OK();
+#endif
+}
+
static PyStatus
config_init_perf_profiling(PyConfig *config)
{
@@ -2111,6 +2151,11 @@ config_read_complex_options(PyConfig *config)
}
#endif
+ status = config_init_tlbc(config);
+ if (_PyStatus_EXCEPTION(status)) {
+ return status;
+ }
+
return _PyStatus_OK();
}
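
On free-threaded builds the feature can be controlled in three equivalent ways: the PYTHON_TLBC environment variable, the -X tlbc=0|1 option parsed by config_init_tlbc() above, or, for embedders, the new tlbc_enabled field in PyConfig. A minimal embedding sketch that disables thread-local bytecode (the field only exists when Py_GIL_DISABLED is defined, hence the guard):

    #include <Python.h>

    int
    main(void)
    {
        PyConfig config;
        PyConfig_InitPythonConfig(&config);
    #ifdef Py_GIL_DISABLED
        /* All threads run the shared "main" copy of each code object's
         * bytecode; specialization is disabled in this mode. */
        config.tlbc_enabled = 0;
    #endif
        PyStatus status = Py_InitializeFromConfig(&config);
        PyConfig_Clear(&config);
        if (PyStatus_Exception(status)) {
            Py_ExitStatusException(status);
        }
        PyRun_SimpleString("import sys; print(sys.version)");
        return Py_FinalizeEx();
    }
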
diff --git a/Python/instrumentation.c b/Python/instrumentation.c
index d4568764117..87c2addaf80 100644
--- a/Python/instrumentation.c
+++ b/Python/instrumentation.c
@@ -44,10 +44,24 @@
#define UNLOCK_CODE() Py_END_CRITICAL_SECTION()
+#define MODIFY_BYTECODE(code, func, ...) \
+ do { \
+ PyCodeObject *co = (code); \
+ for (Py_ssize_t i = 0; i < code->co_tlbc->size; i++) { \
+ char *bc = co->co_tlbc->entries[i]; \
+ if (bc == NULL) { \
+ continue; \
+ } \
+ (func)((_Py_CODEUNIT *)bc, __VA_ARGS__); \
+ } \
+ } while (0)
+
#else
#define LOCK_CODE(code)
#define UNLOCK_CODE()
+#define MODIFY_BYTECODE(code, func, ...) \
+ (func)(_PyCode_CODE(code), __VA_ARGS__)
#endif
@@ -309,7 +323,8 @@ _PyInstruction_GetLength(PyCodeObject *code, int offset)
{
ASSERT_WORLD_STOPPED_OR_LOCKED(code);
- int opcode = _PyCode_CODE(code)[offset].op.code;
+ int opcode =
+ FT_ATOMIC_LOAD_UINT8_RELAXED(_PyCode_CODE(code)[offset].op.code);
assert(opcode != 0);
assert(opcode != RESERVED);
if (opcode == INSTRUMENTED_LINE) {
@@ -578,7 +593,9 @@ sanity_check_instrumentation(PyCodeObject *code)
_Py_CODEUNIT
_Py_GetBaseCodeUnit(PyCodeObject *code, int i)
{
- _Py_CODEUNIT inst = _PyCode_CODE(code)[i];
+ _Py_CODEUNIT *src_instr = _PyCode_CODE(code) + i;
+ _Py_CODEUNIT inst = {
+ .cache = FT_ATOMIC_LOAD_UINT16_RELAXED(*(uint16_t *)src_instr)};
int opcode = inst.op.code;
if (opcode < MIN_INSTRUMENTED_OPCODE) {
inst.op.code = _PyOpcode_Deopt[opcode];
@@ -614,21 +631,22 @@ _Py_GetBaseCodeUnit(PyCodeObject *code, int i)
}
static void
-de_instrument(PyCodeObject *code, int i, int event)
+de_instrument(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i,
+ int event)
{
assert(event != PY_MONITORING_EVENT_INSTRUCTION);
assert(event != PY_MONITORING_EVENT_LINE);
- _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i];
+ _Py_CODEUNIT *instr = &bytecode[i];
uint8_t *opcode_ptr = &instr->op.code;
int opcode = *opcode_ptr;
assert(opcode != ENTER_EXECUTOR);
if (opcode == INSTRUMENTED_LINE) {
- opcode_ptr = &code->_co_monitoring->lines[i].original_opcode;
+ opcode_ptr = &monitoring->lines[i].original_opcode;
opcode = *opcode_ptr;
}
if (opcode == INSTRUMENTED_INSTRUCTION) {
- opcode_ptr = &code->_co_monitoring->per_instruction_opcodes[i];
+ opcode_ptr = &monitoring->per_instruction_opcodes[i];
opcode = *opcode_ptr;
}
int deinstrumented = DE_INSTRUMENT[opcode];
@@ -644,65 +662,68 @@ de_instrument(PyCodeObject *code, int i, int event)
}
static void
-de_instrument_line(PyCodeObject *code, int i)
+de_instrument_line(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring,
+ int i)
{
- _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i];
+ _Py_CODEUNIT *instr = &bytecode[i];
int opcode = instr->op.code;
if (opcode != INSTRUMENTED_LINE) {
return;
}
- _PyCoLineInstrumentationData *lines = &code->_co_monitoring->lines[i];
+ _PyCoLineInstrumentationData *lines = &monitoring->lines[i];
int original_opcode = lines->original_opcode;
if (original_opcode == INSTRUMENTED_INSTRUCTION) {
- lines->original_opcode = code->_co_monitoring->per_instruction_opcodes[i];
+ lines->original_opcode = monitoring->per_instruction_opcodes[i];
}
CHECK(original_opcode != 0);
CHECK(original_opcode == _PyOpcode_Deopt[original_opcode]);
- instr->op.code = original_opcode;
+ FT_ATOMIC_STORE_UINT8(instr->op.code, original_opcode);
if (_PyOpcode_Caches[original_opcode]) {
- instr[1].counter = adaptive_counter_warmup();
+ FT_ATOMIC_STORE_UINT16_RELAXED(instr[1].counter.value_and_backoff,
+ adaptive_counter_warmup().value_and_backoff);
}
assert(instr->op.code != INSTRUMENTED_LINE);
}
static void
-de_instrument_per_instruction(PyCodeObject *code, int i)
+de_instrument_per_instruction(_Py_CODEUNIT *bytecode,
+ _PyCoMonitoringData *monitoring, int i)
{
- _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i];
+ _Py_CODEUNIT *instr = &bytecode[i];
uint8_t *opcode_ptr = &instr->op.code;
int opcode = *opcode_ptr;
if (opcode == INSTRUMENTED_LINE) {
- opcode_ptr = &code->_co_monitoring->lines[i].original_opcode;
+ opcode_ptr = &monitoring->lines[i].original_opcode;
opcode = *opcode_ptr;
}
if (opcode != INSTRUMENTED_INSTRUCTION) {
return;
}
- int original_opcode = code->_co_monitoring->per_instruction_opcodes[i];
+ int original_opcode = monitoring->per_instruction_opcodes[i];
CHECK(original_opcode != 0);
CHECK(original_opcode == _PyOpcode_Deopt[original_opcode]);
- *opcode_ptr = original_opcode;
+ FT_ATOMIC_STORE_UINT8_RELAXED(*opcode_ptr, original_opcode);
if (_PyOpcode_Caches[original_opcode]) {
- instr[1].counter = adaptive_counter_warmup();
+ FT_ATOMIC_STORE_UINT16_RELAXED(instr[1].counter.value_and_backoff,
+ adaptive_counter_warmup().value_and_backoff);
}
assert(*opcode_ptr != INSTRUMENTED_INSTRUCTION);
assert(instr->op.code != INSTRUMENTED_INSTRUCTION);
}
-
static void
-instrument(PyCodeObject *code, int i)
+instrument(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i)
{
- _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i];
+ _Py_CODEUNIT *instr = &bytecode[i];
uint8_t *opcode_ptr = &instr->op.code;
int opcode =*opcode_ptr;
if (opcode == INSTRUMENTED_LINE) {
- _PyCoLineInstrumentationData *lines = &code->_co_monitoring->lines[i];
+ _PyCoLineInstrumentationData *lines = &monitoring->lines[i];
opcode_ptr = &lines->original_opcode;
opcode = *opcode_ptr;
}
if (opcode == INSTRUMENTED_INSTRUCTION) {
- opcode_ptr = &code->_co_monitoring->per_instruction_opcodes[i];
+ opcode_ptr = &monitoring->per_instruction_opcodes[i];
opcode = *opcode_ptr;
CHECK(opcode != INSTRUMENTED_INSTRUCTION && opcode != INSTRUMENTED_LINE);
CHECK(opcode == _PyOpcode_Deopt[opcode]);
@@ -716,52 +737,52 @@ instrument(PyCodeObject *code, int i)
if (_PyOpcode_Caches[deopt]) {
FT_ATOMIC_STORE_UINT16_RELAXED(instr[1].counter.value_and_backoff,
adaptive_counter_warmup().value_and_backoff);
- instr[1].counter = adaptive_counter_warmup();
}
}
}
static void
-instrument_line(PyCodeObject *code, int i)
+instrument_line(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i)
{
- uint8_t *opcode_ptr = &_PyCode_CODE(code)[i].op.code;
+ uint8_t *opcode_ptr = &bytecode[i].op.code;
int opcode = *opcode_ptr;
if (opcode == INSTRUMENTED_LINE) {
return;
}
- _PyCoLineInstrumentationData *lines = &code->_co_monitoring->lines[i];
+ _PyCoLineInstrumentationData *lines = &monitoring->lines[i];
lines->original_opcode = _PyOpcode_Deopt[opcode];
CHECK(lines->original_opcode > 0);
- *opcode_ptr = INSTRUMENTED_LINE;
+ FT_ATOMIC_STORE_UINT8_RELAXED(*opcode_ptr, INSTRUMENTED_LINE);
}
static void
-instrument_per_instruction(PyCodeObject *code, int i)
+instrument_per_instruction(_Py_CODEUNIT *bytecode,
+ _PyCoMonitoringData *monitoring, int i)
{
- _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i];
+ _Py_CODEUNIT *instr = &bytecode[i];
uint8_t *opcode_ptr = &instr->op.code;
int opcode = *opcode_ptr;
if (opcode == INSTRUMENTED_LINE) {
- _PyCoLineInstrumentationData *lines = &code->_co_monitoring->lines[i];
+ _PyCoLineInstrumentationData *lines = &monitoring->lines[i];
opcode_ptr = &lines->original_opcode;
opcode = *opcode_ptr;
}
if (opcode == INSTRUMENTED_INSTRUCTION) {
- assert(code->_co_monitoring->per_instruction_opcodes[i] > 0);
+ assert(monitoring->per_instruction_opcodes[i] > 0);
return;
}
CHECK(opcode != 0);
if (is_instrumented(opcode)) {
- code->_co_monitoring->per_instruction_opcodes[i] = opcode;
+ monitoring->per_instruction_opcodes[i] = opcode;
}
else {
assert(opcode != 0);
assert(_PyOpcode_Deopt[opcode] != 0);
assert(_PyOpcode_Deopt[opcode] != RESUME);
- code->_co_monitoring->per_instruction_opcodes[i] = _PyOpcode_Deopt[opcode];
+ monitoring->per_instruction_opcodes[i] = _PyOpcode_Deopt[opcode];
}
- assert(code->_co_monitoring->per_instruction_opcodes[i] > 0);
- *opcode_ptr = INSTRUMENTED_INSTRUCTION;
+ assert(monitoring->per_instruction_opcodes[i] > 0);
+ FT_ATOMIC_STORE_UINT8_RELAXED(*opcode_ptr, INSTRUMENTED_INSTRUCTION);
}
static void
@@ -773,19 +794,19 @@ remove_tools(PyCodeObject * code, int offset, int event, int tools)
assert(PY_MONITORING_IS_INSTRUMENTED_EVENT(event));
assert(opcode_has_event(_Py_GetBaseCodeUnit(code, offset).op.code));
_PyCoMonitoringData *monitoring = code->_co_monitoring;
+ bool should_de_instrument;
if (monitoring && monitoring->tools) {
monitoring->tools[offset] &= ~tools;
- if (monitoring->tools[offset] == 0) {
- de_instrument(code, offset, event);
- }
+ should_de_instrument = (monitoring->tools[offset] == 0);
}
else {
/* Single tool */
uint8_t single_tool = code->_co_monitoring->active_monitors.tools[event];
assert(_Py_popcount32(single_tool) <= 1);
- if (((single_tool & tools) == single_tool)) {
- de_instrument(code, offset, event);
- }
+ should_de_instrument = ((single_tool & tools) == single_tool);
+ }
+ if (should_de_instrument) {
+ MODIFY_BYTECODE(code, de_instrument, monitoring, offset, event);
}
}
@@ -804,22 +825,23 @@ remove_line_tools(PyCodeObject * code, int offset, int tools)
{
ASSERT_WORLD_STOPPED_OR_LOCKED(code);
- assert(code->_co_monitoring);
- if (code->_co_monitoring->line_tools)
+ _PyCoMonitoringData *monitoring = code->_co_monitoring;
+ assert(monitoring);
+ bool should_de_instrument;
+ if (monitoring->line_tools)
{
- uint8_t *toolsptr = &code->_co_monitoring->line_tools[offset];
+ uint8_t *toolsptr = &monitoring->line_tools[offset];
*toolsptr &= ~tools;
- if (*toolsptr == 0 ) {
- de_instrument_line(code, offset);
- }
+ should_de_instrument = (*toolsptr == 0);
}
else {
/* Single tool */
- uint8_t single_tool = code->_co_monitoring->active_monitors.tools[PY_MONITORING_EVENT_LINE];
+ uint8_t single_tool = monitoring->active_monitors.tools[PY_MONITORING_EVENT_LINE];
assert(_Py_popcount32(single_tool) <= 1);
- if (((single_tool & tools) == single_tool)) {
- de_instrument_line(code, offset);
- }
+ should_de_instrument = ((single_tool & tools) == single_tool);
+ }
+ if (should_de_instrument) {
+ MODIFY_BYTECODE(code, de_instrument_line, monitoring, offset);
}
}
@@ -841,7 +863,7 @@ add_tools(PyCodeObject * code, int offset, int event, int tools)
assert(_Py_popcount32(tools) == 1);
assert(tools_is_subset_for_event(code, event, tools));
}
- instrument(code, offset);
+ MODIFY_BYTECODE(code, instrument, code->_co_monitoring, offset);
}
static void
@@ -858,7 +880,7 @@ add_line_tools(PyCodeObject * code, int offset, int tools)
/* Single tool */
assert(_Py_popcount32(tools) == 1);
}
- instrument_line(code, offset);
+ MODIFY_BYTECODE(code, instrument_line, code->_co_monitoring, offset);
}
@@ -876,7 +898,7 @@ add_per_instruction_tools(PyCodeObject * code, int offset, int tools)
/* Single tool */
assert(_Py_popcount32(tools) == 1);
}
- instrument_per_instruction(code, offset);
+ MODIFY_BYTECODE(code, instrument_per_instruction, code->_co_monitoring, offset);
}
@@ -885,21 +907,22 @@ remove_per_instruction_tools(PyCodeObject * code, int offset, int tools)
{
ASSERT_WORLD_STOPPED_OR_LOCKED(code);
+ _PyCoMonitoringData *monitoring = code->_co_monitoring;
assert(code->_co_monitoring);
+ bool should_de_instrument;
if (code->_co_monitoring->per_instruction_tools) {
uint8_t *toolsptr = &code->_co_monitoring->per_instruction_tools[offset];
*toolsptr &= ~tools;
- if (*toolsptr == 0) {
- de_instrument_per_instruction(code, offset);
- }
+ should_de_instrument = (*toolsptr == 0);
}
else {
/* Single tool */
uint8_t single_tool = code->_co_monitoring->active_monitors.tools[PY_MONITORING_EVENT_INSTRUCTION];
assert(_Py_popcount32(single_tool) <= 1);
- if (((single_tool & tools) == single_tool)) {
- de_instrument_per_instruction(code, offset);
- }
+ should_de_instrument = ((single_tool & tools) == single_tool);
+ }
+ if (should_de_instrument) {
+ MODIFY_BYTECODE(code, de_instrument_per_instruction, monitoring, offset);
}
}
@@ -1087,7 +1110,7 @@ call_instrumentation_vector(
PyCodeObject *code = _PyFrame_GetCode(frame);
assert(args[1] == NULL);
args[1] = (PyObject *)code;
- int offset = (int)(instr - _PyCode_CODE(code));
+ int offset = (int)(instr - _PyFrame_GetBytecode(frame));
/* Offset visible to user should be the offset in bytes, as that is the
* convention for APIs involving code offsets. */
int bytes_offset = offset * (int)sizeof(_Py_CODEUNIT);
@@ -1173,8 +1196,7 @@ _Py_call_instrumentation_jump(
assert(event == PY_MONITORING_EVENT_JUMP ||
event == PY_MONITORING_EVENT_BRANCH);
assert(frame->instr_ptr == instr);
- PyCodeObject *code = _PyFrame_GetCode(frame);
- int to = (int)(target - _PyCode_CODE(code));
+ int to = (int)(target - _PyFrame_GetBytecode(frame));
PyObject *to_obj = PyLong_FromLong(to * (int)sizeof(_Py_CODEUNIT));
if (to_obj == NULL) {
return NULL;
@@ -1240,7 +1262,8 @@ _Py_call_instrumentation_line(PyThreadState *tstate, _PyInterpreterFrame* frame,
PyCodeObject *code = _PyFrame_GetCode(frame);
assert(tstate->tracing == 0);
assert(debug_check_sanity(tstate->interp, code));
- int i = (int)(instr - _PyCode_CODE(code));
+ _Py_CODEUNIT *bytecode = _PyFrame_GetBytecode(frame);
+ int i = (int)(instr - bytecode);
_PyCoMonitoringData *monitoring = code->_co_monitoring;
_PyCoLineInstrumentationData *line_data = &monitoring->lines[i];
@@ -1256,10 +1279,10 @@ _Py_call_instrumentation_line(PyThreadState *tstate, _PyInterpreterFrame* frame,
line = compute_line(code, i, line_delta);
assert(line >= 0);
assert(prev != NULL);
- int prev_index = (int)(prev - _PyCode_CODE(code));
+ int prev_index = (int)(prev - bytecode);
int prev_line = _Py_Instrumentation_GetLine(code, prev_index);
if (prev_line == line) {
- int prev_opcode = _PyCode_CODE(code)[prev_index].op.code;
+ int prev_opcode = bytecode[prev_index].op.code;
/* RESUME and INSTRUMENTED_RESUME are needed for the operation of
* instrumentation, so must never be hidden by an INSTRUMENTED_LINE.
*/
@@ -1359,7 +1382,7 @@ int
_Py_call_instrumentation_instruction(PyThreadState *tstate, _PyInterpreterFrame* frame, _Py_CODEUNIT *instr)
{
PyCodeObject *code = _PyFrame_GetCode(frame);
- int offset = (int)(instr - _PyCode_CODE(code));
+ int offset = (int)(instr - _PyFrame_GetBytecode(frame));
_PyCoMonitoringData *instrumentation_data = code->_co_monitoring;
assert(instrumentation_data->per_instruction_opcodes);
int next_opcode = instrumentation_data->per_instruction_opcodes[offset];
diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h
index 0a7e44ef78d..54821b23716 100644
--- a/Python/optimizer_cases.c.h
+++ b/Python/optimizer_cases.c.h
@@ -17,6 +17,8 @@
/* _QUICKEN_RESUME is not a viable micro-op for tier 2 */
+ /* _LOAD_BYTECODE is not a viable micro-op for tier 2 */
+
case _RESUME_CHECK: {
break;
}
diff --git a/Python/pystate.c b/Python/pystate.c
index 36b31f3b9e4..ded5fde9c4b 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -1513,6 +1513,11 @@ new_threadstate(PyInterpreterState *interp, int whence)
PyMem_RawFree(new_tstate);
return NULL;
}
+ int32_t tlbc_idx = _Py_ReserveTLBCIndex(interp);
+ if (tlbc_idx < 0) {
+ PyMem_RawFree(new_tstate);
+ return NULL;
+ }
#endif
/* We serialize concurrent creation to protect global state. */
@@ -1555,6 +1560,7 @@ new_threadstate(PyInterpreterState *interp, int whence)
#ifdef Py_GIL_DISABLED
// Must be called with lock unlocked to avoid lock ordering deadlocks.
_Py_qsbr_register(tstate, interp, qsbr_idx);
+ tstate->tlbc_index = tlbc_idx;
#endif
return (PyThreadState *)tstate;
@@ -1706,6 +1712,10 @@ PyThreadState_Clear(PyThreadState *tstate)
// Remove ourself from the biased reference counting table of threads.
_Py_brc_remove_thread(tstate);
+
+ // Release our thread-local copies of the bytecode for reuse by another
+ // thread
+ _Py_ClearTLBCIndex((_PyThreadStateImpl *)tstate);
#endif
// Merge our queue of pointers to be freed into the interpreter queue.
diff --git a/Python/specialize.c b/Python/specialize.c
index ae47809305a..86cb997ca2c 100644
--- a/Python/specialize.c
+++ b/Python/specialize.c
@@ -24,6 +24,25 @@ extern const char *_PyUOpName(int index);
* ./adaptive.md
*/
+#ifdef Py_GIL_DISABLED
+#define SET_OPCODE_OR_RETURN(instr, opcode) \
+ do { \
+ uint8_t old_op = _Py_atomic_load_uint8_relaxed(&(instr)->op.code); \
+ if (old_op >= MIN_INSTRUMENTED_OPCODE) { \
+ /* Lost race with instrumentation */ \
+ return; \
+ } \
+ if (!_Py_atomic_compare_exchange_uint8(&(instr)->op.code, &old_op, \
+ (opcode))) { \
+ /* Lost race with instrumentation */ \
+ assert(old_op >= MIN_INSTRUMENTED_OPCODE); \
+ return; \
+ } \
+ } while (0)
+#else
+#define SET_OPCODE_OR_RETURN(instr, opcode) (instr)->op.code = (opcode)
+#endif
+
#ifdef Py_STATS
GCStats _py_gc_stats[NUM_GENERATIONS] = { 0 };
static PyStats _Py_stats_struct = { .gc_stats = _py_gc_stats };
@@ -436,16 +455,25 @@ do { \
# define SPECIALIZATION_FAIL(opcode, kind) ((void)0)
#endif
-// Initialize warmup counters and insert superinstructions. This cannot fail.
+// Initialize warmup counters and optimize instructions. This cannot fail.
void
-_PyCode_Quicken(PyCodeObject *code)
+_PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size, PyObject *consts,
+ int enable_counters)
{
- #if ENABLE_SPECIALIZATION
+ #if ENABLE_SPECIALIZATION_FT
+ _Py_BackoffCounter jump_counter, adaptive_counter;
+ if (enable_counters) {
+ jump_counter = initial_jump_backoff_counter();
+ adaptive_counter = adaptive_counter_warmup();
+ }
+ else {
+ jump_counter = initial_unreachable_backoff_counter();
+ adaptive_counter = initial_unreachable_backoff_counter();
+ }
int opcode = 0;
int oparg = 0;
- _Py_CODEUNIT *instructions = _PyCode_CODE(code);
/* The last code unit cannot have a cache, so we don't need to check it */
- for (int i = 0; i < Py_SIZE(code)-1; i++) {
+ for (Py_ssize_t i = 0; i < size-1; i++) {
opcode = instructions[i].op.code;
int caches = _PyOpcode_Caches[opcode];
oparg = (oparg << 8) | instructions[i].op.arg;
@@ -453,7 +481,7 @@ _PyCode_Quicken(PyCodeObject *code)
// The initial value depends on the opcode
switch (opcode) {
case JUMP_BACKWARD:
- instructions[i + 1].counter = initial_jump_backoff_counter();
+ instructions[i + 1].counter = jump_counter;
break;
case POP_JUMP_IF_FALSE:
case POP_JUMP_IF_TRUE:
@@ -462,7 +490,7 @@ _PyCode_Quicken(PyCodeObject *code)
instructions[i + 1].cache = 0x5555; // Alternating 0, 1 bits
break;
default:
- instructions[i + 1].counter = adaptive_counter_warmup();
+ instructions[i + 1].counter = adaptive_counter;
break;
}
i += caches;
@@ -471,7 +499,7 @@ _PyCode_Quicken(PyCodeObject *code)
/* We can't do this in the bytecode compiler as
* marshalling can intern strings and make them immortal. */
- PyObject *obj = PyTuple_GET_ITEM(code->co_consts, oparg);
+ PyObject *obj = PyTuple_GET_ITEM(consts, oparg);
if (_Py_IsImmortal(obj)) {
instructions[i].op.code = LOAD_CONST_IMMORTAL;
}
@@ -480,7 +508,7 @@ _PyCode_Quicken(PyCodeObject *code)
oparg = 0;
}
}
- #endif /* ENABLE_SPECIALIZATION */
+ #endif /* ENABLE_SPECIALIZATION_FT */
}
#define SIMPLE_FUNCTION 0
@@ -2243,9 +2271,10 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in
{
PyObject *lhs = PyStackRef_AsPyObjectBorrow(lhs_st);
PyObject *rhs = PyStackRef_AsPyObjectBorrow(rhs_st);
- assert(ENABLE_SPECIALIZATION);
+ assert(ENABLE_SPECIALIZATION_FT);
assert(_PyOpcode_Caches[BINARY_OP] == INLINE_CACHE_ENTRIES_BINARY_OP);
_PyBinaryOpCache *cache = (_PyBinaryOpCache *)(instr + 1);
+ uint8_t specialized_op;
switch (oparg) {
case NB_ADD:
case NB_INPLACE_ADD:
@@ -2256,18 +2285,18 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in
_Py_CODEUNIT next = instr[INLINE_CACHE_ENTRIES_BINARY_OP + 1];
bool to_store = (next.op.code == STORE_FAST);
if (to_store && PyStackRef_AsPyObjectBorrow(locals[next.op.arg]) == lhs) {
- instr->op.code = BINARY_OP_INPLACE_ADD_UNICODE;
+ specialized_op = BINARY_OP_INPLACE_ADD_UNICODE;
goto success;
}
- instr->op.code = BINARY_OP_ADD_UNICODE;
+ specialized_op = BINARY_OP_ADD_UNICODE;
goto success;
}
if (PyLong_CheckExact(lhs)) {
- instr->op.code = BINARY_OP_ADD_INT;
+ specialized_op = BINARY_OP_ADD_INT;
goto success;
}
if (PyFloat_CheckExact(lhs)) {
- instr->op.code = BINARY_OP_ADD_FLOAT;
+ specialized_op = BINARY_OP_ADD_FLOAT;
goto success;
}
break;
@@ -2277,11 +2306,11 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in
break;
}
if (PyLong_CheckExact(lhs)) {
- instr->op.code = BINARY_OP_MULTIPLY_INT;
+ specialized_op = BINARY_OP_MULTIPLY_INT;
goto success;
}
if (PyFloat_CheckExact(lhs)) {
- instr->op.code = BINARY_OP_MULTIPLY_FLOAT;
+ specialized_op = BINARY_OP_MULTIPLY_FLOAT;
goto success;
}
break;
@@ -2291,22 +2320,23 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in
break;
}
if (PyLong_CheckExact(lhs)) {
- instr->op.code = BINARY_OP_SUBTRACT_INT;
+ specialized_op = BINARY_OP_SUBTRACT_INT;
goto success;
}
if (PyFloat_CheckExact(lhs)) {
- instr->op.code = BINARY_OP_SUBTRACT_FLOAT;
+ specialized_op = BINARY_OP_SUBTRACT_FLOAT;
goto success;
}
break;
}
SPECIALIZATION_FAIL(BINARY_OP, binary_op_fail_kind(oparg, lhs, rhs));
STAT_INC(BINARY_OP, failure);
- instr->op.code = BINARY_OP;
+ SET_OPCODE_OR_RETURN(instr, BINARY_OP);
cache->counter = adaptive_counter_backoff(cache->counter);
return;
success:
STAT_INC(BINARY_OP, success);
+ SET_OPCODE_OR_RETURN(instr, specialized_op);
cache->counter = adaptive_counter_cooldown();
}
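
SET_OPCODE_OR_RETURN, introduced above, publishes a specialized opcode with a compare-exchange so the specializer silently backs off whenever instrumentation has already replaced the opcode. A standalone C11 sketch of that lost-the-race pattern; MIN_INSTRUMENTED below is a stand-in threshold, not CPython's MIN_INSTRUMENTED_OPCODE value:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define MIN_INSTRUMENTED 237   /* stand-in threshold, illustrative only */

    /* Publish a specialized opcode unless instrumentation got there first.
     * Returns true when the new opcode was written. */
    static bool
    set_opcode_if_not_instrumented(_Atomic uint8_t *op, uint8_t specialized)
    {
        uint8_t old = atomic_load_explicit(op, memory_order_relaxed);
        if (old >= MIN_INSTRUMENTED) {
            return false;                   /* already instrumented: back off */
        }
        /* Fails, and we back off, if another thread changed the opcode. */
        return atomic_compare_exchange_strong(op, &old, specialized);
    }

    int
    main(void)
    {
        _Atomic uint8_t opcode = 45;        /* arbitrary adaptive opcode */
        (void)set_opcode_if_not_instrumented(&opcode, 122);
        return 0;
    }
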
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index a4abd7c3c45..a086bb979ef 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -2174,6 +2174,11 @@ sys__clear_internal_caches_impl(PyObject *module)
#ifdef _Py_TIER2
PyInterpreterState *interp = _PyInterpreterState_GET();
_Py_Executors_InvalidateAll(interp, 0);
+#endif
+#ifdef Py_GIL_DISABLED
+ if (_Py_ClearUnusedTLBC(_PyInterpreterState_GET()) < 0) {
+ return NULL;
+ }
#endif
PyType_ClearCache();
Py_RETURN_NONE;
diff --git a/Tools/gdb/libpython.py b/Tools/gdb/libpython.py
index 946af4be1a7..ed254152d7d 100755
--- a/Tools/gdb/libpython.py
+++ b/Tools/gdb/libpython.py
@@ -77,6 +77,10 @@ def _managed_dict_offset():
else:
return -3 * _sizeof_void_p()
+def _interp_frame_has_tlbc_index():
+ interp_frame = gdb.lookup_type("_PyInterpreterFrame")
+ return any(field.name == "tlbc_index" for field in interp_frame.fields())
+
Py_TPFLAGS_INLINE_VALUES = (1 << 2)
Py_TPFLAGS_MANAGED_DICT = (1 << 4)
@@ -105,6 +109,8 @@ FRAME_INFO_OPTIMIZED_OUT = '(frame information optimized out)'
UNABLE_READ_INFO_PYTHON_FRAME = 'Unable to read information on python frame'
EVALFRAME = '_PyEval_EvalFrameDefault'
+INTERP_FRAME_HAS_TLBC_INDEX = _interp_frame_has_tlbc_index()
+
class NullPyObjectPtr(RuntimeError):
pass
@@ -693,6 +699,16 @@ def parse_location_table(firstlineno, linetable):
yield addr, end_addr, line
addr = end_addr
+
+class PyCodeArrayPtr:
+ def __init__(self, gdbval):
+ self._gdbval = gdbval
+
+ def get_entry(self, index):
+ assert (index >= 0) and (index < self._gdbval["size"])
+ return self._gdbval["entries"][index]
+
+
class PyCodeObjectPtr(PyObjectPtr):
"""
Class wrapping a gdb.Value that's a PyCodeObject* i.e. a instance
@@ -1085,7 +1101,12 @@ class PyFramePtr:
def _f_lasti(self):
codeunit_p = gdb.lookup_type("_Py_CODEUNIT").pointer()
instr_ptr = self._gdbval["instr_ptr"]
- first_instr = self._f_code().field("co_code_adaptive").cast(codeunit_p)
+ if INTERP_FRAME_HAS_TLBC_INDEX:
+ tlbc_index = self._gdbval["tlbc_index"]
+ code_arr = PyCodeArrayPtr(self._f_code().field("co_tlbc"))
+ first_instr = code_arr.get_entry(tlbc_index).cast(codeunit_p)
+ else:
+ first_instr = self._f_code().field("co_code_adaptive").cast(codeunit_p)
return int(instr_ptr - first_instr)
def is_shim(self):