diff --git a/Include/cpython/code.h b/Include/cpython/code.h index 2561b2b88ba..370f1d259ab 100644 --- a/Include/cpython/code.h +++ b/Include/cpython/code.h @@ -72,6 +72,24 @@ typedef struct { uint8_t *per_instruction_tools; } _PyCoMonitoringData; +#ifdef Py_GIL_DISABLED + +/* Each thread specializes a thread-local copy of the bytecode in free-threaded + * builds. These copies are stored on the code object in a `_PyCodeArray`. The + * first entry in the array always points to the "main" copy of the bytecode + * that is stored at the end of the code object. + */ +typedef struct { + Py_ssize_t size; + char *entries[1]; +} _PyCodeArray; + +#define _PyCode_DEF_THREAD_LOCAL_BYTECODE() \ + _PyCodeArray *co_tlbc; +#else +#define _PyCode_DEF_THREAD_LOCAL_BYTECODE() +#endif + // To avoid repeating ourselves in deepfreeze.py, all PyCodeObject members are // defined in this macro: #define _PyCode_DEF(SIZE) { \ @@ -138,6 +156,7 @@ typedef struct { Type is a void* to keep the format private in codeobject.c to force \ people to go through the proper APIs. */ \ void *co_extra; \ + _PyCode_DEF_THREAD_LOCAL_BYTECODE() \ char co_code_adaptive[(SIZE)]; \ } diff --git a/Include/cpython/initconfig.h b/Include/cpython/initconfig.h index c2cb4e3cdd9..f69c586a4f9 100644 --- a/Include/cpython/initconfig.h +++ b/Include/cpython/initconfig.h @@ -183,6 +183,7 @@ typedef struct PyConfig { int cpu_count; #ifdef Py_GIL_DISABLED int enable_gil; + int tlbc_enabled; #endif /* --- Path configuration inputs ------------ */ diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h index 411bbff106d..80bd19a8878 100644 --- a/Include/internal/pycore_ceval.h +++ b/Include/internal/pycore_ceval.h @@ -174,6 +174,18 @@ _PyEval_IsGILEnabled(PyThreadState *tstate) extern int _PyEval_EnableGILTransient(PyThreadState *tstate); extern int _PyEval_EnableGILPermanent(PyThreadState *tstate); extern int _PyEval_DisableGIL(PyThreadState *state); + + +static inline _Py_CODEUNIT * +_PyEval_GetExecutableCode(PyThreadState *tstate, PyCodeObject *co) +{ + _Py_CODEUNIT *bc = _PyCode_GetTLBCFast(tstate, co); + if (bc != NULL) { + return bc; + } + return _PyCode_GetTLBC(co); +} + #endif extern void _PyEval_DeactivateOpCache(void); diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 57e0a14bb9b..a0acf76db6f 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -11,6 +11,7 @@ extern "C" { #include "pycore_stackref.h" // _PyStackRef #include "pycore_lock.h" // PyMutex #include "pycore_backoff.h" // _Py_BackoffCounter +#include "pycore_tstate.h" // _PyThreadStateImpl /* Each instruction in a code object is a fixed-width value, @@ -313,11 +314,17 @@ extern int _PyLineTable_PreviousAddressRange(PyCodeAddressRange *range); /** API for executors */ extern void _PyCode_Clear_Executors(PyCodeObject *code); + #ifdef Py_GIL_DISABLED // gh-115999 tracks progress on addressing this. #define ENABLE_SPECIALIZATION 0 +// Use this to enable specialization families once they are thread-safe. All +// uses will be replaced with ENABLE_SPECIALIZATION once all families are +// thread-safe. +#define ENABLE_SPECIALIZATION_FT 1 #else #define ENABLE_SPECIALIZATION 1 +#define ENABLE_SPECIALIZATION_FT ENABLE_SPECIALIZATION #endif /* Specialization functions */ @@ -600,6 +607,40 @@ struct _PyCode8 _PyCode_DEF(8); PyAPI_DATA(const struct _PyCode8) _Py_InitCleanup; +#ifdef Py_GIL_DISABLED + +// Return a pointer to the thread-local bytecode for the current thread, if it +// exists. +static inline _Py_CODEUNIT * +_PyCode_GetTLBCFast(PyThreadState *tstate, PyCodeObject *co) +{ + _PyCodeArray *code = _Py_atomic_load_ptr_acquire(&co->co_tlbc); + int32_t idx = ((_PyThreadStateImpl*) tstate)->tlbc_index; + if (idx < code->size && code->entries[idx] != NULL) { + return (_Py_CODEUNIT *) code->entries[idx]; + } + return NULL; +} + +// Return a pointer to the thread-local bytecode for the current thread, +// creating it if necessary. +extern _Py_CODEUNIT *_PyCode_GetTLBC(PyCodeObject *co); + +// Reserve an index for the current thread into thread-local bytecode +// arrays +// +// Returns the reserved index or -1 on error. +extern int32_t _Py_ReserveTLBCIndex(PyInterpreterState *interp); + +// Release the current thread's index into thread-local bytecode arrays +extern void _Py_ClearTLBCIndex(_PyThreadStateImpl *tstate); + +// Free all TLBC copies not associated with live threads. +// +// Returns 0 on success or -1 on error. +extern int _Py_ClearUnusedTLBC(PyInterpreterState *interp); +#endif + #ifdef __cplusplus } #endif diff --git a/Include/internal/pycore_frame.h b/Include/internal/pycore_frame.h index c9ac3819d03..8c0100390d0 100644 --- a/Include/internal/pycore_frame.h +++ b/Include/internal/pycore_frame.h @@ -68,6 +68,10 @@ typedef struct _PyInterpreterFrame { PyObject *f_locals; /* Strong reference, may be NULL. Only valid if not on C stack */ PyFrameObject *frame_obj; /* Strong reference, may be NULL. Only valid if not on C stack */ _Py_CODEUNIT *instr_ptr; /* Instruction currently executing (or about to begin) */ +#ifdef Py_GIL_DISABLED + /* Index of thread-local bytecode containing instr_ptr. */ + int32_t tlbc_index; +#endif _PyStackRef *stackpointer; uint16_t return_offset; /* Only relevant during a function call */ char owner; @@ -76,7 +80,7 @@ typedef struct _PyInterpreterFrame { } _PyInterpreterFrame; #define _PyInterpreterFrame_LASTI(IF) \ - ((int)((IF)->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(IF)))) + ((int)((IF)->instr_ptr - _PyFrame_GetBytecode((IF)))) static inline PyCodeObject *_PyFrame_GetCode(_PyInterpreterFrame *f) { PyObject *executable = PyStackRef_AsPyObjectBorrow(f->f_executable); @@ -84,6 +88,19 @@ static inline PyCodeObject *_PyFrame_GetCode(_PyInterpreterFrame *f) { return (PyCodeObject *)executable; } +static inline _Py_CODEUNIT * +_PyFrame_GetBytecode(_PyInterpreterFrame *f) +{ +#ifdef Py_GIL_DISABLED + PyCodeObject *co = _PyFrame_GetCode(f); + _PyCodeArray *tlbc = _Py_atomic_load_ptr_acquire(&co->co_tlbc); + assert(f->tlbc_index >= 0 && f->tlbc_index < tlbc->size); + return (_Py_CODEUNIT *)tlbc->entries[f->tlbc_index]; +#else + return _PyCode_CODE(_PyFrame_GetCode(f)); +#endif +} + static inline PyFunctionObject *_PyFrame_GetFunction(_PyInterpreterFrame *f) { PyObject *func = PyStackRef_AsPyObjectBorrow(f->f_funcobj); assert(PyFunction_Check(func)); @@ -144,13 +161,33 @@ static inline void _PyFrame_Copy(_PyInterpreterFrame *src, _PyInterpreterFrame * #endif } +#ifdef Py_GIL_DISABLED +static inline void +_PyFrame_InitializeTLBC(PyThreadState *tstate, _PyInterpreterFrame *frame, + PyCodeObject *code) +{ + _Py_CODEUNIT *tlbc = _PyCode_GetTLBCFast(tstate, code); + if (tlbc == NULL) { + // No thread-local bytecode exists for this thread yet; use the main + // thread's copy, deferring thread-local bytecode creation to the + // execution of RESUME. + frame->instr_ptr = _PyCode_CODE(code); + frame->tlbc_index = 0; + } + else { + frame->instr_ptr = tlbc; + frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index; + } +} +#endif + /* Consumes reference to func and locals. Does not initialize frame->previous, which happens when frame is linked into the frame stack. */ static inline void _PyFrame_Initialize( - _PyInterpreterFrame *frame, _PyStackRef func, + PyThreadState *tstate, _PyInterpreterFrame *frame, _PyStackRef func, PyObject *locals, PyCodeObject *code, int null_locals_from, _PyInterpreterFrame *previous) { frame->previous = previous; @@ -162,7 +199,12 @@ _PyFrame_Initialize( frame->f_locals = locals; frame->stackpointer = frame->localsplus + code->co_nlocalsplus; frame->frame_obj = NULL; +#ifdef Py_GIL_DISABLED + _PyFrame_InitializeTLBC(tstate, frame, code); +#else + (void)tstate; frame->instr_ptr = _PyCode_CODE(code); +#endif frame->return_offset = 0; frame->owner = FRAME_OWNED_BY_THREAD; @@ -224,7 +266,8 @@ _PyFrame_IsIncomplete(_PyInterpreterFrame *frame) return true; } return frame->owner != FRAME_OWNED_BY_GENERATOR && - frame->instr_ptr < _PyCode_CODE(_PyFrame_GetCode(frame)) + _PyFrame_GetCode(frame)->_co_firsttraceable; + frame->instr_ptr < _PyFrame_GetBytecode(frame) + + _PyFrame_GetCode(frame)->_co_firsttraceable; } static inline _PyInterpreterFrame * @@ -315,7 +358,8 @@ _PyFrame_PushUnchecked(PyThreadState *tstate, _PyStackRef func, int null_locals_ _PyInterpreterFrame *new_frame = (_PyInterpreterFrame *)tstate->datastack_top; tstate->datastack_top += code->co_framesize; assert(tstate->datastack_top < tstate->datastack_limit); - _PyFrame_Initialize(new_frame, func, NULL, code, null_locals_from, previous); + _PyFrame_Initialize(tstate, new_frame, func, NULL, code, null_locals_from, + previous); return new_frame; } @@ -339,7 +383,11 @@ _PyFrame_PushTrampolineUnchecked(PyThreadState *tstate, PyCodeObject *code, int assert(stackdepth <= code->co_stacksize); frame->stackpointer = frame->localsplus + code->co_nlocalsplus + stackdepth; frame->frame_obj = NULL; +#ifdef Py_GIL_DISABLED + _PyFrame_InitializeTLBC(tstate, frame, code); +#else frame->instr_ptr = _PyCode_CODE(code); +#endif frame->owner = FRAME_OWNED_BY_THREAD; frame->return_offset = 0; diff --git a/Include/internal/pycore_gc.h b/Include/internal/pycore_gc.h index b85957df5a6..38a1c56c09d 100644 --- a/Include/internal/pycore_gc.h +++ b/Include/internal/pycore_gc.h @@ -389,6 +389,10 @@ extern int _PyGC_VisitStackRef(union _PyStackRef *ref, visitproc visit, void *ar } \ } while (0) +#ifdef Py_GIL_DISABLED +extern void _PyGC_VisitObjectsWorldStopped(PyInterpreterState *interp, + gcvisitobjects_t callback, void *arg); +#endif #ifdef __cplusplus } diff --git a/Include/internal/pycore_index_pool.h b/Include/internal/pycore_index_pool.h new file mode 100644 index 00000000000..e81bfd4d6ed --- /dev/null +++ b/Include/internal/pycore_index_pool.h @@ -0,0 +1,56 @@ +#ifndef Py_INTERNAL_INDEX_POOL_H +#define Py_INTERNAL_INDEX_POOL_H + +#include "Python.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#ifdef Py_GIL_DISABLED + +// This contains code for allocating unique indices in an array. It is used by +// the free-threaded build to assign each thread a globally unique index into +// each code object's thread-local bytecode array. + +// A min-heap of indices +typedef struct _PyIndexHeap { + int32_t *values; + + // Number of items stored in values + Py_ssize_t size; + + // Maximum number of items that can be stored in values + Py_ssize_t capacity; +} _PyIndexHeap; + +// An unbounded pool of indices. Indices are allocated starting from 0. They +// may be released back to the pool once they are no longer in use. +typedef struct _PyIndexPool { + PyMutex mutex; + + // Min heap of indices available for allocation + _PyIndexHeap free_indices; + + // Next index to allocate if no free indices are available + int32_t next_index; +} _PyIndexPool; + +// Allocate the smallest available index. Returns -1 on error. +extern int32_t _PyIndexPool_AllocIndex(_PyIndexPool *indices); + +// Release `index` back to the pool +extern void _PyIndexPool_FreeIndex(_PyIndexPool *indices, int32_t index); + +extern void _PyIndexPool_Fini(_PyIndexPool *indices); + +#endif // Py_GIL_DISABLED + +#ifdef __cplusplus +} +#endif +#endif // !Py_INTERNAL_INDEX_POOL_H diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index 36cd71e5a00..9e3b4299693 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -26,6 +26,7 @@ extern "C" { #include "pycore_genobject.h" // _PyGen_FetchStopIterationValue #include "pycore_global_objects.h"// struct _Py_interp_cached_objects #include "pycore_import.h" // struct _import_state +#include "pycore_index_pool.h" // _PyIndexPool #include "pycore_instruments.h" // _PY_MONITORING_EVENTS #include "pycore_list.h" // struct _Py_list_state #include "pycore_mimalloc.h" // struct _mimalloc_interp_state @@ -222,6 +223,7 @@ struct _is { struct _brc_state brc; // biased reference counting state struct _Py_unique_id_pool unique_ids; // object ids for per-thread refcounts PyMutex weakref_locks[NUM_WEAKREF_LIST_LOCKS]; + _PyIndexPool tlbc_indices; #endif // Per-interpreter state for the obmalloc allocator. For the main diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index e0e7d5ebf09..b8bea72baea 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -42,6 +42,9 @@ typedef struct _PyThreadStateImpl { int is_finalized; } refcounts; + // Index to use to retrieve thread-local bytecode for this thread + int32_t tlbc_index; + // When >1, code objects do not immortalize their non-string constants. int suppress_co_const_immortalization; #endif @@ -52,7 +55,6 @@ typedef struct _PyThreadStateImpl { } _PyThreadStateImpl; - #ifdef __cplusplus } #endif diff --git a/Include/internal/pycore_uop_ids.h b/Include/internal/pycore_uop_ids.h index de628d240d1..55416d2aae1 100644 --- a/Include/internal/pycore_uop_ids.h +++ b/Include/internal/pycore_uop_ids.h @@ -193,106 +193,107 @@ extern "C" { #define _LOAD_ATTR_SLOT_1 423 #define _LOAD_ATTR_WITH_HINT 424 #define _LOAD_BUILD_CLASS LOAD_BUILD_CLASS +#define _LOAD_BYTECODE 425 #define _LOAD_COMMON_CONSTANT LOAD_COMMON_CONSTANT #define _LOAD_CONST LOAD_CONST #define _LOAD_CONST_IMMORTAL LOAD_CONST_IMMORTAL -#define _LOAD_CONST_INLINE 425 -#define _LOAD_CONST_INLINE_BORROW 426 -#define _LOAD_CONST_INLINE_BORROW_WITH_NULL 427 -#define _LOAD_CONST_INLINE_WITH_NULL 428 +#define _LOAD_CONST_INLINE 426 +#define _LOAD_CONST_INLINE_BORROW 427 +#define _LOAD_CONST_INLINE_BORROW_WITH_NULL 428 +#define _LOAD_CONST_INLINE_WITH_NULL 429 #define _LOAD_DEREF LOAD_DEREF -#define _LOAD_FAST 429 -#define _LOAD_FAST_0 430 -#define _LOAD_FAST_1 431 -#define _LOAD_FAST_2 432 -#define _LOAD_FAST_3 433 -#define _LOAD_FAST_4 434 -#define _LOAD_FAST_5 435 -#define _LOAD_FAST_6 436 -#define _LOAD_FAST_7 437 +#define _LOAD_FAST 430 +#define _LOAD_FAST_0 431 +#define _LOAD_FAST_1 432 +#define _LOAD_FAST_2 433 +#define _LOAD_FAST_3 434 +#define _LOAD_FAST_4 435 +#define _LOAD_FAST_5 436 +#define _LOAD_FAST_6 437 +#define _LOAD_FAST_7 438 #define _LOAD_FAST_AND_CLEAR LOAD_FAST_AND_CLEAR #define _LOAD_FAST_CHECK LOAD_FAST_CHECK #define _LOAD_FAST_LOAD_FAST LOAD_FAST_LOAD_FAST #define _LOAD_FROM_DICT_OR_DEREF LOAD_FROM_DICT_OR_DEREF #define _LOAD_FROM_DICT_OR_GLOBALS LOAD_FROM_DICT_OR_GLOBALS -#define _LOAD_GLOBAL 438 -#define _LOAD_GLOBAL_BUILTINS 439 -#define _LOAD_GLOBAL_BUILTINS_FROM_KEYS 440 -#define _LOAD_GLOBAL_MODULE 441 -#define _LOAD_GLOBAL_MODULE_FROM_KEYS 442 +#define _LOAD_GLOBAL 439 +#define _LOAD_GLOBAL_BUILTINS 440 +#define _LOAD_GLOBAL_BUILTINS_FROM_KEYS 441 +#define _LOAD_GLOBAL_MODULE 442 +#define _LOAD_GLOBAL_MODULE_FROM_KEYS 443 #define _LOAD_LOCALS LOAD_LOCALS #define _LOAD_NAME LOAD_NAME -#define _LOAD_SMALL_INT 443 -#define _LOAD_SMALL_INT_0 444 -#define _LOAD_SMALL_INT_1 445 -#define _LOAD_SMALL_INT_2 446 -#define _LOAD_SMALL_INT_3 447 +#define _LOAD_SMALL_INT 444 +#define _LOAD_SMALL_INT_0 445 +#define _LOAD_SMALL_INT_1 446 +#define _LOAD_SMALL_INT_2 447 +#define _LOAD_SMALL_INT_3 448 #define _LOAD_SPECIAL LOAD_SPECIAL #define _LOAD_SUPER_ATTR_ATTR LOAD_SUPER_ATTR_ATTR #define _LOAD_SUPER_ATTR_METHOD LOAD_SUPER_ATTR_METHOD -#define _MAKE_CALLARGS_A_TUPLE 448 +#define _MAKE_CALLARGS_A_TUPLE 449 #define _MAKE_CELL MAKE_CELL #define _MAKE_FUNCTION MAKE_FUNCTION -#define _MAKE_WARM 449 +#define _MAKE_WARM 450 #define _MAP_ADD MAP_ADD #define _MATCH_CLASS MATCH_CLASS #define _MATCH_KEYS MATCH_KEYS #define _MATCH_MAPPING MATCH_MAPPING #define _MATCH_SEQUENCE MATCH_SEQUENCE -#define _MAYBE_EXPAND_METHOD 450 -#define _MAYBE_EXPAND_METHOD_KW 451 -#define _MONITOR_CALL 452 -#define _MONITOR_JUMP_BACKWARD 453 -#define _MONITOR_RESUME 454 +#define _MAYBE_EXPAND_METHOD 451 +#define _MAYBE_EXPAND_METHOD_KW 452 +#define _MONITOR_CALL 453 +#define _MONITOR_JUMP_BACKWARD 454 +#define _MONITOR_RESUME 455 #define _NOP NOP #define _POP_EXCEPT POP_EXCEPT -#define _POP_JUMP_IF_FALSE 455 -#define _POP_JUMP_IF_TRUE 456 +#define _POP_JUMP_IF_FALSE 456 +#define _POP_JUMP_IF_TRUE 457 #define _POP_TOP POP_TOP -#define _POP_TOP_LOAD_CONST_INLINE_BORROW 457 +#define _POP_TOP_LOAD_CONST_INLINE_BORROW 458 #define _PUSH_EXC_INFO PUSH_EXC_INFO -#define _PUSH_FRAME 458 +#define _PUSH_FRAME 459 #define _PUSH_NULL PUSH_NULL -#define _PY_FRAME_GENERAL 459 -#define _PY_FRAME_KW 460 -#define _QUICKEN_RESUME 461 -#define _REPLACE_WITH_TRUE 462 +#define _PY_FRAME_GENERAL 460 +#define _PY_FRAME_KW 461 +#define _QUICKEN_RESUME 462 +#define _REPLACE_WITH_TRUE 463 #define _RESUME_CHECK RESUME_CHECK #define _RETURN_GENERATOR RETURN_GENERATOR #define _RETURN_VALUE RETURN_VALUE -#define _SAVE_RETURN_OFFSET 463 -#define _SEND 464 -#define _SEND_GEN_FRAME 465 +#define _SAVE_RETURN_OFFSET 464 +#define _SEND 465 +#define _SEND_GEN_FRAME 466 #define _SETUP_ANNOTATIONS SETUP_ANNOTATIONS #define _SET_ADD SET_ADD #define _SET_FUNCTION_ATTRIBUTE SET_FUNCTION_ATTRIBUTE #define _SET_UPDATE SET_UPDATE -#define _START_EXECUTOR 466 -#define _STORE_ATTR 467 -#define _STORE_ATTR_INSTANCE_VALUE 468 -#define _STORE_ATTR_SLOT 469 -#define _STORE_ATTR_WITH_HINT 470 +#define _START_EXECUTOR 467 +#define _STORE_ATTR 468 +#define _STORE_ATTR_INSTANCE_VALUE 469 +#define _STORE_ATTR_SLOT 470 +#define _STORE_ATTR_WITH_HINT 471 #define _STORE_DEREF STORE_DEREF -#define _STORE_FAST 471 -#define _STORE_FAST_0 472 -#define _STORE_FAST_1 473 -#define _STORE_FAST_2 474 -#define _STORE_FAST_3 475 -#define _STORE_FAST_4 476 -#define _STORE_FAST_5 477 -#define _STORE_FAST_6 478 -#define _STORE_FAST_7 479 +#define _STORE_FAST 472 +#define _STORE_FAST_0 473 +#define _STORE_FAST_1 474 +#define _STORE_FAST_2 475 +#define _STORE_FAST_3 476 +#define _STORE_FAST_4 477 +#define _STORE_FAST_5 478 +#define _STORE_FAST_6 479 +#define _STORE_FAST_7 480 #define _STORE_FAST_LOAD_FAST STORE_FAST_LOAD_FAST #define _STORE_FAST_STORE_FAST STORE_FAST_STORE_FAST #define _STORE_GLOBAL STORE_GLOBAL #define _STORE_NAME STORE_NAME -#define _STORE_SLICE 480 -#define _STORE_SUBSCR 481 +#define _STORE_SLICE 481 +#define _STORE_SUBSCR 482 #define _STORE_SUBSCR_DICT STORE_SUBSCR_DICT #define _STORE_SUBSCR_LIST_INT STORE_SUBSCR_LIST_INT #define _SWAP SWAP -#define _TIER2_RESUME_CHECK 482 -#define _TO_BOOL 483 +#define _TIER2_RESUME_CHECK 483 +#define _TO_BOOL 484 #define _TO_BOOL_BOOL TO_BOOL_BOOL #define _TO_BOOL_INT TO_BOOL_INT #define _TO_BOOL_LIST TO_BOOL_LIST @@ -302,13 +303,13 @@ extern "C" { #define _UNARY_NEGATIVE UNARY_NEGATIVE #define _UNARY_NOT UNARY_NOT #define _UNPACK_EX UNPACK_EX -#define _UNPACK_SEQUENCE 484 +#define _UNPACK_SEQUENCE 485 #define _UNPACK_SEQUENCE_LIST UNPACK_SEQUENCE_LIST #define _UNPACK_SEQUENCE_TUPLE UNPACK_SEQUENCE_TUPLE #define _UNPACK_SEQUENCE_TWO_TUPLE UNPACK_SEQUENCE_TWO_TUPLE #define _WITH_EXCEPT_START WITH_EXCEPT_START #define _YIELD_VALUE YIELD_VALUE -#define MAX_UOP_ID 484 +#define MAX_UOP_ID 485 #ifdef __cplusplus } diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h index 4cfdecec78b..ade297201f0 100644 --- a/Include/internal/pycore_uop_metadata.h +++ b/Include/internal/pycore_uop_metadata.h @@ -289,7 +289,7 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = { [_FATAL_ERROR] = 0, [_CHECK_VALIDITY_AND_SET_IP] = HAS_DEOPT_FLAG, [_DEOPT] = 0, - [_ERROR_POP_N] = HAS_ARG_FLAG, + [_ERROR_POP_N] = HAS_ARG_FLAG | HAS_ESCAPES_FLAG, [_TIER2_RESUME_CHECK] = HAS_DEOPT_FLAG, }; diff --git a/Lib/test/support/__init__.py b/Lib/test/support/__init__.py index 7c1ef42a497..2ad267e3e08 100644 --- a/Lib/test/support/__init__.py +++ b/Lib/test/support/__init__.py @@ -1274,6 +1274,11 @@ def requires_specialization(test): _opcode.ENABLE_SPECIALIZATION, "requires specialization")(test) +def requires_specialization_ft(test): + return unittest.skipUnless( + _opcode.ENABLE_SPECIALIZATION_FT, "requires specialization")(test) + + #======================================================================= # Check for the presence of docstrings. diff --git a/Lib/test/test_capi/test_config.py b/Lib/test/test_capi/test_config.py index 71fb9ae45c7..77730ad2f32 100644 --- a/Lib/test/test_capi/test_config.py +++ b/Lib/test/test_capi/test_config.py @@ -100,6 +100,7 @@ class CAPITests(unittest.TestCase): options.append(("run_presite", str | None, None)) if sysconfig.get_config_var('Py_GIL_DISABLED'): options.append(("enable_gil", int, None)) + options.append(("tlbc_enabled", int, None)) if support.MS_WINDOWS: options.extend(( ("legacy_windows_stdio", bool, None), diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index f1ab72180d7..c352325ff3d 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -7,7 +7,8 @@ import os import _opcode -from test.support import script_helper, requires_specialization, import_helper +from test.support import (script_helper, requires_specialization, + import_helper, Py_GIL_DISABLED) _testinternalcapi = import_helper.import_module("_testinternalcapi") @@ -34,6 +35,7 @@ def clear_executors(func): @requires_specialization +@unittest.skipIf(Py_GIL_DISABLED, "optimizer not yet supported in free-threaded builds") @unittest.skipUnless(hasattr(_testinternalcapi, "get_optimizer"), "Requires optimizer infrastructure") class TestOptimizerAPI(unittest.TestCase): @@ -138,6 +140,7 @@ def get_opnames(ex): @requires_specialization +@unittest.skipIf(Py_GIL_DISABLED, "optimizer not yet supported in free-threaded builds") @unittest.skipUnless(hasattr(_testinternalcapi, "get_optimizer"), "Requires optimizer infrastructure") class TestExecutorInvalidation(unittest.TestCase): @@ -219,6 +222,7 @@ class TestExecutorInvalidation(unittest.TestCase): @requires_specialization +@unittest.skipIf(Py_GIL_DISABLED, "optimizer not yet supported in free-threaded builds") @unittest.skipUnless(hasattr(_testinternalcapi, "get_optimizer"), "Requires optimizer infrastructure") @unittest.skipIf(os.getenv("PYTHON_UOPS_OPTIMIZE") == "0", "Needs uop optimizer to run.") @@ -586,6 +590,7 @@ class TestUops(unittest.TestCase): @requires_specialization +@unittest.skipIf(Py_GIL_DISABLED, "optimizer not yet supported in free-threaded builds") @unittest.skipUnless(hasattr(_testinternalcapi, "get_optimizer"), "Requires optimizer infrastructure") @unittest.skipIf(os.getenv("PYTHON_UOPS_OPTIMIZE") == "0", "Needs uop optimizer to run.") diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py index eca9adf9a7d..634efda3544 100644 --- a/Lib/test/test_cmd_line.py +++ b/Lib/test/test_cmd_line.py @@ -12,6 +12,7 @@ import unittest from test import support from test.support import os_helper from test.support import force_not_colorized +from test.support import threading_helper from test.support.script_helper import ( spawn_python, kill_python, assert_python_ok, assert_python_failure, interpreter_requires_environment @@ -1068,6 +1069,57 @@ class CmdLineTest(unittest.TestCase): out = res.out.strip().decode("utf-8") return tuple(int(i) for i in out.split()) + @unittest.skipUnless(support.Py_GIL_DISABLED, + "PYTHON_TLBC and -X tlbc" + " only supported in Py_GIL_DISABLED builds") + @threading_helper.requires_working_threading() + def test_disable_thread_local_bytecode(self): + code = """if 1: + import threading + def test(x, y): + return x + y + t = threading.Thread(target=test, args=(1,2)) + t.start() + t.join()""" + assert_python_ok("-W", "always", "-X", "tlbc=0", "-c", code) + assert_python_ok("-W", "always", "-c", code, PYTHON_TLBC="0") + + @unittest.skipUnless(support.Py_GIL_DISABLED, + "PYTHON_TLBC and -X tlbc" + " only supported in Py_GIL_DISABLED builds") + @threading_helper.requires_working_threading() + def test_enable_thread_local_bytecode(self): + code = """if 1: + import threading + def test(x, y): + return x + y + t = threading.Thread(target=test, args=(1,2)) + t.start() + t.join()""" + # The functionality of thread-local bytecode is tested more extensively + # in test_thread_local_bytecode + assert_python_ok("-W", "always", "-X", "tlbc=1", "-c", code) + assert_python_ok("-W", "always", "-c", code, PYTHON_TLBC="1") + + @unittest.skipUnless(support.Py_GIL_DISABLED, + "PYTHON_TLBC and -X tlbc" + " only supported in Py_GIL_DISABLED builds") + def test_invalid_thread_local_bytecode(self): + rc, out, err = assert_python_failure("-X", "tlbc") + self.assertIn(b"tlbc=n: n is missing or invalid", err) + rc, out, err = assert_python_failure("-X", "tlbc=foo") + self.assertIn(b"tlbc=n: n is missing or invalid", err) + rc, out, err = assert_python_failure("-X", "tlbc=-1") + self.assertIn(b"tlbc=n: n is missing or invalid", err) + rc, out, err = assert_python_failure("-X", "tlbc=2") + self.assertIn(b"tlbc=n: n is missing or invalid", err) + rc, out, err = assert_python_failure(PYTHON_TLBC="foo") + self.assertIn(b"PYTHON_TLBC=N: N is missing or invalid", err) + rc, out, err = assert_python_failure(PYTHON_TLBC="-1") + self.assertIn(b"PYTHON_TLBC=N: N is missing or invalid", err) + rc, out, err = assert_python_failure(PYTHON_TLBC="2") + self.assertIn(b"PYTHON_TLBC=N: N is missing or invalid", err) + @unittest.skipIf(interpreter_requires_environment(), 'Cannot run -I tests when PYTHON env vars are required.') diff --git a/Lib/test/test_dis.py b/Lib/test/test_dis.py index 3c6570afa50..a991c67fca4 100644 --- a/Lib/test/test_dis.py +++ b/Lib/test/test_dis.py @@ -10,7 +10,8 @@ import sys import types import unittest from test.support import (captured_stdout, requires_debug_ranges, - requires_specialization, cpython_only) + requires_specialization, requires_specialization_ft, + cpython_only) from test.support.bytecode_helper import BytecodeTestCase import opcode @@ -1261,7 +1262,7 @@ class DisTests(DisTestBase): self.do_disassembly_compare(got, dis_load_test_quickened_code) @cpython_only - @requires_specialization + @requires_specialization_ft def test_binary_specialize(self): binary_op_quicken = """\ 0 RESUME_CHECK 0 @@ -1281,6 +1282,9 @@ class DisTests(DisTestBase): got = self.get_disassembly(co_unicode, adaptive=True) self.do_disassembly_compare(got, binary_op_quicken % "BINARY_OP_ADD_UNICODE 0 (+)") + @cpython_only + @requires_specialization + def test_binary_subscr_specialize(self): binary_subscr_quicken = """\ 0 RESUME_CHECK 0 diff --git a/Lib/test/test_embed.py b/Lib/test/test_embed.py index 5e886b6c8c3..bf861ef06ee 100644 --- a/Lib/test/test_embed.py +++ b/Lib/test/test_embed.py @@ -644,6 +644,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase): CONFIG_COMPAT['run_presite'] = None if support.Py_GIL_DISABLED: CONFIG_COMPAT['enable_gil'] = -1 + CONFIG_COMPAT['tlbc_enabled'] = GET_DEFAULT_CONFIG if MS_WINDOWS: CONFIG_COMPAT.update({ 'legacy_windows_stdio': False, diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py index c0862d7d15f..d839893d2c6 100644 --- a/Lib/test/test_sys.py +++ b/Lib/test/test_sys.py @@ -1094,7 +1094,14 @@ class SysModuleTest(unittest.TestCase): # While we could imagine a Python session where the number of # multiple buffer objects would exceed the sharing of references, # it is unlikely to happen in a normal test run. - self.assertLess(a, sys.gettotalrefcount()) + # + # In free-threaded builds each code object owns an array of + # pointers to copies of the bytecode. When the number of + # code objects is a large fraction of the total number of + # references, this can cause the total number of allocated + # blocks to exceed the total number of references. + if not support.Py_GIL_DISABLED: + self.assertLess(a, sys.gettotalrefcount()) except AttributeError: # gettotalrefcount() not available pass @@ -1613,7 +1620,10 @@ class SizeofTest(unittest.TestCase): def func(): return sys._getframe() x = func() - INTERPRETER_FRAME = '9PhcP' + if support.Py_GIL_DISABLED: + INTERPRETER_FRAME = '10PhcP' + else: + INTERPRETER_FRAME = '9PhcP' check(x, size('3PiccPP' + INTERPRETER_FRAME + 'P')) # function def func(): pass diff --git a/Lib/test/test_thread_local_bytecode.py b/Lib/test/test_thread_local_bytecode.py new file mode 100644 index 00000000000..7a8809c5ae7 --- /dev/null +++ b/Lib/test/test_thread_local_bytecode.py @@ -0,0 +1,198 @@ +"""Tests for thread-local bytecode.""" +import dis +import textwrap +import unittest + +from test import support +from test.support import cpython_only, import_helper, requires_specialization_ft +from test.support.script_helper import assert_python_ok +from test.support.threading_helper import requires_working_threading + +# Skip this test if the _testinternalcapi module isn't available +_testinternalcapi = import_helper.import_module("_testinternalcapi") + + +@cpython_only +@requires_working_threading() +@unittest.skipUnless(support.Py_GIL_DISABLED, "only in free-threaded builds") +class TLBCTests(unittest.TestCase): + @requires_specialization_ft + def test_new_threads_start_with_unspecialized_code(self): + code = textwrap.dedent(""" + import dis + import queue + import threading + + from _testinternalcapi import get_tlbc + + def all_opnames(bc): + return {i.opname for i in dis._get_instructions_bytes(bc)} + + def f(a, b, q=None): + if q is not None: + q.put(get_tlbc(f)) + return a + b + + for _ in range(100): + # specialize + f(1, 2) + + q = queue.Queue() + t = threading.Thread(target=f, args=('a', 'b', q)) + t.start() + t.join() + + assert "BINARY_OP_ADD_INT" in all_opnames(get_tlbc(f)) + assert "BINARY_OP_ADD_INT" not in all_opnames(q.get()) + """) + assert_python_ok("-X", "tlbc=1", "-c", code) + + @requires_specialization_ft + def test_threads_specialize_independently(self): + code = textwrap.dedent(""" + import dis + import queue + import threading + + from _testinternalcapi import get_tlbc + + def all_opnames(bc): + return {i.opname for i in dis._get_instructions_bytes(bc)} + + def f(a, b): + return a + b + + def g(a, b, q=None): + for _ in range(100): + f(a, b) + if q is not None: + q.put(get_tlbc(f)) + + # specialize in main thread + g(1, 2) + + # specialize in other thread + q = queue.Queue() + t = threading.Thread(target=g, args=('a', 'b', q)) + t.start() + t.join() + + assert "BINARY_OP_ADD_INT" in all_opnames(get_tlbc(f)) + t_opnames = all_opnames(q.get()) + assert "BINARY_OP_ADD_INT" not in t_opnames + assert "BINARY_OP_ADD_UNICODE" in t_opnames + """) + assert_python_ok("-X", "tlbc=1", "-c", code) + + def test_reuse_tlbc_across_threads_different_lifetimes(self): + code = textwrap.dedent(""" + import queue + import threading + + from _testinternalcapi import get_tlbc_id + + def f(a, b, q=None): + if q is not None: + q.put(get_tlbc_id(f)) + return a + b + + q = queue.Queue() + tlbc_ids = [] + for _ in range(3): + t = threading.Thread(target=f, args=('a', 'b', q)) + t.start() + t.join() + tlbc_ids.append(q.get()) + + assert tlbc_ids[0] == tlbc_ids[1] + assert tlbc_ids[1] == tlbc_ids[2] + """) + assert_python_ok("-X", "tlbc=1", "-c", code) + + def test_no_copies_if_tlbc_disabled(self): + code = textwrap.dedent(""" + import queue + import threading + + from _testinternalcapi import get_tlbc_id + + def f(a, b, q=None): + if q is not None: + q.put(get_tlbc_id(f)) + return a + b + + q = queue.Queue() + threads = [] + for _ in range(3): + t = threading.Thread(target=f, args=('a', 'b', q)) + t.start() + threads.append(t) + + tlbc_ids = [] + for t in threads: + t.join() + tlbc_ids.append(q.get()) + + main_tlbc_id = get_tlbc_id(f) + assert main_tlbc_id is not None + assert tlbc_ids[0] == main_tlbc_id + assert tlbc_ids[1] == main_tlbc_id + assert tlbc_ids[2] == main_tlbc_id + """) + assert_python_ok("-X", "tlbc=0", "-c", code) + + def test_no_specialization_if_tlbc_disabled(self): + code = textwrap.dedent(""" + import dis + import queue + import threading + + from _testinternalcapi import get_tlbc + + def all_opnames(f): + bc = get_tlbc(f) + return {i.opname for i in dis._get_instructions_bytes(bc)} + + def f(a, b): + return a + b + + for _ in range(100): + f(1, 2) + + assert "BINARY_OP_ADD_INT" not in all_opnames(f) + """) + assert_python_ok("-X", "tlbc=0", "-c", code) + + def test_generator_throw(self): + code = textwrap.dedent(""" + import queue + import threading + + from _testinternalcapi import get_tlbc_id + + def g(): + try: + yield + except: + yield get_tlbc_id(g) + + def f(q): + gen = g() + next(gen) + q.put(gen.throw(ValueError)) + + q = queue.Queue() + t = threading.Thread(target=f, args=(q,)) + t.start() + t.join() + + gen = g() + next(gen) + main_id = gen.throw(ValueError) + assert main_id != q.get() + """) + assert_python_ok("-X", "tlbc=1", "-c", code) + + +if __name__ == "__main__": + unittest.main() diff --git a/Makefile.pre.in b/Makefile.pre.in index 1a9191ec0ce..c650ecaf7be 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -460,6 +460,7 @@ PYTHON_OBJS= \ Python/hashtable.o \ Python/import.o \ Python/importdl.o \ + Python/index_pool.o \ Python/initconfig.o \ Python/interpconfig.o \ Python/instrumentation.o \ @@ -1228,6 +1229,7 @@ PYTHON_HEADERS= \ $(srcdir)/Include/internal/pycore_hashtable.h \ $(srcdir)/Include/internal/pycore_import.h \ $(srcdir)/Include/internal/pycore_importdl.h \ + $(srcdir)/Include/internal/pycore_index_pool.h \ $(srcdir)/Include/internal/pycore_initconfig.h \ $(srcdir)/Include/internal/pycore_instruments.h \ $(srcdir)/Include/internal/pycore_instruction_sequence.h \ diff --git a/Modules/_opcode.c b/Modules/_opcode.c index dc93063aee7..7ccf7af6bf9 100644 --- a/Modules/_opcode.c +++ b/Modules/_opcode.c @@ -422,6 +422,9 @@ _opcode_exec(PyObject *m) { if (PyModule_AddIntMacro(m, ENABLE_SPECIALIZATION) < 0) { return -1; } + if (PyModule_AddIntMacro(m, ENABLE_SPECIALIZATION_FT) < 0) { + return -1; + } return 0; } diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index eb98b433c6c..883f32599fb 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -14,6 +14,7 @@ #include "pycore_bitutils.h" // _Py_bswap32() #include "pycore_bytesobject.h" // _PyBytes_Find() #include "pycore_ceval.h" // _PyEval_AddPendingCall() +#include "pycore_code.h" // _PyCode_GetTLBCFast() #include "pycore_compile.h" // _PyCompile_CodeGen() #include "pycore_context.h" // _PyContext_NewHamtForTests() #include "pycore_dict.h" // _PyManagedDictPointer_GetValues() @@ -1963,6 +1964,48 @@ get_py_thread_id(PyObject *self, PyObject *Py_UNUSED(ignored)) Py_BUILD_ASSERT(sizeof(unsigned long long) >= sizeof(tid)); return PyLong_FromUnsignedLongLong(tid); } + +static PyCodeObject * +get_code(PyObject *obj) +{ + if (PyCode_Check(obj)) { + return (PyCodeObject *)obj; + } + else if (PyFunction_Check(obj)) { + return (PyCodeObject *)PyFunction_GetCode(obj); + } + return (PyCodeObject *)PyErr_Format( + PyExc_TypeError, "expected function or code object, got %s", + Py_TYPE(obj)->tp_name); +} + +static PyObject * +get_tlbc(PyObject *Py_UNUSED(module), PyObject *obj) +{ + PyCodeObject *code = get_code(obj); + if (code == NULL) { + return NULL; + } + _Py_CODEUNIT *bc = _PyCode_GetTLBCFast(PyThreadState_GET(), code); + if (bc == NULL) { + Py_RETURN_NONE; + } + return PyBytes_FromStringAndSize((const char *)bc, _PyCode_NBYTES(code)); +} + +static PyObject * +get_tlbc_id(PyObject *Py_UNUSED(module), PyObject *obj) +{ + PyCodeObject *code = get_code(obj); + if (code == NULL) { + return NULL; + } + _Py_CODEUNIT *bc = _PyCode_GetTLBCFast(PyThreadState_GET(), code); + if (bc == NULL) { + Py_RETURN_NONE; + } + return PyLong_FromVoidPtr(bc); +} #endif static PyObject * @@ -2022,7 +2065,6 @@ identify_type_slot_wrappers(PyObject *self, PyObject *Py_UNUSED(ignored)) return _PyType_GetSlotWrapperNames(); } - static PyMethodDef module_functions[] = { {"get_configs", get_configs, METH_NOARGS}, {"get_recursion_depth", get_recursion_depth, METH_NOARGS}, @@ -2110,6 +2152,8 @@ static PyMethodDef module_functions[] = { #ifdef Py_GIL_DISABLED {"py_thread_id", get_py_thread_id, METH_NOARGS}, + {"get_tlbc", get_tlbc, METH_O, NULL}, + {"get_tlbc_id", get_tlbc_id, METH_O, NULL}, #endif #ifdef _Py_TIER2 {"uop_symbols_test", _Py_uop_symbols_test, METH_NOARGS}, diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 775ea7aca82..1cf9740af9a 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -6,17 +6,22 @@ #include "pycore_code.h" // _PyCodeConstructor #include "pycore_frame.h" // FRAME_SPECIALS_SIZE #include "pycore_hashtable.h" // _Py_hashtable_t +#include "pycore_index_pool.h" // _PyIndexPool #include "pycore_initconfig.h" // _PyStatus_OK() #include "pycore_interp.h" // PyInterpreterState.co_extra_freefuncs #include "pycore_object.h" // _PyObject_SetDeferredRefcount +#include "pycore_object_stack.h" #include "pycore_opcode_metadata.h" // _PyOpcode_Deopt, _PyOpcode_Caches #include "pycore_opcode_utils.h" // RESUME_AT_FUNC_START +#include "pycore_pymem.h" // _PyMem_FreeDelayed #include "pycore_pystate.h" // _PyInterpreterState_GET() #include "pycore_setobject.h" // _PySet_NextEntry() #include "pycore_tuple.h" // _PyTuple_ITEMS() #include "pycore_uniqueid.h" // _PyObject_AssignUniqueId() #include "clinic/codeobject.c.h" +#define INITIAL_SPECIALIZED_CODE_SIZE 16 + static const char * code_event_name(PyCodeEvent event) { switch (event) { @@ -440,9 +445,15 @@ _PyCode_Validate(struct _PyCodeConstructor *con) return 0; } -extern void _PyCode_Quicken(PyCodeObject *code); +extern void +_PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size, PyObject *consts, + int enable_counters); -static void +#ifdef Py_GIL_DISABLED +static _PyCodeArray * _PyCodeArray_New(Py_ssize_t size); +#endif + +static int init_code(PyCodeObject *co, struct _PyCodeConstructor *con) { int nlocalsplus = (int)PyTuple_GET_SIZE(con->localsplusnames); @@ -505,14 +516,27 @@ init_code(PyCodeObject *co, struct _PyCodeConstructor *con) memcpy(_PyCode_CODE(co), PyBytes_AS_STRING(con->code), PyBytes_GET_SIZE(con->code)); +#ifdef Py_GIL_DISABLED + co->co_tlbc = _PyCodeArray_New(INITIAL_SPECIALIZED_CODE_SIZE); + if (co->co_tlbc == NULL) { + return -1; + } + co->co_tlbc->entries[0] = co->co_code_adaptive; +#endif int entry_point = 0; while (entry_point < Py_SIZE(co) && _PyCode_CODE(co)[entry_point].op.code != RESUME) { entry_point++; } co->_co_firsttraceable = entry_point; - _PyCode_Quicken(co); +#ifdef Py_GIL_DISABLED + _PyCode_Quicken(_PyCode_CODE(co), Py_SIZE(co), co->co_consts, + interp->config.tlbc_enabled); +#else + _PyCode_Quicken(_PyCode_CODE(co), Py_SIZE(co), co->co_consts, 1); +#endif notify_code_watchers(PY_CODE_EVENT_CREATE, co); + return 0; } static int @@ -667,7 +691,12 @@ _PyCode_New(struct _PyCodeConstructor *con) PyErr_NoMemory(); return NULL; } - init_code(co, con); + + if (init_code(co, con) < 0) { + Py_DECREF(co); + return NULL; + } + #ifdef Py_GIL_DISABLED co->_co_unique_id = _PyObject_AssignUniqueId((PyObject *)co); _PyObject_GC_TRACK(co); @@ -1871,6 +1900,17 @@ code_dealloc(PyCodeObject *co) PyObject_ClearWeakRefs((PyObject*)co); } free_monitoring_data(co->_co_monitoring); +#ifdef Py_GIL_DISABLED + // The first element always points to the mutable bytecode at the end of + // the code object, which will be freed when the code object is freed. + for (Py_ssize_t i = 1; i < co->co_tlbc->size; i++) { + char *entry = co->co_tlbc->entries[i]; + if (entry != NULL) { + PyMem_Free(entry); + } + } + PyMem_Free(co->co_tlbc); +#endif PyObject_Free(co); } @@ -2646,5 +2686,270 @@ _PyCode_Fini(PyInterpreterState *interp) _Py_hashtable_destroy(state->constants); state->constants = NULL; } + _PyIndexPool_Fini(&interp->tlbc_indices); #endif } + +#ifdef Py_GIL_DISABLED + +// Thread-local bytecode (TLBC) +// +// Each thread specializes a thread-local copy of the bytecode, created on the +// first RESUME, in free-threaded builds. All copies of the bytecode for a code +// object are stored in the `co_tlbc` array. Threads reserve a globally unique +// index identifying its copy of the bytecode in all `co_tlbc` arrays at thread +// creation and release the index at thread destruction. The first entry in +// every `co_tlbc` array always points to the "main" copy of the bytecode that +// is stored at the end of the code object. This ensures that no bytecode is +// copied for programs that do not use threads. +// +// Thread-local bytecode can be disabled at runtime by providing either `-X +// tlbc=0` or `PYTHON_TLBC=0`. Disabling thread-local bytecode also disables +// specialization. All threads share the main copy of the bytecode when +// thread-local bytecode is disabled. +// +// Concurrent modifications to the bytecode made by the specializing +// interpreter and instrumentation use atomics, with specialization taking care +// not to overwrite an instruction that was instrumented concurrently. + +int32_t +_Py_ReserveTLBCIndex(PyInterpreterState *interp) +{ + if (interp->config.tlbc_enabled) { + return _PyIndexPool_AllocIndex(&interp->tlbc_indices); + } + // All threads share the main copy of the bytecode when TLBC is disabled + return 0; +} + +void +_Py_ClearTLBCIndex(_PyThreadStateImpl *tstate) +{ + PyInterpreterState *interp = ((PyThreadState *)tstate)->interp; + if (interp->config.tlbc_enabled) { + _PyIndexPool_FreeIndex(&interp->tlbc_indices, tstate->tlbc_index); + } +} + +static _PyCodeArray * +_PyCodeArray_New(Py_ssize_t size) +{ + _PyCodeArray *arr = PyMem_Calloc( + 1, offsetof(_PyCodeArray, entries) + sizeof(void *) * size); + if (arr == NULL) { + PyErr_NoMemory(); + return NULL; + } + arr->size = size; + return arr; +} + +static void +copy_code(_Py_CODEUNIT *dst, PyCodeObject *co) +{ + int code_len = (int) Py_SIZE(co); + for (int i = 0; i < code_len; i += _PyInstruction_GetLength(co, i)) { + dst[i] = _Py_GetBaseCodeUnit(co, i); + } + _PyCode_Quicken(dst, code_len, co->co_consts, 1); +} + +static Py_ssize_t +get_pow2_greater(Py_ssize_t initial, Py_ssize_t limit) +{ + // initial must be a power of two + assert(!(initial & (initial - 1))); + Py_ssize_t res = initial; + while (res && res < limit) { + res <<= 1; + } + return res; +} + +static _Py_CODEUNIT * +create_tlbc_lock_held(PyCodeObject *co, Py_ssize_t idx) +{ + _PyCodeArray *tlbc = co->co_tlbc; + if (idx >= tlbc->size) { + Py_ssize_t new_size = get_pow2_greater(tlbc->size, idx + 1); + if (!new_size) { + PyErr_NoMemory(); + return NULL; + } + _PyCodeArray *new_tlbc = _PyCodeArray_New(new_size); + if (new_tlbc == NULL) { + return NULL; + } + memcpy(new_tlbc->entries, tlbc->entries, tlbc->size * sizeof(void *)); + _Py_atomic_store_ptr_release(&co->co_tlbc, new_tlbc); + _PyMem_FreeDelayed(tlbc); + tlbc = new_tlbc; + } + char *bc = PyMem_Calloc(1, _PyCode_NBYTES(co)); + if (bc == NULL) { + PyErr_NoMemory(); + return NULL; + } + copy_code((_Py_CODEUNIT *) bc, co); + assert(tlbc->entries[idx] == NULL); + tlbc->entries[idx] = bc; + return (_Py_CODEUNIT *) bc; +} + +static _Py_CODEUNIT * +get_tlbc_lock_held(PyCodeObject *co) +{ + _PyCodeArray *tlbc = co->co_tlbc; + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)PyThreadState_GET(); + int32_t idx = tstate->tlbc_index; + if (idx < tlbc->size && tlbc->entries[idx] != NULL) { + return (_Py_CODEUNIT *)tlbc->entries[idx]; + } + return create_tlbc_lock_held(co, idx); +} + +_Py_CODEUNIT * +_PyCode_GetTLBC(PyCodeObject *co) +{ + _Py_CODEUNIT *result; + Py_BEGIN_CRITICAL_SECTION(co); + result = get_tlbc_lock_held(co); + Py_END_CRITICAL_SECTION(); + return result; +} + +// My kingdom for a bitset +struct flag_set { + uint8_t *flags; + Py_ssize_t size; +}; + +static inline int +flag_is_set(struct flag_set *flags, Py_ssize_t idx) +{ + assert(idx >= 0); + return (idx < flags->size) && flags->flags[idx]; +} + +// Set the flag for each tlbc index in use +static int +get_indices_in_use(PyInterpreterState *interp, struct flag_set *in_use) +{ + assert(interp->stoptheworld.world_stopped); + assert(in_use->flags == NULL); + int32_t max_index = 0; + for (PyThreadState *p = interp->threads.head; p != NULL; p = p->next) { + int32_t idx = ((_PyThreadStateImpl *) p)->tlbc_index; + if (idx > max_index) { + max_index = idx; + } + } + in_use->size = (size_t) max_index + 1; + in_use->flags = PyMem_Calloc(in_use->size, sizeof(*in_use->flags)); + if (in_use->flags == NULL) { + return -1; + } + for (PyThreadState *p = interp->threads.head; p != NULL; p = p->next) { + in_use->flags[((_PyThreadStateImpl *) p)->tlbc_index] = 1; + } + return 0; +} + +struct get_code_args { + _PyObjectStack code_objs; + struct flag_set indices_in_use; + int err; +}; + +static void +clear_get_code_args(struct get_code_args *args) +{ + if (args->indices_in_use.flags != NULL) { + PyMem_Free(args->indices_in_use.flags); + args->indices_in_use.flags = NULL; + } + _PyObjectStack_Clear(&args->code_objs); +} + +static inline int +is_bytecode_unused(_PyCodeArray *tlbc, Py_ssize_t idx, + struct flag_set *indices_in_use) +{ + assert(idx > 0 && idx < tlbc->size); + return tlbc->entries[idx] != NULL && !flag_is_set(indices_in_use, idx); +} + +static int +get_code_with_unused_tlbc(PyObject *obj, struct get_code_args *args) +{ + if (!PyCode_Check(obj)) { + return 1; + } + PyCodeObject *co = (PyCodeObject *) obj; + _PyCodeArray *tlbc = co->co_tlbc; + // The first index always points at the main copy of the bytecode embedded + // in the code object. + for (Py_ssize_t i = 1; i < tlbc->size; i++) { + if (is_bytecode_unused(tlbc, i, &args->indices_in_use)) { + if (_PyObjectStack_Push(&args->code_objs, obj) < 0) { + args->err = -1; + return 0; + } + return 1; + } + } + return 1; +} + +static void +free_unused_bytecode(PyCodeObject *co, struct flag_set *indices_in_use) +{ + _PyCodeArray *tlbc = co->co_tlbc; + // The first index always points at the main copy of the bytecode embedded + // in the code object. + for (Py_ssize_t i = 1; i < tlbc->size; i++) { + if (is_bytecode_unused(tlbc, i, indices_in_use)) { + PyMem_Free(tlbc->entries[i]); + tlbc->entries[i] = NULL; + } + } +} + +int +_Py_ClearUnusedTLBC(PyInterpreterState *interp) +{ + struct get_code_args args = { + .code_objs = {NULL}, + .indices_in_use = {NULL, 0}, + .err = 0, + }; + _PyEval_StopTheWorld(interp); + // Collect in-use tlbc indices + if (get_indices_in_use(interp, &args.indices_in_use) < 0) { + goto err; + } + // Collect code objects that have bytecode not in use by any thread + _PyGC_VisitObjectsWorldStopped( + interp, (gcvisitobjects_t)get_code_with_unused_tlbc, &args); + if (args.err < 0) { + goto err; + } + // Free unused bytecode. This must happen outside of gc_visit_heaps; it is + // unsafe to allocate or free any mimalloc managed memory when it's + // running. + PyObject *obj; + while ((obj = _PyObjectStack_Pop(&args.code_objs)) != NULL) { + free_unused_bytecode((PyCodeObject*) obj, &args.indices_in_use); + } + _PyEval_StartTheWorld(interp); + clear_get_code_args(&args); + return 0; + +err: + _PyEval_StartTheWorld(interp); + clear_get_code_args(&args); + PyErr_NoMemory(); + return -1; +} + +#endif diff --git a/Objects/frameobject.c b/Objects/frameobject.c index 55394afa523..c743c254848 100644 --- a/Objects/frameobject.c +++ b/Objects/frameobject.c @@ -1651,7 +1651,7 @@ frame_setlineno(PyFrameObject *f, PyObject* p_new_lineno, void *Py_UNUSED(ignore } /* Finally set the new lasti and return OK. */ f->f_lineno = 0; - f->f_frame->instr_ptr = _PyCode_CODE(code) + best_addr; + f->f_frame->instr_ptr = _PyFrame_GetBytecode(f->f_frame) + best_addr; return 0; } @@ -1867,10 +1867,11 @@ PyTypeObject PyFrame_Type = { }; static void -init_frame(_PyInterpreterFrame *frame, PyFunctionObject *func, PyObject *locals) +init_frame(PyThreadState *tstate, _PyInterpreterFrame *frame, + PyFunctionObject *func, PyObject *locals) { PyCodeObject *code = (PyCodeObject *)func->func_code; - _PyFrame_Initialize(frame, PyStackRef_FromPyObjectNew(func), + _PyFrame_Initialize(tstate, frame, PyStackRef_FromPyObjectNew(func), Py_XNewRef(locals), code, 0, NULL); } @@ -1922,7 +1923,7 @@ PyFrame_New(PyThreadState *tstate, PyCodeObject *code, Py_DECREF(func); return NULL; } - init_frame((_PyInterpreterFrame *)f->_f_frame_data, func, locals); + init_frame(tstate, (_PyInterpreterFrame *)f->_f_frame_data, func, locals); f->f_frame = (_PyInterpreterFrame *)f->_f_frame_data; f->f_frame->owner = FRAME_OWNED_BY_FRAME_OBJECT; // This frame needs to be "complete", so pretend that the first RESUME ran: @@ -1941,7 +1942,8 @@ frame_init_get_vars(_PyInterpreterFrame *frame) // here: PyCodeObject *co = _PyFrame_GetCode(frame); int lasti = _PyInterpreterFrame_LASTI(frame); - if (!(lasti < 0 && _PyCode_CODE(co)->op.code == COPY_FREE_VARS + if (!(lasti < 0 + && _PyFrame_GetBytecode(frame)->op.code == COPY_FREE_VARS && PyStackRef_FunctionCheck(frame->f_funcobj))) { /* Free vars are initialized */ @@ -1957,7 +1959,7 @@ frame_init_get_vars(_PyInterpreterFrame *frame) frame->localsplus[offset + i] = PyStackRef_FromPyObjectNew(o); } // COPY_FREE_VARS doesn't have inline CACHEs, either: - frame->instr_ptr = _PyCode_CODE(_PyFrame_GetCode(frame)); + frame->instr_ptr = _PyFrame_GetBytecode(frame); } diff --git a/Objects/typeobject.c b/Objects/typeobject.c index b4a11195613..40225313a8a 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -11638,9 +11638,10 @@ super_descr_get(PyObject *self, PyObject *obj, PyObject *type) } static int -super_init_without_args(_PyInterpreterFrame *cframe, PyCodeObject *co, - PyTypeObject **type_p, PyObject **obj_p) +super_init_without_args(_PyInterpreterFrame *cframe, PyTypeObject **type_p, + PyObject **obj_p) { + PyCodeObject *co = _PyFrame_GetCode(cframe); if (co->co_argcount == 0) { PyErr_SetString(PyExc_RuntimeError, "super(): no arguments"); @@ -11740,7 +11741,7 @@ super_init_impl(PyObject *self, PyTypeObject *type, PyObject *obj) { "super(): no current frame"); return -1; } - int res = super_init_without_args(frame, _PyFrame_GetCode(frame), &type, &obj); + int res = super_init_without_args(frame, &type, &obj); if (res < 0) { return -1; diff --git a/PCbuild/_freeze_module.vcxproj b/PCbuild/_freeze_module.vcxproj index a3c2d32c454..51b493f8a84 100644 --- a/PCbuild/_freeze_module.vcxproj +++ b/PCbuild/_freeze_module.vcxproj @@ -222,6 +222,7 @@ + diff --git a/PCbuild/_freeze_module.vcxproj.filters b/PCbuild/_freeze_module.vcxproj.filters index 91b1d75fb8d..09a5f4d30ef 100644 --- a/PCbuild/_freeze_module.vcxproj.filters +++ b/PCbuild/_freeze_module.vcxproj.filters @@ -232,6 +232,9 @@ Source Files + + Source Files + Source Files diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index a4881e9256e..f840e7fd61f 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -255,6 +255,7 @@ + @@ -614,6 +615,7 @@ + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 740790cc5e1..a930cd0b0b1 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -687,6 +687,9 @@ Include\internal + + Include\internal + Include\internal @@ -1373,6 +1376,9 @@ Python + + Python + Python diff --git a/Python/bytecodes.c b/Python/bytecodes.c index fa98af12c69..2c78cb99317 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -168,11 +168,11 @@ dummy_func( } op(_QUICKEN_RESUME, (--)) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZATION_FT if (tstate->tracing == 0 && this_instr->op.code == RESUME) { FT_ATOMIC_STORE_UINT8_RELAXED(this_instr->op.code, RESUME_CHECK); } - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZATION_FT */ } tier1 op(_MAYBE_INSTRUMENT, (--)) { @@ -190,7 +190,26 @@ dummy_func( } } + op(_LOAD_BYTECODE, (--)) { + #ifdef Py_GIL_DISABLED + if (frame->tlbc_index != + ((_PyThreadStateImpl *)tstate)->tlbc_index) { + _Py_CODEUNIT *bytecode = + _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame)); + ERROR_IF(bytecode == NULL, error); + int off = this_instr - _PyFrame_GetBytecode(frame); + frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index; + frame->instr_ptr = bytecode + off; + // Make sure this_instr gets reset correctley for any uops that + // follow + next_instr = frame->instr_ptr; + DISPATCH(); + } + #endif + } + macro(RESUME) = + _LOAD_BYTECODE + _MAYBE_INSTRUMENT + _QUICKEN_RESUME + _CHECK_PERIODIC_IF_NOT_YIELD_FROM; @@ -204,6 +223,10 @@ dummy_func( uintptr_t version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version); assert((version & _PY_EVAL_EVENTS_MASK) == 0); DEOPT_IF(eval_breaker != version); + #ifdef Py_GIL_DISABLED + DEOPT_IF(frame->tlbc_index != + ((_PyThreadStateImpl *)tstate)->tlbc_index); + #endif } op(_MONITOR_RESUME, (--)) { @@ -217,6 +240,7 @@ dummy_func( } macro(INSTRUMENTED_RESUME) = + _LOAD_BYTECODE + _MAYBE_INSTRUMENT + _CHECK_PERIODIC_IF_NOT_YIELD_FROM + _MONITOR_RESUME; @@ -682,8 +706,8 @@ dummy_func( }; specializing op(_SPECIALIZE_BINARY_SUBSCR, (counter/1, container, sub -- container, sub)) { - assert(frame->stackpointer == NULL); #if ENABLE_SPECIALIZATION + assert(frame->stackpointer == NULL); if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_BinarySubscr(container, sub, next_instr); @@ -1236,7 +1260,7 @@ dummy_func( if (oparg) { PyObject *lasti = PyStackRef_AsPyObjectBorrow(values[0]); if (PyLong_Check(lasti)) { - frame->instr_ptr = _PyCode_CODE(_PyFrame_GetCode(frame)) + PyLong_AsLong(lasti); + frame->instr_ptr = _PyFrame_GetBytecode(frame) + PyLong_AsLong(lasti); assert(!_PyErr_Occurred(tstate)); } else { @@ -2671,9 +2695,7 @@ dummy_func( assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_False); DEAD(cond); - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); JUMPBY(oparg * flag); } @@ -2681,9 +2703,7 @@ dummy_func( assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_True); DEAD(cond); - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); JUMPBY(oparg * flag); } @@ -3697,7 +3717,7 @@ dummy_func( op(_CREATE_INIT_FRAME, (init[1], self[1], args[oparg] -- init_frame: _PyInterpreterFrame *)) { _PyInterpreterFrame *shim = _PyFrame_PushTrampolineUnchecked( tstate, (PyCodeObject *)&_Py_InitCleanup, 1, frame); - assert(_PyCode_CODE(_PyFrame_GetCode(shim))[0].op.code == EXIT_INIT_CHECK); + assert(_PyFrame_GetBytecode(shim)[0].op.code == EXIT_INIT_CHECK); /* Push self onto stack of shim */ shim->localsplus[0] = PyStackRef_DUP(self[0]); DEAD(init); @@ -4593,7 +4613,7 @@ dummy_func( } specializing op(_SPECIALIZE_BINARY_OP, (counter/1, lhs, rhs -- lhs, rhs)) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZATION_FT if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_BinaryOp(lhs, rhs, next_instr, oparg, LOCALS_ARRAY); @@ -4601,7 +4621,7 @@ dummy_func( } OPCODE_DEFERRED_INC(BINARY_OP); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZATION_FT */ assert(NB_ADD <= oparg); assert(oparg <= NB_INPLACE_XOR); } @@ -4632,7 +4652,7 @@ dummy_func( int original_opcode = 0; if (tstate->tracing) { PyCodeObject *code = _PyFrame_GetCode(frame); - original_opcode = code->_co_monitoring->lines[(int)(this_instr - _PyCode_CODE(code))].original_opcode; + original_opcode = code->_co_monitoring->lines[(int)(this_instr - _PyFrame_GetBytecode(frame))].original_opcode; next_instr = this_instr; } else { original_opcode = _Py_call_instrumentation_line( @@ -4687,9 +4707,7 @@ dummy_func( assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_True); int offset = flag * oparg; - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH); } @@ -4698,9 +4716,7 @@ dummy_func( assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_False); int offset = flag * oparg; - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH); } @@ -4715,9 +4731,7 @@ dummy_func( PyStackRef_CLOSE(value_stackref); offset = 0; } - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH); } @@ -4815,7 +4829,7 @@ dummy_func( tier2 op(_EXIT_TRACE, (exit_p/4 --)) { _PyExitData *exit = (_PyExitData *)exit_p; PyCodeObject *code = _PyFrame_GetCode(frame); - _Py_CODEUNIT *target = _PyCode_CODE(code) + exit->target; + _Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target; #if defined(Py_DEBUG) && !defined(_Py_JIT) OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); if (lltrace >= 2) { @@ -4823,7 +4837,7 @@ dummy_func( _PyUOpPrint(&next_uop[-1]); printf(", exit %u, temp %d, target %d -> %s]\n", exit - current_executor->exits, exit->temperature.value_and_backoff, - (int)(target - _PyCode_CODE(code)), + (int)(target - _PyFrame_GetBytecode(frame)), _PyOpcode_OpName[target->op.code]); } #endif @@ -4933,7 +4947,7 @@ dummy_func( _PyUOpPrint(&next_uop[-1]); printf(", exit %u, temp %d, target %d -> %s]\n", exit - current_executor->exits, exit->temperature.value_and_backoff, - (int)(target - _PyCode_CODE(_PyFrame_GetCode(frame))), + (int)(target - _PyFrame_GetBytecode(frame)), _PyOpcode_OpName[target->op.code]); } #endif @@ -4995,7 +5009,7 @@ dummy_func( } tier2 op(_ERROR_POP_N, (target/2, unused[oparg] --)) { - frame->instr_ptr = ((_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive) + target; + frame->instr_ptr = _PyFrame_GetBytecode(frame) + target; SYNC_SP(); GOTO_UNWIND(); } diff --git a/Python/ceval.c b/Python/ceval.c index beee5325cd6..9a608f06966 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -189,7 +189,7 @@ lltrace_instruction(_PyInterpreterFrame *frame, dump_stack(frame, stack_pointer); const char *opname = _PyOpcode_OpName[opcode]; assert(opname != NULL); - int offset = (int)(next_instr - _PyCode_CODE(_PyFrame_GetCode(frame))); + int offset = (int)(next_instr - _PyFrame_GetBytecode(frame)); if (OPCODE_HAS_ARG((int)_PyOpcode_Deopt[opcode])) { printf("%d: %s %d\n", offset * 2, opname, oparg); } @@ -841,6 +841,19 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int } /* Because this avoids the RESUME, * we need to update instrumentation */ +#ifdef Py_GIL_DISABLED + /* Load thread-local bytecode */ + if (frame->tlbc_index != ((_PyThreadStateImpl *)tstate)->tlbc_index) { + _Py_CODEUNIT *bytecode = + _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame)); + if (bytecode == NULL) { + goto error; + } + ptrdiff_t off = frame->instr_ptr - _PyFrame_GetBytecode(frame); + frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index; + frame->instr_ptr = bytecode + off; + } +#endif _Py_Instrument(_PyFrame_GetCode(frame), tstate->interp); monitor_throw(tstate, frame, frame->instr_ptr); /* TO DO -- Monitor throw entry. */ @@ -983,7 +996,7 @@ exception_unwind: Python main loop. */ PyObject *exc = _PyErr_GetRaisedException(tstate); PUSH(PyStackRef_FromPyObjectSteal(exc)); - next_instr = _PyCode_CODE(_PyFrame_GetCode(frame)) + handler; + next_instr = _PyFrame_GetBytecode(frame) + handler; if (monitor_handled(tstate, frame, next_instr, exc) < 0) { goto exception_unwind; @@ -1045,6 +1058,8 @@ enter_tier_two: #undef ENABLE_SPECIALIZATION #define ENABLE_SPECIALIZATION 0 +#undef ENABLE_SPECIALIZATION_FT +#define ENABLE_SPECIALIZATION_FT 0 #ifdef Py_DEBUG #define DPRINTF(level, ...) \ @@ -1139,7 +1154,7 @@ exit_to_tier1_dynamic: goto goto_to_tier1; exit_to_tier1: assert(next_uop[-1].format == UOP_FORMAT_TARGET); - next_instr = next_uop[-1].target + _PyCode_CODE(_PyFrame_GetCode(frame)); + next_instr = next_uop[-1].target + _PyFrame_GetBytecode(frame); goto_to_tier1: #ifdef Py_DEBUG if (lltrace >= 2) { @@ -1764,7 +1779,7 @@ _PyEvalFramePushAndInit(PyThreadState *tstate, _PyStackRef func, if (frame == NULL) { goto fail; } - _PyFrame_Initialize(frame, func, locals, code, 0, previous); + _PyFrame_Initialize(tstate, frame, func, locals, code, 0, previous); if (initialize_locals(tstate, func_obj, frame->localsplus, args, argcount, kwnames)) { assert(frame->owner == FRAME_OWNED_BY_THREAD); clear_thread_frame(tstate, frame); diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index 6674c4ccf9f..5df55813a0d 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -151,7 +151,7 @@ GETITEM(PyObject *v, Py_ssize_t i) { /* Code access macros */ /* The integer overflow is checked by an assertion below. */ -#define INSTR_OFFSET() ((int)(next_instr - _PyCode_CODE(_PyFrame_GetCode(frame)))) +#define INSTR_OFFSET() ((int)(next_instr - _PyFrame_GetBytecode(frame))) #define NEXTOPARG() do { \ _Py_CODEUNIT word = {.cache = FT_ATOMIC_LOAD_UINT16_RELAXED(*(uint16_t*)next_instr)}; \ opcode = word.op.code; \ @@ -301,14 +301,6 @@ GETITEM(PyObject *v, Py_ssize_t i) { #define ADAPTIVE_COUNTER_TRIGGERS(COUNTER) \ backoff_counter_triggers(forge_backoff_counter((COUNTER))) -#ifdef Py_GIL_DISABLED -#define ADVANCE_ADAPTIVE_COUNTER(COUNTER) \ - do { \ - /* gh-115999 tracks progress on addressing this. */ \ - static_assert(0, "The specializing interpreter is not yet thread-safe"); \ - } while (0); -#define PAUSE_ADAPTIVE_COUNTER(COUNTER) ((void)COUNTER) -#else #define ADVANCE_ADAPTIVE_COUNTER(COUNTER) \ do { \ (COUNTER) = advance_backoff_counter((COUNTER)); \ @@ -318,6 +310,18 @@ GETITEM(PyObject *v, Py_ssize_t i) { do { \ (COUNTER) = pause_backoff_counter((COUNTER)); \ } while (0); + +#ifdef ENABLE_SPECIALIZATION_FT +/* Multiple threads may execute these concurrently if thread-local bytecode is + * disabled and they all execute the main copy of the bytecode. Specialization + * is disabled in that case so the value is unused, but the RMW cycle should be + * free of data races. + */ +#define RECORD_BRANCH_TAKEN(bitset, flag) \ + FT_ATOMIC_STORE_UINT16_RELAXED( \ + bitset, (FT_ATOMIC_LOAD_UINT16_RELAXED(bitset) << 1) | (flag)) +#else +#define RECORD_BRANCH_TAKEN(bitset, flag) #endif #define UNBOUNDLOCAL_ERROR_MSG \ diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index ff4a0a52a0b..9fac4e881b8 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -41,6 +41,8 @@ /* _QUICKEN_RESUME is not a viable micro-op for tier 2 because it uses the 'this_instr' variable */ + /* _LOAD_BYTECODE is not a viable micro-op for tier 2 because it uses the 'this_instr' variable */ + case _RESUME_CHECK: { #if defined(__EMSCRIPTEN__) if (_Py_emscripten_signal_clock == 0) { @@ -56,6 +58,13 @@ UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } + #ifdef Py_GIL_DISABLED + if (frame->tlbc_index != + ((_PyThreadStateImpl *)tstate)->tlbc_index) { + UOP_STAT_INC(uopcode, miss); + JUMP_TO_JUMP_TARGET(); + } + #endif break; } @@ -4480,8 +4489,8 @@ _PyFrame_SetStackPointer(frame, stack_pointer); _PyInterpreterFrame *shim = _PyFrame_PushTrampolineUnchecked( tstate, (PyCodeObject *)&_Py_InitCleanup, 1, frame); + assert(_PyFrame_GetBytecode(shim)[0].op.code == EXIT_INIT_CHECK); stack_pointer = _PyFrame_GetStackPointer(frame); - assert(_PyCode_CODE(_PyFrame_GetCode(shim))[0].op.code == EXIT_INIT_CHECK); /* Push self onto stack of shim */ shim->localsplus[0] = PyStackRef_DUP(self[0]); _PyFrame_SetStackPointer(frame, stack_pointer); @@ -5683,7 +5692,9 @@ PyObject *exit_p = (PyObject *)CURRENT_OPERAND(); _PyExitData *exit = (_PyExitData *)exit_p; PyCodeObject *code = _PyFrame_GetCode(frame); - _Py_CODEUNIT *target = _PyCode_CODE(code) + exit->target; + _PyFrame_SetStackPointer(frame, stack_pointer); + _Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target; + stack_pointer = _PyFrame_GetStackPointer(frame); #if defined(Py_DEBUG) && !defined(_Py_JIT) OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); if (lltrace >= 2) { @@ -5692,7 +5703,7 @@ _PyUOpPrint(&next_uop[-1]); printf(", exit %u, temp %d, target %d -> %s]\n", exit - current_executor->exits, exit->temperature.value_and_backoff, - (int)(target - _PyCode_CODE(code)), + (int)(target - _PyFrame_GetBytecode(frame)), _PyOpcode_OpName[target->op.code]); stack_pointer = _PyFrame_GetStackPointer(frame); } @@ -5878,7 +5889,7 @@ _PyUOpPrint(&next_uop[-1]); printf(", exit %u, temp %d, target %d -> %s]\n", exit - current_executor->exits, exit->temperature.value_and_backoff, - (int)(target - _PyCode_CODE(_PyFrame_GetCode(frame))), + (int)(target - _PyFrame_GetBytecode(frame)), _PyOpcode_OpName[target->op.code]); stack_pointer = _PyFrame_GetStackPointer(frame); } @@ -5956,9 +5967,11 @@ case _ERROR_POP_N: { oparg = CURRENT_OPARG(); uint32_t target = (uint32_t)CURRENT_OPERAND(); - frame->instr_ptr = ((_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive) + target; stack_pointer += -oparg; assert(WITHIN_STACK_BOUNDS()); + _PyFrame_SetStackPointer(frame, stack_pointer); + frame->instr_ptr = _PyFrame_GetBytecode(frame) + target; + stack_pointer = _PyFrame_GetStackPointer(frame); GOTO_UNWIND(); break; } diff --git a/Python/frame.c b/Python/frame.c index 35e6c2d0a93..9a865e57d97 100644 --- a/Python/frame.c +++ b/Python/frame.c @@ -63,7 +63,8 @@ take_ownership(PyFrameObject *f, _PyInterpreterFrame *frame) // This may be a newly-created generator or coroutine frame. Since it's // dead anyways, just pretend that the first RESUME ran: PyCodeObject *code = _PyFrame_GetCode(frame); - frame->instr_ptr = _PyCode_CODE(code) + code->_co_firsttraceable + 1; + frame->instr_ptr = + _PyFrame_GetBytecode(frame) + code->_co_firsttraceable + 1; } assert(!_PyFrame_IsIncomplete(frame)); assert(f->f_back == NULL); diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 1969ed608ea..986d80c18d3 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -1953,16 +1953,22 @@ custom_visitor_wrapper(const mi_heap_t *heap, const mi_heap_area_t *area, } void -PyUnstable_GC_VisitObjects(gcvisitobjects_t callback, void *arg) +_PyGC_VisitObjectsWorldStopped(PyInterpreterState *interp, + gcvisitobjects_t callback, void *arg) { - PyInterpreterState *interp = _PyInterpreterState_GET(); struct custom_visitor_args wrapper = { .callback = callback, .arg = arg, }; - - _PyEval_StopTheWorld(interp); gc_visit_heaps(interp, &custom_visitor_wrapper, &wrapper.base); +} + +void +PyUnstable_GC_VisitObjects(gcvisitobjects_t callback, void *arg) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); + _PyEval_StopTheWorld(interp); + _PyGC_VisitObjectsWorldStopped(interp, callback, arg); _PyEval_StartTheWorld(interp); } diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 632cbc7790a..eff246f1997 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -25,7 +25,7 @@ lhs = stack_pointer[-2]; uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZATION_FT if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _PyFrame_SetStackPointer(frame, stack_pointer); @@ -35,7 +35,7 @@ } OPCODE_DEFERRED_INC(BINARY_OP); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZATION_FT */ assert(NB_ADD <= oparg); assert(oparg <= NB_INPLACE_XOR); } @@ -435,8 +435,8 @@ container = stack_pointer[-2]; uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - assert(frame->stackpointer == NULL); #if ENABLE_SPECIALIZATION + assert(frame->stackpointer == NULL); if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _PyFrame_SetStackPointer(frame, stack_pointer); @@ -1066,8 +1066,8 @@ _PyFrame_SetStackPointer(frame, stack_pointer); _PyInterpreterFrame *shim = _PyFrame_PushTrampolineUnchecked( tstate, (PyCodeObject *)&_Py_InitCleanup, 1, frame); + assert(_PyFrame_GetBytecode(shim)[0].op.code == EXIT_INIT_CHECK); stack_pointer = _PyFrame_GetStackPointer(frame); - assert(_PyCode_CODE(_PyFrame_GetCode(shim))[0].op.code == EXIT_INIT_CHECK); /* Push self onto stack of shim */ shim->localsplus[0] = PyStackRef_DUP(self[0]); _PyFrame_SetStackPointer(frame, stack_pointer); @@ -4711,7 +4711,9 @@ int original_opcode = 0; if (tstate->tracing) { PyCodeObject *code = _PyFrame_GetCode(frame); - original_opcode = code->_co_monitoring->lines[(int)(this_instr - _PyCode_CODE(code))].original_opcode; + _PyFrame_SetStackPointer(frame, stack_pointer); + original_opcode = code->_co_monitoring->lines[(int)(this_instr - _PyFrame_GetBytecode(frame))].original_opcode; + stack_pointer = _PyFrame_GetStackPointer(frame); next_instr = this_instr; } else { _PyFrame_SetStackPointer(frame, stack_pointer); @@ -4759,9 +4761,7 @@ assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_False); int offset = flag * oparg; - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH); DISPATCH(); } @@ -4782,9 +4782,7 @@ PyStackRef_CLOSE(value_stackref); offset = 0; } - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH); DISPATCH(); } @@ -4822,9 +4820,7 @@ assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_True); int offset = flag * oparg; - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH); DISPATCH(); } @@ -4834,6 +4830,28 @@ (void)this_instr; next_instr += 1; INSTRUCTION_STATS(INSTRUMENTED_RESUME); + // _LOAD_BYTECODE + { + #ifdef Py_GIL_DISABLED + if (frame->tlbc_index != + ((_PyThreadStateImpl *)tstate)->tlbc_index) { + _PyFrame_SetStackPointer(frame, stack_pointer); + _Py_CODEUNIT *bytecode = + _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame)); + stack_pointer = _PyFrame_GetStackPointer(frame); + if (bytecode == NULL) goto error; + _PyFrame_SetStackPointer(frame, stack_pointer); + int off = this_instr - _PyFrame_GetBytecode(frame); + stack_pointer = _PyFrame_GetStackPointer(frame); + frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index; + frame->instr_ptr = bytecode + off; + // Make sure this_instr gets reset correctley for any uops that + // follow + next_instr = frame->instr_ptr; + DISPATCH(); + } + #endif + } // _MAYBE_INSTRUMENT { if (tstate->tracing == 0) { @@ -6646,9 +6664,7 @@ cond = stack_pointer[-1]; assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_False); - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); JUMPBY(oparg * flag); stack_pointer += -1; assert(WITHIN_STACK_BOUNDS()); @@ -6680,9 +6696,7 @@ cond = b; assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_True); - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); JUMPBY(oparg * flag); } stack_pointer += -1; @@ -6715,9 +6729,7 @@ cond = b; assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_False); - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); JUMPBY(oparg * flag); } stack_pointer += -1; @@ -6735,9 +6747,7 @@ cond = stack_pointer[-1]; assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_True); - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); JUMPBY(oparg * flag); stack_pointer += -1; assert(WITHIN_STACK_BOUNDS()); @@ -6832,7 +6842,11 @@ if (oparg) { PyObject *lasti = PyStackRef_AsPyObjectBorrow(values[0]); if (PyLong_Check(lasti)) { - frame->instr_ptr = _PyCode_CODE(_PyFrame_GetCode(frame)) + PyLong_AsLong(lasti); + stack_pointer += -1; + assert(WITHIN_STACK_BOUNDS()); + _PyFrame_SetStackPointer(frame, stack_pointer); + frame->instr_ptr = _PyFrame_GetBytecode(frame) + PyLong_AsLong(lasti); + stack_pointer = _PyFrame_GetStackPointer(frame); assert(!_PyErr_Occurred(tstate)); } else { @@ -6844,6 +6858,8 @@ Py_DECREF(exc); goto error; } + stack_pointer += 1; + assert(WITHIN_STACK_BOUNDS()); } assert(exc && PyExceptionInstance_Check(exc)); stack_pointer += -1; @@ -6871,6 +6887,28 @@ PREDICTED(RESUME); _Py_CODEUNIT* const this_instr = next_instr - 1; (void)this_instr; + // _LOAD_BYTECODE + { + #ifdef Py_GIL_DISABLED + if (frame->tlbc_index != + ((_PyThreadStateImpl *)tstate)->tlbc_index) { + _PyFrame_SetStackPointer(frame, stack_pointer); + _Py_CODEUNIT *bytecode = + _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame)); + stack_pointer = _PyFrame_GetStackPointer(frame); + if (bytecode == NULL) goto error; + _PyFrame_SetStackPointer(frame, stack_pointer); + int off = this_instr - _PyFrame_GetBytecode(frame); + stack_pointer = _PyFrame_GetStackPointer(frame); + frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index; + frame->instr_ptr = bytecode + off; + // Make sure this_instr gets reset correctley for any uops that + // follow + next_instr = frame->instr_ptr; + DISPATCH(); + } + #endif + } // _MAYBE_INSTRUMENT { if (tstate->tracing == 0) { @@ -6890,11 +6928,11 @@ } // _QUICKEN_RESUME { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZATION_FT if (tstate->tracing == 0 && this_instr->op.code == RESUME) { FT_ATOMIC_STORE_UINT8_RELAXED(this_instr->op.code, RESUME_CHECK); } - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZATION_FT */ } // _CHECK_PERIODIC_IF_NOT_YIELD_FROM { @@ -6925,6 +6963,10 @@ uintptr_t version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version); assert((version & _PY_EVAL_EVENTS_MASK) == 0); DEOPT_IF(eval_breaker != version, RESUME); + #ifdef Py_GIL_DISABLED + DEOPT_IF(frame->tlbc_index != + ((_PyThreadStateImpl *)tstate)->tlbc_index, RESUME); + #endif DISPATCH(); } diff --git a/Python/index_pool.c b/Python/index_pool.c new file mode 100644 index 00000000000..526eccff74a --- /dev/null +++ b/Python/index_pool.c @@ -0,0 +1,193 @@ +#include + +#include "Python.h" + +#include "pycore_index_pool.h" +#include "pycore_lock.h" + +#ifdef Py_GIL_DISABLED + +static inline void +swap(int32_t *values, Py_ssize_t i, Py_ssize_t j) +{ + int32_t tmp = values[i]; + values[i] = values[j]; + values[j] = tmp; +} + +static bool +heap_try_swap(_PyIndexHeap *heap, Py_ssize_t i, Py_ssize_t j) +{ + if (i < 0 || i >= heap->size) { + return 0; + } + if (j < 0 || j >= heap->size) { + return 0; + } + if (i <= j) { + if (heap->values[i] <= heap->values[j]) { + return 0; + } + } + else if (heap->values[j] <= heap->values[i]) { + return 0; + } + swap(heap->values, i, j); + return 1; +} + +static inline Py_ssize_t +parent(Py_ssize_t i) +{ + return (i - 1) / 2; +} + +static inline Py_ssize_t +left_child(Py_ssize_t i) +{ + return 2 * i + 1; +} + +static inline Py_ssize_t +right_child(Py_ssize_t i) +{ + return 2 * i + 2; +} + +static void +heap_add(_PyIndexHeap *heap, int32_t val) +{ + assert(heap->size < heap->capacity); + // Add val to end + heap->values[heap->size] = val; + heap->size++; + // Sift up + for (Py_ssize_t cur = heap->size - 1; cur > 0; cur = parent(cur)) { + if (!heap_try_swap(heap, cur, parent(cur))) { + break; + } + } +} + +static Py_ssize_t +heap_min_child(_PyIndexHeap *heap, Py_ssize_t i) +{ + if (left_child(i) < heap->size) { + if (right_child(i) < heap->size) { + Py_ssize_t lval = heap->values[left_child(i)]; + Py_ssize_t rval = heap->values[right_child(i)]; + return lval < rval ? left_child(i) : right_child(i); + } + return left_child(i); + } + else if (right_child(i) < heap->size) { + return right_child(i); + } + return -1; +} + +static int32_t +heap_pop(_PyIndexHeap *heap) +{ + assert(heap->size > 0); + // Pop smallest and replace with the last element + int32_t result = heap->values[0]; + heap->values[0] = heap->values[heap->size - 1]; + heap->size--; + // Sift down + for (Py_ssize_t cur = 0; cur < heap->size;) { + Py_ssize_t min_child = heap_min_child(heap, cur); + if (min_child > -1 && heap_try_swap(heap, cur, min_child)) { + cur = min_child; + } + else { + break; + } + } + return result; +} + +static int +heap_ensure_capacity(_PyIndexHeap *heap, Py_ssize_t limit) +{ + assert(limit > 0); + if (heap->capacity > limit) { + return 0; + } + Py_ssize_t new_capacity = heap->capacity ? heap->capacity : 1024; + while (new_capacity && new_capacity < limit) { + new_capacity <<= 1; + } + if (!new_capacity) { + return -1; + } + int32_t *new_values = PyMem_RawCalloc(new_capacity, sizeof(int32_t)); + if (new_values == NULL) { + return -1; + } + if (heap->values != NULL) { + memcpy(new_values, heap->values, heap->capacity); + PyMem_RawFree(heap->values); + } + heap->values = new_values; + heap->capacity = new_capacity; + return 0; +} + +static void +heap_fini(_PyIndexHeap *heap) +{ + if (heap->values != NULL) { + PyMem_RawFree(heap->values); + heap->values = NULL; + } + heap->size = -1; + heap->capacity = -1; +} + +#define LOCK_POOL(pool) PyMutex_LockFlags(&pool->mutex, _Py_LOCK_DONT_DETACH) +#define UNLOCK_POOL(pool) PyMutex_Unlock(&pool->mutex) + +int32_t +_PyIndexPool_AllocIndex(_PyIndexPool *pool) +{ + LOCK_POOL(pool); + int32_t index; + _PyIndexHeap *free_indices = &pool->free_indices; + if (free_indices->size == 0) { + // No free indices. Make sure the heap can always store all of the + // indices that have been allocated to avoid having to allocate memory + // (which can fail) when freeing an index. Freeing indices happens when + // threads are being destroyed, which makes error handling awkward / + // impossible. This arrangement shifts handling of allocation failures + // to when indices are allocated, which happens at thread creation, + // where we are better equipped to deal with failure. + if (heap_ensure_capacity(free_indices, pool->next_index + 1) < 0) { + UNLOCK_POOL(pool); + PyErr_NoMemory(); + return -1; + } + index = pool->next_index++; + } + else { + index = heap_pop(free_indices); + } + UNLOCK_POOL(pool); + return index; +} + +void +_PyIndexPool_FreeIndex(_PyIndexPool *pool, int32_t index) +{ + LOCK_POOL(pool); + heap_add(&pool->free_indices, index); + UNLOCK_POOL(pool); +} + +void +_PyIndexPool_Fini(_PyIndexPool *pool) +{ + heap_fini(&pool->free_indices); +} + +#endif // Py_GIL_DISABLED diff --git a/Python/initconfig.c b/Python/initconfig.c index c142438b02b..438f8a5c1cf 100644 --- a/Python/initconfig.c +++ b/Python/initconfig.c @@ -134,6 +134,7 @@ static const PyConfigSpec PYCONFIG_SPEC[] = { SPEC(dump_refs_file, WSTR_OPT, READ_ONLY, NO_SYS), #ifdef Py_GIL_DISABLED SPEC(enable_gil, INT, READ_ONLY, NO_SYS), + SPEC(tlbc_enabled, INT, READ_ONLY, NO_SYS), #endif SPEC(faulthandler, BOOL, READ_ONLY, NO_SYS), SPEC(filesystem_encoding, WSTR, READ_ONLY, NO_SYS), @@ -315,8 +316,13 @@ The following implementation-specific options are available:\n\ "\ -X showrefcount: output the total reference count and number of used\n\ memory blocks when the program finishes or after each statement in\n\ - the interactive interpreter; only works on debug builds\n\ --X tracemalloc[=N]: trace Python memory allocations; N sets a traceback limit\n\ + the interactive interpreter; only works on debug builds\n" +#ifdef Py_GIL_DISABLED +"-X tlbc=[0|1]: enable (1) or disable (0) thread-local bytecode. Also\n\ + PYTHON_TLBC\n" +#endif +"\ +-X tracemalloc[=N]: trace Python memory allocations; N sets a traceback limit\n \ of N frames (default: 1); also PYTHONTRACEMALLOC=N\n\ -X utf8[=0|1]: enable (1) or disable (0) UTF-8 mode; also PYTHONUTF8\n\ -X warn_default_encoding: enable opt-in EncodingWarning for 'encoding=None';\n\ @@ -400,6 +406,9 @@ static const char usage_envvars[] = #ifdef Py_STATS "PYTHONSTATS : turns on statistics gathering (-X pystats)\n" #endif +#ifdef Py_GIL_DISABLED +"PYTHON_TLBC : when set to 0, disables thread-local bytecode (-X tlbc)\n" +#endif "PYTHONTRACEMALLOC: trace Python memory allocations (-X tracemalloc)\n" "PYTHONUNBUFFERED: disable stdout/stderr buffering (-u)\n" "PYTHONUTF8 : control the UTF-8 mode (-X utf8)\n" @@ -979,6 +988,7 @@ _PyConfig_InitCompatConfig(PyConfig *config) config->cpu_count = -1; #ifdef Py_GIL_DISABLED config->enable_gil = _PyConfig_GIL_DEFAULT; + config->tlbc_enabled = 1; #endif } @@ -1862,6 +1872,36 @@ error: "n must be greater than 0"); } +static PyStatus +config_init_tlbc(PyConfig *config) +{ +#ifdef Py_GIL_DISABLED + const char *env = config_get_env(config, "PYTHON_TLBC"); + if (env) { + int enabled; + if (_Py_str_to_int(env, &enabled) < 0 || (enabled < 0) || (enabled > 1)) { + return _PyStatus_ERR( + "PYTHON_TLBC=N: N is missing or invalid"); + } + config->tlbc_enabled = enabled; + } + + const wchar_t *xoption = config_get_xoption(config, L"tlbc"); + if (xoption) { + int enabled; + const wchar_t *sep = wcschr(xoption, L'='); + if (!sep || (config_wstr_to_int(sep + 1, &enabled) < 0) || (enabled < 0) || (enabled > 1)) { + return _PyStatus_ERR( + "-X tlbc=n: n is missing or invalid"); + } + config->tlbc_enabled = enabled; + } + return _PyStatus_OK(); +#else + return _PyStatus_OK(); +#endif +} + static PyStatus config_init_perf_profiling(PyConfig *config) { @@ -2111,6 +2151,11 @@ config_read_complex_options(PyConfig *config) } #endif + status = config_init_tlbc(config); + if (_PyStatus_EXCEPTION(status)) { + return status; + } + return _PyStatus_OK(); } diff --git a/Python/instrumentation.c b/Python/instrumentation.c index d4568764117..87c2addaf80 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -44,10 +44,24 @@ #define UNLOCK_CODE() Py_END_CRITICAL_SECTION() +#define MODIFY_BYTECODE(code, func, ...) \ + do { \ + PyCodeObject *co = (code); \ + for (Py_ssize_t i = 0; i < code->co_tlbc->size; i++) { \ + char *bc = co->co_tlbc->entries[i]; \ + if (bc == NULL) { \ + continue; \ + } \ + (func)((_Py_CODEUNIT *)bc, __VA_ARGS__); \ + } \ + } while (0) + #else #define LOCK_CODE(code) #define UNLOCK_CODE() +#define MODIFY_BYTECODE(code, func, ...) \ + (func)(_PyCode_CODE(code), __VA_ARGS__) #endif @@ -309,7 +323,8 @@ _PyInstruction_GetLength(PyCodeObject *code, int offset) { ASSERT_WORLD_STOPPED_OR_LOCKED(code); - int opcode = _PyCode_CODE(code)[offset].op.code; + int opcode = + FT_ATOMIC_LOAD_UINT8_RELAXED(_PyCode_CODE(code)[offset].op.code); assert(opcode != 0); assert(opcode != RESERVED); if (opcode == INSTRUMENTED_LINE) { @@ -578,7 +593,9 @@ sanity_check_instrumentation(PyCodeObject *code) _Py_CODEUNIT _Py_GetBaseCodeUnit(PyCodeObject *code, int i) { - _Py_CODEUNIT inst = _PyCode_CODE(code)[i]; + _Py_CODEUNIT *src_instr = _PyCode_CODE(code) + i; + _Py_CODEUNIT inst = { + .cache = FT_ATOMIC_LOAD_UINT16_RELAXED(*(uint16_t *)src_instr)}; int opcode = inst.op.code; if (opcode < MIN_INSTRUMENTED_OPCODE) { inst.op.code = _PyOpcode_Deopt[opcode]; @@ -614,21 +631,22 @@ _Py_GetBaseCodeUnit(PyCodeObject *code, int i) } static void -de_instrument(PyCodeObject *code, int i, int event) +de_instrument(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i, + int event) { assert(event != PY_MONITORING_EVENT_INSTRUCTION); assert(event != PY_MONITORING_EVENT_LINE); - _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i]; + _Py_CODEUNIT *instr = &bytecode[i]; uint8_t *opcode_ptr = &instr->op.code; int opcode = *opcode_ptr; assert(opcode != ENTER_EXECUTOR); if (opcode == INSTRUMENTED_LINE) { - opcode_ptr = &code->_co_monitoring->lines[i].original_opcode; + opcode_ptr = &monitoring->lines[i].original_opcode; opcode = *opcode_ptr; } if (opcode == INSTRUMENTED_INSTRUCTION) { - opcode_ptr = &code->_co_monitoring->per_instruction_opcodes[i]; + opcode_ptr = &monitoring->per_instruction_opcodes[i]; opcode = *opcode_ptr; } int deinstrumented = DE_INSTRUMENT[opcode]; @@ -644,65 +662,68 @@ de_instrument(PyCodeObject *code, int i, int event) } static void -de_instrument_line(PyCodeObject *code, int i) +de_instrument_line(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, + int i) { - _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i]; + _Py_CODEUNIT *instr = &bytecode[i]; int opcode = instr->op.code; if (opcode != INSTRUMENTED_LINE) { return; } - _PyCoLineInstrumentationData *lines = &code->_co_monitoring->lines[i]; + _PyCoLineInstrumentationData *lines = &monitoring->lines[i]; int original_opcode = lines->original_opcode; if (original_opcode == INSTRUMENTED_INSTRUCTION) { - lines->original_opcode = code->_co_monitoring->per_instruction_opcodes[i]; + lines->original_opcode = monitoring->per_instruction_opcodes[i]; } CHECK(original_opcode != 0); CHECK(original_opcode == _PyOpcode_Deopt[original_opcode]); - instr->op.code = original_opcode; + FT_ATOMIC_STORE_UINT8(instr->op.code, original_opcode); if (_PyOpcode_Caches[original_opcode]) { - instr[1].counter = adaptive_counter_warmup(); + FT_ATOMIC_STORE_UINT16_RELAXED(instr[1].counter.value_and_backoff, + adaptive_counter_warmup().value_and_backoff); } assert(instr->op.code != INSTRUMENTED_LINE); } static void -de_instrument_per_instruction(PyCodeObject *code, int i) +de_instrument_per_instruction(_Py_CODEUNIT *bytecode, + _PyCoMonitoringData *monitoring, int i) { - _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i]; + _Py_CODEUNIT *instr = &bytecode[i]; uint8_t *opcode_ptr = &instr->op.code; int opcode = *opcode_ptr; if (opcode == INSTRUMENTED_LINE) { - opcode_ptr = &code->_co_monitoring->lines[i].original_opcode; + opcode_ptr = &monitoring->lines[i].original_opcode; opcode = *opcode_ptr; } if (opcode != INSTRUMENTED_INSTRUCTION) { return; } - int original_opcode = code->_co_monitoring->per_instruction_opcodes[i]; + int original_opcode = monitoring->per_instruction_opcodes[i]; CHECK(original_opcode != 0); CHECK(original_opcode == _PyOpcode_Deopt[original_opcode]); - *opcode_ptr = original_opcode; + FT_ATOMIC_STORE_UINT8_RELAXED(*opcode_ptr, original_opcode); if (_PyOpcode_Caches[original_opcode]) { - instr[1].counter = adaptive_counter_warmup(); + FT_ATOMIC_STORE_UINT16_RELAXED(instr[1].counter.value_and_backoff, + adaptive_counter_warmup().value_and_backoff); } assert(*opcode_ptr != INSTRUMENTED_INSTRUCTION); assert(instr->op.code != INSTRUMENTED_INSTRUCTION); } - static void -instrument(PyCodeObject *code, int i) +instrument(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i) { - _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i]; + _Py_CODEUNIT *instr = &bytecode[i]; uint8_t *opcode_ptr = &instr->op.code; int opcode =*opcode_ptr; if (opcode == INSTRUMENTED_LINE) { - _PyCoLineInstrumentationData *lines = &code->_co_monitoring->lines[i]; + _PyCoLineInstrumentationData *lines = &monitoring->lines[i]; opcode_ptr = &lines->original_opcode; opcode = *opcode_ptr; } if (opcode == INSTRUMENTED_INSTRUCTION) { - opcode_ptr = &code->_co_monitoring->per_instruction_opcodes[i]; + opcode_ptr = &monitoring->per_instruction_opcodes[i]; opcode = *opcode_ptr; CHECK(opcode != INSTRUMENTED_INSTRUCTION && opcode != INSTRUMENTED_LINE); CHECK(opcode == _PyOpcode_Deopt[opcode]); @@ -716,52 +737,52 @@ instrument(PyCodeObject *code, int i) if (_PyOpcode_Caches[deopt]) { FT_ATOMIC_STORE_UINT16_RELAXED(instr[1].counter.value_and_backoff, adaptive_counter_warmup().value_and_backoff); - instr[1].counter = adaptive_counter_warmup(); } } } static void -instrument_line(PyCodeObject *code, int i) +instrument_line(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i) { - uint8_t *opcode_ptr = &_PyCode_CODE(code)[i].op.code; + uint8_t *opcode_ptr = &bytecode[i].op.code; int opcode = *opcode_ptr; if (opcode == INSTRUMENTED_LINE) { return; } - _PyCoLineInstrumentationData *lines = &code->_co_monitoring->lines[i]; + _PyCoLineInstrumentationData *lines = &monitoring->lines[i]; lines->original_opcode = _PyOpcode_Deopt[opcode]; CHECK(lines->original_opcode > 0); - *opcode_ptr = INSTRUMENTED_LINE; + FT_ATOMIC_STORE_UINT8_RELAXED(*opcode_ptr, INSTRUMENTED_LINE); } static void -instrument_per_instruction(PyCodeObject *code, int i) +instrument_per_instruction(_Py_CODEUNIT *bytecode, + _PyCoMonitoringData *monitoring, int i) { - _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i]; + _Py_CODEUNIT *instr = &bytecode[i]; uint8_t *opcode_ptr = &instr->op.code; int opcode = *opcode_ptr; if (opcode == INSTRUMENTED_LINE) { - _PyCoLineInstrumentationData *lines = &code->_co_monitoring->lines[i]; + _PyCoLineInstrumentationData *lines = &monitoring->lines[i]; opcode_ptr = &lines->original_opcode; opcode = *opcode_ptr; } if (opcode == INSTRUMENTED_INSTRUCTION) { - assert(code->_co_monitoring->per_instruction_opcodes[i] > 0); + assert(monitoring->per_instruction_opcodes[i] > 0); return; } CHECK(opcode != 0); if (is_instrumented(opcode)) { - code->_co_monitoring->per_instruction_opcodes[i] = opcode; + monitoring->per_instruction_opcodes[i] = opcode; } else { assert(opcode != 0); assert(_PyOpcode_Deopt[opcode] != 0); assert(_PyOpcode_Deopt[opcode] != RESUME); - code->_co_monitoring->per_instruction_opcodes[i] = _PyOpcode_Deopt[opcode]; + monitoring->per_instruction_opcodes[i] = _PyOpcode_Deopt[opcode]; } - assert(code->_co_monitoring->per_instruction_opcodes[i] > 0); - *opcode_ptr = INSTRUMENTED_INSTRUCTION; + assert(monitoring->per_instruction_opcodes[i] > 0); + FT_ATOMIC_STORE_UINT8_RELAXED(*opcode_ptr, INSTRUMENTED_INSTRUCTION); } static void @@ -773,19 +794,19 @@ remove_tools(PyCodeObject * code, int offset, int event, int tools) assert(PY_MONITORING_IS_INSTRUMENTED_EVENT(event)); assert(opcode_has_event(_Py_GetBaseCodeUnit(code, offset).op.code)); _PyCoMonitoringData *monitoring = code->_co_monitoring; + bool should_de_instrument; if (monitoring && monitoring->tools) { monitoring->tools[offset] &= ~tools; - if (monitoring->tools[offset] == 0) { - de_instrument(code, offset, event); - } + should_de_instrument = (monitoring->tools[offset] == 0); } else { /* Single tool */ uint8_t single_tool = code->_co_monitoring->active_monitors.tools[event]; assert(_Py_popcount32(single_tool) <= 1); - if (((single_tool & tools) == single_tool)) { - de_instrument(code, offset, event); - } + should_de_instrument = ((single_tool & tools) == single_tool); + } + if (should_de_instrument) { + MODIFY_BYTECODE(code, de_instrument, monitoring, offset, event); } } @@ -804,22 +825,23 @@ remove_line_tools(PyCodeObject * code, int offset, int tools) { ASSERT_WORLD_STOPPED_OR_LOCKED(code); - assert(code->_co_monitoring); - if (code->_co_monitoring->line_tools) + _PyCoMonitoringData *monitoring = code->_co_monitoring; + assert(monitoring); + bool should_de_instrument; + if (monitoring->line_tools) { - uint8_t *toolsptr = &code->_co_monitoring->line_tools[offset]; + uint8_t *toolsptr = &monitoring->line_tools[offset]; *toolsptr &= ~tools; - if (*toolsptr == 0 ) { - de_instrument_line(code, offset); - } + should_de_instrument = (*toolsptr == 0); } else { /* Single tool */ - uint8_t single_tool = code->_co_monitoring->active_monitors.tools[PY_MONITORING_EVENT_LINE]; + uint8_t single_tool = monitoring->active_monitors.tools[PY_MONITORING_EVENT_LINE]; assert(_Py_popcount32(single_tool) <= 1); - if (((single_tool & tools) == single_tool)) { - de_instrument_line(code, offset); - } + should_de_instrument = ((single_tool & tools) == single_tool); + } + if (should_de_instrument) { + MODIFY_BYTECODE(code, de_instrument_line, monitoring, offset); } } @@ -841,7 +863,7 @@ add_tools(PyCodeObject * code, int offset, int event, int tools) assert(_Py_popcount32(tools) == 1); assert(tools_is_subset_for_event(code, event, tools)); } - instrument(code, offset); + MODIFY_BYTECODE(code, instrument, code->_co_monitoring, offset); } static void @@ -858,7 +880,7 @@ add_line_tools(PyCodeObject * code, int offset, int tools) /* Single tool */ assert(_Py_popcount32(tools) == 1); } - instrument_line(code, offset); + MODIFY_BYTECODE(code, instrument_line, code->_co_monitoring, offset); } @@ -876,7 +898,7 @@ add_per_instruction_tools(PyCodeObject * code, int offset, int tools) /* Single tool */ assert(_Py_popcount32(tools) == 1); } - instrument_per_instruction(code, offset); + MODIFY_BYTECODE(code, instrument_per_instruction, code->_co_monitoring, offset); } @@ -885,21 +907,22 @@ remove_per_instruction_tools(PyCodeObject * code, int offset, int tools) { ASSERT_WORLD_STOPPED_OR_LOCKED(code); + _PyCoMonitoringData *monitoring = code->_co_monitoring; assert(code->_co_monitoring); + bool should_de_instrument; if (code->_co_monitoring->per_instruction_tools) { uint8_t *toolsptr = &code->_co_monitoring->per_instruction_tools[offset]; *toolsptr &= ~tools; - if (*toolsptr == 0) { - de_instrument_per_instruction(code, offset); - } + should_de_instrument = (*toolsptr == 0); } else { /* Single tool */ uint8_t single_tool = code->_co_monitoring->active_monitors.tools[PY_MONITORING_EVENT_INSTRUCTION]; assert(_Py_popcount32(single_tool) <= 1); - if (((single_tool & tools) == single_tool)) { - de_instrument_per_instruction(code, offset); - } + should_de_instrument = ((single_tool & tools) == single_tool); + } + if (should_de_instrument) { + MODIFY_BYTECODE(code, de_instrument_per_instruction, monitoring, offset); } } @@ -1087,7 +1110,7 @@ call_instrumentation_vector( PyCodeObject *code = _PyFrame_GetCode(frame); assert(args[1] == NULL); args[1] = (PyObject *)code; - int offset = (int)(instr - _PyCode_CODE(code)); + int offset = (int)(instr - _PyFrame_GetBytecode(frame)); /* Offset visible to user should be the offset in bytes, as that is the * convention for APIs involving code offsets. */ int bytes_offset = offset * (int)sizeof(_Py_CODEUNIT); @@ -1173,8 +1196,7 @@ _Py_call_instrumentation_jump( assert(event == PY_MONITORING_EVENT_JUMP || event == PY_MONITORING_EVENT_BRANCH); assert(frame->instr_ptr == instr); - PyCodeObject *code = _PyFrame_GetCode(frame); - int to = (int)(target - _PyCode_CODE(code)); + int to = (int)(target - _PyFrame_GetBytecode(frame)); PyObject *to_obj = PyLong_FromLong(to * (int)sizeof(_Py_CODEUNIT)); if (to_obj == NULL) { return NULL; @@ -1240,7 +1262,8 @@ _Py_call_instrumentation_line(PyThreadState *tstate, _PyInterpreterFrame* frame, PyCodeObject *code = _PyFrame_GetCode(frame); assert(tstate->tracing == 0); assert(debug_check_sanity(tstate->interp, code)); - int i = (int)(instr - _PyCode_CODE(code)); + _Py_CODEUNIT *bytecode = _PyFrame_GetBytecode(frame); + int i = (int)(instr - bytecode); _PyCoMonitoringData *monitoring = code->_co_monitoring; _PyCoLineInstrumentationData *line_data = &monitoring->lines[i]; @@ -1256,10 +1279,10 @@ _Py_call_instrumentation_line(PyThreadState *tstate, _PyInterpreterFrame* frame, line = compute_line(code, i, line_delta); assert(line >= 0); assert(prev != NULL); - int prev_index = (int)(prev - _PyCode_CODE(code)); + int prev_index = (int)(prev - bytecode); int prev_line = _Py_Instrumentation_GetLine(code, prev_index); if (prev_line == line) { - int prev_opcode = _PyCode_CODE(code)[prev_index].op.code; + int prev_opcode = bytecode[prev_index].op.code; /* RESUME and INSTRUMENTED_RESUME are needed for the operation of * instrumentation, so must never be hidden by an INSTRUMENTED_LINE. */ @@ -1359,7 +1382,7 @@ int _Py_call_instrumentation_instruction(PyThreadState *tstate, _PyInterpreterFrame* frame, _Py_CODEUNIT *instr) { PyCodeObject *code = _PyFrame_GetCode(frame); - int offset = (int)(instr - _PyCode_CODE(code)); + int offset = (int)(instr - _PyFrame_GetBytecode(frame)); _PyCoMonitoringData *instrumentation_data = code->_co_monitoring; assert(instrumentation_data->per_instruction_opcodes); int next_opcode = instrumentation_data->per_instruction_opcodes[offset]; diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h index 0a7e44ef78d..54821b23716 100644 --- a/Python/optimizer_cases.c.h +++ b/Python/optimizer_cases.c.h @@ -17,6 +17,8 @@ /* _QUICKEN_RESUME is not a viable micro-op for tier 2 */ + /* _LOAD_BYTECODE is not a viable micro-op for tier 2 */ + case _RESUME_CHECK: { break; } diff --git a/Python/pystate.c b/Python/pystate.c index 36b31f3b9e4..ded5fde9c4b 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -1513,6 +1513,11 @@ new_threadstate(PyInterpreterState *interp, int whence) PyMem_RawFree(new_tstate); return NULL; } + int32_t tlbc_idx = _Py_ReserveTLBCIndex(interp); + if (tlbc_idx < 0) { + PyMem_RawFree(new_tstate); + return NULL; + } #endif /* We serialize concurrent creation to protect global state. */ @@ -1555,6 +1560,7 @@ new_threadstate(PyInterpreterState *interp, int whence) #ifdef Py_GIL_DISABLED // Must be called with lock unlocked to avoid lock ordering deadlocks. _Py_qsbr_register(tstate, interp, qsbr_idx); + tstate->tlbc_index = tlbc_idx; #endif return (PyThreadState *)tstate; @@ -1706,6 +1712,10 @@ PyThreadState_Clear(PyThreadState *tstate) // Remove ourself from the biased reference counting table of threads. _Py_brc_remove_thread(tstate); + + // Release our thread-local copies of the bytecode for reuse by another + // thread + _Py_ClearTLBCIndex((_PyThreadStateImpl *)tstate); #endif // Merge our queue of pointers to be freed into the interpreter queue. diff --git a/Python/specialize.c b/Python/specialize.c index ae47809305a..86cb997ca2c 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -24,6 +24,25 @@ extern const char *_PyUOpName(int index); * ./adaptive.md */ +#ifdef Py_GIL_DISABLED +#define SET_OPCODE_OR_RETURN(instr, opcode) \ + do { \ + uint8_t old_op = _Py_atomic_load_uint8_relaxed(&(instr)->op.code); \ + if (old_op >= MIN_INSTRUMENTED_OPCODE) { \ + /* Lost race with instrumentation */ \ + return; \ + } \ + if (!_Py_atomic_compare_exchange_uint8(&(instr)->op.code, &old_op, \ + (opcode))) { \ + /* Lost race with instrumentation */ \ + assert(old_op >= MIN_INSTRUMENTED_OPCODE); \ + return; \ + } \ + } while (0) +#else +#define SET_OPCODE_OR_RETURN(instr, opcode) (instr)->op.code = (opcode) +#endif + #ifdef Py_STATS GCStats _py_gc_stats[NUM_GENERATIONS] = { 0 }; static PyStats _Py_stats_struct = { .gc_stats = _py_gc_stats }; @@ -436,16 +455,25 @@ do { \ # define SPECIALIZATION_FAIL(opcode, kind) ((void)0) #endif -// Initialize warmup counters and insert superinstructions. This cannot fail. +// Initialize warmup counters and optimize instructions. This cannot fail. void -_PyCode_Quicken(PyCodeObject *code) +_PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size, PyObject *consts, + int enable_counters) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZATION_FT + _Py_BackoffCounter jump_counter, adaptive_counter; + if (enable_counters) { + jump_counter = initial_jump_backoff_counter(); + adaptive_counter = adaptive_counter_warmup(); + } + else { + jump_counter = initial_unreachable_backoff_counter(); + adaptive_counter = initial_unreachable_backoff_counter(); + } int opcode = 0; int oparg = 0; - _Py_CODEUNIT *instructions = _PyCode_CODE(code); /* The last code unit cannot have a cache, so we don't need to check it */ - for (int i = 0; i < Py_SIZE(code)-1; i++) { + for (Py_ssize_t i = 0; i < size-1; i++) { opcode = instructions[i].op.code; int caches = _PyOpcode_Caches[opcode]; oparg = (oparg << 8) | instructions[i].op.arg; @@ -453,7 +481,7 @@ _PyCode_Quicken(PyCodeObject *code) // The initial value depends on the opcode switch (opcode) { case JUMP_BACKWARD: - instructions[i + 1].counter = initial_jump_backoff_counter(); + instructions[i + 1].counter = jump_counter; break; case POP_JUMP_IF_FALSE: case POP_JUMP_IF_TRUE: @@ -462,7 +490,7 @@ _PyCode_Quicken(PyCodeObject *code) instructions[i + 1].cache = 0x5555; // Alternating 0, 1 bits break; default: - instructions[i + 1].counter = adaptive_counter_warmup(); + instructions[i + 1].counter = adaptive_counter; break; } i += caches; @@ -471,7 +499,7 @@ _PyCode_Quicken(PyCodeObject *code) /* We can't do this in the bytecode compiler as * marshalling can intern strings and make them immortal. */ - PyObject *obj = PyTuple_GET_ITEM(code->co_consts, oparg); + PyObject *obj = PyTuple_GET_ITEM(consts, oparg); if (_Py_IsImmortal(obj)) { instructions[i].op.code = LOAD_CONST_IMMORTAL; } @@ -480,7 +508,7 @@ _PyCode_Quicken(PyCodeObject *code) oparg = 0; } } - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZATION_FT */ } #define SIMPLE_FUNCTION 0 @@ -2243,9 +2271,10 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in { PyObject *lhs = PyStackRef_AsPyObjectBorrow(lhs_st); PyObject *rhs = PyStackRef_AsPyObjectBorrow(rhs_st); - assert(ENABLE_SPECIALIZATION); + assert(ENABLE_SPECIALIZATION_FT); assert(_PyOpcode_Caches[BINARY_OP] == INLINE_CACHE_ENTRIES_BINARY_OP); _PyBinaryOpCache *cache = (_PyBinaryOpCache *)(instr + 1); + uint8_t specialized_op; switch (oparg) { case NB_ADD: case NB_INPLACE_ADD: @@ -2256,18 +2285,18 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in _Py_CODEUNIT next = instr[INLINE_CACHE_ENTRIES_BINARY_OP + 1]; bool to_store = (next.op.code == STORE_FAST); if (to_store && PyStackRef_AsPyObjectBorrow(locals[next.op.arg]) == lhs) { - instr->op.code = BINARY_OP_INPLACE_ADD_UNICODE; + specialized_op = BINARY_OP_INPLACE_ADD_UNICODE; goto success; } - instr->op.code = BINARY_OP_ADD_UNICODE; + specialized_op = BINARY_OP_ADD_UNICODE; goto success; } if (PyLong_CheckExact(lhs)) { - instr->op.code = BINARY_OP_ADD_INT; + specialized_op = BINARY_OP_ADD_INT; goto success; } if (PyFloat_CheckExact(lhs)) { - instr->op.code = BINARY_OP_ADD_FLOAT; + specialized_op = BINARY_OP_ADD_FLOAT; goto success; } break; @@ -2277,11 +2306,11 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in break; } if (PyLong_CheckExact(lhs)) { - instr->op.code = BINARY_OP_MULTIPLY_INT; + specialized_op = BINARY_OP_MULTIPLY_INT; goto success; } if (PyFloat_CheckExact(lhs)) { - instr->op.code = BINARY_OP_MULTIPLY_FLOAT; + specialized_op = BINARY_OP_MULTIPLY_FLOAT; goto success; } break; @@ -2291,22 +2320,23 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in break; } if (PyLong_CheckExact(lhs)) { - instr->op.code = BINARY_OP_SUBTRACT_INT; + specialized_op = BINARY_OP_SUBTRACT_INT; goto success; } if (PyFloat_CheckExact(lhs)) { - instr->op.code = BINARY_OP_SUBTRACT_FLOAT; + specialized_op = BINARY_OP_SUBTRACT_FLOAT; goto success; } break; } SPECIALIZATION_FAIL(BINARY_OP, binary_op_fail_kind(oparg, lhs, rhs)); STAT_INC(BINARY_OP, failure); - instr->op.code = BINARY_OP; + SET_OPCODE_OR_RETURN(instr, BINARY_OP); cache->counter = adaptive_counter_backoff(cache->counter); return; success: STAT_INC(BINARY_OP, success); + SET_OPCODE_OR_RETURN(instr, specialized_op); cache->counter = adaptive_counter_cooldown(); } diff --git a/Python/sysmodule.c b/Python/sysmodule.c index a4abd7c3c45..a086bb979ef 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -2174,6 +2174,11 @@ sys__clear_internal_caches_impl(PyObject *module) #ifdef _Py_TIER2 PyInterpreterState *interp = _PyInterpreterState_GET(); _Py_Executors_InvalidateAll(interp, 0); +#endif +#ifdef Py_GIL_DISABLED + if (_Py_ClearUnusedTLBC(_PyInterpreterState_GET()) < 0) { + return NULL; + } #endif PyType_ClearCache(); Py_RETURN_NONE; diff --git a/Tools/gdb/libpython.py b/Tools/gdb/libpython.py index 946af4be1a7..ed254152d7d 100755 --- a/Tools/gdb/libpython.py +++ b/Tools/gdb/libpython.py @@ -77,6 +77,10 @@ def _managed_dict_offset(): else: return -3 * _sizeof_void_p() +def _interp_frame_has_tlbc_index(): + interp_frame = gdb.lookup_type("_PyInterpreterFrame") + return any(field.name == "tlbc_index" for field in interp_frame.fields()) + Py_TPFLAGS_INLINE_VALUES = (1 << 2) Py_TPFLAGS_MANAGED_DICT = (1 << 4) @@ -105,6 +109,8 @@ FRAME_INFO_OPTIMIZED_OUT = '(frame information optimized out)' UNABLE_READ_INFO_PYTHON_FRAME = 'Unable to read information on python frame' EVALFRAME = '_PyEval_EvalFrameDefault' +INTERP_FRAME_HAS_TLBC_INDEX = _interp_frame_has_tlbc_index() + class NullPyObjectPtr(RuntimeError): pass @@ -693,6 +699,16 @@ def parse_location_table(firstlineno, linetable): yield addr, end_addr, line addr = end_addr + +class PyCodeArrayPtr: + def __init__(self, gdbval): + self._gdbval = gdbval + + def get_entry(self, index): + assert (index >= 0) and (index < self._gdbval["size"]) + return self._gdbval["entries"][index] + + class PyCodeObjectPtr(PyObjectPtr): """ Class wrapping a gdb.Value that's a PyCodeObject* i.e. a instance @@ -1085,7 +1101,12 @@ class PyFramePtr: def _f_lasti(self): codeunit_p = gdb.lookup_type("_Py_CODEUNIT").pointer() instr_ptr = self._gdbval["instr_ptr"] - first_instr = self._f_code().field("co_code_adaptive").cast(codeunit_p) + if INTERP_FRAME_HAS_TLBC_INDEX: + tlbc_index = self._gdbval["tlbc_index"] + code_arr = PyCodeArrayPtr(self._f_code().field("co_tlbc")) + first_instr = code_arr.get_entry(tlbc_index).cast(codeunit_p) + else: + first_instr = self._f_code().field("co_code_adaptive").cast(codeunit_p) return int(instr_ptr - first_instr) def is_shim(self):