From b03ac72652488f9c7bdb5329f810f146eedd3bdc Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Tue, 17 Mar 2026 14:45:23 +0100 Subject: [PATCH] Put the thread state in PyCriticalSection structs GH-141406 improved performance by only fetching thread state once and storing it in a variable on the stack. This instead puts the thread state in the PyCriticalSection struct (also a temp variable on the stack), bringing the public and private implementations closer together. --- Include/cpython/critical_section.h | 2 + Include/internal/pycore_critical_section.h | 83 ++++++++-------------- Python/critical_section.c | 18 ++--- 3 files changed, 41 insertions(+), 62 deletions(-) diff --git a/Include/cpython/critical_section.h b/Include/cpython/critical_section.h index 4fc46fefb93a24..929c8523d690f9 100644 --- a/Include/cpython/critical_section.h +++ b/Include/cpython/critical_section.h @@ -111,6 +111,8 @@ struct PyCriticalSection { // Mutex used to protect critical section PyMutex *_cs_mutex; + + PyThreadState *_cs_tstate; }; // A critical section protected by two mutexes. 
Use diff --git a/Include/internal/pycore_critical_section.h b/Include/internal/pycore_critical_section.h index 2a2846b1296b90..6562e2ac27e5d8 100644 --- a/Include/internal/pycore_critical_section.h +++ b/Include/internal/pycore_critical_section.h @@ -86,10 +86,10 @@ _PyCriticalSection_Resume(PyThreadState *tstate); // (private) slow path for locking the mutex PyAPI_FUNC(void) -_PyCriticalSection_BeginSlow(PyThreadState *tstate, PyCriticalSection *c, PyMutex *m); +_PyCriticalSection_BeginSlow(PyCriticalSection *c, PyMutex *m); PyAPI_FUNC(void) -_PyCriticalSection2_BeginSlow(PyThreadState *tstate, PyCriticalSection2 *c, PyMutex *m1, PyMutex *m2, +_PyCriticalSection2_BeginSlow(PyCriticalSection2 *c, PyMutex *m1, PyMutex *m2, int is_m1_locked); PyAPI_FUNC(void) @@ -104,30 +104,33 @@ _PyCriticalSection_IsActive(uintptr_t tag) } static inline void -_PyCriticalSection_BeginMutex(PyThreadState *tstate, PyCriticalSection *c, PyMutex *m) +_PyCriticalSection_BeginMutex(PyCriticalSection *c, PyMutex *m) { + PyThreadState *tstate = PyThreadState_GET(); + c->_cs_tstate = tstate; if (PyMutex_LockFast(m)) { c->_cs_mutex = m; c->_cs_prev = tstate->critical_section; tstate->critical_section = (uintptr_t)c; } else { - _PyCriticalSection_BeginSlow(tstate, c, m); + _PyCriticalSection_BeginSlow(c, m); } } static inline void -_PyCriticalSection_Begin(PyThreadState *tstate, PyCriticalSection *c, PyObject *op) +_PyCriticalSection_Begin(PyCriticalSection *c, PyObject *op) { - _PyCriticalSection_BeginMutex(tstate, c, &op->ob_mutex); + _PyCriticalSection_BeginMutex(c, &op->ob_mutex); } // Removes the top-most critical section from the thread's stack of critical // sections. If the new top-most critical section is inactive, then it is // resumed. 
static inline void -_PyCriticalSection_Pop(PyThreadState *tstate, PyCriticalSection *c) +_PyCriticalSection_Pop(PyCriticalSection *c) { + PyThreadState *tstate = c->_cs_tstate; uintptr_t prev = c->_cs_prev; tstate->critical_section = prev; @@ -137,7 +140,7 @@ _PyCriticalSection_Pop(PyThreadState *tstate, PyCriticalSection *c) } static inline void -_PyCriticalSection_End(PyThreadState *tstate, PyCriticalSection *c) +_PyCriticalSection_End(PyCriticalSection *c) { // If the mutex is NULL, we used the fast path in // _PyCriticalSection_BeginSlow for locks already held in the top-most @@ -146,17 +149,17 @@ _PyCriticalSection_End(PyThreadState *tstate, PyCriticalSection *c) return; } PyMutex_Unlock(c->_cs_mutex); - _PyCriticalSection_Pop(tstate, c); + _PyCriticalSection_Pop(c); } static inline void -_PyCriticalSection2_BeginMutex(PyThreadState *tstate, PyCriticalSection2 *c, PyMutex *m1, PyMutex *m2) +_PyCriticalSection2_BeginMutex(PyCriticalSection2 *c, PyMutex *m1, PyMutex *m2) { if (m1 == m2) { // If the two mutex arguments are the same, treat this as a critical // section with a single mutex. 
c->_cs_mutex2 = NULL; - _PyCriticalSection_BeginMutex(tstate, &c->_cs_base, m1); + _PyCriticalSection_BeginMutex(&c->_cs_base, m1); return; } @@ -169,6 +172,9 @@ _PyCriticalSection2_BeginMutex(PyThreadState *tstate, PyCriticalSection2 *c, PyM m2 = tmp; } + PyThreadState *tstate = PyThreadState_GET(); + c->_cs_base._cs_tstate = tstate; + if (PyMutex_LockFast(m1)) { if (PyMutex_LockFast(m2)) { c->_cs_base._cs_mutex = m1; @@ -179,22 +185,22 @@ _PyCriticalSection2_BeginMutex(PyThreadState *tstate, PyCriticalSection2 *c, PyM tstate->critical_section = p; } else { - _PyCriticalSection2_BeginSlow(tstate, c, m1, m2, 1); + _PyCriticalSection2_BeginSlow(c, m1, m2, 1); } } else { - _PyCriticalSection2_BeginSlow(tstate, c, m1, m2, 0); + _PyCriticalSection2_BeginSlow(c, m1, m2, 0); } } static inline void -_PyCriticalSection2_Begin(PyThreadState *tstate, PyCriticalSection2 *c, PyObject *a, PyObject *b) +_PyCriticalSection2_Begin(PyCriticalSection2 *c, PyObject *a, PyObject *b) { - _PyCriticalSection2_BeginMutex(tstate, c, &a->ob_mutex, &b->ob_mutex); + _PyCriticalSection2_BeginMutex(c, &a->ob_mutex, &b->ob_mutex); } static inline void -_PyCriticalSection2_End(PyThreadState *tstate, PyCriticalSection2 *c) +_PyCriticalSection2_End(PyCriticalSection2 *c) { // if mutex1 is NULL, we used the fast path in // _PyCriticalSection_BeginSlow for mutexes that are already held, @@ -208,7 +214,7 @@ _PyCriticalSection2_End(PyThreadState *tstate, PyCriticalSection2 *c) PyMutex_Unlock(c->_cs_mutex2); } PyMutex_Unlock(c->_cs_base._cs_mutex); - _PyCriticalSection_Pop(tstate, &c->_cs_base); + _PyCriticalSection_Pop(&c->_cs_base); } static inline void @@ -252,43 +258,12 @@ _PyCriticalSection_AssertHeldObj(PyObject *op) #endif } -#undef Py_BEGIN_CRITICAL_SECTION -# define Py_BEGIN_CRITICAL_SECTION(op) \ - { \ - PyCriticalSection _py_cs; \ - PyThreadState *_cs_tstate = _PyThreadState_GET(); \ - _PyCriticalSection_Begin(_cs_tstate, &_py_cs, _PyObject_CAST(op)) - -#undef Py_BEGIN_CRITICAL_SECTION_MUTEX 
-# define Py_BEGIN_CRITICAL_SECTION_MUTEX(mutex) \ - { \ - PyCriticalSection _py_cs; \ - PyThreadState *_cs_tstate = _PyThreadState_GET(); \ - _PyCriticalSection_BeginMutex(_cs_tstate, &_py_cs, mutex) - -#undef Py_END_CRITICAL_SECTION -# define Py_END_CRITICAL_SECTION() \ - _PyCriticalSection_End(_cs_tstate, &_py_cs); \ - } - -#undef Py_BEGIN_CRITICAL_SECTION2 -# define Py_BEGIN_CRITICAL_SECTION2(a, b) \ - { \ - PyCriticalSection2 _py_cs2; \ - PyThreadState *_cs_tstate = _PyThreadState_GET(); \ - _PyCriticalSection2_Begin(_cs_tstate, &_py_cs2, _PyObject_CAST(a), _PyObject_CAST(b)) - -#undef Py_BEGIN_CRITICAL_SECTION2_MUTEX -# define Py_BEGIN_CRITICAL_SECTION2_MUTEX(m1, m2) \ - { \ - PyCriticalSection2 _py_cs2; \ - PyThreadState *_cs_tstate = _PyThreadState_GET(); \ - _PyCriticalSection2_BeginMutex(_cs_tstate, &_py_cs2, m1, m2) - -#undef Py_END_CRITICAL_SECTION2 -# define Py_END_CRITICAL_SECTION2() \ - _PyCriticalSection2_End(_cs_tstate, &_py_cs2); \ - } +#define PyCriticalSection_Begin _PyCriticalSection_Begin +#define PyCriticalSection_BeginMutex _PyCriticalSection_BeginMutex +#define PyCriticalSection_End _PyCriticalSection_End +#define PyCriticalSection2_Begin _PyCriticalSection2_Begin +#define PyCriticalSection2_BeginMutex _PyCriticalSection2_BeginMutex +#define PyCriticalSection2_End _PyCriticalSection2_End #endif /* Py_GIL_DISABLED */ diff --git a/Python/critical_section.c b/Python/critical_section.c index 98e23eda7cdd77..818176f27529c4 100644 --- a/Python/critical_section.c +++ b/Python/critical_section.c @@ -18,7 +18,7 @@ untag_critical_section(uintptr_t tag) #endif void -_PyCriticalSection_BeginSlow(PyThreadState *tstate, PyCriticalSection *c, PyMutex *m) +_PyCriticalSection_BeginSlow(PyCriticalSection *c, PyMutex *m) { #ifdef Py_GIL_DISABLED // As an optimisation for locking the same object recursively, skip @@ -26,6 +26,7 @@ _PyCriticalSection_BeginSlow(PyThreadState *tstate, PyCriticalSection *c, PyMute // section. 
// If the top-most critical section is a two-mutex critical section, // then locking is skipped if either mutex is m. + PyThreadState *tstate = c->_cs_tstate; if (tstate->critical_section) { PyCriticalSection *prev = untag_critical_section(tstate->critical_section); if (prev->_cs_mutex == m) { @@ -62,10 +63,11 @@ _PyCriticalSection_BeginSlow(PyThreadState *tstate, PyCriticalSection *c, PyMute } void -_PyCriticalSection2_BeginSlow(PyThreadState *tstate, PyCriticalSection2 *c, PyMutex *m1, PyMutex *m2, +_PyCriticalSection2_BeginSlow(PyCriticalSection2 *c, PyMutex *m1, PyMutex *m2, int is_m1_locked) { #ifdef Py_GIL_DISABLED + PyThreadState *tstate = c->_cs_base._cs_tstate; if (tstate->interp->stoptheworld.world_stopped) { c->_cs_base._cs_mutex = NULL; c->_cs_mutex2 = NULL; @@ -153,7 +155,7 @@ void PyCriticalSection_Begin(PyCriticalSection *c, PyObject *op) { #ifdef Py_GIL_DISABLED - _PyCriticalSection_Begin(_PyThreadState_GET(), c, op); + _PyCriticalSection_Begin(c, op); #endif } @@ -162,7 +164,7 @@ void PyCriticalSection_BeginMutex(PyCriticalSection *c, PyMutex *m) { #ifdef Py_GIL_DISABLED - _PyCriticalSection_BeginMutex(_PyThreadState_GET(), c, m); + _PyCriticalSection_BeginMutex(c, m); #endif } @@ -171,7 +173,7 @@ void PyCriticalSection_End(PyCriticalSection *c) { #ifdef Py_GIL_DISABLED - _PyCriticalSection_End(_PyThreadState_GET(), c); + _PyCriticalSection_End(c); #endif } @@ -180,7 +182,7 @@ void PyCriticalSection2_Begin(PyCriticalSection2 *c, PyObject *a, PyObject *b) { #ifdef Py_GIL_DISABLED - _PyCriticalSection2_Begin(_PyThreadState_GET(), c, a, b); + _PyCriticalSection2_Begin(c, a, b); #endif } @@ -189,7 +191,7 @@ void PyCriticalSection2_BeginMutex(PyCriticalSection2 *c, PyMutex *m1, PyMutex *m2) { #ifdef Py_GIL_DISABLED - _PyCriticalSection2_BeginMutex(_PyThreadState_GET(), c, m1, m2); + _PyCriticalSection2_BeginMutex(c, m1, m2); #endif } @@ -198,6 +200,6 @@ void PyCriticalSection2_End(PyCriticalSection2 *c) { #ifdef Py_GIL_DISABLED - 
_PyCriticalSection2_End(_PyThreadState_GET(), c); + _PyCriticalSection2_End(c); #endif }