// Patch summary (four files; everything from the first `diff --git` header onward is the patch itself):
//  * .github/workflows/ci.yml: adds a step that removes several preinstalled directories and runs
//    `apt-get clean` before the NVHPC repo is added; drops the `&&`-chaining of the apt commands;
//    adds an `apt-cache depends nvhpc-25-11` call.
//  * include/pybind11/detail/class.h: pybind11_meta_dealloc no longer goes through with_internals().
//    It falls straight through to PyType_Type.tp_dealloc when is_interpreter_finalizing() is true or
//    when get_pp_if_exists() yields no internals, and it only touches local internals if they exist.
//  * include/pybind11/detail/internals.h: adds is_interpreter_alive() / is_interpreter_finalizing(),
//    gives internals and local_internals destructors that Py_CLEAR their Python references when
//    is_interpreter_alive() is true, replaces the `bool clear_destructor` parameter of
//    atomic_get_or_create_in_state_dict with a capsule-destructor function pointer, and adds
//    get_pp_if_exists(), reset(), internals_shutdown() and get_pp_from_state_dict_if_exists() to
//    internals_pp_manager; destroy() now resets the unique_ptr before deleting the cached pointer.
//  * include/pybind11/subinterpreter.h: calls reset() on both pp managers before Py_EndInterpreter.
// NOTE(review): this copy has its line breaks collapsed, and template argument lists appear to be
// missing (e.g. `static_cast(ptr)`, `std::unordered_multimap;`) - presumably an extraction artifact;
// confirm against the upstream patch before trying to apply it.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8dcf48928e..45f8675889 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -485,6 +485,11 @@ jobs: steps: - uses: actions/checkout@v6 + - name: Clean out unused stuff to save space + run: | + sudo rm -rf /usr/local/lib/android /usr/share/dotnet /opt/ghc /opt/hostedtoolcache/CodeQL + sudo apt-get clean + - name: Add NVHPC Repo run: | echo 'deb [trusted=yes] https://developer.download.nvidia.com/hpc-sdk/ubuntu/amd64 /' | \ @@ -492,10 +497,11 @@ jobs: - name: Install 🐍 3 & NVHPC run: | - sudo apt-get update -y && \ - sudo apt-get install -y cmake environment-modules git python3-dev python3-pip python3-numpy && \ - sudo apt-get install -y --no-install-recommends nvhpc-25-11 && \ + sudo apt-get update -y + sudo apt-get install -y cmake environment-modules git python3-dev python3-pip python3-numpy + sudo apt-get install -y --no-install-recommends nvhpc-25-11 sudo rm -rf /var/lib/apt/lists/* + apt-cache depends nvhpc-25-11 python3 -m pip install --upgrade pip python3 -m pip install --upgrade pytest diff --git a/include/pybind11/detail/class.h b/include/pybind11/detail/class.h index 21e966cfea..3f6e9f32e8 100644 --- a/include/pybind11/detail/class.h +++ b/include/pybind11/detail/class.h @@ -207,49 +207,64 @@ extern "C" inline PyObject *pybind11_meta_call(PyObject *type, PyObject *args, P /// Cleanup the type-info for a pybind11-registered type. 
extern "C" inline void pybind11_meta_dealloc(PyObject *obj) { - with_internals([obj](internals &internals) { - auto *type = (PyTypeObject *) obj; - - // A pybind11-registered type will: - // 1) be found in internals.registered_types_py - // 2) have exactly one associated `detail::type_info` - auto found_type = internals.registered_types_py.find(type); - if (found_type != internals.registered_types_py.end() && found_type->second.size() == 1 - && found_type->second[0]->type == type) { - - auto *tinfo = found_type->second[0]; - auto tindex = std::type_index(*tinfo->cpptype); - internals.direct_conversions.erase(tindex); - - auto &local_internals = get_local_internals(); - if (tinfo->module_local) { - local_internals.registered_types_cpp.erase(tinfo->cpptype); - } else { - internals.registered_types_cpp.erase(tindex); -#if PYBIND11_INTERNALS_VERSION >= 12 - internals.registered_types_cpp_fast.erase(tinfo->cpptype); - for (const std::type_info *alias : tinfo->alias_chain) { - auto num_erased = internals.registered_types_cpp_fast.erase(alias); - (void) num_erased; - assert(num_erased > 0); - } -#endif + if (is_interpreter_finalizing()) { + PyType_Type.tp_dealloc(obj); + return; + } + + auto *internals_pp = get_internals_pp_manager().get_pp_if_exists(); + if (!internals_pp || internals_pp->get() == nullptr) { + PyType_Type.tp_dealloc(obj); + return; + } + + auto &internals = *internals_pp->get(); + PYBIND11_LOCK_INTERNALS(internals); + + auto *type = (PyTypeObject *) obj; + + // A pybind11-registered type will: + // 1) be found in internals.registered_types_py + // 2) have exactly one associated `detail::type_info` + auto found_type = internals.registered_types_py.find(type); + if (found_type != internals.registered_types_py.end() && found_type->second.size() == 1 + && found_type->second[0]->type == type) { + + auto *tinfo = found_type->second[0]; + auto tindex = std::type_index(*tinfo->cpptype); + internals.direct_conversions.erase(tindex); + + auto *local_internals_pp = 
get_local_internals_pp_manager().get_pp_if_exists(); + auto *local_internals_ptr = local_internals_pp ? local_internals_pp->get() : nullptr; + if (tinfo->module_local) { + if (local_internals_ptr) { + local_internals_ptr->registered_types_cpp.erase(tinfo->cpptype); } - internals.registered_types_py.erase(tinfo->type); - - // Actually just `std::erase_if`, but that's only available in C++20 - auto &cache = internals.inactive_override_cache; - for (auto it = cache.begin(), last = cache.end(); it != last;) { - if (it->first == (PyObject *) tinfo->type) { - it = cache.erase(it); - } else { - ++it; - } + } else { + internals.registered_types_cpp.erase(tindex); +#if PYBIND11_INTERNALS_VERSION >= 12 + internals.registered_types_cpp_fast.erase(tinfo->cpptype); + for (const std::type_info *alias : tinfo->alias_chain) { + auto num_erased = internals.registered_types_cpp_fast.erase(alias); + (void) num_erased; + assert(num_erased > 0); } +#endif + } + internals.registered_types_py.erase(tinfo->type); - delete tinfo; + // Actually just `std::erase_if`, but that's only available in C++20 + auto &cache = internals.inactive_override_cache; + for (auto it = cache.begin(), last = cache.end(); it != last;) { + if (it->first == (PyObject *) tinfo->type) { + it = cache.erase(it); + } else { + ++it; + } } - }); + + delete tinfo; + } PyType_Type.tp_dealloc(obj); } diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index a92f196b1f..b8dc3dc608 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -103,7 +103,7 @@ class thread_specific_storage { // However, in GraalPy (as of v24.2 or older), TSS is implemented by Java and this call // requires a living Python interpreter. 
#ifdef GRAALVM_PYTHON - if (!Py_IsInitialized() || _Py_IsFinalizing()) { + if (Py_IsInitialized() == 0 || _Py_IsFinalizing() != 0) { return; } #endif @@ -195,6 +195,22 @@ struct override_hash { using instance_map = std::unordered_multimap; +inline bool is_interpreter_alive() { +#if PY_VERSION_HEX < 0x030D0000 + return Py_IsInitialized() != 0 || _Py_IsFinalizing() != 0; +#else + return Py_IsInitialized() != 0 || Py_IsFinalizing() != 0; +#endif +} + +inline bool is_interpreter_finalizing() { +#if PY_VERSION_HEX < 0x030D0000 + return _Py_IsFinalizing() != 0; +#else + return Py_IsFinalizing() != 0; +#endif +} + #ifdef Py_GIL_DISABLED // Wrapper around PyMutex to provide BasicLockable semantics class pymutex { @@ -308,7 +324,27 @@ struct internals { internals(internals &&other) = delete; internals &operator=(const internals &other) = delete; internals &operator=(internals &&other) = delete; - ~internals() = default; + ~internals() { + // Normally this destructor runs during interpreter finalization and it may DECREF things. + // In odd finalization scenarios it might end up running after the interpreter has + // completely shut down. In that case, we should not decref these objects because pymalloc + // is gone. + // However, when called from reset() before Py_EndInterpreter, the interpreter is + // definitely alive, so we should always clean up in that case. + if (is_interpreter_alive()) { + Py_CLEAR(static_property_type); + Py_CLEAR(default_metaclass); + Py_CLEAR(instance_base); + + // Clean up patients - these PyObject* pointers were INCREF'd when added + for (auto &patient_pair : patients) { + for (PyObject *patient : patient_pair.second) { + Py_CLEAR(patient); + } + } + patients.clear(); + } + } }; // the internals struct (above) is shared between all the modules. 
local_internals are only @@ -325,6 +361,16 @@ struct local_internals { std::forward_list registered_exception_translators; PyTypeObject *function_record_py_type = nullptr; + + ~local_internals() { + // Normally this destructor runs during interpreter finalization and it may DECREF things. + // In odd finalization scenarios it might end up running after the interpreter has + // completely shut down. In that case, we should not decref these objects because pymalloc + // is gone. + if (is_interpreter_alive()) { + Py_CLEAR(function_record_py_type); + } + } }; enum class holder_enum_t : uint8_t { @@ -569,7 +615,7 @@ inline object get_python_state_dict() { // The bool follows std::map::insert convention: true = created, false = existed. template std::pair atomic_get_or_create_in_state_dict(const char *key, - bool clear_destructor = false) { + void (*dtor)(PyObject *) = nullptr) { error_scope err_scope; // preserve any existing Python error states auto state_dict = reinterpret_borrow(get_python_state_dict()); @@ -586,16 +632,13 @@ std::pair atomic_get_or_create_in_state_dict(const char *key, // Use unique_ptr for exception safety: if capsule creation throws, the storage is // automatically deleted. auto storage_ptr = std::unique_ptr(new Payload{}); - // Create capsule with destructor to clean up when the interpreter shuts down. - auto new_capsule = capsule( - storage_ptr.get(), - // The destructor will be called when the capsule is GC'ed. - // - If our capsule is inserted into the dict below, it will be kept alive until - // interpreter shutdown, so the destructor will be called at that time. - // - If our capsule is NOT inserted (another thread inserted first), it will be - // destructed when going out of scope here, so the destructor will be called - // immediately, which will also free the storage. 
- /*destructor=*/[](void *ptr) -> void { delete static_cast(ptr); }); + auto new_capsule + = capsule(storage_ptr.get(), + // The destructor will be called when the capsule is GC'ed. + // If the insert below fails (entry already in the dict), then this + // destructor will be called on the newly created capsule at the end of this + // function, and we want to just release this memory. + /*destructor=*/[](void *v) { delete static_cast(v); }); // At this point, the capsule object is created successfully. // Release the unique_ptr and let the capsule object own the storage to avoid double-free. (void) storage_ptr.release(); @@ -613,17 +656,16 @@ std::pair atomic_get_or_create_in_state_dict(const char *key, throw error_already_set(); } created = (capsule_obj == new_capsule.ptr()); - if (clear_destructor && created) { - // Our capsule was inserted. - // Remove the destructor to leak the storage on interpreter shutdown. - if (PyCapsule_SetDestructor(capsule_obj, nullptr) < 0) { + // - If key already existed, our `new_capsule` is not inserted, it will be destructed when + // going out of scope here, and will call the destructor set above. + // - Otherwise, our `new_capsule` is now in the dict, and it owns the storage and the state + // dict will incref it. We need to set the caller's destructor on it, which will be + // called when the interpreter shuts down. + if (created && dtor) { + if (PyCapsule_SetDestructor(capsule_obj, dtor) < 0) { throw error_already_set(); } } - // - If key already existed, our `new_capsule` is not inserted, it will be destructed when - // going out of scope here, which will also free the storage. - // - Otherwise, our `new_capsule` is now in the dict, and it owns the storage and the state - // dict will incref it. } // Get the storage pointer from the capsule. @@ -673,6 +715,26 @@ class internals_pp_manager { return internals_singleton_pp_; } + /// Get the current pointer-to-pointer if it already exists, without creating it. 
+ std::unique_ptr *get_pp_if_exists() { +#ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT + if (has_seen_non_main_interpreter()) { + auto *tstate = get_thread_state_unchecked(); + if (tstate && tstate->interp != last_istate_tls()) { + gil_scoped_acquire_simple gil; + last_istate_tls() = tstate->interp; + internals_p_tls() = get_pp_from_state_dict_if_exists(); + } + return internals_p_tls(); + } +#endif + if (!internals_singleton_pp_) { + gil_scoped_acquire_simple gil; + internals_singleton_pp_ = get_pp_from_state_dict_if_exists(); + } + return internals_singleton_pp_; + } + /// Drop all the references we're currently holding. void unref() { #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT @@ -685,6 +747,29 @@ class internals_pp_manager { internals_singleton_pp_ = nullptr; } + /// Reset the internals object in the capsule (frees the internals before interpreter shutdown) + void reset() { +#ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT + if (has_seen_non_main_interpreter()) { + // Always get the pointer-to-pointer from the state dict to ensure we're resetting + // the correct internals for the current interpreter + gil_scoped_acquire_simple gil; + auto *tstate = get_thread_state_unchecked(); + if (tstate) { + auto *pp = get_pp_from_state_dict_if_exists(); + if (pp && pp->get() != nullptr) { + // Only reset if the unique_ptr actually contains an object + pp->reset(); + } + } + return; + } +#endif + if (internals_singleton_pp_) { + internals_singleton_pp_->reset(); + } + } + void destroy() { #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT if (has_seen_non_main_interpreter()) { @@ -692,13 +777,22 @@ class internals_pp_manager { // this could be called without an active interpreter, just use what was cached if (!tstate || tstate->interp == last_istate_tls()) { auto tpp = internals_p_tls(); - + if (tpp) { + // Reset the unique_ptr in the capsule to free the internals object + // before deleting the cached pointer-to-pointer + tpp->reset(); + } delete tpp; } unref(); return; } #endif + if 
(internals_singleton_pp_) { + // Reset the unique_ptr in the capsule to free the internals object + // before deleting the cached pointer-to-pointer + internals_singleton_pp_->reset(); + } delete internals_singleton_pp_; unref(); } @@ -707,14 +801,21 @@ class internals_pp_manager { internals_pp_manager(char const *id, on_fetch_function *on_fetch) : holder_id_(id), on_fetch_(on_fetch) {} + static void internals_shutdown(PyObject *capsule) { + auto *pp = static_cast *>( + PyCapsule_GetPointer(capsule, nullptr)); + if (pp) { + pp->reset(); + } + // Because the unique_ptr is still pointed to by the pp_manager in this and possibly other + // modules, we cannot delete the unique_ptr itself until after the interpreter has shut + // down. If this interpreter was not created/owned by pybind11 then the unique_ptr itself + // (but not its contents) is leaked. + } + std::unique_ptr *get_or_create_pp_in_state_dict() { - // The `unique_ptr` output is leaked on interpreter shutdown. Once an - // instance is created, it will never be deleted until the process exits (compare to - // interpreter shutdown in multiple-interpreter scenarios). - // Because we cannot guarantee the order of destruction of capsules in the interpreter - // state dict, leaking avoids potential use-after-free issues during interpreter shutdown. auto result = atomic_get_or_create_in_state_dict>( - holder_id_, /*clear_destructor=*/true); + holder_id_, &internals_shutdown); auto *pp = result.first; bool created = result.second; // Only call on_fetch_ when fetching existing internals, not when creating new ones. 
@@ -724,6 +825,30 @@ class internals_pp_manager { return pp; } + std::unique_ptr *get_pp_from_state_dict_if_exists() { + error_scope err_scope; // preserve any existing Python error states + try { + auto state_dict = reinterpret_borrow(get_python_state_dict()); + PyObject *capsule_obj = dict_getitemstring(state_dict.ptr(), holder_id_); + if (capsule_obj == nullptr) { + if (PyErr_Occurred()) { + PyErr_Clear(); + } + return nullptr; + } + void *raw_ptr = PyCapsule_GetPointer(capsule_obj, /*name=*/nullptr); + if (!raw_ptr) { + if (PyErr_Occurred()) { + PyErr_Clear(); + } + return nullptr; + } + return static_cast *>(raw_ptr); + } catch (const error_already_set &) { + return nullptr; + } + } + #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT static PyInterpreterState *&last_istate_tls() { static thread_local PyInterpreterState *last_istate = nullptr; diff --git a/include/pybind11/subinterpreter.h b/include/pybind11/subinterpreter.h index c47787b6ef..ac19ae44be 100644 --- a/include/pybind11/subinterpreter.h +++ b/include/pybind11/subinterpreter.h @@ -181,6 +181,11 @@ class subinterpreter { detail::get_internals_pp_manager().get_pp(); detail::get_local_internals_pp_manager().get_pp(); + // Reset the internals objects before ending the interpreter to free their memory + // (the capsule destructor won't be called until interpreter shutdown, which is too late) + detail::get_internals_pp_manager().reset(); + detail::get_local_internals_pp_manager().reset(); + // End it Py_EndInterpreter(destroy_tstate);