From 5c13a8cf8d6af5535871f03d7cf8a13fde40e58b Mon Sep 17 00:00:00 2001 From: Bradlee Speice Date: Mon, 29 Jun 2020 18:26:03 -0400 Subject: [PATCH 01/13] First draft of pybind11 Having issues with the Rust code taking *forever*. Going to break out the compiler explorer and see if it's doing something different from C++. --- .gitignore | 3 +- _posts/2020-06-29-release-the-gil-pt.-2.md | 162 +++++++++++++++++++++ 2 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 _posts/2020-06-29-release-the-gil-pt.-2.md diff --git a/.gitignore b/.gitignore index ddf4d8b..095c115 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ _site/ .sass-cache/ .jekyll-metadata .bundle/ -vendor/ \ No newline at end of file +vendor/ +.vscode/ \ No newline at end of file diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md new file mode 100644 index 0000000..cbd5f6f --- /dev/null +++ b/_posts/2020-06-29-release-the-gil-pt.-2.md @@ -0,0 +1,162 @@ +--- +layout: post +title: "Release the GIL: Part 2 - Pybind11, PyO3" +description: "More Python Parallelism" +category: +tags: [python] +--- + +I've been continuing experiments with parallelism in Python; while these techniques are a bit niche, +it's still fun to push the performance envelope. In addition to tools like +[Cython](https://cython.org/) and [Numba](https://numba.pydata.org/) (covered +[here](//2019/12/release-the-gil.html)) that attempt to stay as close to Python as possible, other +projects are available that act as a bridge between Python and other languages. The goal is to make +cooperation simple without compromising independence. + +In practice, this "cooperation" between languages is important for performance reasons. Code written +in C++ shouldn't have to care about the Python GIL. 
However, unless the GIL is explicitly unlocked, +it will remain implicitly held; though the Python interpreter _could_ be making progress on a +separate thread, it will be stuck waiting on the current operation to complete. We'll look at some +techniques below for managing the GIL in a Python extension. + +# Pybind11 + +The motto of [Pybind11](https://github.com/pybind/pybind11) is "seamless operability between C++11 +and Python", and they certainly deliver on that. My experience was that it was relatively simple to +set up a hybrid project where C++ (using CMake) and Python (using setuptools) were able to +peacefully coexist. We'll examine a simple Fibonacci sequence implementation to demonstrate how +Python's threading model interacts with Pybind11. + +The C++ implementation is very simple: + +```c++ +#include + +inline std::uint64_t fibonacci(std::uint64_t n) { + if (n <= 1) { + return n; + } + + std::uint64_t a = 0; + std::uint64_t b = 1; + std::uint64_t c = 0; + + c = a + b; + for (std::uint64_t _i = 2; _i < n; _i++) { + a = b; + b = c; + c = a + b; + } + + return c; +} + +std::uint64_t fibonacci_gil(std::uint64_t n) { + // The GIL is held by default when entering C++ from Python, so we need no + // manipulation here. Interestingly enough, re-acquiring a held GIL is a safe + // operation (within the same thread), so feel free to scatter + // `py::gil_scoped_acquire` throughout the code. + return fibonacci(n); +} + +std::uint64_t fibonacci_nogil(std::uint64_t n) { + // Because the GIL is held by default, we need to explicitly release it here. + // Note that like Cython, releasing the lock multiple times will crash the + // interpreter. + + py::gil_scoped_release release; + return fibonacci(n); +} +``` + +Admittedly, the project setup is significantly more involved than Cython or Numba. I've omitted +those steps here, but the full project is available at [INSERT LINK HERE]. 
+ +```python +# This number will overflow, but that's OK; our purpose isn't to get an accurate result, +# it's simply to keep the processor busy. +N = 1_000_000_000; + +from fibonacci import fibonacci_gil, fibonacci_nogil +``` + +We'll first run each function independently: + +```python +%%time +_ = fibonacci_gil(N); +``` + +>
+> CPU times: user 350 ms, sys: 3.54 ms, total: 354 ms
+> Wall time: 355 ms
+> 
+ +```python +%%time +_ = fibonacci_nogil(N); +``` + +>
+> CPU times: user 385 ms, sys: 0 ns, total: 385 ms
+> Wall time: 384 ms
+> 
+ +There's some minor variation in how long it takes to run the code, but not a material difference. +When running the same function in multiple threads, we expect the run time to double; even though +there are multiple threads, they effectively run in serial because of the GIL: + +```python +%%time +from threading import Thread + +# Create the two threads to run on +t1 = Thread(target=fibonacci_gil, args=[N]) +t2 = Thread(target=fibonacci_gil, args=[N]) +# Start the threads +t1.start(); t2.start() +# Wait for the threads to finish +t1.join(); t2.join() +``` + +>
+> CPU times: user 709 ms, sys: 0 ns, total: 709 ms
+> Wall time: 705 ms
+> 
+ +However, if one thread unlocks the GIL first, then the threads will execute in parallel: + +```python +%%time + +t1 = Thread(target=fibonacci_nogil, args=[N]) +t2 = Thread(target=fibonacci_gil, args=[N]) +t1.start(); t2.start() +t1.join(); t2.join() +``` + +>
+> CPU times: user 734 ms, sys: 7.89 ms, total: 742 ms
+> Wall time: 372 ms
+> 
+ +While it takes the same amount of CPU time to compute the result ("user" time), the run time ("wall" +time) is cut in half because the code is now running in parallel. + +```python +%%time + +# Note that the GIL-locked version is started first +t1 = Thread(target=fibonacci_gil, args=[N]) +t2 = Thread(target=fibonacci_nogil, args=[N]) +t1.start(); t2.start() +t1.join(); t2.join() +``` + +>
+> CPU times: user 736 ms, sys: 0 ns, total: 736 ms
+> Wall time: 734 ms
+> 
+ +Finally, it's import to note that scheduling matters; in this example, threads run in serial because +the GIL-locked thread is started first. From 1b24d3d09206c113b8c54908079c22624f845c8a Mon Sep 17 00:00:00 2001 From: Bradlee Speice Date: Mon, 29 Jun 2020 22:48:02 -0400 Subject: [PATCH 02/13] Finish the pybind11 part --- _posts/2020-06-29-release-the-gil-pt.-2.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md index cbd5f6f..f48e352 100644 --- a/_posts/2020-06-29-release-the-gil-pt.-2.md +++ b/_posts/2020-06-29-release-the-gil-pt.-2.md @@ -31,6 +31,7 @@ The C++ implementation is very simple: ```c++ #include +#include inline std::uint64_t fibonacci(std::uint64_t n) { if (n <= 1) { @@ -77,7 +78,7 @@ those steps here, but the full project is available at [INSERT LINK HERE]. # it's simply to keep the processor busy. N = 1_000_000_000; -from fibonacci import fibonacci_gil, fibonacci_nogil +from speiceio_pybind11 import fibonacci_gil, fibonacci_nogil ``` We'll first run each function independently: From 44a314a401663eda2ed14478a997db9305987290 Mon Sep 17 00:00:00 2001 From: Bradlee Speice Date: Tue, 30 Jun 2020 14:23:33 -0400 Subject: [PATCH 03/13] Minor cleanup --- _posts/2020-06-29-release-the-gil-pt.-2.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md index f48e352..534df85 100644 --- a/_posts/2020-06-29-release-the-gil-pt.-2.md +++ b/_posts/2020-06-29-release-the-gil-pt.-2.md @@ -22,12 +22,12 @@ techniques below for managing the GIL in a Python extension. # Pybind11 The motto of [Pybind11](https://github.com/pybind/pybind11) is "seamless operability between C++11 -and Python", and they certainly deliver on that. 
My experience was that it was relatively simple to -set up a hybrid project where C++ (using CMake) and Python (using setuptools) were able to -peacefully coexist. We'll examine a simple Fibonacci sequence implementation to demonstrate how -Python's threading model interacts with Pybind11. +and Python", and they certainly deliver on that. Setting up a hybrid project where C++ (using CMake) +and Python (using setuptools) could coexist was straight-forward, and the repository also works as +[a template](LINK HERE) for future projects. -The C++ implementation is very simple: +Just like the previous post, we'll examine a simple Fibonacci sequence implementation to demonstrate +how Python's threading model interacts with Pybind11: ```c++ #include @@ -61,9 +61,9 @@ std::uint64_t fibonacci_gil(std::uint64_t n) { } std::uint64_t fibonacci_nogil(std::uint64_t n) { - // Because the GIL is held by default, we need to explicitly release it here. - // Note that like Cython, releasing the lock multiple times will crash the - // interpreter. + // Because the GIL is held by default, we need to explicitly release it here + // to run in parallel. + // WARNING: Releasing the lock multiple times will crash the process. py::gil_scoped_release release; return fibonacci(n); @@ -74,8 +74,8 @@ Admittedly, the project setup is significantly more involved than Cython or Numb those steps here, but the full project is available at [INSERT LINK HERE]. ```python -# This number will overflow, but that's OK; our purpose isn't to get an accurate result, -# it's simply to keep the processor busy. +# The billionth Fibonacci number overflows `std::uint64_t`, but that's OK; +# our purpose is keeping the CPU busy, not getting the correct result. 
N = 1_000_000_000; from speiceio_pybind11 import fibonacci_gil, fibonacci_nogil From 1e18b201f5259fd328e8db738fc73fa9449bd5c6 Mon Sep 17 00:00:00 2001 From: Bradlee Speice Date: Tue, 30 Jun 2020 14:31:26 -0400 Subject: [PATCH 04/13] Repository URL --- _posts/2020-06-29-release-the-gil-pt.-2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md index 534df85..bb4eaf2 100644 --- a/_posts/2020-06-29-release-the-gil-pt.-2.md +++ b/_posts/2020-06-29-release-the-gil-pt.-2.md @@ -24,7 +24,7 @@ techniques below for managing the GIL in a Python extension. The motto of [Pybind11](https://github.com/pybind/pybind11) is "seamless operability between C++11 and Python", and they certainly deliver on that. Setting up a hybrid project where C++ (using CMake) and Python (using setuptools) could coexist was straight-forward, and the repository also works as -[a template](LINK HERE) for future projects. +[a template](https://github.com/speice-io/release-the-gil-pybind11/settings) for future projects. 
Just like the previous post, we'll examine a simple Fibonacci sequence implementation to demonstrate how Python's threading model interacts with Pybind11: From 7489733f64ba3010ec133a7c197121ad5efa6bfd Mon Sep 17 00:00:00 2001 From: Bradlee Speice Date: Tue, 30 Jun 2020 16:34:25 -0400 Subject: [PATCH 05/13] Notes about double-unlock --- _posts/2020-06-29-release-the-gil-pt.-2.md | 144 ++++++++++++++++++++- 1 file changed, 141 insertions(+), 3 deletions(-) diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md index bb4eaf2..2764c60 100644 --- a/_posts/2020-06-29-release-the-gil-pt.-2.md +++ b/_posts/2020-06-29-release-the-gil-pt.-2.md @@ -3,7 +3,7 @@ layout: post title: "Release the GIL: Part 2 - Pybind11, PyO3" description: "More Python Parallelism" category: -tags: [python] +tags: [python, rust, c++] --- I've been continuing experiments with parallelism in Python; while these techniques are a bit niche, @@ -26,6 +26,9 @@ and Python", and they certainly deliver on that. Setting up a hybrid project whe and Python (using setuptools) could coexist was straight-forward, and the repository also works as [a template](https://github.com/speice-io/release-the-gil-pybind11/settings) for future projects. +TODO: Include anything about how Pybind11 and Cython are similar because of compilation to C++? +Maybe also talk about project setup being a good deal more complicated? 
+ Just like the previous post, we'll examine a simple Fibonacci sequence implementation to demonstrate how Python's threading model interacts with Pybind11: @@ -68,10 +71,28 @@ std::uint64_t fibonacci_nogil(std::uint64_t n) { py::gil_scoped_release release; return fibonacci(n); } + +PYBIND11_MODULE(speiceio_pybind11, m) { + + m.def("fibonacci_gil", &fibonacci_gil, R"pbdoc( + Calculate the Nth Fibonacci number while implicitly holding the GIL + )pbdoc"); + + m.def("fibonacci_nogil", &fibonacci_nogil, + R"pbdoc( + Calculate the Nth Fibonacci number after explicitly unlocking the GIL + )pbdoc"); + +#ifdef VERSION_INFO + m.attr("__version__") = VERSION_INFO; +#else + m.attr("__version__") = "dev"; +#endif +} ``` -Admittedly, the project setup is significantly more involved than Cython or Numba. I've omitted -those steps here, but the full project is available at [INSERT LINK HERE]. +After the code is installed into a `virtualenv` or similar setup, we can use the functions to +demonstrate GIL unlocking: ```python # The billionth Fibonacci number overflows `std::uint64_t`, but that's OK; @@ -161,3 +182,120 @@ t1.join(); t2.join() Finally, it's import to note that scheduling matters; in this example, threads run in serial because the GIL-locked thread is started first. + +TODO: Note about double-unlocking: + +```c++ +void recurse_unlock() { + py::gil_scoped_release release; + return recurse_unlock(); +} +``` + +>
+> Python 3.8.2 (default, Apr 27 2020, 15:53:34) 
+> [GCC 9.3.0] on linux
+> Type "help", "copyright", "credits" or "license" for more information.
+> >>> from speiceio_pybind11 import recurse_unlock
+> >>> recurse_unlock()
+> Fatal Python error: PyEval_SaveThread: NULL tstate
+> Python runtime state: initialized
+> 
+> Current thread 0x00007f213a627740 (most recent call first):
+> File "<stdin>", line 1 in <module>
+>  [1]    34943 abort (core dumped)  python
+> 
+ +# PyO3 + +```python +N = 1_000_000_000; + +from speiceio_pyo3 import fibonacci_gil, fibonacci_nogil +``` + +```python +%%time +_ = fibonacci_gil(N) +``` + +>
+> CPU times: user 283 ms, sys: 0 ns, total: 283 ms
+> Wall time: 282 ms
+> 
+ +```python +%%time +_ = fibonacci_nogil(N) +``` + +>
+> CPU times: user 284 ms, sys: 0 ns, total: 284 ms
+> Wall time: 284 ms
+> 
+ +```python +%%time +from threading import Thread + +# Create the two threads to run on +t1 = Thread(target=fibonacci_gil, args=[N]) +t2 = Thread(target=fibonacci_gil, args=[N]) +# Start the threads +t1.start(); t2.start() +# Wait for the threads to finish +t1.join(); t2.join() +``` + +>
+> CPU times: user 503 ms, sys: 3.83 ms, total: 507 ms
+> Wall time: 506 ms
+> 
+ +```python +%%time + +t1 = Thread(target=fibonacci_nogil, args=[N]) +t2 = Thread(target=fibonacci_gil, args=[N]) +t1.start(); t2.start() +t1.join(); t2.join() +``` + +>
+> CPU times: user 501 ms, sys: 3.96 ms, total: 505 ms
+> Wall time: 252 ms
+> 
+ +```python +%%time + +# Note that the GIL-locked version is started first +t1 = Thread(target=fibonacci_gil, args=[N]) +t2 = Thread(target=fibonacci_nogil, args=[N]) +t1.start(); t2.start() +t1.join(); t2.join() +``` + +>
+> CPU times: user 533 ms, sys: 3.69 ms, total: 537 ms
+> Wall time: 537 ms
+> 
+ +Interestingly enough, Rust's borrow rules actually _prevent_ double-unlocking because the GIL handle +can't be transferred across threads: + +```rust +fn recursive_unlock(py: Python) -> PyResult<()> { + py.allow_threads(|| recursive_unlock(py)) +} +``` + +>
+> error[E0277]: `std::rc::Rc<()>` cannot be shared between threads safely
+>   --> src/lib.rs:38:8
+>    |
+> 38 |     py.allow_threads(|| recursive_unlock(py))
+>    |        ^^^^^^^^^^^^^ `std::rc::Rc<()>` cannot be shared between threads safely
+>    |
+>    = help: within `pyo3::python::Python<'_>`, the trait `std::marker::Sync` is not implemented for `std::rc::Rc<()>`
+> 
From 4337e74d6d6465f23c0494b0cadf1e00e59bfe8d Mon Sep 17 00:00:00 2001 From: Bradlee Speice Date: Tue, 30 Jun 2020 16:38:45 -0400 Subject: [PATCH 06/13] Remove some of the boring statistics And add the Rust code --- _posts/2020-06-29-release-the-gil-pt.-2.md | 129 ++++++++------------- 1 file changed, 49 insertions(+), 80 deletions(-) diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md index 2764c60..9a61cdc 100644 --- a/_posts/2020-06-29-release-the-gil-pt.-2.md +++ b/_posts/2020-06-29-release-the-gil-pt.-2.md @@ -102,31 +102,7 @@ N = 1_000_000_000; from speiceio_pybind11 import fibonacci_gil, fibonacci_nogil ``` -We'll first run each function independently: - -```python -%%time -_ = fibonacci_gil(N); -``` - ->
-> CPU times: user 350 ms, sys: 3.54 ms, total: 354 ms
-> Wall time: 355 ms
-> 
- -```python -%%time -_ = fibonacci_nogil(N); -``` - ->
-> CPU times: user 385 ms, sys: 0 ns, total: 385 ms
-> Wall time: 384 ms
-> 
- -There's some minor variation in how long it takes to run the code, but not a material difference. -When running the same function in multiple threads, we expect the run time to double; even though -there are multiple threads, they effectively run in serial because of the GIL: +Even when using two threads, the code is effectively serial: ```python %%time @@ -146,6 +122,8 @@ t1.join(); t2.join() > Wall time: 705 ms > +The elapsed ("wall") time is effectively the same as the time spent executing on the CPU ("user"). + However, if one thread unlocks the GIL first, then the threads will execute in parallel: ```python @@ -162,26 +140,7 @@ t1.join(); t2.join() > Wall time: 372 ms > -While it takes the same amount of CPU time to compute the result ("user" time), the run time ("wall" -time) is cut in half because the code is now running in parallel. - -```python -%%time - -# Note that the GIL-locked version is started first -t1 = Thread(target=fibonacci_gil, args=[N]) -t2 = Thread(target=fibonacci_nogil, args=[N]) -t1.start(); t2.start() -t1.join(); t2.join() -``` - ->
-> CPU times: user 736 ms, sys: 0 ns, total: 736 ms
-> Wall time: 734 ms
-> 
- -Finally, it's import to note that scheduling matters; in this example, threads run in serial because -the GIL-locked thread is started first. +The CPU time ("user") hasn't changed, but the elapsed time ("wall") is effectively cut in half. TODO: Note about double-unlocking: @@ -208,32 +167,57 @@ void recurse_unlock() { # PyO3 +```rust +use pyo3::prelude::*; +use pyo3::wrap_pyfunction; + +fn fibonacci_impl(n: u64) -> u64 { + if n <= 1 { + return n; + } + + let mut a: u64 = 0; + let mut b: u64 = 1; + let mut c: u64 = a + b; + + for _i in 2..n { + a = b; + b = c; + // We're not particularly concerned about the actual result, just in keeping the + // processor busy. + c = a.overflowing_add(b).0; + } + + c +} + +#[pyfunction] +fn fibonacci_gil(n: u64) -> PyResult<u64> { + // The GIL is implicitly held here + Ok(fibonacci_impl(n)) +} + +#[pyfunction] +fn fibonacci_nogil(py: Python, n: u64) -> PyResult<u64> { + // Explicitly release the GIL + py.allow_threads(|| Ok(fibonacci_impl(n))) +} + +#[pymodule] +fn speiceio_pyo3(_py: Python, m: &PyModule) -> PyResult<()> { + m.add_wrapped(wrap_pyfunction!(fibonacci_gil))?; + m.add_wrapped(wrap_pyfunction!(fibonacci_nogil))?; + + Ok(()) +} +``` + ```python N = 1_000_000_000; from speiceio_pyo3 import fibonacci_gil, fibonacci_nogil ``` -```python -%%time -_ = fibonacci_gil(N) -``` - ->
-> CPU times: user 283 ms, sys: 0 ns, total: 283 ms
-> Wall time: 282 ms
-> 
- -```python -%%time -_ = fibonacci_nogil(N) -``` - ->
-> CPU times: user 284 ms, sys: 0 ns, total: 284 ms
-> Wall time: 284 ms
-> 
- ```python %%time from threading import Thread @@ -266,21 +250,6 @@ t1.join(); t2.join() > Wall time: 252 ms > -```python -%%time - -# Note that the GIL-locked version is started first -t1 = Thread(target=fibonacci_gil, args=[N]) -t2 = Thread(target=fibonacci_nogil, args=[N]) -t1.start(); t2.start() -t1.join(); t2.join() -``` - ->
-> CPU times: user 533 ms, sys: 3.69 ms, total: 537 ms
-> Wall time: 537 ms
-> 
- Interestingly enough, Rust's borrow rules actually _prevent_ double-unlocking because the GIL handle can't be transferred across threads: From a458ea2dacd13a7933d68a3fab27cfdf9f91a4d4 Mon Sep 17 00:00:00 2001 From: Bradlee Speice Date: Tue, 30 Jun 2020 17:03:20 -0400 Subject: [PATCH 07/13] Note on technical similarities. --- Gemfile.lock | 2 +- _posts/2020-06-29-release-the-gil-pt.-2.md | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 40a5613..310c738 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -75,4 +75,4 @@ DEPENDENCIES tzinfo-data BUNDLED WITH - 1.17.3 + 2.1.4 diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md index 9a61cdc..eaa82f5 100644 --- a/_posts/2020-06-29-release-the-gil-pt.-2.md +++ b/_posts/2020-06-29-release-the-gil-pt.-2.md @@ -1,6 +1,6 @@ --- layout: post -title: "Release the GIL: Part 2 - Pybind11, PyO3" +title: "Release the GIL: Pybind11, PyO3" description: "More Python Parallelism" category: tags: [python, rust, c++] @@ -26,8 +26,11 @@ and Python", and they certainly deliver on that. Setting up a hybrid project whe and Python (using setuptools) could coexist was straight-forward, and the repository also works as [a template](https://github.com/speice-io/release-the-gil-pybind11/settings) for future projects. -TODO: Include anything about how Pybind11 and Cython are similar because of compilation to C++? -Maybe also talk about project setup being a good deal more complicated? +On a technical level, there's a great deal of overlap between Pybind11 and Cython. Where Pybind11 +starts with C++ and facilitates interaction with the interpreter, Cython starts with a Python-like +language and facilitates interaction with other code written in C++. In a way, Pybind11 is for C++ +developers who want to interact with Python, and Cython is for Python developers who want to +interact with C++. 
Just like the previous post, we'll examine a simple Fibonacci sequence implementation to demonstrate how Python's threading model interacts with Pybind11: From 64dc036205240da0e0d7050560c616d9ca6faa27 Mon Sep 17 00:00:00 2001 From: Bradlee Speice Date: Tue, 30 Jun 2020 17:14:29 -0400 Subject: [PATCH 08/13] Rewording --- _posts/2020-06-29-release-the-gil-pt.-2.md | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md index eaa82f5..662cdc2 100644 --- a/_posts/2020-06-29-release-the-gil-pt.-2.md +++ b/_posts/2020-06-29-release-the-gil-pt.-2.md @@ -26,11 +26,10 @@ and Python", and they certainly deliver on that. Setting up a hybrid project whe and Python (using setuptools) could coexist was straight-forward, and the repository also works as [a template](https://github.com/speice-io/release-the-gil-pybind11/settings) for future projects. -On a technical level, there's a great deal of overlap between Pybind11 and Cython. Where Pybind11 -starts with C++ and facilitates interaction with the interpreter, Cython starts with a Python-like -language and facilitates interaction with other code written in C++. In a way, Pybind11 is for C++ -developers who want to interact with Python, and Cython is for Python developers who want to -interact with C++. +There's a great deal of overlap between Pybind11 and Cython. Where Pybind11 makes it easy for C++ to +interact with the interpreter, Cython uses a Python-like language to facilitate interaction with +C++. Another way of thinking about it is like this: Pybind11 is for C++ developers who want to interact +with Python, and Cython is for Python developers who want to interact with C++. 
Just like the previous post, we'll examine a simple Fibonacci sequence implementation to demonstrate how Python's threading model interacts with Pybind11: @@ -46,9 +45,8 @@ inline std::uint64_t fibonacci(std::uint64_t n) { std::uint64_t a = 0; std::uint64_t b = 1; - std::uint64_t c = 0; + std::uint64_t c = a + b; - c = a + b; for (std::uint64_t _i = 2; _i < n; _i++) { a = b; b = c; From fb958ac92a16af7c8601f6ec5ca9b5038f324dd1 Mon Sep 17 00:00:00 2001 From: Bradlee Speice Date: Tue, 30 Jun 2020 17:27:06 -0400 Subject: [PATCH 09/13] Bundle update --- Gemfile.lock | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 310c738..c66b54e 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -9,12 +9,12 @@ GEM eventmachine (>= 0.12.9) http_parser.rb (~> 0.6.0) eventmachine (1.2.7) - ffi (1.12.2) + ffi (1.13.1) forwardable-extended (2.6.0) http_parser.rb (0.6.0) i18n (0.9.5) concurrent-ruby (~> 1.0) - jekyll (3.8.6) + jekyll (3.8.7) addressable (~> 2.4) colorator (~> 1.0) em-websocket (~> 0.5) @@ -48,11 +48,11 @@ GEM mercenary (0.3.6) pathutil (0.16.2) forwardable-extended (~> 2.6) - public_suffix (4.0.4) - rb-fsevent (0.10.3) + public_suffix (4.0.5) + rb-fsevent (0.10.4) rb-inotify (0.10.1) ffi (~> 1.0) - rouge (3.17.0) + rouge (3.20.0) rubyzip (2.3.0) safe_yaml (1.0.5) sass (3.7.4) From 8027538bb003c7f174c7f3a92be3671d6fc39c51 Mon Sep 17 00:00:00 2001 From: Bradlee Speice Date: Tue, 30 Jun 2020 17:53:02 -0400 Subject: [PATCH 10/13] Proofreading --- _posts/2020-06-29-release-the-gil-pt.-2.md | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md index 662cdc2..6a8d8e7 100644 --- a/_posts/2020-06-29-release-the-gil-pt.-2.md +++ b/_posts/2020-06-29-release-the-gil-pt.-2.md @@ -92,8 +92,8 @@ PYBIND11_MODULE(speiceio_pybind11, m) { } ``` -After the code is installed into a `virtualenv` or similar setup, we can 
use the functions to -demonstrate GIL unlocking: +After building the C++ module, those functions can be used to demonstrate the effect of unlocking +the GIL. ```python # The billionth Fibonacci number overflows `std::uint64_t`, but that's OK; @@ -103,7 +103,7 @@ N = 1_000_000_000; from speiceio_pybind11 import fibonacci_gil, fibonacci_nogil ``` -Even when using two threads, the code is effectively serial: +Even though two threads are used, the GIL prevents those threads from running in parallel: ```python %%time @@ -123,9 +123,11 @@ t1.join(); t2.join() > Wall time: 705 ms > -The elapsed ("wall") time is effectively the same as the time spent executing on the CPU ("user"). +Because the elapsed ("wall") time is effectively the same as the time spent executing on the CPU +("user"), there was no benefit to using multiple threads. -However, if one thread unlocks the GIL first, then the threads will execute in parallel: +However, if one thread unlocks the GIL first, the Python interpreter is allowed to execute the +second thread in parallel: ```python %%time @@ -141,9 +143,10 @@ t1.join(); t2.join() > Wall time: 372 ms > -The CPU time ("user") hasn't changed, but the elapsed time ("wall") is effectively cut in half. +The CPU time ("user") hasn't changed much, but the elapsed time ("wall") is effectively cut in half. 
-TODO: Note about double-unlocking: +Caution is advised though; attempting to unlock the GIL when it isn't locked will terminate the +current process: ```c++ void recurse_unlock() { From 6faaa702fa2e1ab76cd69dffbe292d0c6dfa60d2 Mon Sep 17 00:00:00 2001 From: Bradlee Speice Date: Tue, 30 Jun 2020 17:58:35 -0400 Subject: [PATCH 11/13] Proofreading --- _posts/2020-06-29-release-the-gil-pt.-2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md index 6a8d8e7..7c0c61a 100644 --- a/_posts/2020-06-29-release-the-gil-pt.-2.md +++ b/_posts/2020-06-29-release-the-gil-pt.-2.md @@ -103,7 +103,7 @@ N = 1_000_000_000; from speiceio_pybind11 import fibonacci_gil, fibonacci_nogil ``` -Even though two threads are used, the GIL prevents those threads from running in parallel: +In the first example, even though two threads are used, the GIL constrains code to run in serial: ```python %%time From 9ee58b7daaae2db78b35206ece1bbed3127c593f Mon Sep 17 00:00:00 2001 From: Bradlee Speice Date: Wed, 29 Jul 2020 16:50:57 -0400 Subject: [PATCH 12/13] Snippet from other notes --- _posts/2020-06-29-release-the-gil-pt.-2.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md index 7c0c61a..fe23ada 100644 --- a/_posts/2020-06-29-release-the-gil-pt.-2.md +++ b/_posts/2020-06-29-release-the-gil-pt.-2.md @@ -171,6 +171,8 @@ void recurse_unlock() { # PyO3 +Now that pyo3 is stable, represents a great candidate for bridge. 
+ ```rust use pyo3::prelude::*; use pyo3::wrap_pyfunction; From ffc8c52b477ae41b7808d2526f5359a191ae0ca0 Mon Sep 17 00:00:00 2001 From: Bradlee Speice Date: Sun, 10 Nov 2024 16:33:38 -0500 Subject: [PATCH 13/13] Revert --- .gitignore | 3 +- Gemfile.lock | 12 +- _posts/2020-06-29-release-the-gil-pt.-2.md | 276 --------------------- 3 files changed, 7 insertions(+), 284 deletions(-) delete mode 100644 _posts/2020-06-29-release-the-gil-pt.-2.md diff --git a/.gitignore b/.gitignore index 095c115..ddf4d8b 100644 --- a/.gitignore +++ b/.gitignore @@ -3,5 +3,4 @@ _site/ .sass-cache/ .jekyll-metadata .bundle/ -vendor/ -.vscode/ \ No newline at end of file +vendor/ \ No newline at end of file diff --git a/Gemfile.lock b/Gemfile.lock index c66b54e..40a5613 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -9,12 +9,12 @@ GEM eventmachine (>= 0.12.9) http_parser.rb (~> 0.6.0) eventmachine (1.2.7) - ffi (1.13.1) + ffi (1.12.2) forwardable-extended (2.6.0) http_parser.rb (0.6.0) i18n (0.9.5) concurrent-ruby (~> 1.0) - jekyll (3.8.7) + jekyll (3.8.6) addressable (~> 2.4) colorator (~> 1.0) em-websocket (~> 0.5) @@ -48,11 +48,11 @@ GEM mercenary (0.3.6) pathutil (0.16.2) forwardable-extended (~> 2.6) - public_suffix (4.0.5) - rb-fsevent (0.10.4) + public_suffix (4.0.4) + rb-fsevent (0.10.3) rb-inotify (0.10.1) ffi (~> 1.0) - rouge (3.20.0) + rouge (3.17.0) rubyzip (2.3.0) safe_yaml (1.0.5) sass (3.7.4) @@ -75,4 +75,4 @@ DEPENDENCIES tzinfo-data BUNDLED WITH - 2.1.4 + 1.17.3 diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md deleted file mode 100644 index fe23ada..0000000 --- a/_posts/2020-06-29-release-the-gil-pt.-2.md +++ /dev/null @@ -1,276 +0,0 @@ ---- -layout: post -title: "Release the GIL: Pybind11, PyO3" -description: "More Python Parallelism" -category: -tags: [python, rust, c++] ---- - -I've been continuing experiments with parallelism in Python; while these techniques are a bit niche, -it's still fun to push the performance 
envelope. In addition to tools like -[Cython](https://cython.org/) and [Numba](https://numba.pydata.org/) (covered -[here](//2019/12/release-the-gil.html)) that attempt to stay as close to Python as possible, other -projects are available that act as a bridge between Python and other languages. The goal is to make -cooperation simple without compromising independence. - -In practice, this "cooperation" between languages is important for performance reasons. Code written -in C++ shouldn't have to care about the Python GIL. However, unless the GIL is explicitly unlocked, -it will remain implicitly held; though the Python interpreter _could_ be making progress on a -separate thread, it will be stuck waiting on the current operation to complete. We'll look at some -techniques below for managing the GIL in a Python extension. - -# Pybind11 - -The motto of [Pybind11](https://github.com/pybind/pybind11) is "seamless operability between C++11 -and Python", and they certainly deliver on that. Setting up a hybrid project where C++ (using CMake) -and Python (using setuptools) could coexist was straight-forward, and the repository also works as -[a template](https://github.com/speice-io/release-the-gil-pybind11/settings) for future projects. - -There's a great deal of overlap between Pybind11 and Cython. Where Pybind11 makes it easy for C++ to -interact with the interpreter, Cython uses a Python-like language to facilitate interaction with -C++. Another way of thinking about is like this: Pybind11 is for C++ developers who want to interact -with Python, and Cython is for Python developers who want to interact with C++. 
- -Just like the previous post, we'll examine a simple Fibonacci sequence implementation to demonstrate -how Python's threading model interacts with Pybind11: - -```c++ -#include -#include - -inline std::uint64_t fibonacci(std::uint64_t n) { - if (n <= 1) { - return n; - } - - std::uint64_t a = 0; - std::uint64_t b = 1; - std::uint64_t c = a + b; - - for (std::uint64_t _i = 2; _i < n; _i++) { - a = b; - b = c; - c = a + b; - } - - return c; -} - -std::uint64_t fibonacci_gil(std::uint64_t n) { - // The GIL is held by default when entering C++ from Python, so we need no - // manipulation here. Interestingly enough, re-acquiring a held GIL is a safe - // operation (within the same thread), so feel free to scatter - // `py::gil_scoped_acquire` throughout the code. - return fibonacci(n); -} - -std::uint64_t fibonacci_nogil(std::uint64_t n) { - // Because the GIL is held by default, we need to explicitly release it here - // to run in parallel. - // WARNING: Releasing the lock multiple times will crash the process. - - py::gil_scoped_release release; - return fibonacci(n); -} - -PYBIND11_MODULE(speiceio_pybind11, m) { - - m.def("fibonacci_gil", &fibonacci_gil, R"pbdoc( - Calculate the Nth Fibonacci number while implicitly holding the GIL - )pbdoc"); - - m.def("fibonacci_nogil", &fibonacci_nogil, - R"pbdoc( - Calculate the Nth Fibonacci number after explicitly unlocking the GIL - )pbdoc"); - -#ifdef VERSION_INFO - m.attr("__version__") = VERSION_INFO; -#else - m.attr("__version__") = "dev"; -#endif -} -``` - -After building the C++ module, those functions can be used to demonstrate the effect of unlocking -the GIL. - -```python -# The billionth Fibonacci number overflows `std::uint64_t`, but that's OK; -# our purpose is keeping the CPU busy, not getting the correct result. 
-N = 1_000_000_000; - -from speiceio_pybind11 import fibonacci_gil, fibonacci_nogil -``` - -In the first example, even though two threads are used, the GIL constrains code to run in serial: - -```python -%%time -from threading import Thread - -# Create the two threads to run on -t1 = Thread(target=fibonacci_gil, args=[N]) -t2 = Thread(target=fibonacci_gil, args=[N]) -# Start the threads -t1.start(); t2.start() -# Wait for the threads to finish -t1.join(); t2.join() -``` - ->
-> CPU times: user 709 ms, sys: 0 ns, total: 709 ms
-> Wall time: 705 ms
-> 
- -Because the elapsed ("wall") time is effectively the same as the time spent executing on the CPU -("user"), there was no benefit to using multiple threads. - -However, if one thread unlocks the GIL first, the Python interpreter is allowed to execute the -second thread in parallel: - -```python -%%time - -t1 = Thread(target=fibonacci_nogil, args=[N]) -t2 = Thread(target=fibonacci_gil, args=[N]) -t1.start(); t2.start() -t1.join(); t2.join() -``` - ->
-> CPU times: user 734 ms, sys: 7.89 ms, total: 742 ms
-> Wall time: 372 ms
-> 
- -The CPU time ("user") hasn't changed much, but the elapsed time ("wall") is effectively cut in half. - -Caution is advised though; attempting to unlock the GIL when it isn't locked will terminate the -current process: - -```c++ -void recurse_unlock() { - py::gil_scoped_release release; - return recurse_unlock(); -} -``` - ->
-> Python 3.8.2 (default, Apr 27 2020, 15:53:34) 
-> [GCC 9.3.0] on linux
-> Type "help", "copyright", "credits" or "license" for more information.
-> >>> from speiceio_pybind11 import recurse_unlock
-> >>> recurse_unlock()
-> Fatal Python error: PyEval_SaveThread: NULL tstate
-> Python runtime state: initialized
-> 
-> Current thread 0x00007f213a627740 (most recent call first):
-> File "", line 1 in 
->  [1]    34943 abort (core dumped)  python
-> 
- -# PyO3 - -Now that pyo3 is stable, represents a great candidate for bridge. - -```rust -use pyo3::prelude::*; -use pyo3::wrap_pyfunction; - -fn fibonacci_impl(n: u64) -> u64 { - if n <= 1 { - return n; - } - - let mut a: u64 = 0; - let mut b: u64 = 1; - let mut c: u64 = a + b; - - for _i in 2..n { - a = b; - b = c; - // We're not particularly concerned about the actual result, just in keeping the - // processor busy. - c = a.overflowing_add(b).0; - } - - c -} - -#[pyfunction] -fn fibonacci_gil(n: u64) -> PyResult { - // The GIL is implicitly held here - Ok(fibonacci_impl(n)) -} - -#[pyfunction] -fn fibonacci_nogil(py: Python, n: u64) -> PyResult { - // Explicitly release the GIL - py.allow_threads(|| Ok(fibonacci_impl(n))) -} - -#[pymodule] -fn speiceio_pyo3(_py: Python, m: &PyModule) -> PyResult<()> { - m.add_wrapped(wrap_pyfunction!(fibonacci_gil))?; - m.add_wrapped(wrap_pyfunction!(fibonacci_nogil))?; - - Ok(()) -} -``` - -```python -N = 1_000_000_000; - -from speiceio_pyo3 import fibonacci_gil, fibonacci_nogil -``` - -```python -%%time -from threading import Thread - -# Create the two threads to run on -t1 = Thread(target=fibonacci_gil, args=[N]) -t2 = Thread(target=fibonacci_gil, args=[N]) -# Start the threads -t1.start(); t2.start() -# Wait for the threads to finish -t1.join(); t2.join() -``` - ->
-> CPU times: user 503 ms, sys: 3.83 ms, total: 507 ms
-> Wall time: 506 ms
-> 
- -```python -%%time - -t1 = Thread(target=fibonacci_nogil, args=[N]) -t2 = Thread(target=fibonacci_gil, args=[N]) -t1.start(); t2.start() -t1.join(); t2.join() -``` - ->
-> CPU times: user 501 ms, sys: 3.96 ms, total: 505 ms
-> Wall time: 252 ms
-> 
- -Interestingly enough, Rust's borrow rules actually _prevent_ double-unlocking because the GIL handle -can't be transferred across threads: - -```rust -fn recursive_unlock(py: Python) -> PyResult<()> { - py.allow_threads(|| recursive_unlock(py)) -} -``` - ->
-> error[E0277]: `std::rc::Rc<()>` cannot be shared between threads safely
->   --> src/lib.rs:38:8
->    |
-> 38 |     py.allow_threads(|| recursive_unlock(py))
->    |        ^^^^^^^^^^^^^ `std::rc::Rc<()>` cannot be shared between threads safely
->    |
->    = help: within `pyo3::python::Python<'_>`, the trait `std::marker::Sync` is not implemented for `std::rc::Rc<()>`
->