From 5c13a8cf8d6af5535871f03d7cf8a13fde40e58b Mon Sep 17 00:00:00 2001
From: Bradlee Speice <bradlee@speice.io>
Date: Mon, 29 Jun 2020 18:26:03 -0400
Subject: [PATCH 01/13] First draft of pybind11

Having issues with the Rust code taking *forever*. Going to break out
the compiler explorer and see if it's doing something different from
C++.
---
 .gitignore                                 |   3 +-
 _posts/2020-06-29-release-the-gil-pt.-2.md | 162 +++++++++++++++++++++
 2 files changed, 164 insertions(+), 1 deletion(-)
 create mode 100644 _posts/2020-06-29-release-the-gil-pt.-2.md
diff --git a/.gitignore b/.gitignore
index ddf4d8b..095c115 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,5 @@ _site/
 .sass-cache/
 .jekyll-metadata
 .bundle/
-vendor/
\ No newline at end of file
+vendor/
+.vscode/
\ No newline at end of file
diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md
new file mode 100644
index 0000000..cbd5f6f
--- /dev/null
+++ b/_posts/2020-06-29-release-the-gil-pt.-2.md
@@ -0,0 +1,162 @@
+---
+layout: post
+title: "Release the GIL: Part 2 - Pybind11, PyO3"
+description: "More Python Parallelism"
+category:
+tags: [python]
+---
+
+I've been continuing experiments with parallelism in Python; while these techniques are a bit niche,
+it's still fun to push the performance envelope. In addition to tools like
+[Cython](https://cython.org/) and [Numba](https://numba.pydata.org/) (covered
+[here](//2019/12/release-the-gil.html)) that attempt to stay as close to Python as possible, other
+projects are available that act as a bridge between Python and other languages. The goal is to make
+cooperation simple without compromising independence.
+
+In practice, this "cooperation" between languages is important for performance reasons. Code written
+in C++ shouldn't have to care about the Python GIL. However, unless the GIL is explicitly unlocked,
+it will remain implicitly held; though the Python interpreter _could_ be making progress on a
+separate thread, it will be stuck waiting on the current operation to complete. We'll look at some
+techniques below for managing the GIL in a Python extension.
+
+# Pybind11
+
+The motto of [Pybind11](https://github.com/pybind/pybind11) is "seamless operability between C++11
+and Python", and they certainly deliver on that. My experience was that it was relatively simple to
+set up a hybrid project where C++ (using CMake) and Python (using setuptools) were able to
+peacefully coexist. We'll examine a simple Fibonacci sequence implementation to demonstrate how
+Python's threading model interacts with Pybind11.
+
+The C++ implementation is very simple:
+
+```c++
+#include <cstdint>
+
+inline std::uint64_t fibonacci(std::uint64_t n) {
+  if (n <= 1) {
+    return n;
+  }
+
+  std::uint64_t a = 0;
+  std::uint64_t b = 1;
+  std::uint64_t c = 0;
+
+  c = a + b;
+  for (std::uint64_t _i = 2; _i < n; _i++) {
+    a = b;
+    b = c;
+    c = a + b;
+  }
+
+  return c;
+}
+
+std::uint64_t fibonacci_gil(std::uint64_t n) {
+  // The GIL is held by default when entering C++ from Python, so we need no
+  // manipulation here. Interestingly enough, re-acquiring a held GIL is a safe
+  // operation (within the same thread), so feel free to scatter
+  // `py::gil_scoped_acquire` throughout the code.
+  return fibonacci(n);
+}
+
+std::uint64_t fibonacci_nogil(std::uint64_t n) {
+  // Because the GIL is held by default, we need to explicitly release it here.
+  // Note that like Cython, releasing the lock multiple times will crash the
+  // interpreter.
+
+  py::gil_scoped_release release;
+  return fibonacci(n);
+}
+```
+
+Admittedly, the project setup is significantly more involved than Cython or Numba. I've omitted
+those steps here, but the full project is available at [INSERT LINK HERE].
+
+```python
+# This number will overflow, but that's OK; our purpose isn't to get an accurate result,
+# it's simply to keep the processor busy.
+N = 1_000_000_000;
+
+from fibonacci import fibonacci_gil, fibonacci_nogil
+```
+
+We'll first run each function independently:
+
+```python
+%%time
+_ = fibonacci_gil(N);
+```
+
+> <pre>
+> CPU times: user 350 ms, sys: 3.54 ms, total: 354 ms
+> Wall time: 355 ms
+> </pre>
+
+```python
+%%time
+_ = fibonacci_nogil(N);
+```
+
+> <pre>
+> CPU times: user 385 ms, sys: 0 ns, total: 385 ms
+> Wall time: 384 ms
+> </pre>
+
+There's some minor variation in how long it takes to run the code, but not a material difference.
+When running the same function in multiple threads, we expect the run time to double; even though
+there are multiple threads, they effectively run in serial because of the GIL:
+
+```python
+%%time
+from threading import Thread
+
+# Create the two threads to run on
+t1 = Thread(target=fibonacci_gil, args=[N])
+t2 = Thread(target=fibonacci_gil, args=[N])
+# Start the threads
+t1.start(); t2.start()
+# Wait for the threads to finish
+t1.join(); t2.join()
+```
+
+> <pre>
+> CPU times: user 709 ms, sys: 0 ns, total: 709 ms
+> Wall time: 705 ms
+> </pre>
+
+However, if one thread unlocks the GIL first, then the threads will execute in parallel:
+
+```python
+%%time
+
+t1 = Thread(target=fibonacci_nogil, args=[N])
+t2 = Thread(target=fibonacci_gil, args=[N])
+t1.start(); t2.start()
+t1.join(); t2.join()
+```
+
+> <pre>
+> CPU times: user 734 ms, sys: 7.89 ms, total: 742 ms
+> Wall time: 372 ms
+> </pre>
+
+While it takes the same amount of CPU time to compute the result ("user" time), the run time ("wall"
+time) is cut in half because the code is now running in parallel.
+
+```python
+%%time
+
+# Note that the GIL-locked version is started first
+t1 = Thread(target=fibonacci_gil, args=[N])
+t2 = Thread(target=fibonacci_nogil, args=[N])
+t1.start(); t2.start()
+t1.join(); t2.join()
+```
+
+> <pre>
+> CPU times: user 736 ms, sys: 0 ns, total: 736 ms
+> Wall time: 734 ms
+> </pre>
+
+Finally, it's import to note that scheduling matters; in this example, threads run in serial because
+the GIL-locked thread is started first.

From 1b24d3d09206c113b8c54908079c22624f845c8a Mon Sep 17 00:00:00 2001
From: Bradlee Speice <bradlee@speice.io>
Date: Mon, 29 Jun 2020 22:48:02 -0400
Subject: [PATCH 02/13] Finish the pybind11 part

---
 _posts/2020-06-29-release-the-gil-pt.-2.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md
index cbd5f6f..f48e352 100644
--- a/_posts/2020-06-29-release-the-gil-pt.-2.md
+++ b/_posts/2020-06-29-release-the-gil-pt.-2.md
@@ -31,6 +31,7 @@ The C++ implementation is very simple:
 
 ```c++
 #include <cstdint>
+#include <pybind11/pybind.h>
 
 inline std::uint64_t fibonacci(std::uint64_t n) {
   if (n <= 1) {
@@ -77,7 +78,7 @@ those steps here, but the full project is available at [INSERT LINK HERE].
 # it's simply to keep the processor busy.
 N = 1_000_000_000;
 
-from fibonacci import fibonacci_gil, fibonacci_nogil
+from speiceio_pybind11 import fibonacci_gil, fibonacci_nogil
 ```
 
 We'll first run each function independently:

From 44a314a401663eda2ed14478a997db9305987290 Mon Sep 17 00:00:00 2001
From: Bradlee Speice <bradlee@speice.io>
Date: Tue, 30 Jun 2020 14:23:33 -0400
Subject: [PATCH 03/13] Minor cleanup

---
 _posts/2020-06-29-release-the-gil-pt.-2.md | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md
index f48e352..534df85 100644
--- a/_posts/2020-06-29-release-the-gil-pt.-2.md
+++ b/_posts/2020-06-29-release-the-gil-pt.-2.md
@@ -22,12 +22,12 @@ techniques below for managing the GIL in a Python extension.
 # Pybind11
 
 The motto of [Pybind11](https://github.com/pybind/pybind11) is "seamless operability between C++11
-and Python", and they certainly deliver on that. My experience was that it was relatively simple to
-set up a hybrid project where C++ (using CMake) and Python (using setuptools) were able to
-peacefully coexist. We'll examine a simple Fibonacci sequence implementation to demonstrate how
-Python's threading model interacts with Pybind11.
+and Python", and they certainly deliver on that. Setting up a hybrid project where C++ (using CMake)
+and Python (using setuptools) could coexist was straight-forward, and the repository also works as
+[a template](LINK HERE) for future projects.
 
-The C++ implementation is very simple:
+Just like the previous post, we'll examine a simple Fibonacci sequence implementation to demonstrate
+how Python's threading model interacts with Pybind11:
 
 ```c++
 #include <cstdint>
@@ -61,9 +61,9 @@ std::uint64_t fibonacci_gil(std::uint64_t n) {
 }
 
 std::uint64_t fibonacci_nogil(std::uint64_t n) {
-  // Because the GIL is held by default, we need to explicitly release it here.
-  // Note that like Cython, releasing the lock multiple times will crash the
-  // interpreter.
+  // Because the GIL is held by default, we need to explicitly release it here
+  // to run in parallel.
+  // WARNING: Releasing the lock multiple times will crash the process.
 
   py::gil_scoped_release release;
   return fibonacci(n);
@@ -74,8 +74,8 @@ Admittedly, the project setup is significantly more involved than Cython or Numb
 those steps here, but the full project is available at [INSERT LINK HERE].
 
 ```python
-# This number will overflow, but that's OK; our purpose isn't to get an accurate result,
-# it's simply to keep the processor busy.
+# The billionth Fibonacci number overflows `std::uint64_t`, but that's OK;
+# our purpose is keeping the CPU busy, not getting the correct result.
 N = 1_000_000_000;
 
 from speiceio_pybind11 import fibonacci_gil, fibonacci_nogil

From 1e18b201f5259fd328e8db738fc73fa9449bd5c6 Mon Sep 17 00:00:00 2001
From: Bradlee Speice <bradlee@speice.io>
Date: Tue, 30 Jun 2020 14:31:26 -0400
Subject: [PATCH 04/13] Repository URL

---
 _posts/2020-06-29-release-the-gil-pt.-2.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md
index 534df85..bb4eaf2 100644
--- a/_posts/2020-06-29-release-the-gil-pt.-2.md
+++ b/_posts/2020-06-29-release-the-gil-pt.-2.md
@@ -24,7 +24,7 @@ techniques below for managing the GIL in a Python extension.
 The motto of [Pybind11](https://github.com/pybind/pybind11) is "seamless operability between C++11
 and Python", and they certainly deliver on that. Setting up a hybrid project where C++ (using CMake)
 and Python (using setuptools) could coexist was straight-forward, and the repository also works as
-[a template](LINK HERE) for future projects.
+[a template](https://github.com/speice-io/release-the-gil-pybind11/settings) for future projects.
 
 Just like the previous post, we'll examine a simple Fibonacci sequence implementation to demonstrate
 how Python's threading model interacts with Pybind11:

From 7489733f64ba3010ec133a7c197121ad5efa6bfd Mon Sep 17 00:00:00 2001
From: Bradlee Speice <bradlee@speice.io>
Date: Tue, 30 Jun 2020 16:34:25 -0400
Subject: [PATCH 05/13] Notes about double-unlock

---
 _posts/2020-06-29-release-the-gil-pt.-2.md | 144 ++++++++++++++++++++-
 1 file changed, 141 insertions(+), 3 deletions(-)

diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md
index bb4eaf2..2764c60 100644
--- a/_posts/2020-06-29-release-the-gil-pt.-2.md
+++ b/_posts/2020-06-29-release-the-gil-pt.-2.md
@@ -3,7 +3,7 @@ layout: post
 title: "Release the GIL: Part 2 - Pybind11, PyO3"
 description: "More Python Parallelism"
 category:
-tags: [python]
+tags: [python, rust, c++]
 ---
 
 I've been continuing experiments with parallelism in Python; while these techniques are a bit niche,
@@ -26,6 +26,9 @@ and Python", and they certainly deliver on that. Setting up a hybrid project whe
 and Python (using setuptools) could coexist was straight-forward, and the repository also works as
 [a template](https://github.com/speice-io/release-the-gil-pybind11/settings) for future projects.
 
+TODO: Include anything about how Pybind11 and Cython are similar because of compilation to C++?
+Maybe also talk about project setup being a good deal more complicated?
+
 Just like the previous post, we'll examine a simple Fibonacci sequence implementation to demonstrate
 how Python's threading model interacts with Pybind11:
 
@@ -68,10 +71,28 @@ std::uint64_t fibonacci_nogil(std::uint64_t n) {
   py::gil_scoped_release release;
   return fibonacci(n);
 }
+
+PYBIND11_MODULE(speiceio_pybind11, m) {
+
+  m.def("fibonacci_gil", &fibonacci_gil, R"pbdoc(
+        Calculate the Nth Fibonacci number while implicitly holding the GIL
+    )pbdoc");
+
+  m.def("fibonacci_nogil", &fibonacci_nogil,
+        R"pbdoc(
+        Calculate the Nth Fibonacci number after explicitly unlocking the GIL
+    )pbdoc");
+
+#ifdef VERSION_INFO
+  m.attr("__version__") = VERSION_INFO;
+#else
+  m.attr("__version__") = "dev";
+#endif
+}
 ```
 
-Admittedly, the project setup is significantly more involved than Cython or Numba. I've omitted
-those steps here, but the full project is available at [INSERT LINK HERE].
+After the code is installed into a `virtualenv` or similar setup, we can use the functions to
+demonstrate GIL unlocking:
 
 ```python
 # The billionth Fibonacci number overflows `std::uint64_t`, but that's OK;
@@ -161,3 +182,120 @@ t1.join(); t2.join()
 
 Finally, it's import to note that scheduling matters; in this example, threads run in serial because
 the GIL-locked thread is started first.
+
+TODO: Note about double-unlocking:
+
+```c++
+void recurse_unlock() {
+  py::gil_scoped_release release;
+  return recurse_unlock();
+}
+```
+
+> <pre>
+> Python 3.8.2 (default, Apr 27 2020, 15:53:34) 
+> [GCC 9.3.0] on linux
+> Type "help", "copyright", "credits" or "license" for more information.
+> >>> from speiceio_pybind11 import recurse_unlock
+> >>> recurse_unlock()
+> Fatal Python error: PyEval_SaveThread: NULL tstate
+> Python runtime state: initialized
+> 
+> Current thread 0x00007f213a627740 (most recent call first):
+> File "<stdin>", line 1 in <module>
+>  [1]    34943 abort (core dumped)  python
+> </pre>
+
+# PyO3
+
+```python
+N = 1_000_000_000;
+
+from speiceio_pyo3 import fibonacci_gil, fibonacci_nogil
+```
+
+```python
+%%time
+_ = fibonacci_gil(N)
+```
+
+> <pre>
+> CPU times: user 283 ms, sys: 0 ns, total: 283 ms
+> Wall time: 282 ms
+> </pre>
+
+```python
+%%time
+_ = fibonacci_nogil(N)
+```
+
+> <pre>
+> CPU times: user 284 ms, sys: 0 ns, total: 284 ms
+> Wall time: 284 ms
+> </pre>
+
+```python
+%%time
+from threading import Thread
+
+# Create the two threads to run on
+t1 = Thread(target=fibonacci_gil, args=[N])
+t2 = Thread(target=fibonacci_gil, args=[N])
+# Start the threads
+t1.start(); t2.start()
+# Wait for the threads to finish
+t1.join(); t2.join()
+```
+
+> <pre>
+> CPU times: user 503 ms, sys: 3.83 ms, total: 507 ms
+> Wall time: 506 ms
+> </pre>
+
+```python
+%%time
+
+t1 = Thread(target=fibonacci_nogil, args=[N])
+t2 = Thread(target=fibonacci_gil, args=[N])
+t1.start(); t2.start()
+t1.join(); t2.join()
+```
+
+> <pre>
+> CPU times: user 501 ms, sys: 3.96 ms, total: 505 ms
+> Wall time: 252 ms
+> </pre>
+
+```python
+%%time
+
+# Note that the GIL-locked version is started first
+t1 = Thread(target=fibonacci_gil, args=[N])
+t2 = Thread(target=fibonacci_nogil, args=[N])
+t1.start(); t2.start()
+t1.join(); t2.join()
+```
+
+> <pre>
+> CPU times: user 533 ms, sys: 3.69 ms, total: 537 ms
+> Wall time: 537 ms
+> </pre>
+
+Interestingly enough, Rust's borrow rules actually _prevent_ double-unlocking because the GIL handle
+can't be transferred across threads:
+
+```rust
+fn recursive_unlock(py: Python) -> PyResult<()> {
+    py.allow_threads(|| recursive_unlock(py))
+}
+```
+
+> <pre>
+> error[E0277]: `std::rc::Rc<()>` cannot be shared between threads safely
+>   --> src/lib.rs:38:8
+>    |
+> 38 |     py.allow_threads(|| recursive_unlock(py))
+>    |        ^^^^^^^^^^^^^ `std::rc::Rc<()>` cannot be shared between threads safely
+>    |
+>    = help: within `pyo3::python::Python<'_>`, the trait `std::marker::Sync` is not implemented for `std::rc::Rc<()>`
+> </pre>

From 4337e74d6d6465f23c0494b0cadf1e00e59bfe8d Mon Sep 17 00:00:00 2001
From: Bradlee Speice <bradlee@speice.io>
Date: Tue, 30 Jun 2020 16:38:45 -0400
Subject: [PATCH 06/13] Remove some of the boring statistics

And add the Rust code
---
 _posts/2020-06-29-release-the-gil-pt.-2.md | 129 ++++++++-------------
 1 file changed, 49 insertions(+), 80 deletions(-)

diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md
index 2764c60..9a61cdc 100644
--- a/_posts/2020-06-29-release-the-gil-pt.-2.md
+++ b/_posts/2020-06-29-release-the-gil-pt.-2.md
@@ -102,31 +102,7 @@ N = 1_000_000_000;
 from speiceio_pybind11 import fibonacci_gil, fibonacci_nogil
 ```
 
-We'll first run each function independently:
-
-```python
-%%time
-_ = fibonacci_gil(N);
-```
-
-> <pre>
-> CPU times: user 350 ms, sys: 3.54 ms, total: 354 ms
-> Wall time: 355 ms
-> </pre>
-
-```python
-%%time
-_ = fibonacci_nogil(N);
-```
-
-> <pre>
-> CPU times: user 385 ms, sys: 0 ns, total: 385 ms
-> Wall time: 384 ms
-> </pre>
-
-There's some minor variation in how long it takes to run the code, but not a material difference.
-When running the same function in multiple threads, we expect the run time to double; even though
-there are multiple threads, they effectively run in serial because of the GIL:
+Even when using two threads, the code is effectively serial:
 
 ```python
 %%time
@@ -146,6 +122,8 @@ t1.join(); t2.join()
 > Wall time: 705 ms
 > </pre>
 
+The elapsed ("wall") time is effectively the same as the time spent executing on the CPU ("user").
+
 However, if one thread unlocks the GIL first, then the threads will execute in parallel:
 
 ```python
@@ -162,26 +140,7 @@ t1.join(); t2.join()
 > Wall time: 372 ms
 > </pre>
 
-While it takes the same amount of CPU time to compute the result ("user" time), the run time ("wall"
-time) is cut in half because the code is now running in parallel.
-
-```python
-%%time
-
-# Note that the GIL-locked version is started first
-t1 = Thread(target=fibonacci_gil, args=[N])
-t2 = Thread(target=fibonacci_nogil, args=[N])
-t1.start(); t2.start()
-t1.join(); t2.join()
-```
-
-> <pre>
-> CPU times: user 736 ms, sys: 0 ns, total: 736 ms
-> Wall time: 734 ms
-> </pre>
-
-Finally, it's import to note that scheduling matters; in this example, threads run in serial because
-the GIL-locked thread is started first.
+The CPU time ("user") hasn't changed, but the elapsed time ("wall") is effectively cut in half.
 
 TODO: Note about double-unlocking:
 
@@ -208,32 +167,57 @@ void recurse_unlock() {
 
 # PyO3
 
+```rust
+use pyo3::prelude::*;
+use pyo3::wrap_pyfunction;
+
+fn fibonacci_impl(n: u64) -> u64 {
+    if n <= 1 {
+        return n;
+    }
+
+    let mut a: u64 = 0;
+    let mut b: u64 = 1;
+    let mut c: u64 = a + b;
+
+    for _i in 2..n {
+        a = b;
+        b = c;
+        // We're not particularly concerned about the actual result, just in keeping the
+        // processor busy.
+        c = a.overflowing_add(b).0;
+    }
+
+    c
+}
+
+#[pyfunction]
+fn fibonacci_gil(n: u64) -> PyResult<u64> {
+    // The GIL is implicitly held here
+    Ok(fibonacci_impl(n))
+}
+
+#[pyfunction]
+fn fibonacci_nogil(py: Python, n: u64) -> PyResult<u64> {
+    // Explicitly release the GIL
+    py.allow_threads(|| Ok(fibonacci_impl(n)))
+}
+
+#[pymodule]
+fn speiceio_pyo3(_py: Python, m: &PyModule) -> PyResult<()> {
+    m.add_wrapped(wrap_pyfunction!(fibonacci_gil))?;
+    m.add_wrapped(wrap_pyfunction!(fibonacci_nogil))?;
+
+    Ok(())
+}
+```
+
 ```python
 N = 1_000_000_000;
 
 from speiceio_pyo3 import fibonacci_gil, fibonacci_nogil
 ```
 
-```python
-%%time
-_ = fibonacci_gil(N)
-```
-
-> <pre>
-> CPU times: user 283 ms, sys: 0 ns, total: 283 ms
-> Wall time: 282 ms
-> </pre>
-
-```python
-%%time
-_ = fibonacci_nogil(N)
-```
-
-> <pre>
-> CPU times: user 284 ms, sys: 0 ns, total: 284 ms
-> Wall time: 284 ms
-> </pre>
-
 ```python
 %%time
 from threading import Thread
@@ -266,21 +250,6 @@ t1.join(); t2.join()
 > Wall time: 252 ms
 > </pre>
 
-```python
-%%time
-
-# Note that the GIL-locked version is started first
-t1 = Thread(target=fibonacci_gil, args=[N])
-t2 = Thread(target=fibonacci_nogil, args=[N])
-t1.start(); t2.start()
-t1.join(); t2.join()
-```
-
-> <pre>
-> CPU times: user 533 ms, sys: 3.69 ms, total: 537 ms
-> Wall time: 537 ms
-> </pre>
-
 Interestingly enough, Rust's borrow rules actually _prevent_ double-unlocking because the GIL handle
 can't be transferred across threads:
 

From a458ea2dacd13a7933d68a3fab27cfdf9f91a4d4 Mon Sep 17 00:00:00 2001
From: Bradlee Speice <bradlee@speice.io>
Date: Tue, 30 Jun 2020 17:03:20 -0400
Subject: [PATCH 07/13] Note on technical similarities.

---
 Gemfile.lock                               | 2 +-
 _posts/2020-06-29-release-the-gil-pt.-2.md | 9 ++++++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/Gemfile.lock b/Gemfile.lock
index 40a5613..310c738 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -75,4 +75,4 @@ DEPENDENCIES
   tzinfo-data
 
 BUNDLED WITH
-   1.17.3
+   2.1.4
diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md
index 9a61cdc..eaa82f5 100644
--- a/_posts/2020-06-29-release-the-gil-pt.-2.md
+++ b/_posts/2020-06-29-release-the-gil-pt.-2.md
@@ -1,6 +1,6 @@
 ---
 layout: post
-title: "Release the GIL: Part 2 - Pybind11, PyO3"
+title: "Release the GIL: Pybind11, PyO3"
 description: "More Python Parallelism"
 category:
 tags: [python, rust, c++]
@@ -26,8 +26,11 @@ and Python", and they certainly deliver on that. Setting up a hybrid project whe
 and Python (using setuptools) could coexist was straight-forward, and the repository also works as
 [a template](https://github.com/speice-io/release-the-gil-pybind11/settings) for future projects.
 
-TODO: Include anything about how Pybind11 and Cython are similar because of compilation to C++?
-Maybe also talk about project setup being a good deal more complicated?
+On a technical level, there's a great deal of overlap between Pybind11 and Cython. Where Pybind11
+starts with C++ and facilitates interaction with the interpreter, Cython starts with a Python-like
+language and facilitates interaction with other code written in C++. In a way, Pybind11 is for C++
+developers who want to interact with Python, and Cython is for Python developers who want to
+interact with C++.
 
 Just like the previous post, we'll examine a simple Fibonacci sequence implementation to demonstrate
 how Python's threading model interacts with Pybind11:

From 64dc036205240da0e0d7050560c616d9ca6faa27 Mon Sep 17 00:00:00 2001
From: Bradlee Speice <bradlee@speice.io>
Date: Tue, 30 Jun 2020 17:14:29 -0400
Subject: [PATCH 08/13] Rewording

---
 _posts/2020-06-29-release-the-gil-pt.-2.md | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md
index eaa82f5..662cdc2 100644
--- a/_posts/2020-06-29-release-the-gil-pt.-2.md
+++ b/_posts/2020-06-29-release-the-gil-pt.-2.md
@@ -26,11 +26,10 @@ and Python", and they certainly deliver on that. Setting up a hybrid project whe
 and Python (using setuptools) could coexist was straight-forward, and the repository also works as
 [a template](https://github.com/speice-io/release-the-gil-pybind11/settings) for future projects.
 
-On a technical level, there's a great deal of overlap between Pybind11 and Cython. Where Pybind11
-starts with C++ and facilitates interaction with the interpreter, Cython starts with a Python-like
-language and facilitates interaction with other code written in C++. In a way, Pybind11 is for C++
-developers who want to interact with Python, and Cython is for Python developers who want to
-interact with C++.
+There's a great deal of overlap between Pybind11 and Cython. Where Pybind11 makes it easy for C++ to
+interact with the interpreter, Cython uses a Python-like language to facilitate interaction with
+C++. Another way of thinking about is like this: Pybind11 is for C++ developers who want to interact
+with Python, and Cython is for Python developers who want to interact with C++.
 
 Just like the previous post, we'll examine a simple Fibonacci sequence implementation to demonstrate
 how Python's threading model interacts with Pybind11:
@@ -46,9 +45,8 @@ inline std::uint64_t fibonacci(std::uint64_t n) {
 
   std::uint64_t a = 0;
   std::uint64_t b = 1;
-  std::uint64_t c = 0;
+  std::uint64_t c = a + b;
 
-  c = a + b;
   for (std::uint64_t _i = 2; _i < n; _i++) {
     a = b;
     b = c;

From fb958ac92a16af7c8601f6ec5ca9b5038f324dd1 Mon Sep 17 00:00:00 2001
From: Bradlee Speice <bradlee@speice.io>
Date: Tue, 30 Jun 2020 17:27:06 -0400
Subject: [PATCH 09/13] Bundle update

---
 Gemfile.lock | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/Gemfile.lock b/Gemfile.lock
index 310c738..c66b54e 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -9,12 +9,12 @@ GEM
       eventmachine (>= 0.12.9)
       http_parser.rb (~> 0.6.0)
     eventmachine (1.2.7)
-    ffi (1.12.2)
+    ffi (1.13.1)
     forwardable-extended (2.6.0)
     http_parser.rb (0.6.0)
     i18n (0.9.5)
       concurrent-ruby (~> 1.0)
-    jekyll (3.8.6)
+    jekyll (3.8.7)
       addressable (~> 2.4)
       colorator (~> 1.0)
       em-websocket (~> 0.5)
@@ -48,11 +48,11 @@ GEM
     mercenary (0.3.6)
     pathutil (0.16.2)
       forwardable-extended (~> 2.6)
-    public_suffix (4.0.4)
-    rb-fsevent (0.10.3)
+    public_suffix (4.0.5)
+    rb-fsevent (0.10.4)
     rb-inotify (0.10.1)
       ffi (~> 1.0)
-    rouge (3.17.0)
+    rouge (3.20.0)
     rubyzip (2.3.0)
     safe_yaml (1.0.5)
     sass (3.7.4)

From 8027538bb003c7f174c7f3a92be3671d6fc39c51 Mon Sep 17 00:00:00 2001
From: Bradlee Speice <bradlee@speice.io>
Date: Tue, 30 Jun 2020 17:53:02 -0400
Subject: [PATCH 10/13] Proofreading

---
 _posts/2020-06-29-release-the-gil-pt.-2.md | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md
index 662cdc2..6a8d8e7 100644
--- a/_posts/2020-06-29-release-the-gil-pt.-2.md
+++ b/_posts/2020-06-29-release-the-gil-pt.-2.md
@@ -92,8 +92,8 @@ PYBIND11_MODULE(speiceio_pybind11, m) {
 }
 ```
 
-After the code is installed into a `virtualenv` or similar setup, we can use the functions to
-demonstrate GIL unlocking:
+After building the C++ module, those functions can be used to demonstrate the effect of unlocking
+the GIL.
 
 ```python
 # The billionth Fibonacci number overflows `std::uint64_t`, but that's OK;
@@ -103,7 +103,7 @@ N = 1_000_000_000;
 from speiceio_pybind11 import fibonacci_gil, fibonacci_nogil
 ```
 
-Even when using two threads, the code is effectively serial:
+Even though two threads are used, the GIL prevents those threads from running in parallel:
 
 ```python
 %%time
@@ -123,9 +123,11 @@ t1.join(); t2.join()
 > Wall time: 705 ms
 > </pre>
 
-The elapsed ("wall") time is effectively the same as the time spent executing on the CPU ("user").
+Because the elapsed ("wall") time is effectively the same as the time spent executing on the CPU
+("user"), there was no benefit to using multiple threads.
 
-However, if one thread unlocks the GIL first, then the threads will execute in parallel:
+However, if one thread unlocks the GIL first, the Python interpreter is allowed to execute the
+second thread in parallel:
 
 ```python
 %%time
@@ -141,9 +143,10 @@ t1.join(); t2.join()
 > Wall time: 372 ms
 > </pre>
 
-The CPU time ("user") hasn't changed, but the elapsed time ("wall") is effectively cut in half.
+The CPU time ("user") hasn't changed much, but the elapsed time ("wall") is effectively cut in half.
 
-TODO: Note about double-unlocking:
+Caution is advised though; attempting to unlock the GIL when it isn't locked will terminate the
+current process:
 
 ```c++
 void recurse_unlock() {

From 6faaa702fa2e1ab76cd69dffbe292d0c6dfa60d2 Mon Sep 17 00:00:00 2001
From: Bradlee Speice <bradlee@speice.io>
Date: Tue, 30 Jun 2020 17:58:35 -0400
Subject: [PATCH 11/13] Proofreading

---
 _posts/2020-06-29-release-the-gil-pt.-2.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md
index 6a8d8e7..7c0c61a 100644
--- a/_posts/2020-06-29-release-the-gil-pt.-2.md
+++ b/_posts/2020-06-29-release-the-gil-pt.-2.md
@@ -103,7 +103,7 @@ N = 1_000_000_000;
 from speiceio_pybind11 import fibonacci_gil, fibonacci_nogil
 ```
 
-Even though two threads are used, the GIL prevents those threads from running in parallel:
+In the first example, even though two threads are used, the GIL constrains code to run in serial:
 
 ```python
 %%time

From 9ee58b7daaae2db78b35206ece1bbed3127c593f Mon Sep 17 00:00:00 2001
From: Bradlee Speice <bradlee@speice.io>
Date: Wed, 29 Jul 2020 16:50:57 -0400
Subject: [PATCH 12/13] Snippet from other notes

---
 _posts/2020-06-29-release-the-gil-pt.-2.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md
index 7c0c61a..fe23ada 100644
--- a/_posts/2020-06-29-release-the-gil-pt.-2.md
+++ b/_posts/2020-06-29-release-the-gil-pt.-2.md
@@ -171,6 +171,8 @@ void recurse_unlock() {
 
 # PyO3
 
+Now that pyo3 is stable, represents a great candidate for bridge.
+
 ```rust
 use pyo3::prelude::*;
 use pyo3::wrap_pyfunction;

From ffc8c52b477ae41b7808d2526f5359a191ae0ca0 Mon Sep 17 00:00:00 2001
From: Bradlee Speice <bradlee@speice.io>
Date: Sun, 10 Nov 2024 16:33:38 -0500
Subject: [PATCH 13/13] Revert

---
 .gitignore                                 |   3 +-
 Gemfile.lock                               |  12 +-
 _posts/2020-06-29-release-the-gil-pt.-2.md | 276 ---------------------
 3 files changed, 7 insertions(+), 284 deletions(-)
 delete mode 100644 _posts/2020-06-29-release-the-gil-pt.-2.md

diff --git a/.gitignore b/.gitignore
index 095c115..ddf4d8b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,5 +3,4 @@ _site/
 .sass-cache/
 .jekyll-metadata
 .bundle/
-vendor/
-.vscode/
\ No newline at end of file
+vendor/
\ No newline at end of file
diff --git a/Gemfile.lock b/Gemfile.lock
index c66b54e..40a5613 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -9,12 +9,12 @@ GEM
       eventmachine (>= 0.12.9)
       http_parser.rb (~> 0.6.0)
     eventmachine (1.2.7)
-    ffi (1.13.1)
+    ffi (1.12.2)
     forwardable-extended (2.6.0)
     http_parser.rb (0.6.0)
     i18n (0.9.5)
       concurrent-ruby (~> 1.0)
-    jekyll (3.8.7)
+    jekyll (3.8.6)
       addressable (~> 2.4)
       colorator (~> 1.0)
       em-websocket (~> 0.5)
@@ -48,11 +48,11 @@ GEM
     mercenary (0.3.6)
     pathutil (0.16.2)
       forwardable-extended (~> 2.6)
-    public_suffix (4.0.5)
-    rb-fsevent (0.10.4)
+    public_suffix (4.0.4)
+    rb-fsevent (0.10.3)
     rb-inotify (0.10.1)
       ffi (~> 1.0)
-    rouge (3.20.0)
+    rouge (3.17.0)
     rubyzip (2.3.0)
     safe_yaml (1.0.5)
     sass (3.7.4)
@@ -75,4 +75,4 @@ DEPENDENCIES
   tzinfo-data
 
 BUNDLED WITH
-   2.1.4
+   1.17.3
diff --git a/_posts/2020-06-29-release-the-gil-pt.-2.md b/_posts/2020-06-29-release-the-gil-pt.-2.md
deleted file mode 100644
index fe23ada..0000000
--- a/_posts/2020-06-29-release-the-gil-pt.-2.md
+++ /dev/null
@@ -1,276 +0,0 @@
----
-layout: post
-title: "Release the GIL: Pybind11, PyO3"
-description: "More Python Parallelism"
-category:
-tags: [python, rust, c++]
----
-
-I've been continuing experiments with parallelism in Python; while these techniques are a bit niche,
-it's still fun to push the performance envelope. In addition to tools like
-[Cython](https://cython.org/) and [Numba](https://numba.pydata.org/) (covered
-[here](//2019/12/release-the-gil.html)) that attempt to stay as close to Python as possible, other
-projects are available that act as a bridge between Python and other languages. The goal is to make
-cooperation simple without compromising independence.
-
-In practice, this "cooperation" between languages is important for performance reasons. Code written
-in C++ shouldn't have to care about the Python GIL. However, unless the GIL is explicitly unlocked,
-it will remain implicitly held; though the Python interpreter _could_ be making progress on a
-separate thread, it will be stuck waiting on the current operation to complete. We'll look at some
-techniques below for managing the GIL in a Python extension.
-
-# Pybind11
-
-The motto of [Pybind11](https://github.com/pybind/pybind11) is "seamless operability between C++11
-and Python", and they certainly deliver on that. Setting up a hybrid project where C++ (using CMake)
-and Python (using setuptools) could coexist was straight-forward, and the repository also works as
-[a template](https://github.com/speice-io/release-the-gil-pybind11) for future projects.
-
-There's a great deal of overlap between Pybind11 and Cython. Where Pybind11 makes it easy for C++ to
-interact with the interpreter, Cython uses a Python-like language to facilitate interaction with
-C++. Another way of thinking about it is this: Pybind11 is for C++ developers who want to interact
-with Python, and Cython is for Python developers who want to interact with C++.
-
-Just like the previous post, we'll examine a simple Fibonacci sequence implementation to demonstrate
-how Python's threading model interacts with Pybind11:
-
-```c++
-#include <cstdint>
-#include <pybind11/pybind.h>
-
-inline std::uint64_t fibonacci(std::uint64_t n) {
-  if (n <= 1) {
-    return n;
-  }
-
-  std::uint64_t a = 0;
-  std::uint64_t b = 1;
-  std::uint64_t c = a + b;
-
-  for (std::uint64_t _i = 2; _i < n; _i++) {
-    a = b;
-    b = c;
-    c = a + b;
-  }
-
-  return c;
-}
-
-std::uint64_t fibonacci_gil(std::uint64_t n) {
-  // The GIL is held by default when entering C++ from Python, so we need no
-  // manipulation here. Interestingly enough, re-acquiring a held GIL is a safe
-  // operation (within the same thread), so feel free to scatter
-  // `py::gil_scoped_acquire` throughout the code.
-  return fibonacci(n);
-}
-
-std::uint64_t fibonacci_nogil(std::uint64_t n) {
-  // Because the GIL is held by default, we need to explicitly release it here
-  // to run in parallel.
-  // WARNING: Releasing the lock multiple times will crash the process.
-
-  py::gil_scoped_release release;
-  return fibonacci(n);
-}
-
-PYBIND11_MODULE(speiceio_pybind11, m) {
-
-  m.def("fibonacci_gil", &fibonacci_gil, R"pbdoc(
-        Calculate the Nth Fibonacci number while implicitly holding the GIL
-    )pbdoc");
-
-  m.def("fibonacci_nogil", &fibonacci_nogil,
-        R"pbdoc(
-        Calculate the Nth Fibonacci number after explicitly unlocking the GIL
-    )pbdoc");
-
-#ifdef VERSION_INFO
-  m.attr("__version__") = VERSION_INFO;
-#else
-  m.attr("__version__") = "dev";
-#endif
-}
-```
-
-After building the C++ module, those functions can be used to demonstrate the effect of unlocking
-the GIL.
-
-```python
-# The billionth Fibonacci number overflows `std::uint64_t`, but that's OK;
-# our purpose is keeping the CPU busy, not getting the correct result.
-N = 1_000_000_000;
-
-from speiceio_pybind11 import fibonacci_gil, fibonacci_nogil
-```
-
-In the first example, even though two threads are used, the GIL constrains code to run in serial:
-
-```python
-%%time
-from threading import Thread
-
-# Create the two threads to run on
-t1 = Thread(target=fibonacci_gil, args=[N])
-t2 = Thread(target=fibonacci_gil, args=[N])
-# Start the threads
-t1.start(); t2.start()
-# Wait for the threads to finish
-t1.join(); t2.join()
-```
-
-> <pre>
-> CPU times: user 709 ms, sys: 0 ns, total: 709 ms
-> Wall time: 705 ms
-> </pre>
-
-Because the elapsed ("wall") time is effectively the same as the time spent executing on the CPU
-("user"), there was no benefit to using multiple threads.
-
-However, if one thread unlocks the GIL first, the Python interpreter is allowed to execute the
-second thread in parallel:
-
-```python
-%%time
-
-t1 = Thread(target=fibonacci_nogil, args=[N])
-t2 = Thread(target=fibonacci_gil, args=[N])
-t1.start(); t2.start()
-t1.join(); t2.join()
-```
-
-> <pre>
-> CPU times: user 734 ms, sys: 7.89 ms, total: 742 ms
-> Wall time: 372 ms
-> </pre>
-
-The CPU time ("user") hasn't changed much, but the elapsed time ("wall") is effectively cut in half.
-
-Caution is advised though; attempting to unlock the GIL when it isn't locked will terminate the
-current process:
-
-```c++
-void recurse_unlock() {
-  py::gil_scoped_release release;
-  return recurse_unlock();
-}
-```
-
-> <pre>
-> Python 3.8.2 (default, Apr 27 2020, 15:53:34) 
-> [GCC 9.3.0] on linux
-> Type "help", "copyright", "credits" or "license" for more information.
-> >>> from speiceio_pybind11 import recurse_unlock
-> >>> recurse_unlock()
-> Fatal Python error: PyEval_SaveThread: NULL tstate
-> Python runtime state: initialized
-> 
-> Current thread 0x00007f213a627740 (most recent call first):
-> File "<stdin>", line 1 in <module>
->  [1]    34943 abort (core dumped)  python
-> </pre>
-
-# PyO3
-
-Now that PyO3 is stable, it represents a great candidate for a bridge between Python and Rust.
-
-```rust
-use pyo3::prelude::*;
-use pyo3::wrap_pyfunction;
-
-fn fibonacci_impl(n: u64) -> u64 {
-    if n <= 1 {
-        return n;
-    }
-
-    let mut a: u64 = 0;
-    let mut b: u64 = 1;
-    let mut c: u64 = a + b;
-
-    for _i in 2..n {
-        a = b;
-        b = c;
-        // We're not particularly concerned about the actual result, just in keeping the
-        // processor busy.
-        c = a.overflowing_add(b).0;
-    }
-
-    c
-}
-
-#[pyfunction]
-fn fibonacci_gil(n: u64) -> PyResult<u64> {
-    // The GIL is implicitly held here
-    Ok(fibonacci_impl(n))
-}
-
-#[pyfunction]
-fn fibonacci_nogil(py: Python, n: u64) -> PyResult<u64> {
-    // Explicitly release the GIL
-    py.allow_threads(|| Ok(fibonacci_impl(n)))
-}
-
-#[pymodule]
-fn speiceio_pyo3(_py: Python, m: &PyModule) -> PyResult<()> {
-    m.add_wrapped(wrap_pyfunction!(fibonacci_gil))?;
-    m.add_wrapped(wrap_pyfunction!(fibonacci_nogil))?;
-
-    Ok(())
-}
-```
-
-```python
-N = 1_000_000_000;
-
-from speiceio_pyo3 import fibonacci_gil, fibonacci_nogil
-```
-
-```python
-%%time
-from threading import Thread
-
-# Create the two threads to run on
-t1 = Thread(target=fibonacci_gil, args=[N])
-t2 = Thread(target=fibonacci_gil, args=[N])
-# Start the threads
-t1.start(); t2.start()
-# Wait for the threads to finish
-t1.join(); t2.join()
-```
-
-> <pre>
-> CPU times: user 503 ms, sys: 3.83 ms, total: 507 ms
-> Wall time: 506 ms
-> </pre>
-
-```python
-%%time
-
-t1 = Thread(target=fibonacci_nogil, args=[N])
-t2 = Thread(target=fibonacci_gil, args=[N])
-t1.start(); t2.start()
-t1.join(); t2.join()
-```
-
-> <pre>
-> CPU times: user 501 ms, sys: 3.96 ms, total: 505 ms
-> Wall time: 252 ms
-> </pre>
-
-Interestingly enough, Rust's type system actually _prevents_ double-unlocking because the GIL handle
-can't be shared across threads:
-
-```rust
-fn recursive_unlock(py: Python) -> PyResult<()> {
-    py.allow_threads(|| recursive_unlock(py))
-}
-```
-
-> <pre>
-> error[E0277]: `std::rc::Rc<()>` cannot be shared between threads safely
->   --> src/lib.rs:38:8
->    |
-> 38 |     py.allow_threads(|| recursive_unlock(py))
->    |        ^^^^^^^^^^^^^ `std::rc::Rc<()>` cannot be shared between threads safely
->    |
->    = help: within `pyo3::python::Python<'_>`, the trait `std::marker::Sync` is not implemented for `std::rc::Rc<()>`
-> </pre>