speice.io/assets/js/0fb9ce37.ae75980a.js

1 line
18 KiB
JavaScript

"use strict";(self.webpackChunkspeice_io=self.webpackChunkspeice_io||[]).push([["7848"],{53729:function(e,n,t){t.r(n),t.d(n,{assets:function(){return l},contentTitle:function(){return o},default:function(){return d},frontMatter:function(){return a},metadata:function(){return i},toc:function(){return c}});var i=t(26160),s=t(85893),r=t(50065);let a={slug:"2019/12/release-the-gil",title:"Release the GIL",date:new Date("2019-12-14T12:00:00.000Z"),authors:["bspeice"],tags:[]},o=void 0,l={authorsImageUrls:[void 0]},c=[{value:"Cython",id:"cython",level:2},{value:"Numba",id:"numba",level:2},{value:"Conclusion",id:"conclusion",level:2}];function h(e){let n={a:"a",blockquote:"blockquote",code:"code",em:"em",h2:"h2",li:"li",p:"p",pre:"pre",strong:"strong",ul:"ul",...(0,r.a)(),...e.components};return(0,s.jsxs)(s.Fragment,{children:[(0,s.jsxs)(n.p,{children:["Complaining about the ",(0,s.jsx)(n.a,{href:"https://wiki.python.org/moin/GlobalInterpreterLock",children:"Global Interpreter Lock"}),"\n(GIL) seems like a rite of passage for Python developers. It's easy to criticize a design decision\nmade before multi-core CPU's were widely available, but the fact that it's still around indicates\nthat it generally works ",(0,s.jsx)(n.a,{href:"https://wiki.c2.com/?PrematureOptimization",children:"Good"}),"\n",(0,s.jsx)(n.a,{href:"https://wiki.c2.com/?YouArentGonnaNeedIt",children:"Enough"}),". Besides, there are simple and effective\nworkarounds; it's not hard to start a\n",(0,s.jsx)(n.a,{href:"https://docs.python.org/3/library/multiprocessing.html",children:"new process"})," and use message passing to\nsynchronize code running in parallel."]}),"\n",(0,s.jsxs)(n.p,{children:["Still, wouldn't it be nice to have more than a single active interpreter thread? In an age of\nasynchronicity and ",(0,s.jsx)(n.em,{children:"M:N"})," threading, Python seems lacking. The ideal scenario is to take advantage of\nboth Python's productivity and the modern CPU's parallel capabilities."]}),"\n",(0,s.jsxs)(n.p,{children:["Presented below are two strategies for releasing the GIL's icy grip without giving up on what makes\nPython a nice language to start with. Bear in mind: these are just the tools, no claim is made about\nwhether it's a good idea to use them. Very often, unlocking the GIL is an\n",(0,s.jsx)(n.a,{href:"https://en.wikipedia.org/wiki/XY_problem",children:"XY problem"}),"; you want application performance, and the\nGIL seems like an obvious bottleneck. Remember that any gains from running code in parallel come at\nthe expense of project complexity; messing with the GIL is ultimately messing with Python's memory\nmodel."]}),"\n",(0,s.jsx)(n.pre,{children:(0,s.jsx)(n.code,{className:"language-python",children:"%load_ext Cython\nfrom numba import jit\n\nN = 1_000_000_000\n"})}),"\n",(0,s.jsx)(n.h2,{id:"cython",children:"Cython"}),"\n",(0,s.jsxs)(n.p,{children:["Put simply, ",(0,s.jsx)(n.a,{href:"https://cython.org/",children:"Cython"})," is a programming language that looks a lot like Python,\ngets ",(0,s.jsx)(n.a,{href:"https://en.wikipedia.org/wiki/Source-to-source_compiler",children:"transpiled"})," to C/C++, and integrates\nwell with the ",(0,s.jsx)(n.a,{href:"https://en.wikipedia.org/wiki/CPython",children:"CPython"})," API. It's great for building Python\nwrappers to C and C++ libraries, writing optimized code for numerical processing, and tons more. And\nwhen it comes to managing the GIL, there are two special features:"]}),"\n",(0,s.jsxs)(n.ul,{children:["\n",(0,s.jsxs)(n.li,{children:["The ",(0,s.jsx)(n.code,{children:"nogil"}),"\n",(0,s.jsx)(n.a,{href:"https://cython.readthedocs.io/en/latest/src/userguide/external_C_code.html#declaring-a-function-as-callable-without-the-gil",children:"function annotation"}),"\nasserts that a Cython function is safe to use without the GIL, and compilation will fail if it\ninteracts with Python in an unsafe manner"]}),"\n",(0,s.jsxs)(n.li,{children:["The ",(0,s.jsx)(n.code,{children:"with nogil"}),"\n",(0,s.jsx)(n.a,{href:"https://cython.readthedocs.io/en/latest/src/userguide/external_C_code.html#releasing-the-gil",children:"context manager"}),"\nexplicitly unlocks the CPython GIL while active"]}),"\n"]}),"\n",(0,s.jsxs)(n.p,{children:["Whenever Cython code runs inside a ",(0,s.jsx)(n.code,{children:"with nogil"}),' block on a separate thread, the Python interpreter\nis unblocked and allowed to continue work elsewhere. We\'ll define a "busy work" function that\ndemonstrates this principle in action:']}),"\n",(0,s.jsx)(n.pre,{children:(0,s.jsx)(n.code,{className:"language-python",children:"%%cython\n\n# Annotating a function with `nogil` indicates only that it is safe\n# to call in a `with nogil` block. It *does not* release the GIL.\ncdef unsigned long fibonacci(unsigned long n) nogil:\n if n <= 1:\n return n\n\n cdef unsigned long a = 0, b = 1, c = 0\n\n c = a + b\n for _i in range(2, n):\n a = b\n b = c\n c = a + b\n\n return c\n\n\ndef cython_nogil(unsigned long n):\n # Explicitly release the GIL while running `fibonacci`\n with nogil:\n value = fibonacci(n)\n\n return value\n\n\ndef cython_gil(unsigned long n):\n # Because the GIL is not explicitly released, it implicitly\n # remains acquired when running the `fibonacci` function\n return fibonacci(n)\n"})}),"\n",(0,s.jsx)(n.p,{children:"First, let's time how long it takes Cython to calculate the billionth Fibonacci number:"}),"\n",(0,s.jsx)(n.pre,{children:(0,s.jsx)(n.code,{className:"language-python",children:"%%time\n_ = cython_gil(N);\n"})}),"\n",(0,s.jsxs)(n.blockquote,{children:["\n",(0,s.jsx)("pre",{children:(0,s.jsx)(n.p,{children:"CPU times: user 365 ms, sys: 0 ns, total: 365 ms\nWall time: 372 ms"})}),"\n"]}),"\n",(0,s.jsx)(n.pre,{children:(0,s.jsx)(n.code,{className:"language-python",children:"%%time\n_ = cython_nogil(N);\n"})}),"\n",(0,s.jsxs)(n.blockquote,{children:["\n",(0,s.jsx)("pre",{children:(0,s.jsx)(n.p,{children:"CPU times: user 381 ms, sys: 0 ns, total: 381 ms\nWall time: 388 ms"})}),"\n"]}),"\n",(0,s.jsx)(n.p,{children:"Both versions (with and without GIL) take effectively the same amount of time to run. Even when\nrunning this calculation in parallel on separate threads, it is expected that the run time will\ndouble because only one thread can be active at a time:"}),"\n",(0,s.jsx)(n.pre,{children:(0,s.jsx)(n.code,{className:"language-python",children:"%%time\nfrom threading import Thread\n\n# Create the two threads to run on\nt1 = Thread(target=cython_gil, args=[N])\nt2 = Thread(target=cython_gil, args=[N])\n# Start the threads\nt1.start(); t2.start()\n# Wait for the threads to finish\nt1.join(); t2.join()\n"})}),"\n",(0,s.jsxs)(n.blockquote,{children:["\n",(0,s.jsx)("pre",{children:(0,s.jsx)(n.p,{children:"CPU times: user 641 ms, sys: 5.62 ms, total: 647 ms\nWall time: 645 ms"})}),"\n"]}),"\n",(0,s.jsx)(n.p,{children:"However, if the first thread releases the GIL, the second thread is free to acquire it and run in\nparallel:"}),"\n",(0,s.jsx)(n.pre,{children:(0,s.jsx)(n.code,{className:"language-python",children:"%%time\n\nt1 = Thread(target=cython_nogil, args=[N])\nt2 = Thread(target=cython_gil, args=[N])\nt1.start(); t2.start()\nt1.join(); t2.join()\n"})}),"\n",(0,s.jsxs)(n.blockquote,{children:["\n",(0,s.jsx)("pre",{children:(0,s.jsx)(n.p,{children:"CPU times: user 717 ms, sys: 372 \xb5s, total: 718 ms\nWall time: 358 ms"})}),"\n"]}),"\n",(0,s.jsxs)(n.p,{children:["Because ",(0,s.jsx)(n.code,{children:"user"})," time represents the sum of processing time on all threads, it doesn't change much.\nThe ",(0,s.jsx)(n.a,{href:"https://en.wikipedia.org/wiki/Elapsed_real_time",children:'"wall time"'})," has been cut roughly in half\nbecause each function is running simultaneously."]}),"\n",(0,s.jsxs)(n.p,{children:["Keep in mind that the ",(0,s.jsx)(n.strong,{children:"order in which threads are started"})," makes a difference!"]}),"\n",(0,s.jsx)(n.pre,{children:(0,s.jsx)(n.code,{className:"language-python",children:"%%time\n\n# Note that the GIL-locked version is started first\nt1 = Thread(target=cython_gil, args=[N])\nt2 = Thread(target=cython_nogil, args=[N])\nt1.start(); t2.start()\nt1.join(); t2.join()\n"})}),"\n",(0,s.jsxs)(n.blockquote,{children:["\n",(0,s.jsx)("pre",{children:(0,s.jsx)(n.p,{children:"CPU times: user 667 ms, sys: 0 ns, total: 667 ms\nWall time: 672 ms"})}),"\n"]}),"\n",(0,s.jsx)(n.p,{children:"Even though the second thread releases the GIL while running, it can't start until the first has\ncompleted. Thus, the overall runtime is effectively the same as running two GIL-locked threads."}),"\n",(0,s.jsxs)(n.p,{children:["Finally, be aware that attempting to unlock the GIL from a thread that doesn't own it will crash the\n",(0,s.jsx)(n.strong,{children:"interpreter"}),", not just the thread attempting the unlock:"]}),"\n",(0,s.jsx)(n.pre,{children:(0,s.jsx)(n.code,{className:"language-python",children:"%%cython\n\ncdef int cython_recurse(int n) nogil:\n if n <= 0:\n return 0\n\n with nogil:\n return cython_recurse(n - 1)\n\ncython_recurse(2)\n"})}),"\n",(0,s.jsxs)(n.blockquote,{children:["\n",(0,s.jsxs)("pre",{children:[(0,s.jsx)(n.p,{children:"Fatal Python error: PyEval_SaveThread: NULL tstate"}),(0,s.jsx)(n.p,{children:'Thread 0x00007f499effd700 (most recent call first):\nFile "/home/bspeice/.virtualenvs/release-the-gil/lib/python3.7/site-packages/ipykernel/parentpoller.py", line 39 in run\nFile "/usr/lib/python3.7/threading.py", line 926 in _bootstrap_inner\nFile "/usr/lib/python3.7/threading.py", line 890 in _bootstrap'})]}),"\n"]}),"\n",(0,s.jsxs)(n.p,{children:["In practice, avoiding this issue is simple. First, ",(0,s.jsx)(n.code,{children:"nogil"})," functions probably shouldn't contain\n",(0,s.jsx)(n.code,{children:"with nogil"})," blocks. Second, Cython can\n",(0,s.jsx)(n.a,{href:"https://cython.readthedocs.io/en/latest/src/userguide/external_C_code.html#conditional-acquiring-releasing-the-gil",children:"conditionally acquire/release"}),"\nthe GIL, so these conditions can be used to synchronize access. Finally, Cython's documentation for\n",(0,s.jsx)(n.a,{href:"https://cython.readthedocs.io/en/latest/src/userguide/external_C_code.html#acquiring-and-releasing-the-gil",children:"external C code"}),"\ncontains more detail on how to safely manage the GIL."]}),"\n",(0,s.jsxs)(n.p,{children:["To conclude: use Cython's ",(0,s.jsx)(n.code,{children:"nogil"})," annotation to assert that functions are safe for calling when the\nGIL is unlocked, and ",(0,s.jsx)(n.code,{children:"with nogil"})," to actually unlock the GIL and run those functions."]}),"\n",(0,s.jsx)(n.h2,{id:"numba",children:"Numba"}),"\n",(0,s.jsxs)(n.p,{children:["Like Cython, ",(0,s.jsx)(n.a,{href:"https://numba.pydata.org/",children:"Numba"}),' is a "compiled Python." Where Cython works by\ncompiling a Python-like language to C/C++, Numba compiles Python bytecode ',(0,s.jsx)(n.em,{children:"directly to machine code"}),"\nat runtime. Behavior is controlled with a special ",(0,s.jsx)(n.code,{children:"@jit"})," decorator; calling a decorated function\nfirst compiles it to machine code before running. Calling the function a second time re-uses that\nmachine code unless the argument types have changed."]}),"\n",(0,s.jsxs)(n.p,{children:["Numba works best when a ",(0,s.jsx)(n.code,{children:"nopython=True"})," argument is added to the ",(0,s.jsx)(n.code,{children:"@jit"})," decorator; functions\ncompiled in ",(0,s.jsx)(n.a,{href:"http://numba.pydata.org/numba-doc/latest/user/jit.html?#nopython",children:(0,s.jsx)(n.code,{children:"nopython"})})," mode\navoid the CPython API and have performance comparable to C. Further, adding ",(0,s.jsx)(n.code,{children:"nogil=True"})," to the\n",(0,s.jsx)(n.code,{children:"@jit"})," decorator unlocks the GIL while that function is running. Note that ",(0,s.jsx)(n.code,{children:"nogil"})," and ",(0,s.jsx)(n.code,{children:"nopython"}),"\nare separate arguments; while it is necessary for code to be compiled in ",(0,s.jsx)(n.code,{children:"nopython"})," mode in order to\nrelease the lock, the GIL will remain locked if ",(0,s.jsx)(n.code,{children:"nogil=False"})," (the default)."]}),"\n",(0,s.jsx)(n.p,{children:"Let's repeat the same experiment, this time using Numba instead of Cython:"}),"\n",(0,s.jsx)(n.pre,{children:(0,s.jsx)(n.code,{className:"language-python",children:"# The `int` type annotation is only for humans and is ignored\n# by Numba.\n@jit(nopython=True, nogil=True)\ndef numba_nogil(n: int) -> int:\n if n <= 1:\n return n\n\n a = 0\n b = 1\n\n c = a + b\n for _i in range(2, n):\n a = b\n b = c\n c = a + b\n\n return c\n\n\n# Run using `nopython` mode to receive a performance boost,\n# but GIL remains locked due to `nogil=False` by default.\n@jit(nopython=True)\ndef numba_gil(n: int) -> int:\n if n <= 1:\n return n\n\n a = 0\n b = 1\n\n c = a + b\n for _i in range(2, n):\n a = b\n b = c\n c = a + b\n\n return c\n\n\n# Call each function once to force compilation; we don't want\n# the timing statistics to include how long it takes to compile.\nnumba_nogil(N)\nnumba_gil(N);\n"})}),"\n",(0,s.jsx)(n.p,{children:"We'll perform the same tests as above; first, figure out how long it takes the function to run:"}),"\n",(0,s.jsx)(n.pre,{children:(0,s.jsx)(n.code,{className:"language-python",children:"%%time\n_ = numba_gil(N)\n"})}),"\n",(0,s.jsxs)(n.blockquote,{children:["\n",(0,s.jsx)("pre",{children:(0,s.jsx)(n.p,{children:"CPU times: user 253 ms, sys: 258 \xb5s, total: 253 ms\nWall time: 251 ms"})}),"\n"]}),"\n",(0,s.jsx)("small",{children:(0,s.jsx)(n.p,{children:"Aside: it's not immediately clear why Numba takes ~20% less time to run than Cython for code that should be\neffectively identical after compilation."})}),"\n",(0,s.jsx)(n.p,{children:"When running two GIL-locked threads, the result (as expected) takes around twice as long to compute:"}),"\n",(0,s.jsx)(n.pre,{children:(0,s.jsx)(n.code,{className:"language-python",children:"%%time\nt1 = Thread(target=numba_gil, args=[N])\nt2 = Thread(target=numba_gil, args=[N])\nt1.start(); t2.start()\nt1.join(); t2.join()\n"})}),"\n",(0,s.jsxs)(n.blockquote,{children:["\n",(0,s.jsx)("pre",{children:(0,s.jsx)(n.p,{children:"CPU times: user 541 ms, sys: 3.96 ms, total: 545 ms\nWall time: 541 ms"})}),"\n"]}),"\n",(0,s.jsx)(n.p,{children:"But if the GIL-unlocking thread starts first, both threads run in parallel:"}),"\n",(0,s.jsx)(n.pre,{children:(0,s.jsx)(n.code,{className:"language-python",children:"%%time\nt1 = Thread(target=numba_nogil, args=[N])\nt2 = Thread(target=numba_gil, args=[N])\nt1.start(); t2.start()\nt1.join(); t2.join()\n"})}),"\n",(0,s.jsxs)(n.blockquote,{children:["\n",(0,s.jsx)("pre",{children:(0,s.jsx)(n.p,{children:"CPU times: user 551 ms, sys: 7.77 ms, total: 559 ms\nWall time: 279 ms"})}),"\n"]}),"\n",(0,s.jsx)(n.p,{children:"Just like Cython, starting the GIL-locked thread first leads to poor performance:"}),"\n",(0,s.jsx)(n.pre,{children:(0,s.jsx)(n.code,{className:"language-python",children:"%%time\nt1 = Thread(target=numba_gil, args=[N])\nt2 = Thread(target=numba_nogil, args=[N])\nt1.start(); t2.start()\nt1.join(); t2.join()\n"})}),"\n",(0,s.jsxs)(n.blockquote,{children:["\n",(0,s.jsx)("pre",{children:(0,s.jsx)(n.p,{children:"CPU times: user 524 ms, sys: 0 ns, total: 524 ms\nWall time: 522 ms"})}),"\n"]}),"\n",(0,s.jsxs)(n.p,{children:["Finally, unlike Cython, Numba will unlock the GIL if and only if it is currently acquired;\nrecursively calling ",(0,s.jsx)(n.code,{children:"@jit(nogil=True)"})," functions is perfectly safe:"]}),"\n",(0,s.jsx)(n.pre,{children:(0,s.jsx)(n.code,{className:"language-python",children:"from numba import jit\n\n@jit(nopython=True, nogil=True)\ndef numba_recurse(n: int) -> int:\n if n <= 0:\n return 0\n\n return numba_recurse(n - 1)\n\nnumba_recurse(2);\n"})}),"\n",(0,s.jsx)(n.h2,{id:"conclusion",children:"Conclusion"}),"\n",(0,s.jsx)(n.p,{children:"Before finishing, it's important to address pain points that will show up if these techniques are\nused in a more realistic project:"}),"\n",(0,s.jsxs)(n.p,{children:["First, code running in a GIL-free context will likely also need non-trivial data structures;\nGIL-free functions aren't useful if they're constantly interacting with Python objects whose access\nrequires the GIL. Cython provides\n",(0,s.jsx)(n.a,{href:"http://docs.cython.org/en/latest/src/tutorial/cdef_classes.html",children:"extension types"})," and Numba\nprovides a ",(0,s.jsx)(n.a,{href:"https://numba.pydata.org/numba-doc/dev/user/jitclass.html",children:(0,s.jsx)(n.code,{children:"@jitclass"})})," decorator to\naddress this need."]}),"\n",(0,s.jsx)(n.p,{children:"Second, building and distributing applications that make use of Cython/Numba can be complicated.\nCython packages require running the compiler, (potentially) linking/packaging external dependencies,\nand distributing a binary wheel. Numba is generally simpler because the code being distributed is\npure Python, but can be tricky since errors aren't detected until runtime."}),"\n",(0,s.jsxs)(n.p,{children:["Finally, while unlocking the GIL is often a solution in search of a problem, both Cython and Numba\nprovide tools to directly manage the GIL when appropriate. This enables true parallelism (not just\n",(0,s.jsx)(n.a,{href:"https://stackoverflow.com/a/1050257",children:"concurrency"}),") that is impossible in vanilla Python."]})]})}function d(e={}){let{wrapper:n}={...(0,r.a)(),...e.components};return n?(0,s.jsx)(n,{...e,children:(0,s.jsx)(h,{...e})}):h(e)}},50065:function(e,n,t){t.d(n,{Z:function(){return o},a:function(){return a}});var i=t(67294);let s={},r=i.createContext(s);function a(e){let n=i.useContext(r);return i.useMemo(function(){return"function"==typeof e?e(n):{...n,...e}},[n,e])}function o(e){let n;return n=e.disableParentContext?"function"==typeof e.components?e.components(s):e.components||s:a(e.components),i.createElement(r.Provider,{value:n},e.children)}},26160:function(e){e.exports=JSON.parse('{"permalink":"/2019/12/release-the-gil","source":"@site/blog/2019-12-14-release-the-gil/index.mdx","title":"Release the GIL","description":"Complaining about the Global Interpreter Lock","date":"2019-12-14T12:00:00.000Z","tags":[],"readingTime":8.58,"hasTruncateMarker":true,"authors":[{"name":"Bradlee Speice","socials":{"github":"https://github.com/bspeice"},"key":"bspeice","page":null}],"frontMatter":{"slug":"2019/12/release-the-gil","title":"Release the GIL","date":"2019-12-14T12:00:00.000Z","authors":["bspeice"],"tags":[]},"unlisted":false,"lastUpdatedAt":1731207983000,"prevItem":{"title":"The webpack industrial complex","permalink":"/2011/11/webpack-industrial-complex"},"nextItem":{"title":"Binary format shootout","permalink":"/2019/09/binary-format-shootout"}}')}}]);