Deploy website - based on 6dcbc1a72c

2026-06-09 23:01:46 -04:00 · 2024-11-10 16:43:02 -05:00
parent 5172f66254
commit 741bf44126
262 changed files with 8754 additions and 4196 deletions
@@ -1,6 +0,0 @@
 FROM mcr.microsoft.com/vscode/devcontainers/ruby:0-2.7-bullseye
 RUN wget https://github.com/errata-ai/vale/releases/download/v2.21.0/vale_2.21.0_Linux_64-bit.tar.gz -O /tmp/vale.tar.gz \
 && cd /usr/local/bin \
 && tar xf /tmp/vale.tar.gz \
 && rm /tmp/vale.tar.gz
@@ -1,13 +0,0 @@
 // For format details, see https://aka.ms/devcontainer.json. For config options, see the README at:
 // https://github.com/microsoft/vscode-dev-containers/tree/v0.245.0/containers/ruby
 {
 	"name": "Ruby",
 	"build": {
 		"dockerfile": "Dockerfile"
 	},
 	"runArgs": ["--userns=keep-id"],
 	"remoteUser": "vscode",
 	"containerUser": "vscode",
 	"workspaceMount": "source=${localWorkspaceFolder},target=/workspaces/${localWorkspaceFolderBasename},type=bind,Z"
 }
@@ -1,8 +0,0 @@
 _site/
 .swp
 .sass-cache/
 .jekyll-metadata
 .bundle/
 vendor/
 .styles/
 .vscode/
@@ -1,7 +0,0 @@
 StylesPath = .styles
 MinAlertLevel = suggestion
 Packages = Microsoft, write-good
 [*]
 BasedOnStyles = Vale, Microsoft, write-good
 write-good.E-Prime = NO
@@ -1,29 +0,0 @@
 source "https://rubygems.org"
 # Hello! This is where you manage which Jekyll version is used to run.
 # When you want to use a different version, change it below, save the
 # file and run `bundle install`. Run Jekyll with `bundle exec`, like so:
 #
 #     bundle exec jekyll serve
 #
 # This will help ensure the proper Jekyll version is running.
 # Happy Jekylling!
 gem "jekyll", "~> 3.8.3"
 gem "texture"
 # If you want to use GitHub Pages, remove the "gem "jekyll"" above and
 # uncomment the line below. To upgrade, run `bundle update github-pages`.
 # gem "github-pages", group: :jekyll_plugins
 # If you have any plugins, put them here!
 group :jekyll_plugins do
  gem "jekyll-feed", "~> 0.6"
  gem "jekyll-remote-theme"
 end
 # Windows does not include zoneinfo files, so bundle the tzinfo-data gem
 gem "tzinfo-data", platforms: [:mingw, :mswin, :x64_mingw, :jruby]
 # Performance-booster for watching directories on Windows
 gem "wdm", "~> 0.1.0" if Gem.win_platform?
@@ -1,78 +0,0 @@
 GEM
  remote: https://rubygems.org/
  specs:
    addressable (2.7.0)
      public_suffix (>= 2.0.2, < 5.0)
    colorator (1.1.0)
    concurrent-ruby (1.1.6)
    em-websocket (0.5.1)
      eventmachine (>= 0.12.9)
      http_parser.rb (~> 0.6.0)
    eventmachine (1.2.7)
    ffi (1.12.2)
    forwardable-extended (2.6.0)
    http_parser.rb (0.6.0)
    i18n (0.9.5)
      concurrent-ruby (~> 1.0)
    jekyll (3.8.6)
      addressable (~> 2.4)
      colorator (~> 1.0)
      em-websocket (~> 0.5)
      i18n (~> 0.7)
      jekyll-sass-converter (~> 1.0)
      jekyll-watch (~> 2.0)
      kramdown (~> 1.14)
      liquid (~> 4.0)
      mercenary (~> 0.3.3)
      pathutil (~> 0.9)
      rouge (>= 1.7, < 4)
      safe_yaml (~> 1.0)
    jekyll-feed (0.13.0)
      jekyll (>= 3.7, < 5.0)
    jekyll-remote-theme (0.4.2)
      addressable (~> 2.0)
      jekyll (>= 3.5, < 5.0)
      jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0)
      rubyzip (>= 1.3.0, < 3.0)
    jekyll-sass-converter (1.5.2)
      sass (~> 3.4)
    jekyll-seo-tag (2.6.1)
      jekyll (>= 3.3, < 5.0)
    jekyll-watch (2.2.1)
      listen (~> 3.0)
    kramdown (1.17.0)
    liquid (4.0.3)
    listen (3.2.1)
      rb-fsevent (~> 0.10, >= 0.10.3)
      rb-inotify (~> 0.9, >= 0.9.10)
    mercenary (0.3.6)
    pathutil (0.16.2)
      forwardable-extended (~> 2.6)
    public_suffix (4.0.4)
    rb-fsevent (0.10.3)
    rb-inotify (0.10.1)
      ffi (~> 1.0)
    rouge (3.17.0)
    rubyzip (2.3.0)
    safe_yaml (1.0.5)
    sass (3.7.4)
      sass-listen (~> 4.0.0)
    sass-listen (4.0.0)
      rb-fsevent (~> 0.9, >= 0.9.4)
      rb-inotify (~> 0.9, >= 0.9.7)
    texture (0.3)
      jekyll (~> 3.7)
      jekyll-seo-tag (~> 2.1)
 PLATFORMS
  ruby
 DEPENDENCIES
  jekyll (~> 3.8.3)
  jekyll-feed (~> 0.6)
  jekyll-remote-theme
  texture
  tzinfo-data
 BUNDLED WITH
   2.1.4
@@ -1,44 +0,0 @@
 # Welcome to Jekyll!
 #
 # This config file is meant for settings that affect your whole blog, values
 # which you are expected to set up once and rarely edit after that. If you find
 # yourself editing this file very often, consider using Jekyll's data files
 # feature for the data you need to update frequently.
 #
 # For technical reasons, this file is *NOT* reloaded automatically when you use
 # 'bundle exec jekyll serve'. If you change this file, please restart the server process.
 # Site settings
 # These are used to personalize your new site. If you look in the HTML files,
 # you will see them accessed via {{ site.title }}, {{ site.email }}, and so on.
 # You can create any custom variable you would like, and they will be accessible
 # in the templates via {{ site.myvariable }}.
 title: speice.io
 description: The Old Speice Guy
 email: bradlee@speice.io
 baseurl: "" # the subpath of your site, e.g. /blog
 url: "https://speice.io/" # the base hostname & protocol for your site, e.g. http://example.com
 github_username:  bspeice
 # Build settings
 markdown: kramdown
 # theme: texture
 remote_theme: thelehhman/texture
 plugins:
  - jekyll-feed
  - jekyll-remote-theme
 include: [_pages]
 permalink: /:year/:month/:title.html
 # Exclude from processing.
 # The following items will not be processed, by default. Create a custom list
 # to override the default setting.
 # exclude:
 #   - Gemfile
 #   - Gemfile.lock
 #   - node_modules
 #   - vendor/bundle/
 #   - vendor/cache/
 #   - vendor/gems/
 #   - vendor/ruby/
@@ -1,23 +0,0 @@
 {% if page.layout == 'post' %}
 {% comment %}Thanks to https://www.bytedude.com/jekyll-previous-and-next-posts/{% endcomment %}
 <div class="container">
    <hr>
    <div class="post-nav">
        <div>
            {% if page.previous.url %}
            <a href="{{page.previous.url}}">&laquo;&nbsp;{{page.previous.title}}</a>
            {% endif %}
        </div>
        <div class="post-nav-next">
            {% if page.next.url %}
            <a href="{{page.next.url}}">{{page.next.title}}&nbsp;&raquo;</a>
            {% endif %}
        </div>
    </div>
 </div>
 <script type="text/javascript"
    src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
    </script>
 {% endif %}
@@ -1,7 +0,0 @@
 <meta charset="UTF-8">
 <meta name="viewport" content="width=device-width, initial-scale=1.0">
 <meta http-equiv="X-UA-Compatible" content="ie=edge">
 <link rel="stylesheet" href="{{ "/assets/css/style.css" | relative_url }}">
 <link rel="stylesheet" href="{{ "/assets/css/fonts.css" | prepend: site.baseurl }}">
 <title>{{ page.title | default: site.title }}</title>
 {% seo %}
@@ -1,7 +0,0 @@
 <div class="navbar">
    <a href="{{ "/" | prepend: site.baseurl }}">Home</a>
    <span class="separator"></span>
    <a href="{{ "/about/" | prepend: site.baseurl }}">About</a>
    <span class="separator"></span>
    <a href="{{ "/feed.xml" | prepend: site.baseurl }}">RSS</a>
 </div>
@@ -1,15 +0,0 @@
 <div class="container">
    <h2>{{ site.title }}</h2>
    <h1>{{ site.description }}</h1>
    <ul class="social">
        {%- if site.texture.social_links.github -%}
            <a href="https://github.com/{{ site.texture.social_links.github }}"><li><i class="icon-github-circled"></i></li></a>
        {%- endif -%}
        {%- if site.texture.social_links.linkedIn -%}
            <a href="https://linkedin.com/{{ site.texture.social_links.linkedIn }}"><li><i class="icon-linkedin-squared"></i></li></a>
        {%- endif -%}
        {%- if site.texture.social_links.twitter -%}
            <a href="https://twitter.com/{{ site.texture.social_links.twitter }}"><li><i class="icon-twitter-squared"></i></li></a>
        {%- endif -%}
    </ul>
 </div>
@@ -1,13 +0,0 @@
 ---
 layout: page
 title: About
 permalink: /about/
 ---
 Developer currently living in New York City.
 Best ways to get in contact:
 - Email: [bradlee@speice.io](mailto:bradlee@speice.io)
 - Github: [bspeice](https://github.com/bspeice)
 - LinkedIn: [bradleespeice](https://www.linkedin.com/in/bradleespeice/)
@@ -1,38 +0,0 @@
 ---
 layout: post
 title: "Hello!"
 description: ""
 category:
 tags: []
 ---
 I'll do what I can to keep this short, there's plenty of other things we both should be doing right
 now.
 If you're here for the bread pics, and to marvel in some other culinary side projects, I've got you
 covered:
 ![Saturday Bread]({{ "/assets/images/2018-05-28-bread.jpg" | absolute_url }})
 And no, I'm not posting pictures of earlier attempts that ended up turning into rocks in the oven.
 Okay, just one:
 ![Bread as rock]({{ "/assets/images/2018-05-28-rocks.jpg" | absolute_url }})
 If you're here for keeping up with the man Bradlee Speice, got plenty of that too. Plus some
 up-coming super-nerdy posts about how I'm changing the world.
 And if you're not here for those things: don't have a lot for you, sorry. But you're welcome to let
 me know what needs to change.
 I'm looking forward to making this a place to talk about what's going on in life, I hope you'll
 stick it out with me. The best way to follow what's going on is on my [About](/about/) page, but if
 you want the joy of clicking links, here's a few good ones:
 - Email (people still use this?): [bradlee@speice.io](mailto:bradlee@speice.io)
 - Mastodon (nerd Twitter): [@bradlee](https://mastodon.social/@bradlee)
 - Chat (RiotIM): [@bspeice:matrix.com](https://matrix.to/#/@bspeice:matrix.com)
 - The comments section (not for people with sanity intact): ↓↓↓
 Thanks, and keep it amazing.
@@ -1,177 +0,0 @@
 ---
 layout: post
 title: "What I Learned: Porting Dateutil Parser to Rust"
 description: ""
 category:
 tags: [dtparse, rust]
 ---
 Hi. I'm Bradlee.
 I've mostly been a lurker in Rust for a while, making a couple small contributions here and there.
 So launching [dtparse](https://github.com/bspeice/dtparse) feels like a nice step towards becoming a
 functioning member of society. But not too much, because then you know people start asking you to
 pay bills, and ain't nobody got time for that.
 But I built dtparse, and you can read about my thoughts on the process. Or don't. I won't tell you
 what to do with your life (but you should totally keep reading).
 # Slow down, what?
 OK, fine, I guess I should start with _why_ someone would do this.
 [Dateutil](https://github.com/dateutil/dateutil) is a Python library for handling dates. The
 standard library support for time in Python is kinda dope, but there are a lot of extras that go
 into making it useful beyond just the [datetime](https://docs.python.org/3.6/library/datetime.html)
 module. `dateutil.parser` specifically is code to take all the super-weird time formats people come
 up with and turn them into something actually useful.
 Date/time parsing, it turns out, is just like everything else involving
 [computers](https://infiniteundo.com/post/25326999628/falsehoods-programmers-believe-about-time) and
 [time](https://infiniteundo.com/post/25509354022/more-falsehoods-programmers-believe-about-time): it
 feels like it shouldn't be that difficult to do, until you try to do it, and you realize that people
 suck and this is why
 [we can't have nice things](https://zachholman.com/talk/utc-is-enough-for-everyone-right). But
 alas, we'll try and make contemporary art out of the rubble and give it a pretentious name like
 _Time_.
 ![A gravel mound](/assets/images/2018-06-25-gravel-mound.jpg)
 > [Time](https://www.goodfreephotos.com/united-states/montana/elkhorn/remains-of-the-mining-operation-elkhorn.jpg.php)
 What makes `dateutil.parser` great is that there's a single function with a single argument that
 drives what programmers interact with:
 [`parse(timestr)`](https://github.com/dateutil/dateutil/blob/6dde5d6298cfb81a4c594a38439462799ed2aef2/dateutil/parser/_parser.py#L1258).
 It takes in the time as a string, and gives you back a reasonable "look, this is the best anyone can
 possibly do to make sense of your input" value. It doesn't expect much of you.
 [And now it's in Rust.](https://github.com/bspeice/dtparse/blob/7d565d3a78876dbebd9711c9720364fe9eba7915/src/lib.rs#L1332)
 # Lost in Translation
 Having worked at a bulge-bracket bank watching Java programmers try to be Python programmers, I'm
 admittedly hesitant to publish Python code that's trying to be Rust. Interestingly, Rust code can
 actually do a great job of mimicking Python. It's certainly not idiomatic Rust, but I've had better
 experiences than
 [this guy](https://webcache.googleusercontent.com/search?q=cache:wkYMpktJtnUJ:https://jackstouffer.com/blog/porting_dateutil.html+&cd=3&hl=en&ct=clnk&gl=us)
 who attempted the same thing for D. These are the actual take-aways:
 When transcribing code, **stay as close to the original library as possible**. I'm talking about
 using the same variable names, same access patterns, the whole shebang. It's way too easy to make a
 couple of typos, and all of a sudden your code blows up in new and exciting ways. Having a reference
 manual for verbatim what your code should be means that you don't spend that long debugging
 complicated logic, you're more looking for typos.
 Also, **don't use nice Rust things like enums**. While
 [one time it worked out OK for me](https://github.com/bspeice/dtparse/blob/7d565d3a78876dbebd9711c9720364fe9eba7915/src/lib.rs#L88-L94),
 I also managed to shoot myself in the foot a couple times because `dateutil` stores AM/PM as a
 boolean and I mixed up which was true, and which was false (side note: AM is false, PM is true). In
 general, writing nice code _should not be a first-pass priority_ when you're just trying to recreate
 the same functionality.
 **Exceptions are a pain.** Make peace with it. Python code is just allowed to skip stack frames. So
 when a co-worker told me "Rust is getting try-catch syntax" I properly freaked out. Turns out
 [he's not quite right](https://github.com/rust-lang/rfcs/pull/243), and I'm OK with that. And while
 `dateutil` is pretty well-behaved about not skipping multiple stack frames,
 [130-line try-catch blocks](https://github.com/dateutil/dateutil/blob/16561fc99361979e88cccbd135393b06b1af7e90/dateutil/parser/_parser.py#L730-L865)
 take a while to verify.
 As another Python quirk, **be very careful about
 [long nested if-elif-else blocks](https://github.com/dateutil/dateutil/blob/16561fc99361979e88cccbd135393b06b1af7e90/dateutil/parser/_parser.py#L494-L568)**.
 I used to think that Python's whitespace was just there to get you to format your code correctly. I
 think that no longer. It's way too easy to close a block too early and have incredibly weird issues
 in the logic. Make sure you use an editor that displays indentation levels so you can keep things
 straight.
 **Rust macros are not free.** I originally had the
 [main test body](https://github.com/bspeice/dtparse/blob/b0e737f088eca8e83ab4244c6621a2797d247697/tests/compat.rs#L63-L217)
 wrapped up in a macro using [pyo3](https://github.com/PyO3/PyO3). It took two minutes to compile.
 After
 [moving things to a function](https://github.com/bspeice/dtparse/blob/e017018295c670e4b6c6ee1cfff00dbb233db47d/tests/compat.rs#L76-L205)
 compile times dropped down to ~5 seconds. Turns out 150 lines \* 100 tests = a lot of redundant code
 to be compiled. My new rule of thumb is that any macros longer than 10-15 lines are actually
 functions that need to be liberated, man.
 Finally, **I really miss list comprehensions and dictionary comprehensions.** As a quick comparison,
 see
 [this dateutil code](https://github.com/dateutil/dateutil/blob/16561fc99361979e88cccbd135393b06b1af7e90/dateutil/parser/_parser.py#L476)
 and
 [the implementation in Rust](https://github.com/bspeice/dtparse/blob/7d565d3a78876dbebd9711c9720364fe9eba7915/src/lib.rs#L619-L629).
 I probably wrote it wrong, and I'm sorry. Ultimately though, I hope that these comprehensions can be
 added through macros or syntax extensions. Either way, they're expressive, save typing, and are
 super-readable. Let's get more of that.
 # Using a young language
 Now, Rust is exciting and new, which means that there's opportunity to make a substantive impact. On
 more than one occasion though, I've had issues navigating the Rust ecosystem.
 What I'll call the "canonical library" is still being built. In Python, if you need datetime
 parsing, you use `dateutil`. If you want `decimal` types, it's already in the
 [standard library](https://docs.python.org/3.6/library/decimal.html). While I might've gotten away
 with `f64`, `dateutil` uses decimals, and I wanted to follow the principle of **staying as close to
 the original library as possible**. Thus began my quest to find a decimal library in Rust. What I
 quickly found was summarized in a comment:
 > Writing a BigDecimal is easy. Writing a _good_ BigDecimal is hard.
 >
 > [-cmr](https://github.com/rust-lang/rust/issues/8937#issuecomment-34582794)
 In practice, this means that there are at least [4](https://crates.io/crates/bigdecimal)
 [different](https://crates.io/crates/rust_decimal)
 [implementations](https://crates.io/crates/decimal) [available](https://crates.io/crates/decimate).
 And that's a lot of decisions to worry about when all I'm thinking is "why can't
 [calendar reform](https://en.wikipedia.org/wiki/Calendar_reform) be a thing" and I'm forced to dig
 through a [couple](https://github.com/rust-lang/rust/issues/8937#issuecomment-31661916)
 [different](https://github.com/rust-lang/rfcs/issues/334)
 [threads](https://github.com/rust-num/num/issues/8) to figure out if the library I'm looking at is dead
 or just stable.
 And even when the "canonical library" exists, there's no guarantee that it will be well-maintained.
 [Chrono](https://github.com/chronotope/chrono) is the _de facto_ date/time library in Rust, and just
 released version 0.4.4 like two days ago. Meanwhile,
 [chrono-tz](https://github.com/chronotope/chrono-tz) appears to be dead in the water even though
 [there are people happy to help maintain it](https://github.com/chronotope/chrono-tz/issues/19). I
 know relatively little about it, but it appears that most of the release process is automated;
 keeping that up to date should be a no-brainer.
 ## Trial Maintenance Policy
 Specifically given "maintenance" being an
 [oft-discussed](https://www.reddit.com/r/rust/comments/48540g/thoughts_on_initiators_vs_maintainers/)
 issue, I'm going to try out the following policy to keep things moving on `dtparse`:
 1. Issues/PRs needing _maintainer_ feedback will be updated at least weekly. I want to make sure
   nobody's blocking on me.
 2. To keep issues/PRs needing _contributor_ feedback moving, I'm going to (kindly) ask the
   contributor to check in after two weeks, and close the issue without resolution if I hear nothing
   back after a month.
 The second point I think has the potential to be a bit controversial, so I'm happy to receive
 feedback on that. And if a contributor responds with "hey, still working on it, had a kid and I'm
 running on 30 seconds of sleep a night," then first: congratulations on sustaining human life. And
 second: I don't mind keeping those requests going indefinitely. I just want to try and balance
 keeping things moving with giving people the necessary time they need.
 I should also note that I'm still getting some best practices in place - CONTRIBUTING and
 CONTRIBUTORS files need to be added, as well as issue/PR templates. In progress. None of us are
 perfect.
 # Roadmap and Conclusion
 So if I've now built a `dateutil`-compatible parser, we're done, right? Of course not! That's not
 nearly ambitious enough.
 Ultimately, I'd love to have a library that's capable of parsing everything the Linux `date` command
 can do (and not `date` on OSX, because seriously, BSD coreutils are the worst). I know Rust has a
 coreutils rewrite going on, and `dtparse` would potentially be an interesting candidate since it
 doesn't bring in a lot of extra dependencies. [`humantime`](https://crates.io/crates/humantime)
 could help pick up some of the (current) slack in dtparse, so maybe we can share and care with each
 other?
 All in all, I'm mostly hoping that nobody's already done this and I haven't spent a bit over a month
 on redundant code. So if it exists, tell me. I need to know, but be nice about it, because I'm going
 to take it hard.
 And in the meantime, I'm looking forward to building more. Onwards.
@@ -1,323 +0,0 @@
 ---
 layout: post
 title: "Primitives in Rust are Weird (and Cool)"
 description: "but mostly weird."
 category:
 tags: [rust, c, java, python, x86]
 ---
 I wrote a really small Rust program a while back because I was curious. I was 100% convinced it
 couldn't possibly run:
 ```rust
 fn main() {
    println!("{}", 8.to_string())
 }
 ```
 And to my complete befuddlement, it compiled, ran, and produced a completely sensible output. The
 reason I was so surprised has to do with how Rust treats a special category of things I'm going to
 call _primitives_. In the current version of the Rust book, you'll see them referred to as
 [scalars][rust_scalar], and in older versions they'll be called [primitives][rust_primitive], but
 we're going to stick with the name _primitive_ for the time being. Explaining why this program is so
 cool requires talking about a number of other programming languages, and keeping a consistent
 terminology makes things easier.
 **You've been warned:** this is going to be a tedious post about a relatively minor issue that
 involves Java, Python, C, and x86 Assembly. And also me pretending like I know what I'm talking
 about with assembly.
 # Defining primitives (Java)
 The reason I'm using the name _primitive_ comes from how much of my life is Java right now. Spoiler
 alert: a lot of it. And for the most part I like Java, but I digress. In Java, there's a special
 name for some specific types of values:
 > ```
 > bool    char    byte
 > short   int     long
 > float   double
 > ```
 They are referred to as [primitives][java_primitive]. And relative to the other bits of Java,
 they have two unique features. First, they don't have to worry about the
 [billion-dollar mistake](https://en.wikipedia.org/wiki/Tony_Hoare#Apologies_and_retractions);
 primitives in Java can never be `null`. Second: *they can't have instance methods*.
 Remember that Rust program from earlier? Java has no idea what to do with it:
 ```java
 class Main {
    public static void main(String[] args) {
        int x = 8;
        System.out.println(x.toString()); // Triggers a compiler error
    }
 }
 ```
 The error is:
 ```
 Main.java:5: error: int cannot be dereferenced
        System.out.println(x.toString());
                            ^
 1 error
 ```
 Specifically, Java's [`Object`](https://docs.oracle.com/javase/10/docs/api/java/lang/Object.html)
 and things that inherit from it are pointers under the hood, and we have to dereference them before
 the fields and methods they define can be used. In contrast, _primitive types are just values_ -
 there's nothing to be dereferenced. In memory, they're just a sequence of bits.
 If we really want, we can turn the `int` into an
 [`Integer`](https://docs.oracle.com/javase/10/docs/api/java/lang/Integer.html) and then dereference
 it, but it's a bit wasteful:
 ```java
 class Main {
    public static void main(String[] args) {
        int x = 8;
        Integer y = Integer.valueOf(x);
        System.out.println(y.toString());
    }
 }
 ```
 This creates the variable `y` of type `Integer` (which inherits `Object`), and at run time we
 dereference `y` to locate the `toString()` function and call it. Rust obviously handles things a bit
 differently, but we have to dig into the low-level details to see it in action.
 # Low Level Handling of Primitives (C)
 We first need to build a foundation for reading and understanding the assembly code the final answer
 requires. Let's begin with showing how the `C` language (and your computer) thinks about "primitive"
 values in memory:
 ```c
 void my_function(int num) {}
 int main() {
    int x = 8;
    my_function(x);
 }
 ```
 The [compiler explorer](https://godbolt.org/z/lgNYcc) gives us an easy way of showing off the
 assembly-level code that's generated: <span style="font-size:.6em">whose output has been lightly
 edited</span>
 ```nasm
 main:
        push    rbp
        mov     rbp, rsp
        sub     rsp, 16
        ; We assign the value `8` to `x` here
        mov     DWORD PTR [rbp-4], 8
        ; And copy the bits making up `x` to a location
        ; `my_function` can access (`edi`)
        mov     eax, DWORD PTR [rbp-4]
        mov     edi, eax
        ; Call `my_function` and give it control
        call    my_function
        mov     eax, 0
        leave
        ret
 my_function:
        push    rbp
        mov     rbp, rsp
        ; Copy the bits out of the pre-determined location (`edi`)
        ; to somewhere we can use
        mov     DWORD PTR [rbp-4], edi
        nop
        pop     rbp
        ret
 ```
 At a really low level of memory, we're copying bits around using the [`mov`][x86_guide] instruction;
 nothing crazy. But to show how similar Rust is, let's take a look at our program translated from C
 to Rust:
 ```rust
 fn my_function(x: i32) {}
 fn main() {
    let x = 8;
    my_function(x)
 }
 ```
 And the assembly generated when we stick it in the
 [compiler explorer](https://godbolt.org/z/cAlmk0): <span style="font-size:.6em">again, lightly
 edited</span>
 ```nasm
 example::main:
  push rax
  ; Look familiar? We're copying bits to a location for `my_function`
  ; The compiler just optimizes out holding `x` in memory
  mov edi, 8
  ; Call `my_function` and give it control
  call example::my_function
  pop rax
  ret
 example::my_function:
  sub rsp, 4
  ; And copying those bits again, just like in C
  mov dword ptr [rsp], edi
  add rsp, 4
  ret
 ```
 The generated Rust assembly is functionally pretty close to the C assembly: _When working with
 primitives, we're just dealing with bits in memory_.
 In Java we have to dereference a pointer to call its functions; in Rust, there's no pointer to
 dereference. So what exactly is going on with this `.to_string()` function call?
 # impl primitive (and Python)
 Now it's time to <strike>reveal my trap card</strike> show the revelation that tied all this
 together: _Rust has implementations for its primitive types._ That's right, `impl` blocks aren't
 only for `structs` and `traits`, primitives get them too. Don't believe me? Check out
 [u32](https://doc.rust-lang.org/std/primitive.u32.html),
 [f64](https://doc.rust-lang.org/std/primitive.f64.html) and
 [char](https://doc.rust-lang.org/std/primitive.char.html) as examples.
 But the really interesting bit is how Rust turns those `impl` blocks into assembly. Let's break out
 the [compiler explorer](https://godbolt.org/z/6LBEwq) once again:
 ```rust
 pub fn main() {
    8.to_string()
 }
 ```
 And the interesting bits in the assembly: <span style="font-size:.6em">heavily trimmed down</span>
 ```nasm
 example::main:
  sub rsp, 24
  mov rdi, rsp
  lea rax, [rip + .Lbyte_str.u]
  mov rsi, rax
  ; Cool stuff right here
  call <T as alloc::string::ToString>::to_string@PLT
  mov rdi, rsp
  call core::ptr::drop_in_place
  add rsp, 24
  ret
 ```
 Now, this assembly is a bit more complicated, but here's the big revelation: **we're calling
 `to_string()` as a function that exists all on its own, and giving it the instance of `8`**. Instead
 of thinking of the value 8 as an instance of `u32` and then peeking in to find the location of the
 function we want to call (like Java), we have a function that exists outside of the instance and
 just give that function the value `8`.
 This is an incredibly technical detail, but the interesting idea I had was this: _if `to_string()`
 is a static function, can I refer to the unbound function and give it an instance?_
 Better explained in code (and a [compiler explorer](https://godbolt.org/z/fJY-gA) link because I
 seriously love this thing):
 ```rust
 struct MyVal {
    x: u32
 }
 impl MyVal {
    fn to_string(&self) -> String {
        self.x.to_string()
    }
 }
 pub fn main() {
    let my_val = MyVal { x: 8 };
    // THESE ARE THE SAME
    my_val.to_string();
    MyVal::to_string(&my_val);
 }
 ```
 Rust is totally fine "binding" the function call to the instance, and also as a static.
 MIND == BLOWN.
 Python does the same thing where I can both call functions bound to their instances and also call as
 an unbound function where I give it the instance:
 ```python
 class MyClass():
    x = 24
    def my_function(self):
        print(self.x)
 m = MyClass()
 m.my_function()
 MyClass.my_function(m)
 ```
 And Python tries to make you _think_ that primitives can have instance methods...
 ```python
 >>> dir(8)
 ['__abs__', '__add__', '__and__', '__class__', '__cmp__', '__coerce__',
 '__delattr__', '__div__', '__divmod__', '__doc__', '__float__', '__floordiv__',
 ...
 '__setattr__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', '__truediv__',
 ...]
 >>> # Theoretically `8.__str__()` should exist, but:
 >>> 8.__str__()
  File "<stdin>", line 1
    8.__str__()
             ^
 SyntaxError: invalid syntax
 >>> # It will run if we assign it first though:
 >>> x = 8
 >>> x.__str__()
 '8'
 ```
 ...but in practice it's a bit complicated.
 So while Python handles binding instance methods in a way similar to Rust, it's still not able to
 run the example we started with.
 # Conclusion
 This was a super-roundabout way of demonstrating it, but the way Rust handles incredibly minor
 details like primitives leads to really cool effects. Primitives are optimized like C in how they
 have a space-efficient memory layout, yet the language still has a lot of features I enjoy in Python
 (like both instance and late binding).
 And when you put it together, there are areas where Rust does cool things nobody else can; as a
 quirky feature of Rust's type system, `8.to_string()` is actually valid code.
 Now go forth and fool your friends into thinking you know assembly. This is all I've got.
 [x86_guide]: http://www.cs.virginia.edu/~evans/cs216/guides/x86.html
 [java_primitive]: https://docs.oracle.com/javase/tutorial/java/nutsandbolts/datatypes.html
 [rust_scalar]: https://doc.rust-lang.org/book/second-edition/ch03-02-data-types.html#scalar-types
 [rust_primitive]: https://doc.rust-lang.org/book/first-edition/primitive-types.html
@@ -1,294 +0,0 @@
 ---
 layout: post
 title: "Isomorphic Desktop Apps with Rust"
 description: "Electron + WASM = ☣"
 category:
 tags: [rust, javascript, webassembly]
 ---
 Forgive me, but this is going to be a bit of a schizophrenic post. I both despise Javascript and the
 modern ECMAScript ecosystem, and I'm stunned by its success doing some really cool things. It's
 [this duality](https://www.destroyallsoftware.com/talks/the-birth-and-death-of-javascript) that's
 led me to a couple of (very) late nights over the past weeks trying to reconcile myself as I
 bootstrap a simple desktop application.
 See, as much as
 [Webassembly isn't trying to replace Javascript](https://webassembly.org/docs/faq/#is-webassembly-trying-to-replace-javascript),
 **I want Javascript gone**. There are plenty of people who don't share my views, and they are
 probably nicer and more fun at parties. But I cringe every time "Webpack" is mentioned, and I think
 it's hilarious that the
 [language specification](https://ecma-international.org/publications/standards/Ecma-262.htm)
 dramatically outpaces anyone's
 [actual implementation](https://kangax.github.io/compat-table/es2016plus/). The answer to this
 conundrum is of course to recompile code from newer versions of the language to older versions _of
 the same language_ before running. At least [Babel] is a nice tongue-in-cheek reference.
 Yet for as much hate as [Electron] receives, it does a stunningly good job at solving a really hard
 problem: _how the hell do I put a button on the screen and react when the user clicks it_? GUI
 programming is hard, straight up. But if browsers are already able to run everywhere, why don't we
 take advantage of someone else solving the hard problems for us? I don't like that I have to use
 Javascript for it, but I really don't feel inclined to whip out good ol' [wxWidgets].
 Now there are other native solutions ([libui-rs], [conrod], [oh hey wxWidgets again!][wxrust]), but
 those also have their own issues with distribution, styling, etc. With Electron, I can
 `yarn create electron-app my-app` and just get going, knowing that packaging/upgrades/etc. are built
 in.
 My question is: given recent innovations with WASM, _are we Electron yet_?
 No, not really.
 Instead, **what would it take to get to a point where we can skip Javascript in Electron apps?**
 # Setting the Stage
 Truth is, WASM/Webassembly is a pretty new technology and I'm a total beginner in this area. There
 may already be solutions to the issues I discuss, but I'm totally unaware of them, so I'm going to
 try and organize what I did manage to discover.
 I should also mention that the content and things I'm talking about here are not intended to be
 prescriptive, but more "if someone else is interested, what do we already know doesn't work?" _I
 expect everything in this post to be obsolete within two months._ Even over the course of writing
 this, [a separate blog post](https://mnt.io/2018/08/28/from-rust-to-beyond-the-asm-js-galaxy/) had
 to be modified because [upstream changes](https://github.com/WebAssembly/binaryen/pull/1642) broke a
 [Rust tool](https://github.com/rustwasm/wasm-bindgen/pull/787) the post tried to use. The post
 ultimately
 [got updated](https://mnt.io/2018/08/28/from-rust-to-beyond-the-asm-js-galaxy/#comment-477), **but
 all this happened within the span of a week.** Things are moving quickly.
 I'll also note that we're going to skip [asm.js] and [emscripten]. Truth be told, I couldn't get
 either of these to output anything, and so I'm just going to say
 [here be dragons.](https://en.wikipedia.org/wiki/Here_be_dragons) Everything I'm discussing here
 uses the `wasm32-unknown-unknown` target.
 The code that I _did_ get running is available
 [over here](https://github.com/speice-io/isomorphic-rust). Feel free to use it as a starting point,
 but I'm mostly including the link as a reference for the things that were attempted.
 # An Example Running Application
 So, I did _technically_ get a running application:
 ![Electron app using WASM](/assets/images/2018-09-15-electron-percy-wasm.png)
 ...which you can also try out if you want:
 ```sh
 git clone https://github.com/speice-io/isomorphic-rust.git
 cd isomorphic-rust/percy
 yarn install && yarn start
 ```
 ...but I wouldn't really call it a "high quality" starting point to base future work on. It's mostly
 there to prove this is possible in the first place. And that's something to be proud of! There's a
 huge amount of engineering that went into showing a window with the text "It's alive!".
 There are also a lot of usability issues that prevent me from recommending anyone try Electron and
 WASM apps at the moment, and I think that's the more important thing to discuss.
 # Issue the First: Complicated Toolchains
 I quickly established that [wasm-bindgen] was necessary to "link" my Rust code to Javascript. At
 that point you've got an Electron app that starts an HTML page which ultimately fetches your WASM
 blob. To keep things simple, the goal was to package everything using [webpack] so that I could just
 load a `bundle.js` file on the page. That decision was to be the last thing that kinda worked in
 this process.
 The first issue
 [I ran into](https://www.reddit.com/r/rust/comments/98lpun/unable_to_load_wasm_for_electron_application/)
 while attempting to bundle everything via `webpack` is a detail in the WASM spec:
 > This function accepts a Response object, or a promise for one, and ... **[if it] does not match
 > the `application/wasm` MIME type**, the returned promise will be rejected with a TypeError;
 >
 > [WebAssembly - Additional Web Embedding API](https://webassembly.org/docs/web/#additional-web-embedding-api)
 Specifically, if you try and load a WASM blob without the MIME type set, you'll get an error. On the
 web this isn't a huge issue, as the server can set MIME types when delivering the blob. With
 Electron, you're resolving things with a `file://` URL and thus can't control the MIME type:
 ![TypeError: Incorrect response MIME type. Expected 'application/wasm'.](/assets/images/2018-09-15-incorrect-MIME-type.png)
 There are a couple of solutions depending on how far into the deep end you care to venture:
 - Embed a static file server in your Electron application
 - Use a [custom protocol](https://electronjs.org/docs/api/protocol) and custom protocol handler
 - Host your WASM blob on a website that you resolve at runtime
 But all these are pretty bad solutions and defeat the purpose of using WASM in the first place.
 Instead, my workaround was to
 [open a PR with `webpack`](https://github.com/webpack/webpack/issues/7918) and use regex to remove
 calls to `instantiateStreaming` in the
 [build script](https://github.com/speice-io/isomorphic-rust/blob/master/percy/build.sh#L21-L25):
 ```sh
 cargo +nightly build --target=wasm32-unknown-unknown && \
    wasm-bindgen "$WASM_DIR/debug/$WASM_NAME.wasm" --out-dir "$APP_DIR" --no-typescript && \
    # Have to use --mode=development so we can patch out the call to instantiateStreaming
    "$DIR/node_modules/webpack-cli/bin/cli.js" --mode=development "$APP_DIR/app_loader.js" -o "$APP_DIR/bundle.js" && \
    sed -i 's/.*instantiateStreaming.*//g' "$APP_DIR/bundle.js"
 ```
 Once that lands, the
 [build process](https://github.com/speice-io/isomorphic-rust/blob/master/percy_patched_webpack/build.sh#L24-L27)
 becomes much simpler:
 ```sh
 cargo +nightly build --target=wasm32-unknown-unknown && \
    wasm-bindgen "$WASM_DIR/debug/$WASM_NAME.wasm" --out-dir "$APP_DIR" --no-typescript && \
    "$DIR/node_modules/webpack-cli/bin/cli.js" --mode=production "$APP_DIR/app_loader.js" -o "$APP_DIR/bundle.js"
 ```
 But we're not done yet! After we compile Rust into WASM and link WASM to Javascript (via
 `wasm-bindgen` and `webpack`), we still have to make an Electron app. For this purpose I used a
 starter app from [Electron Forge], and then a
 [`prestart` script](https://github.com/speice-io/isomorphic-rust/blob/master/percy/package.json#L8)
 to actually handle starting the application.
 The
 [final toolchain](https://github.com/speice-io/isomorphic-rust/blob/master/percy/package.json#L8)
 looks something like this:
 - `yarn start` triggers the `prestart` script
 - `prestart` checks for missing tools (`wasm-bindgen-cli`, etc.) and then:
  - Uses `cargo` to compile the Rust code into WASM
  - Uses `wasm-bindgen` to link the WASM blob into a Javascript file with exported symbols
  - Uses `webpack` to bundle the page start script with the Javascript we just generated
    - Uses `babel` under the hood to compile the `wasm-bindgen` code down from ES6 into something
      browser-compatible
 - The `start` script runs an Electron Forge handler to do some sanity checks
 - Electron actually starts
 ...which is complicated. I think more work needs to be done to either build a high-quality starter
 app that can manage these steps, or another tool that "just handles" the complexity of linking a
 compiled WASM file into something the Electron browser can run.
 # Issue the Second: WASM tools in Rust
 For as much as I didn't enjoy the Javascript tooling needed to interface with Rust, the Rust-only
 bits aren't any better at the moment. I get it, a lot of projects are just starting off, and that
 leads to a fragmented ecosystem. Here's what I can recommend as a starting point:
 Don't check in your `Cargo.lock` files to version control. If there's a disagreement between the
 version of `wasm-bindgen-cli` you have installed and the `wasm-bindgen` you're compiling with in
 `Cargo.lock`, you get a nasty error:
 ```
 it looks like the Rust project used to create this wasm file was linked against
 a different version of wasm-bindgen than this binary:
 rust wasm file: 0.2.21
    this binary: 0.2.17
 Currently the bindgen format is unstable enough that these two version must
 exactly match, so it's required that these two version are kept in sync by
 either updating the wasm-bindgen dependency or this binary.
 ```
 Not that I ever managed to run into this myself (_coughs nervously_).
 There are two projects attempting to be "application frameworks": [percy] and [yew]. Between those,
 I managed to get [two](https://github.com/speice-io/isomorphic-rust/tree/master/percy)
 [examples](https://github.com/speice-io/isomorphic-rust/tree/master/percy_patched_webpack) running
 using `percy`, but was unable to get an
 [example](https://github.com/speice-io/isomorphic-rust/tree/master/yew) running with `yew` because
 of issues with "missing modules" during the `webpack` step:
 ```sh
 ERROR in ./dist/electron_yew_wasm_bg.wasm
 Module not found: Error: Can't resolve 'env' in '/home/bspeice/Development/isomorphic_rust/yew/dist'
 @ ./dist/electron_yew_wasm_bg.wasm
 @ ./dist/electron_yew_wasm.js
 @ ./dist/app.js
 @ ./dist/app_loader.js
 ```
 If you want to work with the browser APIs directly, your choices are [percy-webapis] or [stdweb] (or
 eventually [web-sys]). See above for my `percy` examples, but when I tried
 [an example with `stdweb`](https://github.com/speice-io/isomorphic-rust/tree/master/stdweb), I was
 unable to get it running:
 ```sh
 ERROR in ./dist/stdweb_electron_bg.wasm
 Module not found: Error: Can't resolve 'env' in '/home/bspeice/Development/isomorphic_rust/stdweb/dist'
 @ ./dist/stdweb_electron_bg.wasm
 @ ./dist/stdweb_electron.js
 @ ./dist/app_loader.js
 ```
 At this point I'm pretty convinced that `stdweb` is causing issues for `yew` as well, but can't
 prove it.
 I did also get a [minimal example](https://github.com/speice-io/isomorphic-rust/tree/master/minimal)
 running that doesn't depend on any tools besides `wasm-bindgen`. However, it requires manually
 writing "`extern C`" blocks for everything you need from the browser. Es no bueno.
 Finally, from a tools and platform view, there are two up-and-coming packages that should be
 mentioned: [js-sys] and [web-sys]. Their purpose is to be fundamental building blocks that expose
 the browser's APIs to Rust. If you're interested in building an app framework from scratch, these
 should give you the most flexibility. I didn't touch either in my research, though I expect them to
 be essential long-term.
 So there's a lot in play from the Rust side of things, and it's just going to take some time to
 figure out what works and what doesn't.
 # Issue the Third: Known Unknowns
 Alright, so after I managed to get an application started, I stopped there. It was a good deal of
 effort to chain together even a proof of concept, and at this point I'd rather learn [Typescript]
 than keep trying to maintain an incredibly brittle pipeline. Blasphemy, I know...
 The important point I want to make is that there's a lot unknown about how any of this holds up
 outside proofs of concept. Things I didn't attempt:
 - Testing
 - Packaging
 - Updates
 - Literally anything related to why I wanted to use Electron in the first place
 # What it Would Take
 Much as I don't like Javascript, the tools are too shaky for me to recommend mixing Electron and
 WASM at the moment. There's a lot of innovation happening, so who knows? Someone might have an
 application in production a couple months from now. But at the moment, I'm personally going to stay
 away.
 Let's finish with a wishlist then - here are the things that I think need to happen before
 Electron/WASM/Rust can become a thing:
 - Webpack still needs some updates. The necessary work is in progress, but hasn't landed yet
  ([#7983](https://github.com/webpack/webpack/pull/7983))
 - Browser API libraries (`web-sys` and `stdweb`) need to make sure they can support running in
  Electron (see module error above)
 - Projects need to stabilize. There's talk of `stdweb` being turned into a Rust API
  [on top of web-sys](https://github.com/rustwasm/team/issues/226#issuecomment-418475778), and percy
  [moving to web-sys](https://github.com/chinedufn/percy/issues/24), both of which are big changes
 - `wasm-bindgen` is great, but still in the "move fast and break things" phase
 - A good "boilerplate" app would dramatically simplify the start-up costs;
  [electron-react-boilerplate](https://github.com/chentsulin/electron-react-boilerplate) comes to
  mind as a good project to imitate
 - More blog posts/contributors! I think Electron + Rust could be cool, but I have no idea what I'm
  doing
 [wxwidgets]: https://wxwidgets.org/
 [libui-rs]: https://github.com/LeoTindall/libui-rs/
 [electron]: https://electronjs.org/
 [babel]: https://babeljs.io/
 [wxrust]: https://github.com/kenz-gelsoft/wxRust
 [wasm-bindgen]: https://github.com/rustwasm/wasm-bindgen
 [js-sys]: https://crates.io/crates/js-sys
 [percy-webapis]: https://crates.io/crates/percy-webapis
 [stdweb]: https://crates.io/crates/stdweb
 [web-sys]: https://crates.io/crates/web-sys
 [percy]: https://chinedufn.github.io/percy/
 [virtual-dom-rs]: https://crates.io/crates/virtual-dom-rs
 [yew]: https://github.com/DenisKolodin/yew
 [react]: https://reactjs.org/
 [elm]: http://elm-lang.org/
 [asm.js]: http://asmjs.org/
 [emscripten]: https://kripken.github.io/emscripten-site/
 [typescript]: https://www.typescriptlang.org/
 [electron forge]: https://electronforge.io/
 [conrod]: https://github.com/PistonDevelopers/conrod
 [webpack]: https://webpack.js.org/
@@ -1,168 +0,0 @@
 ---
 layout: post
 title: "A Case Study in Heaptrack"
 description: "...because you don't need no garbage collection"
 category:
 tags: []
 ---
 One of my earliest conversations about programming went like this:
 > Programmers have it too easy these days. They should learn to develop in low memory environments
 > and be more efficient.
 >
 > -- My Father (paraphrased)
 ...though it's not like the first code I wrote was for a
 [graphing calculator](https://education.ti.com/en/products/calculators/graphing-calculators/ti-84-plus-se)
 packing a whole 24KB of RAM. By the way, _what are you doing on my lawn?_
 The principle remains though: be efficient with the resources you have, because
 [what Intel giveth, Microsoft taketh away](http://exo-blog.blogspot.com/2007/09/what-intel-giveth-microsoft-taketh-away.html).
 My professional work is focused on this kind of efficiency; low-latency financial markets demand
 that you understand at a deep level _exactly_ what your code is doing. As I continue experimenting
 with Rust for personal projects, it's exciting to bring a utilitarian mindset with me: there's
 flexibility for the times I pretend to have a garbage collector, and flexibility for the times that
 I really care about how memory is used.
 This post is a (small) case study in how I went from the former to the latter. And ultimately, it's
 intended to be a starting toolkit to empower analysis of your own code.
 # Curiosity
 When I first started building the [dtparse] crate, my intention was to mirror as closely as possible
 the equivalent [Python library][dateutil]. Python, as you may know, is garbage collected. Very
 rarely is memory usage considered in Python, and I likewise wasn't paying too much attention when
 `dtparse` was first being built.
 This lackadaisical approach to memory works well enough, and I'm not planning on making `dtparse`
 hyper-efficient. But every so often, I've wondered: "what exactly is going on in memory?" With the
 advent of Rust 1.28 and the
 [Global Allocator trait](https://doc.rust-lang.org/std/alloc/trait.GlobalAlloc.html), I had a really
 great idea: _build a custom allocator that allows you to track your own allocations._ That way, you
 can do things like writing tests for both correct results and correct memory usage. I gave it a
 [shot][qadapt], but learned very quickly: **never write your own allocator**. It went from "fun
 weekend project" to "I have literally no idea what my computer is doing" at breakneck speed.
 Instead, I'll highlight a separate path I took to make sense of my memory usage: [heaptrack].
 # Turning on the System Allocator
 This is the hardest part of the post. Because Rust uses
 [its own allocator](https://github.com/rust-lang/rust/pull/27400#issue-41256384) by default,
 `heaptrack` is unable to properly record unmodified Rust code. To remedy this, we'll make use of the
 `#[global_allocator]` attribute.
 Specifically, in `lib.rs` or `main.rs`, add this:
 ```rust
 use std::alloc::System;
 #[global_allocator]
 static GLOBAL: System = System;
 ```
 ...and that's it. Everything else comes essentially for free.
 # Running heaptrack
 Assuming you've installed heaptrack <span style="font-size: .6em;">(Homebrew in Mac, package manager
 in Linux, ??? in Windows)</span>, all that's left is to fire up your application:
 ```
 heaptrack my_application
 ```
 It's that easy. After the program finishes, you'll see a file in your local directory with a name
 like `heaptrack.my_application.XXXX.gz`. If you load that up in `heaptrack_gui`, you'll see
 something like this:
 ![heaptrack](/assets/images/2018-10-heaptrack/heaptrack-before.png)
 ---
 And even these pretty colors:
 ![pretty colors](/assets/images/2018-10-heaptrack/heaptrack-flamegraph.png)
 # Reading Flamegraphs
 To make sense of our memory usage, we're going to focus on that last picture - it's called a
 ["flamegraph"](http://www.brendangregg.com/flamegraphs.html). These charts are typically used to
 show how much time your program spends executing each function, but they're used here to show how
 much memory was allocated during those functions instead.
 For example, we can see that all allocations happened during the `main` function:
 ![allocations in main](/assets/images/2018-10-heaptrack/heaptrack-main-colorized.png)
 ...and within that, all allocations happened during `dtparse::parse`:
 ![allocations in dtparse](/assets/images/2018-10-heaptrack/heaptrack-dtparse-colorized.png)
 ...and within _that_, allocations happened in two different places:
 ![allocations in parseinfo](/assets/images/2018-10-heaptrack/heaptrack-parseinfo-colorized.png)
 Now I apologize that it's hard to see, but there's one area specifically that stuck out as an issue:
 **what the heck is the `Default` thing doing?**
 ![pretty colors](/assets/images/2018-10-heaptrack/heaptrack-flamegraph-default.png)
 # Optimizing dtparse
 See, I knew that there were some allocations during calls to `dtparse::parse`, but I was totally
 wrong about where the bulk of allocations occurred in my program. Let me post the code and see if
 you can spot the mistake:
 ```rust
 /// Main entry point for using `dtparse`.
 pub fn parse(timestr: &str) -> ParseResult<(NaiveDateTime, Option<FixedOffset>)> {
    let res = Parser::default().parse(
        timestr, None, None, false, false,
        None, false,
        &HashMap::new(),
    )?;
    Ok((res.0, res.1))
 }
 ```
 > [dtparse](https://github.com/bspeice/dtparse/blob/4d7c5dd99572823fa4a390b483c38ab020a2172f/src/lib.rs#L1286)
 ---
 Because `Parser::parse` requires a mutable reference to itself, I have to create a new
 `Parser::default` every time it receives a string. This is excessive! We'd rather have an immutable
 parser that can be re-used, and avoid allocating memory in the first place.
 Armed with that information, I put some time in to
 [make the parser immutable](https://github.com/bspeice/dtparse/commit/741afa34517d6bc1155713bbc5d66905fea13fad#diff-b4aea3e418ccdb71239b96952d9cddb6).
 Now that I can re-use the same parser over and over, the allocations disappear:
 ![allocations cleaned up](/assets/images/2018-10-heaptrack/heaptrack-flamegraph-after.png)
 In total, we went from requiring 2 MB of memory in
 [version 1.0.2](https://crates.io/crates/dtparse/1.0.2):
 ![memory before](/assets/images/2018-10-heaptrack/heaptrack-closeup.png)
 All the way down to 300KB in [version 1.0.3](https://crates.io/crates/dtparse/1.0.3):
 ![memory after](/assets/images/2018-10-heaptrack/heaptrack-closeup-after.png)
 # Conclusion
 In the end, you don't need to write a custom allocator to be efficient with memory; great tools
 already exist to help you understand what your program is doing.
 **Use them.**
 Given that [Moore's Law](https://en.wikipedia.org/wiki/Moore%27s_law) is
 [dead](https://www.technologyreview.com/s/601441/moores-law-is-dead-now-what/), we've all got to do
 our part to take back what Microsoft stole.
 [dtparse]: https://crates.io/crates/dtparse
 [dateutil]: https://github.com/dateutil/dateutil
 [heaptrack]: https://github.com/KDE/heaptrack
 [qadapt]: https://crates.io/crates/qadapt
@@ -1,34 +0,0 @@
 ---
 layout: post
 title: 'More "What Companies Really Mean"'
 description: 'when they ask "Why should we hire you?"'
 category:
 tags: []
 ---
 I recently stumbled across a phenomenal small article entitled
 [What Startups Really Mean By "Why Should We Hire You?"](https://angel.co/blog/what-startups-really-mean-by-why-should-we-hire-you).
 Having been interviewed by smaller companies (though not exactly startups), the questions and
 subtexts are the same. There's often a question behind the question that you're actually trying to
 answer, and I wish I had spotted the nuance earlier in my career.
 Let me also make note of one more question/euphemism I've come across:
 # How do you feel about Production Support?
 **Translation**: _We're a fairly small team, and when things break on an evening/weekend/Christmas
 Day, can we call on you to be there?_
 I've met decidedly few people in my life who truly enjoy the "ops" side of "devops". They're
 incredibly good at taking an impossible problem, pre-existing knowledge of arcane arts, and turning
 that into a functioning system at the end. And if they all left for lunch, we probably wouldn't make
 it out the door before the zombie apocalypse.
 Larger organizations (in my experience, 500+ person organizations) have the luxury of hiring people
 who either enjoy that, or play along nicely enough that our systems keep working.
 Small teams have no such luck. If you're interviewing at a small company, especially as a "data
 scientist" or other somesuch position, be aware that systems can and do spontaneously combust at the
 most inopportune moments.
 **Terrible-but-popular answers include**: _It's a part of the job, and I'm happy to contribute._
@@ -1,218 +0,0 @@
 ---
 layout: post
 title: "QADAPT - debug_assert! for your memory usage"
 description: "...and why you want an allocator that goes 💥."
 category:
 tags: []
 ---
 I think it's part of the human condition to ignore perfectly good advice when it comes our way. A
 bit over a month ago, I was dispensing sage wisdom for the ages:
 > I had a really great idea: build a custom allocator that allows you to track your own allocations.
 > I gave it a shot, but learned very quickly: **never write your own allocator.**
 >
 > -- [me](/2018/10/case-study-optimization.html)
 I proceeded to ignore it, because we never really learn from our mistakes.
 There's another part of the human condition that derives joy from seeing things explode.
 <iframe src="https://giphy.com/embed/YA6dmVW0gfIw8" width="480" height="336" frameBorder="0"></iframe>
 And _that's_ the part I'm going to focus on.
 # Why an Allocator?
 So why, after complaining about allocators, would I still want to write one? There are three reasons
 for that:
 1. Allocation/dropping is slow
 2. It's difficult to know exactly when Rust will allocate or drop, especially when using code that
   you did not write
 3. I want automated tools to verify behavior, instead of inspecting by hand
 When I say "slow," it's important to define the terms. If you're writing web applications, you'll
 spend orders of magnitude more time waiting for the database than you will the allocator. However,
 there's still plenty of code where micro- or nano-seconds matter; think
 [finance](https://www.youtube.com/watch?v=NH1Tta7purM),
 [real-time audio](https://www.reddit.com/r/rust/comments/9hg7yj/synthesizer_progress_update/e6c291f),
 [self-driving cars](https://polysync.io/blog/session-types-for-hearty-codecs/), and
 [networking](https://carllerche.github.io/bytes/bytes/index.html). In these situations it's simply
 unacceptable for you to spend time doing things that are not your program, and waiting on the
 allocator is not cool.
 As I continue to learn Rust, it's difficult for me to predict where exactly allocations will happen.
 So, I propose we play a quick trivia game: **Does this code invoke the allocator?**
 ## Example 1
 ```rust
 fn my_function() {
    let v: Vec<u8> = Vec::new();
 }
 ```
 **No**: Rust [knows how big](https://doc.rust-lang.org/std/mem/fn.size_of.html) the `Vec` type is,
 and reserves a fixed amount of memory on the stack for the `v` vector. However, if we wanted to
 reserve extra space (using `Vec::with_capacity`) the allocator would get invoked.
 ## Example 2
 ```rust
 fn my_function() {
    let v: Box<Vec<u8>> = Box::new(Vec::new());
 }
 ```
 **Yes**: Because Boxes allow us to work with things that are of unknown size, it has to allocate on
 the heap. While the `Box` is unnecessary in this snippet (release builds will optimize out the
 allocation), reserving heap space more generally is needed to pass a dynamically sized type to
 another function.
 ## Example 3
 ```rust
 fn my_function(mut v: Vec<u8>) {
    v.push(5);
 }
 ```
 **Maybe**: Depending on whether the Vector we were given has space available, we may or may not
 allocate. Especially when dealing with code that you did not author, it's difficult to verify that
 things behave as you expect them to.
 # Blowing Things Up
 So, how exactly does QADAPT solve these problems? **Whenever an allocation or drop occurs in code
 marked allocation-safe, QADAPT triggers a thread panic.** We don't want to let the program continue
 as if nothing strange happened, _we want things to explode_.
 However, you don't want code to panic in production because of circumstances you didn't predict.
 Just like [`debug_assert!`](https://doc.rust-lang.org/std/macro.debug_assert.html), **QADAPT will
 strip out its own code when building in release mode to guarantee no panics and no performance
 impact.**
 Finally, there are three ways to have QADAPT check that your code will not invoke the allocator:
 ## Using a procedural macro
 The easiest method: watch an entire function for allocator invocation.
 ```rust
 use qadapt::no_alloc;
 use qadapt::QADAPT;
 #[global_allocator]
 static Q: QADAPT = QADAPT;
 #[no_alloc]
 fn push_vec(v: &mut Vec<u8>) {
    // This triggers a panic if v.len() == v.capacity()
    v.push(5);
 }
 fn main() {
    let mut v = Vec::with_capacity(1);
    // This will *not* trigger a panic
    push_vec(&mut v);
    // This *will* trigger a panic
    push_vec(&mut v);
 }
 ```
 ## Using a regular macro
 For times when you need more precision:
 ```rust
 use qadapt::assert_no_alloc;
 use qadapt::QADAPT;
 #[global_allocator]
 static Q: QADAPT = QADAPT;
 fn main() {
    let mut v = Vec::with_capacity(1);
    // No allocations here, we already have space reserved
    assert_no_alloc!(v.push(5));
    // Even though we remove an item, it doesn't trigger a drop
    // because it's a scalar. If it were a `Box<_>` type,
    // a drop would trigger.
    assert_no_alloc!({
        v.pop().unwrap();
    });
 }
 ```
 ## Using function calls
 Both the most precise and most tedious:
 ```rust
 use qadapt::enter_protected;
 use qadapt::exit_protected;
 use qadapt::QADAPT;
 #[global_allocator]
 static Q: QADAPT = QADAPT;
 fn main() {
    // This triggers an allocation (on non-release builds)
    let mut v = Vec::with_capacity(1);
    enter_protected();
    // This does not trigger an allocation because we've reserved size
    v.push(0);
    exit_protected();
    // This triggers an allocation because we ran out of size,
    // but doesn't panic because we're no longer protected.
    v.push(1);
 }
 ```
 ## Caveats
 It's important to point out that QADAPT code is synchronous, so please be careful when mixing in
 asynchronous functions:
 ```rust
 use futures::future::Future;
 use futures::future::ok;
 #[no_alloc]
 fn async_capacity() -> impl Future<Item=Vec<u8>, Error=()> {
    ok(12).and_then(|e| Ok(Vec::with_capacity(e)))
 }
 fn main() {
    // This doesn't trigger a panic because the `and_then` closure
    // wasn't run during the function call.
    async_capacity();
    // Still no panic
    assert_no_alloc!(async_capacity());
    // This will panic because the allocation happens during `unwrap`
    // in the `assert_no_alloc!` macro
    assert_no_alloc!(async_capacity().poll().unwrap());
 }
 ```
 # Conclusion
 While there's a lot more to writing high-performance code than managing your usage of the allocator,
 it's critical that you do use the allocator correctly. QADAPT will verify that your code is doing
 what you expect. It's usable even on stable Rust from version 1.31 onward, which isn't the case for
 most allocators. Version 1.0 was released today, and you can check it out over at
 [crates.io](https://crates.io/crates/qadapt) or on [github](https://github.com/bspeice/qadapt).
 I'm hoping to write more about high-performance Rust in the future, and I expect that QADAPT will
 help guide that. If there are topics you're interested in, let me know in the comments below!
 [qadapt]: https://crates.io/crates/qadapt
@@ -1,113 +0,0 @@
 ---
 layout: post
 title: "Allocations in Rust"
 description: "An introduction to the memory model."
 category:
 tags: [rust, understanding-allocations]
 ---
 There's an alchemy of distilling complex technical topics into articles and videos that change the
 way programmers see the tools they interact with on a regular basis. I knew what a linker was, but
 there's a staggering amount of complexity in between
 [the OS and `main()`](https://www.youtube.com/watch?v=dOfucXtyEsU). Rust programmers use the
 [`Box`](https://doc.rust-lang.org/stable/std/boxed/struct.Box.html) type all the time, but there's a
 rich history of the Rust language itself wrapped up in
 [how special it is](https://manishearth.github.io/blog/2017/01/10/rust-tidbits-box-is-special/).
 In a similar vein, this series attempts to look at code and understand how memory is used; the
 complex choreography of operating system, compiler, and program that frees you to focus on
 functionality far-flung from frivolous book-keeping. The Rust compiler relieves a great deal of the
 cognitive burden associated with memory management, but we're going to step into its world for a
 while.
 Let's learn a bit about memory in Rust.
 # Table of Contents
 This series is intended as both learning and reference material; we'll work through the different
 memory types Rust uses, and explain the implications of each. Ultimately, a summary will be provided
 as a cheat sheet for easy future reference. To that end, a table of contents is in order:
 - Foreword
 - [Global Memory Usage: The Whole World](/2019/02/the-whole-world.html)
 - [Fixed Memory: Stacking Up](/2019/02/stacking-up.html)
 - [Dynamic Memory: A Heaping Helping](/2019/02/a-heaping-helping.html)
 - [Compiler Optimizations: What It's Done For You Lately](/2019/02/compiler-optimizations.html)
 - [Summary: What Are the Rules?](/2019/02/summary.html)
 # Foreword
 Rust's three defining features of
 [Performance, Reliability, and Productivity](https://www.rust-lang.org/) are all driven to a great
 degree by how the Rust compiler understands memory usage. Unlike managed memory languages (Java,
 Python), Rust
 [doesn't really](https://words.steveklabnik.com/borrow-checking-escape-analysis-and-the-generational-hypothesis)
 garbage collect; instead, it uses an
 [ownership](https://doc.rust-lang.org/book/ch04-01-what-is-ownership.html) system to reason about
 how long objects will last in your program. In some cases, if the life of an object is fairly
 transient, Rust can make use of a very fast region called the "stack." When that's not possible,
 Rust uses
 [dynamic (heap) memory](https://en.wikipedia.org/wiki/Memory_management#Dynamic_memory_allocation)
 and the ownership system to ensure you can't accidentally corrupt memory. It's not as fast, but it
 is important to have available.
 That said, there are specific situations in Rust where you'd never need to worry about the
 stack/heap distinction! If you:
 1. Never use `unsafe`
 2. Never use `#![feature(alloc)]` or the [`alloc` crate](https://doc.rust-lang.org/alloc/index.html)
 ...then it's not possible for you to use dynamic memory!
 For some uses of Rust, typically embedded devices, these constraints are OK. They have very limited
 memory, and the program binary size itself may significantly affect what's available! There's no
 operating system able to manage this
 ["virtual memory"](https://en.wikipedia.org/wiki/Virtual_memory) thing, but that's not an issue
 because there's only one running application. The
 [embedonomicon](https://docs.rust-embedded.org/embedonomicon/preface.html) is ever in mind, and
 interacting with the "real world" through extra peripherals is accomplished by reading and writing
 to [specific memory addresses](https://bob.cs.sonoma.edu/IntroCompOrg-RPi/sec-gpio-mem.html).
 Most Rust programs find these requirements overly burdensome though. C++ developers would struggle
 without access to [`std::vector`](https://en.cppreference.com/w/cpp/container/vector) (except those
 hardcore no-STL people), and Rust developers would struggle without
 [`std::vec`](https://doc.rust-lang.org/std/vec/struct.Vec.html). But with the constraints above,
 `std::vec` is actually a part of the
 [`alloc` crate](https://doc.rust-lang.org/alloc/vec/struct.Vec.html), and thus off-limits. `Box`,
 `Rc`, etc., are also unusable for the same reason.
 Whether writing code for embedded devices or not, the important thing in both situations is how much
 you know _before your application starts_ about what its memory usage will look like. In embedded
 devices, there's a small, fixed amount of memory to use. In a browser, you have no idea how large
 [google.com](https://www.google.com)'s home page is until you start trying to download it. The
 compiler uses this knowledge (or lack thereof) to optimize how memory is used; put simply, your code
 runs faster when the compiler can guarantee exactly how much memory your program needs while it's
 running. This series is all about understanding how the compiler reasons about your program, with an
 emphasis on the implications for performance.
 Now let's address some conditions and caveats before going much further:
 - We'll focus on "safe" Rust only; `unsafe` lets you use platform-specific allocation APIs
  ([`malloc`](https://www.tutorialspoint.com/c_standard_library/c_function_malloc.htm)) that we'll
  ignore.
 - We'll assume a "debug" build of Rust code (what you get with `cargo run` and `cargo test`) and
  address (pun intended) release mode at the end (`cargo run --release` and `cargo test --release`).
 - All content will be run using Rust 1.32, as that's the highest currently supported in the
  [Compiler Explorer](https://godbolt.org/). As such, we'll avoid upcoming innovations like
  [compile-time evaluation of `static`](https://github.com/rust-lang/rfcs/blob/master/text/0911-const-fn.md)
  that are available in nightly.
 - Because of the nature of the content, being able to read assembly is helpful. We'll keep it
  simple, but I [found](https://stackoverflow.com/a/4584131/1454178) a
  [refresher](https://stackoverflow.com/a/26026278/1454178) on the `push` and `pop`
  [instructions](http://www.cs.virginia.edu/~evans/cs216/guides/x86.html) was helpful while writing
  this.
 - I've tried to be precise in saying only what I can prove using the tools (ASM, docs) that are
  available, but if there's something said in error it will be corrected expeditiously. Please let
  me know at [bradlee@speice.io](mailto:bradlee@speice.io).
 Finally, I'll do what I can to flag potential future changes but the Rust docs have a notice worth
 repeating:
 > Rust does not currently have a rigorously and formally defined memory model.
 >
 > -- [the docs](https://doc.rust-lang.org/std/ptr/fn.read_volatile.html)
@@ -1,337 +0,0 @@
 ---
 layout: post
 title: "Global Memory Usage: The Whole World"
 description: "Static considered slightly less harmful."
 category:
 tags: [rust, understanding-allocations]
 ---
 The first memory type we'll look at is pretty special: when Rust can prove that a _value_ is fixed
 for the life of a program (`const`), and when a _reference_ is unique for the life of a program
 (`static` as a declaration, not
 [`'static`](https://doc.rust-lang.org/book/ch10-03-lifetime-syntax.html#the-static-lifetime) as a
 lifetime), we can make use of global memory. This special section of data is embedded directly in
 the program binary so that variables are ready to go once the program loads; no additional
 computation is necessary.
 Understanding the value/reference distinction is important for reasons we'll go into below, and
 while the
 [full specification](https://github.com/rust-lang/rfcs/blob/master/text/0246-const-vs-static.md) for
 these two keywords is available, we'll take a hands-on approach to the topic.
 # **const**
 When a _value_ is guaranteed to be unchanging in your program (where "value" may be scalars,
 `struct`s, etc.), you can declare it `const`. This tells the compiler that it's safe to treat the
 value as never changing, and enables some interesting optimizations; not only is there no
 initialization cost to creating the value (it is loaded at the same time as the executable parts of
 your program), but the compiler can also copy the value around if it speeds up the code.
 The points we need to address when talking about `const` are:
 - `const` values are stored in read-only memory - they're impossible to modify.
 - Values resulting from calling a `const fn` are materialized at compile-time.
 - The compiler may (or may not) copy `const` values wherever it chooses.
 ## Read-Only
 The first point is a bit strange - "read-only memory."
 [The Rust book](https://doc.rust-lang.org/book/ch03-01-variables-and-mutability.html#differences-between-variables-and-constants)
 mentions in a couple places that using `mut` with constants is illegal, but it's also important to
 demonstrate just how immutable they are. _Typically_ in Rust you can use
 [interior mutability](https://doc.rust-lang.org/book/ch15-05-interior-mutability.html) to modify
 things that aren't declared `mut`.
 [`RefCell`](https://doc.rust-lang.org/std/cell/struct.RefCell.html) provides an example of this
 pattern in action:
 ```rust
 use std::cell::RefCell;
 fn my_mutator(cell: &RefCell<u8>) {
    // Even though we're given an immutable reference,
    // the `replace` method allows us to modify the inner value.
    cell.replace(14);
 }
 fn main() {
    let cell = RefCell::new(25);
    // Prints out 25
    println!("Cell: {:?}", cell);
    my_mutator(&cell);
    // Prints out 14
    println!("Cell: {:?}", cell);
 }
 ```
 --
 [Rust Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=8e4bea1a718edaff4507944e825a54b2)
 When `const` is involved though, interior mutability is impossible:
 ```rust
 use std::cell::RefCell;
 const CELL: RefCell<u8> = RefCell::new(25);
 fn my_mutator(cell: &RefCell<u8>) {
    cell.replace(14);
 }
 fn main() {
    // First line prints 25 as expected
    println!("Cell: {:?}", &CELL);
    my_mutator(&CELL);
    // Second line *still* prints 25
    println!("Cell: {:?}", &CELL);
 }
 ```
 --
 [Rust Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=88fe98110c33c1b3a51e341f48b8ae00)
 And a second example using [`Once`](https://doc.rust-lang.org/std/sync/struct.Once.html):
 ```rust
 use std::sync::Once;
 const SURPRISE: Once = Once::new();
 fn main() {
    // This is how `Once` is supposed to be used
    SURPRISE.call_once(|| println!("Initializing..."));
    // Because `Once` is a `const` value, we never record it
    // having been initialized the first time, and this closure
    // will also execute.
    SURPRISE.call_once(|| println!("Initializing again???"));
 }
 ```
 --
 [Rust Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=c3cc5979b5e5434eca0f9ec4a06ee0ed)
 When the
 [`const` specification](https://github.com/rust-lang/rfcs/blob/26197104b7bb9a5a35db243d639aee6e46d35d75/text/0246-const-vs-static.md)
 refers to ["rvalues"](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2010/n3055.pdf), this
 behavior is what it refers to. [Clippy](https://github.com/rust-lang/rust-clippy) will treat this
 as an error, but it's still something to be aware of.
 ## Initialization == Compilation
 The next thing to mention is that `const` values are loaded into memory _as part of your program
 binary_. Because of this, any `const` values declared in your program will be "realized" at
 compile-time; accessing them may trigger a main-memory lookup (with a fixed address, so your CPU may
 be able to prefetch the value), but that's it.
 ```rust
 use std::cell::RefCell;
 const CELL: RefCell<u32> = RefCell::new(24);
 pub fn multiply(value: u32) -> u32 {
    // CELL is stored at `.L__unnamed_1`
    value * (*CELL.get_mut())
 }
 ```
 -- [Compiler Explorer](https://godbolt.org/z/Th8boO)
 The compiler creates one `RefCell`, uses it everywhere, and never needs to call the `RefCell::new`
 function.
 ## Copying
 If it's helpful though, the compiler can choose to copy `const` values.
 ```rust
 const FACTOR: u32 = 1000;
 pub fn multiply(value: u32) -> u32 {
    // See assembly line 4 for the `mov edi, 1000` instruction
    value * FACTOR
 }
 pub fn multiply_twice(value: u32) -> u32 {
    // See assembly lines 22 and 29 for `mov edi, 1000` instructions
    value * FACTOR * FACTOR
 }
 ```
 -- [Compiler Explorer](https://godbolt.org/z/ZtS54X)
 In this example, the `FACTOR` value is turned into the `mov edi, 1000` instruction in both the
 `multiply` and `multiply_twice` functions; the "1000" value is never "stored" anywhere, as it's
 small enough to inline into the assembly instructions.
 Finally, getting the address of a `const` value is possible, but not guaranteed to be unique
 (because the compiler can choose to copy values). I was unable to get non-unique pointers in my
 testing (even using different crates), but the specifications are clear enough: _don't rely on
 pointers to `const` values being consistent_. To be frank, caring about locations for `const` values
 is almost certainly a code smell.
 # **static**
 Static variables are related to `const` variables, but take a slightly different approach. When we
 declare that a _reference_ is unique for the life of a program, we have a `static` variable
 (unrelated to the `'static` lifetime). Because of the reference/value distinction with
 `const`/`static`, static variables behave much more like typical "global" variables.
 But to understand `static`, here's what we'll look at:
 - `static` variables are globally unique locations in memory.
 - Like `const`, `static` variables are loaded at the same time your program is read into
  memory.
 - All `static` variables must implement the
  [`Sync`](https://doc.rust-lang.org/std/marker/trait.Sync.html) marker trait.
 - Interior mutability is safe and acceptable when using `static` variables.
 ## Memory Uniqueness
 The single biggest difference between `const` and `static` is the guarantees provided about
 uniqueness. Where `const` variables may or may not be copied in code, `static` variables are
 guaranteed to be unique. If we take a previous `const` example and change it to `static`, the
 difference should be clear:
 ```rust
 static FACTOR: u32 = 1000;
 pub fn multiply(value: u32) -> u32 {
    // The assembly to `mul dword ptr [rip + example::FACTOR]` is how FACTOR gets used
    value * FACTOR
 }
 pub fn multiply_twice(value: u32) -> u32 {
    // The assembly to `mul dword ptr [rip + example::FACTOR]` is how FACTOR gets used
    value * FACTOR * FACTOR
 }
 ```
 -- [Compiler Explorer](https://godbolt.org/z/uxmiRQ)
 Where [previously](#copying) there were plenty of references to multiplying by 1000, the new
 assembly refers to `FACTOR` as a named memory location instead. No initialization work needs to be
 done, but the compiler can no longer prove the value never changes during execution.
 ## Initialization == Compilation
 Next, let's talk about initialization. The simplest case is initializing static variables with
 either scalar or struct notation:
 ```rust
 #[derive(Debug)]
 struct MyStruct {
    x: u32
 }
 static MY_STRUCT: MyStruct = MyStruct {
    // You can even reference other statics
    // declared later
    x: MY_VAL
 };
 static MY_VAL: u32 = 24;
 fn main() {
    println!("Static MyStruct: {:?}", MY_STRUCT);
 }
 ```
 --
 [Rust Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=b538dbc46076f12db047af4f4403ee6e)
 Things can get a bit weirder when using `const fn` though. In most cases, it just works:
 ```rust
 #[derive(Debug)]
 struct MyStruct {
    x: u32
 }
 impl MyStruct {
    const fn new() -> MyStruct {
        MyStruct { x: 24 }
    }
 }
 static MY_STRUCT: MyStruct = MyStruct::new();
 fn main() {
    println!("const fn Static MyStruct: {:?}", MY_STRUCT);
 }
 ```
 --
 [Rust Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=8c796a6e7fc273c12115091b707b0255)
 However, there's a caveat: you're currently not allowed to use `const fn` to initialize static
 variables of types that aren't marked `Sync`. For example,
 [`RefCell::new()`](https://doc.rust-lang.org/std/cell/struct.RefCell.html#method.new) is a
 `const fn`, but because
 [`RefCell` isn't `Sync`](https://doc.rust-lang.org/std/cell/struct.RefCell.html#impl-Sync), you'll
 get an error at compile time:
 ```rust
 use std::cell::RefCell;
 // error[E0277]: `std::cell::RefCell<u8>` cannot be shared between threads safely
 static MY_LOCK: RefCell<u8> = RefCell::new(0);
 ```
 --
 [Rust Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=c76ef86e473d07117a1700e21fd45560)
 It's likely that this will
 [change in the future](https://github.com/rust-lang/rfcs/blob/master/text/0911-const-fn.md) though.
 ## **Sync**
 Which leads well to the next point: static variable types must implement the
 [`Sync` marker](https://doc.rust-lang.org/std/marker/trait.Sync.html). Because they're globally
 unique, it must be safe for you to access static variables from any thread at any time. Most
 `struct` definitions automatically implement the `Sync` trait because they contain only elements
 which themselves implement `Sync` (read more in the
 [Nomicon](https://doc.rust-lang.org/nomicon/send-and-sync.html)). This is why earlier examples could
 get away with initializing statics, even though we never included an `impl Sync for MyStruct` in the
 code. To demonstrate this property, Rust refuses to compile our earlier example if we add a
 non-`Sync` element to the `struct` definition:
 ```rust
 use std::cell::RefCell;
 struct MyStruct {
    x: u32,
    y: RefCell<u8>,
 }
 // error[E0277]: `std::cell::RefCell<u8>` cannot be shared between threads safely
 static MY_STRUCT: MyStruct = MyStruct {
    x: 8,
    y: RefCell::new(8)
 };
 ```
 --
 [Rust Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=40074d0248f056c296b662dbbff97cfc)
 ## Interior Mutability
 Finally, while `static mut` variables are allowed, mutating them is an `unsafe` operation. If we
 want to stay in `safe` Rust, we can use interior mutability to accomplish similar goals:
 ```rust
 use std::sync::Once;
 // This example adapted from https://doc.rust-lang.org/std/sync/struct.Once.html#method.call_once
 static INIT: Once = Once::new();
 fn main() {
    // Note that while `INIT` is declared immutable, we're still allowed
    // to mutate its interior
    INIT.call_once(|| println!("Initializing..."));
    // This code won't panic, as the interior of INIT was modified
    // as part of the previous `call_once`
    INIT.call_once(|| panic!("INIT was called twice!"));
 }
 ```
 --
 [Rust Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=3ba003a981a7ed7400240caadd384d59)
@@ -1,601 +0,0 @@
 ---
 layout: post
 title: "Fixed Memory: Stacking Up"
 description: "We don't need no allocator."
 category:
 tags: [rust, understanding-allocations]
 ---
 `const` and `static` are perfectly fine, but it's relatively rare that we know at compile-time about
 either values or references that will be the same for the duration of our program. Put another way,
 it's not often the case that either you or your compiler knows how much memory your entire program
 will ever need.
 However, there are still some optimizations the compiler can do if it knows how much memory
 individual functions will need. Specifically, the compiler can make use of "stack" memory (as
 opposed to "heap" memory) which can be managed far faster in both the short- and long-term. When
 requesting memory, the [`push` instruction](http://www.cs.virginia.edu/~evans/cs216/guides/x86.html)
 can typically complete in [1 or 2 cycles](https://agner.org/optimize/instruction_tables.ods) (<1
 nanosecond on modern CPUs). Contrast that to heap memory which requires an allocator (specialized
 software to track what memory is in use) to reserve space. When you're finished with stack memory,
 the `pop` instruction runs in 1-3 cycles, as opposed to an allocator needing to worry about memory
 fragmentation and other issues with the heap. All sorts of incredibly sophisticated techniques have
 been used to design allocators:
 - [Garbage Collection](<https://en.wikipedia.org/wiki/Garbage_collection_(computer_science)>)
  strategies like [Tracing](https://en.wikipedia.org/wiki/Tracing_garbage_collection) (used in
  [Java](https://www.oracle.com/technetwork/java/javase/tech/g1-intro-jsp-135488.html)) and
  [Reference counting](https://en.wikipedia.org/wiki/Reference_counting) (used in
  [Python](https://docs.python.org/3/extending/extending.html#reference-counts))
 - Thread-local structures to prevent locking the allocator in
  [tcmalloc](https://jamesgolick.com/2013/5/19/how-tcmalloc-works.html)
 - Arena structures used in [jemalloc](http://jemalloc.net/), which
  [until recently](https://blog.rust-lang.org/2019/01/17/Rust-1.32.0.html#jemalloc-is-removed-by-default)
  was the primary allocator for Rust programs!
 But no matter how fast your allocator is, the principle remains: the fastest allocator is the one
 you never use. As such, we're not going to discuss how exactly the
 [`push` and `pop` instructions work](http://www.cs.virginia.edu/~evans/cs216/guides/x86.html), but
 we'll focus instead on the conditions that enable the Rust compiler to use faster stack-based
 allocation for variables.
 So, **how do we know when Rust will or will not use stack allocation for objects we create?**
 Looking at other languages, it's often easy to delineate between stack and heap. Managed memory
 languages (Python, Java,
 [C#](https://blogs.msdn.microsoft.com/ericlippert/2010/09/30/the-truth-about-value-types/)) place
 everything on the heap. JIT compilers ([PyPy](https://www.pypy.org/),
 [HotSpot](https://www.oracle.com/technetwork/java/javase/tech/index-jsp-136373.html)) may optimize
 some heap allocations away, but you should never assume it will happen. C makes things clear with
 calls to special functions (like [malloc(3)](https://linux.die.net/man/3/malloc)) needed to access
 heap memory. Old C++ has the [`new`](https://stackoverflow.com/a/655086/1454178) keyword, though
 modern C++/C++11 is more complicated with [RAII](https://en.cppreference.com/w/cpp/language/raii).
 For Rust, we can summarize as follows: **stack allocation will be used for everything that doesn't
 involve "smart pointers" and collections**. We'll skip over a precise definition of the term "smart
 pointer" for now, and instead discuss what we should watch for to understand when stack and heap
 memory regions are used:
 1. Stack manipulation instructions (`push`, `pop`, and `add`/`sub` of the `rsp` register) indicate
   allocation of stack memory:
   ```rust
   pub fn stack_alloc(x: u32) -> u32 {
       // Space for `y` is allocated by subtracting from `rsp`,
       // and then populated
       let y = [1u8, 2, 3, 4];
       // Space for `y` is deallocated by adding back to `rsp`
       x
   }
   ```
   -- [Compiler Explorer](https://godbolt.org/z/5WSgc9)
 2. Tracking when exactly heap allocation calls occur is difficult. It's typically easier to watch
   for `call core::ptr::real_drop_in_place`, and infer that a heap allocation happened in the recent
   past:
   ```rust
   pub fn heap_alloc(x: usize) -> usize {
       // Space for elements in a vector has to be allocated
       // on the heap, and is then de-allocated once the
       // vector goes out of scope
       let y: Vec<u8> = Vec::with_capacity(x);
       x
   }
   ```
   -- [Compiler Explorer](https://godbolt.org/z/epfgoQ) (`real_drop_in_place` happens on line 1317)
   <span style="font-size: .8em">Note: While the
   [`Drop` trait](https://doc.rust-lang.org/std/ops/trait.Drop.html) is
   [called for stack-allocated objects](https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=87edf374d8983816eb3d8cfeac657b46),
   the Rust standard library only defines `Drop` implementations for types that involve heap
   allocation.</span>
 3. If you don't want to inspect the assembly, use a custom allocator that's able to track and alert
   when heap allocations occur. Crates like
   [`alloc_counter`](https://crates.io/crates/alloc_counter) are designed for exactly this purpose.
 With all that in mind, let's talk about situations in which we're guaranteed to use stack memory:
 - Structs are created on the stack.
 - Function arguments are passed on the stack, meaning the
  [`#[inline]` attribute](https://doc.rust-lang.org/reference/attributes.html#inline-attribute) will
  not change the memory region used.
 - Enums and unions are stack-allocated.
 - [Arrays](https://doc.rust-lang.org/std/primitive.array.html) are always stack-allocated.
 - Closures capture their arguments on the stack.
 - Generics will use stack allocation, even with dynamic dispatch.
 - [`Copy`](https://doc.rust-lang.org/std/marker/trait.Copy.html) types are guaranteed to be
  stack-allocated, and copying them will be done in stack memory.
 - [`Iterator`s](https://doc.rust-lang.org/std/iter/trait.Iterator.html) in the standard library are
  stack-allocated even when iterating over heap-based collections.
 # Structs
 The simplest case comes first. When creating vanilla `struct` objects, we use stack memory to hold
 their contents:
 ```rust
 struct Point {
    x: u64,
    y: u64,
 }
 struct Line {
    a: Point,
    b: Point,
 }
 pub fn make_line() {
    // `origin` is stored in the first 16 bytes of memory
    // starting at location `rsp`
    let origin = Point { x: 0, y: 0 };
    // `point` makes up the next 16 bytes of memory
    let point = Point { x: 1, y: 2 };
    // When creating `ray`, we just move the content out of
    // `origin` and `point` into the next 32 bytes of memory
    let ray = Line { a: origin, b: point };
 }
 ```
 -- [Compiler Explorer](https://godbolt.org/z/vri9BE)
 Note that while some extra-fancy instructions are used for memory manipulation in the assembly, the
 `sub rsp, 64` instruction indicates we're still working with the stack.
 # Function arguments
 Have you ever wondered how functions communicate with each other? Like, once the variables are given
 to you, everything's fine. But how do you "give" those variables to another function? How do you get
 the results back afterward? The answer: the compiler arranges memory and assembly instructions using
 a pre-determined [calling convention](http://llvm.org/docs/LangRef.html#calling-conventions). This
 convention governs the rules around where arguments needed by a function will be located (either in
 memory offsets relative to the stack pointer `rsp`, or in other registers), and where the results
 can be found once the function has finished. And when multiple languages agree on what the calling
 conventions are, you can do things like having [Go call Rust code](https://blog.filippo.io/rustgo/)!
 Put simply: it's the compiler's job to figure out how to call other functions, and you can assume
 that the compiler is good at its job.
 We can see this in action using a simple example:
 ```rust
 struct Point {
    x: i64,
    y: i64,
 }
 // We use integer division operations to keep
 // the assembly clean, understanding the result
 // isn't accurate.
 fn distance(a: &Point, b: &Point) -> i64 {
    // Immediately subtract from `rsp` the bytes needed
    // to hold all the intermediate results - this is
    // the stack allocation step
    // The compiler used the `rdi` and `rsi` registers
    // to pass our arguments, so read them in
    let x1 = a.x;
    let x2 = b.x;
    let y1 = a.y;
    let y2 = b.y;
    // Do the actual math work
    let x_pow = (x1 - x2) * (x1 - x2);
    let y_pow = (y1 - y2) * (y1 - y2);
    let squared = x_pow + y_pow;
    squared / squared
    // Our final result will be stored in the `rax` register
    // so that our caller knows where to retrieve it.
    // Finally, add back to `rsp` the stack memory that is
    // now ready to be used by other functions.
 }
 pub fn total_distance() {
    let start = Point { x: 1, y: 2 };
    let middle = Point { x: 3, y: 4 };
    let end = Point { x: 5, y: 6 };
    let _dist_1 = distance(&start, &middle);
    let _dist_2 = distance(&middle, &end);
 }
 ```
 -- [Compiler Explorer](https://godbolt.org/z/Qmx4ST)
 As a consequence of function arguments never using heap memory, we can also infer that functions
 using the `#[inline]` attribute also do not heap allocate. But better than inferring, we can look
 at the assembly to prove it:
 ```rust
 struct Point {
    x: i64,
    y: i64,
 }
 // Note that there is no `distance` function in the assembly output,
 // and the total line count goes from 229 with inlining off
 // to 306 with inline on. Even still, no heap allocations occur.
 #[inline(always)]
 fn distance(a: &Point, b: &Point) -> i64 {
    let x1 = a.x;
    let x2 = b.x;
    let y1 = a.y;
    let y2 = b.y;
    let x_pow = (a.x - b.x) * (a.x - b.x);
    let y_pow = (a.y - b.y) * (a.y - b.y);
    let squared = x_pow + y_pow;
    squared / squared
 }
 pub fn total_distance() {
    let start = Point { x: 1, y: 2 };
    let middle = Point { x: 3, y: 4 };
    let end = Point { x: 5, y: 6 };
    let _dist_1 = distance(&start, &middle);
    let _dist_2 = distance(&middle, &end);
 }
 ```
 -- [Compiler Explorer](https://godbolt.org/z/30Sh66)
 Finally, passing by value (arguments with type
 [`Copy`](https://doc.rust-lang.org/std/marker/trait.Copy.html)) and passing by reference (either
 moving ownership or passing a pointer) may have slightly different layouts in assembly, but will
 still use either stack memory or CPU registers:
 ```rust
 pub struct Point {
    x: i64,
    y: i64,
 }
 // Moving values
 pub fn distance_moved(a: Point, b: Point) -> i64 {
    let x1 = a.x;
    let x2 = b.x;
    let y1 = a.y;
    let y2 = b.y;
    let x_pow = (x1 - x2) * (x1 - x2);
    let y_pow = (y1 - y2) * (y1 - y2);
    let squared = x_pow + y_pow;
    squared / squared
 }
 // Borrowing values has two extra `mov` instructions on lines 21 and 22
 pub fn distance_borrowed(a: &Point, b: &Point) -> i64 {
    let x1 = a.x;
    let x2 = b.x;
    let y1 = a.y;
    let y2 = b.y;
    let x_pow = (x1 - x2) * (x1 - x2);
    let y_pow = (y1 - y2) * (y1 - y2);
    let squared = x_pow + y_pow;
    squared / squared
 }
 ```
 -- [Compiler Explorer](https://godbolt.org/z/06hGiv)
 # Enums
 If you've ever worried that wrapping your types in
 [`Option`](https://doc.rust-lang.org/stable/core/option/enum.Option.html) or
 [`Result`](https://doc.rust-lang.org/stable/core/result/enum.Result.html) would finally make them
 large enough that Rust decides to use heap allocation instead, fear no longer: `enum` and union
 types don't use heap allocation:
 ```rust
 enum MyEnum {
    Small(u8),
    Large(u64)
 }
 struct MyStruct {
    x: MyEnum,
    y: MyEnum,
 }
 pub fn enum_compare() {
    let x = MyEnum::Small(0);
    let y = MyEnum::Large(0);
    let z = MyStruct { x, y };
    let opt = Option::Some(z);
 }
 ```
 -- [Compiler Explorer](https://godbolt.org/z/HK7zBx)
 Because the size of an `enum` is the size of its largest element plus a flag, the compiler can
 predict how much memory is used no matter which variant of an enum is currently stored in a
 variable. Thus, enums and unions have no need of heap allocation. There's unfortunately not a great
 way to show this in assembly, so I'll instead point you to the
 [`core::mem::size_of`](https://doc.rust-lang.org/stable/core/mem/fn.size_of.html#size-of-enums)
 documentation.
 # Arrays
 The array type is guaranteed to be stack allocated, which is why the array size must be declared.
 Interestingly enough, this can be used to cause safe Rust programs to crash:
 ```rust
 // 256 bytes
 #[derive(Default)]
 struct TwoFiftySix {
    _a: [u64; 32]
 }
 // 8 kilobytes
 #[derive(Default)]
 struct EightK {
    _a: [TwoFiftySix; 32]
 }
 // 256 kilobytes
 #[derive(Default)]
 struct TwoFiftySixK {
    _a: [EightK; 32]
 }
 // 8 megabytes - exceeds space typically provided for the stack,
 // though the kernel can be instructed to allocate more.
 // On Linux, you can check stack size using `ulimit -s`
 #[derive(Default)]
 struct EightM {
    _a: [TwoFiftySixK; 32]
 }
 fn main() {
    // Because we already have things in stack memory
    // (like the current function call stack), allocating another
    // eight megabytes of stack memory crashes the program
    let _x = EightM::default();
 }
 ```
 --
 [Rust Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=587a6380a4914bcbcef4192c90c01dc4)
 There aren't any security implications of this (no memory corruption occurs), but it's good to note
 that the Rust compiler won't move arrays into heap memory even if they can be reasonably expected to
 overflow the stack.
 # Closures
 Rules for how anonymous functions capture their arguments are typically language-specific. In Java,
 [Lambda Expressions](https://docs.oracle.com/javase/tutorial/java/javaOO/lambdaexpressions.html) are
 actually objects created on the heap that capture local primitives by copying, and capture local
 non-primitives as (`final`) references.
 [Python](https://docs.python.org/3.7/reference/expressions.html#lambda) and
 [JavaScript](https://javascriptweblog.wordpress.com/2010/10/25/understanding-javascript-closures/)
 both bind _everything_ by reference normally, but Python can also
 [capture values](https://stackoverflow.com/a/235764/1454178) and JavaScript has
 [Arrow functions](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Functions/Arrow_functions).
 In Rust, arguments to closures are the same as arguments to other functions; closures are simply
 functions that don't have a declared name. Some weird ordering of the stack may be required to
 handle them, but it's the compiler's responsibility to figure that out.
 Each example below has the same effect, but a different assembly implementation. In the simplest
 case, we immediately run a closure returned by another function. Because we don't store a reference
 to the closure, the stack memory needed to store the captured values is contiguous:
 ```rust
 fn my_func() -> impl FnOnce() {
    let x = 24;
    // Note that this closure in assembly looks exactly like
    // any other function; you even use the `call` instruction
    // to start running it.
    move || { x; }
 }
 pub fn immediate() {
    my_func()();
    my_func()();
 }
 ```
 -- [Compiler Explorer](https://godbolt.org/z/mgJ2zl), 25 total assembly instructions
 If we store a reference to the closure, the Rust compiler keeps values it needs in the stack memory
 of the original function. Getting the details right is a bit harder, so the instruction count goes
 up even though this code is functionally equivalent to our original example:
 ```rust
 pub fn simple_reference() {
    let x = my_func();
    let y = my_func();
    y();
    x();
 }
 ```
 -- [Compiler Explorer](https://godbolt.org/z/K_dj5n), 55 total assembly instructions
 Even things like variable order can make a difference in instruction count:
 ```rust
 pub fn complex() {
    let x = my_func();
    let y = my_func();
    x();
    y();
 }
 ```
 -- [Compiler Explorer](https://godbolt.org/z/p37qFl), 70 total assembly instructions
 In every circumstance though, the compiler ensured that no heap allocations were necessary.
 # Generics
 Traits in Rust come in two broad forms: static dispatch (monomorphization, `impl Trait`) and dynamic
 dispatch (trait objects, `dyn Trait`). While dynamic dispatch is often _associated_ with trait
 objects being stored in the heap, dynamic dispatch can be used with stack allocated objects as well:
 ```rust
 trait GetInt {
    fn get_int(&self) -> u64;
 }
 // vtable stored at section L__unnamed_1
 struct WhyNotU8 {
    x: u8
 }
 impl GetInt for WhyNotU8 {
    fn get_int(&self) -> u64 {
        self.x as u64
    }
 }
 // vtable stored at section L__unnamed_2
 struct ActualU64 {
    x: u64
 }
 impl GetInt for ActualU64 {
    fn get_int(&self) -> u64 {
        self.x
    }
 }
 // `&dyn` declares that we want to use dynamic dispatch
 // rather than monomorphization, so there is only one
 // `retrieve_int` function that shows up in the final assembly.
 // If we used generics, there would be one implementation of
 // `retrieve_int` for each type that implements `GetInt`.
 pub fn retrieve_int(u: &dyn GetInt) {
    // In the assembly, we just call an address given to us
    // in the `rsi` register and hope that it was set up
    // correctly when this function was invoked.
    let x = u.get_int();
 }
 pub fn do_call() {
    // Note that even though the vtable for `WhyNotU8` and
    // `ActualU64` includes a pointer to
    // `core::ptr::real_drop_in_place`, it is never invoked.
    let a = WhyNotU8 { x: 0 };
    let b = ActualU64 { x: 0 };
    retrieve_int(&a);
    retrieve_int(&b);
 }
 ```
 -- [Compiler Explorer](https://godbolt.org/z/u_yguS)
 It's hard to imagine practical situations where dynamic dispatch would be used for objects that
 aren't heap allocated, but it technically can be done.
 # Copy types
 Understanding move semantics and copy semantics in Rust is weird at first. The Rust docs
 [go into detail](https://doc.rust-lang.org/stable/core/marker/trait.Copy.html) far better than can
 be addressed here, so I'll leave them to do the job. From a memory perspective though, their
 guideline is reasonable:
 [if your type can implement `Copy`, it should](https://doc.rust-lang.org/stable/core/marker/trait.Copy.html#when-should-my-type-be-copy).
 While there are potential speed tradeoffs to _benchmark_ when discussing `Copy` (move semantics for
 stack objects vs. copying stack pointers vs. copying stack `struct`s), _it's impossible for `Copy`
 to introduce a heap allocation_.
 But why is this the case? Fundamentally, it's because the language controls what `Copy` means -
 ["the behavior of `Copy` is not overloadable"](https://doc.rust-lang.org/std/marker/trait.Copy.html#whats-the-difference-between-copy-and-clone)
 because it's a marker trait. From there we'll note that a type
 [can implement `Copy`](https://doc.rust-lang.org/std/marker/trait.Copy.html#when-can-my-type-be-copy)
 if (and only if) its components implement `Copy`, and that
 [no heap-allocated types implement `Copy`](https://doc.rust-lang.org/std/marker/trait.Copy.html#implementors).
 Thus, assignments involving heap types are always move semantics, and new heap allocations won't
 occur because of implicit operator behavior.
 ```rust
 #[derive(Clone)]
 struct Cloneable {
    x: Box<u64>
 }
 // error[E0204]: the trait `Copy` may not be implemented for this type
 #[derive(Copy, Clone)]
 struct NotCopyable {
    x: Box<u64>
 }
 ```
 -- [Compiler Explorer](https://godbolt.org/z/VToRuK)
 # Iterators
 In managed memory languages (like
 [Java](https://www.youtube.com/watch?v=bSkpMdDe4g4&feature=youtu.be&t=357)), there's a subtle
 difference between these two code samples:
 ```java
 public static int sum_for(List<Long> vals) {
    long sum = 0;
    // Regular for loop
    for (int i = 0; i < vals.length; i++) {
        sum += vals[i];
    }
    return sum;
 }
 public static int sum_foreach(List<Long> vals) {
    long sum = 0;
    // "Foreach" loop - uses iteration
    for (Long l : vals) {
        sum += l;
    }
    return sum;
 }
 ```
 In the `sum_for` function, nothing terribly interesting happens. In `sum_foreach`, an object of type
 [`Iterator`](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/util/Iterator.html)
 is allocated on the heap, and will eventually be garbage-collected. This isn't a great design;
 iterators are often transient objects that you need during a function and can discard once the
 function ends. Sounds exactly like the issue stack-allocated objects address, no?
 In Rust, iterators are allocated on the stack. The objects to iterate over are almost certainly in
 heap memory, but the iterator itself
 ([`Iter`](https://doc.rust-lang.org/std/slice/struct.Iter.html)) doesn't need to use the heap. In
 each of the examples below we iterate over a collection, but never use heap allocation:
 ```rust
 use std::collections::HashMap;
 // There's a lot of assembly generated, but if you search in the text,
 // there are no references to `real_drop_in_place` anywhere.
 pub fn sum_vec(x: &Vec<u32>) {
    let mut s = 0;
    // Basic iteration over vectors doesn't need allocation
    for y in x {
        s += y;
    }
 }
 pub fn sum_enumerate(x: &Vec<u32>) {
    let mut s = 0;
    // More complex iterators are just fine too
    for (_i, y) in x.iter().enumerate() {
        s += y;
    }
 }
 pub fn sum_hm(x: &HashMap<u32, u32>) {
    let mut s = 0;
    // And it's not just Vec, all types will allocate the iterator
    // on stack memory
    for y in x.values() {
        s += y;
    }
 }
 ```
 -- [Compiler Explorer](https://godbolt.org/z/FTT3CT)
@@ -1,254 +0,0 @@
 ---
 layout: post
 title: "Dynamic Memory: A Heaping Helping"
 description: "The reason Rust exists."
 category:
 tags: [rust, understanding-allocations]
 ---
 Managing dynamic memory is hard. Some languages assume users will do it themselves (C, C++), and
 some languages go to extreme lengths to protect users from themselves (Java, Python). In Rust, the
 language manages dynamic memory (also referred to as the **heap**) through a system called _ownership_.
 And as the docs mention, ownership
 [is Rust's most unique feature](https://doc.rust-lang.org/book/ch04-00-understanding-ownership.html).
 The heap is used in two situations: when the compiler is unable to predict either the _total size of
 memory needed_, or _how long the memory is needed for_, it allocates space in the heap. This happens
 pretty frequently; if you want to download the Google home page, you won't know how large it is
 until your program runs. And when you're finished with Google, we deallocate the memory so it can be
 used to store other webpages. If you're interested in a slightly longer explanation of the heap,
 check out
 [The Stack and the Heap](https://doc.rust-lang.org/book/ch04-01-what-is-ownership.html#the-stack-and-the-heap)
 in Rust's documentation.
 We won't go into detail on how the heap is managed; the
 [ownership documentation](https://doc.rust-lang.org/book/ch04-01-what-is-ownership.html) does a
 phenomenal job explaining both the "why" and "how" of memory management. Instead, we're going to
 focus on understanding "when" heap allocations occur in Rust.
 To start off, take a guess for how many allocations happen in the program below:
 ```rust
 fn main() {}
 ```
 It's obviously a trick question; while no heap allocations occur as a result of that code, the setup
 needed to call `main` does allocate on the heap. Here's a way to show it:
 ```rust
 #![feature(integer_atomics)]
 use std::alloc::{GlobalAlloc, Layout, System};
 use std::sync::atomic::{AtomicU64, Ordering};
 static ALLOCATION_COUNT: AtomicU64 = AtomicU64::new(0);
 struct CountingAllocator;
 unsafe impl GlobalAlloc for CountingAllocator {
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        ALLOCATION_COUNT.fetch_add(1, Ordering::SeqCst);
        System.alloc(layout)
    }
    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
        System.dealloc(ptr, layout);
    }
 }
 #[global_allocator]
 static A: CountingAllocator = CountingAllocator;
 fn main() {
    let x = ALLOCATION_COUNT.fetch_add(0, Ordering::SeqCst);
    println!("There were {} allocations before calling main!", x);
 }
 ```
 --
 [Rust Playground](https://play.rust-lang.org/?version=nightly&mode=debug&edition=2018&gist=fb5060025ba79fc0f906b65a4ef8eb8e)
 As of the time of writing, there are five allocations that happen before `main` is ever called.
 But when we want to understand more practically where heap allocation happens, we'll follow this
 guide:
 - Smart pointers hold their contents in the heap
 - Collections are smart pointers for many objects at a time, and reallocate when they need to grow
 Finally, there are two "addendum" issues that are important to address when discussing Rust and the
 heap:
 - Non-heap alternatives to many standard library types are available.
 - Special allocators to track memory behavior should be used to benchmark code.
 # Smart pointers
 The first things to note are the "smart pointer" types. When you have data that must outlive the
 scope in which it is declared, or your data is of unknown or dynamic size, you'll make use of these
 types.
 The term [smart pointer](https://en.wikipedia.org/wiki/Smart_pointer) comes from C++, and while it's
 closely linked to a general design pattern of
 ["Resource Acquisition Is Initialization"](https://en.cppreference.com/w/cpp/language/raii), we'll
 use it here specifically to describe objects that are responsible for managing ownership of data
 allocated on the heap. The smart pointers available in the `alloc` crate should look mostly
 familiar:
 - [`Box`](https://doc.rust-lang.org/alloc/boxed/struct.Box.html)
 - [`Rc`](https://doc.rust-lang.org/alloc/rc/struct.Rc.html)
 - [`Arc`](https://doc.rust-lang.org/alloc/sync/struct.Arc.html)
 - [`Cow`](https://doc.rust-lang.org/alloc/borrow/enum.Cow.html)
 The [standard library](https://doc.rust-lang.org/std/) also defines some smart pointers to manage
 heap objects, though more than can be covered here. Some examples are:
 - [`RwLock`](https://doc.rust-lang.org/std/sync/struct.RwLock.html)
 - [`Mutex`](https://doc.rust-lang.org/std/sync/struct.Mutex.html)
 Finally, there is one ["gotcha"](https://www.merriam-webster.com/dictionary/gotcha): **cell types**
 (like [`RefCell`](https://doc.rust-lang.org/stable/core/cell/struct.RefCell.html)) look and behave
 similarly, but **don't involve heap allocation**. The
 [`core::cell` docs](https://doc.rust-lang.org/stable/core/cell/index.html) have more information.
 When a smart pointer is created, the data it is given is placed in heap memory and the location of
 that data is recorded in the smart pointer. Once the smart pointer has determined it's safe to
 deallocate that memory (when a `Box` has
 [gone out of scope](https://doc.rust-lang.org/stable/std/boxed/index.html) or a reference count
 [goes to zero](https://doc.rust-lang.org/alloc/rc/index.html)), the heap space is reclaimed. We can
 prove these types use heap memory by looking at code:
 ```rust
 use std::rc::Rc;
 use std::sync::Arc;
 use std::borrow::Cow;
 pub fn my_box() {
    // Drop at assembly line 1640
    Box::new(0);
 }
 pub fn my_rc() {
    // Drop at assembly line 1650
    Rc::new(0);
 }
 pub fn my_arc() {
    // Drop at assembly line 1660
    Arc::new(0);
 }
 pub fn my_cow() {
    // Drop at assembly line 1672
    Cow::from("drop");
 }
 ```
 -- [Compiler Explorer](https://godbolt.org/z/4AMQug)
 # Collections
 Collection types use heap memory because their contents have dynamic size; they will request more
 memory [when needed](https://doc.rust-lang.org/std/vec/struct.Vec.html#method.reserve), and can
 [release memory](https://doc.rust-lang.org/std/vec/struct.Vec.html#method.shrink_to_fit) when it's
 no longer necessary. This dynamic property forces Rust to heap allocate everything they contain. In
 a way, **collections are smart pointers for many objects at a time**. Common types that fall under
 this umbrella are [`Vec`](https://doc.rust-lang.org/stable/alloc/vec/struct.Vec.html),
 [`HashMap`](https://doc.rust-lang.org/stable/std/collections/struct.HashMap.html), and
 [`String`](https://doc.rust-lang.org/stable/alloc/string/struct.String.html) (not
 [`str`](https://doc.rust-lang.org/std/primitive.str.html)).
 While collections store the objects they own in heap memory, _creating new collections will not
 allocate on the heap_. This is a bit weird; if we call `Vec::new()`, the assembly shows a
 corresponding call to `real_drop_in_place`:
 ```rust
 pub fn my_vec() {
    // Drop in place at line 481
    Vec::<u8>::new();
 }
 ```
 -- [Compiler Explorer](https://godbolt.org/z/1WkNtC)
 But because the vector has no elements to manage, no calls to the allocator will ever be dispatched:
 ```rust
 use std::alloc::{GlobalAlloc, Layout, System};
 use std::sync::atomic::{AtomicBool, Ordering};
 fn main() {
    // Turn on panicking if we allocate on the heap
    DO_PANIC.store(true, Ordering::SeqCst);
    // Interesting bit happens here
    let x: Vec<u8> = Vec::new();
    drop(x);
    // Turn panicking back off, some deallocations occur
    // after main as well.
    DO_PANIC.store(false, Ordering::SeqCst);
 }
 #[global_allocator]
 static A: PanicAllocator = PanicAllocator;
 static DO_PANIC: AtomicBool = AtomicBool::new(false);
 struct PanicAllocator;
 unsafe impl GlobalAlloc for PanicAllocator {
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        if DO_PANIC.load(Ordering::SeqCst) {
            panic!("Unexpected allocation.");
        }
        System.alloc(layout)
    }
    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
        if DO_PANIC.load(Ordering::SeqCst) {
            panic!("Unexpected deallocation.");
        }
        System.dealloc(ptr, layout);
    }
 }
 ```
 --
 [Rust Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=831a297d176d015b1f9ace01ae416cc6)
 Other standard library types follow the same behavior; make sure to check out
 [`HashMap::new()`](https://doc.rust-lang.org/std/collections/hash_map/struct.HashMap.html#method.new),
 and [`String::new()`](https://doc.rust-lang.org/std/string/struct.String.html#method.new).
 # Heap Alternatives
 While it is a bit strange to speak of the stack after spending time with the heap, it's worth
 pointing out that some heap-allocated objects in Rust have stack-based counterparts provided by
 other crates. If you have need of the functionality, but want to avoid allocating, there are
 typically alternatives available.
 When it comes to some standard library smart pointers
 ([`RwLock`](https://doc.rust-lang.org/std/sync/struct.RwLock.html) and
 [`Mutex`](https://doc.rust-lang.org/std/sync/struct.Mutex.html)), stack-based alternatives are
 provided in crates like [parking_lot](https://crates.io/crates/parking_lot) and
 [spin](https://crates.io/crates/spin). You can check out
 [`lock_api::RwLock`](https://docs.rs/lock_api/0.1.5/lock_api/struct.RwLock.html),
 [`lock_api::Mutex`](https://docs.rs/lock_api/0.1.5/lock_api/struct.Mutex.html), and
 [`spin::Once`](https://mvdnes.github.io/rust-docs/spin-rs/spin/struct.Once.html) if you're in need
 of synchronization primitives.
 [thread_id](https://crates.io/crates/thread-id) may be necessary if you're implementing an allocator
 because [`thread::current().id()`](https://doc.rust-lang.org/std/thread/struct.ThreadId.html) uses a
 [`thread_local!` structure](https://doc.rust-lang.org/stable/src/std/sys_common/thread_info.rs.html#17-36)
 that needs heap allocation.
 # Tracing Allocators
 When writing performance-sensitive code, there's no alternative to measuring your code. If you
 didn't write a benchmark,
 [you don't care about its performance](https://www.youtube.com/watch?v=2EWejmkKlxs&feature=youtu.be&t=263).
 You should never rely on your instincts when
 [a microsecond is an eternity](https://www.youtube.com/watch?v=NH1Tta7purM).
 Similarly, there's great work going on in Rust with allocators that keep track of what they're doing
 (like [`alloc_counter`](https://crates.io/crates/alloc_counter)). When it comes to tracking heap
 behavior, it's easy to make mistakes; please write tests and make sure you have tools to guard
 against future issues.
@@ -1,148 +0,0 @@
 ---
 layout: post
 title: "Compiler Optimizations: What It's Done Lately"
 description: "A lot. The answer is a lot."
 category:
 tags: [rust, understanding-allocations]
 ---
 **Update 2019-02-10**: When debugging a
 [related issue](https://gitlab.com/sio4/code/alloc-counter/issues/1), it was discovered that the
 original code worked because LLVM optimized out the entire function, rather than just the allocation
 segments. The code has been updated with proper use of
 [`read_volatile`](https://doc.rust-lang.org/std/ptr/fn.read_volatile.html), and a previous section
 on vector capacity has been removed.
 ---
 Up to this point, we've been discussing memory usage in the Rust language by focusing on simple
 rules that are mostly right for small chunks of code. We've spent time showing how those rules work
 themselves out in practice, and become familiar with reading the assembly code needed to see each
 memory type (global, stack, heap) in action.
 Throughout the series so far, we've put a handicap on the code. In the name of consistent and
 understandable results, we've asked the compiler to pretty please leave the training wheels on. Now
 is the time where we throw out all the rules and take off the kid gloves. As it turns out, both the
 Rust compiler and the LLVM optimizers are incredibly sophisticated, and we'll step back and let them
 do their job.
 Similar to
 ["What Has My Compiler Done For Me Lately?"](https://www.youtube.com/watch?v=bSkpMdDe4g4), we're
 focusing on interesting things the Rust language (and LLVM!) can do with memory management. We'll
 still be looking at assembly code to understand what's going on, but it's important to mention
 again: **please use automated tools like [alloc-counter](https://crates.io/crates/alloc_counter) to
 double-check memory behavior if it's something you care about**. It's far too easy to misread
 assembly in large code sections; you should always verify behavior if you care about memory usage.
 The guiding principle as we move forward is this: _optimizing compilers won't produce worse programs
 than we started with._ There won't be any situations where stack allocations get moved to heap
 allocations. There will, however, be an opera of optimization.
 # The Case of the Disappearing Box
 Our first optimization comes when LLVM can reason that the lifetime of an object is sufficiently
 short that heap allocations aren't necessary. In these cases, LLVM will move the allocation to the
 stack instead! The way this interacts with `#[inline]` attributes is a bit opaque, but the important
 part is that LLVM can sometimes do better than the baseline Rust language:
 ```rust
 use std::alloc::{GlobalAlloc, Layout, System};
 use std::sync::atomic::{AtomicBool, Ordering};
 pub fn cmp(x: u32) {
    // Turn on panicking if we allocate on the heap
    DO_PANIC.store(true, Ordering::SeqCst);
    // The compiler is able to see through the constant `Box`
    // and directly compare `x` to 24 - assembly line 73
    let y = Box::new(24);
    let equals = x == *y;
    // This call to drop is eliminated
    drop(y);
    // Need to mark the comparison result as volatile so that
    // LLVM doesn't strip out all the code. If `y` is marked
    // volatile instead, allocation will be forced.
    unsafe { std::ptr::read_volatile(&equals) };
    // Turn off panicking, as there are some deallocations
    // when we exit main.
    DO_PANIC.store(false, Ordering::SeqCst);
 }
 fn main() {
    cmp(12)
 }
 #[global_allocator]
 static A: PanicAllocator = PanicAllocator;
 static DO_PANIC: AtomicBool = AtomicBool::new(false);
 struct PanicAllocator;
 unsafe impl GlobalAlloc for PanicAllocator {
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        if DO_PANIC.load(Ordering::SeqCst) {
            panic!("Unexpected allocation.");
        }
        System.alloc(layout)
    }
    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
        if DO_PANIC.load(Ordering::SeqCst) {
            panic!("Unexpected deallocation.");
        }
        System.dealloc(ptr, layout);
    }
 }
 ```
 -- [Compiler Explorer](https://godbolt.org/z/BZ_Yp3)
 [Rust Playground](https://play.rust-lang.org/?version=stable&mode=release&edition=2018&gist=4a765f753183d5b919f62c71d2109d5d)
 # Dr. Array or: How I Learned to Love the Optimizer
 Finally, this isn't so much about LLVM figuring out different memory behavior, but LLVM stripping
 out code that doesn't do anything. Optimizations of this type have a lot of nuance to them; if
 you're not careful, they can make your benchmarks look
 [impossibly good](https://www.youtube.com/watch?v=nXaxk27zwlk&feature=youtu.be&t=1199). In Rust, the
 `black_box` function (implemented in both
 [`libtest`](https://doc.rust-lang.org/1.1.0/test/fn.black_box.html) and
 [`criterion`](https://docs.rs/criterion/0.2.10/criterion/fn.black_box.html)) will tell the compiler
 to disable this kind of optimization. But if you let LLVM remove unnecessary code, you can end up
 running programs that previously caused errors:
 ```rust
 #[derive(Default)]
 struct TwoFiftySix {
    _a: [u64; 32]
 }
 #[derive(Default)]
 struct EightK {
    _a: [TwoFiftySix; 32]
 }
 #[derive(Default)]
 struct TwoFiftySixK {
    _a: [EightK; 32]
 }
 #[derive(Default)]
 struct EightM {
    _a: [TwoFiftySixK; 32]
 }
 pub fn main() {
    // Normally this blows up because we can't reserve size on stack
    // for the `EightM` struct. But because the compiler notices we
    // never do anything with `_x`, it optimizes out the stack storage
    // and the program completes successfully.
    let _x = EightM::default();
 }
 ```
 -- [Compiler Explorer](https://godbolt.org/z/daHn7P)
 [Rust Playground](https://play.rust-lang.org/?version=stable&mode=release&edition=2018&gist=4c253bf26072119896ab93c6ef064dc0)
@@ -1,35 +0,0 @@
 ---
 layout: post
 title: "Summary: What are the Allocation Rules?"
 description: "A synopsis and reference."
 category:
 tags: [rust, understanding-allocations]
 ---
 While there's a lot of interesting detail captured in this series, it's often helpful to have a
 document that answers some "yes/no" questions. You may not care about what an `Iterator` looks like
 in assembly; you just need to know whether it allocates an object on the heap or not. And while Rust
 will prioritize the fastest behavior it can, here are the rules for each memory type:
 **Heap Allocation**:
 - Smart pointers (`Box`, `Rc`, `Mutex`, etc.) allocate their contents in heap memory.
 - Collections (`HashMap`, `Vec`, `String`, etc.) allocate their contents in heap memory.
 - Some smart pointers in the standard library have counterparts in other crates that don't need heap
  memory. If possible, use those.
 **Stack Allocation**:
 - Everything not using a smart pointer will be allocated on the stack.
 - Structs, enums, iterators, arrays, and closures are all stack allocated.
 - Cell types (`RefCell`) behave like smart pointers, but are stack-allocated.
 - Inlining (`#[inline]`) will not affect allocation behavior for better or worse.
 - Types that are marked `Copy` are guaranteed to have their contents stack-allocated.
 **Global Allocation**:
 - `const` is a fixed value; the compiler is allowed to copy it wherever useful.
 - `static` is a fixed reference; the compiler will guarantee it is unique.
 ![Container Sizes in Rust](/assets/images/2019-02-04-container-size.svg) --
 [Raph Levien](https://docs.google.com/presentation/d/1q-c7UAyrUlM-eZyTo1pd8SZ0qwA_wYxmPZVOQkoDmH4/edit?usp=sharing)
@@ -1,52 +0,0 @@
 ---
 layout: post
 title: "Making Bread"
 description: "...because I've got some free time now. 🍞"
 category:
 tags: [baking]
 ---
 Having recently started my "gardening leave" between positions, I have some more personal time
 available. I'm planning to stay productive, contributing to some open-source projects, but it also
 occurred to me that despite [talking about](https://speice.io/2018/05/hello.html) bread pics, this
 blog has been purely technical. Maybe I'll change the site title from "The Old Speice Guy" to "Bites
 and Bytes"?
 Either way, I'm baking a little bit again, and figured it was worth taking a quick break to focus on
 some lighter material. I recently learned two critically important lessons: first, the temperature
 of the dough when you put the yeast in makes a huge difference.
 Previously, when I wasn't paying attention to dough temperature:
 ![Whole wheat dough](/assets/images/2019-05-03-making-bread/whole-wheat-not-rising.jpg)
 Compared with what happens when I put the dough in the microwave for a defrost cycle because the
 water I used wasn't warm enough:
 ![White dough](/assets/images/2019-05-03-making-bread/white-dough-rising-before-fold.jpg)
 I mean, just look at the bubbles!
 ![White dough with bubbles](/assets/images/2019-05-03-making-bread/white-dough-rising-after-fold.jpg)
 After shaping the dough, I've got two loaves ready:
 ![Shaped loaves](/assets/images/2019-05-03-making-bread/shaped-loaves.jpg)
 Now, the recipe normally calls for a Dutch Oven to bake the bread because it keeps the dough from
 drying out in the oven. Because I don't own a Dutch Oven, I typically put a casserole dish on the
 bottom rack and fill it with water so there's still some moisture in the oven. This time, I forgot
 to add the water and learned my second lesson: never add room-temperature water to a glass dish
 that's currently at 500 degrees.
 ![Shattered glass dish](/assets/images/2019-05-03-making-bread/shattered-glass.jpg)
 Needless to say, trying to pull out sharp glass from an incredibly hot oven is not what I expected
 to be doing during my garden leave.
 In the end, the bread crust wasn't great, but the bread itself turned out pretty alright:
 ![Baked bread](/assets/images/2019-05-03-making-bread/final-product.jpg)
 I've been writing a lot more during this break, so I'm looking forward to sharing that in the
 future. In the meantime, I'm planning on making a sandwich.
@@ -1,296 +0,0 @@
 ---
 layout: post
 title: "On Building High Performance Systems"
 description: ""
 category:
 tags: []
 ---
 **Update 2019-09-21**: Added notes on `isolcpus` and `systemd` affinity.
 Prior to working in the trading industry, my assumption was that High Frequency Trading (HFT) is
 made up of people who have access to secret techniques mortal developers could only dream of. There
 had to be some secret art that could only be learned if one had an appropriately tragic backstory:
 <img src="/assets/images/2019-04-24-kung-fu.webp" alt="kung-fu fight">
 > How I assumed HFT people learn their secret techniques
 How else do you explain people working on systems that complete the round trip of market data in to
 orders out (a.k.a. tick-to-trade) consistently within
 [750-800 nanoseconds](https://stackoverflow.com/a/22082528/1454178)? In roughly the time it takes a
 computer to access
 [main memory 8 times](https://people.eecs.berkeley.edu/~rcs/research/interactive_latency.html),
 trading systems are capable of reading the market data packets, deciding what orders to send, doing
 risk checks, creating new packets for exchange-specific protocols, and putting those packets on the
 wire.
 Having now worked in the trading industry, I can confirm the developers aren't super-human; I've
 made some simple mistakes at the very least. Instead, what shows up in public discussions is that
 philosophy, not technique, separates high-performance systems from everything else.
 Performance-critical systems don't rely on "this one cool C++ optimization trick" to make code fast
 (though micro-optimizations have their place); there's a lot more to worry about than just the code
 written for the project.
 The framework I'd propose is this: **If you want to build high-performance systems, focus first on
 reducing performance variance** (reducing the gap between the fastest and slowest runs of the same
 code), **and only look at average latency once variance is at an acceptable level**.
 Don't get me wrong, I'm a much happier person when things are fast. Computer goes from booting in 20
 seconds down to 10 because I installed a solid-state drive? Awesome. But if every fifth day it takes
 a full minute to boot because of corrupted sectors? Not so great. Average speed over the course of a
 week is the same in each situation, but you're painfully aware of that minute when it happens. When
 it comes to code, the principle is the same: speeding up a function by an average of 10 milliseconds
 doesn't mean much if there's a 100ms difference between your fastest and slowest runs. When
 performance matters, you need to respond quickly _every time_, not just in aggregate.
 High-performance systems should first optimize for time variance. Once you're consistent at the time
 scale you care about, then focus on improving average time.
 This focus on variance shows up all the time in industry too (emphasis added in all quotes below):
 - In [marketing materials](https://business.nasdaq.com/market-tech/marketplaces/trading) for
  NASDAQ's matching engine, the most performance-sensitive component of the exchange, dependability
  is highlighted in addition to instantaneous metrics:
  > Able to **consistently sustain** an order rate of over 100,000 orders per second at sub-40
  > microsecond average latency
 - The [Aeron](https://github.com/real-logic/aeron) message bus has this to say about performance:
  > Performance is the key focus. Aeron is designed to be the highest throughput with the lowest and
  > **most predictable latency possible** of any messaging system
 - The company PolySync, which is working on autonomous vehicles,
  [mentions why](https://polysync.io/blog/session-types-for-hearty-codecs/) they picked their
  specific messaging format:
  > In general, high performance is almost always desirable for serialization. But in the world of
  > autonomous vehicles, **steady timing performance is even more important** than peak throughput.
  > This is because safe operation is sensitive to timing outliers. Nobody wants the system that
  > decides when to slam on the brakes to occasionally take 100 times longer than usual to encode
  > its commands.
 - [Solarflare](https://solarflare.com/), which makes highly-specialized network hardware, points out
  variance (jitter) as a big concern for
  [electronic trading](https://solarflare.com/electronic-trading/):
  > The high stakes world of electronic trading, investment banks, market makers, hedge funds and
  > exchanges demand the **lowest possible latency and jitter** while utilizing the highest
  > bandwidth and return on their investment.
 And to further clarify: we're not discussing _total run-time_, but variance of total run-time. There
 are situations where it's not reasonably possible to make things faster, and you'd much rather be
 consistent. For example, trading firms use
 [wireless networks](https://sniperinmahwah.wordpress.com/2017/06/07/network-effects-part-i/) because
 the speed of light through air is faster than through fiber-optic cables. There's still at _absolute
 minimum_ a [~33.76 millisecond](http://tinyurl.com/y2vd7tn8) delay required to send data between,
 say,
 [Chicago and Tokyo](https://www.theice.com/market-data/connectivity-and-feeds/wireless/tokyo-chicago).
 If a trading system in Chicago calls the function for "send order to Tokyo" and waits to see if a
 trade occurs, there's a physical limit to how long that will take. In this situation, the focus is
 on keeping variance of _additional processing_ to a minimum, since speed of light is the limiting
 factor.
 So how does one go about looking for and eliminating performance variance? To tell the truth, I
 don't think a systematic answer or flow-chart exists. There's no substitute for (A) building a deep
 understanding of the entire technology stack, and (B) actually measuring system performance (though
 (C) watching a lot of [CppCon](https://www.youtube.com/channel/UCMlGfpWw-RUdWX_JbLCukXg) videos for
 inspiration never hurt). Even then, every project cares about performance to a different degree; you
 may need to build an entire
 [replica production system](https://www.youtube.com/watch?v=NH1Tta7purM&feature=youtu.be&t=3015) to
 accurately benchmark at nanosecond precision, or you may be content to simply
 [avoid garbage collection](https://www.youtube.com/watch?v=BD9cRbxWQx8&feature=youtu.be&t=1335) in
 your Java code.
 Even though everyone has different needs, there are still common things to look for when trying to
 isolate and eliminate variance. In no particular order, these are my focus areas when thinking about
 high-performance systems:
 ## Language-specific
 **Garbage Collection**: How often does garbage collection happen? When is it triggered? What are the
 impacts?
 - [In Python](https://rushter.com/blog/python-garbage-collector/), individual objects are collected
  if the reference count reaches 0, and each generation is collected if
  `num_alloc - num_dealloc > gc_threshold` whenever an allocation happens. The GIL is acquired for
  the duration of generational collection.
 - Java has
  [many](https://docs.oracle.com/en/java/javase/12/gctuning/parallel-collector1.html#GUID-DCDD6E46-0406-41D1-AB49-FB96A50EB9CE)
  [different](https://docs.oracle.com/en/java/javase/12/gctuning/garbage-first-garbage-collector.html#GUID-ED3AB6D3-FD9B-4447-9EDF-983ED2F7A573)
  [collection](https://docs.oracle.com/en/java/javase/12/gctuning/garbage-first-garbage-collector-tuning.html#GUID-90E30ACA-8040-432E-B3A0-1E0440AB556A)
  [algorithms](https://docs.oracle.com/en/java/javase/12/gctuning/z-garbage-collector1.html#GUID-A5A42691-095E-47BA-B6DC-FB4E5FAA43D0)
  to choose from, each with different characteristics. The default algorithms (Parallel GC in Java
  8, G1 in Java 9) freeze the JVM while collecting, while more recent algorithms
  ([ZGC](https://wiki.openjdk.java.net/display/zgc) and
  [Shenandoah](https://wiki.openjdk.java.net/display/shenandoah)) are designed to keep "stop the
  world" to a minimum by doing collection work in parallel.
 **Allocation**: Every language has a different way of interacting with "heap" memory, but the
 principle is the same: running the allocator to allocate/deallocate memory takes time that can often
 be put to better use. Understanding when your language interacts with the allocator is crucial, and
 not always obvious. For example: C++ and Rust don't allocate heap memory for iterators, but Java
 does (meaning potential GC pauses). Take time to understand heap behavior (I made a
 [guide for Rust](/2019/02/understanding-allocations-in-rust.html)), and look into alternative
 allocators ([jemalloc](http://jemalloc.net/),
 [tcmalloc](https://gperftools.github.io/gperftools/tcmalloc.html)) that might run faster than the
 operating system default.
 **Data Layout**: How your data is arranged in memory matters;
 [data-oriented design](https://www.youtube.com/watch?v=yy8jQgmhbAU) and
 [cache locality](https://www.youtube.com/watch?v=2EWejmkKlxs&feature=youtu.be&t=1185) can have huge
 impacts on performance. The C family of languages (C, value types in C#, C++) and Rust all have
 guarantees about the shape every object takes in memory that others (e.g. Java and Python) can't
 make. [Cachegrind](http://valgrind.org/docs/manual/cg-manual.html) and kernel
 [perf](https://perf.wiki.kernel.org/index.php/Main_Page) counters are both great for understanding
 how performance relates to memory layout.
 **Just-In-Time Compilation**: Languages that are compiled on the fly (LuaJIT, C#, Java, PyPy) are
 great because they optimize your program for how it's actually being used, rather than how a
 compiler expects it to be used. However, there's a variance problem if the program stops executing
 while waiting for translation from VM bytecode to native code. As a remedy, many languages support
 ahead-of-time compilation in addition to the JIT versions
 ([CoreRT](https://github.com/dotnet/corert) in C# and [GraalVM](https://www.graalvm.org/) in Java).
 On the other hand, LLVM supports
 [Profile Guided Optimization](https://clang.llvm.org/docs/UsersManual.html#profile-guided-optimization),
 which theoretically brings JIT benefits to non-JIT languages. Finally, be careful to avoid comparing
 apples and oranges during benchmarks; you don't want your code to suddenly speed up because the JIT
 compiler kicked in.
 **Programming Tricks**: These won't make or break performance, but can be useful in specific
 circumstances. For example, C++ can use
 [templates instead of branches](https://www.youtube.com/watch?v=NH1Tta7purM&feature=youtu.be&t=1206)
 in critical sections.
 ## Kernel
 Code you wrote is almost certainly not the _only_ code running on your hardware. There are many ways
 the operating system interacts with your program, from interrupts to system calls, that are
 important to watch for. These are written from a Linux perspective, but Windows does typically have
 equivalent functionality.
 **Scheduling**: The kernel is normally free to schedule any process on any core, so it's important
 to reserve CPU cores exclusively for the important programs. There are a few parts to this: first,
 limit the CPU cores that non-critical processes are allowed to run on by excluding cores from
 scheduling
 ([`isolcpus`](https://www.linuxtopia.org/online_books/linux_kernel/kernel_configuration/re46.html)
 kernel command-line option), or by setting the `init` process CPU affinity
 ([`systemd` example](https://access.redhat.com/solutions/2884991)). Second, set critical processes
 to run on the isolated cores by setting the
 [processor affinity](https://en.wikipedia.org/wiki/Processor_affinity) using
 [taskset](https://linux.die.net/man/1/taskset). Finally, use
 [`NO_HZ`](https://github.com/torvalds/linux/blob/master/Documentation/timers/NO_HZ.txt) or
 [`chrt`](https://linux.die.net/man/1/chrt) to disable scheduling interrupts. Turning off
 hyper-threading is also likely beneficial.
 **System calls**: Reading from a UNIX socket? Writing to a file? In addition to not knowing how long
 the I/O operation takes, these all trigger expensive
 [system calls (syscalls)](https://en.wikipedia.org/wiki/System_call). To handle these, the CPU must
 [context switch](https://en.wikipedia.org/wiki/Context_switch) to the kernel, let the kernel
 operation complete, then context switch back to your program. We'd rather keep these
 [to a minimum](https://www.destroyallsoftware.com/talks/the-birth-and-death-of-javascript) (see
 timestamp 18:20). [Strace](https://linux.die.net/man/1/strace) is your friend for understanding when
 and where syscalls happen.
 **Signal Handling**: Far less likely to be an issue, but signals do trigger a context switch if your
 code has a handler registered. This will be highly dependent on the application, but you can
 [block signals](https://www.linuxprogrammingblog.com/all-about-linux-signals?page=show#Blocking_signals)
 if it's an issue.
 **Interrupts**: System interrupts are how devices connected to your computer notify the CPU that
 something has happened. The CPU will then choose a processor core to pause and context switch to the
 OS to handle the interrupt. Make sure that
 [SMP affinity](http://www.alexonlinux.com/smp-affinity-and-proper-interrupt-handling-in-linux) is
 set so that interrupts are handled on a CPU core not running the program you care about.
 **[NUMA](https://www.kernel.org/doc/html/latest/vm/numa.html)**: While NUMA is good at making
 multi-cell systems transparent, there are variance implications; if the kernel moves a process
 across nodes, future memory accesses must wait for the controller on the original node. Use
 [numactl](https://linux.die.net/man/8/numactl) to handle memory-/cpu-cell pinning so this doesn't
 happen.
 ## Hardware
 **CPU Pipelining/Speculation**: Speculative execution in modern processors gave us vulnerabilities
 like Spectre, but it also gave us performance improvements like
 [branch prediction](https://stackoverflow.com/a/11227902/1454178). And if the CPU mis-speculates
 your code, there's variance associated with rewind and replay. While the compiler knows a lot about
 how your CPU [pipelines instructions](https://youtu.be/nAbCKa0FzjQ?t=4467), code can be
 [structured to help](https://www.youtube.com/watch?v=NH1Tta7purM&feature=youtu.be&t=755) the branch
 predictor.
 **Paging**: For most systems, virtual memory is incredible. Applications live in their own worlds,
 and the CPU/[MMU](https://en.wikipedia.org/wiki/Memory_management_unit) figures out the details.
 However, there's a variance penalty associated with memory paging and caching; if you access more
 memory pages than the [TLB](https://en.wikipedia.org/wiki/Translation_lookaside_buffer) can store,
 you'll have to wait for the page walk. Kernel perf tools are necessary to figure out if this is an
 issue, but using [huge pages](https://blog.pythian.com/performance-tuning-hugepages-in-linux/) can
 reduce TLB burdens. Alternately, running applications in a hypervisor like
 [Jailhouse](https://github.com/siemens/jailhouse) allows one to skip virtual memory entirely, but
 this is probably more work than the benefits are worth.
 **Network Interfaces**: When more than one computer is involved, variance can go up dramatically.
 Tuning kernel
 [network parameters](https://github.com/leandromoreira/linux-network-performance-parameters) may be
 helpful, but modern systems more frequently opt to skip the kernel altogether with a technique
 called [kernel bypass](https://blog.cloudflare.com/kernel-bypass/). This typically requires
 specialized hardware and [drivers](https://www.openonload.org/), but even industries like
 [telecom](https://www.bbc.co.uk/rd/blog/2018-04-high-speed-networking-open-source-kernel-bypass) are
 finding the benefits.
 ## Networks
 **Routing**: There's a reason financial firms are willing to pay
 [millions of euros](https://sniperinmahwah.wordpress.com/2019/03/26/4-les-moeres-english-version/)
 for rights to a small plot of land - having a straight-line connection from point A to point B means
 the path their data takes is the shortest possible. In contrast, there are currently 6 computers in
 between me and Google, but that may change at any moment if my ISP realizes a
 [more efficient route](https://en.wikipedia.org/wiki/Border_Gateway_Protocol) is available. Whether
 it's using
 [research-quality equipment](https://sniperinmahwah.wordpress.com/2018/05/07/shortwave-trading-part-i-the-west-chicago-tower-mystery/)
 for shortwave radio, or just making sure there's no data inadvertently going between data centers,
 routing matters.
 **Protocol**: TCP as a network protocol is awesome: guaranteed and in-order delivery, flow control,
 and congestion control all built in. But these attributes make the most sense when networking
 infrastructure is lossy; for systems that expect nearly all packets to be delivered correctly, the
 setup handshaking and packet acknowledgment are just overhead. Using UDP (unicast or multicast) may
 make sense in these contexts as it avoids the chatter needed to track connection state, and
 [gap-fill](https://iextrading.com/docs/IEX%20Transport%20Specification.pdf)
 [strategies](http://www.nasdaqtrader.com/content/technicalsupport/specifications/dataproducts/moldudp64.pdf)
 can handle the rest.
 **Switching**: Many routers/switches handle packets using "store-and-forward" behavior: wait for the
 whole packet, validate checksums, and then send to the next device. In variance terms, the time
 needed to move data between two nodes is proportional to the size of that data; the switch must
 "store" all data before it can calculate checksums and "forward" to the next node. With
 ["cut-through"](https://www.networkworld.com/article/2241573/latency-and-jitter--cut-through-design-pays-off-for-arista--blade.html)
 designs, switches will begin forwarding data as soon as they know where the destination is,
 checksums be damned. This means there's a fixed cost (at the switch) for network traffic, no matter
 the size.
 # Final Thoughts
 High-performance systems, regardless of industry, are not magical. They do require extreme precision
 and attention to detail, but they're designed, built, and operated by regular people, using a lot of
 tools that are publicly available. Interested in seeing how context switching affects performance of
 your benchmarks? `taskset` should be installed in all modern Linux distributions, and can be used to
 make sure the OS never migrates your process. Curious how often garbage collection triggers during a
 crucial operation? Your language of choice will typically expose details of its operations
 ([Python](https://docs.python.org/3/library/gc.html),
 [Java](https://www.oracle.com/technetwork/java/javase/tech/vmoptions-jsp-140102.html#DebuggingOptions)).
 Want to know how hard your program is stressing the TLB? Use `perf record` and look for
 `dtlb_load_misses.miss_causes_a_walk`.
 Two final guiding questions, then: first, before attempting to apply some of the technology above to
 your own systems, can you identify
 [where/when you care](http://wiki.c2.com/?PrematureOptimization) about "high-performance"? As an
 example, if parts of a system rely on humans pushing buttons, CPU pinning won't have any measurable
 effect. Humans are already far too slow to react in time. Second, if you're using benchmarks, are
 they being designed in a way that's actually helpful? Tools like
 [Criterion](http://www.serpentine.com/criterion/) (also in
 [Rust](https://github.com/bheisler/criterion.rs)) and Google's
 [Benchmark](https://github.com/google/benchmark) output not only average run time, but variance as
 well; your benchmarking environment is subject to the same concerns your production environment is.
 Finally, I believe high-performance systems are a matter of philosophy, not necessarily technique.
 Rigorous focus on variance is the first step, and there are plenty of ways to measure and mitigate
 it; once that's at an acceptable level, then optimize for speed.
@@ -1,263 +0,0 @@
 ---
 layout: post
 title: "Binary Format Shootout"
 description: "Cap'n Proto vs. Flatbuffers vs. SBE"
 category:
 tags: [rust]
 ---
 I've found that in many personal projects,
 [analysis paralysis](https://en.wikipedia.org/wiki/Analysis_paralysis) is particularly deadly.
 Making good decisions in the beginning avoids pain and suffering later; if extra research prevents
 future problems, I'm happy to continue ~~procrastinating~~ researching indefinitely.
 So let's say you're in need of a binary serialization format. Data will be going over the network,
 not just in memory, so having a schema document and code generation is a must. Performance is
 crucial, so formats that support zero-copy de/serialization are given priority. And the more
 languages supported, the better; I use Rust, but can't predict what other languages this could
 interact with.
 Given these requirements, the candidates I could find were:
 1. [Cap'n Proto](https://capnproto.org/) has been around the longest, and is the most established
 2. [Flatbuffers](https://google.github.io/flatbuffers/) is the newest, and claims to have a simpler
   encoding
 3. [Simple Binary Encoding](https://github.com/real-logic/simple-binary-encoding) has the simplest
   encoding, but the Rust implementation is unmaintained
 Any one of these will satisfy the project requirements: easy to transmit over a network, reasonably
 fast, and polyglot support. But how do you actually pick one? It's impossible to know what issues
 will follow that choice, so I tend to avoid commitment until the last possible moment.
 Still, a choice must be made. Instead of worrying about which is "the best," I decided to build a
 small proof-of-concept system in each format and pit them against each other. All code can be found
 in the [repository](https://github.com/speice-io/marketdata-shootout) for this post.
 We'll discuss more in detail, but a quick preview of the results:
 - Cap'n Proto: Theoretically performs incredibly well, but the implementation had issues
 - Flatbuffers: Has some quirks, but largely lived up to its "zero-copy" promises
 - SBE: Best median and worst-case performance, but the message structure has a limited feature set
 # Prologue: Binary Parsing with Nom
 Our benchmark system will be a simple data processor; given depth-of-book market data from
 [IEX](https://iextrading.com/trading/market-data/#deep), serialize each message into the schema
 format, read it back, and calculate total size of stock traded and the lowest/highest quoted prices.
 This test isn't complex, but is representative of the project I need a binary format for.
 But before we make it to that point, we have to actually read in the market data. To do so, I'm
 using a library called [`nom`](https://github.com/Geal/nom). Version 5.0 was recently released and
 brought some big changes, so this was an opportunity to build a non-trivial program and get
 familiar.
 If you don't already know about `nom`, it's a "parser combinator" library. By combining different smaller
 parsers, you can assemble a parser to handle complex structures without writing tedious code by
 hand. For example, when parsing
 [PCAP files](https://www.winpcap.org/ntar/draft/PCAP-DumpFileFormat.html#rfc.section.3.3):
 ```
   0                   1                   2                   3
   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
   +---------------------------------------------------------------+
 0 |                    Block Type = 0x00000006                    |
   +---------------------------------------------------------------+
 4 |                      Block Total Length                       |
   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 8 |                         Interface ID                          |
   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 12 |                        Timestamp (High)                       |
   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 16 |                        Timestamp (Low)                        |
   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 20 |                         Captured Len                          |
   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 24 |                          Packet Len                           |
   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   |                          Packet Data                          |
   |                              ...                              |
 ```
 ...you can build a parser in `nom` that looks like
 [this](https://github.com/speice-io/marketdata-shootout/blob/369613843d39cfdc728e1003123bf87f79422497/src/parsers.rs#L59-L93):
 ```rust
 const ENHANCED_PACKET: [u8; 4] = [0x06, 0x00, 0x00, 0x00];
 pub fn enhanced_packet_block(input: &[u8]) -> IResult<&[u8], &[u8]> {
    let (
        remaining,
        (
            block_type,
            block_len,
            interface_id,
            timestamp_high,
            timestamp_low,
            captured_len,
            packet_len,
        ),
    ) = tuple((
        tag(ENHANCED_PACKET),
        le_u32,
        le_u32,
        le_u32,
        le_u32,
        le_u32,
        le_u32,
    ))(input)?;
    let (remaining, packet_data) = take(captured_len)(remaining)?;
    Ok((remaining, packet_data))
 }
 ```
 While this example isn't too interesting, more complex formats (like IEX market data) are where
 [`nom` really shines](https://github.com/speice-io/marketdata-shootout/blob/369613843d39cfdc728e1003123bf87f79422497/src/iex.rs).
 Ultimately, because the `nom` code in this shootout was the same for all formats, we're not too
 interested in its performance. Still, it's worth mentioning that building the market data parser was
 actually fun; I didn't have to write tons of boring code by hand.
 # Part 1: Cap'n Proto
 Now it's time to get into the meaty part of the story. Cap'n Proto was the first format I tried
 because of how long it has supported Rust (thanks to [dwrensha](https://github.com/dwrensha) for
 maintaining the Rust port since
 [2014!](https://github.com/capnproto/capnproto-rust/releases/tag/rustc-0.10)). However, I had a ton
 of performance concerns once I started using it.
 To serialize new messages, Cap'n Proto uses a "builder" object. This builder allocates memory on the
 heap to hold the message content, but because builders
 [can't be re-used](https://github.com/capnproto/capnproto-rust/issues/111), we have to allocate a
 new buffer for every single message. I was able to work around this with a
 [special builder](https://github.com/speice-io/marketdata-shootout/blob/369613843d39cfdc728e1003123bf87f79422497/src/capnp_runner.rs#L17-L51)
 that could re-use the buffer, but it required reading through Cap'n Proto's
 [benchmarks](https://github.com/capnproto/capnproto-rust/blob/master/benchmark/benchmark.rs#L124-L156)
 to find an example, and used
 [`std::mem::transmute`](https://doc.rust-lang.org/std/mem/fn.transmute.html) to bypass Rust's borrow
 checker.
 The process of reading messages was better, but still had issues. Cap'n Proto has two message
 encodings: a ["packed"](https://capnproto.org/encoding.html#packing) representation, and an
 "unpacked" version. When reading "packed" messages, we need a buffer to unpack the message into
 before we can use it; Cap'n Proto allocates a new buffer for each message we unpack, and I wasn't
 able to figure out a way around that. In contrast, the unpacked message format should be where Cap'n
 Proto shines; its main selling point is that there's [no decoding step](https://capnproto.org/).
 However, accomplishing zero-copy deserialization required code in the private API
 ([since fixed](https://github.com/capnproto/capnproto-rust/issues/148)), and we allocate a vector on
 every read for the segment table.
 In the end, I put in significant work to make Cap'n Proto as fast as possible, but there were too
 many issues for me to feel comfortable using it long-term.
 # Part 2: Flatbuffers
 This is the new kid on the block. After a
 [first attempt](https://github.com/google/flatbuffers/pull/3894) didn't pan out, official support
 was [recently launched](https://github.com/google/flatbuffers/pull/4898). Flatbuffers intends to
 address the same problems as Cap'n Proto: high-performance, polyglot, binary messaging. The
 difference is that Flatbuffers claims to have a simpler wire format and
 [more flexibility](https://google.github.io/flatbuffers/flatbuffers_benchmarks.html).
 On the whole, I enjoyed using Flatbuffers; the [tooling](https://crates.io/crates/flatc-rust) is
 nice, and unlike Cap'n Proto, parsing messages was actually zero-copy and zero-allocation. However,
 there were still some issues.
 First, Flatbuffers (at least in Rust) can't handle nested vectors. This is a problem for formats
 like the following:
 ```
 table Message {
  symbol: string;
 }
 table MultiMessage {
  messages:[Message];
 }
 ```
 We want to create a `MultiMessage` which contains a vector of `Message`, and each `Message` itself
 contains a vector (the `string` type). I was able to work around this by
 [caching `Message` elements](https://github.com/speice-io/marketdata-shootout/blob/e9d07d148bf36a211a6f86802b313c4918377d1b/src/flatbuffers_runner.rs#L83)
 in a `SmallVec` before building the final `MultiMessage`, but it was a painful process that I
 believe contributed to poor serialization performance.
 Second, streaming support in Flatbuffers seems to be something of an
 [afterthought](https://github.com/google/flatbuffers/issues/3898). Where Cap'n Proto in Rust handles
 reading messages from a stream as part of the API, Flatbuffers just sticks a `u32` at the front of
 each message to indicate the size. Not specifically a problem, but calculating message size without
 that tag is nigh on impossible.
 Ultimately, I enjoyed using Flatbuffers, and had to do significantly less work to make it perform
 well.
 # Part 3: Simple Binary Encoding
 Support for SBE was added by the author of one of my favorite
 [Rust blog posts](https://web.archive.org/web/20190427124806/https://polysync.io/blog/session-types-for-hearty-codecs/).
 I've [talked previously]({% post_url 2019-06-31-high-performance-systems %}) about how important
 variance is in high-performance systems, so it was encouraging to read about a format that
 [directly addressed](https://github.com/real-logic/simple-binary-encoding/wiki/Why-Low-Latency) my
 concerns. SBE has by far the simplest binary format, but it does make some tradeoffs.
 Both Cap'n Proto and Flatbuffers use [message offsets](https://capnproto.org/encoding.html#structs)
 to handle variable-length data, [unions](https://capnproto.org/language.html#unions), and various
 other features. In contrast, messages in SBE are essentially
 [just structs](https://github.com/real-logic/simple-binary-encoding/blob/master/sbe-samples/src/main/resources/example-schema.xml);
 variable-length data is supported, but there's no union type.
 As mentioned in the beginning, the Rust port of SBE works well, but is
 [essentially unmaintained](https://users.rust-lang.org/t/zero-cost-abstraction-frontier-no-copy-low-allocation-ordered-decoding/11515/9).
 However, if you don't need union types, and can accept that schemas are XML documents, it's still
 worth using. SBE's implementation had the best streaming support of all formats I tested, and
 doesn't trigger allocation during de/serialization.
 # Results
 After building a test harness
 [for](https://github.com/speice-io/marketdata-shootout/blob/master/src/capnp_runner.rs)
 [each](https://github.com/speice-io/marketdata-shootout/blob/master/src/flatbuffers_runner.rs)
 [format](https://github.com/speice-io/marketdata-shootout/blob/master/src/sbe_runner.rs), it was
 time to actually take them for a spin. I used
 [this script](https://github.com/speice-io/marketdata-shootout/blob/master/run_shootout.sh) to run
 the benchmarks, and the raw results are
 [here](https://github.com/speice-io/marketdata-shootout/blob/master/shootout.csv). All data reported
 below is the average of 10 runs on a single day of IEX data. Results were validated to make sure
 that each format parsed the data correctly.
 ## Serialization
 This test measures, on a
 [per-message basis](https://github.com/speice-io/marketdata-shootout/blob/master/src/main.rs#L268-L272),
 how long it takes to serialize the IEX message into the desired format and write to a pre-allocated
 buffer.
 | Schema               | Median | 99th Pctl | 99.9th Pctl | Total  |
 | :------------------- | :----- | :-------- | :---------- | :----- |
 | Cap'n Proto Packed   | 413ns  | 1751ns    | 2943ns      | 14.80s |
 | Cap'n Proto Unpacked | 273ns  | 1828ns    | 2836ns      | 10.65s |
 | Flatbuffers          | 355ns  | 2185ns    | 3497ns      | 14.31s |
 | SBE                  | 91ns   | 1535ns    | 2423ns      | 3.91s  |
 ## Deserialization
 This test measures, on a
 [per-message basis](https://github.com/speice-io/marketdata-shootout/blob/master/src/main.rs#L294-L298),
 how long it takes to read the previously-serialized message and perform some basic aggregation. The
 aggregation code is the same for each format, so any performance differences are due solely to the
 format implementation.
 | Schema               | Median | 99th Pctl | 99.9th Pctl | Total  |
 | :------------------- | :----- | :-------- | :---------- | :----- |
 | Cap'n Proto Packed   | 539ns  | 1216ns    | 2599ns      | 18.92s |
 | Cap'n Proto Unpacked | 366ns  | 737ns     | 1583ns      | 12.32s |
 | Flatbuffers          | 173ns  | 421ns     | 1007ns      | 6.00s  |
 | SBE                  | 116ns  | 286ns     | 659ns       | 4.05s  |
 # Conclusion
 Building a benchmark turned out to be incredibly helpful in making a decision; because a "union"
 type isn't important to me, I can be confident that SBE best addresses my needs.
 While SBE was the fastest in terms of both median and worst-case performance, its worst-case
 performance was proportionately far higher than that of any other format. It seems that
 de/serialization time scales with message size, but I'll need to do some more research to understand
 what exactly is going on.
@@ -1,370 +0,0 @@
 ---
 layout: post
 title: "Release the GIL"
 description: "Strategies for Parallelism in Python"
 category:
 tags: [python]
 ---
 Complaining about the [Global Interpreter Lock](https://wiki.python.org/moin/GlobalInterpreterLock)
 (GIL) seems like a rite of passage for Python developers. It's easy to criticize a design decision
 made before multi-core CPUs were widely available, but the fact that it's still around indicates
 that it generally works [Good](https://wiki.c2.com/?PrematureOptimization)
 [Enough](https://wiki.c2.com/?YouArentGonnaNeedIt). Besides, there are simple and effective
 workarounds; it's not hard to start a
 [new process](https://docs.python.org/3/library/multiprocessing.html) and use message passing to
 synchronize code running in parallel.
 Still, wouldn't it be nice to have more than a single active interpreter thread? In an age of
 asynchronicity and _M:N_ threading, Python seems lacking. The ideal scenario is to take advantage of
 both Python's productivity and the modern CPU's parallel capabilities.
 Presented below are two strategies for releasing the GIL's icy grip without giving up on what makes
 Python a nice language to start with. Bear in mind: these are just the tools; no claim is made about
 whether it's a good idea to use them. Very often, unlocking the GIL is an
 [XY problem](https://en.wikipedia.org/wiki/XY_problem); you want application performance, and the
 GIL seems like an obvious bottleneck. Remember that any gains from running code in parallel come at
 the expense of project complexity; messing with the GIL is ultimately messing with Python's memory
 model.
 ```python
 %load_ext Cython
 from numba import jit
 N = 1_000_000_000
 ```
 # Cython
 Put simply, [Cython](https://cython.org/) is a programming language that looks a lot like Python,
 gets [transpiled](https://en.wikipedia.org/wiki/Source-to-source_compiler) to C/C++, and integrates
 well with the [CPython](https://en.wikipedia.org/wiki/CPython) API. It's great for building Python
 wrappers to C and C++ libraries, writing optimized code for numerical processing, and tons more. And
 when it comes to managing the GIL, there are two special features:
 - The `nogil`
  [function annotation](https://cython.readthedocs.io/en/latest/src/userguide/external_C_code.html#declaring-a-function-as-callable-without-the-gil)
  asserts that a Cython function is safe to use without the GIL, and compilation will fail if it
  interacts with Python in an unsafe manner
 - The `with nogil`
  [context manager](https://cython.readthedocs.io/en/latest/src/userguide/external_C_code.html#releasing-the-gil)
  explicitly unlocks the CPython GIL while active
 Whenever Cython code runs inside a `with nogil` block on a separate thread, the Python interpreter
 is unblocked and allowed to continue work elsewhere. We'll define a "busy work" function that
 demonstrates this principle in action:
 ```python
 %%cython
 # Annotating a function with `nogil` indicates only that it is safe
 # to call in a `with nogil` block. It *does not* release the GIL.
 cdef unsigned long fibonacci(unsigned long n) nogil:
    if n <= 1:
        return n
    cdef unsigned long a = 0, b = 1, c = 0
    c = a + b
    for _i in range(2, n):
        a = b
        b = c
        c = a + b
    return c
 def cython_nogil(unsigned long n):
    # Explicitly release the GIL while running `fibonacci`
    with nogil:
        value = fibonacci(n)
    return value
 def cython_gil(unsigned long n):
    # Because the GIL is not explicitly released, it implicitly
    # remains acquired when running the `fibonacci` function
    return fibonacci(n)
 ```
 First, let's time how long it takes Cython to calculate the billionth Fibonacci number:
 ```python
 %%time
 _ = cython_gil(N);
 ```
 > <pre>
 > CPU times: user 365 ms, sys: 0 ns, total: 365 ms
 > Wall time: 372 ms
 > </pre>
 ```python
 %%time
 _ = cython_nogil(N);
 ```
 > <pre>
 > CPU times: user 381 ms, sys: 0 ns, total: 381 ms
 > Wall time: 388 ms
 > </pre>
 Both versions (with and without GIL) take effectively the same amount of time to run. Even when
 running this calculation in parallel on separate threads, it is expected that the run time will
 double because only one thread can be active at a time:
 ```python
 %%time
 from threading import Thread
 # Create the two threads to run on
 t1 = Thread(target=cython_gil, args=[N])
 t2 = Thread(target=cython_gil, args=[N])
 # Start the threads
 t1.start(); t2.start()
 # Wait for the threads to finish
 t1.join(); t2.join()
 ```
 > <pre>
 > CPU times: user 641 ms, sys: 5.62 ms, total: 647 ms
 > Wall time: 645 ms
 > </pre>
 However, if the first thread releases the GIL, the second thread is free to acquire it and run in
 parallel:
 ```python
 %%time
 t1 = Thread(target=cython_nogil, args=[N])
 t2 = Thread(target=cython_gil, args=[N])
 t1.start(); t2.start()
 t1.join(); t2.join()
 ```
 > <pre>
 > CPU times: user 717 ms, sys: 372 µs, total: 718 ms
 > Wall time: 358 ms
 > </pre>
 Because `user` time represents the sum of processing time on all threads, it doesn't change much.
 The ["wall time"](https://en.wikipedia.org/wiki/Elapsed_real_time) has been cut roughly in half
 because each function is running simultaneously.
 Keep in mind that the **order in which threads are started** makes a difference!
 ```python
 %%time
 # Note that the GIL-locked version is started first
 t1 = Thread(target=cython_gil, args=[N])
 t2 = Thread(target=cython_nogil, args=[N])
 t1.start(); t2.start()
 t1.join(); t2.join()
 ```
 > <pre>
 > CPU times: user 667 ms, sys: 0 ns, total: 667 ms
 > Wall time: 672 ms
 > </pre>
 Even though the second thread releases the GIL while running, it can't start until the first has
 completed. Thus, the overall runtime is effectively the same as running two GIL-locked threads.
 Finally, be aware that attempting to unlock the GIL from a thread that doesn't own it will crash the
 **interpreter**, not just the thread attempting the unlock:
 ```python
 %%cython
 cdef int cython_recurse(int n) nogil:
    if n <= 0:
        return 0
    with nogil:
        return cython_recurse(n - 1)
 cython_recurse(2)
 ```
 > <pre>
 > Fatal Python error: PyEval_SaveThread: NULL tstate
 > 
 > Thread 0x00007f499effd700 (most recent call first):
 >   File "/home/bspeice/.virtualenvs/release-the-gil/lib/python3.7/site-packages/ipykernel/parentpoller.py", line 39 in run
 >   File "/usr/lib/python3.7/threading.py", line 926 in _bootstrap_inner
 >   File "/usr/lib/python3.7/threading.py", line 890 in _bootstrap
 > </pre>
 In practice, avoiding this issue is simple. First, `nogil` functions probably shouldn't contain
 `with nogil` blocks. Second, Cython can
 [conditionally acquire/release](https://cython.readthedocs.io/en/latest/src/userguide/external_C_code.html#conditional-acquiring-releasing-the-gil)
 the GIL, so these conditions can be used to synchronize access. Finally, Cython's documentation for
 [external C code](https://cython.readthedocs.io/en/latest/src/userguide/external_C_code.html#acquiring-and-releasing-the-gil)
 contains more detail on how to safely manage the GIL.
 To conclude: use Cython's `nogil` annotation to assert that functions are safe for calling when the
 GIL is unlocked, and `with nogil` to actually unlock the GIL and run those functions.
 # Numba
 Like Cython, [Numba](https://numba.pydata.org/) is a "compiled Python." Where Cython works by
 compiling a Python-like language to C/C++, Numba compiles Python bytecode _directly to machine code_
 at runtime. Behavior is controlled with a special `@jit` decorator; calling a decorated function
 first compiles it to machine code before running. Calling the function a second time re-uses that
 machine code unless the argument types have changed.
 Numba works best when a `nopython=True` argument is added to the `@jit` decorator; functions
 compiled in [`nopython`](http://numba.pydata.org/numba-doc/latest/user/jit.html?#nopython) mode
 avoid the CPython API and have performance comparable to C. Further, adding `nogil=True` to the
 `@jit` decorator unlocks the GIL while that function is running. Note that `nogil` and `nopython`
 are separate arguments; while it is necessary for code to be compiled in `nopython` mode in order to
 release the lock, the GIL will remain locked if `nogil=False` (the default).
 Let's repeat the same experiment, this time using Numba instead of Cython:
 ```python
 # The `int` type annotation is only for humans and is ignored
 # by Numba.
@jit(nopython=True, nogil=True)
 def numba_nogil(n: int) -> int:
    if n <= 1:
        return n
    a = 0
    b = 1
    c = a + b
    for _i in range(2, n):
        a = b
        b = c
        c = a + b
    return c
 # Run using `nopython` mode to receive a performance boost,
 # but GIL remains locked due to `nogil=False` by default.
@jit(nopython=True)
 def numba_gil(n: int) -> int:
    if n <= 1:
        return n
    a = 0
    b = 1
    c = a + b
    for _i in range(2, n):
        a = b
        b = c
        c = a + b
    return c
 # Call each function once to force compilation; we don't want
 # the timing statistics to include how long it takes to compile.
 numba_nogil(N)
 numba_gil(N);
 ```
 We'll perform the same tests as above; first, figure out how long it takes the function to run:
 ```python
 %%time
 _ = numba_gil(N)
 ```
 > <pre>
 > CPU times: user 253 ms, sys: 258 µs, total: 253 ms
 > Wall time: 251 ms
 > </pre>
 <span style="font-size: .8em">
 Aside: it's not immediately clear why Numba takes ~30% less time to run than Cython for code that should be
 effectively identical after compilation.
 </span>
 When running two GIL-locked threads, the result (as expected) takes around twice as long to compute:
 ```python
 %%time
 t1 = Thread(target=numba_gil, args=[N])
 t2 = Thread(target=numba_gil, args=[N])
 t1.start(); t2.start()
 t1.join(); t2.join()
 ```
 > <pre>
 > CPU times: user 541 ms, sys: 3.96 ms, total: 545 ms
 > Wall time: 541 ms
 > </pre>
 But if the GIL-unlocking thread starts first, both threads run in parallel:
 ```python
 %%time
 t1 = Thread(target=numba_nogil, args=[N])
 t2 = Thread(target=numba_gil, args=[N])
 t1.start(); t2.start()
 t1.join(); t2.join()
 ```
 > <pre>
 > CPU times: user 551 ms, sys: 7.77 ms, total: 559 ms
 > Wall time: 279 ms
 > </pre>
 Just like Cython, starting the GIL-locked thread first leads to poor performance:
 ```python
 %%time
 t1 = Thread(target=numba_gil, args=[N])
 t2 = Thread(target=numba_nogil, args=[N])
 t1.start(); t2.start()
 t1.join(); t2.join()
 ```
 > <pre>
 > CPU times: user 524 ms, sys: 0 ns, total: 524 ms
 > Wall time: 522 ms
 > </pre>
 Finally, unlike Cython, Numba will unlock the GIL if and only if it is currently acquired;
 recursively calling `@jit(nogil=True)` functions is perfectly safe:
 ```python
 from numba import jit
@jit(nopython=True, nogil=True)
 def numba_recurse(n: int) -> int:
    if n <= 0:
        return 0
    return numba_recurse(n - 1)
 numba_recurse(2);
 ```
 # Conclusion
 Before finishing, it's important to address pain points that will show up if these techniques are
 used in a more realistic project:
 First, code running in a GIL-free context will likely also need non-trivial data structures;
 GIL-free functions aren't useful if they're constantly interacting with Python objects whose access
 requires the GIL. Cython provides
 [extension types](http://docs.cython.org/en/latest/src/tutorial/cdef_classes.html) and Numba
 provides a [`@jitclass`](https://numba.pydata.org/numba-doc/dev/user/jitclass.html) decorator to
 address this need.
 Second, building and distributing applications that make use of Cython/Numba can be complicated.
 Cython packages require running the compiler, (potentially) linking/packaging external dependencies,
 and distributing a binary wheel. Numba is generally simpler because the code being distributed is
 pure Python, but can be tricky since errors aren't detected until runtime.
 Finally, while unlocking the GIL is often a solution in search of a problem, both Cython and Numba
 provide tools to directly manage the GIL when appropriate. This enables true parallelism (not just
 [concurrency](https://stackoverflow.com/a/1050257)) that is impossible in vanilla Python.
@@ -1,60 +0,0 @@
 ---
 layout: post
 title: "The webpack industrial complex"
 description: "Reflections on a new project"
 category:
 tags: [webpack, react, vite]
 ---
 This started because I wanted to build a synthesizer. Setting a goal of "digital DX7" was ambitious, but I needed something unrelated to the day job. Beyond that, working with audio seemed like a good challenge. I enjoy performance-focused code, and performance problems in audio are conspicuous. Building a web project was an obvious choice because of the web audio API documentation and independence from a large Digital Audio Workstation (DAW).
 The project was soon derailed trying to sort out technical issues unrelated to the original purpose. Finding a resolution was a frustrating journey, and it's still not clear whether those problems were my fault. As a result, I'm writing this to try making sense of it, as a case study/reference material, and to salvage something from the process.
 ## Starting strong
 The sole starting requirement was to write everything in TypeScript. Not because of project scale, but because guardrails help with unfamiliar territory. Keeping that in mind, the first question was: how does one start a new project? All I actually need is "compile TypeScript, show it in a browser."
 Create React App (CRA) came to the rescue and the rest of that evening was a joy. My TypeScript/JavaScript skills were rusty, but the online documentation was helpful. I had never understood the appeal of JSX (why put a DOM in JavaScript?) until it made connecting an `onEvent` handler and a function easy.
 Some quick dimensional analysis later and there was a sine wave oscillator playing A=440 through the speakers. I specifically remember thinking "modern browsers are magical."
 ## Continuing on
 Now comes the first mistake: I began to worry about "scale" before encountering an actual problem. Rather than rendering audio in the main thread, why not use audio worklets and render in a background thread instead?
 The first sign something was amiss came from the TypeScript compiler errors showing the audio worklet API [was missing](https://github.com/microsoft/TypeScript/issues/28308). After searching out GitHub issues and (unsuccessfully) tweaking the `tsconfig.json` settings, I settled on installing a package and moving on.
 The next problem came from actually using the API. Worklets must load from separate "modules," but it wasn't clear how to guarantee the worklet code stayed separate from the application. I saw recommendations to use `new URL(<local path>, import.meta.url)` and it worked! Well, kind of:
 ![Browser error](/assets/images/2022-11-20-video_mp2t.png)
 That file has the audio processor code, so why does it get served with `Content-Type: video/mp2t`?
 ## Floundering about
 Now comes the second mistake: even though I didn't understand the error, I ignored recommendations to [just use JavaScript](https://hackernoon.com/implementing-audioworklets-with-react-8a80a470474) and stuck by the original TypeScript requirement.
 I tried different project structures. Moving the worklet code to a new folder didn't help, nor did setting up a monorepo and placing it in a new package.
 I tried three different CRA tools - `react-app-rewired`, `craco`, `customize-react-app` - but got the same problem. Each has varying levels of compatibility with recent CRA versions, so it wasn't clear if I had the right solution but implemented it incorrectly. After attempting to eject the application and panicking after seeing the configuration, I abandoned that as well.
 I tried changing the webpack configuration: using [new](https://github.com/webpack/webpack/issues/11543#issuecomment-917673256) [loaders](https://github.com/popelenkow/worker-url), setting [asset rules](https://github.com/webpack/webpack/discussions/14093#discussioncomment-1257149), even [changing how webpack detects worker resources](https://github.com/webpack/webpack/issues/11543#issuecomment-826897590). In hindsight, entry points may have been the answer. But because CRA actively resists attempts to change its webpack configuration, and I couldn't find audio worklet examples in any other framework, I gave up.
 I tried so many application frameworks. Next.js looked like a good candidate, but added its own [bespoke webpack complexity](https://github.com/vercel/next.js/issues/24907) to the existing confusion. Astro had the best "getting started" experience, but I refuse to install an IDE-specific plugin. I first used Deno while exploring Lume, but it couldn't import the audio worklet types (maybe because of module compatibility?). Each framework was unique in its own way (shout-out to SvelteKit) but I couldn't figure out how to make them work.
 ## Learning and reflecting
 I ended up using Vite and vite-plugin-react-pages to handle both "build the app" and "bundle worklets," but the specific tool choice isn't important. Instead, the focus should be on lessons learned.
 For myself:
 - I'm obsessed with tooling, to the point it can derail the original goal. While it comes from a good place (for example: "types are awesome"), it can get in the way of more important work
 - I tend to reach for online resources right after seeing a new problem. While finding help online is often faster, spending time understanding the problem would have been more productive than cycling through (often outdated) blog posts
 For the tools:
 - Resource bundling is great and solves a genuine challenge. I've heard too many horror stories of developers writing modules by hand to believe this is unnecessary complexity
 - Webpack is a build system and modern frameworks are deeply dependent on it (hence the "webpack industrial complex"). While this often saves users from unnecessary complexity, there's no path forward if something breaks
 - There's little ability to mix and match tools across frameworks. Next.js and Gatsby let users extend webpack, but because each framework adds its own modules, changes aren't portable. After spending a week looking at webpack, I had an example running with parcel in thirty minutes, but couldn't integrate it
 In the end, learning new systems is fun, but a focus on tools that "just work" can leave users out in the cold if they break down.
@@ -1,15 +0,0 @@
@font-face {
    font-family: 'JetBrains Mono';
    src: url('/assets/font/JetBrainsMono-Regular.woff2') format('woff2'),
         url('/assets/font/JetBrainsMono-Regular.woff') format('woff');
    font-weight: normal;
    font-style: normal;
 }
@font-face {
    font-family: 'Lato';
    src: url('/assets/font/lato-regular-webfont.woff2') format('woff2'),
         url('/assets/font/lato-regular-webfont.woff') format('woff');
    font-weight: normal;
    font-style: normal;
 }
@@ -1,119 +0,0 @@
 ---
 ---
 // Import the theme rules
@import "theme";
 body {
    max-width: 100%;
    overflow-x: hidden;
    font-family: 'Lato', sans-serif;
 }
 .navbar {
    color: $gray;
 }
 .separator {
    margin-right: .45rem;
    margin-left: .25rem;
    color: #000;
    &:after {
        content: '\00a0/';
    }
 }
 // Page header spacing: large top padding, no bottom padding.
 // (No semicolon after the closing brace: a `;` between rules is not valid
 // CSS and is only tolerated by the Sass parser.)
 header {
    padding-top: 80px;
    padding-bottom: 0;
 }
 // Force black heading text.
 // NOTE(review): a comma separates independent selectors, so this matches
 // `header h1` and *every* `h2` on the page, not only `h2` inside `header`.
 // If only header headings are meant, use `header h1, header h2` — confirm.
 header h1,h2 {
    color: #000;
 }
 .post-description {
    color: #555;
 }
 .post-container a {
    color: #555;
    border-bottom-color: $gray;
    border-bottom-style: dotted;
    border-bottom-width: 1px;
    position: relative;
    display: inline-block;
    padding: 1px 1px;
    transition: color ease 0.3s;
    &::after {
      content: '';
      position: absolute;
      z-index: -1;
      width: 100%;
      height: 0%;
      left: 0;
      bottom: 0;
      background-color: $gray;
      transition: all ease 0.3s;
    }
    &:hover {
      color: #fff;
      border-bottom-style: solid;
      &::after {
        height: 100%;
      }
    }
 }
 body pre {
    font-size: 15px;
 }
 pre.highlight, code {
    font-family: 'JetBrains Mono', monospace;
 }
 div.highlighter-rouge {
    // Default theme uses `width: 100vw`, which while cool, does cause the page
    // to exceed screen width and trigger horizontal scrolling. No bueno.
    width: 99vw;
 }
 .post-date {
    // On the front page, make sure titles don't force wrapping the date box content
    text-align: right;
    white-space: nowrap;
 }
 // Block quotes: gray text, flush-left, with a solid 5px left rule in $gray.
 blockquote {
    color: #555;
    // NOTE(review): `right` only affects positioned elements and this rule
    // sets no `position`; presumably the imported theme supplies one — confirm.
    right: 100px;
    margin-left: 0;
    padding-left: 1.8rem;
    border-left: 5px solid $gray;
 }
 .post-nav {
    /* Insert your custom styling here. Example:
       font-size: 14px;
    */
    display: flex;
    margin-top: 1em;
    margin-bottom: 1em;
 }
 .post-nav div {
    /* flex-grow, flex-shrink, flex-basis */
    flex: 1 1 0;
 }
 .post-nav-next {
    text-align: right;
 }
 th, td {
    border-bottom: 1px solid $gray;
    padding: 0.75em;
 }
--- a/Show More
+++ b/Show More