Deploy website - based on 6dcbc1a72c
@ -1,6 +0,0 @@
|
||||
FROM mcr.microsoft.com/vscode/devcontainers/ruby:0-2.7-bullseye
|
||||
|
||||
RUN wget https://github.com/errata-ai/vale/releases/download/v2.21.0/vale_2.21.0_Linux_64-bit.tar.gz -O /tmp/vale.tar.gz \
|
||||
&& cd /usr/local/bin \
|
||||
&& tar xf /tmp/vale.tar.gz \
|
||||
&& rm /tmp/vale.tar.gz
|
@ -1,13 +0,0 @@
|
||||
// For format details, see https://aka.ms/devcontainer.json. For config options, see the README at:
|
||||
// https://github.com/microsoft/vscode-dev-containers/tree/v0.245.0/containers/ruby
|
||||
{
|
||||
"name": "Ruby",
|
||||
"build": {
|
||||
"dockerfile": "Dockerfile"
|
||||
},
|
||||
"runArgs": ["--userns=keep-id"],
|
||||
|
||||
"remoteUser": "vscode",
|
||||
"containerUser": "vscode",
|
||||
"workspaceMount": "source=${localWorkspaceFolder},target=/workspaces/${localWorkspaceFolderBasename},type=bind,Z"
|
||||
}
|
8
.gitignore
vendored
@ -1,8 +0,0 @@
|
||||
_site/
|
||||
.swp
|
||||
.sass-cache/
|
||||
.jekyll-metadata
|
||||
.bundle/
|
||||
vendor/
|
||||
.styles/
|
||||
.vscode/
|
@ -1,7 +0,0 @@
|
||||
StylesPath = .styles
|
||||
MinAlertLevel = suggestion
|
||||
Packages = Microsoft, write-good
|
||||
|
||||
[*]
|
||||
BasedOnStyles = Vale, Microsoft, write-good
|
||||
write-good.E-Prime = NO
|
32
2011/11/webpack-industrial-complex/index.html
Normal file
95
2015/11/autocallable/index.html
Normal file
47
2015/11/welcome/index.html
Normal file
40
2015/12/testing-cramer/index.html
Normal file
41
2016/01/cloudy-in-seattle/index.html
Normal file
30
2016/01/complaining-about-the-weather/index.html
Normal file
75
2016/02/guaranteed-money-maker/index.html
Normal file
48
2016/03/predicting-santander-customer-happiness/index.html
Normal file
59
2016/03/tweet-like-me/index.html
Normal file
83
2016/04/tick-tock/index.html
Normal file
180
2016/05/the-unfair-casino/index.html
Normal file
74
2016/06/event-studies-and-earnings-releases/index.html
Normal file
187
2016/10/rustic-repodcasting/index.html
Normal file
66
2016/11/pca-audio-compression/index.html
Normal file
88
2018/01/captains-cookbook-part-1/index.html
Normal file
75
2018/01/captains-cookbook-part-2/index.html
Normal file
9
2018/05/hello/index.html
Normal file
142
2018/06/dateutil-parser-to-rust/index.html
Normal file
184
2018/09/isomorphic-apps/index.html
Normal file
99
2018/09/primitives-in-rust-are-weird/index.html
Normal file
92
2018/10/case-study-optimization/index.html
Normal file
77
2018/12/allocation-safety/index.html
Normal file
19
2018/12/what-small-business-really-means/index.html
Normal file
46
2019/02/08/compiler-optimizations/index.html
Normal file
122
2019/02/a-heaping-helping/index.html
Normal file
210
2019/02/stacking-up/index.html
Normal file
26
2019/02/summary/index.html
Normal file
133
2019/02/the-whole-world/index.html
Normal file
83
2019/02/understanding-allocations-in-rust/index.html
Normal file
29
2019/05/making-bread/index.html
Normal file
267
2019/06/high-performance-systems/index.html
Normal file
151
2019/09/binary-format-shootout/index.html
Normal file
151
2019/12/release-the-gil/index.html
Normal file
29
Gemfile
@ -1,29 +0,0 @@
|
||||
source "https://rubygems.org"
|
||||
|
||||
# Hello! This is where you manage which Jekyll version is used to run.
|
||||
# When you want to use a different version, change it below, save the
|
||||
# file and run `bundle install`. Run Jekyll with `bundle exec`, like so:
|
||||
#
|
||||
# bundle exec jekyll serve
|
||||
#
|
||||
# This will help ensure the proper Jekyll version is running.
|
||||
# Happy Jekylling!
|
||||
gem "jekyll", "~> 3.8.3"
|
||||
|
||||
gem "texture"
|
||||
|
||||
# If you want to use GitHub Pages, remove the "gem "jekyll"" above and
|
||||
# uncomment the line below. To upgrade, run `bundle update github-pages`.
|
||||
# gem "github-pages", group: :jekyll_plugins
|
||||
|
||||
# If you have any plugins, put them here!
|
||||
group :jekyll_plugins do
|
||||
gem "jekyll-feed", "~> 0.6"
|
||||
gem "jekyll-remote-theme"
|
||||
end
|
||||
|
||||
# Windows does not include zoneinfo files, so bundle the tzinfo-data gem
|
||||
gem "tzinfo-data", platforms: [:mingw, :mswin, :x64_mingw, :jruby]
|
||||
|
||||
# Performance-booster for watching directories on Windows
|
||||
gem "wdm", "~> 0.1.0" if Gem.win_platform?
|
78
Gemfile.lock
@ -1,78 +0,0 @@
|
||||
GEM
|
||||
remote: https://rubygems.org/
|
||||
specs:
|
||||
addressable (2.7.0)
|
||||
public_suffix (>= 2.0.2, < 5.0)
|
||||
colorator (1.1.0)
|
||||
concurrent-ruby (1.1.6)
|
||||
em-websocket (0.5.1)
|
||||
eventmachine (>= 0.12.9)
|
||||
http_parser.rb (~> 0.6.0)
|
||||
eventmachine (1.2.7)
|
||||
ffi (1.12.2)
|
||||
forwardable-extended (2.6.0)
|
||||
http_parser.rb (0.6.0)
|
||||
i18n (0.9.5)
|
||||
concurrent-ruby (~> 1.0)
|
||||
jekyll (3.8.6)
|
||||
addressable (~> 2.4)
|
||||
colorator (~> 1.0)
|
||||
em-websocket (~> 0.5)
|
||||
i18n (~> 0.7)
|
||||
jekyll-sass-converter (~> 1.0)
|
||||
jekyll-watch (~> 2.0)
|
||||
kramdown (~> 1.14)
|
||||
liquid (~> 4.0)
|
||||
mercenary (~> 0.3.3)
|
||||
pathutil (~> 0.9)
|
||||
rouge (>= 1.7, < 4)
|
||||
safe_yaml (~> 1.0)
|
||||
jekyll-feed (0.13.0)
|
||||
jekyll (>= 3.7, < 5.0)
|
||||
jekyll-remote-theme (0.4.2)
|
||||
addressable (~> 2.0)
|
||||
jekyll (>= 3.5, < 5.0)
|
||||
jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0)
|
||||
rubyzip (>= 1.3.0, < 3.0)
|
||||
jekyll-sass-converter (1.5.2)
|
||||
sass (~> 3.4)
|
||||
jekyll-seo-tag (2.6.1)
|
||||
jekyll (>= 3.3, < 5.0)
|
||||
jekyll-watch (2.2.1)
|
||||
listen (~> 3.0)
|
||||
kramdown (1.17.0)
|
||||
liquid (4.0.3)
|
||||
listen (3.2.1)
|
||||
rb-fsevent (~> 0.10, >= 0.10.3)
|
||||
rb-inotify (~> 0.9, >= 0.9.10)
|
||||
mercenary (0.3.6)
|
||||
pathutil (0.16.2)
|
||||
forwardable-extended (~> 2.6)
|
||||
public_suffix (4.0.4)
|
||||
rb-fsevent (0.10.3)
|
||||
rb-inotify (0.10.1)
|
||||
ffi (~> 1.0)
|
||||
rouge (3.17.0)
|
||||
rubyzip (2.3.0)
|
||||
safe_yaml (1.0.5)
|
||||
sass (3.7.4)
|
||||
sass-listen (~> 4.0.0)
|
||||
sass-listen (4.0.0)
|
||||
rb-fsevent (~> 0.9, >= 0.9.4)
|
||||
rb-inotify (~> 0.9, >= 0.9.7)
|
||||
texture (0.3)
|
||||
jekyll (~> 3.7)
|
||||
jekyll-seo-tag (~> 2.1)
|
||||
|
||||
PLATFORMS
|
||||
ruby
|
||||
|
||||
DEPENDENCIES
|
||||
jekyll (~> 3.8.3)
|
||||
jekyll-feed (~> 0.6)
|
||||
jekyll-remote-theme
|
||||
texture
|
||||
tzinfo-data
|
||||
|
||||
BUNDLED WITH
|
||||
2.1.4
|
44
_config.yml
@ -1,44 +0,0 @@
|
||||
# Welcome to Jekyll!
|
||||
#
|
||||
# This config file is meant for settings that affect your whole blog, values
|
||||
# which you are expected to set up once and rarely edit after that. If you find
|
||||
# yourself editing this file very often, consider using Jekyll's data files
|
||||
# feature for the data you need to update frequently.
|
||||
#
|
||||
# For technical reasons, this file is *NOT* reloaded automatically when you use
|
||||
# 'bundle exec jekyll serve'. If you change this file, please restart the server process.
|
||||
|
||||
# Site settings
|
||||
# These are used to personalize your new site. If you look in the HTML files,
|
||||
# you will see them accessed via {{ site.title }}, {{ site.email }}, and so on.
|
||||
# You can create any custom variable you would like, and they will be accessible
|
||||
# in the templates via {{ site.myvariable }}.
|
||||
title: speice.io
|
||||
description: The Old Speice Guy
|
||||
email: bradlee@speice.io
|
||||
baseurl: "" # the subpath of your site, e.g. /blog
|
||||
url: "https://speice.io/" # the base hostname & protocol for your site, e.g. http://example.com
|
||||
github_username: bspeice
|
||||
|
||||
# Build settings
|
||||
markdown: kramdown
|
||||
# theme: texture
|
||||
remote_theme: thelehhman/texture
|
||||
plugins:
|
||||
- jekyll-feed
|
||||
- jekyll-remote-theme
|
||||
|
||||
include: [_pages]
|
||||
permalink: /:year/:month/:title.html
|
||||
|
||||
# Exclude from processing.
|
||||
# The following items will not be processed, by default. Create a custom list
|
||||
# to override the default setting.
|
||||
# exclude:
|
||||
# - Gemfile
|
||||
# - Gemfile.lock
|
||||
# - node_modules
|
||||
# - vendor/bundle/
|
||||
# - vendor/cache/
|
||||
# - vendor/gems/
|
||||
# - vendor/ruby/
|
@ -1,23 +0,0 @@
|
||||
{% if page.layout == 'post' %}
|
||||
{% comment %}Thanks to https://www.bytedude.com/jekyll-previous-and-next-posts/{% endcomment %}
|
||||
<div class="container">
|
||||
<hr>
|
||||
<div class="post-nav">
|
||||
<div>
|
||||
{% if page.previous.url %}
|
||||
<a href="{{page.previous.url}}">« {{page.previous.title}}</a>
|
||||
{% endif %}
|
||||
</div>
|
||||
<div class="post-nav-next">
|
||||
{% if page.next.url %}
|
||||
<a href="{{page.next.url}}">{{page.next.title}} »</a>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<script type="text/javascript"
|
||||
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
|
||||
</script>
|
||||
{% endif %}
|
@ -1,7 +0,0 @@
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<meta http-equiv="X-UA-Compatible" content="ie=edge">
|
||||
<link rel="stylesheet" href="{{ "/assets/css/style.css" | relative_url }}">
|
||||
<link rel="stylesheet" href="{{ "/assets/css/fonts.css" | prepend: site.baseurl }}">
|
||||
<title>{{ page.title | default: site.title }}</title>
|
||||
{% seo %}
|
@ -1,7 +0,0 @@
|
||||
<div class="navbar">
|
||||
<a href="{{ "/" | prepend: site.baseurl }}">Home</a>
|
||||
<span class="separator"></span>
|
||||
<a href="{{ "/about/" | prepend: site.baseurl }}">About</a>
|
||||
<span class="separator"></span>
|
||||
<a href="{{ "/feed.xml" | prepend: site.baseurl }}">RSS</a>
|
||||
</div>
|
@ -1,15 +0,0 @@
|
||||
<div class="container">
|
||||
<h2>{{ site.title }}</h2>
|
||||
<h1>{{ site.description }}</h1>
|
||||
<ul class="social">
|
||||
{%- if site.texture.social_links.github -%}
|
||||
<a href="https://github.com/{{ site.texture.social_links.github }}"><li><i class="icon-github-circled"></i></li></a>
|
||||
{%- endif -%}
|
||||
{%- if site.texture.social_links.linkedIn -%}
|
||||
<a href="https://linkedin.com/{{ site.texture.social_links.linkedIn }}"><li><i class="icon-linkedin-squared"></i></li></a>
|
||||
{%- endif -%}
|
||||
{%- if site.texture.social_links.twitter -%}
|
||||
<a href="https://twitter.com/{{ site.texture.social_links.twitter }}"><li><i class="icon-twitter-squared"></i></li></a>
|
||||
{%- endif -%}
|
||||
</ul>
|
||||
</div>
|
@ -1,13 +0,0 @@
|
||||
---
|
||||
layout: page
|
||||
title: About
|
||||
permalink: /about/
|
||||
---
|
||||
|
||||
Developer currently living in New York City.
|
||||
|
||||
Best ways to get in contact:
|
||||
|
||||
- Email: [bradlee@speice.io](mailto:bradlee@speice.io)
|
||||
- GitHub: [bspeice](https://github.com/bspeice)
|
||||
- LinkedIn: [bradleespeice](https://www.linkedin.com/in/bradleespeice/)
|
@ -1,38 +0,0 @@
|
||||
---
|
||||
layout: post
|
||||
title: "Hello!"
|
||||
description: ""
|
||||
category:
|
||||
tags: []
|
||||
---
|
||||
|
||||
I'll do what I can to keep this short, there's plenty of other things we both should be doing right
|
||||
now.
|
||||
|
||||
If you're here for the bread pics, and to marvel at some other culinary side projects, I've got you
|
||||
covered:
|
||||
|
||||
![Saturday Bread]({{ "/assets/images/2018-05-28-bread.jpg" | absolute_url }})
|
||||
|
||||
And no, I'm not posting pictures of earlier attempts that ended up turning into rocks in the oven.
|
||||
|
||||
Okay, just one:
|
||||
|
||||
![Bread as rock]({{ "/assets/images/2018-05-28-rocks.jpg" | absolute_url }})
|
||||
|
||||
If you're here for keeping up with the man Bradlee Speice, got plenty of that too. Plus some
|
||||
up-coming super-nerdy posts about how I'm changing the world.
|
||||
|
||||
And if you're not here for those things: don't have a lot for you, sorry. But you're welcome to let
|
||||
me know what needs to change.
|
||||
|
||||
I'm looking forward to making this a place to talk about what's going on in life, I hope you'll
|
||||
stick it out with me. The best way to follow what's going on is on my [About](/about/) page, but if
|
||||
you want the joy of clicking links, here's a few good ones:
|
||||
|
||||
- Email (people still use this?): [bradlee@speice.io](mailto:bradlee@speice.io)
|
||||
- Mastodon (nerd Twitter): [@bradlee](https://mastodon.social/@bradlee)
|
||||
- Chat (RiotIM): [@bspeice:matrix.com](https://matrix.to/#/@bspeice:matrix.com)
|
||||
- The comments section (not for people with sanity intact): ↓↓↓
|
||||
|
||||
Thanks, and keep it amazing.
|
@ -1,177 +0,0 @@
|
||||
---
|
||||
layout: post
|
||||
title: "What I Learned: Porting Dateutil Parser to Rust"
|
||||
description: ""
|
||||
category:
|
||||
tags: [dtparse, rust]
|
||||
---
|
||||
|
||||
Hi. I'm Bradlee.
|
||||
|
||||
I've mostly been a lurker in Rust for a while, making a couple small contributions here and there.
|
||||
So launching [dtparse](https://github.com/bspeice/dtparse) feels like a nice step towards becoming a
|
||||
functioning member of society. But not too much, because then you know people start asking you to
|
||||
pay bills, and ain't nobody got time for that.
|
||||
|
||||
But I built dtparse, and you can read about my thoughts on the process. Or don't. I won't tell you
|
||||
what to do with your life (but you should totally keep reading).
|
||||
|
||||
# Slow down, what?
|
||||
|
||||
OK, fine, I guess I should start with _why_ someone would do this.
|
||||
|
||||
[Dateutil](https://github.com/dateutil/dateutil) is a Python library for handling dates. The
|
||||
standard library support for time in Python is kinda dope, but there are a lot of extras that go
|
||||
into making it useful beyond just the [datetime](https://docs.python.org/3.6/library/datetime.html)
|
||||
module. `dateutil.parser` specifically is code to take all the super-weird time formats people come
|
||||
up with and turn them into something actually useful.
|
||||
|
||||
Date/time parsing, it turns out, is just like everything else involving
|
||||
[computers](https://infiniteundo.com/post/25326999628/falsehoods-programmers-believe-about-time) and
|
||||
[time](https://infiniteundo.com/post/25509354022/more-falsehoods-programmers-believe-about-time): it
|
||||
feels like it shouldn't be that difficult to do, until you try to do it, and you realize that people
|
||||
suck and this is why
|
||||
[we can't have nice things](https://zachholman.com/talk/utc-is-enough-for-everyone-right). But
|
||||
alas, we'll try and make contemporary art out of the rubble and give it a pretentious name like
|
||||
_Time_.
|
||||
|
||||
![A gravel mound](/assets/images/2018-06-25-gravel-mound.jpg)
|
||||
|
||||
> [Time](https://www.goodfreephotos.com/united-states/montana/elkhorn/remains-of-the-mining-operation-elkhorn.jpg.php)
|
||||
|
||||
What makes `dateutil.parser` great is that there's a single function with a single argument that
|
||||
drives what programmers interact with:
|
||||
[`parse(timestr)`](https://github.com/dateutil/dateutil/blob/6dde5d6298cfb81a4c594a38439462799ed2aef2/dateutil/parser/_parser.py#L1258).
|
||||
It takes in the time as a string, and gives you back a reasonable "look, this is the best anyone can
|
||||
possibly do to make sense of your input" value. It doesn't expect much of you.
|
||||
|
||||
[And now it's in Rust.](https://github.com/bspeice/dtparse/blob/7d565d3a78876dbebd9711c9720364fe9eba7915/src/lib.rs#L1332)
|
||||
|
||||
# Lost in Translation
|
||||
|
||||
Having worked at a bulge-bracket bank watching Java programmers try to be Python programmers, I'm
|
||||
admittedly hesitant to publish Python code that's trying to be Rust. Interestingly, Rust code can
|
||||
actually do a great job of mimicking Python. It's certainly not idiomatic Rust, but I've had better
|
||||
experiences than
|
||||
[this guy](https://webcache.googleusercontent.com/search?q=cache:wkYMpktJtnUJ:https://jackstouffer.com/blog/porting_dateutil.html+&cd=3&hl=en&ct=clnk&gl=us)
|
||||
who attempted the same thing for D. These are the actual take-aways:
|
||||
|
||||
When transcribing code, **stay as close to the original library as possible**. I'm talking about
|
||||
using the same variable names, same access patterns, the whole shebang. It's way too easy to make a
|
||||
couple of typos, and all of a sudden your code blows up in new and exciting ways. Having a reference
|
||||
manual for verbatim what your code should be means that you don't spend that long debugging
|
||||
complicated logic, you're more looking for typos.
|
||||
|
||||
Also, **don't use nice Rust things like enums**. While
|
||||
[one time it worked out OK for me](https://github.com/bspeice/dtparse/blob/7d565d3a78876dbebd9711c9720364fe9eba7915/src/lib.rs#L88-L94),
|
||||
I also managed to shoot myself in the foot a couple times because `dateutil` stores AM/PM as a
|
||||
boolean and I mixed up which was true, and which was false (side note: AM is false, PM is true). In
|
||||
general, writing nice code _should not be a first-pass priority_ when you're just trying to recreate
|
||||
the same functionality.
|
||||
|
||||
**Exceptions are a pain.** Make peace with it. Python code is just allowed to skip stack frames. So
|
||||
when a co-worker told me "Rust is getting try-catch syntax" I properly freaked out. Turns out
|
||||
[he's not quite right](https://github.com/rust-lang/rfcs/pull/243), and I'm OK with that. And while
|
||||
`dateutil` is pretty well-behaved about not skipping multiple stack frames,
|
||||
[130-line try-catch blocks](https://github.com/dateutil/dateutil/blob/16561fc99361979e88cccbd135393b06b1af7e90/dateutil/parser/_parser.py#L730-L865)
|
||||
take a while to verify.
|
||||
|
||||
As another Python quirk, **be very careful about
|
||||
[long nested if-elif-else blocks](https://github.com/dateutil/dateutil/blob/16561fc99361979e88cccbd135393b06b1af7e90/dateutil/parser/_parser.py#L494-L568)**.
|
||||
I used to think that Python's whitespace was just there to get you to format your code correctly. I
|
||||
think that no longer. It's way too easy to close a block too early and have incredibly weird issues
|
||||
in the logic. Make sure you use an editor that displays indentation levels so you can keep things
|
||||
straight.
|
||||
|
||||
**Rust macros are not free.** I originally had the
|
||||
[main test body](https://github.com/bspeice/dtparse/blob/b0e737f088eca8e83ab4244c6621a2797d247697/tests/compat.rs#L63-L217)
|
||||
wrapped up in a macro using [pyo3](https://github.com/PyO3/PyO3). It took two minutes to compile.
|
||||
After
|
||||
[moving things to a function](https://github.com/bspeice/dtparse/blob/e017018295c670e4b6c6ee1cfff00dbb233db47d/tests/compat.rs#L76-L205)
|
||||
compile times dropped down to ~5 seconds. Turns out 150 lines \* 100 tests = a lot of redundant code
|
||||
to be compiled. My new rule of thumb is that any macros longer than 10-15 lines are actually
|
||||
functions that need to be liberated, man.
|
||||
|
||||
Finally, **I really miss list comprehensions and dictionary comprehensions.** As a quick comparison,
|
||||
see
|
||||
[this dateutil code](https://github.com/dateutil/dateutil/blob/16561fc99361979e88cccbd135393b06b1af7e90/dateutil/parser/_parser.py#L476)
|
||||
and
|
||||
[the implementation in Rust](https://github.com/bspeice/dtparse/blob/7d565d3a78876dbebd9711c9720364fe9eba7915/src/lib.rs#L619-L629).
|
||||
I probably wrote it wrong, and I'm sorry. Ultimately though, I hope that these comprehensions can be
|
||||
added through macros or syntax extensions. Either way, they're expressive, save typing, and are
|
||||
super-readable. Let's get more of that.
|
||||
|
||||
# Using a young language
|
||||
|
||||
Now, Rust is exciting and new, which means that there's opportunity to make a substantive impact. On
|
||||
more than one occasion though, I've had issues navigating the Rust ecosystem.
|
||||
|
||||
What I'll call the "canonical library" is still being built. In Python, if you need datetime
|
||||
parsing, you use `dateutil`. If you want `decimal` types, it's already in the
|
||||
[standard library](https://docs.python.org/3.6/library/decimal.html). While I might've gotten away
|
||||
with `f64`, `dateutil` uses decimals, and I wanted to follow the principle of **staying as close to
|
||||
the original library as possible**. Thus began my quest to find a decimal library in Rust. What I
|
||||
quickly found was summarized in a comment:
|
||||
|
||||
> Writing a BigDecimal is easy. Writing a _good_ BigDecimal is hard.
|
||||
>
|
||||
> [-cmr](https://github.com/rust-lang/rust/issues/8937#issuecomment-34582794)
|
||||
|
||||
In practice, this means that there are at least [4](https://crates.io/crates/bigdecimal)
|
||||
[different](https://crates.io/crates/rust_decimal)
|
||||
[implementations](https://crates.io/crates/decimal) [available](https://crates.io/crates/decimate).
|
||||
And that's a lot of decisions to worry about when all I'm thinking is "why can't
|
||||
[calendar reform](https://en.wikipedia.org/wiki/Calendar_reform) be a thing" and I'm forced to dig
|
||||
through a [couple](https://github.com/rust-lang/rust/issues/8937#issuecomment-31661916)
|
||||
[different](https://github.com/rust-lang/rfcs/issues/334)
|
||||
[threads](https://github.com/rust-num/num/issues/8) to figure out if the library I'm looking at is dead
|
||||
or just stable.
|
||||
|
||||
And even when the "canonical library" exists, there's no guarantee that it will be well-maintained.
|
||||
[Chrono](https://github.com/chronotope/chrono) is the _de facto_ date/time library in Rust, and just
|
||||
released version 0.4.4 like two days ago. Meanwhile,
|
||||
[chrono-tz](https://github.com/chronotope/chrono-tz) appears to be dead in the water even though
|
||||
[there are people happy to help maintain it](https://github.com/chronotope/chrono-tz/issues/19). I
|
||||
know relatively little about it, but it appears that most of the release process is automated;
|
||||
keeping that up to date should be a no-brainer.
|
||||
|
||||
## Trial Maintenance Policy
|
||||
|
||||
Specifically given "maintenance" being an
|
||||
[oft-discussed](https://www.reddit.com/r/rust/comments/48540g/thoughts_on_initiators_vs_maintainers/)
|
||||
issue, I'm going to try out the following policy to keep things moving on `dtparse`:
|
||||
|
||||
1. Issues/PRs needing _maintainer_ feedback will be updated at least weekly. I want to make sure
|
||||
nobody's blocking on me.
|
||||
|
||||
2. To keep issues/PRs needing _contributor_ feedback moving, I'm going to (kindly) ask the
|
||||
contributor to check in after two weeks, and close the issue without resolution if I hear nothing
|
||||
back after a month.
|
||||
|
||||
The second point I think has the potential to be a bit controversial, so I'm happy to receive
|
||||
feedback on that. And if a contributor responds with "hey, still working on it, had a kid and I'm
|
||||
running on 30 seconds of sleep a night," then first: congratulations on sustaining human life. And
|
||||
second: I don't mind keeping those requests going indefinitely. I just want to try and balance
|
||||
keeping things moving with giving people the necessary time they need.
|
||||
|
||||
I should also note that I'm still getting some best practices in place - CONTRIBUTING and
|
||||
CONTRIBUTORS files need to be added, as well as issue/PR templates. In progress. None of us are
|
||||
perfect.
|
||||
|
||||
# Roadmap and Conclusion
|
||||
|
||||
So if I've now built a `dateutil`-compatible parser, we're done, right? Of course not! That's not
|
||||
nearly ambitious enough.
|
||||
|
||||
Ultimately, I'd love to have a library that's capable of parsing everything the Linux `date` command
|
||||
can do (and not `date` on OSX, because seriously, BSD coreutils are the worst). I know Rust has a
|
||||
coreutils rewrite going on, and `dtparse` would potentially be an interesting candidate since it
|
||||
doesn't bring in a lot of extra dependencies. [`humantime`](https://crates.io/crates/humantime)
|
||||
could help pick up some of the (current) slack in dtparse, so maybe we can share and care with each
|
||||
other?
|
||||
|
||||
All in all, I'm mostly hoping that nobody's already done this and I haven't spent a bit over a month
|
||||
on redundant code. So if it exists, tell me. I need to know, but be nice about it, because I'm going
|
||||
to take it hard.
|
||||
|
||||
And in the meantime, I'm looking forward to building more. Onwards.
|
@ -1,323 +0,0 @@
|
||||
---
|
||||
layout: post
|
||||
title: "Primitives in Rust are Weird (and Cool)"
|
||||
description: "but mostly weird."
|
||||
category:
|
||||
tags: [rust, c, java, python, x86]
|
||||
---
|
||||
|
||||
I wrote a really small Rust program a while back because I was curious. I was 100% convinced it
|
||||
couldn't possibly run:
|
||||
|
||||
```rust
|
||||
fn main() {
|
||||
println!("{}", 8.to_string())
|
||||
}
|
||||
```
|
||||
|
||||
And to my complete befuddlement, it compiled, ran, and produced a completely sensible output. The
|
||||
reason I was so surprised has to do with how Rust treats a special category of things I'm going to
|
||||
call _primitives_. In the current version of the Rust book, you'll see them referred to as
|
||||
[scalars][rust_scalar], and in older versions they'll be called [primitives][rust_primitive], but
|
||||
we're going to stick with the name _primitive_ for the time being. Explaining why this program is so
|
||||
cool requires talking about a number of other programming languages, and keeping a consistent
|
||||
terminology makes things easier.
|
||||
|
||||
**You've been warned:** this is going to be a tedious post about a relatively minor issue that
|
||||
involves Java, Python, C, and x86 Assembly. And also me pretending like I know what I'm talking
|
||||
about with assembly.
|
||||
|
||||
# Defining primitives (Java)
|
||||
|
||||
The reason I'm using the name _primitive_ comes from how much of my life is Java right now. Spoiler
|
||||
alert: a lot of it. And for the most part I like Java, but I digress. In Java, there's a special
|
||||
name for some specific types of values:
|
||||
|
||||
> ```
|
||||
> bool char byte
|
||||
> short int long
|
||||
> float double
|
||||
> ```
|
||||
|
||||
|
||||
They are referred to as [primitives][java_primitive]. And relative to the other bits of Java,
|
||||
they have two unique features. First, they don't have to worry about the
|
||||
[billion-dollar mistake](https://en.wikipedia.org/wiki/Tony_Hoare#Apologies_and_retractions);
|
||||
primitives in Java can never be `null`. Second: *they can't have instance methods*.
|
||||
Remember that Rust program from earlier? Java has no idea what to do with it:
|
||||
|
||||
```java
|
||||
class Main {
|
||||
public static void main(String[] args) {
|
||||
int x = 8;
|
||||
System.out.println(x.toString()); // Triggers a compiler error
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The error is:
|
||||
|
||||
```
|
||||
Main.java:5: error: int cannot be dereferenced
|
||||
System.out.println(x.toString());
|
||||
^
|
||||
1 error
|
||||
```
|
||||
|
||||
Specifically, Java's [`Object`](https://docs.oracle.com/javase/10/docs/api/java/lang/Object.html)
|
||||
and things that inherit from it are pointers under the hood, and we have to dereference them before
|
||||
the fields and methods they define can be used. In contrast, _primitive types are just values_ -
|
||||
there's nothing to be dereferenced. In memory, they're just a sequence of bits.
|
||||
|
||||
If we really want, we can turn the `int` into an
|
||||
[`Integer`](https://docs.oracle.com/javase/10/docs/api/java/lang/Integer.html) and then dereference
|
||||
it, but it's a bit wasteful:
|
||||
|
||||
```java
|
||||
class Main {
|
||||
public static void main(String[] args) {
|
||||
int x = 8;
|
||||
Integer y = Integer.valueOf(x);
|
||||
System.out.println(y.toString());
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
This creates the variable `y` of type `Integer` (which inherits `Object`), and at run time we
|
||||
dereference `y` to locate the `toString()` function and call it. Rust obviously handles things a bit
|
||||
differently, but we have to dig into the low-level details to see it in action.
|
||||
|
||||
# Low Level Handling of Primitives (C)
|
||||
|
||||
We first need to build a foundation for reading and understanding the assembly code the final answer
|
||||
requires. Let's begin with showing how the `C` language (and your computer) thinks about "primitive"
|
||||
values in memory:
|
||||
|
||||
```c
|
||||
void my_function(int num) {}
|
||||
|
||||
int main() {
|
||||
int x = 8;
|
||||
my_function(x);
|
||||
}
|
||||
```
|
||||
|
||||
The [compiler explorer](https://godbolt.org/z/lgNYcc) gives us an easy way of showing off the
|
||||
assembly-level code that's generated: <span style="font-size:.6em">whose output has been lightly
|
||||
edited</span>
|
||||
|
||||
```nasm
|
||||
main:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
sub rsp, 16
|
||||
|
||||
; We assign the value `8` to `x` here
|
||||
mov DWORD PTR [rbp-4], 8
|
||||
|
||||
; And copy the bits making up `x` to a location
|
||||
; `my_function` can access (`edi`)
|
||||
mov eax, DWORD PTR [rbp-4]
|
||||
mov edi, eax
|
||||
|
||||
; Call `my_function` and give it control
|
||||
call my_function
|
||||
|
||||
mov eax, 0
|
||||
leave
|
||||
ret
|
||||
|
||||
my_function:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
|
||||
; Copy the bits out of the pre-determined location (`edi`)
|
||||
; to somewhere we can use
|
||||
mov DWORD PTR [rbp-4], edi
|
||||
nop
|
||||
|
||||
pop rbp
|
||||
ret
|
||||
```
|
||||
|
||||
At a really low level of memory, we're copying bits around using the [`mov`][x86_guide] instruction;
|
||||
nothing crazy. But to show how similar Rust is, let's take a look at our program translated from C
|
||||
to Rust:
|
||||
|
||||
```rust
|
||||
fn my_function(x: i32) {}
|
||||
|
||||
fn main() {
|
||||
let x = 8;
|
||||
my_function(x)
|
||||
}
|
||||
```
|
||||
|
||||
And the assembly generated when we stick it in the
|
||||
[compiler explorer](https://godbolt.org/z/cAlmk0): <span style="font-size:.6em">again, lightly
|
||||
edited</span>
|
||||
|
||||
```nasm
|
||||
example::main:
|
||||
push rax
|
||||
|
||||
; Look familiar? We're copying bits to a location for `my_function`
|
||||
; The compiler just optimizes out holding `x` in memory
|
||||
mov edi, 8
|
||||
|
||||
; Call `my_function` and give it control
|
||||
call example::my_function
|
||||
|
||||
pop rax
|
||||
ret
|
||||
|
||||
example::my_function:
|
||||
sub rsp, 4
|
||||
|
||||
; And copying those bits again, just like in C
|
||||
mov dword ptr [rsp], edi
|
||||
|
||||
add rsp, 4
|
||||
ret
|
||||
```
|
||||
|
||||
The generated Rust assembly is functionally pretty close to the C assembly: _When working with
|
||||
primitives, we're just dealing with bits in memory_.
|
||||
|
||||
In Java we have to dereference a pointer to call its functions; in Rust, there's no pointer to
|
||||
dereference. So what exactly is going on with this `.to_string()` function call?
|
||||
|
||||
# impl primitive (and Python)
|
||||
|
||||
Now it's time to <strike>reveal my trap card</strike> show the revelation that tied all this
|
||||
together: _Rust has implementations for its primitive types._ That's right, `impl` blocks aren't
|
||||
only for `structs` and `traits`, primitives get them too. Don't believe me? Check out
|
||||
[u32](https://doc.rust-lang.org/std/primitive.u32.html),
|
||||
[f64](https://doc.rust-lang.org/std/primitive.f64.html) and
|
||||
[char](https://doc.rust-lang.org/std/primitive.char.html) as examples.
|
||||
|
||||
But the really interesting bit is how Rust turns those `impl` blocks into assembly. Let's break out
|
||||
the [compiler explorer](https://godbolt.org/z/6LBEwq) once again:
|
||||
|
||||
```rust
|
||||
pub fn main() {
|
||||
8.to_string();
|
||||
}
|
||||
```
|
||||
|
||||
And the interesting bits in the assembly: <span style="font-size:.6em">heavily trimmed down</span>
|
||||
|
||||
```nasm
|
||||
example::main:
|
||||
sub rsp, 24
|
||||
mov rdi, rsp
|
||||
lea rax, [rip + .Lbyte_str.u]
|
||||
mov rsi, rax
|
||||
|
||||
; Cool stuff right here
|
||||
call <T as alloc::string::ToString>::to_string@PLT
|
||||
|
||||
mov rdi, rsp
|
||||
call core::ptr::drop_in_place
|
||||
add rsp, 24
|
||||
ret
|
||||
```
|
||||
|
||||
Now, this assembly is a bit more complicated, but here's the big revelation: **we're calling
|
||||
`to_string()` as a function that exists all on its own, and giving it the instance of `8`**. Instead
|
||||
of thinking of the value 8 as an instance of `i32` and then peeking in to find the location of the
|
||||
function we want to call (like Java), we have a function that exists outside of the instance and
|
||||
just give that function the value `8`.
|
||||
|
||||
This is an incredibly technical detail, but the interesting idea I had was this: _if `to_string()`
|
||||
is a static function, can I refer to the unbound function and give it an instance?_
|
||||
|
||||
Better explained in code (and a [compiler explorer](https://godbolt.org/z/fJY-gA) link because I
|
||||
seriously love this thing):
|
||||
|
||||
```rust
|
||||
struct MyVal {
|
||||
x: u32
|
||||
}
|
||||
|
||||
impl MyVal {
|
||||
fn to_string(&self) -> String {
|
||||
self.x.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn main() {
|
||||
let my_val = MyVal { x: 8 };
|
||||
|
||||
// THESE ARE THE SAME
|
||||
my_val.to_string();
|
||||
MyVal::to_string(&my_val);
|
||||
}
|
||||
```
|
||||
|
||||
Rust is totally fine both "binding" the function call to the instance and calling it as a static function.
|
||||
|
||||
MIND == BLOWN.
|
||||
|
||||
Python does the same thing where I can both call functions bound to their instances and also call as
|
||||
an unbound function where I give it the instance:
|
||||
|
||||
```python
|
||||
class MyClass():
|
||||
x = 24
|
||||
|
||||
def my_function(self):
|
||||
print(self.x)
|
||||
|
||||
m = MyClass()
|
||||
|
||||
m.my_function()
|
||||
MyClass.my_function(m)
|
||||
```
|
||||
|
||||
And Python tries to make you _think_ that primitives can have instance methods...
|
||||
|
||||
```python
|
||||
>>> dir(8)
|
||||
['__abs__', '__add__', '__and__', '__class__', '__cmp__', '__coerce__',
|
||||
'__delattr__', '__div__', '__divmod__', '__doc__', '__float__', '__floordiv__',
|
||||
...
|
||||
'__setattr__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', '__truediv__',
|
||||
...]
|
||||
|
||||
>>> # Theoretically `8.__str__()` should exist, but:
|
||||
|
||||
>>> 8.__str__()
|
||||
File "<stdin>", line 1
|
||||
8.__str__()
|
||||
^
|
||||
SyntaxError: invalid syntax
|
||||
|
||||
>>> # It will run if we assign it first though:
|
||||
>>> x = 8
|
||||
>>> x.__str__()
|
||||
'8'
|
||||
```
|
||||
|
||||
...but in practice it's a bit complicated.
|
||||
|
||||
So while Python handles binding instance methods in a way similar to Rust, it's still not able to
|
||||
run the example we started with.
|
||||
|
||||
# Conclusion
|
||||
|
||||
This was a super-roundabout way of demonstrating it, but the way Rust handles incredibly minor
|
||||
details like primitives leads to really cool effects. Primitives are optimized like C in how they
|
||||
have a space-efficient memory layout, yet the language still has a lot of features I enjoy in Python
|
||||
(like both instance and late binding).
|
||||
|
||||
And when you put it together, there are areas where Rust does cool things nobody else can; as a
|
||||
quirky feature of Rust's type system, `8.to_string()` is actually valid code.
|
||||
|
||||
Now go forth and fool your friends into thinking you know assembly. This is all I've got.
|
||||
|
||||
[x86_guide]: http://www.cs.virginia.edu/~evans/cs216/guides/x86.html
|
||||
[java_primitive]: https://docs.oracle.com/javase/tutorial/java/nutsandbolts/datatypes.html
|
||||
[rust_scalar]: https://doc.rust-lang.org/book/second-edition/ch03-02-data-types.html#scalar-types
|
||||
[rust_primitive]: https://doc.rust-lang.org/book/first-edition/primitive-types.html
|
@ -1,294 +0,0 @@
|
||||
---
|
||||
layout: post
|
||||
title: "Isomorphic Desktop Apps with Rust"
|
||||
description: "Electron + WASM = ☣"
|
||||
category:
|
||||
tags: [rust, javascript, webassembly]
|
||||
---
|
||||
|
||||
Forgive me, but this is going to be a bit of a schizophrenic post. I both despise Javascript and the
|
||||
modern ECMAScript ecosystem, and I'm stunned by its success doing some really cool things. It's
|
||||
[this duality](https://www.destroyallsoftware.com/talks/the-birth-and-death-of-javascript) that's
|
||||
led me to a couple of (very) late nights over the past weeks trying to reconcile myself as I
|
||||
bootstrap a simple desktop application.
|
||||
|
||||
See, as much as
|
||||
[Webassembly isn't trying to replace Javascript](https://webassembly.org/docs/faq/#is-webassembly-trying-to-replace-javascript),
|
||||
**I want Javascript gone**. There are plenty of people who don't share my views, and they are
|
||||
probably nicer and more fun at parties. But I cringe every time "Webpack" is mentioned, and I think
|
||||
it's hilarious that the
|
||||
[language specification](https://ecma-international.org/publications/standards/Ecma-402.htm)
|
||||
dramatically outpaces anyone's
|
||||
[actual implementation](https://kangax.github.io/compat-table/es2016plus/). The answer to this
|
||||
conundrum is of course to recompile code from newer versions of the language to older versions _of
|
||||
the same language_ before running. At least [Babel] is a nice tongue-in-cheek reference.
|
||||
|
||||
Yet for as much hate as [Electron] receives, it does a stunningly good job at solving a really hard
|
||||
problem: _how the hell do I put a button on the screen and react when the user clicks it_? GUI
|
||||
programming is hard, straight up. But if browsers are already able to run everywhere, why don't we
|
||||
take advantage of someone else solving the hard problems for us? I don't like that I have to use
|
||||
Javascript for it, but I really don't feel inclined to whip out good ol' [wxWidgets].
|
||||
|
||||
Now there are other native solutions ([libui-rs], [conrod], [oh hey wxWidgets again!][wxrust]), but
|
||||
those also have their own issues with distribution, styling, etc. With Electron, I can
|
||||
`yarn create electron-app my-app` and just get going, knowing that packaging/upgrades/etc. are built
|
||||
in.
|
||||
|
||||
My question is: given recent innovations with WASM, _are we Electron yet_?
|
||||
|
||||
No, not really.
|
||||
|
||||
Instead, **what would it take to get to a point where we can skip Javascript in Electron apps?**
|
||||
|
||||
# Setting the Stage
|
||||
|
||||
Truth is, WASM/Webassembly is a pretty new technology and I'm a total beginner in this area. There
|
||||
may already be solutions to the issues I discuss, but I'm totally unaware of them, so I'm going to
|
||||
try and organize what I did manage to discover.
|
||||
|
||||
I should also mention that the content and things I'm talking about here are not intended to be
|
||||
prescriptive, but more "if someone else is interested, what do we already know doesn't work?" _I
|
||||
expect everything in this post to be obsolete within two months._ Even over the course of writing
|
||||
this, [a separate blog post](https://mnt.io/2018/08/28/from-rust-to-beyond-the-asm-js-galaxy/) had
|
||||
to be modified because [upstream changes](https://github.com/WebAssembly/binaryen/pull/1642) broke a
|
||||
[Rust tool](https://github.com/rustwasm/wasm-bindgen/pull/787) the post tried to use. The post
|
||||
ultimately
|
||||
[got updated](https://mnt.io/2018/08/28/from-rust-to-beyond-the-asm-js-galaxy/#comment-477), **but
|
||||
all this happened within the span of a week.** Things are moving quickly.
|
||||
|
||||
I'll also note that we're going to skip [asm.js] and [emscripten]. Truth be told, I couldn't get
|
||||
either of these to output anything, and so I'm just going to say
|
||||
[here be dragons.](https://en.wikipedia.org/wiki/Here_be_dragons) Everything I'm discussing here
|
||||
uses the `wasm32-unknown-unknown` target.
|
||||
|
||||
The code that I _did_ get running is available
|
||||
[over here](https://github.com/speice-io/isomorphic-rust). Feel free to use it as a starting point,
|
||||
but I'm mostly including the link as a reference for the things that were attempted.
|
||||
|
||||
# An Example Running Application
|
||||
|
||||
So, I did _technically_ get a running application:
|
||||
|
||||
![Electron app using WASM](/assets/images/2018-09-15-electron-percy-wasm.png)
|
||||
|
||||
...which you can also try out if you want:
|
||||
|
||||
```sh
|
||||
git clone https://github.com/speice-io/isomorphic-rust.git
|
||||
cd isomorphic-rust/percy
|
||||
yarn install && yarn start
|
||||
```
|
||||
|
||||
...but I wouldn't really call it a "high quality" starting point to base future work on. It's mostly
|
||||
there to prove this is possible in the first place. And that's something to be proud of! There's a
|
||||
huge amount of engineering that went into showing a window with the text "It's alive!".
|
||||
|
||||
There are also a lot of usability issues that prevent me from recommending anyone try Electron and
|
||||
WASM apps at the moment, and I think that's the more important thing to discuss.
|
||||
|
||||
# Issue the First: Complicated Toolchains
|
||||
|
||||
I quickly established that [wasm-bindgen] was necessary to "link" my Rust code to Javascript. At
|
||||
that point you've got an Electron app that starts an HTML page which ultimately fetches your WASM
|
||||
blob. To keep things simple, the goal was to package everything using [webpack] so that I could just
|
||||
load a `bundle.js` file on the page. That decision was to be the last thing that kinda worked in
|
||||
this process.
|
||||
|
||||
The first issue
|
||||
[I ran into](https://www.reddit.com/r/rust/comments/98lpun/unable_to_load_wasm_for_electron_application/)
|
||||
while attempting to bundle everything via `webpack` is a detail in the WASM spec:
|
||||
|
||||
> This function accepts a Response object, or a promise for one, and ... **[if it] does not match
|
||||
> the `application/wasm` MIME type**, the returned promise will be rejected with a TypeError;
|
||||
>
|
||||
> [WebAssembly - Additional Web Embedding API](https://webassembly.org/docs/web/#additional-web-embedding-api)
|
||||
|
||||
Specifically, if you try and load a WASM blob without the MIME type set, you'll get an error. On the
|
||||
web this isn't a huge issue, as the server can set MIME types when delivering the blob. With
|
||||
Electron, you're resolving things with a `file://` URL and thus can't control the MIME type:
|
||||
|
||||
![TypeError: Incorrect response MIME type. Expected 'application/wasm'.](/assets/images/2018-09-15-incorrect-MIME-type.png)
|
||||
|
||||
There are a couple of solutions depending on how far into the deep end you care to venture:
|
||||
|
||||
- Embed a static file server in your Electron application
|
||||
- Use a [custom protocol](https://electronjs.org/docs/api/protocol) and custom protocol handler
|
||||
- Host your WASM blob on a website that you resolve at runtime
|
||||
|
||||
But all these are pretty bad solutions and defeat the purpose of using WASM in the first place.
|
||||
Instead, my workaround was to
|
||||
[open a PR with `webpack`](https://github.com/webpack/webpack/issues/7918) and use regex to remove
|
||||
calls to `instantiateStreaming` in the
|
||||
[build script](https://github.com/speice-io/isomorphic-rust/blob/master/percy/build.sh#L21-L25):
|
||||
|
||||
```sh
|
||||
cargo +nightly build --target=wasm32-unknown-unknown && \
|
||||
wasm-bindgen "$WASM_DIR/debug/$WASM_NAME.wasm" --out-dir "$APP_DIR" --no-typescript && \
|
||||
# Have to use --mode=development so we can patch out the call to instantiateStreaming
|
||||
"$DIR/node_modules/webpack-cli/bin/cli.js" --mode=development "$APP_DIR/app_loader.js" -o "$APP_DIR/bundle.js" && \
|
||||
sed -i 's/.*instantiateStreaming.*//g' "$APP_DIR/bundle.js"
|
||||
```
|
||||
|
||||
Once that lands, the
|
||||
[build process](https://github.com/speice-io/isomorphic-rust/blob/master/percy_patched_webpack/build.sh#L24-L27)
|
||||
becomes much simpler:
|
||||
|
||||
```sh
|
||||
|
||||
cargo +nightly build --target=wasm32-unknown-unknown && \
|
||||
wasm-bindgen "$WASM_DIR/debug/$WASM_NAME.wasm" --out-dir "$APP_DIR" --no-typescript && \
|
||||
"$DIR/node_modules/webpack-cli/bin/cli.js" --mode=production "$APP_DIR/app_loader.js" -o "$APP_DIR/bundle.js"
|
||||
```
|
||||
|
||||
But we're not done yet! After we compile Rust into WASM and link WASM to Javascript (via
|
||||
`wasm-bindgen` and `webpack`), we still have to make an Electron app. For this purpose I used a
|
||||
starter app from [Electron Forge], and then a
|
||||
[`prestart` script](https://github.com/speice-io/isomorphic-rust/blob/master/percy/package.json#L8)
|
||||
to actually handle starting the application.
|
||||
|
||||
The
|
||||
[final toolchain](https://github.com/speice-io/isomorphic-rust/blob/master/percy/package.json#L8)
|
||||
looks something like this:
|
||||
|
||||
- `yarn start` triggers the `prestart` script
|
||||
- `prestart` checks for missing tools (`wasm-bindgen-cli`, etc.) and then:
|
||||
- Uses `cargo` to compile the Rust code into WASM
|
||||
- Uses `wasm-bindgen` to link the WASM blob into a Javascript file with exported symbols
|
||||
- Uses `webpack` to bundle the page start script with the Javascript we just generated
|
||||
- Uses `babel` under the hood to compile the `wasm-bindgen` code down from ES6 into something
|
||||
browser-compatible
|
||||
- The `start` script runs an Electron Forge handler to do some sanity checks
|
||||
- Electron actually starts
|
||||
|
||||
...which is complicated. I think more work needs to be done to either build a high-quality starter
|
||||
app that can manage these steps, or another tool that "just handles" the complexity of linking a
|
||||
compiled WASM file into something the Electron browser can run.
|
||||
|
||||
# Issue the Second: WASM tools in Rust
|
||||
|
||||
For as much as I didn't enjoy the Javascript tooling needed to interface with Rust, the Rust-only
|
||||
bits aren't any better at the moment. I get it, a lot of projects are just starting off, and that
|
||||
leads to a fragmented ecosystem. Here's what I can recommend as a starting point:
|
||||
|
||||
Don't check in your `Cargo.lock` files to version control. If there's a disagreement between the
|
||||
version of `wasm-bindgen-cli` you have installed and the `wasm-bindgen` you're compiling with in
|
||||
`Cargo.lock`, you get a nasty error:
|
||||
|
||||
```
|
||||
it looks like the Rust project used to create this wasm file was linked against
|
||||
a different version of wasm-bindgen than this binary:
|
||||
|
||||
rust wasm file: 0.2.21
|
||||
this binary: 0.2.17
|
||||
|
||||
Currently the bindgen format is unstable enough that these two version must
|
||||
exactly match, so it's required that these two version are kept in sync by
|
||||
either updating the wasm-bindgen dependency or this binary.
|
||||
```
|
||||
|
||||
Not that I ever managed to run into this myself (_coughs nervously_).
|
||||
|
||||
There are two projects attempting to be "application frameworks": [percy] and [yew]. Between those,
|
||||
I managed to get [two](https://github.com/speice-io/isomorphic-rust/tree/master/percy)
|
||||
[examples](https://github.com/speice-io/isomorphic-rust/tree/master/percy_patched_webpack) running
|
||||
using `percy`, but was unable to get an
|
||||
[example](https://github.com/speice-io/isomorphic-rust/tree/master/yew) running with `yew` because
|
||||
of issues with "missing modules" during the `webpack` step:
|
||||
|
||||
```sh
|
||||
ERROR in ./dist/electron_yew_wasm_bg.wasm
|
||||
Module not found: Error: Can't resolve 'env' in '/home/bspeice/Development/isomorphic_rust/yew/dist'
|
||||
@ ./dist/electron_yew_wasm_bg.wasm
|
||||
@ ./dist/electron_yew_wasm.js
|
||||
@ ./dist/app.js
|
||||
@ ./dist/app_loader.js
|
||||
```
|
||||
|
||||
If you want to work with the browser APIs directly, your choices are [percy-webapis] or [stdweb] (or
|
||||
eventually [web-sys]). See above for my `percy` examples, but when I tried
|
||||
[an example with `stdweb`](https://github.com/speice-io/isomorphic-rust/tree/master/stdweb), I was
|
||||
unable to get it running:
|
||||
|
||||
```sh
|
||||
ERROR in ./dist/stdweb_electron_bg.wasm
|
||||
Module not found: Error: Can't resolve 'env' in '/home/bspeice/Development/isomorphic_rust/stdweb/dist'
|
||||
@ ./dist/stdweb_electron_bg.wasm
|
||||
@ ./dist/stdweb_electron.js
|
||||
@ ./dist/app_loader.js
|
||||
```
|
||||
|
||||
At this point I'm pretty convinced that `stdweb` is causing issues for `yew` as well, but can't
|
||||
prove it.
|
||||
|
||||
I did also get a [minimal example](https://github.com/speice-io/isomorphic-rust/tree/master/minimal)
|
||||
running that doesn't depend on any tools besides `wasm-bindgen`. However, it requires manually
|
||||
writing "`extern C`" blocks for everything you need from the browser. Es no bueno.
|
||||
|
||||
Finally, from a tools and platform view, there are two up-and-coming packages that should be
|
||||
mentioned: [js-sys] and [web-sys]. Their purpose is to be fundamental building blocks that expose
|
||||
the browser's APIs to Rust. If you're interested in building an app framework from scratch, these
|
||||
should give you the most flexibility. I didn't touch either in my research, though I expect them to
|
||||
be essential long-term.
|
||||
|
||||
So there's a lot in play from the Rust side of things, and it's just going to take some time to
|
||||
figure out what works and what doesn't.
|
||||
|
||||
# Issue the Third: Known Unknowns
|
||||
|
||||
Alright, so after I managed to get an application started, I stopped there. It was a good deal of
|
||||
effort to chain together even a proof of concept, and at this point I'd rather learn [Typescript]
|
||||
than keep trying to maintain an incredibly brittle pipeline. Blasphemy, I know...
|
||||
|
||||
The important point I want to make is that there's a lot unknown about how any of this holds up
|
||||
outside proofs of concept. Things I didn't attempt:
|
||||
|
||||
- Testing
|
||||
- Packaging
|
||||
- Updates
|
||||
- Literally anything related to why I wanted to use Electron in the first place
|
||||
|
||||
# What it Would Take
|
||||
|
||||
Much as I don't like Javascript, the tools are too shaky for me to recommend mixing Electron and
|
||||
WASM at the moment. There's a lot of innovation happening, so who knows? Someone might have an
|
||||
application in production a couple months from now. But at the moment, I'm personally going to stay
|
||||
away.
|
||||
|
||||
Let's finish with a wishlist then - here are the things that I think need to happen before
|
||||
Electron/WASM/Rust can become a thing:
|
||||
|
||||
- Webpack still needs some updates. The necessary work is in progress, but hasn't landed yet
|
||||
([#7983](https://github.com/webpack/webpack/pull/7983))
|
||||
- Browser API libraries (`web-sys` and `stdweb`) need to make sure they can support running in
|
||||
Electron (see module error above)
|
||||
- Projects need to stabilize. There's talk of `stdweb` being turned into a Rust API
|
||||
[on top of web-sys](https://github.com/rustwasm/team/issues/226#issuecomment-418475778), and percy
|
||||
[moving to web-sys](https://github.com/chinedufn/percy/issues/24), both of which are big changes
|
||||
- `wasm-bindgen` is great, but still in the "move fast and break things" phase
|
||||
- A good "boilerplate" app would dramatically simplify the start-up costs;
|
||||
[electron-react-boilerplate](https://github.com/chentsulin/electron-react-boilerplate) comes to
|
||||
mind as a good project to imitate
|
||||
- More blog posts/contributors! I think Electron + Rust could be cool, but I have no idea what I'm
|
||||
doing
|
||||
|
||||
[wxwidgets]: https://wxwidgets.org/
|
||||
[libui-rs]: https://github.com/LeoTindall/libui-rs/
|
||||
[electron]: https://electronjs.org/
|
||||
[babel]: https://babeljs.io/
|
||||
[wxrust]: https://github.com/kenz-gelsoft/wxRust
|
||||
[wasm-bindgen]: https://github.com/rustwasm/wasm-bindgen
|
||||
[js-sys]: https://crates.io/crates/js-sys
|
||||
[percy-webapis]: https://crates.io/crates/percy-webapis
|
||||
[stdweb]: https://crates.io/crates/stdweb
|
||||
[web-sys]: https://crates.io/crates/web-sys
|
||||
[percy]: https://chinedufn.github.io/percy/
|
||||
[virtual-dom-rs]: https://crates.io/crates/virtual-dom-rs
|
||||
[yew]: https://github.com/DenisKolodin/yew
|
||||
[react]: https://reactjs.org/
|
||||
[elm]: http://elm-lang.org/
|
||||
[asm.js]: http://asmjs.org/
|
||||
[emscripten]: https://kripken.github.io/emscripten-site/
|
||||
[typescript]: https://www.typescriptlang.org/
|
||||
[electron forge]: https://electronforge.io/
|
||||
[conrod]: https://github.com/PistonDevelopers/conrod
|
||||
[webpack]: https://webpack.js.org/
|
@ -1,168 +0,0 @@
|
||||
---
|
||||
layout: post
|
||||
title: "A Case Study in Heaptrack"
|
||||
description: "...because you don't need no garbage collection"
|
||||
category:
|
||||
tags: []
|
||||
---
|
||||
|
||||
One of my earliest conversations about programming went like this:
|
||||
|
||||
> Programmers have it too easy these days. They should learn to develop in low memory environments
|
||||
> and be more efficient.
|
||||
>
|
||||
> -- My Father (paraphrased)
|
||||
|
||||
...though it's not like the first code I wrote was for a
|
||||
[graphing calculator](https://education.ti.com/en/products/calculators/graphing-calculators/ti-84-plus-se)
|
||||
packing a whole 24KB of RAM. By the way, _what are you doing on my lawn?_
|
||||
|
||||
The principle remains though: be efficient with the resources you have, because
|
||||
[what Intel giveth, Microsoft taketh away](http://exo-blog.blogspot.com/2007/09/what-intel-giveth-microsoft-taketh-away.html).
|
||||
My professional work is focused on this kind of efficiency; low-latency financial markets demand
|
||||
that you understand at a deep level _exactly_ what your code is doing. As I continue experimenting
|
||||
with Rust for personal projects, it's exciting to bring a utilitarian mindset with me: there's
|
||||
flexibility for the times I pretend to have a garbage collector, and flexibility for the times that
|
||||
I really care about how memory is used.
|
||||
|
||||
This post is a (small) case study in how I went from the former to the latter. And ultimately, it's
|
||||
intended to be a starting toolkit to empower analysis of your own code.
|
||||
|
||||
# Curiosity
|
||||
|
||||
When I first started building the [dtparse] crate, my intention was to mirror as closely as possible
|
||||
the equivalent [Python library][dateutil]. Python, as you may know, is garbage collected. Very
|
||||
rarely is memory usage considered in Python, and I likewise wasn't paying too much attention when
|
||||
`dtparse` was first being built.
|
||||
|
||||
This lackadaisical approach to memory works well enough, and I'm not planning on making `dtparse`
|
||||
hyper-efficient. But every so often, I've wondered: "what exactly is going on in memory?" With the
|
||||
advent of Rust 1.28 and the
|
||||
[Global Allocator trait](https://doc.rust-lang.org/std/alloc/trait.GlobalAlloc.html), I had a really
|
||||
great idea: _build a custom allocator that allows you to track your own allocations._ That way, you
|
||||
can do things like writing tests for both correct results and correct memory usage. I gave it a
|
||||
[shot][qadapt], but learned very quickly: **never write your own allocator**. It went from "fun
|
||||
weekend project" to "I have literally no idea what my computer is doing" at breakneck speed.
|
||||
|
||||
Instead, I'll highlight a separate path I took to make sense of my memory usage: [heaptrack].
|
||||
|
||||
# Turning on the System Allocator
|
||||
|
||||
This is the hardest part of the post. Because Rust uses
|
||||
[its own allocator](https://github.com/rust-lang/rust/pull/27400#issue-41256384) by default,
|
||||
`heaptrack` is unable to properly record unmodified Rust code. To remedy this, we'll make use of the
|
||||
`#[global_allocator]` attribute.
|
||||
|
||||
Specifically, in `lib.rs` or `main.rs`, add this:
|
||||
|
||||
```rust
|
||||
use std::alloc::System;
|
||||
|
||||
#[global_allocator]
|
||||
static GLOBAL: System = System;
|
||||
```
|
||||
|
||||
...and that's it. Everything else comes essentially for free.
|
||||
|
||||
# Running heaptrack
|
||||
|
||||
Assuming you've installed heaptrack <span style="font-size: .6em;">(Homebrew in Mac, package manager
|
||||
in Linux, ??? in Windows)</span>, all that's left is to fire up your application:
|
||||
|
||||
```
|
||||
heaptrack my_application
|
||||
```
|
||||
|
||||
It's that easy. After the program finishes, you'll see a file in your local directory with a name
|
||||
like `heaptrack.my_application.XXXX.gz`. If you load that up in `heaptrack_gui`, you'll see
|
||||
something like this:
|
||||
|
||||
![heaptrack](/assets/images/2018-10-heaptrack/heaptrack-before.png)
|
||||
|
||||
---
|
||||
|
||||
And even these pretty colors:
|
||||
|
||||
![pretty colors](/assets/images/2018-10-heaptrack/heaptrack-flamegraph.png)
|
||||
|
||||
# Reading Flamegraphs
|
||||
|
||||
To make sense of our memory usage, we're going to focus on that last picture - it's called a
|
||||
["flamegraph"](http://www.brendangregg.com/flamegraphs.html). These charts are typically used to
|
||||
show how much time your program spends executing each function, but they're used here to show how
|
||||
much memory was allocated during those functions instead.
|
||||
|
||||
For example, we can see that all allocations happened during the `main` function:
|
||||
|
||||
![allocations in main](/assets/images/2018-10-heaptrack/heaptrack-main-colorized.png)
|
||||
|
||||
...and within that, all allocations happened during `dtparse::parse`:
|
||||
|
||||
![allocations in dtparse](/assets/images/2018-10-heaptrack/heaptrack-dtparse-colorized.png)
|
||||
|
||||
...and within _that_, allocations happened in two different places:
|
||||
|
||||
![allocations in parseinfo](/assets/images/2018-10-heaptrack/heaptrack-parseinfo-colorized.png)
|
||||
|
||||
Now I apologize that it's hard to see, but there's one area specifically that stuck out as an issue:
|
||||
**what the heck is the `Default` thing doing?**
|
||||
|
||||
![pretty colors](/assets/images/2018-10-heaptrack/heaptrack-flamegraph-default.png)
|
||||
|
||||
# Optimizing dtparse
|
||||
|
||||
See, I knew that there were some allocations during calls to `dtparse::parse`, but I was totally
|
||||
wrong about where the bulk of allocations occurred in my program. Let me post the code and see if
|
||||
you can spot the mistake:
|
||||
|
||||
```rust
|
||||
/// Main entry point for using `dtparse`.
|
||||
pub fn parse(timestr: &str) -> ParseResult<(NaiveDateTime, Option<FixedOffset>)> {
|
||||
let res = Parser::default().parse(
|
||||
timestr, None, None, false, false,
|
||||
None, false,
|
||||
&HashMap::new(),
|
||||
)?;
|
||||
|
||||
Ok((res.0, res.1))
|
||||
}
|
||||
```
|
||||
|
||||
> [dtparse](https://github.com/bspeice/dtparse/blob/4d7c5dd99572823fa4a390b483c38ab020a2172f/src/lib.rs#L1286)
|
||||
|
||||
---
|
||||
|
||||
Because `Parser::parse` requires a mutable reference to itself, I have to create a new
|
||||
`Parser::default` every time it receives a string. This is excessive! We'd rather have an immutable
|
||||
parser that can be re-used, and avoid allocating memory in the first place.
|
||||
|
||||
Armed with that information, I put some time in to
|
||||
[make the parser immutable](https://github.com/bspeice/dtparse/commit/741afa34517d6bc1155713bbc5d66905fea13fad#diff-b4aea3e418ccdb71239b96952d9cddb6).
|
||||
Now that I can re-use the same parser over and over, the allocations disappear:
|
||||
|
||||
![allocations cleaned up](/assets/images/2018-10-heaptrack/heaptrack-flamegraph-after.png)
|
||||
|
||||
In total, we went from requiring 2 MB of memory in
|
||||
[version 1.0.2](https://crates.io/crates/dtparse/1.0.2):
|
||||
|
||||
![memory before](/assets/images/2018-10-heaptrack/heaptrack-closeup.png)
|
||||
|
||||
All the way down to 300KB in [version 1.0.3](https://crates.io/crates/dtparse/1.0.3):
|
||||
|
||||
![memory after](/assets/images/2018-10-heaptrack/heaptrack-closeup-after.png)
|
||||
|
||||
# Conclusion
|
||||
|
||||
In the end, you don't need to write a custom allocator to be efficient with memory; great tools
|
||||
already exist to help you understand what your program is doing.
|
||||
|
||||
**Use them.**
|
||||
|
||||
Given that [Moore's Law](https://en.wikipedia.org/wiki/Moore%27s_law) is
|
||||
[dead](https://www.technologyreview.com/s/601441/moores-law-is-dead-now-what/), we've all got to do
|
||||
our part to take back what Microsoft stole.
|
||||
|
||||
[dtparse]: https://crates.io/crates/dtparse
|
||||
[dateutil]: https://github.com/dateutil/dateutil
|
||||
[heaptrack]: https://github.com/KDE/heaptrack
|
||||
[qadapt]: https://crates.io/crates/qadapt
|
@ -1,34 +0,0 @@
|
||||
---
|
||||
layout: post
|
||||
title: 'More "What Companies Really Mean"'
|
||||
description: 'when they ask "Why should we hire you?"'
|
||||
category:
|
||||
tags: []
|
||||
---
|
||||
|
||||
I recently stumbled across a phenomenal small article entitled
|
||||
[What Startups Really Mean By "Why Should We Hire You?"](https://angel.co/blog/what-startups-really-mean-by-why-should-we-hire-you).
|
||||
Having been interviewed by smaller companies (though not exactly startups), the questions and
|
||||
subtexts are the same. There's often a question behind the question that you're actually trying to
|
||||
answer, and I wish I'd spotted the nuance earlier in my career.
|
||||
|
||||
Let me also make note of one more question/euphemism I've come across:
|
||||
|
||||
# How do you feel about Production Support?
|
||||
|
||||
**Translation**: _We're a fairly small team, and when things break on an evening/weekend/Christmas
|
||||
Day, can we call on you to be there?_
|
||||
|
||||
I've met decidedly few people in my life who truly enjoy the "ops" side of "devops". They're
|
||||
incredibly good at taking an impossible problem, pre-existing knowledge of arcane arts, and turning
|
||||
that into a functioning system at the end. And if they all left for lunch, we probably wouldn't make
|
||||
it out the door before the zombie apocalypse.
|
||||
|
||||
Larger organizations (in my experience, 500+ person organizations) have the luxury of hiring people
|
||||
who either enjoy that, or play along nicely enough that our systems keep working.
|
||||
|
||||
Small teams have no such luck. If you're interviewing at a small company, especially as a "data
|
||||
scientist" or other somesuch position, be aware that systems can and do spontaneously combust at the
|
||||
most inopportune moments.
|
||||
|
||||
**Terrible-but-popular answers include**: _It's a part of the job, and I'm happy to contribute._
|
@ -1,218 +0,0 @@
|
||||
---
|
||||
layout: post
|
||||
title: "QADAPT - debug_assert! for your memory usage"
|
||||
description: "...and why you want an allocator that goes 💥."
|
||||
category:
|
||||
tags: []
|
||||
---
|
||||
|
||||
I think it's part of the human condition to ignore perfectly good advice when it comes our way. A
|
||||
bit over a month ago, I was dispensing sage wisdom for the ages:
|
||||
|
||||
> I had a really great idea: build a custom allocator that allows you to track your own allocations.
|
||||
> I gave it a shot, but learned very quickly: **never write your own allocator.**
|
||||
>
|
||||
> -- [me](/2018/10/case-study-optimization.html)
|
||||
|
||||
I proceeded to ignore it, because we never really learn from our mistakes.
|
||||
|
||||
There's another part of the human condition that derives joy from seeing things explode.
|
||||
|
||||
<iframe src="https://giphy.com/embed/YA6dmVW0gfIw8" width="480" height="336" frameBorder="0"></iframe>
|
||||
|
||||
And _that's_ the part I'm going to focus on.
|
||||
|
||||
# Why an Allocator?
|
||||
|
||||
So why, after complaining about allocators, would I still want to write one? There are three reasons
|
||||
for that:
|
||||
|
||||
1. Allocation/dropping is slow
|
||||
2. It's difficult to know exactly when Rust will allocate or drop, especially when using code that
|
||||
you did not write
|
||||
3. I want automated tools to verify behavior, instead of inspecting by hand
|
||||
|
||||
When I say "slow," it's important to define the terms. If you're writing web applications, you'll
|
||||
spend orders of magnitude more time waiting for the database than you will the allocator. However,
|
||||
there's still plenty of code where micro- or nano-seconds matter; think
|
||||
[finance](https://www.youtube.com/watch?v=NH1Tta7purM),
|
||||
[real-time audio](https://www.reddit.com/r/rust/comments/9hg7yj/synthesizer_progress_update/e6c291f),
|
||||
[self-driving cars](https://polysync.io/blog/session-types-for-hearty-codecs/), and
|
||||
[networking](https://carllerche.github.io/bytes/bytes/index.html). In these situations it's simply
|
||||
unacceptable for you to spend time doing things that are not your program, and waiting on the
|
||||
allocator is not cool.
|
||||
|
||||
As I continue to learn Rust, it's difficult for me to predict where exactly allocations will happen.
|
||||
So, I propose we play a quick trivia game: **Does this code invoke the allocator?**
|
||||
|
||||
## Example 1
|
||||
|
||||
```rust
|
||||
fn my_function() {
|
||||
let v: Vec<u8> = Vec::new();
|
||||
}
|
||||
```
|
||||
|
||||
**No**: Rust [knows how big](https://doc.rust-lang.org/std/mem/fn.size_of.html) the `Vec` type is,
|
||||
and reserves a fixed amount of memory on the stack for the `v` vector. However, if we wanted to
|
||||
reserve extra space (using `Vec::with_capacity`) the allocator would get invoked.
|
||||
|
||||
## Example 2
|
||||
|
||||
```rust
|
||||
fn my_function() {
|
||||
let v: Box<Vec<u8>> = Box::new(Vec::new());
|
||||
}
|
||||
```
|
||||
|
||||
**Yes**: Because Boxes allow us to work with things that are of unknown size, they have to allocate on
|
||||
the heap. While the `Box` is unnecessary in this snippet (release builds will optimize out the
|
||||
allocation), reserving heap space more generally is needed to pass a dynamically sized type to
|
||||
another function.
|
||||
|
||||
## Example 3
|
||||
|
||||
```rust
|
||||
fn my_function(v: Vec<u8>) {
|
||||
v.push(5);
|
||||
}
|
||||
```
|
||||
|
||||
**Maybe**: Depending on whether the Vector we were given has space available, we may or may not
|
||||
allocate. Especially when dealing with code that you did not author, it's difficult to verify that
|
||||
things behave as you expect them to.
|
||||
|
||||
# Blowing Things Up
|
||||
|
||||
So, how exactly does QADAPT solve these problems? **Whenever an allocation or drop occurs in code
|
||||
marked allocation-safe, QADAPT triggers a thread panic.** We don't want to let the program continue
|
||||
as if nothing strange happened, _we want things to explode_.
|
||||
|
||||
However, you don't want code to panic in production because of circumstances you didn't predict.
|
||||
Just like [`debug_assert!`](https://doc.rust-lang.org/std/macro.debug_assert.html), **QADAPT will
|
||||
strip out its own code when building in release mode to guarantee no panics and no performance
|
||||
impact.**
|
||||
|
||||
Finally, there are three ways to have QADAPT check that your code will not invoke the allocator:
|
||||
|
||||
## Using a procedural macro
|
||||
|
||||
The easiest method, watch an entire function for allocator invocation:
|
||||
|
||||
```rust
|
||||
use qadapt::no_alloc;
|
||||
use qadapt::QADAPT;
|
||||
|
||||
#[global_allocator]
|
||||
static Q: QADAPT = QADAPT;
|
||||
|
||||
#[no_alloc]
|
||||
fn push_vec(v: &mut Vec<u8>) {
|
||||
// This triggers a panic if v.len() == v.capacity()
|
||||
v.push(5);
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let v = Vec::with_capacity(1);
|
||||
|
||||
// This will *not* trigger a panic
|
||||
push_vec(&v);
|
||||
|
||||
// This *will* trigger a panic
|
||||
push_vec(&v);
|
||||
}
|
||||
```
|
||||
|
||||
## Using a regular macro
|
||||
|
||||
For times when you need more precision:
|
||||
|
||||
```rust
|
||||
use qadapt::assert_no_alloc;
|
||||
use qadapt::QADAPT;
|
||||
|
||||
#[global_allocator]
|
||||
static Q: QADAPT = QADAPT;
|
||||
|
||||
fn main() {
|
||||
let v = Vec::with_capacity(1);
|
||||
|
||||
// No allocations here, we already have space reserved
|
||||
assert_no_alloc!(v.push(5));
|
||||
|
||||
// Even though we remove an item, it doesn't trigger a drop
|
||||
// because it's a scalar. If it were a `Box<_>` type,
|
||||
// a drop would trigger.
|
||||
assert_no_alloc!({
|
||||
v.pop().unwrap();
|
||||
});
|
||||
}
|
||||
```
|
||||
|
||||
## Using function calls
|
||||
|
||||
Both the most precise and most tedious:
|
||||
|
||||
```rust
|
||||
use qadapt::enter_protected;
|
||||
use qadapt::exit_protected;
|
||||
use qadapt::QADAPT;
|
||||
|
||||
#[global_allocator]
|
||||
static Q: QADAPT = QADAPT;
|
||||
|
||||
fn main() {
|
||||
// This triggers an allocation (on non-release builds)
|
||||
let v = Vec::with_capacity(1);
|
||||
|
||||
enter_protected();
|
||||
// This does not trigger an allocation because we've reserved size
|
||||
v.push(0);
|
||||
exit_protected();
|
||||
|
||||
// This triggers an allocation because we ran out of size,
|
||||
// but doesn't panic because we're no longer protected.
|
||||
v.push(1);
|
||||
}
|
||||
```
|
||||
|
||||
## Caveats
|
||||
|
||||
It's important to point out that QADAPT code is synchronous, so please be careful when mixing in
|
||||
asynchronous functions:
|
||||
|
||||
```rust
|
||||
use futures::future::Future;
|
||||
use futures::future::ok;
|
||||
|
||||
#[no_alloc]
|
||||
fn async_capacity() -> impl Future<Item=Vec<u8>, Error=()> {
|
||||
ok(12).and_then(|e| Ok(Vec::with_capacity(e)))
|
||||
}
|
||||
|
||||
fn main() {
|
||||
// This doesn't trigger a panic because the `and_then` closure
|
||||
// wasn't run during the function call.
|
||||
async_capacity();
|
||||
|
||||
// Still no panic
|
||||
assert_no_alloc!(async_capacity());
|
||||
|
||||
// This will panic because the allocation happens during `unwrap`
|
||||
// in the `assert_no_alloc!` macro
|
||||
assert_no_alloc!(async_capacity().poll().unwrap());
|
||||
}
|
||||
```
|
||||
|
||||
# Conclusion
|
||||
|
||||
While there's a lot more to writing high-performance code than managing your usage of the allocator,
|
||||
it's critical that you do use the allocator correctly. QADAPT will verify that your code is doing
|
||||
what you expect. It's usable even on stable Rust from version 1.31 onward, which isn't the case for
|
||||
most allocators. Version 1.0 was released today, and you can check it out over at
|
||||
[crates.io](https://crates.io/crates/qadapt) or on [github](https://github.com/bspeice/qadapt).
|
||||
|
||||
I'm hoping to write more about high-performance Rust in the future, and I expect that QADAPT will
|
||||
help guide that. If there are topics you're interested in, let me know in the comments below!
|
||||
|
||||
[qadapt]: https://crates.io/crates/qadapt
|
@ -1,113 +0,0 @@
|
||||
---
|
||||
layout: post
|
||||
title: "Allocations in Rust"
|
||||
description: "An introduction to the memory model."
|
||||
category:
|
||||
tags: [rust, understanding-allocations]
|
||||
---
|
||||
|
||||
There's an alchemy of distilling complex technical topics into articles and videos that change the
|
||||
way programmers see the tools they interact with on a regular basis. I knew what a linker was, but
|
||||
there's a staggering amount of complexity in between
|
||||
[the OS and `main()`](https://www.youtube.com/watch?v=dOfucXtyEsU). Rust programmers use the
|
||||
[`Box`](https://doc.rust-lang.org/stable/std/boxed/struct.Box.html) type all the time, but there's a
|
||||
rich history of the Rust language itself wrapped up in
|
||||
[how special it is](https://manishearth.github.io/blog/2017/01/10/rust-tidbits-box-is-special/).
|
||||
|
||||
In a similar vein, this series attempts to look at code and understand how memory is used; the
|
||||
complex choreography of operating system, compiler, and program that frees you to focus on
|
||||
functionality far-flung from frivolous book-keeping. The Rust compiler relieves a great deal of the
|
||||
cognitive burden associated with memory management, but we're going to step into its world for a
|
||||
while.
|
||||
|
||||
Let's learn a bit about memory in Rust.
|
||||
|
||||
# Table of Contents
|
||||
|
||||
This series is intended as both learning and reference material; we'll work through the different
|
||||
memory types Rust uses, and explain the implications of each. Ultimately, a summary will be provided
|
||||
as a cheat sheet for easy future reference. To that end, a table of contents is in order:
|
||||
|
||||
- Foreword
|
||||
- [Global Memory Usage: The Whole World](/2019/02/the-whole-world.html)
|
||||
- [Fixed Memory: Stacking Up](/2019/02/stacking-up.html)
|
||||
- [Dynamic Memory: A Heaping Helping](/2019/02/a-heaping-helping.html)
|
||||
- [Compiler Optimizations: What It's Done For You Lately](/2019/02/compiler-optimizations.html)
|
||||
- [Summary: What Are the Rules?](/2019/02/summary.html)
|
||||
|
||||
# Foreword
|
||||
|
||||
Rust's three defining features of
|
||||
[Performance, Reliability, and Productivity](https://www.rust-lang.org/) are all driven to a great
|
||||
degree by how the Rust compiler understands memory usage. Unlike managed memory languages (Java,
|
||||
Python), Rust
|
||||
[doesn't really](https://words.steveklabnik.com/borrow-checking-escape-analysis-and-the-generational-hypothesis)
|
||||
garbage collect; instead, it uses an
|
||||
[ownership](https://doc.rust-lang.org/book/ch04-01-what-is-ownership.html) system to reason about
|
||||
how long objects will last in your program. In some cases, if the life of an object is fairly
|
||||
transient, Rust can make use of a very fast region called the "stack." When that's not possible,
|
||||
Rust uses
|
||||
[dynamic (heap) memory](https://en.wikipedia.org/wiki/Memory_management#Dynamic_memory_allocation)
|
||||
and the ownership system to ensure you can't accidentally corrupt memory. It's not as fast, but it
|
||||
is important to have available.
|
||||
|
||||
That said, there are specific situations in Rust where you'd never need to worry about the
|
||||
stack/heap distinction! If you:
|
||||
|
||||
1. Never use `unsafe`
|
||||
2. Never use `#![feature(alloc)]` or the [`alloc` crate](https://doc.rust-lang.org/alloc/index.html)
|
||||
|
||||
...then it's not possible for you to use dynamic memory!
|
||||
|
||||
For some uses of Rust, typically embedded devices, these constraints are OK. They have very limited
|
||||
memory, and the program binary size itself may significantly affect what's available! There's no
|
||||
operating system able to manage this
|
||||
["virtual memory"](https://en.wikipedia.org/wiki/Virtual_memory) thing, but that's not an issue
|
||||
because there's only one running application. The
|
||||
[embedonomicon](https://docs.rust-embedded.org/embedonomicon/preface.html) is ever in mind, and
|
||||
interacting with the "real world" through extra peripherals is accomplished by reading and writing
|
||||
to [specific memory addresses](https://bob.cs.sonoma.edu/IntroCompOrg-RPi/sec-gpio-mem.html).
|
||||
|
||||
Most Rust programs find these requirements overly burdensome though. C++ developers would struggle
|
||||
without access to [`std::vector`](https://en.cppreference.com/w/cpp/container/vector) (except those
|
||||
hardcore no-STL people), and Rust developers would struggle without
|
||||
[`std::vec`](https://doc.rust-lang.org/std/vec/struct.Vec.html). But with the constraints above,
|
||||
`std::vec` is actually a part of the
|
||||
[`alloc` crate](https://doc.rust-lang.org/alloc/vec/struct.Vec.html), and thus off-limits. `Box`,
|
||||
`Rc`, etc., are also unusable for the same reason.
|
||||
|
||||
Whether writing code for embedded devices or not, the important thing in both situations is how much
|
||||
you know _before your application starts_ about what its memory usage will look like. In embedded
|
||||
devices, there's a small, fixed amount of memory to use. In a browser, you have no idea how large
|
||||
[google.com](https://www.google.com)'s home page is until you start trying to download it. The
|
||||
compiler uses this knowledge (or lack thereof) to optimize how memory is used; put simply, your code
|
||||
runs faster when the compiler can guarantee exactly how much memory your program needs while it's
|
||||
running. This series is all about understanding how the compiler reasons about your program, with an
|
||||
emphasis on the implications for performance.
|
||||
|
||||
Now let's address some conditions and caveats before going much further:
|
||||
|
||||
- We'll focus on "safe" Rust only; `unsafe` lets you use platform-specific allocation APIs
|
||||
([`malloc`](https://www.tutorialspoint.com/c_standard_library/c_function_malloc.htm)) that we'll
|
||||
ignore.
|
||||
- We'll assume a "debug" build of Rust code (what you get with `cargo run` and `cargo test`) and
|
||||
address (pun intended) release mode at the end (`cargo run --release` and `cargo test --release`).
|
||||
- All content will be run using Rust 1.32, as that's the highest currently supported in the
|
||||
[Compiler Explorer](https://godbolt.org/). As such, we'll avoid upcoming innovations like
|
||||
[compile-time evaluation of `static`](https://github.com/rust-lang/rfcs/blob/master/text/0911-const-fn.md)
|
||||
that are available in nightly.
|
||||
- Because of the nature of the content, being able to read assembly is helpful. We'll keep it
|
||||
simple, but I [found](https://stackoverflow.com/a/4584131/1454178) a
|
||||
[refresher](https://stackoverflow.com/a/26026278/1454178) on the `push` and `pop`
|
||||
[instructions](http://www.cs.virginia.edu/~evans/cs216/guides/x86.html) was helpful while writing
|
||||
this.
|
||||
- I've tried to be precise in saying only what I can prove using the tools (ASM, docs) that are
|
||||
available, but if there's something said in error it will be corrected expeditiously. Please let
|
||||
me know at [bradlee@speice.io](mailto:bradlee@speice.io).
|
||||
|
||||
Finally, I'll do what I can to flag potential future changes but the Rust docs have a notice worth
|
||||
repeating:
|
||||
|
||||
> Rust does not currently have a rigorously and formally defined memory model.
|
||||
>
|
||||
> -- [the docs](https://doc.rust-lang.org/std/ptr/fn.read_volatile.html)
|
@ -1,337 +0,0 @@
|
||||
---
|
||||
layout: post
|
||||
title: "Global Memory Usage: The Whole World"
|
||||
description: "Static considered slightly less harmful."
|
||||
category:
|
||||
tags: [rust, understanding-allocations]
|
||||
---
|
||||
|
||||
The first memory type we'll look at is pretty special: when Rust can prove that a _value_ is fixed
|
||||
for the life of a program (`const`), and when a _reference_ is unique for the life of a program
|
||||
(`static` as a declaration, not
|
||||
[`'static`](https://doc.rust-lang.org/book/ch10-03-lifetime-syntax.html#the-static-lifetime) as a
|
||||
lifetime), we can make use of global memory. This special section of data is embedded directly in
|
||||
the program binary so that variables are ready to go once the program loads; no additional
|
||||
computation is necessary.
|
||||
|
||||
Understanding the value/reference distinction is important for reasons we'll go into below, and
|
||||
while the
|
||||
[full specification](https://github.com/rust-lang/rfcs/blob/master/text/0246-const-vs-static.md) for
|
||||
these two keywords is available, we'll take a hands-on approach to the topic.
|
||||
|
||||
# **const**
|
||||
|
||||
When a _value_ is guaranteed to be unchanging in your program (where "value" may be scalars,
|
||||
`struct`s, etc.), you can declare it `const`. This tells the compiler that it's safe to treat the
|
||||
value as never changing, and enables some interesting optimizations; not only is there no
|
||||
initialization cost to creating the value (it is loaded at the same time as the executable parts of
|
||||
your program), but the compiler can also copy the value around if it speeds up the code.
|
||||
|
||||
The points we need to address when talking about `const` are:
|
||||
|
||||
- `const` values are stored in read-only memory - they're impossible to modify.
|
||||
- Values resulting from calling a `const fn` are materialized at compile-time.
|
||||
- The compiler may (or may not) copy `const` values wherever it chooses.
|
||||
|
||||
## Read-Only
|
||||
|
||||
The first point is a bit strange - "read-only memory."
|
||||
[The Rust book](https://doc.rust-lang.org/book/ch03-01-variables-and-mutability.html#differences-between-variables-and-constants)
|
||||
mentions in a couple places that using `mut` with constants is illegal, but it's also important to
|
||||
demonstrate just how immutable they are. _Typically_ in Rust you can use
|
||||
[interior mutability](https://doc.rust-lang.org/book/ch15-05-interior-mutability.html) to modify
|
||||
things that aren't declared `mut`.
|
||||
[`RefCell`](https://doc.rust-lang.org/std/cell/struct.RefCell.html) provides an example of this
|
||||
pattern in action:
|
||||
|
||||
```rust
|
||||
use std::cell::RefCell;
|
||||
|
||||
fn my_mutator(cell: &RefCell<u8>) {
|
||||
// Even though we're given an immutable reference,
|
||||
// the `replace` method allows us to modify the inner value.
|
||||
cell.replace(14);
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let cell = RefCell::new(25);
|
||||
// Prints out 25
|
||||
println!("Cell: {:?}", cell);
|
||||
my_mutator(&cell);
|
||||
// Prints out 14
|
||||
println!("Cell: {:?}", cell);
|
||||
}
|
||||
```
|
||||
|
||||
--
|
||||
[Rust Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=8e4bea1a718edaff4507944e825a54b2)
|
||||
|
||||
When `const` is involved though, interior mutability is impossible:
|
||||
|
||||
```rust
|
||||
use std::cell::RefCell;
|
||||
|
||||
const CELL: RefCell<u8> = RefCell::new(25);
|
||||
|
||||
fn my_mutator(cell: &RefCell<u8>) {
|
||||
cell.replace(14);
|
||||
}
|
||||
|
||||
fn main() {
|
||||
// First line prints 25 as expected
|
||||
println!("Cell: {:?}", &CELL);
|
||||
my_mutator(&CELL);
|
||||
// Second line *still* prints 25
|
||||
println!("Cell: {:?}", &CELL);
|
||||
}
|
||||
```
|
||||
|
||||
--
|
||||
[Rust Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=88fe98110c33c1b3a51e341f48b8ae00)
|
||||
|
||||
And a second example using [`Once`](https://doc.rust-lang.org/std/sync/struct.Once.html):
|
||||
|
||||
```rust
|
||||
use std::sync::Once;
|
||||
|
||||
const SURPRISE: Once = Once::new();
|
||||
|
||||
fn main() {
|
||||
// This is how `Once` is supposed to be used
|
||||
SURPRISE.call_once(|| println!("Initializing..."));
|
||||
// Because `Once` is a `const` value, we never record it
|
||||
// having been initialized the first time, and this closure
|
||||
// will also execute.
|
||||
SURPRISE.call_once(|| println!("Initializing again???"));
|
||||
}
|
||||
```
|
||||
|
||||
--
|
||||
[Rust Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=c3cc5979b5e5434eca0f9ec4a06ee0ed)
|
||||
|
||||
When the
|
||||
[`const` specification](https://github.com/rust-lang/rfcs/blob/26197104b7bb9a5a35db243d639aee6e46d35d75/text/0246-const-vs-static.md)
|
||||
refers to ["rvalues"](http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2010/n3055.pdf), this
|
||||
behavior is what they refer to. [Clippy](https://github.com/rust-lang/rust-clippy) will treat this
|
||||
as an error, but it's still something to be aware of.
|
||||
|
||||
## Initialization == Compilation
|
||||
|
||||
The next thing to mention is that `const` values are loaded into memory _as part of your program
|
||||
binary_. Because of this, any `const` values declared in your program will be "realized" at
|
||||
compile-time; accessing them may trigger a main-memory lookup (with a fixed address, so your CPU may
|
||||
be able to prefetch the value), but that's it.
|
||||
|
||||
```rust
|
||||
use std::cell::RefCell;
|
||||
|
||||
const CELL: RefCell<u32> = RefCell::new(24);
|
||||
|
||||
pub fn multiply(value: u32) -> u32 {
|
||||
// CELL is stored at `.L__unnamed_1`
|
||||
value * (*CELL.get_mut())
|
||||
}
|
||||
```
|
||||
|
||||
-- [Compiler Explorer](https://godbolt.org/z/Th8boO)
|
||||
|
||||
The compiler creates one `RefCell`, uses it everywhere, and never needs to call the `RefCell::new`
|
||||
function.
|
||||
|
||||
## Copying
|
||||
|
||||
If it's helpful though, the compiler can choose to copy `const` values.
|
||||
|
||||
```rust
|
||||
const FACTOR: u32 = 1000;
|
||||
|
||||
pub fn multiply(value: u32) -> u32 {
|
||||
// See assembly line 4 for the `mov edi, 1000` instruction
|
||||
value * FACTOR
|
||||
}
|
||||
|
||||
pub fn multiply_twice(value: u32) -> u32 {
|
||||
// See assembly lines 22 and 29 for `mov edi, 1000` instructions
|
||||
value * FACTOR * FACTOR
|
||||
}
|
||||
```
|
||||
|
||||
-- [Compiler Explorer](https://godbolt.org/z/ZtS54X)
|
||||
|
||||
In this example, the `FACTOR` value is turned into the `mov edi, 1000` instruction in both the
|
||||
`multiply` and `multiply_twice` functions; the "1000" value is never "stored" anywhere, as it's
|
||||
small enough to inline into the assembly instructions.
|
||||
|
||||
Finally, getting the address of a `const` value is possible, but not guaranteed to be unique
|
||||
(because the compiler can choose to copy values). I was unable to get non-unique pointers in my
|
||||
testing (even using different crates), but the specifications are clear enough: _don't rely on
|
||||
pointers to `const` values being consistent_. To be frank, caring about locations for `const` values
|
||||
is almost certainly a code smell.
|
||||
|
||||
# **static**
|
||||
|
||||
Static variables are related to `const` variables, but take a slightly different approach. When we
|
||||
declare that a _reference_ is unique for the life of a program, you have a `static` variable
|
||||
(unrelated to the `'static` lifetime). Because of the reference/value distinction with
|
||||
`const`/`static`, static variables behave much more like typical "global" variables.
|
||||
|
||||
But to understand `static`, here's what we'll look at:
|
||||
|
||||
- `static` variables are globally unique locations in memory.
|
||||
- Like `const`, `static` variables are loaded at the same time as your program is read into
|
||||
memory.
|
||||
- All `static` variables must implement the
|
||||
[`Sync`](https://doc.rust-lang.org/std/marker/trait.Sync.html) marker trait.
|
||||
- Interior mutability is safe and acceptable when using `static` variables.
|
||||
|
||||
## Memory Uniqueness
|
||||
|
||||
The single biggest difference between `const` and `static` is the guarantees provided about
|
||||
uniqueness. Where `const` variables may or may not be copied in code, `static` variables are
|
||||
guaranteed to be unique. If we take a previous `const` example and change it to `static`, the
|
||||
difference should be clear:
|
||||
|
||||
```rust
|
||||
static FACTOR: u32 = 1000;
|
||||
|
||||
pub fn multiply(value: u32) -> u32 {
|
||||
// The assembly to `mul dword ptr [rip + example::FACTOR]` is how FACTOR gets used
|
||||
value * FACTOR
|
||||
}
|
||||
|
||||
pub fn multiply_twice(value: u32) -> u32 {
|
||||
// The assembly to `mul dword ptr [rip + example::FACTOR]` is how FACTOR gets used
|
||||
value * FACTOR * FACTOR
|
||||
}
|
||||
```
|
||||
|
||||
-- [Compiler Explorer](https://godbolt.org/z/uxmiRQ)
|
||||
|
||||
Where [previously](#copying) there were plenty of references to multiplying by 1000, the new
|
||||
assembly refers to `FACTOR` as a named memory location instead. No initialization work needs to be
|
||||
done, but the compiler can no longer prove the value never changes during execution.
|
||||
|
||||
## Initialization == Compilation
|
||||
|
||||
Next, let's talk about initialization. The simplest case is initializing static variables with
|
||||
either scalar or struct notation:
|
||||
|
||||
```rust
|
||||
#[derive(Debug)]
|
||||
struct MyStruct {
|
||||
x: u32
|
||||
}
|
||||
|
||||
static MY_STRUCT: MyStruct = MyStruct {
|
||||
// You can even reference other statics
|
||||
// declared later
|
||||
x: MY_VAL
|
||||
};
|
||||
|
||||
static MY_VAL: u32 = 24;
|
||||
|
||||
fn main() {
|
||||
println!("Static MyStruct: {:?}", MY_STRUCT);
|
||||
}
|
||||
```
|
||||
|
||||
--
|
||||
[Rust Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=b538dbc46076f12db047af4f4403ee6e)
|
||||
|
||||
Things can get a bit weirder when using `const fn` though. In most cases, it just works:
|
||||
|
||||
```rust
|
||||
#[derive(Debug)]
|
||||
struct MyStruct {
|
||||
x: u32
|
||||
}
|
||||
|
||||
impl MyStruct {
|
||||
const fn new() -> MyStruct {
|
||||
MyStruct { x: 24 }
|
||||
}
|
||||
}
|
||||
|
||||
static MY_STRUCT: MyStruct = MyStruct::new();
|
||||
|
||||
fn main() {
|
||||
println!("const fn Static MyStruct: {:?}", MY_STRUCT);
|
||||
}
|
||||
```
|
||||
|
||||
--
|
||||
[Rust Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=8c796a6e7fc273c12115091b707b0255)
|
||||
|
||||
However, there's a caveat: you're currently not allowed to use `const fn` to initialize static
|
||||
variables of types that aren't marked `Sync`. For example,
|
||||
[`RefCell::new()`](https://doc.rust-lang.org/std/cell/struct.RefCell.html#method.new) is a
|
||||
`const fn`, but because
|
||||
[`RefCell` isn't `Sync`](https://doc.rust-lang.org/std/cell/struct.RefCell.html#impl-Sync), you'll
|
||||
get an error at compile time:
|
||||
|
||||
```rust
|
||||
use std::cell::RefCell;
|
||||
|
||||
// error[E0277]: `std::cell::RefCell<u8>` cannot be shared between threads safely
|
||||
static MY_LOCK: RefCell<u8> = RefCell::new(0);
|
||||
```
|
||||
|
||||
--
|
||||
[Rust Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=c76ef86e473d07117a1700e21fd45560)
|
||||
|
||||
It's likely that this will
|
||||
[change in the future](https://github.com/rust-lang/rfcs/blob/master/text/0911-const-fn.md) though.
|
||||
|
||||
## **Sync**
|
||||
|
||||
Which leads well to the next point: static variable types must implement the
|
||||
[`Sync` marker](https://doc.rust-lang.org/std/marker/trait.Sync.html). Because they're globally
|
||||
unique, it must be safe for you to access static variables from any thread at any time. Most
|
||||
`struct` definitions automatically implement the `Sync` trait because they contain only elements
|
||||
which themselves implement `Sync` (read more in the
|
||||
[Nomicon](https://doc.rust-lang.org/nomicon/send-and-sync.html)). This is why earlier examples could
|
||||
get away with initializing statics, even though we never included an `impl Sync for MyStruct` in the
|
||||
code. To demonstrate this property, Rust refuses to compile our earlier example if we add a
|
||||
non-`Sync` element to the `struct` definition:
|
||||
|
||||
```rust
|
||||
use std::cell::RefCell;
|
||||
|
||||
struct MyStruct {
|
||||
x: u32,
|
||||
y: RefCell<u8>,
|
||||
}
|
||||
|
||||
// error[E0277]: `std::cell::RefCell<u8>` cannot be shared between threads safely
|
||||
static MY_STRUCT: MyStruct = MyStruct {
|
||||
x: 8,
|
||||
y: RefCell::new(8)
|
||||
};
|
||||
```
|
||||
|
||||
--
|
||||
[Rust Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=40074d0248f056c296b662dbbff97cfc)
|
||||
|
||||
## Interior Mutability
|
||||
|
||||
Finally, while `static mut` variables are allowed, mutating them is an `unsafe` operation. If we
|
||||
want to stay in `safe` Rust, we can use interior mutability to accomplish similar goals:
|
||||
|
||||
```rust
|
||||
use std::sync::Once;
|
||||
|
||||
// This example adapted from https://doc.rust-lang.org/std/sync/struct.Once.html#method.call_once
|
||||
static INIT: Once = Once::new();
|
||||
|
||||
fn main() {
|
||||
// Note that while `INIT` is declared immutable, we're still allowed
|
||||
// to mutate its interior
|
||||
INIT.call_once(|| println!("Initializing..."));
|
||||
// This code won't panic, as the interior of INIT was modified
|
||||
// as part of the previous `call_once`
|
||||
INIT.call_once(|| panic!("INIT was called twice!"));
|
||||
}
|
||||
```
|
||||
|
||||
--
|
||||
[Rust Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=3ba003a981a7ed7400240caadd384d59)
|
@ -1,601 +0,0 @@
|
||||
---
|
||||
layout: post
|
||||
title: "Fixed Memory: Stacking Up"
|
||||
description: "We don't need no allocator."
|
||||
category:
|
||||
tags: [rust, understanding-allocations]
|
||||
---
|
||||
|
||||
`const` and `static` are perfectly fine, but it's relatively rare that we know at compile-time about
|
||||
either values or references that will be the same for the duration of our program. Put another way,
|
||||
it's not often the case that either you or your compiler knows how much memory your entire program
|
||||
will ever need.
|
||||
|
||||
However, there are still some optimizations the compiler can do if it knows how much memory
|
||||
individual functions will need. Specifically, the compiler can make use of "stack" memory (as
|
||||
opposed to "heap" memory) which can be managed far faster in both the short- and long-term. When
|
||||
requesting memory, the [`push` instruction](http://www.cs.virginia.edu/~evans/cs216/guides/x86.html)
|
||||
can typically complete in [1 or 2 cycles](https://agner.org/optimize/instruction_tables.ods) (<1
|
||||
nanosecond on modern CPUs). Contrast that to heap memory which requires an allocator (specialized
|
||||
software to track what memory is in use) to reserve space. When you're finished with stack memory,
|
||||
the `pop` instruction runs in 1-3 cycles, as opposed to an allocator needing to worry about memory
|
||||
fragmentation and other issues with the heap. All sorts of incredibly sophisticated techniques have
|
||||
been used to design allocators:
|
||||
|
||||
- [Garbage Collection](<https://en.wikipedia.org/wiki/Garbage_collection_(computer_science)>)
|
||||
strategies like [Tracing](https://en.wikipedia.org/wiki/Tracing_garbage_collection) (used in
|
||||
[Java](https://www.oracle.com/technetwork/java/javase/tech/g1-intro-jsp-135488.html)) and
|
||||
[Reference counting](https://en.wikipedia.org/wiki/Reference_counting) (used in
|
||||
[Python](https://docs.python.org/3/extending/extending.html#reference-counts))
|
||||
- Thread-local structures to prevent locking the allocator in
|
||||
[tcmalloc](https://jamesgolick.com/2013/5/19/how-tcmalloc-works.html)
|
||||
- Arena structures used in [jemalloc](http://jemalloc.net/), which
|
||||
[until recently](https://blog.rust-lang.org/2019/01/17/Rust-1.32.0.html#jemalloc-is-removed-by-default)
|
||||
was the primary allocator for Rust programs!
|
||||
|
||||
But no matter how fast your allocator is, the principle remains: the fastest allocator is the one
|
||||
you never use. As such, we're not going to discuss how exactly the
|
||||
[`push` and `pop` instructions work](http://www.cs.virginia.edu/~evans/cs216/guides/x86.html), but
|
||||
we'll focus instead on the conditions that enable the Rust compiler to use faster stack-based
|
||||
allocation for variables.
|
||||
|
||||
So, **how do we know when Rust will or will not use stack allocation for objects we create?**
|
||||
Looking at other languages, it's often easy to delineate between stack and heap. Managed memory
|
||||
languages (Python, Java,
|
||||
[C#](https://blogs.msdn.microsoft.com/ericlippert/2010/09/30/the-truth-about-value-types/)) place
|
||||
everything on the heap. JIT compilers ([PyPy](https://www.pypy.org/),
|
||||
[HotSpot](https://www.oracle.com/technetwork/java/javase/tech/index-jsp-136373.html)) may optimize
|
||||
some heap allocations away, but you should never assume it will happen. C makes things clear with
|
||||
calls to special functions (like [malloc(3)](https://linux.die.net/man/3/malloc)) needed to access
|
||||
heap memory. Old C++ has the [`new`](https://stackoverflow.com/a/655086/1454178) keyword, though
|
||||
modern C++/C++11 is more complicated with [RAII](https://en.cppreference.com/w/cpp/language/raii).
|
||||
|
||||
For Rust, we can summarize as follows: **stack allocation will be used for everything that doesn't
|
||||
involve "smart pointers" and collections**. We'll skip over a precise definition of the term "smart
|
||||
pointer" for now, and instead discuss what we should watch for to understand when stack and heap
|
||||
memory regions are used:
|
||||
|
||||
1. Stack manipulation instructions (`push`, `pop`, and `add`/`sub` of the `rsp` register) indicate
|
||||
allocation of stack memory:
|
||||
|
||||
```rust
|
||||
pub fn stack_alloc(x: u32) -> u32 {
|
||||
// Space for `y` is allocated by subtracting from `rsp`,
|
||||
// and then populated
|
||||
let y = [1u8, 2, 3, 4];
|
||||
// Space for `y` is deallocated by adding back to `rsp`
|
||||
x
|
||||
}
|
||||
```
|
||||
|
||||
-- [Compiler Explorer](https://godbolt.org/z/5WSgc9)
|
||||
|
||||
2. Tracking when exactly heap allocation calls occur is difficult. It's typically easier to watch
|
||||
for `call core::ptr::real_drop_in_place`, and infer that a heap allocation happened in the recent
|
||||
past:
|
||||
|
||||
```rust
|
||||
pub fn heap_alloc(x: usize) -> usize {
|
||||
// Space for elements in a vector has to be allocated
|
||||
// on the heap, and is then de-allocated once the
|
||||
// vector goes out of scope
|
||||
let y: Vec<u8> = Vec::with_capacity(x);
|
||||
x
|
||||
}
|
||||
```
|
||||
|
||||
-- [Compiler Explorer](https://godbolt.org/z/epfgoQ) (`real_drop_in_place` happens on line 1317)
|
||||
<span style="font-size: .8em">Note: While the
|
||||
[`Drop` trait](https://doc.rust-lang.org/std/ops/trait.Drop.html) is
|
||||
[called for stack-allocated objects](https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=87edf374d8983816eb3d8cfeac657b46),
|
||||
the Rust standard library only defines `Drop` implementations for types that involve heap
|
||||
allocation.</span>
|
||||
|
||||
3. If you don't want to inspect the assembly, use a custom allocator that's able to track and alert
|
||||
when heap allocations occur. Crates like
|
||||
[`alloc_counter`](https://crates.io/crates/alloc_counter) are designed for exactly this purpose.
|
||||
|
||||
With all that in mind, let's talk about situations in which we're guaranteed to use stack memory:
|
||||
|
||||
- Structs are created on the stack.
|
||||
- Function arguments are passed on the stack, meaning the
|
||||
[`#[inline]` attribute](https://doc.rust-lang.org/reference/attributes.html#inline-attribute) will
|
||||
not change the memory region used.
|
||||
- Enums and unions are stack-allocated.
|
||||
- [Arrays](https://doc.rust-lang.org/std/primitive.array.html) are always stack-allocated.
|
||||
- Closures capture their arguments on the stack.
|
||||
- Generics will use stack allocation, even with dynamic dispatch.
|
||||
- [`Copy`](https://doc.rust-lang.org/std/marker/trait.Copy.html) types are guaranteed to be
|
||||
stack-allocated, and copying them will be done in stack memory.
|
||||
- [`Iterator`s](https://doc.rust-lang.org/std/iter/trait.Iterator.html) in the standard library are
|
||||
stack-allocated even when iterating over heap-based collections.
|
||||
|
||||
# Structs
|
||||
|
||||
The simplest case comes first. When creating vanilla `struct` objects, we use stack memory to hold
|
||||
their contents:
|
||||
|
||||
```rust
|
||||
struct Point {
|
||||
x: u64,
|
||||
y: u64,
|
||||
}
|
||||
|
||||
struct Line {
|
||||
a: Point,
|
||||
b: Point,
|
||||
}
|
||||
|
||||
pub fn make_line() {
|
||||
// `origin` is stored in the first 16 bytes of memory
|
||||
// starting at location `rsp`
|
||||
let origin = Point { x: 0, y: 0 };
|
||||
// `point` makes up the next 16 bytes of memory
|
||||
let point = Point { x: 1, y: 2 };
|
||||
|
||||
// When creating `ray`, we just move the content out of
|
||||
// `origin` and `point` into the next 32 bytes of memory
|
||||
let ray = Line { a: origin, b: point };
|
||||
}
|
||||
```
|
||||
|
||||
-- [Compiler Explorer](https://godbolt.org/z/vri9BE)
|
||||
|
||||
Note that while some extra-fancy instructions are used for memory manipulation in the assembly, the
|
||||
`sub rsp, 64` instruction indicates we're still working with the stack.
|
||||
|
||||
# Function arguments
|
||||
|
||||
Have you ever wondered how functions communicate with each other? Like, once the variables are given
|
||||
to you, everything's fine. But how do you "give" those variables to another function? How do you get
|
||||
the results back afterward? The answer: the compiler arranges memory and assembly instructions using
|
||||
a pre-determined [calling convention](http://llvm.org/docs/LangRef.html#calling-conventions). This
|
||||
convention governs the rules around where arguments needed by a function will be located (either in
|
||||
memory offsets relative to the stack pointer `rsp`, or in other registers), and where the results
|
||||
can be found once the function has finished. And when multiple languages agree on what the calling
|
||||
conventions are, you can do things like having [Go call Rust code](https://blog.filippo.io/rustgo/)!
|
||||
|
||||
Put simply: it's the compiler's job to figure out how to call other functions, and you can assume
|
||||
that the compiler is good at its job.
|
||||
|
||||
We can see this in action using a simple example:
|
||||
|
||||
```rust
|
||||
struct Point {
|
||||
x: i64,
|
||||
y: i64,
|
||||
}
|
||||
|
||||
// We use integer division operations to keep
|
||||
// the assembly clean, understanding the result
|
||||
// isn't accurate.
|
||||
fn distance(a: &Point, b: &Point) -> i64 {
|
||||
// Immediately subtract from `rsp` the bytes needed
|
||||
// to hold all the intermediate results - this is
|
||||
// the stack allocation step
|
||||
|
||||
// The compiler used the `rdi` and `rsi` registers
|
||||
// to pass our arguments, so read them in
|
||||
let x1 = a.x;
|
||||
let x2 = b.x;
|
||||
let y1 = a.y;
|
||||
let y2 = b.y;
|
||||
|
||||
// Do the actual math work
|
||||
let x_pow = (x1 - x2) * (x1 - x2);
|
||||
let y_pow = (y1 - y2) * (y1 - y2);
|
||||
let squared = x_pow + y_pow;
|
||||
squared / squared
|
||||
|
||||
// Our final result will be stored in the `rax` register
|
||||
// so that our caller knows where to retrieve it.
|
||||
// Finally, add back to `rsp` the stack memory that is
|
||||
// now ready to be used by other functions.
|
||||
}
|
||||
|
||||
pub fn total_distance() {
|
||||
let start = Point { x: 1, y: 2 };
|
||||
let middle = Point { x: 3, y: 4 };
|
||||
let end = Point { x: 5, y: 6 };
|
||||
|
||||
let _dist_1 = distance(&start, &middle);
|
||||
let _dist_2 = distance(&middle, &end);
|
||||
}
|
||||
```
|
||||
|
||||
-- [Compiler Explorer](https://godbolt.org/z/Qmx4ST)
|
||||
|
||||
As a consequence of function arguments never using heap memory, we can also infer that functions
|
||||
using the `#[inline]` attribute also do not heap allocate. But better than inferring, we can look
|
||||
at the assembly to prove it:
|
||||
|
||||
```rust
|
||||
struct Point {
|
||||
x: i64,
|
||||
y: i64,
|
||||
}
|
||||
|
||||
// Note that there is no `distance` function in the assembly output,
|
||||
// and the total line count goes from 229 with inlining off
|
||||
// to 306 with inline on. Even still, no heap allocations occur.
|
||||
#[inline(always)]
|
||||
fn distance(a: &Point, b: &Point) -> i64 {
|
||||
let x1 = a.x;
|
||||
let x2 = b.x;
|
||||
let y1 = a.y;
|
||||
let y2 = b.y;
|
||||
|
||||
let x_pow = (a.x - b.x) * (a.x - b.x);
|
||||
let y_pow = (a.y - b.y) * (a.y - b.y);
|
||||
let squared = x_pow + y_pow;
|
||||
squared / squared
|
||||
}
|
||||
|
||||
pub fn total_distance() {
|
||||
let start = Point { x: 1, y: 2 };
|
||||
let middle = Point { x: 3, y: 4 };
|
||||
let end = Point { x: 5, y: 6 };
|
||||
|
||||
let _dist_1 = distance(&start, &middle);
|
||||
let _dist_2 = distance(&middle, &end);
|
||||
}
|
||||
```
|
||||
|
||||
-- [Compiler Explorer](https://godbolt.org/z/30Sh66)
|
||||
|
||||
Finally, passing by value (arguments with type
|
||||
[`Copy`](https://doc.rust-lang.org/std/marker/trait.Copy.html)) and passing by reference (either
|
||||
moving ownership or passing a pointer) may have slightly different layouts in assembly, but will
|
||||
still use either stack memory or CPU registers:
|
||||
|
||||
```rust
|
||||
pub struct Point {
|
||||
x: i64,
|
||||
y: i64,
|
||||
}
|
||||
|
||||
// Moving values
|
||||
pub fn distance_moved(a: Point, b: Point) -> i64 {
|
||||
let x1 = a.x;
|
||||
let x2 = b.x;
|
||||
let y1 = a.y;
|
||||
let y2 = b.y;
|
||||
|
||||
let x_pow = (x1 - x2) * (x1 - x2);
|
||||
let y_pow = (y1 - y2) * (y1 - y2);
|
||||
let squared = x_pow + y_pow;
|
||||
squared / squared
|
||||
}
|
||||
|
||||
// Borrowing values has two extra `mov` instructions on lines 21 and 22
|
||||
pub fn distance_borrowed(a: &Point, b: &Point) -> i64 {
|
||||
let x1 = a.x;
|
||||
let x2 = b.x;
|
||||
let y1 = a.y;
|
||||
let y2 = b.y;
|
||||
|
||||
let x_pow = (x1 - x2) * (x1 - x2);
|
||||
let y_pow = (y1 - y2) * (y1 - y2);
|
||||
let squared = x_pow + y_pow;
|
||||
squared / squared
|
||||
}
|
||||
```
|
||||
|
||||
-- [Compiler Explorer](https://godbolt.org/z/06hGiv)
|
||||
|
||||
# Enums
|
||||
|
||||
If you've ever worried that wrapping your types in
|
||||
[`Option`](https://doc.rust-lang.org/stable/core/option/enum.Option.html) or
|
||||
[`Result`](https://doc.rust-lang.org/stable/core/result/enum.Result.html) would finally make them
|
||||
large enough that Rust decides to use heap allocation instead, fear no longer: `enum` and union
|
||||
types don't use heap allocation:
|
||||
|
||||
```rust
|
||||
enum MyEnum {
|
||||
Small(u8),
|
||||
Large(u64)
|
||||
}
|
||||
|
||||
struct MyStruct {
|
||||
x: MyEnum,
|
||||
y: MyEnum,
|
||||
}
|
||||
|
||||
pub fn enum_compare() {
|
||||
let x = MyEnum::Small(0);
|
||||
let y = MyEnum::Large(0);
|
||||
|
||||
let z = MyStruct { x, y };
|
||||
|
||||
let opt = Option::Some(z);
|
||||
}
|
||||
```
|
||||
|
||||
-- [Compiler Explorer](https://godbolt.org/z/HK7zBx)
|
||||
|
||||
Because the size of an `enum` is the size of its largest element plus a flag, the compiler can
|
||||
predict how much memory is used no matter which variant of an enum is currently stored in a
|
||||
variable. Thus, enums and unions have no need of heap allocation. There's unfortunately not a great
|
||||
way to show this in assembly, so I'll instead point you to the
|
||||
[`core::mem::size_of`](https://doc.rust-lang.org/stable/core/mem/fn.size_of.html#size-of-enums)
|
||||
documentation.
|
||||
|
||||
# Arrays
|
||||
|
||||
The array type is guaranteed to be stack allocated, which is why the array size must be declared.
|
||||
Interestingly enough, this can be used to cause safe Rust programs to crash:
|
||||
|
||||
```rust
|
||||
// 256 bytes
|
||||
#[derive(Default)]
|
||||
struct TwoFiftySix {
|
||||
_a: [u64; 32]
|
||||
}
|
||||
|
||||
// 8 kilobytes
|
||||
#[derive(Default)]
|
||||
struct EightK {
|
||||
_a: [TwoFiftySix; 32]
|
||||
}
|
||||
|
||||
// 256 kilobytes
|
||||
#[derive(Default)]
|
||||
struct TwoFiftySixK {
|
||||
_a: [EightK; 32]
|
||||
}
|
||||
|
||||
// 8 megabytes - exceeds space typically provided for the stack,
|
||||
// though the kernel can be instructed to allocate more.
|
||||
// On Linux, you can check stack size using `ulimit -s`
|
||||
#[derive(Default)]
|
||||
struct EightM {
|
||||
_a: [TwoFiftySixK; 32]
|
||||
}
|
||||
|
||||
fn main() {
|
||||
// Because we already have things in stack memory
|
||||
// (like the current function call stack), allocating another
|
||||
// eight megabytes of stack memory crashes the program
|
||||
let _x = EightM::default();
|
||||
}
|
||||
```
|
||||
|
||||
--
|
||||
[Rust Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=587a6380a4914bcbcef4192c90c01dc4)
|
||||
|
||||
There aren't any security implications of this (no memory corruption occurs), but it's good to note
|
||||
that the Rust compiler won't move arrays into heap memory even if they can be reasonably expected to
|
||||
overflow the stack.
|
||||
|
||||
# Closures
|
||||
|
||||
Rules for how anonymous functions capture their arguments are typically language-specific. In Java,
|
||||
[Lambda Expressions](https://docs.oracle.com/javase/tutorial/java/javaOO/lambdaexpressions.html) are
|
||||
actually objects created on the heap that capture local primitives by copying, and capture local
|
||||
non-primitives as (`final`) references.
|
||||
[Python](https://docs.python.org/3.7/reference/expressions.html#lambda) and
|
||||
[JavaScript](https://javascriptweblog.wordpress.com/2010/10/25/understanding-javascript-closures/)
|
||||
both bind _everything_ by reference normally, but Python can also
|
||||
[capture values](https://stackoverflow.com/a/235764/1454178) and JavaScript has
|
||||
[Arrow functions](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Functions/Arrow_functions).
|
||||
|
||||
In Rust, arguments to closures are the same as arguments to other functions; closures are simply
|
||||
functions that don't have a declared name. Some weird ordering of the stack may be required to
|
||||
handle them, but it's the compiler's responsibility to figure that out.
|
||||
|
||||
Each example below has the same effect, but a different assembly implementation. In the simplest
|
||||
case, we immediately run a closure returned by another function. Because we don't store a reference
|
||||
to the closure, the stack memory needed to store the captured values is contiguous:
|
||||
|
||||
```rust
|
||||
fn my_func() -> impl FnOnce() {
|
||||
let x = 24;
|
||||
// Note that this closure in assembly looks exactly like
|
||||
// any other function; you even use the `call` instruction
|
||||
// to start running it.
|
||||
move || { x; }
|
||||
}
|
||||
|
||||
pub fn immediate() {
|
||||
my_func()();
|
||||
my_func()();
|
||||
}
|
||||
```
|
||||
|
||||
-- [Compiler Explorer](https://godbolt.org/z/mgJ2zl), 25 total assembly instructions
|
||||
|
||||
If we store a reference to the closure, the Rust compiler keeps values it needs in the stack memory
|
||||
of the original function. Getting the details right is a bit harder, so the instruction count goes
|
||||
up even though this code is functionally equivalent to our original example:
|
||||
|
||||
```rust
|
||||
pub fn simple_reference() {
|
||||
let x = my_func();
|
||||
let y = my_func();
|
||||
y();
|
||||
x();
|
||||
}
|
||||
```
|
||||
|
||||
-- [Compiler Explorer](https://godbolt.org/z/K_dj5n), 55 total assembly instructions
|
||||
|
||||
Even things like variable order can make a difference in instruction count:
|
||||
|
||||
```rust
|
||||
pub fn complex() {
|
||||
let x = my_func();
|
||||
let y = my_func();
|
||||
x();
|
||||
y();
|
||||
}
|
||||
```
|
||||
|
||||
-- [Compiler Explorer](https://godbolt.org/z/p37qFl), 70 total assembly instructions
|
||||
|
||||
In every circumstance though, the compiler ensured that no heap allocations were necessary.
|
||||
|
||||
# Generics
|
||||
|
||||
Traits in Rust come in two broad forms: static dispatch (monomorphization, `impl Trait`) and dynamic
|
||||
dispatch (trait objects, `dyn Trait`). While dynamic dispatch is often _associated_ with trait
|
||||
objects being stored in the heap, dynamic dispatch can be used with stack allocated objects as well:
|
||||
|
||||
```rust
|
||||
trait GetInt {
|
||||
fn get_int(&self) -> u64;
|
||||
}
|
||||
|
||||
// vtable stored at section L__unnamed_1
|
||||
struct WhyNotU8 {
|
||||
x: u8
|
||||
}
|
||||
impl GetInt for WhyNotU8 {
|
||||
fn get_int(&self) -> u64 {
|
||||
self.x as u64
|
||||
}
|
||||
}
|
||||
|
||||
// vtable stored at section L__unnamed_2
|
||||
struct ActualU64 {
|
||||
x: u64
|
||||
}
|
||||
impl GetInt for ActualU64 {
|
||||
fn get_int(&self) -> u64 {
|
||||
self.x
|
||||
}
|
||||
}
|
||||
|
||||
// `&dyn` declares that we want to use dynamic dispatch
|
||||
// rather than monomorphization, so there is only one
|
||||
// `retrieve_int` function that shows up in the final assembly.
|
||||
// If we used generics, there would be one implementation of
|
||||
// `retrieve_int` for each type that implements `GetInt`.
|
||||
pub fn retrieve_int(u: &dyn GetInt) {
|
||||
// In the assembly, we just call an address given to us
|
||||
// in the `rsi` register and hope that it was set up
|
||||
// correctly when this function was invoked.
|
||||
let x = u.get_int();
|
||||
}
|
||||
|
||||
pub fn do_call() {
|
||||
// Note that even though the vtable for `WhyNotU8` and
|
||||
// `ActualU64` includes a pointer to
|
||||
// `core::ptr::real_drop_in_place`, it is never invoked.
|
||||
let a = WhyNotU8 { x: 0 };
|
||||
let b = ActualU64 { x: 0 };
|
||||
|
||||
retrieve_int(&a);
|
||||
retrieve_int(&b);
|
||||
}
|
||||
```
|
||||
|
||||
-- [Compiler Explorer](https://godbolt.org/z/u_yguS)
|
||||
|
||||
It's hard to imagine practical situations where dynamic dispatch would be used for objects that
|
||||
aren't heap allocated, but it technically can be done.
|
||||
|
||||
# Copy types
|
||||
|
||||
Understanding move semantics and copy semantics in Rust is weird at first. The Rust docs
|
||||
[go into detail](https://doc.rust-lang.org/stable/core/marker/trait.Copy.html) far better than can
|
||||
be addressed here, so I'll leave them to do the job. From a memory perspective though, their
|
||||
guideline is reasonable:
|
||||
[if your type can implement `Copy`, it should](https://doc.rust-lang.org/stable/core/marker/trait.Copy.html#when-should-my-type-be-copy).
|
||||
While there are potential speed tradeoffs to _benchmark_ when discussing `Copy` (move semantics for
|
||||
stack objects vs. copying stack pointers vs. copying stack `struct`s), _it's impossible for `Copy`
|
||||
to introduce a heap allocation_.
|
||||
|
||||
But why is this the case? Fundamentally, it's because the language controls what `Copy` means -
|
||||
["the behavior of `Copy` is not overloadable"](https://doc.rust-lang.org/std/marker/trait.Copy.html#whats-the-difference-between-copy-and-clone)
|
||||
because it's a marker trait. From there we'll note that a type
|
||||
[can implement `Copy`](https://doc.rust-lang.org/std/marker/trait.Copy.html#when-can-my-type-be-copy)
|
||||
if (and only if) its components implement `Copy`, and that
|
||||
[no heap-allocated types implement `Copy`](https://doc.rust-lang.org/std/marker/trait.Copy.html#implementors).
|
||||
Thus, assignments involving heap types are always move semantics, and new heap allocations won't
|
||||
occur because of implicit operator behavior.
|
||||
|
||||
```rust
|
||||
#[derive(Clone)]
|
||||
struct Cloneable {
|
||||
x: Box<u64>
|
||||
}
|
||||
|
||||
// error[E0204]: the trait `Copy` may not be implemented for this type
|
||||
#[derive(Copy, Clone)]
|
||||
struct NotCopyable {
|
||||
x: Box<u64>
|
||||
}
|
||||
```
|
||||
|
||||
-- [Compiler Explorer](https://godbolt.org/z/VToRuK)
|
||||
|
||||
# Iterators
|
||||
|
||||
In managed memory languages (like
|
||||
[Java](https://www.youtube.com/watch?v=bSkpMdDe4g4&feature=youtu.be&t=357)), there's a subtle
|
||||
difference between these two code samples:
|
||||
|
||||
```java
|
||||
public static int sum_for(List<Long> vals) {
|
||||
long sum = 0;
|
||||
// Regular for loop
|
||||
for (int i = 0; i < vals.length; i++) {
|
||||
sum += vals[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
public static int sum_foreach(List<Long> vals) {
|
||||
long sum = 0;
|
||||
// "Foreach" loop - uses iteration
|
||||
for (Long l : vals) {
|
||||
sum += l;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
```
|
||||
|
||||
In the `sum_for` function, nothing terribly interesting happens. In `sum_foreach`, an object of type
|
||||
[`Iterator`](https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/util/Iterator.html)
|
||||
is allocated on the heap, and will eventually be garbage-collected. This isn't a great design;
|
||||
iterators are often transient objects that you need during a function and can discard once the
|
||||
function ends. Sounds exactly like the issue stack-allocated objects address, no?
|
||||
|
||||
In Rust, iterators are allocated on the stack. The objects to iterate over are almost certainly in
|
||||
heap memory, but the iterator itself
|
||||
([`Iter`](https://doc.rust-lang.org/std/slice/struct.Iter.html)) doesn't need to use the heap. In
|
||||
each of the examples below we iterate over a collection, but never use heap allocation:
|
||||
|
||||
```rust
|
||||
use std::collections::HashMap;
|
||||
// There's a lot of assembly generated, but if you search in the text,
|
||||
// there are no references to `real_drop_in_place` anywhere.
|
||||
|
||||
pub fn sum_vec(x: &Vec<u32>) {
|
||||
let mut s = 0;
|
||||
// Basic iteration over vectors doesn't need allocation
|
||||
for y in x {
|
||||
s += y;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn sum_enumerate(x: &Vec<u32>) {
|
||||
let mut s = 0;
|
||||
// More complex iterators are just fine too
|
||||
for (_i, y) in x.iter().enumerate() {
|
||||
s += y;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn sum_hm(x: &HashMap<u32, u32>) {
|
||||
let mut s = 0;
|
||||
// And it's not just Vec, all types will allocate the iterator
|
||||
// on stack memory
|
||||
for y in x.values() {
|
||||
s += y;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
-- [Compiler Explorer](https://godbolt.org/z/FTT3CT)
|
@ -1,254 +0,0 @@
|
||||
---
|
||||
layout: post
|
||||
title: "Dynamic Memory: A Heaping Helping"
|
||||
description: "The reason Rust exists."
|
||||
category:
|
||||
tags: [rust, understanding-allocations]
|
||||
---
|
||||
|
||||
Managing dynamic memory is hard. Some languages assume users will do it themselves (C, C++), and
|
||||
some languages go to extreme lengths to protect users from themselves (Java, Python). In Rust, how
|
||||
the language uses dynamic memory (also referred to as the **heap**) is a system called _ownership_.
|
||||
And as the docs mention, ownership
|
||||
[is Rust's most unique feature](https://doc.rust-lang.org/book/ch04-00-understanding-ownership.html).
|
||||
|
||||
The heap is used in two situations: when the compiler is unable to predict either the _total size of
|
||||
memory needed_, or _how long the memory is needed for_, it allocates space in the heap. This happens
|
||||
pretty frequently; if you want to download the Google home page, you won't know how large it is
|
||||
until your program runs. And when you're finished with Google, we deallocate the memory so it can be
|
||||
used to store other webpages. If you're interested in a slightly longer explanation of the heap,
|
||||
check out
|
||||
[The Stack and the Heap](https://doc.rust-lang.org/book/ch04-01-what-is-ownership.html#the-stack-and-the-heap)
|
||||
in Rust's documentation.
|
||||
|
||||
We won't go into detail on how the heap is managed; the
|
||||
[ownership documentation](https://doc.rust-lang.org/book/ch04-01-what-is-ownership.html) does a
|
||||
phenomenal job explaining both the "why" and "how" of memory management. Instead, we're going to
|
||||
focus on understanding "when" heap allocations occur in Rust.
|
||||
|
||||
To start off, take a guess for how many allocations happen in the program below:
|
||||
|
||||
```rust
|
||||
fn main() {}
|
||||
```
|
||||
|
||||
It's obviously a trick question; while no heap allocations occur as a result of that code, the setup
|
||||
needed to call `main` does allocate on the heap. Here's a way to show it:
|
||||
|
||||
```rust
|
||||
#![feature(integer_atomics)]
|
||||
use std::alloc::{GlobalAlloc, Layout, System};
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
static ALLOCATION_COUNT: AtomicU64 = AtomicU64::new(0);
|
||||
|
||||
struct CountingAllocator;
|
||||
|
||||
unsafe impl GlobalAlloc for CountingAllocator {
|
||||
unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
|
||||
ALLOCATION_COUNT.fetch_add(1, Ordering::SeqCst);
|
||||
System.alloc(layout)
|
||||
}
|
||||
|
||||
unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
|
||||
System.dealloc(ptr, layout);
|
||||
}
|
||||
}
|
||||
|
||||
#[global_allocator]
|
||||
static A: CountingAllocator = CountingAllocator;
|
||||
|
||||
fn main() {
|
||||
let x = ALLOCATION_COUNT.fetch_add(0, Ordering::SeqCst);
|
||||
println!("There were {} allocations before calling main!", x);
|
||||
}
|
||||
```
|
||||
|
||||
--
|
||||
[Rust Playground](https://play.rust-lang.org/?version=nightly&mode=debug&edition=2018&gist=fb5060025ba79fc0f906b65a4ef8eb8e)
|
||||
|
||||
As of the time of writing, there are five allocations that happen before `main` is ever called.
|
||||
|
||||
But when we want to understand more practically where heap allocation happens, we'll follow this
|
||||
guide:
|
||||
|
||||
- Smart pointers hold their contents in the heap
|
||||
- Collections are smart pointers for many objects at a time, and reallocate when they need to grow
|
||||
|
||||
Finally, there are two "addendum" issues that are important to address when discussing Rust and the
|
||||
heap:
|
||||
|
||||
- Non-heap alternatives to many standard library types are available.
|
||||
- Special allocators to track memory behavior should be used to benchmark code.
|
||||
|
||||
# Smart pointers
|
||||
|
||||
The first things to note are the "smart pointer" types. When you have data that must outlive the
|
||||
scope in which it is declared, or your data is of unknown or dynamic size, you'll make use of these
|
||||
types.
|
||||
|
||||
The term [smart pointer](https://en.wikipedia.org/wiki/Smart_pointer) comes from C++, and while it's
|
||||
closely linked to a general design pattern of
|
||||
["Resource Acquisition Is Initialization"](https://en.cppreference.com/w/cpp/language/raii), we'll
|
||||
use it here specifically to describe objects that are responsible for managing ownership of data
|
||||
allocated on the heap. The smart pointers available in the `alloc` crate should look mostly
|
||||
familiar:
|
||||
|
||||
- [`Box`](https://doc.rust-lang.org/alloc/boxed/struct.Box.html)
|
||||
- [`Rc`](https://doc.rust-lang.org/alloc/rc/struct.Rc.html)
|
||||
- [`Arc`](https://doc.rust-lang.org/alloc/sync/struct.Arc.html)
|
||||
- [`Cow`](https://doc.rust-lang.org/alloc/borrow/enum.Cow.html)
|
||||
|
||||
The [standard library](https://doc.rust-lang.org/std/) also defines some smart pointers to manage
|
||||
heap objects, though more than can be covered here. Some examples are:
|
||||
|
||||
- [`RwLock`](https://doc.rust-lang.org/std/sync/struct.RwLock.html)
|
||||
- [`Mutex`](https://doc.rust-lang.org/std/sync/struct.Mutex.html)
|
||||
|
||||
Finally, there is one ["gotcha"](https://www.merriam-webster.com/dictionary/gotcha): **cell types**
|
||||
(like [`RefCell`](https://doc.rust-lang.org/stable/core/cell/struct.RefCell.html)) look and behave
|
||||
similarly, but **don't involve heap allocation**. The
|
||||
[`core::cell` docs](https://doc.rust-lang.org/stable/core/cell/index.html) have more information.
|
||||
|
||||
When a smart pointer is created, the data it is given is placed in heap memory and the location of
|
||||
that data is recorded in the smart pointer. Once the smart pointer has determined it's safe to
|
||||
deallocate that memory (when a `Box` has
|
||||
[gone out of scope](https://doc.rust-lang.org/stable/std/boxed/index.html) or a reference count
|
||||
[goes to zero](https://doc.rust-lang.org/alloc/rc/index.html)), the heap space is reclaimed. We can
|
||||
prove these types use heap memory by looking at code:
|
||||
|
||||
```rust
|
||||
use std::rc::Rc;
|
||||
use std::sync::Arc;
|
||||
use std::borrow::Cow;
|
||||
|
||||
pub fn my_box() {
|
||||
// Drop at assembly line 1640
|
||||
Box::new(0);
|
||||
}
|
||||
|
||||
pub fn my_rc() {
|
||||
// Drop at assembly line 1650
|
||||
Rc::new(0);
|
||||
}
|
||||
|
||||
pub fn my_arc() {
|
||||
// Drop at assembly line 1660
|
||||
Arc::new(0);
|
||||
}
|
||||
|
||||
pub fn my_cow() {
|
||||
// Drop at assembly line 1672
|
||||
Cow::from("drop");
|
||||
}
|
||||
```
|
||||
|
||||
-- [Compiler Explorer](https://godbolt.org/z/4AMQug)
|
||||
|
||||
# Collections
|
||||
|
||||
Collection types use heap memory because their contents have dynamic size; they will request more
|
||||
memory [when needed](https://doc.rust-lang.org/std/vec/struct.Vec.html#method.reserve), and can
|
||||
[release memory](https://doc.rust-lang.org/std/vec/struct.Vec.html#method.shrink_to_fit) when it's
|
||||
no longer necessary. This dynamic property forces Rust to heap allocate everything they contain. In
|
||||
a way, **collections are smart pointers for many objects at a time**. Common types that fall under
|
||||
this umbrella are [`Vec`](https://doc.rust-lang.org/stable/alloc/vec/struct.Vec.html),
|
||||
[`HashMap`](https://doc.rust-lang.org/stable/std/collections/struct.HashMap.html), and
|
||||
[`String`](https://doc.rust-lang.org/stable/alloc/string/struct.String.html) (not
|
||||
[`str`](https://doc.rust-lang.org/std/primitive.str.html)).
|
||||
|
||||
While collections store the objects they own in heap memory, _creating new collections will not
|
||||
allocate on the heap_. This is a bit weird; if we call `Vec::new()`, the assembly shows a
|
||||
corresponding call to `real_drop_in_place`:
|
||||
|
||||
```rust
|
||||
pub fn my_vec() {
|
||||
// Drop in place at line 481
|
||||
Vec::<u8>::new();
|
||||
}
|
||||
```
|
||||
|
||||
-- [Compiler Explorer](https://godbolt.org/z/1WkNtC)
|
||||
|
||||
But because the vector has no elements to manage, no calls to the allocator will ever be dispatched:
|
||||
|
||||
```rust
|
||||
use std::alloc::{GlobalAlloc, Layout, System};
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
|
||||
fn main() {
|
||||
// Turn on panicking if we allocate on the heap
|
||||
DO_PANIC.store(true, Ordering::SeqCst);
|
||||
|
||||
// Interesting bit happens here
|
||||
let x: Vec<u8> = Vec::new();
|
||||
drop(x);
|
||||
|
||||
// Turn panicking back off, some deallocations occur
|
||||
// after main as well.
|
||||
DO_PANIC.store(false, Ordering::SeqCst);
|
||||
}
|
||||
|
||||
#[global_allocator]
|
||||
static A: PanicAllocator = PanicAllocator;
|
||||
static DO_PANIC: AtomicBool = AtomicBool::new(false);
|
||||
struct PanicAllocator;
|
||||
|
||||
unsafe impl GlobalAlloc for PanicAllocator {
|
||||
unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
|
||||
if DO_PANIC.load(Ordering::SeqCst) {
|
||||
panic!("Unexpected allocation.");
|
||||
}
|
||||
System.alloc(layout)
|
||||
}
|
||||
|
||||
unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
|
||||
if DO_PANIC.load(Ordering::SeqCst) {
|
||||
panic!("Unexpected deallocation.");
|
||||
}
|
||||
System.dealloc(ptr, layout);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
--
|
||||
[Rust Playground](https://play.rust-lang.org/?version=stable&mode=debug&edition=2018&gist=831a297d176d015b1f9ace01ae416cc6)
|
||||
|
||||
Other standard library types follow the same behavior; make sure to check out
|
||||
[`HashMap::new()`](https://doc.rust-lang.org/std/collections/hash_map/struct.HashMap.html#method.new),
|
||||
and [`String::new()`](https://doc.rust-lang.org/std/string/struct.String.html#method.new).
|
||||
|
||||
# Heap Alternatives
|
||||
|
||||
While it is a bit strange to speak of the stack after spending time with the heap, it's worth
|
||||
pointing out that some heap-allocated objects in Rust have stack-based counterparts provided by
|
||||
other crates. If you have need of the functionality, but want to avoid allocating, there are
|
||||
typically alternatives available.
|
||||
|
||||
When it comes to some standard library smart pointers
|
||||
([`RwLock`](https://doc.rust-lang.org/std/sync/struct.RwLock.html) and
|
||||
[`Mutex`](https://doc.rust-lang.org/std/sync/struct.Mutex.html)), stack-based alternatives are
|
||||
provided in crates like [parking_lot](https://crates.io/crates/parking_lot) and
|
||||
[spin](https://crates.io/crates/spin). You can check out
|
||||
[`lock_api::RwLock`](https://docs.rs/lock_api/0.1.5/lock_api/struct.RwLock.html),
|
||||
[`lock_api::Mutex`](https://docs.rs/lock_api/0.1.5/lock_api/struct.Mutex.html), and
|
||||
[`spin::Once`](https://mvdnes.github.io/rust-docs/spin-rs/spin/struct.Once.html) if you're in need
|
||||
of synchronization primitives.
|
||||
|
||||
[thread_id](https://crates.io/crates/thread-id) may be necessary if you're implementing an allocator
|
||||
because [`thread::current().id()`](https://doc.rust-lang.org/std/thread/struct.ThreadId.html) uses a
|
||||
[`thread_local!` structure](https://doc.rust-lang.org/stable/src/std/sys_common/thread_info.rs.html#17-36)
|
||||
that needs heap allocation.
|
||||
|
||||
# Tracing Allocators
|
||||
|
||||
When writing performance-sensitive code, there's no alternative to measuring your code. If you
|
||||
didn't write a benchmark,
|
||||
[you don't care about its performance](https://www.youtube.com/watch?v=2EWejmkKlxs&feature=youtu.be&t=263).
|
||||
You should never rely on your instincts when
|
||||
[a microsecond is an eternity](https://www.youtube.com/watch?v=NH1Tta7purM).
|
||||
|
||||
Similarly, there's great work going on in Rust with allocators that keep track of what they're doing
|
||||
(like [`alloc_counter`](https://crates.io/crates/alloc_counter)). When it comes to tracking heap
|
||||
behavior, it's easy to make mistakes; please write tests and make sure you have tools to guard
|
||||
against future issues.
|
@ -1,148 +0,0 @@
|
||||
---
|
||||
layout: post
|
||||
title: "Compiler Optimizations: What It's Done Lately"
|
||||
description: "A lot. The answer is a lot."
|
||||
category:
|
||||
tags: [rust, understanding-allocations]
|
||||
---
|
||||
|
||||
**Update 2019-02-10**: When debugging a
|
||||
[related issue](https://gitlab.com/sio4/code/alloc-counter/issues/1), it was discovered that the
|
||||
original code worked because LLVM optimized out the entire function, rather than just the allocation
|
||||
segments. The code has been updated with proper use of
|
||||
[`read_volatile`](https://doc.rust-lang.org/std/ptr/fn.read_volatile.html), and a previous section
|
||||
on vector capacity has been removed.
|
||||
|
||||
---
|
||||
|
||||
Up to this point, we've been discussing memory usage in the Rust language by focusing on simple
|
||||
rules that are mostly right for small chunks of code. We've spent time showing how those rules work
|
||||
themselves out in practice, and become familiar with reading the assembly code needed to see each
|
||||
memory type (global, stack, heap) in action.
|
||||
|
||||
Throughout the series so far, we've put a handicap on the code. In the name of consistent and
|
||||
understandable results, we've asked the compiler to pretty please leave the training wheels on. Now
|
||||
is the time where we throw out all the rules and take off the kid gloves. As it turns out, both the
|
||||
Rust compiler and the LLVM optimizers are incredibly sophisticated, and we'll step back and let them
|
||||
do their job.
|
||||
|
||||
Similar to
|
||||
["What Has My Compiler Done For Me Lately?"](https://www.youtube.com/watch?v=bSkpMdDe4g4), we're
|
||||
focusing on interesting things the Rust language (and LLVM!) can do with memory management. We'll
|
||||
still be looking at assembly code to understand what's going on, but it's important to mention
|
||||
again: **please use automated tools like [alloc-counter](https://crates.io/crates/alloc_counter) to
|
||||
double-check memory behavior if it's something you care about**. It's far too easy to misread
|
||||
assembly in large code sections; you should always verify behavior if you care about memory usage.
|
||||
|
||||
The guiding principle as we move forward is this: _optimizing compilers won't produce worse programs
|
||||
than we started with._ There won't be any situations where stack allocations get moved to heap
|
||||
allocations. There will, however, be an opera of optimization.
|
||||
|
||||
# The Case of the Disappearing Box
|
||||
|
||||
Our first optimization comes when LLVM can reason that the lifetime of an object is sufficiently
|
||||
short that heap allocations aren't necessary. In these cases, LLVM will move the allocation to the
|
||||
stack instead! The way this interacts with `#[inline]` attributes is a bit opaque, but the important
|
||||
part is that LLVM can sometimes do better than the baseline Rust language:
|
||||
|
||||
```rust
|
||||
use std::alloc::{GlobalAlloc, Layout, System};
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
|
||||
pub fn cmp(x: u32) {
|
||||
// Turn on panicking if we allocate on the heap
|
||||
DO_PANIC.store(true, Ordering::SeqCst);
|
||||
|
||||
// The compiler is able to see through the constant `Box`
|
||||
// and directly compare `x` to 24 - assembly line 73
|
||||
let y = Box::new(24);
|
||||
let equals = x == *y;
|
||||
|
||||
// This call to drop is eliminated
|
||||
drop(y);
|
||||
|
||||
// Need to mark the comparison result as volatile so that
|
||||
// LLVM doesn't strip out all the code. If `y` is marked
|
||||
// volatile instead, allocation will be forced.
|
||||
unsafe { std::ptr::read_volatile(&equals) };
|
||||
|
||||
// Turn off panicking, as there are some deallocations
|
||||
// when we exit main.
|
||||
DO_PANIC.store(false, Ordering::SeqCst);
|
||||
}
|
||||
|
||||
fn main() {
|
||||
cmp(12)
|
||||
}
|
||||
|
||||
#[global_allocator]
|
||||
static A: PanicAllocator = PanicAllocator;
|
||||
static DO_PANIC: AtomicBool = AtomicBool::new(false);
|
||||
struct PanicAllocator;
|
||||
|
||||
unsafe impl GlobalAlloc for PanicAllocator {
|
||||
unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
|
||||
if DO_PANIC.load(Ordering::SeqCst) {
|
||||
panic!("Unexpected allocation.");
|
||||
}
|
||||
System.alloc(layout)
|
||||
}
|
||||
|
||||
unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
|
||||
if DO_PANIC.load(Ordering::SeqCst) {
|
||||
panic!("Unexpected deallocation.");
|
||||
}
|
||||
System.dealloc(ptr, layout);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## -- [Compiler Explorer](https://godbolt.org/z/BZ_Yp3)
|
||||
|
||||
[Rust Playground](https://play.rust-lang.org/?version=stable&mode=release&edition=2018&gist=4a765f753183d5b919f62c71d2109d5d)
|
||||
|
||||
# Dr. Array or: How I Learned to Love the Optimizer
|
||||
|
||||
Finally, this isn't so much about LLVM figuring out different memory behavior, but LLVM stripping
|
||||
out code that doesn't do anything. Optimizations of this type have a lot of nuance to them; if
|
||||
you're not careful, they can make your benchmarks look
|
||||
[impossibly good](https://www.youtube.com/watch?v=nXaxk27zwlk&feature=youtu.be&t=1199). In Rust, the
|
||||
`black_box` function (implemented in both
|
||||
[`libtest`](https://doc.rust-lang.org/1.1.0/test/fn.black_box.html) and
|
||||
[`criterion`](https://docs.rs/criterion/0.2.10/criterion/fn.black_box.html)) will tell the compiler
|
||||
to disable this kind of optimization. But if you let LLVM remove unnecessary code, you can end up
|
||||
running programs that previously caused errors:
|
||||
|
||||
```rust
|
||||
#[derive(Default)]
|
||||
struct TwoFiftySix {
|
||||
_a: [u64; 32]
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct EightK {
|
||||
_a: [TwoFiftySix; 32]
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct TwoFiftySixK {
|
||||
_a: [EightK; 32]
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct EightM {
|
||||
_a: [TwoFiftySixK; 32]
|
||||
}
|
||||
|
||||
pub fn main() {
|
||||
// Normally this blows up because we can't reserve size on stack
|
||||
// for the `EightM` struct. But because the compiler notices we
|
||||
// never do anything with `_x`, it optimizes out the stack storage
|
||||
// and the program completes successfully.
|
||||
let _x = EightM::default();
|
||||
}
|
||||
```
|
||||
|
||||
## -- [Compiler Explorer](https://godbolt.org/z/daHn7P)
|
||||
|
||||
[Rust Playground](https://play.rust-lang.org/?version=stable&mode=release&edition=2018&gist=4c253bf26072119896ab93c6ef064dc0)
|
@ -1,35 +0,0 @@
|
||||
---
|
||||
layout: post
|
||||
title: "Summary: What are the Allocation Rules?"
|
||||
description: "A synopsis and reference."
|
||||
category:
|
||||
tags: [rust, understanding-allocations]
|
||||
---
|
||||
|
||||
While there's a lot of interesting detail captured in this series, it's often helpful to have a
|
||||
document that answers some "yes/no" questions. You may not care about what an `Iterator` looks like
|
||||
in assembly; you just need to know whether it allocates an object on the heap or not. And while Rust
|
||||
will prioritize the fastest behavior it can, here are the rules for each memory type:
|
||||
|
||||
**Heap Allocation**:
|
||||
|
||||
- Smart pointers (`Box`, `Rc`, `Mutex`, etc.) allocate their contents in heap memory.
|
||||
- Collections (`HashMap`, `Vec`, `String`, etc.) allocate their contents in heap memory.
|
||||
- Some smart pointers in the standard library have counterparts in other crates that don't need heap
|
||||
memory. If possible, use those.
|
||||
|
||||
**Stack Allocation**:
|
||||
|
||||
- Everything not using a smart pointer will be allocated on the stack.
|
||||
- Structs, enums, iterators, arrays, and closures are all stack allocated.
|
||||
- Cell types (`RefCell`) behave like smart pointers, but are stack-allocated.
|
||||
- Inlining (`#[inline]`) will not affect allocation behavior for better or worse.
|
||||
- Types that are marked `Copy` are guaranteed to have their contents stack-allocated.
|
||||
|
||||
**Global Allocation**:
|
||||
|
||||
- `const` is a fixed value; the compiler is allowed to copy it wherever useful.
|
||||
- `static` is a fixed reference; the compiler will guarantee it is unique.
|
||||
|
||||
![Container Sizes in Rust](/assets/images/2019-02-04-container-size.svg) --
|
||||
[Raph Levien](https://docs.google.com/presentation/d/1q-c7UAyrUlM-eZyTo1pd8SZ0qwA_wYxmPZVOQkoDmH4/edit?usp=sharing)
|
@ -1,52 +0,0 @@
|
||||
---
|
||||
layout: post
|
||||
title: "Making Bread"
|
||||
description: "...because I've got some free time now. 🍞"
|
||||
category:
|
||||
tags: [baking]
|
||||
---
|
||||
|
||||
Having recently started my "gardening leave" between positions, I have some more personal time
|
||||
available. I'm planning to stay productive, contributing to some open-source projects, but it also
|
||||
occurred to me that despite [talking about](https://speice.io/2018/05/hello.html) bread pics, this
|
||||
blog has been purely technical. Maybe I'll change the site title from "The Old Speice Guy" to "Bites
|
||||
and Bytes"?
|
||||
|
||||
Either way, I'm baking a little bit again, and figured it was worth taking a quick break to focus on
|
||||
some lighter material. I recently learned two critically important lessons: first, the temperature
|
||||
of the dough when you put the yeast in makes a huge difference.
|
||||
|
||||
Previously, when I wasn't paying attention to dough temperature:
|
||||
|
||||
![Whole wheat dough](/assets/images/2019-05-03-making-bread/whole-wheat-not-rising.jpg)
|
||||
|
||||
Compared with what happens when I put the dough in the microwave for a defrost cycle because the
|
||||
water I used wasn't warm enough:
|
||||
|
||||
![White dough](/assets/images/2019-05-03-making-bread/white-dough-rising-before-fold.jpg)
|
||||
|
||||
I mean, just look at the bubbles!
|
||||
|
||||
![White dough with bubbles](/assets/images/2019-05-03-making-bread/white-dough-rising-after-fold.jpg)
|
||||
|
||||
After shaping the dough, I've got two loaves ready:
|
||||
|
||||
![Shaped loaves](/assets/images/2019-05-03-making-bread/shaped-loaves.jpg)
|
||||
|
||||
Now, the recipe normally calls for a Dutch Oven to bake the bread because it keeps the dough from
|
||||
drying out in the oven. Because I don't own a Dutch Oven, I typically put a casserole dish on the
|
||||
bottom rack and fill it with water so there's still some moisture in the oven. This time, I forgot
|
||||
to add the water and learned my second lesson: never add room-temperature water to a glass dish
|
||||
that's currently at 500 degrees.
|
||||
|
||||
![Shattered glass dish](/assets/images/2019-05-03-making-bread/shattered-glass.jpg)
|
||||
|
||||
Needless to say, trying to pull out sharp glass from an incredibly hot oven is not what I expected
|
||||
to be doing during my gardening leave.
|
||||
|
||||
In the end, the bread crust wasn't great, but the bread itself turned out pretty alright:
|
||||
|
||||
![Baked bread](/assets/images/2019-05-03-making-bread/final-product.jpg)
|
||||
|
||||
I've been writing a lot more during this break, so I'm looking forward to sharing that in the
|
||||
future. In the meantime, I'm planning on making a sandwich.
|
@ -1,296 +0,0 @@
|
||||
---
|
||||
layout: post
|
||||
title: "On Building High Performance Systems"
|
||||
description: ""
|
||||
category:
|
||||
tags: []
|
||||
---
|
||||
|
||||
**Update 2019-09-21**: Added notes on `isolcpus` and `systemd` affinity.
|
||||
|
||||
Prior to working in the trading industry, my assumption was that High Frequency Trading (HFT) is
|
||||
made up of people who have access to secret techniques mortal developers could only dream of. There
|
||||
had to be some secret art that could only be learned if one had an appropriately tragic backstory:
|
||||
|
||||
<img src="/assets/images/2019-04-24-kung-fu.webp" alt="kung-fu fight">
|
||||
> How I assumed HFT people learn their secret techniques
|
||||
|
||||
How else do you explain people working on systems that complete the round trip of market data in to
|
||||
orders out (a.k.a. tick-to-trade) consistently within
|
||||
[750-800 nanoseconds](https://stackoverflow.com/a/22082528/1454178)? In roughly the time it takes a
|
||||
computer to access
|
||||
[main memory 8 times](https://people.eecs.berkeley.edu/~rcs/research/interactive_latency.html),
|
||||
trading systems are capable of reading the market data packets, deciding what orders to send, doing
|
||||
risk checks, creating new packets for exchange-specific protocols, and putting those packets on the
|
||||
wire.
|
||||
|
||||
Having now worked in the trading industry, I can confirm the developers aren't super-human; I've
|
||||
made some simple mistakes at the very least. Instead, what shows up in public discussions is that
|
||||
philosophy, not technique, separates high-performance systems from everything else.
|
||||
Performance-critical systems don't rely on "this one cool C++ optimization trick" to make code fast
|
||||
(though micro-optimizations have their place); there's a lot more to worry about than just the code
|
||||
written for the project.
|
||||
|
||||
The framework I'd propose is this: **If you want to build high-performance systems, focus first on
|
||||
reducing performance variance** (reducing the gap between the fastest and slowest runs of the same
|
||||
code), **and only look at average latency once variance is at an acceptable level**.
|
||||
|
||||
Don't get me wrong, I'm a much happier person when things are fast. Computer goes from booting in 20
|
||||
seconds down to 10 because I installed a solid-state drive? Awesome. But if every fifth day it takes
|
||||
a full minute to boot because of corrupted sectors? Not so great. Average speed over the course of a
|
||||
week is the same in each situation, but you're painfully aware of that minute when it happens. When
|
||||
it comes to code, the principle is the same: speeding up a function by an average of 10 milliseconds
|
||||
doesn't mean much if there's a 100ms difference between your fastest and slowest runs. When
|
||||
performance matters, you need to respond quickly _every time_, not just in aggregate.
|
||||
High-performance systems should first optimize for time variance. Once you're consistent at the time
|
||||
scale you care about, then focus on improving average time.
|
||||
|
||||
This focus on variance shows up all the time in industry too (emphasis added in all quotes below):
|
||||
|
||||
- In [marketing materials](https://business.nasdaq.com/market-tech/marketplaces/trading) for
|
||||
NASDAQ's matching engine, the most performance-sensitive component of the exchange, dependability
|
||||
is highlighted in addition to instantaneous metrics:
|
||||
|
||||
> Able to **consistently sustain** an order rate of over 100,000 orders per second at sub-40
|
||||
> microsecond average latency
|
||||
|
||||
- The [Aeron](https://github.com/real-logic/aeron) message bus has this to say about performance:
|
||||
|
||||
> Performance is the key focus. Aeron is designed to be the highest throughput with the lowest and
|
||||
> **most predictable latency possible** of any messaging system
|
||||
|
||||
- The company PolySync, which is working on autonomous vehicles,
|
||||
[mentions why](https://polysync.io/blog/session-types-for-hearty-codecs/) they picked their
|
||||
specific messaging format:
|
||||
|
||||
> In general, high performance is almost always desirable for serialization. But in the world of
|
||||
> autonomous vehicles, **steady timing performance is even more important** than peak throughput.
|
||||
> This is because safe operation is sensitive to timing outliers. Nobody wants the system that
|
||||
> decides when to slam on the brakes to occasionally take 100 times longer than usual to encode
|
||||
> its commands.
|
||||
|
||||
- [Solarflare](https://solarflare.com/), which makes highly-specialized network hardware, points out
|
||||
variance (jitter) as a big concern for
|
||||
[electronic trading](https://solarflare.com/electronic-trading/):
|
||||
> The high stakes world of electronic trading, investment banks, market makers, hedge funds and
|
||||
> exchanges demand the **lowest possible latency and jitter** while utilizing the highest
|
||||
> bandwidth and return on their investment.
|
||||
|
||||
And to further clarify: we're not discussing _total run-time_, but variance of total run-time. There
|
||||
are situations where it's not reasonably possible to make things faster, and you'd much rather be
|
||||
consistent. For example, trading firms use
|
||||
[wireless networks](https://sniperinmahwah.wordpress.com/2017/06/07/network-effects-part-i/) because
|
||||
the speed of light through air is faster than through fiber-optic cables. There's still at _absolute
|
||||
minimum_ a [~33.76 millisecond](http://tinyurl.com/y2vd7tn8) delay required to send data between,
|
||||
say,
|
||||
[Chicago and Tokyo](https://www.theice.com/market-data/connectivity-and-feeds/wireless/tokyo-chicago).
|
||||
If a trading system in Chicago calls the function for "send order to Tokyo" and waits to see if a
|
||||
trade occurs, there's a physical limit to how long that will take. In this situation, the focus is
|
||||
on keeping variance of _additional processing_ to a minimum, since speed of light is the limiting
|
||||
factor.
|
||||
|
||||
So how does one go about looking for and eliminating performance variance? To tell the truth, I
|
||||
don't think a systematic answer or flow-chart exists. There's no substitute for (A) building a deep
|
||||
understanding of the entire technology stack, and (B) actually measuring system performance (though
|
||||
(C) watching a lot of [CppCon](https://www.youtube.com/channel/UCMlGfpWw-RUdWX_JbLCukXg) videos for
|
||||
inspiration never hurt). Even then, every project cares about performance to a different degree; you
|
||||
may need to build an entire
|
||||
[replica production system](https://www.youtube.com/watch?v=NH1Tta7purM&feature=youtu.be&t=3015) to
|
||||
accurately benchmark at nanosecond precision, or you may be content to simply
|
||||
[avoid garbage collection](https://www.youtube.com/watch?v=BD9cRbxWQx8&feature=youtu.be&t=1335) in
|
||||
your Java code.
|
||||
|
||||
Even though everyone has different needs, there are still common things to look for when trying to
|
||||
isolate and eliminate variance. In no particular order, these are my focus areas when thinking about
|
||||
high-performance systems:
|
||||
|
||||
## Language-specific
|
||||
|
||||
**Garbage Collection**: How often does garbage collection happen? When is it triggered? What are the
|
||||
impacts?
|
||||
|
||||
- [In Python](https://rushter.com/blog/python-garbage-collector/), individual objects are collected
|
||||
if the reference count reaches 0, and each generation is collected if
|
||||
`num_alloc - num_dealloc > gc_threshold` whenever an allocation happens. The GIL is acquired for
|
||||
the duration of generational collection.
|
||||
- Java has
|
||||
[many](https://docs.oracle.com/en/java/javase/12/gctuning/parallel-collector1.html#GUID-DCDD6E46-0406-41D1-AB49-FB96A50EB9CE)
|
||||
[different](https://docs.oracle.com/en/java/javase/12/gctuning/garbage-first-garbage-collector.html#GUID-ED3AB6D3-FD9B-4447-9EDF-983ED2F7A573)
|
||||
[collection](https://docs.oracle.com/en/java/javase/12/gctuning/garbage-first-garbage-collector-tuning.html#GUID-90E30ACA-8040-432E-B3A0-1E0440AB556A)
|
||||
[algorithms](https://docs.oracle.com/en/java/javase/12/gctuning/z-garbage-collector1.html#GUID-A5A42691-095E-47BA-B6DC-FB4E5FAA43D0)
|
||||
to choose from, each with different characteristics. The default algorithms (Parallel GC in Java
|
||||
8, G1 in Java 9) freeze the JVM while collecting, while more recent algorithms
|
||||
([ZGC](https://wiki.openjdk.java.net/display/zgc) and
|
||||
[Shenandoah](https://wiki.openjdk.java.net/display/shenandoah)) are designed to keep "stop the
|
||||
world" to a minimum by doing collection work in parallel.
|
||||
|
||||
**Allocation**: Every language has a different way of interacting with "heap" memory, but the
|
||||
principle is the same: running the allocator to allocate/deallocate memory takes time that can often
|
||||
be put to better use. Understanding when your language interacts with the allocator is crucial, and
|
||||
not always obvious. For example: C++ and Rust don't allocate heap memory for iterators, but Java
|
||||
does (meaning potential GC pauses). Take time to understand heap behavior (I made
|
||||
[a guide for Rust](/2019/02/understanding-allocations-in-rust.html)), and look into alternative
|
||||
allocators ([jemalloc](http://jemalloc.net/),
|
||||
[tcmalloc](https://gperftools.github.io/gperftools/tcmalloc.html)) that might run faster than the
|
||||
operating system default.
|
||||
|
||||
**Data Layout**: How your data is arranged in memory matters;
|
||||
[data-oriented design](https://www.youtube.com/watch?v=yy8jQgmhbAU) and
|
||||
[cache locality](https://www.youtube.com/watch?v=2EWejmkKlxs&feature=youtu.be&t=1185) can have huge
|
||||
impacts on performance. The C family of languages (C, value types in C#, C++) and Rust all have
|
||||
guarantees about the shape every object takes in memory that others (e.g. Java and Python) can't
|
||||
make. [Cachegrind](http://valgrind.org/docs/manual/cg-manual.html) and kernel
|
||||
[perf](https://perf.wiki.kernel.org/index.php/Main_Page) counters are both great for understanding
|
||||
how performance relates to memory layout.
|
||||
|
||||
**Just-In-Time Compilation**: Languages that are compiled on the fly (LuaJIT, C#, Java, PyPy) are
|
||||
great because they optimize your program for how it's actually being used, rather than how a
|
||||
compiler expects it to be used. However, there's a variance problem if the program stops executing
|
||||
while waiting for translation from VM bytecode to native code. As a remedy, many languages support
|
||||
ahead-of-time compilation in addition to the JIT versions
|
||||
([CoreRT](https://github.com/dotnet/corert) in C# and [GraalVM](https://www.graalvm.org/) in Java).
|
||||
On the other hand, LLVM supports
|
||||
[Profile Guided Optimization](https://clang.llvm.org/docs/UsersManual.html#profile-guided-optimization),
|
||||
which theoretically brings JIT benefits to non-JIT languages. Finally, be careful to avoid comparing
|
||||
apples and oranges during benchmarks; you don't want your code to suddenly speed up because the JIT
|
||||
compiler kicked in.
|
||||
|
||||
**Programming Tricks**: These won't make or break performance, but can be useful in specific
|
||||
circumstances. For example, C++ can use
|
||||
[templates instead of branches](https://www.youtube.com/watch?v=NH1Tta7purM&feature=youtu.be&t=1206)
|
||||
in critical sections.
|
||||
|
||||
## Kernel
|
||||
|
||||
Code you wrote is almost certainly not the _only_ code running on your hardware. There are many ways
|
||||
the operating system interacts with your program, from interrupts to system calls, that are
|
||||
important to watch for. These are written from a Linux perspective, but Windows does typically have
|
||||
equivalent functionality.
|
||||
|
||||
**Scheduling**: The kernel is normally free to schedule any process on any core, so it's important
|
||||
to reserve CPU cores exclusively for the important programs. There are a few parts to this: first,
|
||||
limit the CPU cores that non-critical processes are allowed to run on by excluding cores from
|
||||
scheduling
|
||||
([`isolcpus`](https://www.linuxtopia.org/online_books/linux_kernel/kernel_configuration/re46.html)
|
||||
kernel command-line option), or by setting the `init` process CPU affinity
|
||||
([`systemd` example](https://access.redhat.com/solutions/2884991)). Second, set critical processes
|
||||
to run on the isolated cores by setting the
|
||||
[processor affinity](https://en.wikipedia.org/wiki/Processor_affinity) using
|
||||
[taskset](https://linux.die.net/man/1/taskset). Finally, use
|
||||
[`NO_HZ`](https://github.com/torvalds/linux/blob/master/Documentation/timers/NO_HZ.txt) or
|
||||
[`chrt`](https://linux.die.net/man/1/chrt) to disable scheduling interrupts. Turning off
|
||||
hyper-threading is also likely beneficial.
|
||||
|
||||
**System calls**: Reading from a UNIX socket? Writing to a file? In addition to not knowing how long
|
||||
the I/O operation takes, these all trigger expensive
|
||||
[system calls (syscalls)](https://en.wikipedia.org/wiki/System_call). To handle these, the CPU must
|
||||
[context switch](https://en.wikipedia.org/wiki/Context_switch) to the kernel, let the kernel
|
||||
operation complete, then context switch back to your program. We'd rather keep these
|
||||
[to a minimum](https://www.destroyallsoftware.com/talks/the-birth-and-death-of-javascript) (see
|
||||
timestamp 18:20). [Strace](https://linux.die.net/man/1/strace) is your friend for understanding when
|
||||
and where syscalls happen.
|
||||
|
||||
**Signal Handling**: Far less likely to be an issue, but signals do trigger a context switch if your
|
||||
code has a handler registered. This will be highly dependent on the application, but you can
|
||||
[block signals](https://www.linuxprogrammingblog.com/all-about-linux-signals?page=show#Blocking_signals)
|
||||
if it's an issue.
|
||||
|
||||
**Interrupts**: System interrupts are how devices connected to your computer notify the CPU that
|
||||
something has happened. The CPU will then choose a processor core to pause and context switch to the
|
||||
OS to handle the interrupt. Make sure that
|
||||
[SMP affinity](http://www.alexonlinux.com/smp-affinity-and-proper-interrupt-handling-in-linux) is
|
||||
set so that interrupts are handled on a CPU core not running the program you care about.
|
||||
|
||||
**[NUMA](https://www.kernel.org/doc/html/latest/vm/numa.html)**: While NUMA is good at making
|
||||
multi-cell systems transparent, there are variance implications; if the kernel moves a process
|
||||
across nodes, future memory accesses must wait for the controller on the original node. Use
|
||||
[numactl](https://linux.die.net/man/8/numactl) to handle memory-/cpu-cell pinning so this doesn't
|
||||
happen.
|
||||
|
||||
## Hardware
|
||||
|
||||
**CPU Pipelining/Speculation**: Speculative execution in modern processors gave us vulnerabilities
|
||||
like Spectre, but it also gave us performance improvements like
|
||||
[branch prediction](https://stackoverflow.com/a/11227902/1454178). And if the CPU mis-speculates
|
||||
your code, there's variance associated with rewind and replay. While the compiler knows a lot about
|
||||
how your CPU [pipelines instructions](https://youtu.be/nAbCKa0FzjQ?t=4467), code can be
|
||||
[structured to help](https://www.youtube.com/watch?v=NH1Tta7purM&feature=youtu.be&t=755) the branch
|
||||
predictor.
|
||||
|
||||
**Paging**: For most systems, virtual memory is incredible. Applications live in their own worlds,
|
||||
and the CPU/[MMU](https://en.wikipedia.org/wiki/Memory_management_unit) figures out the details.
|
||||
However, there's a variance penalty associated with memory paging and caching; if you access more
|
||||
memory pages than the [TLB](https://en.wikipedia.org/wiki/Translation_lookaside_buffer) can store,
|
||||
you'll have to wait for the page walk. Kernel perf tools are necessary to figure out if this is an
|
||||
issue, but using [huge pages](https://blog.pythian.com/performance-tuning-hugepages-in-linux/) can
|
||||
reduce TLB burdens. Alternately, running applications in a hypervisor like
|
||||
[Jailhouse](https://github.com/siemens/jailhouse) allows one to skip virtual memory entirely, but
|
||||
this is probably more work than the benefits are worth.
|
||||
|
||||
**Network Interfaces**: When more than one computer is involved, variance can go up dramatically.
|
||||
Tuning kernel
|
||||
[network parameters](https://github.com/leandromoreira/linux-network-performance-parameters) may be
|
||||
helpful, but modern systems more frequently opt to skip the kernel altogether with a technique
|
||||
called [kernel bypass](https://blog.cloudflare.com/kernel-bypass/). This typically requires
|
||||
specialized hardware and [drivers](https://www.openonload.org/), but even industries like
|
||||
[telecom](https://www.bbc.co.uk/rd/blog/2018-04-high-speed-networking-open-source-kernel-bypass) are
|
||||
finding the benefits.
|
||||
|
||||
## Networks
|
||||
|
||||
**Routing**: There's a reason financial firms are willing to pay
|
||||
[millions of euros](https://sniperinmahwah.wordpress.com/2019/03/26/4-les-moeres-english-version/)
|
||||
for rights to a small plot of land - having a straight-line connection from point A to point B means
|
||||
the path their data takes is the shortest possible. In contrast, there are currently 6 computers in
|
||||
between me and Google, but that may change at any moment if my ISP realizes a
|
||||
[more efficient route](https://en.wikipedia.org/wiki/Border_Gateway_Protocol) is available. Whether
|
||||
it's using
|
||||
[research-quality equipment](https://sniperinmahwah.wordpress.com/2018/05/07/shortwave-trading-part-i-the-west-chicago-tower-mystery/)
|
||||
for shortwave radio, or just making sure there's no data inadvertently going between data centers,
|
||||
routing matters.
|
||||
|
||||
**Protocol**: TCP as a network protocol is awesome: guaranteed and in-order delivery, flow control,
|
||||
and congestion control all built in. But these attributes make the most sense when networking
|
||||
infrastructure is lossy; for systems that expect nearly all packets to be delivered correctly, the
|
||||
setup handshaking and packet acknowledgment are just overhead. Using UDP (unicast or multicast) may
|
||||
make sense in these contexts as it avoids the chatter needed to track connection state, and
|
||||
[gap-fill](https://iextrading.com/docs/IEX%20Transport%20Specification.pdf)
|
||||
[strategies](http://www.nasdaqtrader.com/content/technicalsupport/specifications/dataproducts/moldudp64.pdf)
|
||||
can handle the rest.
|
||||
|
||||
**Switching**: Many routers/switches handle packets using "store-and-forward" behavior: wait for the
|
||||
whole packet, validate checksums, and then send to the next device. In variance terms, the time
|
||||
needed to move data between two nodes is proportional to the size of that data; the switch must
|
||||
"store" all data before it can calculate checksums and "forward" to the next node. With
|
||||
["cut-through"](https://www.networkworld.com/article/2241573/latency-and-jitter--cut-through-design-pays-off-for-arista--blade.html)
|
||||
designs, switches will begin forwarding data as soon as they know where the destination is,
|
||||
checksums be damned. This means there's a fixed cost (at the switch) for network traffic, no matter
|
||||
the size.
|
||||
|
||||
# Final Thoughts
|
||||
|
||||
High-performance systems, regardless of industry, are not magical. They do require extreme precision
|
||||
and attention to detail, but they're designed, built, and operated by regular people, using a lot of
|
||||
tools that are publicly available. Interested in seeing how context switching affects performance of
|
||||
your benchmarks? `taskset` should be installed in all modern Linux distributions, and can be used to
|
||||
make sure the OS never migrates your process. Curious how often garbage collection triggers during a
|
||||
crucial operation? Your language of choice will typically expose details of its operations
|
||||
([Python](https://docs.python.org/3/library/gc.html),
|
||||
[Java](https://www.oracle.com/technetwork/java/javase/tech/vmoptions-jsp-140102.html#DebuggingOptions)).
|
||||
Want to know how hard your program is stressing the TLB? Use `perf record` and look for
|
||||
`dtlb_load_misses.miss_causes_a_walk`.
|
||||
|
||||
Two final guiding questions, then: first, before attempting to apply some of the technology above to
|
||||
your own systems, can you first identify
|
||||
[where/when you care](http://wiki.c2.com/?PrematureOptimization) about "high-performance"? As an
|
||||
example, if parts of a system rely on humans pushing buttons, CPU pinning won't have any measurable
|
||||
effect. Humans are already far too slow to react in time. Second, if you're using benchmarks, are
|
||||
they being designed in a way that's actually helpful? Tools like
|
||||
[Criterion](http://www.serpentine.com/criterion/) (also in
|
||||
[Rust](https://github.com/bheisler/criterion.rs)) and Google's
|
||||
[Benchmark](https://github.com/google/benchmark) output not only average run time, but variance as
|
||||
well; your benchmarking environment is subject to the same concerns your production environment is.
|
||||
|
||||
Finally, I believe high-performance systems are a matter of philosophy, not necessarily technique.
|
||||
Rigorous focus on variance is the first step, and there are plenty of ways to measure and mitigate
|
||||
it; once that's at an acceptable level, then optimize for speed.
|
@ -1,263 +0,0 @@
|
||||
---
|
||||
layout: post
|
||||
title: "Binary Format Shootout"
|
||||
description: "Cap'n Proto vs. Flatbuffers vs. SBE"
|
||||
category:
|
||||
tags: [rust]
|
||||
---
|
||||
|
||||
I've found that in many personal projects,
|
||||
[analysis paralysis](https://en.wikipedia.org/wiki/Analysis_paralysis) is particularly deadly.
|
||||
Making good decisions in the beginning avoids pain and suffering later; if extra research prevents
|
||||
future problems, I'm happy to continue ~~procrastinating~~ researching indefinitely.
|
||||
|
||||
So let's say you're in need of a binary serialization format. Data will be going over the network,
|
||||
not just in memory, so having a schema document and code generation is a must. Performance is
|
||||
crucial, so formats that support zero-copy de/serialization are given priority. And the more
|
||||
languages supported, the better; I use Rust, but can't predict what other languages this could
|
||||
interact with.
|
||||
|
||||
Given these requirements, the candidates I could find were:
|
||||
|
||||
1. [Cap'n Proto](https://capnproto.org/) has been around the longest, and is the most established
|
||||
2. [Flatbuffers](https://google.github.io/flatbuffers/) is the newest, and claims to have a simpler
|
||||
encoding
|
||||
3. [Simple Binary Encoding](https://github.com/real-logic/simple-binary-encoding) has the simplest
|
||||
encoding, but the Rust implementation is unmaintained
|
||||
|
||||
Any one of these will satisfy the project requirements: easy to transmit over a network, reasonably
|
||||
fast, and polyglot support. But how do you actually pick one? It's impossible to know what issues
|
||||
will follow that choice, so I tend to avoid commitment until the last possible moment.
|
||||
|
||||
Still, a choice must be made. Instead of worrying about which is "the best," I decided to build a
|
||||
small proof-of-concept system in each format and pit them against each other. All code can be found
|
||||
in the [repository](https://github.com/speice-io/marketdata-shootout) for this post.
|
||||
|
||||
We'll discuss more in detail, but a quick preview of the results:
|
||||
|
||||
- Cap'n Proto: Theoretically performs incredibly well, but the implementation had issues
|
||||
- Flatbuffers: Has some quirks, but largely lived up to its "zero-copy" promises
|
||||
- SBE: Best median and worst-case performance, but the message structure has a limited feature set
|
||||
|
||||
# Prologue: Binary Parsing with Nom
|
||||
|
||||
Our benchmark system will be a simple data processor; given depth-of-book market data from
|
||||
[IEX](https://iextrading.com/trading/market-data/#deep), serialize each message into the schema
|
||||
format, read it back, and calculate total size of stock traded and the lowest/highest quoted prices.
|
||||
This test isn't complex, but is representative of the project I need a binary format for.
|
||||
|
||||
But before we make it to that point, we have to actually read in the market data. To do so, I'm
|
||||
using a library called [`nom`](https://github.com/Geal/nom). Version 5.0 was recently released and
|
||||
brought some big changes, so this was an opportunity to build a non-trivial program and get
|
||||
familiar.
|
||||
|
||||
If you don't already know about `nom`, it's a "parser combinator" library. By combining different smaller
|
||||
parsers, you can assemble a parser to handle complex structures without writing tedious code by
|
||||
hand. For example, when parsing
|
||||
[PCAP files](https://www.winpcap.org/ntar/draft/PCAP-DumpFileFormat.html#rfc.section.3.3):
|
||||
|
||||
```
|
||||
0 1 2 3
|
||||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
|
||||
+---------------------------------------------------------------+
|
||||
0 | Block Type = 0x00000006 |
|
||||
+---------------------------------------------------------------+
|
||||
4 | Block Total Length |
|
||||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||||
8 | Interface ID |
|
||||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||||
12 | Timestamp (High) |
|
||||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||||
16 | Timestamp (Low) |
|
||||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||||
20 | Captured Len |
|
||||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||||
24 | Packet Len |
|
||||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||||
| Packet Data |
|
||||
| ... |
|
||||
```
|
||||
|
||||
...you can build a parser in `nom` that looks like
|
||||
[this](https://github.com/speice-io/marketdata-shootout/blob/369613843d39cfdc728e1003123bf87f79422497/src/parsers.rs#L59-L93):
|
||||
|
||||
```rust
|
||||
const ENHANCED_PACKET: [u8; 4] = [0x06, 0x00, 0x00, 0x00];
|
||||
pub fn enhanced_packet_block(input: &[u8]) -> IResult<&[u8], &[u8]> {
|
||||
let (
|
||||
remaining,
|
||||
(
|
||||
block_type,
|
||||
block_len,
|
||||
interface_id,
|
||||
timestamp_high,
|
||||
timestamp_low,
|
||||
captured_len,
|
||||
packet_len,
|
||||
),
|
||||
) = tuple((
|
||||
tag(ENHANCED_PACKET),
|
||||
le_u32,
|
||||
le_u32,
|
||||
le_u32,
|
||||
le_u32,
|
||||
le_u32,
|
||||
le_u32,
|
||||
))(input)?;
|
||||
|
||||
let (remaining, packet_data) = take(captured_len)(remaining)?;
|
||||
Ok((remaining, packet_data))
|
||||
}
|
||||
```
|
||||
|
||||
While this example isn't too interesting, more complex formats (like IEX market data) are where
|
||||
[`nom` really shines](https://github.com/speice-io/marketdata-shootout/blob/369613843d39cfdc728e1003123bf87f79422497/src/iex.rs).
|
||||
|
||||
Ultimately, because the `nom` code in this shootout was the same for all formats, we're not too
|
||||
interested in its performance. Still, it's worth mentioning that building the market data parser was
|
||||
actually fun; I didn't have to write tons of boring code by hand.
|
||||
|
||||
# Part 1: Cap'n Proto
|
||||
|
||||
Now it's time to get into the meaty part of the story. Cap'n Proto was the first format I tried
|
||||
because of how long it has supported Rust (thanks to [dwrensha](https://github.com/dwrensha) for
|
||||
maintaining the Rust port since
|
||||
[2014!](https://github.com/capnproto/capnproto-rust/releases/tag/rustc-0.10)). However, I had a ton
|
||||
of performance concerns once I started using it.
|
||||
|
||||
To serialize new messages, Cap'n Proto uses a "builder" object. This builder allocates memory on the
|
||||
heap to hold the message content, but because builders
|
||||
[can't be re-used](https://github.com/capnproto/capnproto-rust/issues/111), we have to allocate a
|
||||
new buffer for every single message. I was able to work around this with a
|
||||
[special builder](https://github.com/speice-io/marketdata-shootout/blob/369613843d39cfdc728e1003123bf87f79422497/src/capnp_runner.rs#L17-L51)
|
||||
that could re-use the buffer, but it required reading through Cap'n Proto's
|
||||
[benchmarks](https://github.com/capnproto/capnproto-rust/blob/master/benchmark/benchmark.rs#L124-L156)
|
||||
to find an example, and used
|
||||
[`std::mem::transmute`](https://doc.rust-lang.org/std/mem/fn.transmute.html) to bypass Rust's borrow
|
||||
checker.
|
||||
|
||||
The process of reading messages was better, but still had issues. Cap'n Proto has two message
|
||||
encodings: a ["packed"](https://capnproto.org/encoding.html#packing) representation, and an
|
||||
"unpacked" version. When reading "packed" messages, we need a buffer to unpack the message into
|
||||
before we can use it; Cap'n Proto allocates a new buffer for each message we unpack, and I wasn't
|
||||
able to figure out a way around that. In contrast, the unpacked message format should be where Cap'n
|
||||
Proto shines; its main selling point is that there's [no decoding step](https://capnproto.org/).
|
||||
However, accomplishing zero-copy deserialization required code in the private API
|
||||
([since fixed](https://github.com/capnproto/capnproto-rust/issues/148)), and we allocate a vector on
|
||||
every read for the segment table.
|
||||
|
||||
In the end, I put in significant work to make Cap'n Proto as fast as possible, but there were too
|
||||
many issues for me to feel comfortable using it long-term.
|
||||
|
||||
# Part 2: Flatbuffers
|
||||
|
||||
This is the new kid on the block. After a
|
||||
[first attempt](https://github.com/google/flatbuffers/pull/3894) didn't pan out, official Rust support
|
||||
was [recently launched](https://github.com/google/flatbuffers/pull/4898). Flatbuffers intends to
|
||||
address the same problems as Cap'n Proto: high-performance, polyglot, binary messaging. The
|
||||
difference is that Flatbuffers claims to have a simpler wire format and
|
||||
[more flexibility](https://google.github.io/flatbuffers/flatbuffers_benchmarks.html).
|
||||
|
||||
On the whole, I enjoyed using Flatbuffers; the [tooling](https://crates.io/crates/flatc-rust) is
|
||||
nice, and unlike Cap'n Proto, parsing messages was actually zero-copy and zero-allocation. However,
|
||||
there were still some issues.
|
||||
|
||||
First, Flatbuffers (at least in Rust) can't handle nested vectors. This is a problem for formats
|
||||
like the following:
|
||||
|
||||
```
|
||||
table Message {
|
||||
symbol: string;
|
||||
}
|
||||
table MultiMessage {
|
||||
messages:[Message];
|
||||
}
|
||||
```
|
||||
|
||||
We want to create a `MultiMessage` which contains a vector of `Message`, and each `Message` itself
|
||||
contains a vector (the `string` type). I was able to work around this by
|
||||
[caching `Message` elements](https://github.com/speice-io/marketdata-shootout/blob/e9d07d148bf36a211a6f86802b313c4918377d1b/src/flatbuffers_runner.rs#L83)
|
||||
in a `SmallVec` before building the final `MultiMessage`, but it was a painful process that I
|
||||
believe contributed to poor serialization performance.
|
||||
|
||||
Second, streaming support in Flatbuffers seems to be something of an
|
||||
[afterthought](https://github.com/google/flatbuffers/issues/3898). Where Cap'n Proto in Rust handles
|
||||
reading messages from a stream as part of the API, Flatbuffers just sticks a `u32` at the front of
|
||||
each message to indicate the size. Not specifically a problem, but calculating message size without
|
||||
that tag is nigh on impossible.
|
||||
|
||||
Ultimately, I enjoyed using Flatbuffers, and had to do significantly less work to make it perform
|
||||
well.
|
||||
|
||||
# Part 3: Simple Binary Encoding
|
||||
|
||||
Support for SBE was added by the author of one of my favorite
|
||||
[Rust blog posts](https://web.archive.org/web/20190427124806/https://polysync.io/blog/session-types-for-hearty-codecs/).
|
||||
I've [talked previously]({% post_url 2019-06-31-high-performance-systems %}) about how important
|
||||
variance is in high-performance systems, so it was encouraging to read about a format that
|
||||
[directly addressed](https://github.com/real-logic/simple-binary-encoding/wiki/Why-Low-Latency) my
|
||||
concerns. SBE has by far the simplest binary format, but it does make some tradeoffs.
|
||||
|
||||
Both Cap'n Proto and Flatbuffers use [message offsets](https://capnproto.org/encoding.html#structs)
|
||||
to handle variable-length data, [unions](https://capnproto.org/language.html#unions), and various
|
||||
other features. In contrast, messages in SBE are essentially
|
||||
[just structs](https://github.com/real-logic/simple-binary-encoding/blob/master/sbe-samples/src/main/resources/example-schema.xml);
|
||||
variable-length data is supported, but there's no union type.
|
||||
|
||||
As mentioned in the beginning, the Rust port of SBE works well, but is
|
||||
[essentially unmaintained](https://users.rust-lang.org/t/zero-cost-abstraction-frontier-no-copy-low-allocation-ordered-decoding/11515/9).
|
||||
However, if you don't need union types, and can accept that schemas are XML documents, it's still
|
||||
worth using. SBE's implementation had the best streaming support of all formats I tested, and
|
||||
doesn't trigger allocation during de/serialization.
|
||||
|
||||
# Results
|
||||
|
||||
After building a test harness
|
||||
[for](https://github.com/speice-io/marketdata-shootout/blob/master/src/capnp_runner.rs)
|
||||
[each](https://github.com/speice-io/marketdata-shootout/blob/master/src/flatbuffers_runner.rs)
|
||||
[format](https://github.com/speice-io/marketdata-shootout/blob/master/src/sbe_runner.rs), it was
|
||||
time to actually take them for a spin. I used
|
||||
[this script](https://github.com/speice-io/marketdata-shootout/blob/master/run_shootout.sh) to run
|
||||
the benchmarks, and the raw results are
|
||||
[here](https://github.com/speice-io/marketdata-shootout/blob/master/shootout.csv). All data reported
|
||||
below is the average of 10 runs on a single day of IEX data. Results were validated to make sure
|
||||
that each format parsed the data correctly.
|
||||
|
||||
## Serialization
|
||||
|
||||
This test measures, on a
|
||||
[per-message basis](https://github.com/speice-io/marketdata-shootout/blob/master/src/main.rs#L268-L272),
|
||||
how long it takes to serialize the IEX message into the desired format and write to a pre-allocated
|
||||
buffer.
|
||||
|
||||
| Schema | Median | 99th Pctl | 99.9th Pctl | Total |
|
||||
| :------------------- | :----- | :-------- | :---------- | :----- |
|
||||
| Cap'n Proto Packed | 413ns | 1751ns | 2943ns | 14.80s |
|
||||
| Cap'n Proto Unpacked | 273ns | 1828ns | 2836ns | 10.65s |
|
||||
| Flatbuffers | 355ns | 2185ns | 3497ns | 14.31s |
|
||||
| SBE | 91ns | 1535ns | 2423ns | 3.91s |
|
||||
|
||||
## Deserialization
|
||||
|
||||
This test measures, on a
|
||||
[per-message basis](https://github.com/speice-io/marketdata-shootout/blob/master/src/main.rs#L294-L298),
|
||||
how long it takes to read the previously-serialized message and perform some basic aggregation. The
|
||||
aggregation code is the same for each format, so any performance differences are due solely to the
|
||||
format implementation.
|
||||
|
||||
| Schema | Median | 99th Pctl | 99.9th Pctl | Total |
|
||||
| :------------------- | :----- | :-------- | :---------- | :----- |
|
||||
| Cap'n Proto Packed | 539ns | 1216ns | 2599ns | 18.92s |
|
||||
| Cap'n Proto Unpacked | 366ns | 737ns | 1583ns | 12.32s |
|
||||
| Flatbuffers | 173ns | 421ns | 1007ns | 6.00s |
|
||||
| SBE | 116ns | 286ns | 659ns | 4.05s |
|
||||
|
||||
# Conclusion
|
||||
|
||||
Building a benchmark turned out to be incredibly helpful in making a decision; because a "union"
|
||||
type isn't important to me, I can be confident that SBE best addresses my needs.
|
||||
|
||||
While SBE was the fastest in terms of both median and worst-case performance, its worst-case
|
||||
performance was proportionately far higher than any other format. It seems that
|
||||
de/serialization time scales with message size, but I'll need to do some more research to understand
|
||||
what exactly is going on.
|
@ -1,370 +0,0 @@
|
||||
---
|
||||
layout: post
|
||||
title: "Release the GIL"
|
||||
description: "Strategies for Parallelism in Python"
|
||||
category:
|
||||
tags: [python]
|
||||
---
|
||||
|
||||
Complaining about the [Global Interpreter Lock](https://wiki.python.org/moin/GlobalInterpreterLock)
|
||||
(GIL) seems like a rite of passage for Python developers. It's easy to criticize a design decision
|
||||
made before multi-core CPUs were widely available, but the fact that it's still around indicates
|
||||
that it generally works [Good](https://wiki.c2.com/?PrematureOptimization)
|
||||
[Enough](https://wiki.c2.com/?YouArentGonnaNeedIt). Besides, there are simple and effective
|
||||
workarounds; it's not hard to start a
|
||||
[new process](https://docs.python.org/3/library/multiprocessing.html) and use message passing to
|
||||
synchronize code running in parallel.
|
||||
|
||||
Still, wouldn't it be nice to have more than a single active interpreter thread? In an age of
|
||||
asynchronicity and _M:N_ threading, Python seems lacking. The ideal scenario is to take advantage of
|
||||
both Python's productivity and the modern CPU's parallel capabilities.
|
||||
|
||||
Presented below are two strategies for releasing the GIL's icy grip without giving up on what makes
|
||||
Python a nice language to start with. Bear in mind: these are just the tools; no claim is made about
|
||||
whether it's a good idea to use them. Very often, unlocking the GIL is an
|
||||
[XY problem](https://en.wikipedia.org/wiki/XY_problem); you want application performance, and the
|
||||
GIL seems like an obvious bottleneck. Remember that any gains from running code in parallel come at
|
||||
the expense of project complexity; messing with the GIL is ultimately messing with Python's memory
|
||||
model.
|
||||
|
||||
```python
|
||||
%load_ext Cython
|
||||
from numba import jit
|
||||
|
||||
N = 1_000_000_000
|
||||
```
|
||||
|
||||
# Cython
|
||||
|
||||
Put simply, [Cython](https://cython.org/) is a programming language that looks a lot like Python,
|
||||
gets [transpiled](https://en.wikipedia.org/wiki/Source-to-source_compiler) to C/C++, and integrates
|
||||
well with the [CPython](https://en.wikipedia.org/wiki/CPython) API. It's great for building Python
|
||||
wrappers to C and C++ libraries, writing optimized code for numerical processing, and tons more. And
|
||||
when it comes to managing the GIL, there are two special features:
|
||||
|
||||
- The `nogil`
|
||||
[function annotation](https://cython.readthedocs.io/en/latest/src/userguide/external_C_code.html#declaring-a-function-as-callable-without-the-gil)
|
||||
asserts that a Cython function is safe to use without the GIL, and compilation will fail if it
|
||||
interacts with Python in an unsafe manner
|
||||
- The `with nogil`
|
||||
[context manager](https://cython.readthedocs.io/en/latest/src/userguide/external_C_code.html#releasing-the-gil)
|
||||
explicitly unlocks the CPython GIL while active
|
||||
|
||||
Whenever Cython code runs inside a `with nogil` block on a separate thread, the Python interpreter
|
||||
is unblocked and allowed to continue work elsewhere. We'll define a "busy work" function that
|
||||
demonstrates this principle in action:
|
||||
|
||||
```python
|
||||
%%cython
|
||||
|
||||
# Annotating a function with `nogil` indicates only that it is safe
|
||||
# to call in a `with nogil` block. It *does not* release the GIL.
|
||||
cdef unsigned long fibonacci(unsigned long n) nogil:
|
||||
if n <= 1:
|
||||
return n
|
||||
|
||||
cdef unsigned long a = 0, b = 1, c = 0
|
||||
|
||||
c = a + b
|
||||
for _i in range(2, n):
|
||||
a = b
|
||||
b = c
|
||||
c = a + b
|
||||
|
||||
return c
|
||||
|
||||
|
||||
def cython_nogil(unsigned long n):
|
||||
# Explicitly release the GIL while running `fibonacci`
|
||||
with nogil:
|
||||
value = fibonacci(n)
|
||||
|
||||
return value
|
||||
|
||||
|
||||
def cython_gil(unsigned long n):
|
||||
# Because the GIL is not explicitly released, it implicitly
|
||||
# remains acquired when running the `fibonacci` function
|
||||
return fibonacci(n)
|
||||
```
|
||||
|
||||
First, let's time how long it takes Cython to calculate the billionth Fibonacci number:
|
||||
|
||||
```python
|
||||
%%time
|
||||
_ = cython_gil(N);
|
||||
```
|
||||
|
||||
> <pre>
|
||||
> CPU times: user 365 ms, sys: 0 ns, total: 365 ms
|
||||
> Wall time: 372 ms
|
||||
> </pre>
|
||||
|
||||
```python
|
||||
%%time
|
||||
_ = cython_nogil(N);
|
||||
```
|
||||
|
||||
> <pre>
|
||||
> CPU times: user 381 ms, sys: 0 ns, total: 381 ms
|
||||
> Wall time: 388 ms
|
||||
> </pre>
|
||||
|
||||
Both versions (with and without GIL) take effectively the same amount of time to run. Even when
|
||||
running this calculation in parallel on separate threads, it is expected that the run time will
|
||||
double because only one thread can be active at a time:
|
||||
|
||||
```python
|
||||
%%time
|
||||
from threading import Thread
|
||||
|
||||
# Create the two threads to run on
|
||||
t1 = Thread(target=cython_gil, args=[N])
|
||||
t2 = Thread(target=cython_gil, args=[N])
|
||||
# Start the threads
|
||||
t1.start(); t2.start()
|
||||
# Wait for the threads to finish
|
||||
t1.join(); t2.join()
|
||||
```
|
||||
|
||||
> <pre>
|
||||
> CPU times: user 641 ms, sys: 5.62 ms, total: 647 ms
|
||||
> Wall time: 645 ms
|
||||
> </pre>
|
||||
|
||||
However, if the first thread releases the GIL, the second thread is free to acquire it and run in
|
||||
parallel:
|
||||
|
||||
```python
|
||||
%%time
|
||||
|
||||
t1 = Thread(target=cython_nogil, args=[N])
|
||||
t2 = Thread(target=cython_gil, args=[N])
|
||||
t1.start(); t2.start()
|
||||
t1.join(); t2.join()
|
||||
```
|
||||
|
||||
> <pre>
|
||||
> CPU times: user 717 ms, sys: 372 µs, total: 718 ms
|
||||
> Wall time: 358 ms
|
||||
> </pre>
|
||||
|
||||
Because `user` time represents the sum of processing time on all threads, it doesn't change much.
|
||||
The ["wall time"](https://en.wikipedia.org/wiki/Elapsed_real_time) has been cut roughly in half
|
||||
because each function is running simultaneously.
|
||||
|
||||
Keep in mind that the **order in which threads are started** makes a difference!
|
||||
|
||||
```python
|
||||
%%time
|
||||
|
||||
# Note that the GIL-locked version is started first
|
||||
t1 = Thread(target=cython_gil, args=[N])
|
||||
t2 = Thread(target=cython_nogil, args=[N])
|
||||
t1.start(); t2.start()
|
||||
t1.join(); t2.join()
|
||||
```
|
||||
|
||||
> <pre>
|
||||
> CPU times: user 667 ms, sys: 0 ns, total: 667 ms
|
||||
> Wall time: 672 ms
|
||||
> </pre>
|
||||
|
||||
Even though the second thread releases the GIL while running, it can't start until the first has
|
||||
completed. Thus, the overall runtime is effectively the same as running two GIL-locked threads.
|
||||
|
||||
Finally, be aware that attempting to unlock the GIL from a thread that doesn't own it will crash the
|
||||
**interpreter**, not just the thread attempting the unlock:
|
||||
|
||||
```python
|
||||
%%cython
|
||||
|
||||
cdef int cython_recurse(int n) nogil:
|
||||
if n <= 0:
|
||||
return 0
|
||||
|
||||
with nogil:
|
||||
return cython_recurse(n - 1)
|
||||
|
||||
cython_recurse(2)
|
||||
```
|
||||
|
||||
> <pre>
|
||||
> Fatal Python error: PyEval_SaveThread: NULL tstate
|
||||
>
|
||||
> Thread 0x00007f499effd700 (most recent call first):
|
||||
> File "/home/bspeice/.virtualenvs/release-the-gil/lib/python3.7/site-packages/ipykernel/parentpoller.py", line 39 in run
|
||||
> File "/usr/lib/python3.7/threading.py", line 926 in _bootstrap_inner
|
||||
> File "/usr/lib/python3.7/threading.py", line 890 in _bootstrap
|
||||
> </pre>
|
||||
|
||||
In practice, avoiding this issue is simple. First, `nogil` functions probably shouldn't contain
|
||||
`with nogil` blocks. Second, Cython can
|
||||
[conditionally acquire/release](https://cython.readthedocs.io/en/latest/src/userguide/external_C_code.html#conditional-acquiring-releasing-the-gil)
|
||||
the GIL, so these conditions can be used to synchronize access. Finally, Cython's documentation for
|
||||
[external C code](https://cython.readthedocs.io/en/latest/src/userguide/external_C_code.html#acquiring-and-releasing-the-gil)
|
||||
contains more detail on how to safely manage the GIL.
|
||||
|
||||
To conclude: use Cython's `nogil` annotation to assert that functions are safe for calling when the
|
||||
GIL is unlocked, and `with nogil` to actually unlock the GIL and run those functions.
|
||||
|
||||
# Numba
|
||||
|
||||
Like Cython, [Numba](https://numba.pydata.org/) is a "compiled Python." Where Cython works by
|
||||
compiling a Python-like language to C/C++, Numba compiles Python bytecode _directly to machine code_
|
||||
at runtime. Behavior is controlled with a special `@jit` decorator; calling a decorated function
|
||||
first compiles it to machine code before running. Calling the function a second time re-uses that
|
||||
machine code unless the argument types have changed.
|
||||
|
||||
Numba works best when a `nopython=True` argument is added to the `@jit` decorator; functions
|
||||
compiled in [`nopython`](http://numba.pydata.org/numba-doc/latest/user/jit.html?#nopython) mode
|
||||
avoid the CPython API and have performance comparable to C. Further, adding `nogil=True` to the
|
||||
`@jit` decorator unlocks the GIL while that function is running. Note that `nogil` and `nopython`
|
||||
are separate arguments; while it is necessary for code to be compiled in `nopython` mode in order to
|
||||
release the lock, the GIL will remain locked if `nogil=False` (the default).
|
||||
|
||||
Let's repeat the same experiment, this time using Numba instead of Cython:
|
||||
|
||||
```python
|
||||
# The `int` type annotation is only for humans and is ignored
|
||||
# by Numba.
|
||||
@jit(nopython=True, nogil=True)
|
||||
def numba_nogil(n: int) -> int:
|
||||
if n <= 1:
|
||||
return n
|
||||
|
||||
a = 0
|
||||
b = 1
|
||||
|
||||
c = a + b
|
||||
for _i in range(2, n):
|
||||
a = b
|
||||
b = c
|
||||
c = a + b
|
||||
|
||||
return c
|
||||
|
||||
|
||||
# Run using `nopython` mode to receive a performance boost,
|
||||
# but GIL remains locked due to `nogil=False` by default.
|
||||
@jit(nopython=True)
|
||||
def numba_gil(n: int) -> int:
|
||||
if n <= 1:
|
||||
return n
|
||||
|
||||
a = 0
|
||||
b = 1
|
||||
|
||||
c = a + b
|
||||
for _i in range(2, n):
|
||||
a = b
|
||||
b = c
|
||||
c = a + b
|
||||
|
||||
return c
|
||||
|
||||
|
||||
# Call each function once to force compilation; we don't want
|
||||
# the timing statistics to include how long it takes to compile.
|
||||
numba_nogil(N)
|
||||
numba_gil(N);
|
||||
```
|
||||
|
||||
We'll perform the same tests as above; first, figure out how long it takes the function to run:
|
||||
|
||||
```python
|
||||
%%time
|
||||
_ = numba_gil(N)
|
||||
```
|
||||
|
||||
> <pre>
|
||||
> CPU times: user 253 ms, sys: 258 µs, total: 253 ms
|
||||
> Wall time: 251 ms
|
||||
> </pre>
|
||||
|
||||
<span style="font-size: .8em">
|
||||
Aside: it's not immediately clear why Numba takes ~30% less time to run than Cython for code that should be
|
||||
effectively identical after compilation.
|
||||
</span>
|
||||
|
||||
When running two GIL-locked threads, the result (as expected) takes around twice as long to compute:
|
||||
|
||||
```python
|
||||
%%time
|
||||
t1 = Thread(target=numba_gil, args=[N])
|
||||
t2 = Thread(target=numba_gil, args=[N])
|
||||
t1.start(); t2.start()
|
||||
t1.join(); t2.join()
|
||||
```
|
||||
|
||||
> <pre>
|
||||
> CPU times: user 541 ms, sys: 3.96 ms, total: 545 ms
|
||||
> Wall time: 541 ms
|
||||
> </pre>
|
||||
|
||||
But if the GIL-unlocking thread starts first, both threads run in parallel:
|
||||
|
||||
```python
|
||||
%%time
|
||||
t1 = Thread(target=numba_nogil, args=[N])
|
||||
t2 = Thread(target=numba_gil, args=[N])
|
||||
t1.start(); t2.start()
|
||||
t1.join(); t2.join()
|
||||
```
|
||||
|
||||
> <pre>
|
||||
> CPU times: user 551 ms, sys: 7.77 ms, total: 559 ms
|
||||
> Wall time: 279 ms
|
||||
> </pre>
|
||||
|
||||
Just like Cython, starting the GIL-locked thread first leads to poor performance:
|
||||
|
||||
```python
|
||||
%%time
|
||||
t1 = Thread(target=numba_gil, args=[N])
|
||||
t2 = Thread(target=numba_nogil, args=[N])
|
||||
t1.start(); t2.start()
|
||||
t1.join(); t2.join()
|
||||
```
|
||||
|
||||
> <pre>
|
||||
> CPU times: user 524 ms, sys: 0 ns, total: 524 ms
|
||||
> Wall time: 522 ms
|
||||
> </pre>
|
||||
|
||||
Finally, unlike Cython, Numba will unlock the GIL if and only if it is currently acquired;
|
||||
recursively calling `@jit(nogil=True)` functions is perfectly safe:
|
||||
|
||||
```python
|
||||
from numba import jit
|
||||
|
||||
@jit(nopython=True, nogil=True)
|
||||
def numba_recurse(n: int) -> int:
|
||||
if n <= 0:
|
||||
return 0
|
||||
|
||||
return numba_recurse(n - 1)
|
||||
|
||||
numba_recurse(2);
|
||||
```
|
||||
|
||||
# Conclusion
|
||||
|
||||
Before finishing, it's important to address pain points that will show up if these techniques are
|
||||
used in a more realistic project:
|
||||
|
||||
First, code running in a GIL-free context will likely also need non-trivial data structures;
|
||||
GIL-free functions aren't useful if they're constantly interacting with Python objects whose access
|
||||
requires the GIL. Cython provides
|
||||
[extension types](http://docs.cython.org/en/latest/src/tutorial/cdef_classes.html) and Numba
|
||||
provides a [`@jitclass`](https://numba.pydata.org/numba-doc/dev/user/jitclass.html) decorator to
|
||||
address this need.
|
||||
|
||||
Second, building and distributing applications that make use of Cython/Numba can be complicated.
|
||||
Cython packages require running the compiler, (potentially) linking/packaging external dependencies,
|
||||
and distributing a binary wheel. Numba is generally simpler because the code being distributed is
|
||||
pure Python, but can be tricky since errors aren't detected until runtime.
|
||||
|
||||
Finally, while unlocking the GIL is often a solution in search of a problem, both Cython and Numba
|
||||
provide tools to directly manage the GIL when appropriate. This enables true parallelism (not just
|
||||
[concurrency](https://stackoverflow.com/a/1050257)) that is impossible in vanilla Python.
|
@ -1,60 +0,0 @@
|
||||
---
|
||||
layout: post
|
||||
title: "The webpack industrial complex"
|
||||
description: "Reflections on a new project"
|
||||
category:
|
||||
tags: [webpack, react, vite]
|
||||
---
|
||||
|
||||
This started because I wanted to build a synthesizer. Setting a goal of "digital DX7" was ambitious, but I needed something unrelated to the day job. Beyond that, working with audio seemed like a good challenge. I enjoy performance-focused code, and performance problems in audio are conspicuous. Building a web project was an obvious choice because of the web audio API documentation and independence from a large Digital Audio Workstation (DAW).
|
||||
|
||||
The project was soon derailed trying to sort out technical issues unrelated to the original purpose. Finding a resolution was a frustrating journey, and it's still not clear whether those problems were my fault. As a result, I'm writing this to try making sense of it, as a case study/reference material, and to salvage something from the process.
|
||||
|
||||
## Starting strong
|
||||
|
||||
The sole starting requirement was to write everything in TypeScript. Not because of project scale, but because guardrails help with unfamiliar territory. Keeping that in mind, the first question was: how does one start a new project? All I actually need is "compile TypeScript, show it in a browser."
|
||||
|
||||
Create React App (CRA) came to the rescue and the rest of that evening was a joy. My TypeScript/JavaScript skills were rusty, but the online documentation was helpful. I had never understood the appeal of JSX (why put a DOM in JavaScript?) until it made connecting an `onEvent` handler and a function easy.
|
||||
|
||||
Some quick dimensional analysis later and there was a sine wave oscillator playing A=440 through the speakers. I specifically remember thinking "modern browsers are magical."
|
||||
|
||||
## Continuing on
|
||||
|
||||
Now comes the first mistake: I began to worry about "scale" before encountering an actual problem. Rather than rendering audio in the main thread, why not use audio worklets and render in a background thread instead?
|
||||
|
||||
The first sign something was amiss came from the TypeScript compiler errors showing the audio worklet API [was missing](https://github.com/microsoft/TypeScript/issues/28308). After searching out GitHub issues and (unsuccessfully) tweaking the `tsconfig.json` settings, I settled on installing a package and moving on.
|
||||
|
||||
The next problem came from actually using the API. Worklets must load from separate "modules," but it wasn't clear how to guarantee the worklet code stayed separate from the application. I saw recommendations to use `new URL(<local path>, import.meta.url)` and it worked! Well, kind of:
|
||||
|
||||
![Browser error](/assets/images/2022-11-20-video_mp2t.png)
|
||||
|
||||
That file has the audio processor code, so why does it get served with `Content-Type: video/mp2t`?
|
||||
|
||||
## Floundering about
|
||||
|
||||
Now comes the second mistake: even though I didn't understand the error, I ignored recommendations to [just use JavaScript](https://hackernoon.com/implementing-audioworklets-with-react-8a80a470474) and stuck by the original TypeScript requirement.
|
||||
|
||||
I tried different project structures. Moving the worklet code to a new folder didn't help, nor did setting up a monorepo and placing it in a new package.
|
||||
|
||||
I tried three different CRA tools - `react-app-rewired`, `craco`, `customize-react-app` - but got the same problem. Each has varying levels of compatibility with recent CRA versions, so it wasn't clear if I had the right solution but implemented it incorrectly. After attempting to eject the application and panicking after seeing the configuration, I abandoned that as well.
|
||||
|
||||
I tried changing the webpack configuration: using [new](https://github.com/webpack/webpack/issues/11543#issuecomment-917673256) [loaders](https://github.com/popelenkow/worker-url), setting [asset rules](https://github.com/webpack/webpack/discussions/14093#discussioncomment-1257149), even [changing how webpack detects worker resources](https://github.com/webpack/webpack/issues/11543#issuecomment-826897590). In hindsight, entry points may have been the answer. But because CRA actively resists attempts to change its webpack configuration, and I couldn't find audio worklet examples in any other framework, I gave up.
|
||||
|
||||
I tried so many application frameworks. Next.js looked like a good candidate, but added its own [bespoke webpack complexity](https://github.com/vercel/next.js/issues/24907) to the existing confusion. Astro had the best "getting started" experience, but I refuse to install an IDE-specific plugin. I first used Deno while exploring Lume, but it couldn't import the audio worklet types (maybe because of module compatibility?). Each framework was unique in its own way (shout-out to SvelteKit) but I couldn't figure out how to make them work.
|
||||
|
||||
## Learning and reflecting
|
||||
|
||||
I ended up using Vite and vite-plugin-react-pages to handle both "build the app" and "bundle worklets," but the specific tool choice isn't important. Instead, the focus should be on lessons learned.
|
||||
|
||||
For myself:
|
||||
|
||||
- I'm obsessed with tooling, to the point it can derail the original goal. While it comes from a good place (for example: "types are awesome"), it can get in the way of more important work
|
||||
- I tend to reach for online resources right after seeing a new problem. While finding help online is often faster, spending time understanding the problem would have been more productive than cycling through (often outdated) blog posts
|
||||
|
||||
For the tools:
|
||||
|
||||
- Resource bundling is great and solves a genuine challenge. I've heard too many horror stories of developers writing modules by hand to believe this is unnecessary complexity
|
||||
- Webpack is a build system and modern frameworks are deeply dependent on it (hence the "webpack industrial complex"). While this often saves users from unnecessary complexity, there's no path forward if something breaks
|
||||
- There's little ability to mix and match tools across frameworks. Next.js and Gatsby let users extend webpack, but because each framework adds its own modules, changes aren't portable. After spending a week looking at webpack, I had an example running with parcel in thirty minutes, but couldn't integrate it
|
||||
|
||||
In the end, learning new systems is fun, but a focus on tools that "just work" can leave users out in the cold if they break down.
|
1
archive/index.html
Normal file
@ -1,15 +0,0 @@
|
||||
@font-face {
|
||||
font-family: 'JetBrains Mono';
|
||||
src: url('/assets/font/JetBrainsMono-Regular.woff2') format('woff2'),
|
||||
url('/assets/font/JetBrainsMono-Regular.woff') format('woff');
|
||||
font-weight: normal;
|
||||
font-style: normal;
|
||||
}
|
||||
|
||||
@font-face {
|
||||
font-family: 'Lato';
|
||||
src: url('/assets/font/lato-regular-webfont.woff2') format('woff2'),
|
||||
url('/assets/font/lato-regular-webfont.woff') format('woff');
|
||||
font-weight: normal;
|
||||
font-style: normal;
|
||||
}
|
@ -1,119 +0,0 @@
|
||||
---
|
||||
---
|
||||
|
||||
// Import the theme rules
|
||||
@import "theme";
|
||||
|
||||
body {
|
||||
max-width: 100%;
|
||||
overflow-x: hidden;
|
||||
font-family: 'Lato', sans-serif;
|
||||
}
|
||||
|
||||
.navbar {
|
||||
color: $gray;
|
||||
}
|
||||
|
||||
.separator {
|
||||
margin-right: .45rem;
|
||||
margin-left: .25rem;
|
||||
color: #000;
|
||||
&:after {
|
||||
content: '\00a0/';
|
||||
}
|
||||
}
|
||||
|
||||
// Page header spacing: 80px of padding above, none below.
header {
|
||||
padding-top: 80px;
|
||||
padding-bottom: 0;
|
||||
}
|
||||
|
||||
// Force black text on `h1` inside `header` and on `h2`.
// NOTE(review): as written, the second selector is a bare `h2`, so this matches
// every `h2` on the page, not only those inside `header`. If only header
// headings were intended this should read `header h1, header h2` - confirm
// before changing, since post bodies may rely on the current behavior.
header h1,h2 {
|
||||
color: #000;
|
||||
}
|
||||
|
||||
.post-description {
|
||||
color: #555;
|
||||
}
|
||||
|
||||
.post-container a {
|
||||
color: #555;
|
||||
border-bottom-color: $gray;
|
||||
border-bottom-style: dotted;
|
||||
border-bottom-width: 1px;
|
||||
|
||||
position: relative;
|
||||
display: inline-block;
|
||||
padding: 1px 1px;
|
||||
transition: color ease 0.3s;
|
||||
|
||||
&::after {
|
||||
content: '';
|
||||
position: absolute;
|
||||
z-index: -1;
|
||||
width: 100%;
|
||||
height: 0%;
|
||||
left: 0;
|
||||
bottom: 0;
|
||||
background-color: $gray;
|
||||
transition: all ease 0.3s;
|
||||
}
|
||||
|
||||
&:hover {
|
||||
color: #fff;
|
||||
border-bottom-style: solid;
|
||||
&::after {
|
||||
height: 100%;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
body pre {
|
||||
font-size: 15px;
|
||||
}
|
||||
|
||||
pre.highlight, code {
|
||||
font-family: 'JetBrains Mono', monospace;
|
||||
}
|
||||
|
||||
div.highlighter-rouge {
|
||||
// Default theme uses `width: 100vw`, which while cool, does cause the page
|
||||
// to exceed screen width and trigger horizontal scrolling. No bueno.
|
||||
width: 99vw;
|
||||
}
|
||||
|
||||
.post-date {
|
||||
// On the front page, make sure titles don't force wrapping the date box content
|
||||
text-align: right;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
// Block quotes: gray text, flush with the left margin, marked by a 5px solid
// `$gray` bar on the left with 1.8rem of padding between bar and text.
blockquote {
|
||||
color: #555;
|
||||
// NOTE(review): no `position` is set in this rule, and `right` only applies to
// positioned elements - confirm the imported theme positions blockquotes,
// otherwise this declaration has no effect.
right: 100px;
|
||||
margin-left: 0;
|
||||
padding-left: 1.8rem;
|
||||
border-left: 5px solid $gray;
|
||||
}
|
||||
|
||||
// Flex container for the post navigation row, with 1em of vertical margin
// above and below. Its child `div`s are sized by the `.post-nav div` rule.
.post-nav {
|
||||
display: flex;
|
||||
margin-top: 1em;
|
||||
margin-bottom: 1em;
|
||||
}
|
||||
.post-nav div {
|
||||
/* flex-grow, flex-shrink, flex-basis */
|
||||
flex: 1 1 0;
|
||||
}
|
||||
.post-nav-next {
|
||||
text-align: right;
|
||||
}
|
||||
|
||||
th, td {
|
||||
border-bottom: 1px solid $gray;
|
||||
padding: 0.75em;
|
||||
}
|
1
assets/css/styles.ae6ff4a3.css
Normal file
BIN
assets/images/1-0d5e8450555296218deb0517b80440f3.png
Normal file
After Width: | Height: | Size: 117 KiB |
BIN
assets/images/1-2d6670430a11b01011e4c231ea594db1.png
Normal file
After Width: | Height: | Size: 98 KiB |
BIN
assets/images/10-b7987a0ff93705d5045057cbdaa2ede9.png
Normal file
After Width: | Height: | Size: 100 KiB |
BIN
assets/images/2-062e1e47a07f200ff3b1531a02812bc7.png
Normal file
After Width: | Height: | Size: 136 KiB |
BIN
assets/images/2-46bb7cc9cf739d97050c199eedced1a7.png
Normal file
After Width: | Height: | Size: 94 KiB |
Before Width: | Height: | Size: 71 KiB |
BIN
assets/images/3-2f5c483659f81d741809de6d095bd577.png
Normal file
After Width: | Height: | Size: 110 KiB |
BIN
assets/images/3-eea635f8cfe4a12ae649ceb6c984e0cd.png
Normal file
After Width: | Height: | Size: 27 KiB |
BIN
assets/images/4-63dc81954b1604cfa91f4c789da144a5.png
Normal file
After Width: | Height: | Size: 100 KiB |
BIN
assets/images/4-b4c3dbfa10b1997706bc271ca71e2ff5.png
Normal file
After Width: | Height: | Size: 160 KiB |
BIN
assets/images/5-8f10acd82b2f025abe57cb93d435a25f.png
Normal file
After Width: | Height: | Size: 136 KiB |
BIN
assets/images/5-ae210d26729cea1700924579adf2c44c.png
Normal file
After Width: | Height: | Size: 97 KiB |
BIN
assets/images/6-456ca1125f48947cf3c1c13722af95a0.png
Normal file
After Width: | Height: | Size: 18 KiB |
BIN
assets/images/6-f07e72ff0b4639453034c75b2e62faba.png
Normal file
After Width: | Height: | Size: 101 KiB |
BIN
assets/images/7-e0793eed6c42845d8ce4e3e79c1d44d8.png
Normal file
After Width: | Height: | Size: 97 KiB |
BIN
assets/images/8-3eb2ad63e4c40b6717ee4516223d73ed.png
Normal file
After Width: | Height: | Size: 113 KiB |
BIN
assets/images/9-630bd32c43e654f068e3c3bea79810e5.png
Normal file
After Width: | Height: | Size: 103 KiB |
After Width: | Height: | Size: 22 KiB |
After Width: | Height: | Size: 13 KiB |
After Width: | Height: | Size: 81 KiB |
After Width: | Height: | Size: 29 KiB |
After Width: | Height: | Size: 13 KiB |
After Width: | Height: | Size: 95 KiB |
After Width: | Height: | Size: 24 KiB |