speice.io/assets/js/3aab746c.9a48ca86.js


			
				
					
						
						
						
							
							
							/*
							 * Generated webpack chunk "1672" for speice.io: the compiled MDX blog post
							 * "On building high performance systems". The metadata in module 1403 names
							 * the source as @site/blog/2019-06-31-high-performance-systems/index.mdx, so
							 * content changes presumably belong there and not in this build output.
							 *
							 * The chunk is pushed onto self.webpackChunkspeice_io and registers four
							 * modules. Each is a function(e, n, t): e is the module object (module 1403
							 * assigns e.exports), n receives the export getters via t.d(n, {...}), and t
							 * loads other modules by id (t(1403), t(5893), t(65), t(7743)).
							 *
							 * 8204 - the post itself. Exposes assets (h), contentTitle (o, undefined),
							 *        default (d), frontMatter (i), metadata (a, from module 1403) and
							 *        toc (c, five level-2 headings).
							 *        l(e) builds the article with r.jsx / r.jsxs from module 5893
							 *        (presumably the React JSX runtime - confirm). Every tag is looked
							 *        up in a component map: plain tag names first, overridden by the
							 *        result of (0,s.a)() and then by e.components.
							 *        d(e) is the default export: it returns l(e), wrapped in the
							 *        `wrapper` component when (0,s.a)() or e.components supplies one.
							 * 7743 - exports Z, the URL t.p + "assets/images/kung-fu-...webp" used as
							 *        the src of the post's image.
							 * 65   - component-map context built on module 7294 (presumably React:
							 *        it uses createContext, useContext and useMemo - confirm).
							 *        a (i): reads the context and returns, memoized on [context, e],
							 *        e(context) when e is a function, otherwise {...context, ...e}.
							 *        Z (o): renders s.Provider around e.children. With
							 *        e.disableParentContext the value is e.components(r) when
							 *        e.components is a function, else e.components || r (r is the
							 *        empty default object); without it the value is i(e.components).
							 * 1403 - exports the post metadata parsed from an inline JSON string
							 *        (permalink, source, title, date, authors, frontMatter,
							 *        prevItem / nextItem, ...).
							 *
							 * NOTE(review): the article text contains two likely typos to fix in the
							 * MDX source: "the principal is the same" (probably "principle") and a
							 * doubled "a" in "I made a / a guide for Rust".
							 */
							"use strict";(self.webpackChunkspeice_io=self.webpackChunkspeice_io||[]).push([["1672"],{8204:function(e,n,t){t.r(n),t.d(n,{assets:function(){return h},contentTitle:function(){return o},default:function(){return d},frontMatter:function(){return i},metadata:function(){return a},toc:function(){return c}});var a=t(1403),r=t(5893),s=t(65);let i={slug:"2019/06/high-performance-systems",title:"On building high performance systems",date:new Date("2019-07-01T12:00:00.000Z"),last_updated:{date:new Date("2019-09-21T12:00:00.000Z")},authors:["bspeice"],tags:[]},o=void 0,h={authorsImageUrls:[void 0]},c=[{value:"Language-specific",id:"language-specific",level:2},{value:"Kernel",id:"kernel",level:2},{value:"Hardware",id:"hardware",level:2},{value:"Networks",id:"networks",level:2},{value:"Final Thoughts",id:"final-thoughts",level:2}];function l(e){let n={a:"a",blockquote:"blockquote",code:"code",em:"em",h2:"h2",img:"img",li:"li",p:"p",strong:"strong",ul:"ul",...(0,s.a)(),...e.components};return(0,r.jsxs)(r.Fragment,{children:[(0,r.jsx)(n.p,{children:"Prior to working in the trading industry, my assumption was that High Frequency Trading (HFT) is\nmade up of people who have access to secret techniques mortal developers could only dream of. There\nhad to be some secret art that could only be learned if one had an appropriately tragic backstory."}),"\n",(0,r.jsx)(n.p,{children:(0,r.jsx)(n.img,{alt:"Kung Fu fight",src:t(7743).Z+"",width:"426",height:"240"})}),"\n",(0,r.jsxs)(n.blockquote,{children:["\n",(0,r.jsx)(n.p,{children:"How I assumed HFT people learn their secret techniques"}),"\n"]}),"\n",(0,r.jsxs)(n.p,{children:["How else do you explain people working on systems that complete the round trip of market data in to\norders out (a.k.a. tick-to-trade) consistently within\n",(0,r.jsx)(n.a,{href:"https://stackoverflow.com/a/22082528/1454178",children:"750-800 nanoseconds"}),"? 
In roughly the time it takes a\ncomputer to access\n",(0,r.jsx)(n.a,{href:"https://people.eecs.berkeley.edu/~rcs/research/interactive_latency.html",children:"main memory 8 times"}),",\ntrading systems are capable of reading the market data packets, deciding what orders to send, doing\nrisk checks, creating new packets for exchange-specific protocols, and putting those packets on the\nwire."]}),"\n",(0,r.jsx)(n.p,{children:"Having now worked in the trading industry, I can confirm the developers aren't super-human; I've\nmade some simple mistakes at the very least. Instead, what shows up in public discussions is that\nphilosophy, not technique, separates high-performance systems from everything else.\nPerformance-critical systems don't rely on \"this one cool C++ optimization trick\" to make code fast\n(though micro-optimizations have their place); there's a lot more to worry about than just the code\nwritten for the project."}),"\n",(0,r.jsxs)(n.p,{children:["The framework I'd propose is this: ",(0,r.jsx)(n.strong,{children:"If you want to build high-performance systems, focus first on\nreducing performance variance"})," (reducing the gap between the fastest and slowest runs of the same\ncode), ",(0,r.jsx)(n.strong,{children:"and only look at average latency once variance is at an acceptable level"}),"."]}),"\n",(0,r.jsxs)(n.p,{children:["Don't get me wrong, I'm a much happier person when things are fast. Computer goes from booting in 20\nseconds down to 10 because I installed a solid-state drive? Awesome. But if every fifth day it takes\na full minute to boot because of corrupted sectors? Not so great. Average speed over the course of a\nweek is the same in each situation, but you're painfully aware of that minute when it happens. When\nit comes to code, the principal is the same: speeding up a function by an average of 10 milliseconds\ndoesn't mean much if there's a 100ms difference between your fastest and slowest runs. 
When\nperformance matters, you need to respond quickly ",(0,r.jsx)(n.em,{children:"every time"}),", not just in aggregate.\nHigh-performance systems should first optimize for time variance. Once you're consistent at the time\nscale you care about, then focus on improving average time."]}),"\n",(0,r.jsx)(n.p,{children:"This focus on variance shows up all the time in industry too (emphasis added in all quotes below):"}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsxs)(n.p,{children:["In ",(0,r.jsx)(n.a,{href:"https://business.nasdaq.com/market-tech/marketplaces/trading",children:"marketing materials"})," for\nNASDAQ's matching engine, the most performance-sensitive component of the exchange, dependability\nis highlighted in addition to instantaneous metrics:"]}),"\n",(0,r.jsxs)(n.blockquote,{children:["\n",(0,r.jsxs)(n.p,{children:["Able to ",(0,r.jsx)(n.strong,{children:"consistently sustain"})," an order rate of over 100,000 orders per second at sub-40\nmicrosecond average latency"]}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsxs)(n.p,{children:["The ",(0,r.jsx)(n.a,{href:"https://github.com/real-logic/aeron",children:"Aeron"})," message bus has this to say about performance:"]}),"\n",(0,r.jsxs)(n.blockquote,{children:["\n",(0,r.jsxs)(n.p,{children:["Performance is the key focus. Aeron is designed to be the highest throughput with the lowest and\n",(0,r.jsx)(n.strong,{children:"most predictable latency possible"})," of any messaging system"]}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsxs)(n.p,{children:["The company PolySync, which is working on autonomous vehicles,\n",(0,r.jsx)(n.a,{href:"https://polysync.io/blog/session-types-for-hearty-codecs/",children:"mentions why"})," they picked their\nspecific messaging format:"]}),"\n",(0,r.jsxs)(n.blockquote,{children:["\n",(0,r.jsxs)(n.p,{children:["In general, high performance is almost always desirable for serialization. 
But in the world of\nautonomous vehicles, ",(0,r.jsx)(n.strong,{children:"steady timing performance is even more important"})," than peak throughput.\nThis is because safe operation is sensitive to timing outliers. Nobody wants the system that\ndecides when to slam on the brakes to occasionally take 100 times longer than usual to encode\nits commands."]}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.li,{children:["\n",(0,r.jsxs)(n.p,{children:[(0,r.jsx)(n.a,{href:"https://solarflare.com/",children:"Solarflare"}),", which makes highly-specialized network hardware, points out\nvariance (jitter) as a big concern for\n",(0,r.jsx)(n.a,{href:"https://solarflare.com/electronic-trading/",children:"electronic trading"}),":"]}),"\n",(0,r.jsxs)(n.blockquote,{children:["\n",(0,r.jsxs)(n.p,{children:["The high stakes world of electronic trading, investment banks, market makers, hedge funds and\nexchanges demand the ",(0,r.jsx)(n.strong,{children:"lowest possible latency and jitter"})," while utilizing the highest\nbandwidth and return on their investment."]}),"\n"]}),"\n"]}),"\n"]}),"\n",(0,r.jsxs)(n.p,{children:["And to further clarify: we're not discussing ",(0,r.jsx)(n.em,{children:"total run-time"}),", but variance of total run-time. There\nare situations where it's not reasonably possible to make things faster, and you'd much rather be\nconsistent. For example, trading firms use\n",(0,r.jsx)(n.a,{href:"https://sniperinmahwah.wordpress.com/2017/06/07/network-effects-part-i/",children:"wireless networks"})," because\nthe speed of light through air is faster than through fiber-optic cables. 
There's still at ",(0,r.jsx)(n.em,{children:"absolute\nminimum"})," a ",(0,r.jsx)(n.a,{href:"http://tinyurl.com/y2vd7tn8",children:"~33.76 millisecond"})," delay required to send data between,\nsay,\n",(0,r.jsx)(n.a,{href:"https://www.theice.com/market-data/connectivity-and-feeds/wireless/tokyo-chicago",children:"Chicago and Tokyo"}),'.\nIf a trading system in Chicago calls the function for "send order to Tokyo" and waits to see if a\ntrade occurs, there\'s a physical limit to how long that will take. In this situation, the focus is\non keeping variance of ',(0,r.jsx)(n.em,{children:"additional processing"})," to a minimum, since speed of light is the limiting\nfactor."]}),"\n",(0,r.jsxs)(n.p,{children:["So how does one go about looking for and eliminating performance variance? To tell the truth, I\ndon't think a systematic answer or flow-chart exists. There's no substitute for (A) building a deep\nunderstanding of the entire technology stack, and (B) actually measuring system performance (though\n(C) watching a lot of ",(0,r.jsx)(n.a,{href:"https://www.youtube.com/channel/UCMlGfpWw-RUdWX_JbLCukXg",children:"CppCon"})," videos for\ninspiration never hurt). Even then, every project cares about performance to a different degree; you\nmay need to build an entire\n",(0,r.jsx)(n.a,{href:"https://www.youtube.com/watch?v=NH1Tta7purM&feature=youtu.be&t=3015",children:"replica production system"})," to\naccurately benchmark at nanosecond precision, or you may be content to simply\n",(0,r.jsx)(n.a,{href:"https://www.youtube.com/watch?v=BD9cRbxWQx8&feature=youtu.be&t=1335",children:"avoid garbage collection"})," in\nyour Java code."]}),"\n",(0,r.jsx)(n.p,{children:"Even though everyone has different needs, there are still common things to look for when trying to\nisolate and eliminate variance. 
In no particular order, these are my focus areas when thinking about\nhigh-performance systems:"}),"\n",(0,r.jsxs)(n.p,{children:[(0,r.jsx)(n.strong,{children:"Update 2019-09-21"}),": Added notes on ",(0,r.jsx)(n.code,{children:"isolcpus"})," and ",(0,r.jsx)(n.code,{children:"systemd"})," affinity."]}),"\n",(0,r.jsx)(n.h2,{id:"language-specific",children:"Language-specific"}),"\n",(0,r.jsxs)(n.p,{children:[(0,r.jsx)(n.strong,{children:"Garbage Collection"}),": How often does garbage collection happen? When is it triggered? What are the\nimpacts?"]}),"\n",(0,r.jsxs)(n.ul,{children:["\n",(0,r.jsxs)(n.li,{children:[(0,r.jsx)(n.a,{href:"https://rushter.com/blog/python-garbage-collector/",children:"In Python"}),", individual objects are collected\nif the reference count reaches 0, and each generation is collected if\n",(0,r.jsx)(n.code,{children:"num_alloc - num_dealloc > gc_threshold"})," whenever an allocation happens. The GIL is acquired for\nthe duration of generational collection."]}),"\n",(0,r.jsxs)(n.li,{children:["Java has\n",(0,r.jsx)(n.a,{href:"https://docs.oracle.com/en/java/javase/12/gctuning/parallel-collector1.html#GUID-DCDD6E46-0406-41D1-AB49-FB96A50EB9CE",children:"many"}),"\n",(0,r.jsx)(n.a,{href:"https://docs.oracle.com/en/java/javase/12/gctuning/garbage-first-garbage-collector.html#GUID-ED3AB6D3-FD9B-4447-9EDF-983ED2F7A573",children:"different"}),"\n",(0,r.jsx)(n.a,{href:"https://docs.oracle.com/en/java/javase/12/gctuning/garbage-first-garbage-collector-tuning.html#GUID-90E30ACA-8040-432E-B3A0-1E0440AB556A",children:"collection"}),"\n",(0,r.jsx)(n.a,{href:"https://docs.oracle.com/en/java/javase/12/gctuning/z-garbage-collector1.html#GUID-A5A42691-095E-47BA-B6DC-FB4E5FAA43D0",children:"algorithms"}),"\nto choose from, each with different characteristics. 
The default algorithms (Parallel GC in Java\n8, G1 in Java 9) freeze the JVM while collecting, while more recent algorithms\n(",(0,r.jsx)(n.a,{href:"https://wiki.openjdk.java.net/display/zgc",children:"ZGC"})," and\n",(0,r.jsx)(n.a,{href:"https://wiki.openjdk.java.net/display/shenandoah",children:"Shenandoah"}),') are designed to keep "stop the\nworld" to a minimum by doing collection work in parallel.']}),"\n"]}),"\n",(0,r.jsxs)(n.p,{children:[(0,r.jsx)(n.strong,{children:"Allocation"}),': Every language has a different way of interacting with "heap" memory, but the\nprinciple is the same: running the allocator to allocate/deallocate memory takes time that can often\nbe put to better use. Understanding when your language interacts with the allocator is crucial, and\nnot always obvious. For example: C++ and Rust don\'t allocate heap memory for iterators, but Java\ndoes (meaning potential GC pauses). Take time to understand heap behavior (I made a\n',(0,r.jsx)(n.a,{href:"/2019/02/understanding-allocations-in-rust",children:"a guide for Rust"}),"), and look into alternative\nallocators (",(0,r.jsx)(n.a,{href:"http://jemalloc.net/",children:"jemalloc"}),",\n",(0,r.jsx)(n.a,{href:"https://gperftools.github.io/gperftools/tcmalloc.html",children:"tcmalloc"}),") that might run faster than the\noperating system default."]}),"\n",(0,r.jsxs)(n.p,{children:[(0,r.jsx)(n.strong,{children:"Data Layout"}),": How your data is arranged in memory matters;\n",(0,r.jsx)(n.a,{href:"https://www.youtube.com/watch?v=yy8jQgmhbAU",children:"data-oriented design"})," and\n",(0,r.jsx)(n.a,{href:"https://www.youtube.com/watch?v=2EWejmkKlxs&feature=youtu.be&t=1185",children:"cache locality"})," can have huge\nimpacts on performance. The C family of languages (C, value types in C#, C++) and Rust all have\nguarantees about the shape every object takes in memory that others (e.g. Java and Python) can't\nmake. 
",(0,r.jsx)(n.a,{href:"http://valgrind.org/docs/manual/cg-manual.html",children:"Cachegrind"})," and kernel\n",(0,r.jsx)(n.a,{href:"https://perf.wiki.kernel.org/index.php/Main_Page",children:"perf"})," counters are both great for understanding\nhow performance relates to memory layout."]}),"\n",(0,r.jsxs)(n.p,{children:[(0,r.jsx)(n.strong,{children:"Just-In-Time Compilation"}),": Languages that are compiled on the fly (LuaJIT, C#, Java, PyPy) are\ngreat because they optimize your program for how it's actually being used, rather than how a\ncompiler expects it to be used. However, there's a variance problem if the program stops executing\nwhile waiting for translation from VM bytecode to native code. As a remedy, many languages support\nahead-of-time compilation in addition to the JIT versions\n(",(0,r.jsx)(n.a,{href:"https://github.com/dotnet/corert",children:"CoreRT"})," in C# and ",(0,r.jsx)(n.a,{href:"https://www.graalvm.org/",children:"GraalVM"})," in Java).\nOn the other hand, LLVM supports\n",(0,r.jsx)(n.a,{href:"https://clang.llvm.org/docs/UsersManual.html#profile-guided-optimization",children:"Profile Guided Optimization"}),",\nwhich theoretically brings JIT benefits to non-JIT languages. Finally, be careful to avoid comparing\napples and oranges during benchmarks; you don't want your code to suddenly speed up because the JIT\ncompiler kicked in."]}),"\n",(0,r.jsxs)(n.p,{children:[(0,r.jsx)(n.strong,{children:"Programming Tricks"}),": These won't make or break performance, but can be useful in specific\ncircumstances. For example, C++ can use\n",(0,r.jsx)(n.a,{href:"https://www.youtube.com/watch?v=NH1Tta7purM&feature=youtu.be&t=1206",children:"templates instead of branches"}),"\nin critical sections."]}),"\n",(0,r.jsx)(n.h2,{id:"kernel",children:"Kernel"}),"\n",(0,r.jsxs)(n.p,{children:["Code you wrote is almost certainly not the ",(0,r.jsx)(n.em,{children:"only"})," code running on your hardware. 
There are many ways\nthe operating system interacts with your program, from interrupts to system calls, that are\nimportant to watch for. These are written from a Linux perspective, but Windows does typically have\nequivalent functionality."]}),"\n",(0,r.jsxs)(n.p,{children:[(0,r.jsx)(n.strong,{children:"Scheduling"}),": The kernel is normally free to schedule any process on any core, so it's important\nto reserve CPU cores exclusively for the important programs. There are a few parts to this: first,\nlimit the CPU cores that non-critical processes are allowed to run on by excluding cores from\nscheduling\n(",(0,r.jsx)(n.a,{href:"https://www.linuxtopia.org/online_books/linux_kernel/kernel_configuration/re46.html",children:(0,r.jsx)(n.code,{children:"isolcpus"})}),"\nkernel command-line option), or by setting the ",(0,r.jsx)(n.code,{children:"init"})," process CPU affinity\n(",(0,r.jsxs)(n.a,{href:"https://access.redhat.com/solutions/2884991",children:[(0,r.jsx)(n.code,{children:"systemd"})," example"]}),"). Second, set critical processes\nto run on the isolated cores by setting the\n",(0,r.jsx)(n.a,{href:"https://en.wikipedia.org/wiki/Processor_affinity",children:"processor affinity"})," using\n",(0,r.jsx)(n.a,{href:"https://linux.die.net/man/1/taskset",children:"taskset"}),". Finally, use\n",(0,r.jsx)(n.a,{href:"https://github.com/torvalds/linux/blob/master/Documentation/timers/NO_HZ.txt",children:(0,r.jsx)(n.code,{children:"NO_HZ"})})," or\n",(0,r.jsx)(n.a,{href:"https://linux.die.net/man/1/chrt",children:(0,r.jsx)(n.code,{children:"chrt"})})," to disable scheduling interrupts. Turning off\nhyper-threading is also likely beneficial."]}),"\n",(0,r.jsxs)(n.p,{children:[(0,r.jsx)(n.strong,{children:"System calls"}),": Reading from a UNIX socket? Writing to a file? In addition to not knowing how long\nthe I/O operation takes, these all trigger expensive\n",(0,r.jsx)(n.a,{href:"https://en.wikipedia.org/wiki/System_call",children:"system calls (syscalls)"}),". 
To handle these, the CPU must\n",(0,r.jsx)(n.a,{href:"https://en.wikipedia.org/wiki/Context_switch",children:"context switch"})," to the kernel, let the kernel\noperation complete, then context switch back to your program. We'd rather keep these\n",(0,r.jsx)(n.a,{href:"https://www.destroyallsoftware.com/talks/the-birth-and-death-of-javascript",children:"to a minimum"})," (see\ntimestamp 18:20). ",(0,r.jsx)(n.a,{href:"https://linux.die.net/man/1/strace",children:"Strace"})," is your friend for understanding when\nand where syscalls happen."]}),"\n",(0,r.jsxs)(n.p,{children:[(0,r.jsx)(n.strong,{children:"Signal Handling"}),": Far less likely to be an issue, but signals do trigger a context switch if your\ncode has a handler registered. This will be highly dependent on the application, but you can\n",(0,r.jsx)(n.a,{href:"https://www.linuxprogrammingblog.com/all-about-linux-signals?page=show#Blocking_signals",children:"block signals"}),"\nif it's an issue."]}),"\n",(0,r.jsxs)(n.p,{children:[(0,r.jsx)(n.strong,{children:"Interrupts"}),": System interrupts are how devices connected to your computer notify the CPU that\nsomething has happened. The CPU will then choose a processor core to pause and context switch to the\nOS to handle the interrupt. Make sure that\n",(0,r.jsx)(n.a,{href:"http://www.alexonlinux.com/smp-affinity-and-proper-interrupt-handling-in-linux",children:"SMP affinity"})," is\nset so that interrupts are handled on a CPU core not running the program you care about."]}),"\n",(0,r.jsxs)(n.p,{children:[(0,r.jsx)(n.strong,{children:(0,r.jsx)(n.a,{href:"https://www.kernel.org/doc/html/latest/vm/numa.html",children:"NUMA"})}),": While NUMA is good at making\nmulti-cell systems transparent, there are variance implications; if the kernel moves a process\nacross nodes, future memory accesses must wait for the controller on the original node. 
Use\n",(0,r.jsx)(n.a,{href:"https://linux.die.net/man/8/numactl",children:"numactl"})," to handle memory-/cpu-cell pinning so this doesn't\nhappen."]}),"\n",(0,r.jsx)(n.h2,{id:"hardware",children:"Hardware"}),"\n",(0,r.jsxs)(n.p,{children:[(0,r.jsx)(n.strong,{children:"CPU Pipelining/Speculation"}),": Speculative execution in modern processors gave us vulnerabilities\nlike Spectre, but it also gave us performance improvements like\n",(0,r.jsx)(n.a,{href:"https://stackoverflow.com/a/11227902/1454178",children:"branch prediction"}),". And if the CPU mis-speculates\nyour code, there's variance associated with rewind and replay. While the compiler knows a lot about\nhow your CPU ",(0,r.jsx)(n.a,{href:"https://youtu.be/nAbCKa0FzjQ?t=4467",children:"pipelines instructions"}),", code can be\n",(0,r.jsx)(n.a,{href:"https://www.youtube.com/watch?v=NH1Tta7purM&feature=youtu.be&t=755",children:"structured to help"})," the branch\npredictor."]}),"\n",(0,r.jsxs)(n.p,{children:[(0,r.jsx)(n.strong,{children:"Paging"}),": For most systems, virtual memory is incredible. Applications live in their own worlds,\nand the CPU/",(0,r.jsx)(n.a,{href:"https://en.wikipedia.org/wiki/Memory_management_unit",children:"MMU"})," figures out the details.\nHowever, there's a variance penalty associated with memory paging and caching; if you access more\nmemory pages than the ",(0,r.jsx)(n.a,{href:"https://en.wikipedia.org/wiki/Translation_lookaside_buffer",children:"TLB"})," can store,\nyou'll have to wait for the page walk. Kernel perf tools are necessary to figure out if this is an\nissue, but using ",(0,r.jsx)(n.a,{href:"https://blog.pythian.com/performance-tuning-hugepages-in-linux/",children:"huge pages"})," can\nreduce TLB burdens. 
Alternately, running applications in a hypervisor like\n",(0,r.jsx)(n.a,{href:"https://github.com/siemens/jailhouse",children:"Jailhouse"})," allows one to skip virtual memory entirely, but\nthis is probably more work than the benefits are worth."]}),"\n",(0,r.jsxs)(n.p,{children:[(0,r.jsx)(n.strong,{children:"Network Interfaces"}),": When more than one computer is involved, variance can go up dramatically.\nTuning kernel\n",(0,r.jsx)(n.a,{href:"https://github.com/leandromoreira/linux-network-performance-parameters",children:"network parameters"})," may be\nhelpful, but modern systems more frequently opt to skip the kernel altogether with a technique\ncalled ",(0,r.jsx)(n.a,{href:"https://blog.cloudflare.com/kernel-bypass/",children:"kernel bypass"}),". This typically requires\nspecialized hardware and ",(0,r.jsx)(n.a,{href:"https://www.openonload.org/",children:"drivers"}),", but even industries like\n",(0,r.jsx)(n.a,{href:"https://www.bbc.co.uk/rd/blog/2018-04-high-speed-networking-open-source-kernel-bypass",children:"telecom"})," are\nfinding the benefits."]}),"\n",(0,r.jsx)(n.h2,{id:"networks",children:"Networks"}),"\n",(0,r.jsxs)(n.p,{children:[(0,r.jsx)(n.strong,{children:"Routing"}),": There's a reason financial firms are willing to pay\n",(0,r.jsx)(n.a,{href:"https://sniperinmahwah.wordpress.com/2019/03/26/4-les-moeres-english-version/",children:"millions of euros"}),"\nfor rights to a small plot of land - having a straight-line connection from point A to point B means\nthe path their data takes is the shortest possible. In contrast, there are currently 6 computers in\nbetween me and Google, but that may change at any moment if my ISP realizes a\n",(0,r.jsx)(n.a,{href:"https://en.wikipedia.org/wiki/Border_Gateway_Protocol",children:"more efficient route"})," is available. 
Whether\nit's using\n",(0,r.jsx)(n.a,{href:"https://sniperinmahwah.wordpress.com/2018/05/07/shortwave-trading-part-i-the-west-chicago-tower-mystery/",children:"research-quality equipment"}),"\nfor shortwave radio, or just making sure there's no data inadvertently going between data centers,\nrouting matters."]}),"\n",(0,r.jsxs)(n.p,{children:[(0,r.jsx)(n.strong,{children:"Protocol"}),": TCP as a network protocol is awesome: guaranteed and in-order delivery, flow control,\nand congestion control all built in. But these attributes make the most sense when networking\ninfrastructure is lossy; for systems that expect nearly all packets to be delivered correctly, the\nsetup handshaking and packet acknowledgment are just overhead. Using UDP (unicast or multicast) may\nmake sense in these contexts as it avoids the chatter needed to track connection state, and\n",(0,r.jsx)(n.a,{href:"https://iextrading.com/docs/IEX%20Transport%20Specification.pdf",children:"gap-fill"}),"\n",(0,r.jsx)(n.a,{href:"http://www.nasdaqtrader.com/content/technicalsupport/specifications/dataproducts/moldudp64.pdf",children:"strategies"}),"\ncan handle the rest."]}),"\n",(0,r.jsxs)(n.p,{children:[(0,r.jsx)(n.strong,{children:"Switching"}),': Many routers/switches handle packets using "store-and-forward" behavior: wait for the\nwhole packet, validate checksums, and then send to the next device. In variance terms, the time\nneeded to move data between two nodes is proportional to the size of that data; the switch must\n"store" all data before it can calculate checksums and "forward" to the next node. With\n',(0,r.jsx)(n.a,{href:"https://www.networkworld.com/article/2241573/latency-and-jitter--cut-through-design-pays-off-for-arista--blade.html",children:'"cut-through"'}),"\ndesigns, switches will begin forwarding data as soon as they know where the destination is,\nchecksums be damned. 
This means there's a fixed cost (at the switch) for network traffic, no matter\nthe size."]}),"\n",(0,r.jsx)(n.h2,{id:"final-thoughts",children:"Final Thoughts"}),"\n",(0,r.jsxs)(n.p,{children:["High-performance systems, regardless of industry, are not magical. They do require extreme precision\nand attention to detail, but they're designed, built, and operated by regular people, using a lot of\ntools that are publicly available. Interested in seeing how context switching affects performance of\nyour benchmarks? ",(0,r.jsx)(n.code,{children:"taskset"})," should be installed in all modern Linux distributions, and can be used to\nmake sure the OS never migrates your process. Curious how often garbage collection triggers during a\ncrucial operation? Your language of choice will typically expose details of its operations\n(",(0,r.jsx)(n.a,{href:"https://docs.python.org/3/library/gc.html",children:"Python"}),",\n",(0,r.jsx)(n.a,{href:"https://www.oracle.com/technetwork/java/javase/tech/vmoptions-jsp-140102.html#DebuggingOptions",children:"Java"}),").\nWant to know how hard your program is stressing the TLB? Use ",(0,r.jsx)(n.code,{children:"perf record"})," and look for\n",(0,r.jsx)(n.code,{children:"dtlb_load_misses.miss_causes_a_walk"}),"."]}),"\n",(0,r.jsxs)(n.p,{children:["Two final guiding questions, then: first, before attempting to apply some of the technology above to\nyour own systems, can you first identify\n",(0,r.jsx)(n.a,{href:"http://wiki.c2.com/?PrematureOptimization",children:"where/when you care"})," about \"high-performance\"? As an\nexample, if parts of a system rely on humans pushing buttons, CPU pinning won't have any measurable\neffect. Humans are already far too slow to react in time. Second, if you're using benchmarks, are\nthey being designed in a way that's actually helpful? 
Tools like\n",(0,r.jsx)(n.a,{href:"http://www.serpentine.com/criterion/",children:"Criterion"})," (also in\n",(0,r.jsx)(n.a,{href:"https://github.com/bheisler/criterion.rs",children:"Rust"}),") and Google's\n",(0,r.jsx)(n.a,{href:"https://github.com/google/benchmark",children:"Benchmark"})," output not only average run time, but variance as\nwell; your benchmarking environment is subject to the same concerns your production environment is."]}),"\n",(0,r.jsx)(n.p,{children:"Finally, I believe high-performance systems are a matter of philosophy, not necessarily technique.\nRigorous focus on variance is the first step, and there are plenty of ways to measure and mitigate\nit; once that's at an acceptable level, then optimize for speed."})]})}function d(e={}){let{wrapper:n}={...(0,s.a)(),...e.components};return n?(0,r.jsx)(n,{...e,children:(0,r.jsx)(l,{...e})}):l(e)}},7743:function(e,n,t){t.d(n,{Z:function(){return a}});let a=t.p+"assets/images/kung-fu-5715f30eef7bf3aaa26770b1247024dc.webp"},65:function(e,n,t){t.d(n,{Z:function(){return o},a:function(){return i}});var a=t(7294);let r={},s=a.createContext(r);function i(e){let n=a.useContext(s);return a.useMemo(function(){return"function"==typeof e?e(n):{...n,...e}},[n,e])}function o(e){let n;return n=e.disableParentContext?"function"==typeof e.components?e.components(r):e.components||r:i(e.components),a.createElement(s.Provider,{value:n},e.children)}},1403:function(e){e.exports=JSON.parse('{"permalink":"/2019/06/high-performance-systems","source":"@site/blog/2019-06-31-high-performance-systems/index.mdx","title":"On building high performance systems","description":"Prior to working in the trading industry, my assumption was that High Frequency Trading (HFT) is","date":"2019-07-01T12:00:00.000Z","tags":[],"readingTime":12.175,"hasTruncateMarker":true,"authors":[{"name":"Bradlee 
Speice","socials":{"github":"https://github.com/bspeice"},"key":"bspeice","page":null}],"frontMatter":{"slug":"2019/06/high-performance-systems","title":"On building high performance systems","date":"2019-07-01T12:00:00.000Z","last_updated":{"date":"2019-09-21T12:00:00.000Z"},"authors":["bspeice"],"tags":[]},"unlisted":false,"lastUpdatedAt":1731207625000,"prevItem":{"title":"Binary format shootout","permalink":"/2019/09/binary-format-shootout"},"nextItem":{"title":"Making bread","permalink":"/2019/05/making-bread"}}')}}]);
						
						
					
				
				
					
						Reference in New Issue
					
					View Git Blame
					Copy Permalink