Add the SBE runner

Works for the test dataset, now time to make sure everyone agrees.
master
Bradlee Speice 2019-09-04 23:47:10 -04:00
parent df5198993b
commit e58861458e
7 changed files with 525 additions and 429 deletions

View File

@ -59,23 +59,26 @@
--> -->
<field name="timestamp" id="3" type="int64"/> <field name="timestamp" id="3" type="int64"/>
<!-- <!--
SBE specifically doesn't have "union" types, so we include both a `trade` and `quote` SBE specifically doesn't have "union" types, so we include a type tag, and both
here and the tag to identify. `trade` and `quote` as "optional". This style was chosen to approximate how
Cap'n Proto and Flatbuffers do it:
https://github.com/real-logic/simple-binary-encoding/issues/232 https://github.com/real-logic/simple-binary-encoding/issues/232
In the future, there are a couple options: However, space is actually reserved for *both* `trade` and `quote` in the message;
1. Create a "payload header" type and promote "trade" and "quote" to <sbe:message>, that is, the payload size is the same if neither, one, or both are filled.
since the SBE message header is already able to distinguish between message types Other ways you can try to emulate unions:
2. Create a "group" for each message type, and just include no elements of that type. 1. Use a "payload header" composite type and promote "trade" and "quote" to <sbe:message>;
Uses a `u16` for each message type, instead of padding bytes for the message size. SBE can distinguish message types based on the SBE header.
3. Split up the message components and use "session types" to chain things together, 2. Create a "group" for each message type; adds an extra `u16` per type, but overall payload
as in https://polysync.io/blog/session-types-for-hearty-codecs size goes down because we no longer reserve space that is potentially unused.
3. Split up the message components into individual <composite/> blocks, and write
For now, this message format was chosen to approximate the schemas of Cap'n Proto and Flatbuffers a state machine (by hand) to chain the blocks. For a better explanation,
see "session types" in:
https://polysync.io/blog/session-types-for-hearty-codecs
--> -->
<field name="msg_type" id="4" type="MsgType"/> <field name="msg_type" id="4" type="MsgType"/>
<field name="trade" id="5" type="Trade"/> <field name="trade" id="5" type="Trade" presence="optional"/>
<field name="quote" id="6" type="Quote"/> <field name="quote" id="6" type="Quote" presence="optional"/>
<data name="symbol" id="100" type="varAsciiEncoding"/> <data name="symbol" id="100" type="varAsciiEncoding"/>
</group> </group>
</sbe:message> </sbe:message>

View File

@ -1,2 +0,0 @@
// This file is needed for tests outside the main source tree to find the project files
pub mod marketdata_capnp;

View File

@ -17,9 +17,11 @@ use crate::iex::IexParser;
pub mod marketdata_capnp; pub mod marketdata_capnp;
#[allow(unused_imports)] #[allow(unused_imports)]
pub mod marketdata_generated; // Flatbuffers pub mod marketdata_generated; // Flatbuffers
pub mod marketdata_sbe;
mod capnp_runner; mod capnp_runner;
mod flatbuffers_runner; mod flatbuffers_runner;
mod sbe_runner;
mod iex; mod iex;
mod parsers; mod parsers;
@ -81,6 +83,7 @@ fn main() {
} }
*/ */
/*
let mut capnp_writer = capnp_runner::CapnpWriter::new(); let mut capnp_writer = capnp_runner::CapnpWriter::new();
for iex_payload in parser { for iex_payload in parser {
//let iex_payload = parser.next().unwrap(); //let iex_payload = parser.next().unwrap();
@ -93,6 +96,20 @@ fn main() {
while let Ok(_) = capnp_reader.deserialize_unpacked(&mut read_buf, &mut summarizer) { while let Ok(_) = capnp_reader.deserialize_unpacked(&mut read_buf, &mut summarizer) {
parsed_msgs += 1; parsed_msgs += 1;
} }
*/
let mut sbe_writer = sbe_runner::SBEWriter::new();
for iex_payload in parser {
//let iex_payload = parser.next().unwrap();
sbe_writer.serialize(&iex_payload, &mut output_buf);
}
let sbe_reader = sbe_runner::SBEReader::new();
let mut read_buf = StreamVec::new(output_buf);
let mut parsed_msgs: u64 = 0;
while let Ok(_) = sbe_reader.deserialize(&mut read_buf, &mut summarizer) {
parsed_msgs += 1;
}
dbg!(parsed_msgs); dbg!(parsed_msgs);
dbg!(summarizer); dbg!(summarizer);

View File

@ -188,7 +188,7 @@ impl<'d> ScratchEncoderData<'d> {
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
pub enum Either<L, R> { pub enum Either<L, R> {
Left(L), Left(L),
Right(R), Right(R)
} }
/// Enum Side /// Enum Side
@ -199,7 +199,6 @@ pub enum Side {
Sell = 1u8, Sell = 1u8,
NullVal = 255u8, NullVal = 255u8,
} }
impl Default for Side { impl Default for Side {
fn default() -> Self { Side::NullVal } fn default() -> Self { Side::NullVal }
} }
@ -212,7 +211,6 @@ pub enum MsgType {
Quote = 1u8, Quote = 1u8,
NullVal = 255u8, NullVal = 255u8,
} }
impl Default for MsgType { impl Default for MsgType {
fn default() -> Self { MsgType::NullVal } fn default() -> Self { MsgType::NullVal }
} }
@ -293,14 +291,12 @@ impl MultiMessageFields {}
pub struct MultiMessageMessageHeader { pub struct MultiMessageMessageHeader {
pub message_header: MessageHeader pub message_header: MessageHeader
} }
impl MultiMessageMessageHeader { impl MultiMessageMessageHeader {
pub const BLOCK_LENGTH: u16 = 8; pub const BLOCK_LENGTH: u16 = 8;
pub const TEMPLATE_ID: u16 = 1; pub const TEMPLATE_ID: u16 = 1;
pub const SCHEMA_ID: u16 = 1; pub const SCHEMA_ID: u16 = 1;
pub const VERSION: u16 = 0; pub const VERSION: u16 = 0;
} }
impl Default for MultiMessageMessageHeader { impl Default for MultiMessageMessageHeader {
fn default() -> MultiMessageMessageHeader { fn default() -> MultiMessageMessageHeader {
MultiMessageMessageHeader { MultiMessageMessageHeader {
@ -330,7 +326,6 @@ impl MultiMessageMessagesMember {}
pub struct MultiMessageDecoderDone<'d> { pub struct MultiMessageDecoderDone<'d> {
scratch: ScratchDecoderData<'d>, scratch: ScratchDecoderData<'d>,
} }
impl<'d> MultiMessageDecoderDone<'d> { impl<'d> MultiMessageDecoderDone<'d> {
/// Returns the number of bytes decoded /// Returns the number of bytes decoded
pub fn unwrap(self) -> usize { pub fn unwrap(self) -> usize {
@ -346,7 +341,6 @@ impl<'d> MultiMessageDecoderDone<'d> {
pub struct MultiMessageMessagesSymbolDecoder<'d> { pub struct MultiMessageMessagesSymbolDecoder<'d> {
parent: MultiMessageMessagesMemberDecoder<'d>, parent: MultiMessageMessagesMemberDecoder<'d>,
} }
impl<'d> MultiMessageMessagesSymbolDecoder<'d> { impl<'d> MultiMessageMessagesSymbolDecoder<'d> {
fn wrap(parent: MultiMessageMessagesMemberDecoder<'d>) -> Self { fn wrap(parent: MultiMessageMessagesMemberDecoder<'d>) -> Self {
MultiMessageMessagesSymbolDecoder { parent: parent } MultiMessageMessagesSymbolDecoder { parent: parent }
@ -388,11 +382,9 @@ impl<'d> MultiMessageMessagesMemberDecoder<'d> {
} }
} }
} }
pub struct MultiMessageMessagesHeaderDecoder<'d> { pub struct MultiMessageMessagesHeaderDecoder<'d> {
scratch: ScratchDecoderData<'d>, scratch: ScratchDecoderData<'d>,
} }
impl<'d> MultiMessageMessagesHeaderDecoder<'d> { impl<'d> MultiMessageMessagesHeaderDecoder<'d> {
fn wrap(scratch: ScratchDecoderData<'d>) -> Self { fn wrap(scratch: ScratchDecoderData<'d>) -> Self {
MultiMessageMessagesHeaderDecoder { scratch: scratch } MultiMessageMessagesHeaderDecoder { scratch: scratch }
@ -411,7 +403,6 @@ impl<'d> MultiMessageMessagesHeaderDecoder<'d> {
pub struct MultiMessageFieldsDecoder<'d> { pub struct MultiMessageFieldsDecoder<'d> {
scratch: ScratchDecoderData<'d>, scratch: ScratchDecoderData<'d>,
} }
impl<'d> MultiMessageFieldsDecoder<'d> { impl<'d> MultiMessageFieldsDecoder<'d> {
pub fn wrap(scratch: ScratchDecoderData<'d>) -> MultiMessageFieldsDecoder<'d> { pub fn wrap(scratch: ScratchDecoderData<'d>) -> MultiMessageFieldsDecoder<'d> {
MultiMessageFieldsDecoder { scratch: scratch } MultiMessageFieldsDecoder { scratch: scratch }
@ -426,7 +417,6 @@ impl<'d> MultiMessageFieldsDecoder<'d> {
pub struct MultiMessageMessageHeaderDecoder<'d> { pub struct MultiMessageMessageHeaderDecoder<'d> {
scratch: ScratchDecoderData<'d>, scratch: ScratchDecoderData<'d>,
} }
impl<'d> MultiMessageMessageHeaderDecoder<'d> { impl<'d> MultiMessageMessageHeaderDecoder<'d> {
pub fn wrap(scratch: ScratchDecoderData<'d>) -> MultiMessageMessageHeaderDecoder<'d> { pub fn wrap(scratch: ScratchDecoderData<'d>) -> MultiMessageMessageHeaderDecoder<'d> {
MultiMessageMessageHeaderDecoder { scratch: scratch } MultiMessageMessageHeaderDecoder { scratch: scratch }
@ -446,7 +436,6 @@ pub fn start_decoding_multi_message<'d>(data: &'d [u8]) -> MultiMessageMessageHe
pub struct MultiMessageEncoderDone<'d> { pub struct MultiMessageEncoderDone<'d> {
scratch: ScratchEncoderData<'d>, scratch: ScratchEncoderData<'d>,
} }
impl<'d> MultiMessageEncoderDone<'d> { impl<'d> MultiMessageEncoderDone<'d> {
/// Returns the number of bytes encoded /// Returns the number of bytes encoded
pub fn unwrap(self) -> usize { pub fn unwrap(self) -> usize {
@ -462,7 +451,6 @@ impl<'d> MultiMessageEncoderDone<'d> {
pub struct MultiMessageMessagesSymbolEncoder<'d> { pub struct MultiMessageMessagesSymbolEncoder<'d> {
parent: MultiMessageMessagesMemberEncoder<'d>, parent: MultiMessageMessagesMemberEncoder<'d>,
} }
impl<'d> MultiMessageMessagesSymbolEncoder<'d> { impl<'d> MultiMessageMessagesSymbolEncoder<'d> {
fn wrap(parent: MultiMessageMessagesMemberEncoder<'d>) -> Self { fn wrap(parent: MultiMessageMessagesMemberEncoder<'d>) -> Self {
MultiMessageMessagesSymbolEncoder { parent: parent } MultiMessageMessagesSymbolEncoder { parent: parent }
@ -470,7 +458,7 @@ impl<'d> MultiMessageMessagesSymbolEncoder<'d> {
pub fn symbol(mut self, s: &'d [u8]) -> CodecResult<MultiMessageMessagesMemberEncoder> { pub fn symbol(mut self, s: &'d [u8]) -> CodecResult<MultiMessageMessagesMemberEncoder> {
let l = s.len(); let l = s.len();
if l > 4294967294 { if l > 4294967294 {
return Err(CodecErr::SliceIsLongerThanAllowedBySchema); return Err(CodecErr::SliceIsLongerThanAllowedBySchema)
} }
// Write data length // Write data length
self.parent.scratch.write_type::<u32>(&(l as u32), 4)?; // group length self.parent.scratch.write_type::<u32>(&(l as u32), 4)?; // group length
@ -508,11 +496,9 @@ impl<'d> MultiMessageMessagesMemberEncoder<'d> {
Ok(MultiMessageEncoderDone::wrap(self.scratch)) Ok(MultiMessageEncoderDone::wrap(self.scratch))
} }
} }
pub struct MultiMessageMessagesHeaderEncoder<'d> { pub struct MultiMessageMessagesHeaderEncoder<'d> {
scratch: ScratchEncoderData<'d>, scratch: ScratchEncoderData<'d>,
} }
impl<'d> MultiMessageMessagesHeaderEncoder<'d> { impl<'d> MultiMessageMessagesHeaderEncoder<'d> {
#[inline] #[inline]
fn wrap(scratch: ScratchEncoderData<'d>) -> Self { fn wrap(scratch: ScratchEncoderData<'d>) -> Self {
@ -531,7 +517,6 @@ impl<'d> MultiMessageMessagesHeaderEncoder<'d> {
pub struct MultiMessageFieldsEncoder<'d> { pub struct MultiMessageFieldsEncoder<'d> {
scratch: ScratchEncoderData<'d>, scratch: ScratchEncoderData<'d>,
} }
impl<'d> MultiMessageFieldsEncoder<'d> { impl<'d> MultiMessageFieldsEncoder<'d> {
pub fn wrap(scratch: ScratchEncoderData<'d>) -> MultiMessageFieldsEncoder<'d> { pub fn wrap(scratch: ScratchEncoderData<'d>) -> MultiMessageFieldsEncoder<'d> {
MultiMessageFieldsEncoder { scratch: scratch } MultiMessageFieldsEncoder { scratch: scratch }
@ -556,7 +541,6 @@ impl<'d> MultiMessageFieldsEncoder<'d> {
pub struct MultiMessageMessageHeaderEncoder<'d> { pub struct MultiMessageMessageHeaderEncoder<'d> {
scratch: ScratchEncoderData<'d>, scratch: ScratchEncoderData<'d>,
} }
impl<'d> MultiMessageMessageHeaderEncoder<'d> { impl<'d> MultiMessageMessageHeaderEncoder<'d> {
pub fn wrap(scratch: ScratchEncoderData<'d>) -> MultiMessageMessageHeaderEncoder<'d> { pub fn wrap(scratch: ScratchEncoderData<'d>) -> MultiMessageMessageHeaderEncoder<'d> {
MultiMessageMessageHeaderEncoder { scratch: scratch } MultiMessageMessageHeaderEncoder { scratch: scratch }

144
src/sbe_runner.rs Normal file
View File

@ -0,0 +1,144 @@
use std::io::{BufRead, Write};
use std::str::from_utf8_unchecked;
use nom::bytes::complete::take_until;
use nom::IResult;
use crate::{marketdata_sbe, StreamVec, Summarizer};
use crate::iex::{IexMessage, IexPayload};
use crate::marketdata_sbe::{Either, MultiMessageFields, MultiMessageMessageHeader, MultiMessageMessagesMember, MultiMessageMessagesMemberEncoder, MultiMessageMessagesSymbolEncoder, Side, start_decoding_multi_message, start_encoding_multi_message};
fn __take_until<'a>(tag: &'static str, input: &'a [u8]) -> IResult<&'a [u8], &'a [u8]> {
take_until(tag)(input)
}
fn parse_symbol(sym: &[u8; 8]) -> &str {
// TODO: Use the `jetscii` library for all that SIMD goodness
// IEX guarantees ASCII, so we're fine using an unsafe conversion
let (_, sym_bytes) = __take_until(" ", &sym[..]).unwrap();
unsafe { from_utf8_unchecked(sym_bytes) }
}
pub struct SBEWriter {
/// Buffer to construct messages before copying. While SBE benefits
/// from easily being able to create messages directly in output buffer,
/// we'll construct in a scratch buffer and then copy to more fairly
/// benchmark against Cap'n Proto and Flatbuffers.
scratch_buffer: Vec<u8>,
default_header: MultiMessageMessageHeader,
}
impl SBEWriter {
pub fn new() -> SBEWriter {
SBEWriter {
// 8K scratch buffer is *way* more than necessary,
// but we don't want to run into issues with not enough
// data to encode messages
scratch_buffer: vec![0; 1024 * 8],
default_header: MultiMessageMessageHeader::default(),
}
}
pub fn serialize(&mut self, payload: &IexPayload, output: &mut Vec<u8>) {
let (fields, encoder) = start_encoding_multi_message(&mut self.scratch_buffer[..])
.header_copy(&self.default_header.message_header).unwrap()
.multi_message_fields().unwrap();
fields.sequence_number = payload.first_seq_no;
let mut encoder = encoder.messages_individually().unwrap();
let mut encoder: MultiMessageMessagesMemberEncoder = payload.messages.iter().fold(encoder, |enc, m| {
match m {
IexMessage::TradeReport(tr) => {
let fields = MultiMessageMessagesMember {
msg_type: marketdata_sbe::MsgType::Trade,
timestamp: tr.timestamp,
trade: marketdata_sbe::Trade {
size: tr.size,
price: tr.price,
},
..Default::default()
};
let sym_enc: MultiMessageMessagesSymbolEncoder = enc.next_messages_member(&fields).unwrap();
sym_enc.symbol(parse_symbol(&tr.symbol).as_bytes()).unwrap()
}
IexMessage::PriceLevelUpdate(plu) => {
let fields = MultiMessageMessagesMember {
msg_type: marketdata_sbe::MsgType::Quote,
timestamp: plu.timestamp,
quote: marketdata_sbe::Quote {
price: plu.price,
size: plu.size,
flags: plu.event_flags,
side: if plu.msg_type == 0x38 { Side::Buy } else { Side::Sell },
},
..Default::default()
};
let sym_enc: MultiMessageMessagesSymbolEncoder = enc.next_messages_member(&fields).unwrap();
sym_enc.symbol(parse_symbol(&plu.symbol).as_bytes()).unwrap()
}
_ => enc
}
});
let finished = encoder.done_with_messages().unwrap();
let data_len = finished.unwrap();
output.write(&self.scratch_buffer[..data_len]).unwrap();
}
}
pub struct SBEReader;
impl SBEReader {
pub fn new() -> SBEReader {
SBEReader {}
}
pub fn deserialize<'a>(&self, buf: &'a mut StreamVec, stats: &mut Summarizer) -> Result<(), ()> {
let data = buf.fill_buf().unwrap();
if data.len() == 0 {
return Err(());
}
let (header, decoder) = start_decoding_multi_message(data)
.header().unwrap();
let (fields, decoder) = decoder.multi_message_fields().unwrap();
let mut msg_decoder = decoder.messages_individually().unwrap();
while let Either::Left(msg) = msg_decoder {
let (member, sym_dec) = msg.next_messages_member().unwrap();
let (sym, next_msg_dec) = sym_dec.symbol().unwrap();
match member.msg_type {
marketdata_sbe::MsgType::Trade => stats.append_trade_volume(
unsafe { from_utf8_unchecked(sym) },
member.trade.size as u64,
),
marketdata_sbe::MsgType::Quote => stats.update_quote_prices(
unsafe { from_utf8_unchecked(sym) },
member.quote.price,
match member.quote.side {
Side::Buy => true,
_ => false
},
),
_ => ()
}
msg_decoder = next_msg_dec;
}
// We now have a `Right`, which is a finished messages block
let msg_decoder = match msg_decoder {
Either::Right(r) => r,
_ => panic!("Didn't parse all messages")
};
// Interestingly enough, `buf.consume(msg_decoder.unwrap())` isn't OK,
// presumably something to do with when *precisely* the drop of `self`
// happens for `msg_decoder`. Leave it as two statments so that
// Rust is able to prove our immutable borrow of `data` ends in time
// to consume the buffer
let msg_len = msg_decoder.unwrap();
buf.consume(msg_len);
Ok(())
}
}

View File

@ -1,50 +0,0 @@
use alloc_counter::{AllocCounterSystem, count_alloc, deny_alloc};
use md_shootout::marketdata_capnp::multi_message;
#[global_allocator]
static A: AllocCounterSystem = AllocCounterSystem;
#[test]
fn reinit_memory_check() {
// Setting up the builder doesn't reserve any heap memory
let mut msg_block = deny_alloc(|| {
capnp::message::Builder::new_default()
});
// Setting up the root object, however, does reserve a first segment
let (stats, result) = count_alloc(|| {
let multimsg = msg_block.init_root::<multi_message::Builder>();
multimsg.init_messages(32);
});
assert_eq!(stats.0, 4);
assert_eq!(stats.1, 0);
assert_eq!(stats.2, 0);
// If we reinitialize an object on that original builder, we re-use memory
deny_alloc(|| {
let multimsg = msg_block.init_root::<multi_message::Builder>();
multimsg.init_messages(32);
// Even if we down-size and up-size the message list size, we don't need
// to re-allocate
let multimsg = msg_block.init_root::<multi_message::Builder>();
multimsg.init_messages(16);
let multimsg = msg_block.init_root::<multi_message::Builder>();
multimsg.init_messages(32);
});
// It's only when we init a larger message count that a fresh allocation occurs
let (stats, _) = count_alloc(|| {
let multimsg = msg_block.init_root::<multi_message::Builder>();
// Note: calling `init_messages(33)` doesn't force allocation because
// the Capnproto builder reserved extra memory the first time around
multimsg.init_messages(256);
});
assert_eq!(stats.0, 1);
assert_eq!(stats.1, 3);
assert_eq!(stats.2, 0);
}