From 58e3b05b454fc551829dcb7ecbf97a73ef122165 Mon Sep 17 00:00:00 2001
From: Bradlee Speice
Date: Thu, 17 May 2018 00:31:57 -0400
Subject: [PATCH] Add a Python dateutil compat test

---
 build_tests.py            |  56 +++++++++
 src/lib.rs                | 233 ++++++++++++++++++--------------------
 src/test_python_compat.rs |  39 +++++++
 src/tests.rs              |  50 --------
 tests/parse.rs            |  29 -----
 5 files changed, 208 insertions(+), 199 deletions(-)
 create mode 100644 build_tests.py
 create mode 100644 src/test_python_compat.rs
 delete mode 100644 src/tests.rs
 delete mode 100644 tests/parse.rs

diff --git a/build_tests.py b/build_tests.py
new file mode 100644
index 0000000..45ad18a
--- /dev/null
+++ b/build_tests.py
@@ -0,0 +1,56 @@
+#import dateutil.parser._timelex.split as time_split
+from dateutil.parser import _timelex
+
+# The TEST_STRINGS list should be the only thing that actually needs changing
+TEST_STRINGS = [
+    '2018.5.15',
+    'May 5, 2018',
+    'Mar. 5, 2018',
+]
+
+S4 = ' ' * 4
+S8 = ' ' * 8
+S12 = ' ' * 12
+
+def test_string_to_rust(time_string):
+    split_array = _timelex.split(time_string)
+
+    def translate_token(token):
+        if token[0].isalpha():
+            return 'Token::Alpha("{}".to_owned())'.format(token)
+        elif token[0].isnumeric():
+            return 'Token::Numeric("{}".to_owned())'.format(token)
+        elif len(token) == 1:
+            return 'Token::Separator("{}".to_owned())'.format(token)
+        else:
+            raise Exception("Invalid token during parsing of dateutil "
+                            "split: {}".format(token))
+
+    return [translate_token(t) for t in split_array]
+
+def main():
+    header = '''use super::Token;
+use super::tokenize;
+
+#[test]
+fn test_python_compat() {\n'''
+
+    tests = []
+
+    for test_string in TEST_STRINGS:
+        token_string = '\n'.join(['{}{},'.format(S12, s)
+                                  for s in test_string_to_rust(test_string)])
+        tests.append('    assert_eq!(\n{}tokenize("{}"),\n{}vec![\n{}\n{}]\n{});'
+                     .format(S8, test_string, S8, token_string, S8, S4))
+
+    body = '\n'.join(tests)
+
+    footer = '\n}\n'
+
+    with open('src/test_python_compat.rs', 'w') as handle:
+        handle.write(header)
+        handle.write(body)
+        handle.write(footer)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs
index 0808ab6..86f6f9f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,15 +1,9 @@
 extern crate chrono;
 
-use chrono::DateTime;
-use chrono::NaiveDateTime;
-use chrono::ParseError;
-use chrono::Utc;
-use std::time::SystemTime;
-use std::time::UNIX_EPOCH;
 use std::vec::Vec;
 
 #[cfg(test)]
-mod tests;
+mod test_python_compat;
 
 #[derive(PartialEq, Debug)]
 pub enum Token {
@@ -33,7 +27,6 @@ enum ParseState {
 }
 
 impl Tokenizer {
-
     fn new(parse_string: String) -> Self {
         Tokenizer {
             token_stack: Vec::new(),
@@ -46,8 +39,12 @@ impl Iterator for Tokenizer {
     type Item = Token;
 
     fn next(&mut self) -> Option<Self::Item> {
-        if !self.token_stack.is_empty() { return Some(self.token_stack.pop().unwrap()) };
-        if self.parse_string.is_empty() { return None };
+        if !self.token_stack.is_empty() {
+            return Some(self.token_stack.pop().unwrap());
+        };
+        if self.parse_string.is_empty() {
+            return None;
+        };
 
         let mut char_stack: Vec<char> = Vec::new();
         let mut seen_letters = false;
@@ -56,76 +53,80 @@ impl Iterator for Tokenizer {
         while let Some(next) = self.parse_string.pop() {
             println!("{} - {:?}", next, state);
             match state {
-            ParseState::Empty => {
-                if next.is_numeric() {
-                    state = ParseState::Numeric;
-                    char_stack.push(next);
-                } else if next.is_alphabetic() {
-                    state = ParseState::Alpha;
-                    seen_letters = true;
-                    char_stack.push(next);
-                } else if next.is_whitespace() {
-                    char_stack.push(' ');
-                    break;
-                } else {
-                    char_stack.push(next);
-                    break;
+                ParseState::Empty => {
+                    if next.is_numeric() {
+                        state = ParseState::Numeric;
+                        char_stack.push(next);
+                    } else if next.is_alphabetic() {
+                        state = ParseState::Alpha;
+                        seen_letters = true;
+                        char_stack.push(next);
+                    } else if next.is_whitespace() {
+                        char_stack.push(' ');
+                        break;
+                    } else {
+                        char_stack.push(next);
+                        break;
+                    }
                 }
-            },
-            ParseState::Alpha => {
-                if next.is_alphabetic() {
-                    char_stack.push(next);
-                } else if next == '.' {
-                    state = ParseState::AlphaDecimal;
-                    char_stack.push(next);
-                } else {
-                    // We don't recognize the character, so push it back
-                    // to be handled later.
-                    self.parse_string.push(next);
-                    break;
+                ParseState::Alpha => {
+                    if next.is_alphabetic() {
+                        char_stack.push(next);
+                    } else if next == '.' {
+                        state = ParseState::AlphaDecimal;
+                        char_stack.push(next);
+                    } else {
+                        // We don't recognize the character, so push it back
+                        // to be handled later.
+                        self.parse_string.push(next);
+                        break;
+                    }
                 }
-            },
-            ParseState::AlphaDecimal => {
-                if next == '.' || next.is_alphabetic() {
-                    char_stack.push(next);
-                } else if next.is_numeric() && char_stack.last().unwrap().clone() == '.' {
-                    char_stack.push(next);
-                    state = ParseState::NumericDecimal;
-                } else {
-                    self.parse_string.push(next);
-                    break;
+                ParseState::AlphaDecimal => {
+                    if next == '.' || next.is_alphabetic() {
+                        char_stack.push(next);
+                    } else if next.is_numeric() && char_stack.last().unwrap().clone() == '.' {
+                        char_stack.push(next);
+                        state = ParseState::NumericDecimal;
+                    } else {
+                        self.parse_string.push(next);
+                        break;
+                    }
                 }
-            },
-            ParseState::Numeric => {
-                if next.is_numeric() {
-                    char_stack.push(next);
-                } else if next == '.' || (next == ',' && char_stack.len() >= 2) {
-                    char_stack.push(next);
-                    state = ParseState::NumericDecimal;
-                } else {
-                    // We don't recognize the character, so push it back
-                    // to be handled later
-                    self.parse_string.push(next);
-                    break;
+                ParseState::Numeric => {
+                    if next.is_numeric() {
+                        char_stack.push(next);
+                    } else if next == '.' || (next == ',' && char_stack.len() >= 2) {
+                        char_stack.push(next);
+                        state = ParseState::NumericDecimal;
+                    } else {
+                        // We don't recognize the character, so push it back
+                        // to be handled later
+                        self.parse_string.push(next);
+                        break;
+                    }
                 }
-            },
-            ParseState::NumericDecimal => {
-                if next == '.' || next.is_numeric() {
-                    char_stack.push(next);
-                } else if next.is_alphabetic() && char_stack.last().unwrap().clone() == '.' {
-                    char_stack.push(next);
-                    state = ParseState::AlphaDecimal;
-                } else {
-                    self.parse_string.push(next);
-                    break;
+                ParseState::NumericDecimal => {
+                    if next == '.' || next.is_numeric() {
+                        char_stack.push(next);
+                    } else if next.is_alphabetic() && char_stack.last().unwrap().clone() == '.' {
+                        char_stack.push(next);
+                        state = ParseState::AlphaDecimal;
+                    } else {
+                        self.parse_string.push(next);
+                        break;
+                    }
                 }
             }
-        }};
+        }
 
         // I like Python's version of this much better:
         // needs_split = seen_letters or char_stack.count('.') > 1 or char_stack[-1] in '.,'
-        let dot_count = char_stack.iter().fold(0, |count, character| count + (if character == &'.' {1} else {0}));
-        let needs_split = seen_letters || dot_count > 1 || char_stack.last().unwrap() == &'.' || char_stack.last().unwrap() == &',';
+        let dot_count = char_stack.iter().fold(0, |count, character| {
+            count + (if character == &'.' { 1 } else { 0 })
+        });
+        let needs_split = seen_letters || dot_count > 1 || char_stack.last().unwrap() == &'.'
+            || char_stack.last().unwrap() == &',';
 
         let final_string = char_stack.into_iter().collect();
 
         let mut tokens = match state {
@@ -134,7 +135,7 @@ impl Iterator for Tokenizer {
             ParseState::Numeric => vec![Token::Numeric(final_string)],
             ParseState::AlphaDecimal => {
                 if needs_split {
-                     decimal_split(&final_string, false)
+                    decimal_split(&final_string, false)
                 } else {
                     vec![Token::Alpha(final_string)]
                 }
@@ -146,7 +147,9 @@ impl Iterator for Tokenizer {
                     vec![Token::Numeric(final_string)]
                 }
             }
-        }.into_iter().rev().collect();
+        }.into_iter()
+            .rev()
+            .collect();
 
         self.token_stack.append(&mut tokens);
         // UNWRAP: Previous match guaranteed that at least one token was added
@@ -154,72 +157,62 @@ impl Iterator for Tokenizer {
     }
 }
 
-
 fn decimal_split(characters: &str, cast_period: bool) -> Vec<Token> {
     let mut token_stack: Vec<Token> = Vec::new();
     let mut char_stack: Vec<char> = Vec::new();
     let mut state = ParseState::Empty;
 
-    for c in characters.chars() { match state {
-        ParseState::Empty => {
-            if c.is_alphabetic() {
-                char_stack.push(c);
-                state = ParseState::Alpha;
-            } else if c.is_numeric() {
-                char_stack.push(c);
-                state = ParseState::Numeric;
-            } else {
-                let character = if cast_period { '.' } else { c };
-                token_stack.push(Token::Separator(character.to_string()));
+    for c in characters.chars() {
+        match state {
+            ParseState::Empty => {
+                if c.is_alphabetic() {
+                    char_stack.push(c);
+                    state = ParseState::Alpha;
+                } else if c.is_numeric() {
+                    char_stack.push(c);
+                    state = ParseState::Numeric;
+                } else {
+                    let character = if cast_period { '.' } else { c };
+                    token_stack.push(Token::Separator(character.to_string()));
+                }
             }
-        },
-        ParseState::Alpha => {
-            if c.is_alphabetic() {
-                char_stack.push(c);
-            } else {
-                token_stack.push(Token::Alpha(char_stack.iter().collect()));
-                char_stack.clear();
-                let character = if cast_period { '.' } else { c };
-                token_stack.push(Token::Separator(character.to_string()));
-                state = ParseState::Empty;
+            ParseState::Alpha => {
+                if c.is_alphabetic() {
+                    char_stack.push(c);
+                } else {
+                    token_stack.push(Token::Alpha(char_stack.iter().collect()));
+                    char_stack.clear();
+                    let character = if cast_period { '.' } else { c };
+                    token_stack.push(Token::Separator(character.to_string()));
+                    state = ParseState::Empty;
+                }
            }
-        },
-        ParseState::Numeric => {
-            if c.is_numeric() {
-                char_stack.push(c);
-            } else {
-                token_stack.push(Token::Numeric(char_stack.iter().collect()));
-                char_stack.clear();
-                let character = if cast_period { '.' } else { c };
-                token_stack.push(Token::Separator(character.to_string()));
-                state = ParseState::Empty;
+            ParseState::Numeric => {
+                if c.is_numeric() {
+                    char_stack.push(c);
+                } else {
+                    token_stack.push(Token::Numeric(char_stack.iter().collect()));
+                    char_stack.clear();
+                    let character = if cast_period { '.' } else { c };
+                    token_stack.push(Token::Separator(character.to_string()));
+                    state = ParseState::Empty;
+                }
             }
-        },
-        _ => panic!("Invalid parse state during decimal_split()")
-    }}
+            _ => panic!("Invalid parse state during decimal_split()"),
+        }
+    }
 
     match state {
         ParseState::Alpha => token_stack.push(Token::Alpha(char_stack.iter().collect())),
         ParseState::Numeric => token_stack.push(Token::Numeric(char_stack.iter().collect())),
         ParseState::Empty => (),
-        _ => panic!("Invalid parse state during decimal_split()")
+        _ => panic!("Invalid parse state during decimal_split()"),
    }
 
     token_stack
 }
-
 pub fn tokenize(parse_string: &str) -> Vec<Token> {
     let tokenizer = Tokenizer::new(parse_string.to_owned());
     tokenizer.collect()
 }
-
-
-pub fn parse(date: &str) -> Result<DateTime<Utc>, ParseError> {
-    let current = SystemTime::now();
-    let epoch = current.duration_since(UNIX_EPOCH).unwrap();
-
-    let naive = NaiveDateTime::from_timestamp(epoch.as_secs() as i64, epoch.subsec_nanos());
-
-    Ok(DateTime::from_utc(naive, Utc))
-}
diff --git a/src/test_python_compat.rs b/src/test_python_compat.rs
new file mode 100644
index 0000000..494f96f
--- /dev/null
+++ b/src/test_python_compat.rs
@@ -0,0 +1,39 @@
+use super::Token;
+use super::tokenize;
+
+#[test]
+fn test_python_compat() {
+    assert_eq!(
+        tokenize("2018.5.15"),
+        vec![
+            Token::Numeric("2018".to_owned()),
+            Token::Separator(".".to_owned()),
+            Token::Numeric("5".to_owned()),
+            Token::Separator(".".to_owned()),
+            Token::Numeric("15".to_owned()),
+        ]
+    );
+    assert_eq!(
+        tokenize("May 5, 2018"),
+        vec![
+            Token::Alpha("May".to_owned()),
+            Token::Separator(" ".to_owned()),
+            Token::Numeric("5".to_owned()),
+            Token::Separator(",".to_owned()),
+            Token::Separator(" ".to_owned()),
+            Token::Numeric("2018".to_owned()),
+        ]
+    );
+    assert_eq!(
+        tokenize("Mar. 5, 2018"),
+        vec![
+            Token::Alpha("Mar".to_owned()),
+            Token::Separator(".".to_owned()),
+            Token::Separator(" ".to_owned()),
+            Token::Numeric("5".to_owned()),
+            Token::Separator(",".to_owned()),
+            Token::Separator(" ".to_owned()),
+            Token::Numeric("2018".to_owned()),
+        ]
+    );
+}
diff --git a/src/tests.rs b/src/tests.rs
deleted file mode 100644
index 7658559..0000000
--- a/src/tests.rs
+++ /dev/null
@@ -1,50 +0,0 @@
-use super::Token;
-use super::tokenize;
-
-macro_rules! t {
-    ($string: expr, $( $x: expr ),* ) => {
-        assert_eq!(
-            tokenize($string),
-            vec![$( $x, )*]
-        )
-    };
-}
-
-macro_rules! a {
-    ($string:expr) => {
-        Token::Alpha($string.to_owned())
-    };
-}
-
-macro_rules! n {
-    ($string:expr) => {
-        Token::Numeric($string.to_owned())
-    };
-}
-
-macro_rules! s {
-    ($string:expr) => {
-        Token::Separator($string.to_owned())
-    };
-}
-
-#[test]
-fn test_basic_tokenize() {
-    t!("Sep.2009.24",
-       a!("Sep"), s!("."), n!("2009"), s!("."), n!("24"));
-
-    t!("Sep.2009;24",
-       a!("Sep"), s!("."), n!("2009"), s!(";"), n!("24"));
-
-    t!("Sep.2009,24",
-       a!("Sep"), s!("."), n!("2009"), s!(","), n!("24"));
-
-    t!("24 Sep., 2009",
-       n!("24"), s!(" "), a!("Sep"), s!("."), s!(","), s!(" "), n!("2009"));
-
-    t!("2009.24",
-       n!("2009.24"));
-
-    t!("2009.24.09",
-       n!("2009"), s!("."), n!("24"), s!("."), n!("09"));
-}
diff --git a/tests/parse.rs b/tests/parse.rs
deleted file mode 100644
index eb135ec..0000000
--- a/tests/parse.rs
+++ /dev/null
@@ -1,29 +0,0 @@
-extern crate chrono;
-extern crate dtparse;
-
-use chrono::DateTime;
-use chrono::NaiveDate;
-use chrono::NaiveDateTime;
-use chrono::NaiveTime;
-use chrono::Utc;
-
-use dtparse::parse;
-
-macro_rules! ymd_test {
-    ($date:expr, $year:expr, $month:expr, $day:expr) => {
-        let nd = NaiveDate::from_ymd($year, $month, $day);
-        let nt = NaiveTime::from_hms(0, 0, 0);
-        let dt = NaiveDateTime::new(nd, nt);
-        let utc_dt = DateTime::from_utc(dt, Utc);
-
-        let parsed = parse($date);
-
-        println!("{:?}", parsed);
-        assert!(parsed == Ok(utc_dt));
-    };
-}
-
-#[test]
-fn test_basic() {
-    ymd_test!("2014 January 19", 2014, 1, 19);
-}