Add a Python dateutil compat test

2025-11-04 18:40:37 -05:00 · 2018-05-17 00:31:57 -04:00
parent af0908c3cc
commit 58e3b05b45
5 changed files with 208 additions and 199 deletions
--- a/build_tests.py
+++ b/build_tests.py
@ -0,0 +1,56 @@
 #import dateutil.parser._timelex.split as time_split
 from dateutil.parser import _timelex
 # The TEST_STRINGS list should be the only thing that actually needs changing
 # Each entry is split by dateutil's _timelex and becomes one assert_eq! in the
 # generated Rust test file.
 TEST_STRINGS = [
    '2018.5.15',
    'May 5, 2018',
    'Mar. 5, 2018',
 ]
 # Indentation prefixes (4, 8 and 12 spaces) used to lay out the generated
 # Rust source.
 S4 = ' ' * 4
 S8 = ' ' * 8
 S12 = ' ' * 12
 def test_string_to_rust(time_string):
    split_array = _timelex.split(time_string)
    def translate_token(token):
        if token[0].isalpha():
            return 'Token::Alpha("{}".to_owned())'.format(token)
        elif token[0].isnumeric():
            return 'Token::Numeric("{}".to_owned())'.format(token)
        elif len(token) == 1:
            return 'Token::Separator("{}".to_owned())'.format(token)
        else:
            raise Exception("Invalid token during parsing of dateutil "
                            "split: {}".format(token))
    return [translate_token(t) for t in split_array]
 def main():
    """Generate src/test_python_compat.rs from TEST_STRINGS.
    Each entry becomes one `assert_eq!` that compares the Rust `tokenize`
    output with the token list produced by test_string_to_rust(). The
    target file is overwritten on every run.
    """
    header = '''use super::Token;
 use super::tokenize;
 #[test]
 fn test_python_compat() {\n'''
    tests = []
    for test_string in TEST_STRINGS:
        token_string = '\n'.join(['{}{},'.format(S12, s)
                                  for s in test_string_to_rust(test_string)])
        # The input is pasted into a Rust string literal, so escape
        # backslashes (first) and double quotes to keep the output compilable.
        rust_input = test_string.replace('\\', '\\\\').replace('"', '\\"')
        tests.append('    assert_eq!(\n{}tokenize("{}"),\n{}vec![\n{}\n{}]\n{});'
                     .format(S8, rust_input, S8, token_string, S8, S4))
    body = '\n'.join(tests)
    footer = '\n}\n'
    with open('src/test_python_compat.rs', 'w') as handle:
        handle.write(header)
        handle.write(body)
        handle.write(footer)
 # Regenerate the Rust compatibility test only when executed as a script.
 if __name__ == '__main__':
    main()
--- a/src/lib.rs
+++ b/src/lib.rs
@ -1,15 +1,9 @@
 extern crate chrono;
 use chrono::DateTime;
 use chrono::NaiveDateTime;
 use chrono::ParseError;
 use chrono::Utc;
 use std::time::SystemTime;
 use std::time::UNIX_EPOCH;
 use std::vec::Vec;
 #[cfg(test)]
-mod tests;
+mod test_python_compat;
 #[derive(PartialEq, Debug)]
 pub enum Token {
@ -33,7 +27,6 @@ enum ParseState {
 }
 impl Tokenizer {
    fn new(parse_string: String) -> Self {
        Tokenizer {
            token_stack: Vec::new(),
@ -46,8 +39,12 @@ impl Iterator for Tokenizer {
    type Item = Token;
    fn next(&mut self) -> Option<Self::Item> {
-        if !self.token_stack.is_empty() { return Some(self.token_stack.pop().unwrap()) };
+        if !self.token_stack.is_empty() {
-        if self.parse_string.is_empty() { return None };
+            return Some(self.token_stack.pop().unwrap());
        };
        if self.parse_string.is_empty() {
            return None;
        };
        let mut char_stack: Vec<char> = Vec::new();
        let mut seen_letters = false;
@ -56,76 +53,80 @@ impl Iterator for Tokenizer {
        while let Some(next) = self.parse_string.pop() {
            println!("{} - {:?}", next, state);
            match state {
-            ParseState::Empty => {
+                ParseState::Empty => {
-                if next.is_numeric() {
+                    if next.is_numeric() {
-                    state = ParseState::Numeric;
+                        state = ParseState::Numeric;
-                    char_stack.push(next);
+                        char_stack.push(next);
-                } else if next.is_alphabetic() {
+                    } else if next.is_alphabetic() {
-                    state = ParseState::Alpha;
+                        state = ParseState::Alpha;
-                    seen_letters = true;
+                        seen_letters = true;
-                    char_stack.push(next);
+                        char_stack.push(next);
-                } else if next.is_whitespace() {
+                    } else if next.is_whitespace() {
-                    char_stack.push(' ');
+                        char_stack.push(' ');
-                    break;
+                        break;
-                } else {
+                    } else {
-                    char_stack.push(next);
+                        char_stack.push(next);
-                    break;
+                        break;
                    }
                }
-            },
+                ParseState::Alpha => {
-            ParseState::Alpha => {
+                    if next.is_alphabetic() {
-                if next.is_alphabetic() {
+                        char_stack.push(next);
-                    char_stack.push(next);
+                    } else if next == '.' {
-                } else if next == '.' {
+                        state = ParseState::AlphaDecimal;
-                    state = ParseState::AlphaDecimal;
+                        char_stack.push(next);
-                    char_stack.push(next);
+                    } else {
-                } else {
+                        // We don't recognize the character, so push it back
-                    // We don't recognize the character, so push it back
+                        // to be handled later.
-                    // to be handled later.
+                        self.parse_string.push(next);
-                    self.parse_string.push(next);
+                        break;
-                    break;
+                    }
                }
-            },
+                ParseState::AlphaDecimal => {
-            ParseState::AlphaDecimal => {
+                    if next == '.' || next.is_alphabetic() {
-                if next == '.' || next.is_alphabetic() {
+                        char_stack.push(next);
-                    char_stack.push(next);
+                    } else if next.is_numeric() && char_stack.last().unwrap().clone() == '.' {
-                } else if next.is_numeric() && char_stack.last().unwrap().clone() == '.' {
+                        char_stack.push(next);
-                    char_stack.push(next);
+                        state = ParseState::NumericDecimal;
-                    state = ParseState::NumericDecimal;
+                    } else {
-                } else {
+                        self.parse_string.push(next);
-                    self.parse_string.push(next);
+                        break;
-                    break;
+                    }
                }
-            },
+                ParseState::Numeric => {
-            ParseState::Numeric => {
+                    if next.is_numeric() {
-                if next.is_numeric() {
+                        char_stack.push(next);
-                    char_stack.push(next);
+                    } else if next == '.' || (next == ',' && char_stack.len() >= 2) {
-                } else if next == '.' || (next == ',' && char_stack.len() >= 2) {
+                        char_stack.push(next);
-                    char_stack.push(next);
+                        state = ParseState::NumericDecimal;
-                    state = ParseState::NumericDecimal;
+                    } else {
-                } else {
+                        // We don't recognize the character, so push it back
-                    // We don't recognize the character, so push it back
+                        // to be handled later
-                    // to be handled later
+                        self.parse_string.push(next);
-                    self.parse_string.push(next);
+                        break;
-                    break;
+                    }
                }
-            },
+                ParseState::NumericDecimal => {
-            ParseState::NumericDecimal => {
+                    if next == '.' || next.is_numeric() {
-                if next == '.' || next.is_numeric() {
+                        char_stack.push(next);
-                    char_stack.push(next);
+                    } else if next.is_alphabetic() && char_stack.last().unwrap().clone() == '.' {
-                } else if next.is_alphabetic() && char_stack.last().unwrap().clone() == '.' {
+                        char_stack.push(next);
-                    char_stack.push(next);
+                        state = ParseState::AlphaDecimal;
-                    state = ParseState::AlphaDecimal;
+                    } else {
-                } else {
+                        self.parse_string.push(next);
-                    self.parse_string.push(next);
+                        break;
-                    break;
+                    }
                }
            }
-        }};
+        }
        // I like Python's version of this much better:
        // needs_split = seen_letters or char_stack.count('.') > 1 or char_stack[-1] in '.,'
-        let dot_count = char_stack.iter().fold(0, |count, character| count + (if character == &'.' {1} else {0}));
+        let dot_count = char_stack.iter().fold(0, |count, character| {
-        let needs_split = seen_letters || dot_count > 1 || char_stack.last().unwrap() == &'.' || char_stack.last().unwrap() == &',';
+            count + (if character == &'.' { 1 } else { 0 })
        });
        let needs_split = seen_letters || dot_count > 1 || char_stack.last().unwrap() == &'.'
            || char_stack.last().unwrap() == &',';
        let final_string = char_stack.into_iter().collect();
        let mut tokens = match state {
@ -134,7 +135,7 @@ impl Iterator for Tokenizer {
            ParseState::Numeric => vec![Token::Numeric(final_string)],
            ParseState::AlphaDecimal => {
                if needs_split {
-                   decimal_split(&final_string, false)
+                    decimal_split(&final_string, false)
                } else {
                    vec![Token::Alpha(final_string)]
                }
@ -146,7 +147,9 @@ impl Iterator for Tokenizer {
                    vec![Token::Numeric(final_string)]
                }
            }
-        }.into_iter().rev().collect();
+        }.into_iter()
            .rev()
            .collect();
        self.token_stack.append(&mut tokens);
        // UNWRAP: Previous match guaranteed that at least one token was added
@ -154,72 +157,62 @@ impl Iterator for Tokenizer {
    }
 }
 /// Splits `characters` into tokens by character class.
 ///
 /// Consecutive alphabetic characters are collected into one `Token::Alpha` and
 /// consecutive numeric characters into one `Token::Numeric`. Any other character
 /// ends the run in progress and is emitted as its own `Token::Separator`; when
 /// `cast_period` is true the separator is emitted as "." instead of the
 /// character that was actually read. A run still pending when the input ends is
 /// flushed after the loop.
 ///
 /// NOTE(review): a digit directly after a letter run (or a letter directly
 /// after a digit run) also takes the "other character" path and is emitted as
 /// a separator -- confirm callers never pass such input.
 ///
 /// Panics if the state is anything other than `Empty`, `Alpha` or `Numeric`;
 /// this function itself only ever assigns those three states.
 fn decimal_split(characters: &str, cast_period: bool) -> Vec<Token> {
    let mut token_stack: Vec<Token> = Vec::new();
    let mut char_stack: Vec<char> = Vec::new();
    let mut state = ParseState::Empty;
-    for c in characters.chars() { match state {
+    for c in characters.chars() {
-        ParseState::Empty => {
+        match state {
-            if c.is_alphabetic() {
+            ParseState::Empty => {
-                char_stack.push(c);
+                if c.is_alphabetic() {
-                state = ParseState::Alpha;
+                    char_stack.push(c);
-            } else if c.is_numeric() {
+                    state = ParseState::Alpha;
-                char_stack.push(c);
+                } else if c.is_numeric() {
-                state = ParseState::Numeric;
+                    char_stack.push(c);
-            } else {
+                    state = ParseState::Numeric;
-                let character = if cast_period { '.' } else { c };
+                } else {
-                token_stack.push(Token::Separator(character.to_string()));
+                    let character = if cast_period { '.' } else { c };
                    token_stack.push(Token::Separator(character.to_string()));
                }
            }
-        },
+            ParseState::Alpha => {
-        ParseState::Alpha => {
+                if c.is_alphabetic() {
-            if c.is_alphabetic() {
+                    char_stack.push(c);
-                char_stack.push(c);
+                } else {
-            } else {
+                    token_stack.push(Token::Alpha(char_stack.iter().collect()));
-                token_stack.push(Token::Alpha(char_stack.iter().collect()));
+                    char_stack.clear();
-                char_stack.clear();
+                    let character = if cast_period { '.' } else { c };
-                let character = if cast_period { '.' } else { c };
+                    token_stack.push(Token::Separator(character.to_string()));
-                token_stack.push(Token::Separator(character.to_string()));
+                    state = ParseState::Empty;
-                state = ParseState::Empty;
+                }
            }
-        },
+            ParseState::Numeric => {
-        ParseState::Numeric => {
+                if c.is_numeric() {
-            if c.is_numeric() {
+                    char_stack.push(c);
-                char_stack.push(c);
+                } else {
-            } else {
+                    token_stack.push(Token::Numeric(char_stack.iter().collect()));
-                token_stack.push(Token::Numeric(char_stack.iter().collect()));
+                    char_stack.clear();
-                char_stack.clear();
+                    let character = if cast_period { '.' } else { c };
-                let character = if cast_period { '.' } else { c };
+                    token_stack.push(Token::Separator(character.to_string()));
-                token_stack.push(Token::Separator(character.to_string()));
+                    state = ParseState::Empty;
-                state = ParseState::Empty;
+                }
            }
-        },
+            _ => panic!("Invalid parse state during decimal_split()"),
-        _ => panic!("Invalid parse state during decimal_split()")
+        }
-    }}
+    }
    match state {
        ParseState::Alpha => token_stack.push(Token::Alpha(char_stack.iter().collect())),
        ParseState::Numeric => token_stack.push(Token::Numeric(char_stack.iter().collect())),
        ParseState::Empty => (),
-        _ => panic!("Invalid parse state during decimal_split()")
+        _ => panic!("Invalid parse state during decimal_split()"),
    }
    token_stack
 }
 /// Returns every token produced for `parse_string`, in order.
 ///
 /// The input is copied into a new `Tokenizer`, which is then run to exhaustion.
 pub fn tokenize(parse_string: &str) -> Vec<Token> {
    Tokenizer::new(String::from(parse_string)).collect()
 }
 /// Placeholder parser: returns the current system time as a UTC `DateTime`.
 ///
 /// `date` is accepted but never read, and the result is always `Ok`.
 ///
 /// Panics if the system clock reports a time before the Unix epoch.
 pub fn parse(date: &str) -> Result<DateTime<Utc>, ParseError> {
    let since_epoch = SystemTime::now().duration_since(UNIX_EPOCH).unwrap();
    let (secs, nanos) = (since_epoch.as_secs() as i64, since_epoch.subsec_nanos());
    Ok(DateTime::from_utc(NaiveDateTime::from_timestamp(secs, nanos), Utc))
 }
--- a/src/test_python_compat.rs
+++ b/src/test_python_compat.rs
@ -0,0 +1,39 @@
 use super::Token;
 use super::tokenize;
 // Generated by build_tests.py from its TEST_STRINGS list; the expected tokens
 // are derived from dateutil's `_timelex.split`. Change the generator and
 // re-run it rather than editing these assertions by hand.
 #[test]
 fn test_python_compat() {
    assert_eq!(
        tokenize("2018.5.15"),
        vec![
            Token::Numeric("2018".to_owned()),
            Token::Separator(".".to_owned()),
            Token::Numeric("5".to_owned()),
            Token::Separator(".".to_owned()),
            Token::Numeric("15".to_owned()),
        ]
    );
    assert_eq!(
        tokenize("May 5, 2018"),
        vec![
            Token::Alpha("May".to_owned()),
            Token::Separator(" ".to_owned()),
            Token::Numeric("5".to_owned()),
            Token::Separator(",".to_owned()),
            Token::Separator(" ".to_owned()),
            Token::Numeric("2018".to_owned()),
        ]
    );
    assert_eq!(
        tokenize("Mar. 5, 2018"),
        vec![
            Token::Alpha("Mar".to_owned()),
            Token::Separator(".".to_owned()),
            Token::Separator(" ".to_owned()),
            Token::Numeric("5".to_owned()),
            Token::Separator(",".to_owned()),
            Token::Separator(" ".to_owned()),
            Token::Numeric("2018".to_owned()),
        ]
    );
 }
--- a/src/tests.rs
+++ b/src/tests.rs
@ -1,50 +0,0 @@
 use super::Token;
 use super::tokenize;
 // Asserts that tokenizing `$input` yields exactly the listed tokens, in order.
 macro_rules! t {
    ($input:expr, $( $expected:expr ),* ) => {
        assert_eq!(tokenize($input), vec![$( $expected ),*])
    };
 }
 // Shorthand for a `Token::Alpha` holding an owned copy of `$text`.
 macro_rules! a {
    ($text:expr) => { Token::Alpha($text.to_owned()) };
 }
 // Shorthand for a `Token::Numeric` holding an owned copy of `$text`.
 macro_rules! n {
    ($text:expr) => { Token::Numeric($text.to_owned()) };
 }
 // Shorthand for a `Token::Separator` holding an owned copy of `$text`.
 macro_rules! s {
    ($text:expr) => { Token::Separator($text.to_owned()) };
 }
 // Hand-written tokenizer cases: one input string per line, followed by the
 // tokens it is expected to produce.
 #[test]
 fn test_basic_tokenize() {
    // Month abbreviation followed by '.', ';' or ',' separated numbers.
    t!("Sep.2009.24", a!("Sep"), s!("."), n!("2009"), s!("."), n!("24"));
    t!("Sep.2009;24", a!("Sep"), s!("."), n!("2009"), s!(";"), n!("24"));
    t!("Sep.2009,24", a!("Sep"), s!("."), n!("2009"), s!(","), n!("24"));
    // Adjacent separators are each reported on their own.
    t!("24 Sep., 2009", n!("24"), s!(" "), a!("Sep"), s!("."), s!(","), s!(" "), n!("2009"));
    // A single '.' between digits stays inside one numeric token ...
    t!("2009.24", n!("2009.24"));
    // ... whereas two or more split the number apart.
    t!("2009.24.09", n!("2009"), s!("."), n!("24"), s!("."), n!("09"));
 }
--- a/tests/parse.rs
+++ b/tests/parse.rs
@ -1,29 +0,0 @@
 extern crate chrono;
 extern crate dtparse;
 use chrono::DateTime;
 use chrono::NaiveDate;
 use chrono::NaiveDateTime;
 use chrono::NaiveTime;
 use chrono::Utc;
 use dtparse::parse;
 // Asserts that `parse($date)` succeeds with midnight UTC on the calendar day
 // `$year`-`$month`-`$day`.
 //
 // The expansion is wrapped in its own block so its `let` bindings stay out of
 // the caller's scope, and `assert_eq!` is used so that a failure reports both
 // the parsed and the expected value.
 macro_rules! ymd_test {
    ($date:expr, $year:expr, $month:expr, $day:expr) => {{
        let nd = NaiveDate::from_ymd($year, $month, $day);
        let nt = NaiveTime::from_hms(0, 0, 0);
        let dt = NaiveDateTime::new(nd, nt);
        let utc_dt = DateTime::from_utc(dt, Utc);
        let parsed = parse($date);
        println!("{:?}", parsed);
        assert_eq!(parsed, Ok(utc_dt));
    }};
 }
 // Year-first input with a spelled-out month name.
 // NOTE(review): the `parse` shown in src/lib.rs builds its result from
 // `SystemTime::now()` and never reads its argument, so this assertion
 // presumably cannot hold until real parsing exists -- confirm.
 #[test]
 fn test_basic() {
    ymd_test!("2014 January 19", 2014, 1, 19);
 }