Add a Python dateutil compat test

2026-07-02 09:43:02 -04:00 · 2018-05-17 00:31:57 -04:00
parent af0908c3cc
commit 58e3b05b45
5 changed files with 208 additions and 199 deletions
@@ -1,15 +1,9 @@
 extern crate chrono;

-use chrono::DateTime;
-use chrono::NaiveDateTime;
-use chrono::ParseError;
-use chrono::Utc;
-use std::time::SystemTime;
-use std::time::UNIX_EPOCH;
 use std::vec::Vec;

 #[cfg(test)]
-mod tests;
+mod test_python_compat;

 #[derive(PartialEq, Debug)]
 pub enum Token {
@@ -33,7 +27,6 @@ enum ParseState {
 }

 impl Tokenizer {
-
    fn new(parse_string: String) -> Self {
        Tokenizer {
            token_stack: Vec::new(),
@@ -46,8 +39,12 @@ impl Iterator for Tokenizer {
    type Item = Token;

    fn next(&mut self) -> Option<Self::Item> {
-        if !self.token_stack.is_empty() { return Some(self.token_stack.pop().unwrap()) };
-        if self.parse_string.is_empty() { return None };
+        if !self.token_stack.is_empty() {
+            return Some(self.token_stack.pop().unwrap());
+        };
+        if self.parse_string.is_empty() {
+            return None;
+        };

        let mut char_stack: Vec<char> = Vec::new();
        let mut seen_letters = false;
@@ -56,76 +53,80 @@ impl Iterator for Tokenizer {
        while let Some(next) = self.parse_string.pop() {
            println!("{} - {:?}", next, state);
            match state {
-            ParseState::Empty => {
-                if next.is_numeric() {
-                    state = ParseState::Numeric;
-                    char_stack.push(next);
-                } else if next.is_alphabetic() {
-                    state = ParseState::Alpha;
-                    seen_letters = true;
-                    char_stack.push(next);
-                } else if next.is_whitespace() {
-                    char_stack.push(' ');
-                    break;
-                } else {
-                    char_stack.push(next);
-                    break;
+                ParseState::Empty => {
+                    if next.is_numeric() {
+                        state = ParseState::Numeric;
+                        char_stack.push(next);
+                    } else if next.is_alphabetic() {
+                        state = ParseState::Alpha;
+                        seen_letters = true;
+                        char_stack.push(next);
+                    } else if next.is_whitespace() {
+                        char_stack.push(' ');
+                        break;
+                    } else {
+                        char_stack.push(next);
+                        break;
+                    }
                }
-            },
-            ParseState::Alpha => {
-                if next.is_alphabetic() {
-                    char_stack.push(next);
-                } else if next == '.' {
-                    state = ParseState::AlphaDecimal;
-                    char_stack.push(next);
-                } else {
-                    // We don't recognize the character, so push it back
-                    // to be handled later.
-                    self.parse_string.push(next);
-                    break;
+                ParseState::Alpha => {
+                    if next.is_alphabetic() {
+                        char_stack.push(next);
+                    } else if next == '.' {
+                        state = ParseState::AlphaDecimal;
+                        char_stack.push(next);
+                    } else {
+                        // We don't recognize the character, so push it back
+                        // to be handled later.
+                        self.parse_string.push(next);
+                        break;
+                    }
                }
-            },
-            ParseState::AlphaDecimal => {
-                if next == '.' || next.is_alphabetic() {
-                    char_stack.push(next);
-                } else if next.is_numeric() && char_stack.last().unwrap().clone() == '.' {
-                    char_stack.push(next);
-                    state = ParseState::NumericDecimal;
-                } else {
-                    self.parse_string.push(next);
-                    break;
+                ParseState::AlphaDecimal => {
+                    if next == '.' || next.is_alphabetic() {
+                        char_stack.push(next);
+                    } else if next.is_numeric() && char_stack.last().unwrap().clone() == '.' {
+                        char_stack.push(next);
+                        state = ParseState::NumericDecimal;
+                    } else {
+                        self.parse_string.push(next);
+                        break;
+                    }
                }
-            },
-            ParseState::Numeric => {
-                if next.is_numeric() {
-                    char_stack.push(next);
-                } else if next == '.' || (next == ',' && char_stack.len() >= 2) {
-                    char_stack.push(next);
-                    state = ParseState::NumericDecimal;
-                } else {
-                    // We don't recognize the character, so push it back
-                    // to be handled later
-                    self.parse_string.push(next);
-                    break;
+                ParseState::Numeric => {
+                    if next.is_numeric() {
+                        char_stack.push(next);
+                    } else if next == '.' || (next == ',' && char_stack.len() >= 2) {
+                        char_stack.push(next);
+                        state = ParseState::NumericDecimal;
+                    } else {
+                        // We don't recognize the character, so push it back
+                        // to be handled later
+                        self.parse_string.push(next);
+                        break;
+                    }
                }
-            },
-            ParseState::NumericDecimal => {
-                if next == '.' || next.is_numeric() {
-                    char_stack.push(next);
-                } else if next.is_alphabetic() && char_stack.last().unwrap().clone() == '.' {
-                    char_stack.push(next);
-                    state = ParseState::AlphaDecimal;
-                } else {
-                    self.parse_string.push(next);
-                    break;
+                ParseState::NumericDecimal => {
+                    if next == '.' || next.is_numeric() {
+                        char_stack.push(next);
+                    } else if next.is_alphabetic() && char_stack.last().unwrap().clone() == '.' {
+                        char_stack.push(next);
+                        state = ParseState::AlphaDecimal;
+                    } else {
+                        self.parse_string.push(next);
+                        break;
+                    }
                }
            }
-        }};
+        }

        // I like Python's version of this much better:
        // needs_split = seen_letters or char_stack.count('.') > 1 or char_stack[-1] in '.,'
-        let dot_count = char_stack.iter().fold(0, |count, character| count + (if character == &'.' {1} else {0}));
-        let needs_split = seen_letters || dot_count > 1 || char_stack.last().unwrap() == &'.' || char_stack.last().unwrap() == &',';
+        let dot_count = char_stack.iter().fold(0, |count, character| {
+            count + (if character == &'.' { 1 } else { 0 })
+        });
+        let needs_split = seen_letters || dot_count > 1 || char_stack.last().unwrap() == &'.'
+            || char_stack.last().unwrap() == &',';
        let final_string = char_stack.into_iter().collect();

        let mut tokens = match state {
@@ -134,7 +135,7 @@ impl Iterator for Tokenizer {
            ParseState::Numeric => vec![Token::Numeric(final_string)],
            ParseState::AlphaDecimal => {
                if needs_split {
-                   decimal_split(&final_string, false)
+                    decimal_split(&final_string, false)
                } else {
                    vec![Token::Alpha(final_string)]
                }
@@ -146,7 +147,9 @@ impl Iterator for Tokenizer {
                    vec![Token::Numeric(final_string)]
                }
            }
-        }.into_iter().rev().collect();
+        }.into_iter()
+            .rev()
+            .collect();

        self.token_stack.append(&mut tokens);
        // UNWRAP: Previous match guaranteed that at least one token was added
@@ -154,72 +157,62 @@ impl Iterator for Tokenizer {
    }
 }

-
 fn decimal_split(characters: &str, cast_period: bool) -> Vec<Token> {
    let mut token_stack: Vec<Token> = Vec::new();
    let mut char_stack: Vec<char> = Vec::new();
    let mut state = ParseState::Empty;

-    for c in characters.chars() { match state {
-        ParseState::Empty => {
-            if c.is_alphabetic() {
-                char_stack.push(c);
-                state = ParseState::Alpha;
-            } else if c.is_numeric() {
-                char_stack.push(c);
-                state = ParseState::Numeric;
-            } else {
-                let character = if cast_period { '.' } else { c };
-                token_stack.push(Token::Separator(character.to_string()));
+    for c in characters.chars() {
+        match state {
+            ParseState::Empty => {
+                if c.is_alphabetic() {
+                    char_stack.push(c);
+                    state = ParseState::Alpha;
+                } else if c.is_numeric() {
+                    char_stack.push(c);
+                    state = ParseState::Numeric;
+                } else {
+                    let character = if cast_period { '.' } else { c };
+                    token_stack.push(Token::Separator(character.to_string()));
+                }
            }
-        },
-        ParseState::Alpha => {
-            if c.is_alphabetic() {
-                char_stack.push(c);
-            } else {
-                token_stack.push(Token::Alpha(char_stack.iter().collect()));
-                char_stack.clear();
-                let character = if cast_period { '.' } else { c };
-                token_stack.push(Token::Separator(character.to_string()));
-                state = ParseState::Empty;
+            ParseState::Alpha => {
+                if c.is_alphabetic() {
+                    char_stack.push(c);
+                } else {
+                    token_stack.push(Token::Alpha(char_stack.iter().collect()));
+                    char_stack.clear();
+                    let character = if cast_period { '.' } else { c };
+                    token_stack.push(Token::Separator(character.to_string()));
+                    state = ParseState::Empty;
+                }
            }
-        },
-        ParseState::Numeric => {
-            if c.is_numeric() {
-                char_stack.push(c);
-            } else {
-                token_stack.push(Token::Numeric(char_stack.iter().collect()));
-                char_stack.clear();
-                let character = if cast_period { '.' } else { c };
-                token_stack.push(Token::Separator(character.to_string()));
-                state = ParseState::Empty;
+            ParseState::Numeric => {
+                if c.is_numeric() {
+                    char_stack.push(c);
+                } else {
+                    token_stack.push(Token::Numeric(char_stack.iter().collect()));
+                    char_stack.clear();
+                    let character = if cast_period { '.' } else { c };
+                    token_stack.push(Token::Separator(character.to_string()));
+                    state = ParseState::Empty;
+                }
            }
-        },
-        _ => panic!("Invalid parse state during decimal_split()")
-    }}
+            _ => panic!("Invalid parse state during decimal_split()"),
+        }
+    }

    match state {
        ParseState::Alpha => token_stack.push(Token::Alpha(char_stack.iter().collect())),
        ParseState::Numeric => token_stack.push(Token::Numeric(char_stack.iter().collect())),
        ParseState::Empty => (),
-        _ => panic!("Invalid parse state during decimal_split()")
+        _ => panic!("Invalid parse state during decimal_split()"),
    }

    token_stack
 }

-
 pub fn tokenize(parse_string: &str) -> Vec<Token> {
    let tokenizer = Tokenizer::new(parse_string.to_owned());
    tokenizer.collect()
 }
-
-
-pub fn parse(date: &str) -> Result<DateTime<Utc>, ParseError> {
-    let current = SystemTime::now();
-    let epoch = current.duration_since(UNIX_EPOCH).unwrap();
-
-    let naive = NaiveDateTime::from_timestamp(epoch.as_secs() as i64, epoch.subsec_nanos());
-
-    Ok(DateTime::from_utc(naive, Utc))
-}
@@ -0,0 +1,39 @@
+use super::Token;
+use super::tokenize;
+
+#[test]
+fn test_python_compat() {
+    assert_eq!(
+        tokenize("2018.5.15"),
+        vec![
+            Token::Numeric("2018".to_owned()),
+            Token::Separator(".".to_owned()),
+            Token::Numeric("5".to_owned()),
+            Token::Separator(".".to_owned()),
+            Token::Numeric("15".to_owned()),
+        ]
+    );
+    assert_eq!(
+        tokenize("May 5, 2018"),
+        vec![
+            Token::Alpha("May".to_owned()),
+            Token::Separator(" ".to_owned()),
+            Token::Numeric("5".to_owned()),
+            Token::Separator(",".to_owned()),
+            Token::Separator(" ".to_owned()),
+            Token::Numeric("2018".to_owned()),
+        ]
+    );
+    assert_eq!(
+        tokenize("Mar. 5, 2018"),
+        vec![
+            Token::Alpha("Mar".to_owned()),
+            Token::Separator(".".to_owned()),
+            Token::Separator(" ".to_owned()),
+            Token::Numeric("5".to_owned()),
+            Token::Separator(",".to_owned()),
+            Token::Separator(" ".to_owned()),
+            Token::Numeric("2018".to_owned()),
+        ]
+    );
+}
@@ -1,50 +0,0 @@
-use super::Token;
-use super::tokenize;
-
-macro_rules! t {
-    ($string: expr, $( $x: expr ),* ) => {
-        assert_eq!(
-            tokenize($string),
-            vec![$( $x, )*]
-        )
-    };
-}
-
-macro_rules! a {
-    ($string:expr) => {
-        Token::Alpha($string.to_owned())
-    };
-}
-
-macro_rules! n {
-    ($string:expr) => {
-        Token::Numeric($string.to_owned())
-    };
-}
-
-macro_rules! s {
-    ($string:expr) => {
-        Token::Separator($string.to_owned())
-    };
-}
-
-#[test]
-fn test_basic_tokenize() {
-    t!("Sep.2009.24",
-       a!("Sep"), s!("."), n!("2009"), s!("."), n!("24"));
-
-    t!("Sep.2009;24",
-       a!("Sep"), s!("."), n!("2009"), s!(";"), n!("24"));
-
-    t!("Sep.2009,24",
-       a!("Sep"), s!("."), n!("2009"), s!(","), n!("24"));
-
-    t!("24 Sep., 2009",
-       n!("24"), s!(" "), a!("Sep"), s!("."), s!(","), s!(" "), n!("2009"));
-
-    t!("2009.24",
-       n!("2009.24"));
-
-    t!("2009.24.09",
-       n!("2009"), s!("."), n!("24"), s!("."), n!("09"));
-}