Add a Python dateutil compat test

2024-11-12 17:08:09 -05:00 · 2018-05-17 00:31:57 -04:00 · 2018-05-17 00:31:57 -04:00 · 58e3b05b45
commit 58e3b05b45
parent af0908c3cc
5 changed files with 208 additions and 199 deletions
--- a/build_tests.py
+++ b/build_tests.py
@ -0,0 +1,56 @@
+#import dateutil.parser._timelex.split as time_split
+from dateutil.parser import _timelex
+
+# The TEST_STRINGS list should be the only thing that actually needs changing
+TEST_STRINGS = [
+    '2018.5.15',
+    'May 5, 2018',
+    'Mar. 5, 2018',
+]
+
+# Indentation prefixes (4, 8 and 12 spaces) used by main() to lay out the
+# generated Rust source.
+S4 = ' ' * 4
+S8 = ' ' * 8
+S12 = ' ' * 12
+
+def test_string_to_rust(time_string):
+    """Translate dateutil's split of `time_string` into Rust `Token` source.
+
+    `time_string` is split with `_timelex.split`; each resulting token is
+    classified by its first character and rendered as the source text of a
+    Rust expression: `Token::Alpha(...)` for a letter, `Token::Numeric(...)`
+    for a numeric character, and `Token::Separator(...)` for any other
+    single-character token.
+
+    Returns a list of those expression strings, in token order.
+
+    Raises `Exception` for a token that fits none of the three categories
+    (including an empty token).
+    """
+    def rust_escape(text):
+        # A raw backslash or double quote would corrupt or prematurely end
+        # the generated Rust string literal, so escape both. Backslashes go
+        # first so the escapes added for quotes are not escaped again.
+        return text.replace('\\', '\\\\').replace('"', '\\"')
+
+    def translate_token(token):
+        # Slicing instead of indexing keeps an empty token from raising
+        # IndexError; it falls through to the explicit error below.
+        first = token[:1]
+        if first.isalpha():
+            template = 'Token::Alpha("{}".to_owned())'
+        elif first.isnumeric():
+            template = 'Token::Numeric("{}".to_owned())'
+        elif len(token) == 1:
+            template = 'Token::Separator("{}".to_owned())'
+        else:
+            raise Exception("Invalid token during parsing of dateutil "
+                            "split: {}".format(token))
+        return template.format(rust_escape(token))
+
+    return [translate_token(t) for t in _timelex.split(time_string)]
+
+def main():
+    """Write the Rust compatibility test for every entry in TEST_STRINGS.
+
+    Builds one `assert_eq!` per test string, comparing `tokenize(<string>)`
+    against the token list produced by `test_string_to_rust`, wraps them in
+    a single `#[test] fn test_python_compat()`, and overwrites
+    `src/test_python_compat.rs` (path relative to the current working
+    directory) with the result.
+    """
+    header = '''use super::Token;
+use super::tokenize;
+
+#[test]
+fn test_python_compat() {\n'''
+
+    tests = []
+
+    for test_string in TEST_STRINGS:
+        token_string = '\n'.join(['{}{},'.format(S12, s)
+                                  for s in test_string_to_rust(test_string)])
+        # The test string is embedded in a Rust "..." literal, so a raw
+        # backslash or double quote must be escaped (backslash first) or the
+        # generated file will not compile.
+        rust_literal = test_string.replace('\\', '\\\\').replace('"', '\\"')
+        tests.append('    assert_eq!(\n{}tokenize("{}"),\n{}vec![\n{}\n{}]\n{});'
+                     .format(S8, rust_literal, S8, token_string, S8, S4))
+
+    body = '\n'.join(tests)
+
+    footer = '\n}\n'
+
+    # Mode 'w' truncates: any previous contents of the file are replaced.
+    with open('src/test_python_compat.rs', 'w') as handle:
+        handle.write(header)
+        handle.write(body)
+        handle.write(footer)
+
+if __name__ == '__main__':
+    main()
--- a/src/lib.rs
+++ b/src/lib.rs
@ -1,15 +1,9 @@
 extern crate chrono;

-use chrono::DateTime;
-use chrono::NaiveDateTime;
-use chrono::ParseError;
-use chrono::Utc;
-use std::time::SystemTime;
-use std::time::UNIX_EPOCH;
 use std::vec::Vec;

 #[cfg(test)]
-mod tests;
+mod test_python_compat;

 #[derive(PartialEq, Debug)]
 pub enum Token {
@ -33,7 +27,6 @@ enum ParseState {
 }

 impl Tokenizer {
-
    fn new(parse_string: String) -> Self {
        Tokenizer {
            token_stack: Vec::new(),
@ -46,8 +39,12 @@ impl Iterator for Tokenizer {
    type Item = Token;

    fn next(&mut self) -> Option<Self::Item> {
-        if !self.token_stack.is_empty() { return Some(self.token_stack.pop().unwrap()) };
-        if self.parse_string.is_empty() { return None };
+        if !self.token_stack.is_empty() {
+            return Some(self.token_stack.pop().unwrap());
+        };
+        if self.parse_string.is_empty() {
+            return None;
+        };

        let mut char_stack: Vec<char> = Vec::new();
        let mut seen_letters = false;
@ -56,76 +53,80 @@ impl Iterator for Tokenizer {
        while let Some(next) = self.parse_string.pop() {
            println!("{} - {:?}", next, state);
            match state {
-            ParseState::Empty => {
-                if next.is_numeric() {
-                    state = ParseState::Numeric;
-                    char_stack.push(next);
-                } else if next.is_alphabetic() {
-                    state = ParseState::Alpha;
-                    seen_letters = true;
-                    char_stack.push(next);
-                } else if next.is_whitespace() {
-                    char_stack.push(' ');
-                    break;
-                } else {
-                    char_stack.push(next);
-                    break;
+                ParseState::Empty => {
+                    if next.is_numeric() {
+                        state = ParseState::Numeric;
+                        char_stack.push(next);
+                    } else if next.is_alphabetic() {
+                        state = ParseState::Alpha;
+                        seen_letters = true;
+                        char_stack.push(next);
+                    } else if next.is_whitespace() {
+                        char_stack.push(' ');
+                        break;
+                    } else {
+                        char_stack.push(next);
+                        break;
+                    }
                }
-            },
-            ParseState::Alpha => {
-                if next.is_alphabetic() {
-                    char_stack.push(next);
-                } else if next == '.' {
-                    state = ParseState::AlphaDecimal;
-                    char_stack.push(next);
-                } else {
-                    // We don't recognize the character, so push it back
-                    // to be handled later.
-                    self.parse_string.push(next);
-                    break;
+                ParseState::Alpha => {
+                    if next.is_alphabetic() {
+                        char_stack.push(next);
+                    } else if next == '.' {
+                        state = ParseState::AlphaDecimal;
+                        char_stack.push(next);
+                    } else {
+                        // We don't recognize the character, so push it back
+                        // to be handled later.
+                        self.parse_string.push(next);
+                        break;
+                    }
                }
-            },
-            ParseState::AlphaDecimal => {
-                if next == '.' || next.is_alphabetic() {
-                    char_stack.push(next);
-                } else if next.is_numeric() && char_stack.last().unwrap().clone() == '.' {
-                    char_stack.push(next);
-                    state = ParseState::NumericDecimal;
-                } else {
-                    self.parse_string.push(next);
-                    break;
+                ParseState::AlphaDecimal => {
+                    if next == '.' || next.is_alphabetic() {
+                        char_stack.push(next);
+                    } else if next.is_numeric() && char_stack.last().unwrap().clone() == '.' {
+                        char_stack.push(next);
+                        state = ParseState::NumericDecimal;
+                    } else {
+                        self.parse_string.push(next);
+                        break;
+                    }
                }
-            },
-            ParseState::Numeric => {
-                if next.is_numeric() {
-                    char_stack.push(next);
-                } else if next == '.' || (next == ',' && char_stack.len() >= 2) {
-                    char_stack.push(next);
-                    state = ParseState::NumericDecimal;
-                } else {
-                    // We don't recognize the character, so push it back
-                    // to be handled later
-                    self.parse_string.push(next);
-                    break;
+                ParseState::Numeric => {
+                    if next.is_numeric() {
+                        char_stack.push(next);
+                    } else if next == '.' || (next == ',' && char_stack.len() >= 2) {
+                        char_stack.push(next);
+                        state = ParseState::NumericDecimal;
+                    } else {
+                        // We don't recognize the character, so push it back
+                        // to be handled later
+                        self.parse_string.push(next);
+                        break;
+                    }
                }
-            },
-            ParseState::NumericDecimal => {
-                if next == '.' || next.is_numeric() {
-                    char_stack.push(next);
-                } else if next.is_alphabetic() && char_stack.last().unwrap().clone() == '.' {
-                    char_stack.push(next);
-                    state = ParseState::AlphaDecimal;
-                } else {
-                    self.parse_string.push(next);
-                    break;
+                ParseState::NumericDecimal => {
+                    if next == '.' || next.is_numeric() {
+                        char_stack.push(next);
+                    } else if next.is_alphabetic() && char_stack.last().unwrap().clone() == '.' {
+                        char_stack.push(next);
+                        state = ParseState::AlphaDecimal;
+                    } else {
+                        self.parse_string.push(next);
+                        break;
+                    }
                }
            }
-        }};
+        }

        // I like Python's version of this much better:
        // needs_split = seen_letters or char_stack.count('.') > 1 or char_stack[-1] in '.,'
-        let dot_count = char_stack.iter().fold(0, |count, character| count + (if character == &'.' {1} else {0}));
-        let needs_split = seen_letters || dot_count > 1 || char_stack.last().unwrap() == &'.' || char_stack.last().unwrap() == &',';
+        let dot_count = char_stack.iter().fold(0, |count, character| {
+            count + (if character == &'.' { 1 } else { 0 })
+        });
+        let needs_split = seen_letters || dot_count > 1 || char_stack.last().unwrap() == &'.'
+            || char_stack.last().unwrap() == &',';
        let final_string = char_stack.into_iter().collect();

        let mut tokens = match state {
@ -134,7 +135,7 @@ impl Iterator for Tokenizer {
            ParseState::Numeric => vec![Token::Numeric(final_string)],
            ParseState::AlphaDecimal => {
                if needs_split {
-                   decimal_split(&final_string, false)
+                    decimal_split(&final_string, false)
                } else {
                    vec![Token::Alpha(final_string)]
                }
@ -146,7 +147,9 @@ impl Iterator for Tokenizer {
                    vec![Token::Numeric(final_string)]
                }
            }
-        }.into_iter().rev().collect();
+        }.into_iter()
+            .rev()
+            .collect();

        self.token_stack.append(&mut tokens);
        // UNWRAP: Previous match guaranteed that at least one token was added
@ -154,72 +157,62 @@ impl Iterator for Tokenizer {
    }
 }

-
 fn decimal_split(characters: &str, cast_period: bool) -> Vec<Token> {
    let mut token_stack: Vec<Token> = Vec::new();
    let mut char_stack: Vec<char> = Vec::new();
    let mut state = ParseState::Empty;

-    for c in characters.chars() { match state {
-        ParseState::Empty => {
-            if c.is_alphabetic() {
-                char_stack.push(c);
-                state = ParseState::Alpha;
-            } else if c.is_numeric() {
-                char_stack.push(c);
-                state = ParseState::Numeric;
-            } else {
-                let character = if cast_period { '.' } else { c };
-                token_stack.push(Token::Separator(character.to_string()));
+    for c in characters.chars() {
+        match state {
+            ParseState::Empty => {
+                if c.is_alphabetic() {
+                    char_stack.push(c);
+                    state = ParseState::Alpha;
+                } else if c.is_numeric() {
+                    char_stack.push(c);
+                    state = ParseState::Numeric;
+                } else {
+                    let character = if cast_period { '.' } else { c };
+                    token_stack.push(Token::Separator(character.to_string()));
+                }
            }
-        },
-        ParseState::Alpha => {
-            if c.is_alphabetic() {
-                char_stack.push(c);
-            } else {
-                token_stack.push(Token::Alpha(char_stack.iter().collect()));
-                char_stack.clear();
-                let character = if cast_period { '.' } else { c };
-                token_stack.push(Token::Separator(character.to_string()));
-                state = ParseState::Empty;
+            ParseState::Alpha => {
+                if c.is_alphabetic() {
+                    char_stack.push(c);
+                } else {
+                    token_stack.push(Token::Alpha(char_stack.iter().collect()));
+                    char_stack.clear();
+                    let character = if cast_period { '.' } else { c };
+                    token_stack.push(Token::Separator(character.to_string()));
+                    state = ParseState::Empty;
+                }
            }
-        },
-        ParseState::Numeric => {
-            if c.is_numeric() {
-                char_stack.push(c);
-            } else {
-                token_stack.push(Token::Numeric(char_stack.iter().collect()));
-                char_stack.clear();
-                let character = if cast_period { '.' } else { c };
-                token_stack.push(Token::Separator(character.to_string()));
-                state = ParseState::Empty;
+            ParseState::Numeric => {
+                if c.is_numeric() {
+                    char_stack.push(c);
+                } else {
+                    token_stack.push(Token::Numeric(char_stack.iter().collect()));
+                    char_stack.clear();
+                    let character = if cast_period { '.' } else { c };
+                    token_stack.push(Token::Separator(character.to_string()));
+                    state = ParseState::Empty;
+                }
            }
-        },
-        _ => panic!("Invalid parse state during decimal_split()")
-    }}
+            _ => panic!("Invalid parse state during decimal_split()"),
+        }
+    }

    match state {
        ParseState::Alpha => token_stack.push(Token::Alpha(char_stack.iter().collect())),
        ParseState::Numeric => token_stack.push(Token::Numeric(char_stack.iter().collect())),
        ParseState::Empty => (),
-        _ => panic!("Invalid parse state during decimal_split()")
+        _ => panic!("Invalid parse state during decimal_split()"),
    }

    token_stack
 }

-
 pub fn tokenize(parse_string: &str) -> Vec<Token> {
    let tokenizer = Tokenizer::new(parse_string.to_owned());
    tokenizer.collect()
 }
-
-
-pub fn parse(date: &str) -> Result<DateTime<Utc>, ParseError> {
-    let current = SystemTime::now();
-    let epoch = current.duration_since(UNIX_EPOCH).unwrap();
-
-    let naive = NaiveDateTime::from_timestamp(epoch.as_secs() as i64, epoch.subsec_nanos());
-
-    Ok(DateTime::from_utc(naive, Utc))
-}
--- a/src/test_python_compat.rs
+++ b/src/test_python_compat.rs
@ -0,0 +1,39 @@
+use super::Token;
+use super::tokenize;
+
+// Generated by build_tests.py, which overwrites this file each time it runs;
+// the expected token lists come from dateutil's `_timelex.split`.
+#[test]
+fn test_python_compat() {
+    assert_eq!(
+        tokenize("2018.5.15"),
+        vec![
+            Token::Numeric("2018".to_owned()),
+            Token::Separator(".".to_owned()),
+            Token::Numeric("5".to_owned()),
+            Token::Separator(".".to_owned()),
+            Token::Numeric("15".to_owned()),
+        ]
+    );
+    assert_eq!(
+        tokenize("May 5, 2018"),
+        vec![
+            Token::Alpha("May".to_owned()),
+            Token::Separator(" ".to_owned()),
+            Token::Numeric("5".to_owned()),
+            Token::Separator(",".to_owned()),
+            Token::Separator(" ".to_owned()),
+            Token::Numeric("2018".to_owned()),
+        ]
+    );
+    assert_eq!(
+        tokenize("Mar. 5, 2018"),
+        vec![
+            Token::Alpha("Mar".to_owned()),
+            Token::Separator(".".to_owned()),
+            Token::Separator(" ".to_owned()),
+            Token::Numeric("5".to_owned()),
+            Token::Separator(",".to_owned()),
+            Token::Separator(" ".to_owned()),
+            Token::Numeric("2018".to_owned()),
+        ]
+    );
+}
--- a/src/tests.rs
+++ b/src/tests.rs
@ -1,50 +0,0 @@
-use super::Token;
-use super::tokenize;
-
-macro_rules! t {
-    ($string: expr, $( $x: expr ),* ) => {
-        assert_eq!(
-            tokenize($string),
-            vec![$( $x, )*]
-        )
-    };
-}
-
-macro_rules! a {
-    ($string:expr) => {
-        Token::Alpha($string.to_owned())
-    };
-}
-
-macro_rules! n {
-    ($string:expr) => {
-        Token::Numeric($string.to_owned())
-    };
-}
-
-macro_rules! s {
-    ($string:expr) => {
-        Token::Separator($string.to_owned())
-    };
-}
-
-#[test]
-fn test_basic_tokenize() {
-    t!("Sep.2009.24",
-       a!("Sep"), s!("."), n!("2009"), s!("."), n!("24"));
-
-    t!("Sep.2009;24",
-       a!("Sep"), s!("."), n!("2009"), s!(";"), n!("24"));
-
-    t!("Sep.2009,24",
-       a!("Sep"), s!("."), n!("2009"), s!(","), n!("24"));
-
-    t!("24 Sep., 2009",
-       n!("24"), s!(" "), a!("Sep"), s!("."), s!(","), s!(" "), n!("2009"));
-
-    t!("2009.24",
-       n!("2009.24"));
-
-    t!("2009.24.09",
-       n!("2009"), s!("."), n!("24"), s!("."), n!("09"));
-}
--- a/tests/parse.rs
+++ b/tests/parse.rs
@ -1,29 +0,0 @@
-extern crate chrono;
-extern crate dtparse;
-
-use chrono::DateTime;
-use chrono::NaiveDate;
-use chrono::NaiveDateTime;
-use chrono::NaiveTime;
-use chrono::Utc;
-
-use dtparse::parse;
-
-macro_rules! ymd_test {
-    ($date:expr, $year:expr, $month:expr, $day:expr) => {
-        let nd = NaiveDate::from_ymd($year, $month, $day);
-        let nt = NaiveTime::from_hms(0, 0, 0);
-        let dt = NaiveDateTime::new(nd, nt);
-        let utc_dt = DateTime::from_utc(dt, Utc);
-
-        let parsed = parse($date);
-
-        println!("{:?}", parsed);
-        assert!(parsed == Ok(utc_dt));
-    };
-}
-
-#[test]
-fn test_basic() {
-    ymd_test!("2014 January 19", 2014, 1, 19);
-}