Add tokenization from Dateutil

2018-05-15 00:50:14 -04:00 · 2018-05-15 00:50:14 -04:00 · af0908c3cc
parent 9ab5a3d5e3
commit af0908c3cc
3 changed files with 265 additions and 8 deletions
--- a/src/lib.rs
+++ b/src/lib.rs
@ -1,14 +1,221 @@
 extern crate chrono;

-use chrono::NaiveDateTime;
 use chrono::DateTime;
-use chrono::Utc;
+use chrono::NaiveDateTime;
 use chrono::ParseError;
-use std::time::UNIX_EPOCH;
+use chrono::Utc;
 use std::time::SystemTime;
+use std::time::UNIX_EPOCH;
+use std::vec::Vec;
+
+#[cfg(test)]
+mod tests;
+
+/// A single lexical unit produced by `Tokenizer` / `tokenize`.
+#[derive(PartialEq, Debug)]
+pub enum Token {
+    /// Text the tokenizer classified as alphabetic, e.g. `Sep`.
+    Alpha(String),
+    /// Text the tokenizer classified as numeric, e.g. `2009` or `2009.24`.
+    Numeric(String),
+    /// A separator such as `.`, `,`, `;`, or a single space.
+    Separator(String),
+}
+
+/// Iterator that breaks an input string into `Token`s.
+pub struct Tokenizer {
+    /// Tokens that have been produced but not yet yielded; `next()` pops
+    /// from the end of this stack before it reads any more input.
+    token_stack: Vec<Token>,
+    /// The unread input, stored with its characters in reverse order so
+    /// that `pop()` returns the next character to read.
+    parse_string: String,
+}
+
+/// State of the character-level scanner while one token is being assembled.
+#[derive(Debug)]
+enum ParseState {
+    /// No character of the current token has been classified yet.
+    Empty,
+    /// Inside a run of alphabetic characters.
+    Alpha,
+    /// Reached from `Alpha` on a `.`, or from `NumericDecimal` when a
+    /// letter directly follows a `.`.
+    AlphaDecimal,
+    /// Inside a run of numeric characters.
+    Numeric,
+    /// Reached from `Numeric` on a `.` (or on a `,` once at least two
+    /// characters were collected), or from `AlphaDecimal` when a digit
+    /// directly follows a `.`.
+    NumericDecimal,
+}
+
+impl Tokenizer {
+    /// Builds a tokenizer over `parse_string`.
+    ///
+    /// The text is kept back-to-front so that the iterator can take the
+    /// next unread character with `String::pop`.
+    fn new(parse_string: String) -> Self {
+        let reversed: String = parse_string.chars().rev().collect();
+        Self {
+            token_stack: vec![],
+            parse_string: reversed,
+        }
+    }
+}
+
+impl Iterator for Tokenizer {
+    type Item = Token;
+
+    /// Produces the next token, or `None` once the input is exhausted.
+    ///
+    /// Characters are consumed one at a time and grouped into an
+    /// alphabetic run, a numeric run, or a single separator. A run that
+    /// contains `.`/`,` (for example `Sep.2009`) may be broken into several
+    /// tokens by `decimal_split`; the extra pieces are parked on
+    /// `token_stack` and handed out by the following calls.
+    fn next(&mut self) -> Option<Self::Item> {
+        // Pieces left over from an earlier split are returned first.
+        if let Some(token) = self.token_stack.pop() {
+            return Some(token);
+        }
+        if self.parse_string.is_empty() {
+            return None;
+        }
+
+        let mut char_stack: Vec<char> = Vec::new();
+        let mut seen_letters = false;
+        let mut state = ParseState::Empty;
+
+        while let Some(next) = self.parse_string.pop() {
+            match state {
+                ParseState::Empty => {
+                    // The first character decides what kind of token this is.
+                    if next.is_numeric() {
+                        state = ParseState::Numeric;
+                        char_stack.push(next);
+                    } else if next.is_alphabetic() {
+                        state = ParseState::Alpha;
+                        seen_letters = true;
+                        char_stack.push(next);
+                    } else if next.is_whitespace() {
+                        // Every kind of whitespace is reported as one space.
+                        char_stack.push(' ');
+                        break;
+                    } else {
+                        char_stack.push(next);
+                        break;
+                    }
+                }
+                ParseState::Alpha => {
+                    if next.is_alphabetic() {
+                        char_stack.push(next);
+                    } else if next == '.' {
+                        state = ParseState::AlphaDecimal;
+                        char_stack.push(next);
+                    } else {
+                        // Not part of this token: push it back so the next
+                        // call starts with it.
+                        self.parse_string.push(next);
+                        break;
+                    }
+                }
+                ParseState::AlphaDecimal => {
+                    if next == '.' || next.is_alphabetic() {
+                        char_stack.push(next);
+                    } else if next.is_numeric() && char_stack.last() == Some(&'.') {
+                        char_stack.push(next);
+                        state = ParseState::NumericDecimal;
+                    } else {
+                        self.parse_string.push(next);
+                        break;
+                    }
+                }
+                ParseState::Numeric => {
+                    if next.is_numeric() {
+                        char_stack.push(next);
+                    } else if next == '.' || (next == ',' && char_stack.len() >= 2) {
+                        char_stack.push(next);
+                        state = ParseState::NumericDecimal;
+                    } else {
+                        // Not part of this token: push it back so the next
+                        // call starts with it.
+                        self.parse_string.push(next);
+                        break;
+                    }
+                }
+                ParseState::NumericDecimal => {
+                    if next == '.' || next.is_numeric() {
+                        char_stack.push(next);
+                    } else if next.is_alphabetic() && char_stack.last() == Some(&'.') {
+                        char_stack.push(next);
+                        state = ParseState::AlphaDecimal;
+                    } else {
+                        self.parse_string.push(next);
+                        break;
+                    }
+                }
+            }
+        }
+
+        // Same rule as the Python original:
+        // needs_split = seen_letters or char_stack.count('.') > 1 or char_stack[-1] in '.,'
+        let dot_count = char_stack.iter().filter(|&&c| c == '.').count();
+        let ends_with_separator = match char_stack.last() {
+            Some(&'.') | Some(&',') => true,
+            _ => false,
+        };
+        let needs_split = seen_letters || dot_count > 1 || ends_with_separator;
+        let final_string: String = char_stack.into_iter().collect();
+
+        let tokens = match state {
+            ParseState::Empty => vec![Token::Separator(final_string)],
+            ParseState::Alpha => vec![Token::Alpha(final_string)],
+            ParseState::Numeric => vec![Token::Numeric(final_string)],
+            ParseState::AlphaDecimal => {
+                if needs_split {
+                    decimal_split(&final_string, false)
+                } else {
+                    vec![Token::Alpha(final_string)]
+                }
+            }
+            ParseState::NumericDecimal => {
+                if needs_split {
+                    decimal_split(&final_string, dot_count == 0)
+                } else {
+                    vec![Token::Numeric(final_string)]
+                }
+            }
+        };
+
+        // `token_stack` is popped from the back, so the pieces are stored
+        // last-to-first. The loop above always consumes at least one
+        // character, so at least one token is available to pop here.
+        self.token_stack.extend(tokens.into_iter().rev());
+        self.token_stack.pop()
+    }
+}
+
+
+/// Breaks `characters` into alternating runs and single-character separators.
+///
+/// A run is a maximal stretch of alphabetic characters (`Token::Alpha`) or
+/// of numeric characters (`Token::Numeric`). The character that ends a run,
+/// and any character seen while no run is open that is neither alphabetic
+/// nor numeric, becomes a `Token::Separator`. When `cast_period` is true
+/// every separator is reported as `"."` regardless of the actual character.
+///
+/// # Panics
+///
+/// Panics if the internal state ever becomes one of the `*Decimal` states,
+/// which this function never assigns.
+fn decimal_split(characters: &str, cast_period: bool) -> Vec<Token> {
+    let mut tokens: Vec<Token> = Vec::new();
+    let mut pending: Vec<char> = Vec::new();
+    let mut state = ParseState::Empty;
+
+    for c in characters.chars() {
+        // Does `c` belong to the run being collected (opening one if needed)?
+        let extends_run = match state {
+            ParseState::Empty => {
+                if c.is_alphabetic() {
+                    state = ParseState::Alpha;
+                    true
+                } else if c.is_numeric() {
+                    state = ParseState::Numeric;
+                    true
+                } else {
+                    false
+                }
+            }
+            ParseState::Alpha => c.is_alphabetic(),
+            ParseState::Numeric => c.is_numeric(),
+            _ => panic!("Invalid parse state during decimal_split()"),
+        };
+
+        if extends_run {
+            pending.push(c);
+            continue;
+        }
+
+        // `c` closes the open run (if any) and is itself emitted as a separator.
+        match state {
+            ParseState::Alpha => tokens.push(Token::Alpha(pending.drain(..).collect())),
+            ParseState::Numeric => tokens.push(Token::Numeric(pending.drain(..).collect())),
+            _ => (),
+        }
+        let separator = if cast_period { '.' } else { c };
+        tokens.push(Token::Separator(separator.to_string()));
+        state = ParseState::Empty;
+    }
+
+    // Flush a run that reached the end of the input without a separator.
+    match state {
+        ParseState::Alpha => tokens.push(Token::Alpha(pending.into_iter().collect())),
+        ParseState::Numeric => tokens.push(Token::Numeric(pending.into_iter().collect())),
+        ParseState::Empty => (),
+        _ => panic!("Invalid parse state during decimal_split()"),
+    }
+
+    tokens
+}
+
+
+/// Splits `parse_string` into its complete sequence of tokens.
+///
+/// Runs a `Tokenizer` over an owned copy of the input and collects
+/// everything it yields, in order.
+pub fn tokenize(parse_string: &str) -> Vec<Token> {
+    Tokenizer::new(String::from(parse_string)).collect()
+}
+

 pub fn parse(date: &str) -> Result<DateTime<Utc>, ParseError> {
-
    let current = SystemTime::now();
    let epoch = current.duration_since(UNIX_EPOCH).unwrap();

--- a/src/tests.rs
+++ b/src/tests.rs
@ -0,0 +1,50 @@
+use super::Token;
+use super::tokenize;
+
+// Asserts that `tokenize($string)` equals exactly the listed tokens, in
+// order. The first argument is the input; the remaining comma-separated
+// arguments are the expected `Token` values.
+macro_rules! t {
+    ($string: expr, $( $x: expr ),* ) => {
+        assert_eq!(
+            tokenize($string),
+            vec![$( $x, )*]
+        )
+    };
+}
+
+// Shorthand: expands to a `Token::Alpha` holding an owned copy of `$string`.
+macro_rules! a {
+    ($string:expr) => {
+        Token::Alpha($string.to_owned())
+    };
+}
+
+// Shorthand: expands to a `Token::Numeric` holding an owned copy of `$string`.
+macro_rules! n {
+    ($string:expr) => {
+        Token::Numeric($string.to_owned())
+    };
+}
+
+// Shorthand: expands to a `Token::Separator` holding an owned copy of `$string`.
+macro_rules! s {
+    ($string:expr) => {
+        Token::Separator($string.to_owned())
+    };
+}
+
+// Checks how `tokenize` groups letters, digits and separators for a
+// handful of date-like inputs.
+#[test]
+fn test_basic_tokenize() {
+    // A run that starts with letters is split at every `.`.
+    t!("Sep.2009.24",
+       a!("Sep"), s!("."), n!("2009"), s!("."), n!("24"));
+
+    // `;` is not part of any run, so it comes out as its own separator.
+    t!("Sep.2009;24",
+       a!("Sep"), s!("."), n!("2009"), s!(";"), n!("24"));
+
+    // Same shape as above, with `,` as the standalone separator.
+    t!("Sep.2009,24",
+       a!("Sep"), s!("."), n!("2009"), s!(","), n!("24"));
+
+    // Whitespace is reported as a single-space separator; the trailing `.`
+    // after `Sep` and the following `,` are separate tokens.
+    t!("24 Sep., 2009",
+       n!("24"), s!(" "), a!("Sep"), s!("."), s!(","), s!(" "), n!("2009"));
+
+    // Digits with a single `.` and no letters stay one numeric token.
+    t!("2009.24",
+       n!("2009.24"));
+
+    // More than one `.` forces the numeric run to be split.
+    t!("2009.24.09",
+       n!("2009"), s!("."), n!("24"), s!("."), n!("09"));
+}
--- a/tests/parse.rs
+++ b/tests/parse.rs
@ -2,15 +2,15 @@ extern crate chrono;
 extern crate dtparse;

 use chrono::DateTime;
-use chrono::Utc;
 use chrono::NaiveDate;
-use chrono::NaiveTime;
 use chrono::NaiveDateTime;
+use chrono::NaiveTime;
+use chrono::Utc;

 use dtparse::parse;

 macro_rules! ymd_test {
-    ($date: expr, $year: expr, $month: expr, $day: expr) => {
+    ($date:expr, $year:expr, $month:expr, $day:expr) => {
        let nd = NaiveDate::from_ymd($year, $month, $day);
        let nt = NaiveTime::from_hms(0, 0, 0);
        let dt = NaiveDateTime::new(nd, nt);
@ -26,4 +26,4 @@ macro_rules! ymd_test {
 #[test]
 fn test_basic() {
    ymd_test!("2014 January 19", 2014, 1, 19);
-}
+}