// Lexer for free-form date strings. The splitting rules follow the tokenizer in
// python-dateutil, which the original implementation cites as its model.

/// A lexical unit produced by [`Tokenizer`].
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Token {
    /// A run of alphabetic characters, e.g. `Sep`.
    Alpha(String),
    /// A run of numeric characters, optionally with one embedded decimal point,
    /// e.g. `2009` or `2009.24`.
    Numeric(String),
    /// A single character that is neither alphabetic nor numeric. Any whitespace
    /// character is reported as a plain space.
    Separator(String),
}

/// Iterator that lexes a string into [`Token`]s.
pub struct Tokenizer {
    /// Tokens produced by splitting a compound such as `Sep.2009` that have not
    /// been handed out yet. Stored in reverse so `pop` yields them in order.
    token_stack: Vec<Token>,
    /// Remaining input, stored reversed so `pop` yields the next character.
    parse_string: String,
}

/// What kind of token the lexer is currently accumulating.
#[derive(Clone, Copy, Debug, PartialEq)]
enum ParseState {
    /// Nothing accumulated yet.
    Empty,
    /// Letters only.
    Alpha,
    /// Started with letters and has since seen a `.`.
    AlphaDecimal,
    /// Numeric characters only.
    Numeric,
    /// Started with numeric characters and has since seen a `.` or `,`.
    NumericDecimal,
}

impl Tokenizer {
    /// Creates a tokenizer over `parse_string`.
    fn new(parse_string: String) -> Self {
        Tokenizer {
            token_stack: Vec::new(),
            parse_string: parse_string.chars().rev().collect(),
        }
    }
}

impl Iterator for Tokenizer {
    type Item = Token;

    /// Returns the next token, or `None` once the input is exhausted.
    fn next(&mut self) -> Option<Token> {
        // Tokens left over from an earlier split are handed out first.
        if let Some(token) = self.token_stack.pop() {
            return Some(token);
        }
        if self.parse_string.is_empty() {
            return None;
        }

        let mut char_stack: Vec<char> = Vec::new();
        let mut seen_letters = false;
        let mut state = ParseState::Empty;

        while let Some(next) = self.parse_string.pop() {
            match state {
                ParseState::Empty => {
                    if next.is_numeric() {
                        state = ParseState::Numeric;
                        char_stack.push(next);
                    } else if next.is_alphabetic() {
                        state = ParseState::Alpha;
                        seen_letters = true;
                        char_stack.push(next);
                    } else {
                        // Separators are always single-character tokens.
                        char_stack.push(if next.is_whitespace() { ' ' } else { next });
                        break;
                    }
                }
                ParseState::Alpha => {
                    if next.is_alphabetic() {
                        char_stack.push(next);
                    } else if next == '.' {
                        state = ParseState::AlphaDecimal;
                        char_stack.push(next);
                    } else {
                        // Not part of this token: push it back for the next call.
                        self.parse_string.push(next);
                        break;
                    }
                }
                ParseState::AlphaDecimal => {
                    if next == '.' || next.is_alphabetic() {
                        char_stack.push(next);
                    } else if next.is_numeric() && char_stack.last() == Some(&'.') {
                        char_stack.push(next);
                        state = ParseState::NumericDecimal;
                    } else {
                        self.parse_string.push(next);
                        break;
                    }
                }
                ParseState::Numeric => {
                    if next.is_numeric() {
                        char_stack.push(next);
                    } else if next == '.' || (next == ',' && char_stack.len() >= 2) {
                        char_stack.push(next);
                        state = ParseState::NumericDecimal;
                    } else {
                        self.parse_string.push(next);
                        break;
                    }
                }
                ParseState::NumericDecimal => {
                    if next == '.' || next.is_numeric() {
                        char_stack.push(next);
                    } else if next.is_alphabetic() && char_stack.last() == Some(&'.') {
                        char_stack.push(next);
                        // Letters can also enter a token here (e.g. `24.Sep`); record
                        // them so the compound is split below instead of being
                        // returned as one opaque token.
                        seen_letters = true;
                        state = ParseState::AlphaDecimal;
                    } else {
                        self.parse_string.push(next);
                        break;
                    }
                }
            }
        }

        let dot_count = char_stack.iter().filter(|&&c| c == '.').count();
        let ends_with_punctuation = match char_stack.last() {
            Some(&'.') | Some(&',') => true,
            _ => false,
        };
        let needs_split = seen_letters || dot_count > 1 || ends_with_punctuation;
        let text: String = char_stack.into_iter().collect();

        let mut tokens = match state {
            ParseState::Empty => vec![Token::Separator(text)],
            ParseState::Alpha => vec![Token::Alpha(text)],
            ParseState::Numeric => vec![Token::Numeric(text)],
            ParseState::AlphaDecimal => {
                if needs_split {
                    decimal_split(&text, false)
                } else {
                    vec![Token::Alpha(text)]
                }
            }
            ParseState::NumericDecimal => {
                if needs_split {
                    // Separators are reported exactly as written: the comma in
                    // `24, 2009` must stay a comma.
                    decimal_split(&text, false)
                } else if dot_count == 0 {
                    // A lone comma inside a number is a decimal comma (`10,5`).
                    vec![Token::Numeric(text.replace(',', "."))]
                } else {
                    vec![Token::Numeric(text)]
                }
            }
        };

        // `text` is never empty here, so at least one token was produced.
        tokens.reverse();
        self.token_stack.append(&mut tokens);
        self.token_stack.pop()
    }
}

/// Splits a compound such as `Sep.2009.24` into alternating word/number tokens
/// and single-character separators.
///
/// Any character that does not extend the current run closes that run and is
/// emitted as a [`Token::Separator`]. When `cast_period` is true every separator
/// is reported as `"."` regardless of the character that was actually read.
fn decimal_split(characters: &str, cast_period: bool) -> Vec<Token> {
    /// Kind of run currently being accumulated.
    #[derive(Clone, Copy)]
    enum Run {
        Idle,
        Alpha,
        Numeric,
    }

    /// Moves the accumulated run, if any, into `tokens`.
    fn flush(tokens: &mut Vec<Token>, run: &mut String, kind: Run) {
        let text = std::mem::replace(run, String::new());
        match kind {
            Run::Alpha => tokens.push(Token::Alpha(text)),
            Run::Numeric => tokens.push(Token::Numeric(text)),
            Run::Idle => {}
        }
    }

    let mut tokens: Vec<Token> = Vec::new();
    let mut run = String::new();
    let mut kind = Run::Idle;

    for c in characters.chars() {
        match kind {
            Run::Idle if c.is_alphabetic() => {
                run.push(c);
                kind = Run::Alpha;
            }
            Run::Idle if c.is_numeric() => {
                run.push(c);
                kind = Run::Numeric;
            }
            Run::Alpha if c.is_alphabetic() => run.push(c),
            Run::Numeric if c.is_numeric() => run.push(c),
            _ => {
                flush(&mut tokens, &mut run, kind);
                let separator = if cast_period { '.' } else { c };
                tokens.push(Token::Separator(separator.to_string()));
                kind = Run::Idle;
            }
        }
    }
    flush(&mut tokens, &mut run, kind);

    tokens
}

/// Lexes `parse_string` into its complete list of tokens.
pub fn tokenize(parse_string: &str) -> Vec<Token> {
    Tokenizer::new(parse_string.to_owned()).collect()
}

// NOTE(review): the same patch also touches `pub fn parse` and its chrono imports,
// but the hunk shows only the first two statements of that function, so it is not
// reconstructed here. The `tests/parse.rs` hunk only reorders imports, tightens
// macro spacing and adds a trailing newline.

#[cfg(test)]
mod tests {
    use super::tokenize;
    use super::Token;

    macro_rules! t {
        ($string:expr, $( $x:expr ),* ) => {
            assert_eq!(
                tokenize($string),
                vec![$( $x, )*]
            )
        };
    }

    macro_rules! a {
        ($string:expr) => {
            Token::Alpha($string.to_owned())
        };
    }

    macro_rules! n {
        ($string:expr) => {
            Token::Numeric($string.to_owned())
        };
    }

    macro_rules! s {
        ($string:expr) => {
            Token::Separator($string.to_owned())
        };
    }

    #[test]
    fn test_basic_tokenize() {
        t!("Sep.2009.24",
           a!("Sep"), s!("."), n!("2009"), s!("."), n!("24"));

        t!("Sep.2009;24",
           a!("Sep"), s!("."), n!("2009"), s!(";"), n!("24"));

        t!("Sep.2009,24",
           a!("Sep"), s!("."), n!("2009"), s!(","), n!("24"));

        t!("24 Sep., 2009",
           n!("24"), s!(" "), a!("Sep"), s!("."), s!(","), s!(" "), n!("2009"));

        t!("2009.24",
           n!("2009.24"));

        t!("2009.24.09",
           n!("2009"), s!("."), n!("24"), s!("."), n!("09"));
    }

    #[test]
    fn test_split_regressions() {
        // A trailing comma stays a comma.
        t!("Sep 24, 2009",
           a!("Sep"), s!(" "), n!("24"), s!(","), s!(" "), n!("2009"));

        // Letters that follow a number are still split out.
        t!("24.Sep",
           n!("24"), s!("."), a!("Sep"));

        // A decimal comma is normalised to a period.
        t!("10,5",
           n!("10.5"));
    }
}