diff --git a/src/lib.rs b/src/lib.rs index f8a52ea..b5c4cab 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -27,11 +27,14 @@ use std::num::ParseIntError; use std::str::FromStr; use std::vec::Vec; +mod tokenize; mod weekday; #[cfg(test)] mod tests; +use tokenize::ParseState; +use tokenize::Tokenizer; use weekday::day_of_week; use weekday::DayOfWeek; @@ -89,210 +92,6 @@ impl From for ParseError { type ParseResult = Result; type ParseIResult = Result; -pub struct Tokenizer { - token_stack: Vec, - parse_string: String, -} - -#[derive(Debug, PartialEq)] -enum ParseState { - Empty, - Alpha, - AlphaDecimal, - Numeric, - NumericDecimal, -} - -impl Tokenizer { - fn new(parse_string: String) -> Self { - Tokenizer { - token_stack: Vec::new(), - parse_string: parse_string.chars().rev().collect(), - } - } -} - -impl Iterator for Tokenizer { - type Item = String; - - fn next(&mut self) -> Option { - if !self.token_stack.is_empty() { - return Some(self.token_stack.pop().unwrap()); - }; - if self.parse_string.is_empty() { - return None; - }; - - let mut char_stack: Vec = Vec::new(); - let mut seen_letters = false; - let mut state = ParseState::Empty; - - while let Some(next) = self.parse_string.pop() { - match state { - ParseState::Empty => { - if next.is_numeric() { - state = ParseState::Numeric; - char_stack.push(next); - } else if next.is_alphabetic() { - state = ParseState::Alpha; - seen_letters = true; - char_stack.push(next); - } else if next.is_whitespace() { - char_stack.push(' '); - break; - } else { - char_stack.push(next); - break; - } - } - ParseState::Alpha => { - if next.is_alphabetic() { - char_stack.push(next); - } else if next == '.' { - state = ParseState::AlphaDecimal; - char_stack.push(next); - } else { - // We don't recognize the character, so push it back - // to be handled later. - self.parse_string.push(next); - break; - } - } - ParseState::AlphaDecimal => { - if next == '.' || next.is_alphabetic() { - char_stack.push(next); - } else if next.is_numeric() && char_stack.last().unwrap().clone() == '.' { - char_stack.push(next); - state = ParseState::NumericDecimal; - } else { - self.parse_string.push(next); - break; - } - } - ParseState::Numeric => { - if next.is_numeric() { - char_stack.push(next); - } else if next == '.' || (next == ',' && char_stack.len() >= 2) { - char_stack.push(next); - state = ParseState::NumericDecimal; - } else { - // We don't recognize the character, so push it back - // to be handled later - self.parse_string.push(next); - break; - } - } - ParseState::NumericDecimal => { - if next == '.' || next.is_numeric() { - char_stack.push(next); - } else if next.is_alphabetic() && char_stack.last().unwrap().clone() == '.' { - char_stack.push(next); - state = ParseState::AlphaDecimal; - } else { - self.parse_string.push(next); - break; - } - } - } - } - - // I like Python's version of this much better: - // needs_split = seen_letters or char_stack.count('.') > 1 or char_stack[-1] in '.,' - let dot_count = char_stack.iter().fold(0, |count, character| { - count + (if character == &'.' { 1 } else { 0 }) - }); - let needs_split = seen_letters || dot_count > 1 || char_stack.last().unwrap() == &'.' - || char_stack.last().unwrap() == &','; - let final_string: String = char_stack.into_iter().collect(); - - let mut tokens = match state { - ParseState::Empty => vec![final_string], - ParseState::Alpha => vec![final_string], - ParseState::Numeric => vec![final_string], - ParseState::AlphaDecimal => { - if needs_split { - decimal_split(&final_string, false) - } else { - vec![final_string] - } - } - ParseState::NumericDecimal => { - if needs_split { - decimal_split(&final_string, dot_count == 0) - } else { - vec![final_string] - } - } - }.into_iter() - .rev() - .collect(); - - self.token_stack.append(&mut tokens); - // UNWRAP: Previous match guaranteed that at least one token was added - let token = self.token_stack.pop().unwrap(); - if state == ParseState::NumericDecimal && !token.contains(".") { - Some(token.replace(",", ".")) - } else { - Some(token) - } - } -} - -fn decimal_split(characters: &str, cast_period: bool) -> Vec { - let mut token_stack: Vec = Vec::new(); - let mut char_stack: Vec = Vec::new(); - let mut state = ParseState::Empty; - - for c in characters.chars() { - match state { - ParseState::Empty => { - if c.is_alphabetic() { - char_stack.push(c); - state = ParseState::Alpha; - } else if c.is_numeric() { - char_stack.push(c); - state = ParseState::Numeric; - } else { - let character = if cast_period { '.' } else { c }; - token_stack.push(character.to_string()); - } - } - ParseState::Alpha => { - if c.is_alphabetic() { - char_stack.push(c); - } else { - token_stack.push(char_stack.iter().collect()); - char_stack.clear(); - let character = if cast_period { '.' } else { c }; - token_stack.push(character.to_string()); - state = ParseState::Empty; - } - } - ParseState::Numeric => { - if c.is_numeric() { - char_stack.push(c); - } else { - token_stack.push(char_stack.iter().collect()); - char_stack.clear(); - let character = if cast_period { '.' } else { c }; - token_stack.push(character.to_string()); - state = ParseState::Empty; - } - } - _ => panic!("Invalid parse state during decimal_split()"), - } - } - - match state { - ParseState::Alpha => token_stack.push(char_stack.iter().collect()), - ParseState::Numeric => token_stack.push(char_stack.iter().collect()), - ParseState::Empty => (), - _ => panic!("Invalid parse state during decimal_split()"), - } - - token_stack -} - pub fn tokenize(parse_string: &str) -> Vec { let tokenizer = Tokenizer::new(parse_string.to_owned()); tokenizer.collect() diff --git a/src/tokenize.rs b/src/tokenize.rs new file mode 100644 index 0000000..2a44d25 --- /dev/null +++ b/src/tokenize.rs @@ -0,0 +1,203 @@ +pub(crate) struct Tokenizer { + token_stack: Vec, + parse_string: String, +} + +#[derive(Debug, PartialEq)] +pub(crate) enum ParseState { + Empty, + Alpha, + AlphaDecimal, + Numeric, + NumericDecimal, +} + +impl Tokenizer { + pub(crate) fn new(parse_string: String) -> Self { + Tokenizer { + token_stack: Vec::new(), + parse_string: parse_string.chars().rev().collect(), + } + } +} + +impl Iterator for Tokenizer { + type Item = String; + + fn next(&mut self) -> Option { + if !self.token_stack.is_empty() { + return Some(self.token_stack.pop().unwrap()); + }; + if self.parse_string.is_empty() { + return None; + }; + + let mut char_stack: Vec = Vec::new(); + let mut seen_letters = false; + let mut state = ParseState::Empty; + + while let Some(next) = self.parse_string.pop() { + match state { + ParseState::Empty => { + if next.is_numeric() { + state = ParseState::Numeric; + char_stack.push(next); + } else if next.is_alphabetic() { + state = ParseState::Alpha; + seen_letters = true; + char_stack.push(next); + } else if next.is_whitespace() { + char_stack.push(' '); + break; + } else { + char_stack.push(next); + break; + } + } + ParseState::Alpha => { + if next.is_alphabetic() { + char_stack.push(next); + } else if next == '.' { + state = ParseState::AlphaDecimal; + char_stack.push(next); + } else { + // We don't recognize the character, so push it back + // to be handled later. + self.parse_string.push(next); + break; + } + } + ParseState::AlphaDecimal => { + if next == '.' || next.is_alphabetic() { + char_stack.push(next); + } else if next.is_numeric() && char_stack.last().unwrap().clone() == '.' { + char_stack.push(next); + state = ParseState::NumericDecimal; + } else { + self.parse_string.push(next); + break; + } + } + ParseState::Numeric => { + if next.is_numeric() { + char_stack.push(next); + } else if next == '.' || (next == ',' && char_stack.len() >= 2) { + char_stack.push(next); + state = ParseState::NumericDecimal; + } else { + // We don't recognize the character, so push it back + // to be handled later + self.parse_string.push(next); + break; + } + } + ParseState::NumericDecimal => { + if next == '.' || next.is_numeric() { + char_stack.push(next); + } else if next.is_alphabetic() && char_stack.last().unwrap().clone() == '.' { + char_stack.push(next); + state = ParseState::AlphaDecimal; + } else { + self.parse_string.push(next); + break; + } + } + } + } + + // I like Python's version of this much better: + // needs_split = seen_letters or char_stack.count('.') > 1 or char_stack[-1] in '.,' + let dot_count = char_stack.iter().fold(0, |count, character| { + count + (if character == &'.' { 1 } else { 0 }) + }); + let needs_split = seen_letters || dot_count > 1 || char_stack.last().unwrap() == &'.' + || char_stack.last().unwrap() == &','; + let final_string: String = char_stack.into_iter().collect(); + + let mut tokens = match state { + ParseState::Empty => vec![final_string], + ParseState::Alpha => vec![final_string], + ParseState::Numeric => vec![final_string], + ParseState::AlphaDecimal => { + if needs_split { + decimal_split(&final_string, false) + } else { + vec![final_string] + } + } + ParseState::NumericDecimal => { + if needs_split { + decimal_split(&final_string, dot_count == 0) + } else { + vec![final_string] + } + } + }.into_iter() + .rev() + .collect(); + + self.token_stack.append(&mut tokens); + // UNWRAP: Previous match guaranteed that at least one token was added + let token = self.token_stack.pop().unwrap(); + if state == ParseState::NumericDecimal && !token.contains(".") { + Some(token.replace(",", ".")) + } else { + Some(token) + } + } +} + +fn decimal_split(characters: &str, cast_period: bool) -> Vec { + let mut token_stack: Vec = Vec::new(); + let mut char_stack: Vec = Vec::new(); + let mut state = ParseState::Empty; + + for c in characters.chars() { + match state { + ParseState::Empty => { + if c.is_alphabetic() { + char_stack.push(c); + state = ParseState::Alpha; + } else if c.is_numeric() { + char_stack.push(c); + state = ParseState::Numeric; + } else { + let character = if cast_period { '.' } else { c }; + token_stack.push(character.to_string()); + } + } + ParseState::Alpha => { + if c.is_alphabetic() { + char_stack.push(c); + } else { + token_stack.push(char_stack.iter().collect()); + char_stack.clear(); + let character = if cast_period { '.' } else { c }; + token_stack.push(character.to_string()); + state = ParseState::Empty; + } + } + ParseState::Numeric => { + if c.is_numeric() { + char_stack.push(c); + } else { + token_stack.push(char_stack.iter().collect()); + char_stack.clear(); + let character = if cast_period { '.' } else { c }; + token_stack.push(character.to_string()); + state = ParseState::Empty; + } + } + _ => panic!("Invalid parse state during decimal_split()"), + } + } + + match state { + ParseState::Alpha => token_stack.push(char_stack.iter().collect()), + ParseState::Numeric => token_stack.push(char_stack.iter().collect()), + ParseState::Empty => (), + _ => panic!("Invalid parse state during decimal_split()"), + } + + token_stack +}