/// Lexer that breaks a date/time string into word, number, whitespace and
/// punctuation tokens.
///
/// Tokens are produced through the `Iterator` implementation for this type.
pub(crate) struct Tokenizer {
    /// Tokens that were split off a previously read token and are waiting to
    /// be returned, stored in the order they must be emitted.
    token_stack: Vec<String>,
    // TODO: Should this be more generic? io::Read for example?
    /// Input that has not been consumed yet. It is stored reversed so that
    /// `String::pop` yields characters in their original order and
    /// `String::push` can hand a character back.
    parse_string: String,
}

/// State of the tokenizer while it accumulates a single token.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum ParseState {
    /// No character of the current token has been classified yet.
    Empty,
    /// The token started with an alphabetic character and has only been
    /// extended with alphabetic characters so far.
    Alpha,
    /// Entered when an alphabetic token meets a `.`, or when a letter directly
    /// follows the `.` of a numeric token.
    AlphaDecimal,
    /// The token started with a numeric character and has only been extended
    /// with numeric characters so far.
    Numeric,
    /// Entered when a numeric token meets a `.` (or a `,` after at least two
    /// digits), or when a digit directly follows the `.` of an alphabetic token.
    NumericDecimal,
}

impl Tokenizer {
|
2018-07-07 23:37:02 -04:00
|
|
|
pub(crate) fn new(parse_string: &str) -> Self {
|
2018-06-29 23:04:10 -04:00
|
|
|
Tokenizer {
|
2018-07-07 23:37:02 -04:00
|
|
|
token_stack: vec![],
|
2018-06-29 23:04:10 -04:00
|
|
|
parse_string: parse_string.chars().rev().collect(),
|
|
|
|
}
|
|
|
|
}
|
2018-07-07 23:37:02 -04:00
|
|
|
|
|
|
|
fn isword(&self, c: char) -> bool {
|
|
|
|
c.is_alphabetic()
|
|
|
|
}
|
|
|
|
|
|
|
|
fn isnum(&self, c: char) -> bool {
|
|
|
|
c.is_numeric()
|
|
|
|
}
|
|
|
|
|
|
|
|
fn isspace(&self, c: char) -> bool {
|
|
|
|
c.is_whitespace()
|
|
|
|
}
|
|
|
|
|
|
|
|
fn decimal_split(&self, s: &str) -> Vec<String> {
|
|
|
|
// Handles the same thing as Python's re.split()
|
|
|
|
let mut tokens: Vec<String> = vec!["".to_owned()];
|
|
|
|
|
|
|
|
for c in s.chars() {
|
|
|
|
if c == '.' || c == ',' {
|
|
|
|
tokens.push(c.to_string());
|
|
|
|
tokens.push("".to_owned());
|
|
|
|
} else {
|
|
|
|
// UNWRAP: Initial setup guarantees we always have an item
|
|
|
|
let mut t = tokens.pop().unwrap();
|
|
|
|
t.push(c);
|
|
|
|
tokens.push(t);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO: Do I really have to use &String instead of &str?
|
|
|
|
if tokens.last() == Some(&"".to_owned()) {
|
|
|
|
tokens.pop();
|
|
|
|
}
|
|
|
|
|
|
|
|
tokens
|
|
|
|
}
|
2018-06-29 23:04:10 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
impl Iterator for Tokenizer {
|
|
|
|
type Item = String;
|
|
|
|
|
|
|
|
fn next(&mut self) -> Option<Self::Item> {
|
|
|
|
if !self.token_stack.is_empty() {
|
2018-07-07 23:37:02 -04:00
|
|
|
return Some(self.token_stack.remove(0));
|
|
|
|
}
|
|
|
|
|
|
|
|
let mut seenletters = false;
|
|
|
|
let mut token: Option<String> = None;
|
2018-06-29 23:04:10 -04:00
|
|
|
let mut state = ParseState::Empty;
|
|
|
|
|
2018-07-07 23:37:02 -04:00
|
|
|
while !self.parse_string.is_empty() {
|
|
|
|
// Dateutil uses a separate `charstack` to manage the incoming stream.
|
|
|
|
// Because parse_string can have things pushed back onto it, we skip
|
|
|
|
// a couple of steps related to the `charstack`.
|
|
|
|
|
|
|
|
// UNWRAP: Just checked that parse_string isn't empty
|
|
|
|
let nextchar = self.parse_string.pop().unwrap();
|
|
|
|
|
2018-06-29 23:04:10 -04:00
|
|
|
match state {
|
|
|
|
ParseState::Empty => {
|
2018-07-07 23:37:02 -04:00
|
|
|
token = Some(nextchar.to_string());
|
|
|
|
if self.isword(nextchar) {
|
2018-06-29 23:04:10 -04:00
|
|
|
state = ParseState::Alpha;
|
2018-07-07 23:37:02 -04:00
|
|
|
} else if self.isnum(nextchar) {
|
|
|
|
state = ParseState::Numeric;
|
|
|
|
} else if self.isspace(nextchar) {
|
|
|
|
token = Some(" ".to_owned());
|
2018-06-29 23:04:10 -04:00
|
|
|
break;
|
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
2019-11-13 23:18:37 -05:00
|
|
|
}
|
2018-06-29 23:04:10 -04:00
|
|
|
ParseState::Alpha => {
|
2018-07-07 23:37:02 -04:00
|
|
|
seenletters = true;
|
|
|
|
if self.isword(nextchar) {
|
|
|
|
// UNWRAP: Because we're in non-empty parse state, we're guaranteed to have a token
|
|
|
|
token.as_mut().unwrap().push(nextchar);
|
|
|
|
} else if nextchar == '.' {
|
|
|
|
token.as_mut().unwrap().push(nextchar);
|
2018-06-29 23:04:10 -04:00
|
|
|
state = ParseState::AlphaDecimal;
|
|
|
|
} else {
|
2018-07-07 23:37:02 -04:00
|
|
|
self.parse_string.push(nextchar);
|
2018-06-29 23:04:10 -04:00
|
|
|
break;
|
|
|
|
}
|
2019-11-13 23:18:37 -05:00
|
|
|
}
|
2018-07-07 23:37:02 -04:00
|
|
|
ParseState::Numeric => {
|
|
|
|
if self.isnum(nextchar) {
|
|
|
|
// UNWRAP: Because we're in non-empty parse state, we're guaranteed to have a token
|
|
|
|
token.as_mut().unwrap().push(nextchar);
|
2019-11-13 23:18:37 -05:00
|
|
|
} else if nextchar == '.'
|
|
|
|
|| (nextchar == ',' && token.as_ref().unwrap().len() >= 2)
|
|
|
|
{
|
2018-07-07 23:37:02 -04:00
|
|
|
token.as_mut().unwrap().push(nextchar);
|
2018-06-29 23:04:10 -04:00
|
|
|
state = ParseState::NumericDecimal;
|
|
|
|
} else {
|
2018-07-07 23:37:02 -04:00
|
|
|
self.parse_string.push(nextchar);
|
2018-06-29 23:04:10 -04:00
|
|
|
break;
|
|
|
|
}
|
2019-11-13 23:18:37 -05:00
|
|
|
}
|
2018-07-07 23:37:02 -04:00
|
|
|
ParseState::AlphaDecimal => {
|
|
|
|
seenletters = true;
|
|
|
|
if nextchar == '.' || self.isword(nextchar) {
|
|
|
|
// UNWRAP: Because we're in non-empty parse state, we're guaranteed to have a token
|
|
|
|
token.as_mut().unwrap().push(nextchar);
|
2018-07-18 20:34:14 -04:00
|
|
|
} else if self.isnum(nextchar) && token.as_ref().unwrap().ends_with('.') {
|
2018-07-07 23:37:02 -04:00
|
|
|
token.as_mut().unwrap().push(nextchar);
|
2018-06-29 23:04:10 -04:00
|
|
|
state = ParseState::NumericDecimal;
|
|
|
|
} else {
|
2018-07-07 23:37:02 -04:00
|
|
|
self.parse_string.push(nextchar);
|
2018-06-29 23:04:10 -04:00
|
|
|
break;
|
|
|
|
}
|
2019-11-13 23:18:37 -05:00
|
|
|
}
|
2018-06-29 23:04:10 -04:00
|
|
|
ParseState::NumericDecimal => {
|
2018-07-07 23:37:02 -04:00
|
|
|
if nextchar == '.' || self.isnum(nextchar) {
|
|
|
|
// UNWRAP: Because we're in non-empty parse state, we're guaranteed to have a token
|
|
|
|
token.as_mut().unwrap().push(nextchar);
|
2018-07-18 20:34:14 -04:00
|
|
|
} else if self.isword(nextchar) && token.as_ref().unwrap().ends_with('.') {
|
2018-07-07 23:37:02 -04:00
|
|
|
token.as_mut().unwrap().push(nextchar);
|
2018-06-29 23:04:10 -04:00
|
|
|
state = ParseState::AlphaDecimal;
|
|
|
|
} else {
|
2018-07-07 23:37:02 -04:00
|
|
|
self.parse_string.push(nextchar);
|
2018-06-29 23:04:10 -04:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-07-07 23:37:02 -04:00
|
|
|
// Python uses the state to short-circuit and make sure it doesn't run into issues with None
|
|
|
|
// We do something slightly different to express the same logic
|
|
|
|
if state == ParseState::AlphaDecimal || state == ParseState::NumericDecimal {
|
|
|
|
// UNWRAP: The state check guarantees that we have a value
|
2019-11-13 23:18:37 -05:00
|
|
|
let dot_count = token
|
|
|
|
.as_ref()
|
|
|
|
.unwrap()
|
|
|
|
.chars()
|
|
|
|
.filter(|c| *c == '.')
|
|
|
|
.count();
|
2018-07-07 23:37:02 -04:00
|
|
|
let last_char = token.as_ref().unwrap().chars().last();
|
|
|
|
let last_splittable = last_char == Some('.') || last_char == Some(',');
|
2019-11-13 23:18:37 -05:00
|
|
|
|
2018-07-07 23:37:02 -04:00
|
|
|
if seenletters || dot_count > 1 || last_splittable {
|
|
|
|
let mut l = self.decimal_split(token.as_ref().unwrap());
|
|
|
|
let remaining = l.split_off(1);
|
2019-11-13 23:18:37 -05:00
|
|
|
|
2018-07-07 23:37:02 -04:00
|
|
|
token = Some(l[0].clone());
|
|
|
|
for t in remaining {
|
|
|
|
self.token_stack.push(t);
|
2018-06-29 23:04:10 -04:00
|
|
|
}
|
|
|
|
}
|
2019-11-13 23:18:37 -05:00
|
|
|
|
2018-07-07 23:37:02 -04:00
|
|
|
if state == ParseState::NumericDecimal && dot_count == 0 {
|
|
|
|
token = Some(token.unwrap().replace(',', "."));
|
2018-06-29 23:04:10 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-07-07 23:37:02 -04:00
|
|
|
token
|
2018-06-29 23:04:10 -04:00
|
|
|
}
|
|
|
|
}
|
2018-07-03 01:02:27 -04:00
|
|
|
|
|
|
|
#[cfg(test)]
mod tests {
    // Import from the parent module so the path resolves regardless of what
    // the crate root re-exports.
    use super::Tokenizer;

    /// Words, whitespace, numbers and a trailing separator each become their
    /// own token.
    #[test]
    fn test_basic() {
        let tokens: Vec<String> = Tokenizer::new("September of 2003,").collect();
        assert_eq!(tokens, vec!["September", " ", "of", " ", "2003", ","]);
    }

    /// A number with a single decimal separator stays one token, and a `,`
    /// used as that separator is normalised to `.`.
    #[test]
    fn test_decimal_number() {
        let tokens: Vec<String> = Tokenizer::new("10.5").collect();
        assert_eq!(tokens, vec!["10.5"]);

        let tokens: Vec<String> = Tokenizer::new("10,5").collect();
        assert_eq!(tokens, vec!["10.5"]);
    }

    /// Letters mixed with dots are split around every dot.
    #[test]
    fn test_dotted_letters() {
        let tokens: Vec<String> = Tokenizer::new("a.m.").collect();
        assert_eq!(tokens, vec!["a", ".", "m", "."]);
    }
}