mirror of
				https://github.com/bspeice/dtparse
				synced 2025-10-31 09:30:35 -04:00 
			
		
		
		
	Add tokenization from Dateutil
This commit is contained in:
		
							
								
								
									
										215
									
								
								src/lib.rs
									
									
									
									
									
								
							
							
						
						
									
										215
									
								
								src/lib.rs
									
									
									
									
									
								
							| @ -1,14 +1,221 @@ | ||||
| extern crate chrono; | ||||
|  | ||||
| use chrono::NaiveDateTime; | ||||
| use chrono::DateTime; | ||||
| use chrono::Utc; | ||||
| use chrono::NaiveDateTime; | ||||
| use chrono::ParseError; | ||||
| use std::time::UNIX_EPOCH; | ||||
| use chrono::Utc; | ||||
| use std::time::SystemTime; | ||||
| use std::time::UNIX_EPOCH; | ||||
| use std::vec::Vec; | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod tests; | ||||
|  | ||||
/// A single lexical unit produced while tokenizing a date string.
///
/// Every variant carries the text that was consumed for that token.
#[derive(Debug, PartialEq)]
pub enum Token {
    /// Text the tokenizer classified as a word, e.g. `Sep`.
    Alpha(String),
    /// Text the tokenizer classified as a number, e.g. `2009` or `2009.24`.
    Numeric(String),
    /// A single character that is neither of the above; the tokenizer
    /// reports any whitespace character as one space.
    Separator(String),
}
|  | ||||
| pub struct Tokenizer { | ||||
|     token_stack: Vec<Token>, | ||||
|     parse_string: String, | ||||
| } | ||||
|  | ||||
/// State of the character-level scanner while a token is being assembled.
#[derive(Debug)]
enum ParseState {
    /// Nothing has been consumed for the current token yet.
    Empty,
    /// Inside a run of alphabetic characters.
    Alpha,
    /// A run that started alphabetic and has since met a `.`.
    AlphaDecimal,
    /// Inside a run of numeric characters.
    Numeric,
    /// A run that started numeric and has since met a `.` or `,`.
    NumericDecimal,
}
|  | ||||
| impl Tokenizer { | ||||
|  | ||||
|     fn new(parse_string: String) -> Self { | ||||
|         Tokenizer { | ||||
|             token_stack: Vec::new(), | ||||
|             parse_string: parse_string.chars().rev().collect(), | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl Iterator for Tokenizer { | ||||
|     type Item = Token; | ||||
|  | ||||
|     fn next(&mut self) -> Option<Self::Item> { | ||||
|         if !self.token_stack.is_empty() { return Some(self.token_stack.pop().unwrap()) }; | ||||
|         if self.parse_string.is_empty() { return None }; | ||||
|  | ||||
|         let mut char_stack: Vec<char> = Vec::new(); | ||||
|         let mut seen_letters = false; | ||||
|         let mut state = ParseState::Empty; | ||||
|  | ||||
|         while let Some(next) = self.parse_string.pop() { | ||||
|             println!("{} - {:?}", next, state); | ||||
|             match state { | ||||
|             ParseState::Empty => { | ||||
|                 if next.is_numeric() { | ||||
|                     state = ParseState::Numeric; | ||||
|                     char_stack.push(next); | ||||
|                 } else if next.is_alphabetic() { | ||||
|                     state = ParseState::Alpha; | ||||
|                     seen_letters = true; | ||||
|                     char_stack.push(next); | ||||
|                 } else if next.is_whitespace() { | ||||
|                     char_stack.push(' '); | ||||
|                     break; | ||||
|                 } else { | ||||
|                     char_stack.push(next); | ||||
|                     break; | ||||
|                 } | ||||
|             }, | ||||
|             ParseState::Alpha => { | ||||
|                 if next.is_alphabetic() { | ||||
|                     char_stack.push(next); | ||||
|                 } else if next == '.' { | ||||
|                     state = ParseState::AlphaDecimal; | ||||
|                     char_stack.push(next); | ||||
|                 } else { | ||||
|                     // We don't recognize the character, so push it back | ||||
|                     // to be handled later. | ||||
|                     self.parse_string.push(next); | ||||
|                     break; | ||||
|                 } | ||||
|             }, | ||||
|             ParseState::AlphaDecimal => { | ||||
|                 if next == '.' || next.is_alphabetic() { | ||||
|                     char_stack.push(next); | ||||
|                 } else if next.is_numeric() && char_stack.last().unwrap().clone() == '.' { | ||||
|                     char_stack.push(next); | ||||
|                     state = ParseState::NumericDecimal; | ||||
|                 } else { | ||||
|                     self.parse_string.push(next); | ||||
|                     break; | ||||
|                 } | ||||
|             }, | ||||
|             ParseState::Numeric => { | ||||
|                 if next.is_numeric() { | ||||
|                     char_stack.push(next); | ||||
|                 } else if next == '.' || (next == ',' && char_stack.len() >= 2) { | ||||
|                     char_stack.push(next); | ||||
|                     state = ParseState::NumericDecimal; | ||||
|                 } else { | ||||
|                     // We don't recognize the character, so push it back | ||||
|                     // to be handled later | ||||
|                     self.parse_string.push(next); | ||||
|                     break; | ||||
|                 } | ||||
|             }, | ||||
|             ParseState::NumericDecimal => { | ||||
|                 if next == '.' || next.is_numeric() { | ||||
|                     char_stack.push(next); | ||||
|                 } else if next.is_alphabetic() && char_stack.last().unwrap().clone() == '.' { | ||||
|                     char_stack.push(next); | ||||
|                     state = ParseState::AlphaDecimal; | ||||
|                 } else { | ||||
|                     self.parse_string.push(next); | ||||
|                     break; | ||||
|                 } | ||||
|             } | ||||
|         }}; | ||||
|  | ||||
|         // I like Python's version of this much better: | ||||
|         // needs_split = seen_letters or char_stack.count('.') > 1 or char_stack[-1] in '.,' | ||||
|         let dot_count = char_stack.iter().fold(0, |count, character| count + (if character == &'.' {1} else {0})); | ||||
|         let needs_split = seen_letters || dot_count > 1 || char_stack.last().unwrap() == &'.' || char_stack.last().unwrap() == &','; | ||||
|         let final_string = char_stack.into_iter().collect(); | ||||
|  | ||||
|         let mut tokens = match state { | ||||
|             ParseState::Empty => vec![Token::Separator(final_string)], | ||||
|             ParseState::Alpha => vec![Token::Alpha(final_string)], | ||||
|             ParseState::Numeric => vec![Token::Numeric(final_string)], | ||||
|             ParseState::AlphaDecimal => { | ||||
|                 if needs_split { | ||||
|                    decimal_split(&final_string, false) | ||||
|                 } else { | ||||
|                     vec![Token::Alpha(final_string)] | ||||
|                 } | ||||
|             } | ||||
|             ParseState::NumericDecimal => { | ||||
|                 if needs_split { | ||||
|                     decimal_split(&final_string, dot_count == 0) | ||||
|                 } else { | ||||
|                     vec![Token::Numeric(final_string)] | ||||
|                 } | ||||
|             } | ||||
|         }.into_iter().rev().collect(); | ||||
|  | ||||
|         self.token_stack.append(&mut tokens); | ||||
|         // UNWRAP: Previous match guaranteed that at least one token was added | ||||
|         Some(self.token_stack.pop().unwrap()) | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
| fn decimal_split(characters: &str, cast_period: bool) -> Vec<Token> { | ||||
|     let mut token_stack: Vec<Token> = Vec::new(); | ||||
|     let mut char_stack: Vec<char> = Vec::new(); | ||||
|     let mut state = ParseState::Empty; | ||||
|  | ||||
|     for c in characters.chars() { match state { | ||||
|         ParseState::Empty => { | ||||
|             if c.is_alphabetic() { | ||||
|                 char_stack.push(c); | ||||
|                 state = ParseState::Alpha; | ||||
|             } else if c.is_numeric() { | ||||
|                 char_stack.push(c); | ||||
|                 state = ParseState::Numeric; | ||||
|             } else { | ||||
|                 let character = if cast_period { '.' } else { c }; | ||||
|                 token_stack.push(Token::Separator(character.to_string())); | ||||
|             } | ||||
|         }, | ||||
|         ParseState::Alpha => { | ||||
|             if c.is_alphabetic() { | ||||
|                 char_stack.push(c); | ||||
|             } else { | ||||
|                 token_stack.push(Token::Alpha(char_stack.iter().collect())); | ||||
|                 char_stack.clear(); | ||||
|                 let character = if cast_period { '.' } else { c }; | ||||
|                 token_stack.push(Token::Separator(character.to_string())); | ||||
|                 state = ParseState::Empty; | ||||
|             } | ||||
|         }, | ||||
|         ParseState::Numeric => { | ||||
|             if c.is_numeric() { | ||||
|                 char_stack.push(c); | ||||
|             } else { | ||||
|                 token_stack.push(Token::Numeric(char_stack.iter().collect())); | ||||
|                 char_stack.clear(); | ||||
|                 let character = if cast_period { '.' } else { c }; | ||||
|                 token_stack.push(Token::Separator(character.to_string())); | ||||
|                 state = ParseState::Empty; | ||||
|             } | ||||
|         }, | ||||
|         _ => panic!("Invalid parse state during decimal_split()") | ||||
|     }} | ||||
|  | ||||
|     match state { | ||||
|         ParseState::Alpha => token_stack.push(Token::Alpha(char_stack.iter().collect())), | ||||
|         ParseState::Numeric => token_stack.push(Token::Numeric(char_stack.iter().collect())), | ||||
|         ParseState::Empty => (), | ||||
|         _ => panic!("Invalid parse state during decimal_split()") | ||||
|     } | ||||
|  | ||||
|     token_stack | ||||
| } | ||||
|  | ||||
|  | ||||
| pub fn tokenize(parse_string: &str) -> Vec<Token> { | ||||
|     let tokenizer = Tokenizer::new(parse_string.to_owned()); | ||||
|     tokenizer.collect() | ||||
| } | ||||
|  | ||||
|  | ||||
| pub fn parse(date: &str) -> Result<DateTime<Utc>, ParseError> { | ||||
|  | ||||
|     let current = SystemTime::now(); | ||||
|     let epoch = current.duration_since(UNIX_EPOCH).unwrap(); | ||||
|  | ||||
|  | ||||
							
								
								
									
										50
									
								
								src/tests.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										50
									
								
								src/tests.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,50 @@ | ||||
| use super::Token; | ||||
| use super::tokenize; | ||||
|  | ||||
// Asserts that tokenizing the first argument yields exactly the tokens
// listed after it, in order.
macro_rules! t {
    ($input:expr, $( $expected:expr ),* ) => {
        assert_eq!(tokenize($input), vec![$( $expected ),*])
    };
}
|  | ||||
// Shorthand for an expected `Token::Alpha` built from a string literal.
macro_rules! a {
    ($text:expr) => {
        Token::Alpha(String::from($text))
    };
}
|  | ||||
// Shorthand for an expected `Token::Numeric` built from a string literal.
macro_rules! n {
    ($text:expr) => {
        Token::Numeric(String::from($text))
    };
}
|  | ||||
// Shorthand for an expected `Token::Separator` built from a string literal.
macro_rules! s {
    ($text:expr) => {
        Token::Separator(String::from($text))
    };
}
|  | ||||
#[test]
fn test_basic_tokenize() {
    // A word joined to numbers by dots is split at every dot.
    t!("Sep.2009.24", a!("Sep"), s!("."), n!("2009"), s!("."), n!("24"));

    // Other punctuation after the number becomes its own separator.
    t!("Sep.2009;24", a!("Sep"), s!("."), n!("2009"), s!(";"), n!("24"));
    t!("Sep.2009,24", a!("Sep"), s!("."), n!("2009"), s!(","), n!("24"));

    // Whitespace, a trailing dot and a comma are each reported separately.
    t!(
        "24 Sep., 2009",
        n!("24"),
        s!(" "),
        a!("Sep"),
        s!("."),
        s!(","),
        s!(" "),
        n!("2009")
    );

    // A single decimal point keeps the number in one token...
    t!("2009.24", n!("2009.24"));

    // ...while more than one dot forces a split.
    t!("2009.24.09", n!("2009"), s!("."), n!("24"), s!("."), n!("09"));
}
| @ -2,15 +2,15 @@ extern crate chrono; | ||||
| extern crate dtparse; | ||||
|  | ||||
| use chrono::DateTime; | ||||
| use chrono::Utc; | ||||
| use chrono::NaiveDate; | ||||
| use chrono::NaiveTime; | ||||
| use chrono::NaiveDateTime; | ||||
| use chrono::NaiveTime; | ||||
| use chrono::Utc; | ||||
|  | ||||
| use dtparse::parse; | ||||
|  | ||||
| macro_rules! ymd_test { | ||||
|     ($date: expr, $year: expr, $month: expr, $day: expr) => { | ||||
|     ($date:expr, $year:expr, $month:expr, $day:expr) => { | ||||
|         let nd = NaiveDate::from_ymd($year, $month, $day); | ||||
|         let nt = NaiveTime::from_hms(0, 0, 0); | ||||
|         let dt = NaiveDateTime::new(nd, nt); | ||||
| @ -26,4 +26,4 @@ macro_rules! ymd_test { | ||||
| #[test] | ||||
| fn test_basic() { | ||||
|     ymd_test!("2014 January 19", 2014, 1, 19); | ||||
| } | ||||
| } | ||||
|  | ||||
		Reference in New Issue
	
	Block a user
	 Bradlee Speice
					Bradlee Speice