From 9135962839f3b83924902c4306bfdc266c96224b Mon Sep 17 00:00:00 2001 From: Bradlee Speice Date: Tue, 3 Jul 2018 01:02:27 -0400 Subject: [PATCH] Lots of fixes, but it turns out tokenization is broken --- build_pycompat.py | 56 ++++++++++++++++++++++++-------- src/lib.rs | 83 ++++++++++++++++++++++++++--------------------- src/tokenize.rs | 12 +++++++ src/weekday.rs | 2 -- tests/pycompat.rs | 41 +++++++++++++++-------- 5 files changed, 127 insertions(+), 67 deletions(-) diff --git a/build_pycompat.py b/build_pycompat.py index acd48be..641494e 100644 --- a/build_pycompat.py +++ b/build_pycompat.py @@ -83,6 +83,9 @@ tests = { 'test_fuzzy_tzinfo': [ 'Today is 25 of September of 2003, exactly at 10:49:41 with timezone -03:00.' ], + 'test_fuzzy_tokens_tzinfo': [ + 'Today is 25 of September of 2003, exactly at 10:49:41 with timezone -03:00.' + ], 'test_parse_default_ignore': [ ], } @@ -158,6 +161,17 @@ def test_fuzzy_tzinfo(i, s): return TEST_FUZZY_TZINFO.format(i=i, d=d, s=s, offset=int(d.tzinfo._offset.total_seconds())) + +def test_fuzzy_tokens_tzinfo(i, s): + d, tokens = parse(s, fuzzy_with_tokens=True) + + r_tokens = ", ".join(list(map(lambda s: f'"{s}".to_owned()', tokens))) + + return TEST_FUZZY_TOKENS_TZINFO.format( + i=i, d=d, s=s, offset=int(d.tzinfo._offset.total_seconds()), + tokens=r_tokens + ) + # Here lies all the ugly junk. TEST_HEADER = ''' extern crate chrono; @@ -214,8 +228,8 @@ fn parse_and_assert( assert_eq!(pdt.hour, rs_parsed.0.hour(), "Hour mismatch for '{}'", s); assert_eq!(pdt.minute, rs_parsed.0.minute(), "Minute mismatch f'or' {}", s); assert_eq!(pdt.second, rs_parsed.0.second(), "Second mismatch for '{}'", s); - assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for {}", s); - assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for {}", s); + assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for '{}'", s); + assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for '{}'", s); } fn parse_and_assert_simple( @@ -223,14 +237,14 @@ fn parse_and_assert_simple( s: &str, ) { let rs_parsed = dtparse::parse(s).expect(&format!("Unable to parse date in Rust '{}'", s)); - assert_eq!(pdt.year, rs_parsed.0.year(), "Year mismatch for {}", s); - assert_eq!(pdt.month, rs_parsed.0.month(), "Month mismatch for {}", s); - assert_eq!(pdt.day, rs_parsed.0.day(), "Day mismatch for {}", s); - assert_eq!(pdt.hour, rs_parsed.0.hour(), "Hour mismatch for {}", s); - assert_eq!(pdt.minute, rs_parsed.0.minute(), "Minute mismatch for {}", s); - assert_eq!(pdt.second, rs_parsed.0.second(), "Second mismatch for {}", s); - assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for {}", s); - assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for {}", s); + assert_eq!(pdt.year, rs_parsed.0.year(), "Year mismatch for '{}'", s); + assert_eq!(pdt.month, rs_parsed.0.month(), "Month mismatch for '{}'", s); + assert_eq!(pdt.day, rs_parsed.0.day(), "Day mismatch for '{}'", s); + assert_eq!(pdt.hour, rs_parsed.0.hour(), "Hour mismatch for '{}'", s); + assert_eq!(pdt.minute, rs_parsed.0.minute(), "Minute mismatch for '{}'", s); + assert_eq!(pdt.second, rs_parsed.0.second(), "Second mismatch for '{}'", s); + assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for '{}'", s); + assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for '{}'", s); } fn parse_fuzzy_and_assert( @@ -264,9 +278,9 @@ fn parse_fuzzy_and_assert( assert_eq!(pdt.hour, rs_parsed.0.hour(), "Hour mismatch for '{}'", s); assert_eq!(pdt.minute, rs_parsed.0.minute(), "Minute mismatch f'or' {}", s); assert_eq!(pdt.second, rs_parsed.0.second(), "Second mismatch for '{}'", s); - assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for {}", s); - assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for {}", s); - assert_eq!(ptokens, rs_parsed.2, "Fuzzy mismatch for {}", s); + assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for '{}'", s); + assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for '{}'", s); + assert_eq!(ptokens, rs_parsed.2, "Tokens mismatch for '{}'", s); } macro_rules! rs_tzinfo_map { @@ -411,7 +425,7 @@ fn test_parse_default_ignore{i}() {{ TEST_FUZZY_TZINFO = ''' #[test] -fn test_fuzzy{i}() {{ +fn test_fuzzy_tzinfo{i}() {{ let info = ParserInfo::default(); let pdt = PyDateTime {{ year: {d.year}, month: {d.month}, day: {d.day}, @@ -422,6 +436,20 @@ fn test_fuzzy{i}() {{ None, false, HashMap::new()); }}\n''' +TEST_FUZZY_TOKENS_TZINFO = ''' +#[test] +fn test_fuzzy_tokens_tzinfo{i}() {{ + let info = ParserInfo::default(); + let pdt = PyDateTime {{ + year: {d.year}, month: {d.month}, day: {d.day}, + hour: {d.hour}, minute: {d.minute}, second: {d.second}, + micros: {d.microsecond}, tzo: Some({offset}) + }}; + let tokens = vec![{tokens}]; + parse_fuzzy_and_assert(pdt, Some(tokens), info, "{s}", None, None, true, true, + None, false, HashMap::new()); +}}\n''' + if __name__ == '__main__': main() \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 46266be..1590974 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,3 @@ -#![allow(dead_code)] -#![allow(unused)] - #[macro_use] extern crate lazy_static; @@ -8,7 +5,6 @@ extern crate chrono; extern crate num_traits; extern crate rust_decimal; -use chrono::DateTime; use chrono::Datelike; use chrono::Duration; use chrono::FixedOffset; @@ -17,7 +13,6 @@ use chrono::NaiveDate; use chrono::NaiveDateTime; use chrono::NaiveTime; use chrono::Timelike; -use chrono::Utc; use num_traits::cast::ToPrimitive; use rust_decimal::Decimal; use rust_decimal::Error as DecimalError; @@ -33,7 +28,6 @@ mod weekday; #[cfg(test)] mod tests; -use tokenize::ParseState; use tokenize::Tokenizer; use weekday::day_of_week; use weekday::DayOfWeek; @@ -59,13 +53,13 @@ pub enum ParseInternalError { } impl From for ParseInternalError { - fn from(err: DecimalError) -> Self { + fn from(_err: DecimalError) -> Self { ParseInternalError::InvalidDecimal } } impl From for ParseInternalError { - fn from(err: ParseIntError) -> Self { + fn from(_err: ParseIntError) -> Self { ParseInternalError::InvalidInteger } } @@ -294,11 +288,6 @@ struct YMD { ystridx: Option, } -enum YMDAppendEither { - Number(i32), - Stringy(String), -} - impl YMD { fn len(&self) -> usize { self._ymd.len() @@ -388,7 +377,7 @@ impl YMD { Ok(()) } } - None => Err(ParseInternalError::ValueError("Missing label.".to_owned())), + None => Ok(()), } } @@ -613,34 +602,34 @@ impl Parser { while i < len_l { let value_repr = l[i].clone(); - if let Ok(v) = Decimal::from_str(&value_repr) { + if let Ok(_v) = Decimal::from_str(&value_repr) { i = self.parse_numeric_token(&l, i, &self.info, &mut ymd, &mut res, fuzzy)?; } else if let Some(value) = self.info.get_weekday(&l[i]) { res.weekday = Some(value); } else if let Some(value) = self.info.get_month(&l[i]) { - ymd.append(value as i32, &l[i], Some(YMDLabel::Month)); + ymd.append(value as i32, &l[i], Some(YMDLabel::Month))?; if i + 1 < len_l { if l[i + 1] == "-" || l[i + 1] == "/" { // Jan-01[-99] let sep = &l[i + 1]; // TODO: This seems like a very unsafe unwrap - ymd.append(l[i + 2].parse::().unwrap(), &l[i + 2], None); + ymd.append(l[i + 2].parse::().unwrap(), &l[i + 2], None)?; if i + 3 < len_l && &l[i + 3] == sep { // Jan-01-99 - ymd.append(l[i + 4].parse::().unwrap(), &l[i + 4], None); + ymd.append(l[i + 4].parse::().unwrap(), &l[i + 4], None)?; i += 2; } i += 2; - } else if (i + 4 < len_l && l[i + 1] == l[i + 3] && l[i + 3] == " " - && self.info.get_pertain(&l[i + 2])) + } else if i + 4 < len_l && l[i + 1] == l[i + 3] && l[i + 3] == " " + && self.info.get_pertain(&l[i + 2]) { // Jan of 01 if let Some(value) = l[i + 4].parse::().ok() { let year = self.info.convertyear(value, false); - ymd.append(year, &l[i + 4], Some(YMDLabel::Year)); + ymd.append(year, &l[i + 4], Some(YMDLabel::Year))?; } i += 4; @@ -737,7 +726,7 @@ impl Parser { if !self.info.validate(&mut res) { Err(ParseError::InvalidParseResult(res)) } else if fuzzy_with_tokens { - let skipped_tokens = skipped_idxs.into_iter().map(|i| l[i].clone()).collect(); + let skipped_tokens = self.recombine_skipped(skipped_idxs, l); Ok((res, Some(skipped_tokens))) } else { Ok((res, None)) @@ -797,7 +786,7 @@ impl Parser { }; // TODO: Change month/day to u32 - let mut d = NaiveDate::from_ymd( + let d = NaiveDate::from_ymd( y, m, min(res.day.unwrap_or(default.day() as i32) as u32, days_in_month(y, m as i32)?) @@ -818,7 +807,7 @@ impl Parser { fn build_tzaware( &self, - dt: &NaiveDateTime, + _dt: &NaiveDateTime, res: &ParsingResult, tzinfos: HashMap, ) -> ParseResult> { @@ -877,9 +866,9 @@ impl Parser { let s = &tokens[idx]; if ymd.len() == 0 && tokens[idx].find(".") == None { - ymd.append(s[0..2].parse::().unwrap(), &s[0..2], None); - ymd.append(s[2..4].parse::().unwrap(), &s[2..4], None); - ymd.append(s[4..6].parse::().unwrap(), &s[4..6], None); + ymd.append(s[0..2].parse::().unwrap(), &s[0..2], None)?; + ymd.append(s[2..4].parse::().unwrap(), &s[2..4], None)?; + ymd.append(s[4..6].parse::().unwrap(), &s[4..6], None)?; } else { // 19990101T235959[.59] res.hour = s[0..2].parse::().ok(); @@ -892,9 +881,9 @@ impl Parser { } else if vec![8, 12, 14].contains(&len_li) { // YYMMDD let s = &tokens[idx]; - ymd.append(s[..4].parse::().unwrap(), &s[..4], Some(YMDLabel::Year)); - ymd.append(s[4..6].parse::().unwrap(), &s[4..6], None); - ymd.append(s[6..8].parse::().unwrap(), &s[6..8], None); + ymd.append(s[..4].parse::().unwrap(), &s[..4], Some(YMDLabel::Year))?; + ymd.append(s[4..6].parse::().unwrap(), &s[4..6], None)?; + ymd.append(s[6..8].parse::().unwrap(), &s[6..8], None)?; if len_li > 8 { res.hour = Some(s[8..10].parse::()?); @@ -936,20 +925,20 @@ impl Parser { { // TODO: There's got to be a better way of handling the condition above let sep = &tokens[idx + 1]; - ymd.append(value_repr.parse::().unwrap(), &value_repr, None); + ymd.append(value_repr.parse::().unwrap(), &value_repr, None)?; if idx + 2 < len_l && !info.get_jump(&tokens[idx + 2]) { if let Ok(val) = tokens[idx + 2].parse::() { - ymd.append(val, &tokens[idx + 2], None); + ymd.append(val, &tokens[idx + 2], None)?; } else if let Some(val) = info.get_month(&tokens[idx + 2]) { - ymd.append(val as i32, &tokens[idx + 2], Some(YMDLabel::Month)); + ymd.append(val as i32, &tokens[idx + 2], Some(YMDLabel::Month))?; } if idx + 3 < len_l && &tokens[idx + 3] == sep { if let Some(value) = info.get_month(&tokens[idx + 4]) { - ymd.append(value as i32, &tokens[idx + 4], Some(YMDLabel::Month)); + ymd.append(value as i32, &tokens[idx + 4], Some(YMDLabel::Month))?; } else { - ymd.append(tokens[idx + 4].parse::().unwrap(), &tokens[idx + 4], None); + ymd.append(tokens[idx + 4].parse::().unwrap(), &tokens[idx + 4], None)?; } idx += 2; @@ -965,7 +954,7 @@ impl Parser { let ampm = info.get_ampm(&tokens[idx + 2]).unwrap(); res.hour = Some(self.adjust_ampm(hour, ampm)); } else { - ymd.append(value.floor().to_i64().unwrap() as i32, &value_repr, None); + ymd.append(value.floor().to_i64().unwrap() as i32, &value_repr, None)?; } } else if info.get_ampm(&tokens[idx + 1]).is_some() && (*ZERO <= value && value < *TWENTY_FOUR) @@ -975,7 +964,7 @@ impl Parser { res.hour = Some(self.adjust_ampm(hour, info.get_ampm(&tokens[idx + 1]).unwrap())); idx += 1; } else if ymd.could_be_day(value.to_i64().unwrap() as i32) { - ymd.append(value.to_i64().unwrap() as i32, &value_repr, None); + ymd.append(value.to_i64().unwrap() as i32, &value_repr, None)?; } else if !fuzzy { return Err(ParseInternalError::ValueError("".to_owned())); } @@ -1106,6 +1095,26 @@ impl Parser { (minute, second) } + + fn recombine_skipped(&self, skipped_idxs: Vec, tokens: Vec) -> Vec { + let mut skipped_tokens: Vec = vec![]; + + let mut sorted_idxs = skipped_idxs.clone(); + sorted_idxs.sort(); + + for (i, idx) in sorted_idxs.iter().enumerate() { + if i > 0 && idx - 1 == skipped_idxs[i - 1] { + // UNWRAP: Having an initial value and unconditional push at end guarantees value + let mut t = skipped_tokens.pop().unwrap(); + t.push_str(tokens[idx.clone()].as_ref()); + skipped_tokens.push(t); + } else { + skipped_tokens.push(tokens[idx.clone()].to_owned()); + } + } + + skipped_tokens + } } fn close_to_integer(value: &Decimal) -> bool { diff --git a/src/tokenize.rs b/src/tokenize.rs index 2a44d25..38091a8 100644 --- a/src/tokenize.rs +++ b/src/tokenize.rs @@ -201,3 +201,15 @@ fn decimal_split(characters: &str, cast_period: bool) -> Vec { token_stack } + +#[cfg(test)] +mod tests { + + use Tokenizer; + + #[test] + fn test_basic() { + let tokens: Vec = Tokenizer::new("September of 2003,".to_owned()).collect(); + assert_eq!(tokens, vec!["September", " ", "of", " ", "2003", ","]); + } +} diff --git a/src/weekday.rs b/src/weekday.rs index 516d452..d92c758 100644 --- a/src/weekday.rs +++ b/src/weekday.rs @@ -1,5 +1,3 @@ -use std::cmp::max; - use ParseResult; use ParseError; diff --git a/tests/pycompat.rs b/tests/pycompat.rs index 7efe2d2..8ca6e6f 100644 --- a/tests/pycompat.rs +++ b/tests/pycompat.rs @@ -53,8 +53,8 @@ fn parse_and_assert( assert_eq!(pdt.hour, rs_parsed.0.hour(), "Hour mismatch for '{}'", s); assert_eq!(pdt.minute, rs_parsed.0.minute(), "Minute mismatch f'or' {}", s); assert_eq!(pdt.second, rs_parsed.0.second(), "Second mismatch for '{}'", s); - assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for {}", s); - assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for {}", s); + assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for '{}'", s); + assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for '{}'", s); } fn parse_and_assert_simple( @@ -62,14 +62,14 @@ fn parse_and_assert_simple( s: &str, ) { let rs_parsed = dtparse::parse(s).expect(&format!("Unable to parse date in Rust '{}'", s)); - assert_eq!(pdt.year, rs_parsed.0.year(), "Year mismatch for {}", s); - assert_eq!(pdt.month, rs_parsed.0.month(), "Month mismatch for {}", s); - assert_eq!(pdt.day, rs_parsed.0.day(), "Day mismatch for {}", s); - assert_eq!(pdt.hour, rs_parsed.0.hour(), "Hour mismatch for {}", s); - assert_eq!(pdt.minute, rs_parsed.0.minute(), "Minute mismatch for {}", s); - assert_eq!(pdt.second, rs_parsed.0.second(), "Second mismatch for {}", s); - assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for {}", s); - assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for {}", s); + assert_eq!(pdt.year, rs_parsed.0.year(), "Year mismatch for '{}'", s); + assert_eq!(pdt.month, rs_parsed.0.month(), "Month mismatch for '{}'", s); + assert_eq!(pdt.day, rs_parsed.0.day(), "Day mismatch for '{}'", s); + assert_eq!(pdt.hour, rs_parsed.0.hour(), "Hour mismatch for '{}'", s); + assert_eq!(pdt.minute, rs_parsed.0.minute(), "Minute mismatch for '{}'", s); + assert_eq!(pdt.second, rs_parsed.0.second(), "Second mismatch for '{}'", s); + assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for '{}'", s); + assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for '{}'", s); } fn parse_fuzzy_and_assert( @@ -103,9 +103,9 @@ fn parse_fuzzy_and_assert( assert_eq!(pdt.hour, rs_parsed.0.hour(), "Hour mismatch for '{}'", s); assert_eq!(pdt.minute, rs_parsed.0.minute(), "Minute mismatch f'or' {}", s); assert_eq!(pdt.second, rs_parsed.0.second(), "Second mismatch for '{}'", s); - assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for {}", s); - assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for {}", s); - assert_eq!(ptokens, rs_parsed.2, "Fuzzy mismatch for {}", s); + assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for '{}'", s); + assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for '{}'", s); + assert_eq!(ptokens, rs_parsed.2, "Tokens mismatch for '{}'", s); } macro_rules! rs_tzinfo_map { @@ -1735,7 +1735,7 @@ fn test_parse_ignoretz7() { } #[test] -fn test_fuzzy0() { +fn test_fuzzy_tzinfo0() { let info = ParserInfo::default(); let pdt = PyDateTime { year: 2003, month: 9, day: 25, @@ -1745,3 +1745,16 @@ fn test_fuzzy0() { parse_fuzzy_and_assert(pdt, None, info, "Today is 25 of September of 2003, exactly at 10:49:41 with timezone -03:00.", None, None, true, false, None, false, HashMap::new()); } + +#[test] +fn test_fuzzy_tokens_tzinfo0() { + let info = ParserInfo::default(); + let pdt = PyDateTime { + year: 2003, month: 9, day: 25, + hour: 10, minute: 49, second: 41, + micros: 0, tzo: Some(-10800) + }; + let tokens = vec!["Today is ".to_owned(), "of ".to_owned(), ", exactly at ".to_owned(), " with timezone ".to_owned(), ".".to_owned()]; + parse_fuzzy_and_assert(pdt, Some(tokens), info, "Today is 25 of September of 2003, exactly at 10:49:41 with timezone -03:00.", None, None, true, true, + None, false, HashMap::new()); +}