From d7b3c3356707de828579b0ce440a7fbd0242d490 Mon Sep 17 00:00:00 2001 From: Bradlee Speice Date: Mon, 2 Jul 2018 22:23:55 -0400 Subject: [PATCH 01/12] Don't track pyc --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index bb7277a..e3de6c9 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ /target **/*.rs.bk Cargo.lock -.vscode \ No newline at end of file +.vscode +*.pyc \ No newline at end of file From 79ac26e07fea5d5c1c4c010dcaacb7091d5b0094 Mon Sep 17 00:00:00 2001 From: Bradlee Speice Date: Mon, 2 Jul 2018 22:49:05 -0400 Subject: [PATCH 02/12] Fuzzy tests are failing already... --- build_pycompat.py | 58 +++++++++++++++++++++++++++++++++++++++++++++++ tests/pycompat.rs | 48 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) diff --git a/build_pycompat.py b/build_pycompat.py index c353265..572c5b0 100644 --- a/build_pycompat.py +++ b/build_pycompat.py @@ -80,6 +80,9 @@ tests = { '1994-11-05T08:15:30Z', '1976-07-04T00:01:02Z', 'Tue Apr 4 00:22:12 PDT 1995' ], + 'test_fuzzy': [ + 'Today is 25 of September of 2003, exactly at 10:49:41 with timezone -03:00.' + ], 'test_parse_default_ignore': [ ], } @@ -149,6 +152,12 @@ def test_parse_default_ignore(i, s): return TEST_PARSE_DEFAULT_IGNORE.format(i=i, d=d, s=s) + +def test_fuzzy(i, s): + d = parse(s, fuzzy=True) + + return TEST_FUZZY.format(i=i, d=d, s=s) + # Here lies all the ugly junk. TEST_HEADER = ''' extern crate chrono; @@ -224,6 +233,42 @@ fn parse_and_assert_simple( assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for {}", s); } +fn parse_fuzzy_and_assert( + pdt: PyDateTime, + ptokens: Option>, + info: ParserInfo, + s: &str, + dayfirst: Option, + yearfirst: Option, + fuzzy: bool, + fuzzy_with_tokens: bool, + default: Option<&NaiveDateTime>, + ignoretz: bool, + tzinfos: HashMap, +) { + + let mut parser = Parser::new(info); + let rs_parsed = parser.parse( + s, + dayfirst, + yearfirst, + fuzzy, + fuzzy_with_tokens, + default, + ignoretz, + tzinfos).expect(&format!("Unable to parse date in Rust '{}'", s)); + + assert_eq!(pdt.year, rs_parsed.0.year(), "Year mismatch for '{}'", s); + assert_eq!(pdt.month, rs_parsed.0.month(), "Month mismatch for '{}'", s); + assert_eq!(pdt.day, rs_parsed.0.day(), "Day mismatch for '{}'", s); + assert_eq!(pdt.hour, rs_parsed.0.hour(), "Hour mismatch for '{}'", s); + assert_eq!(pdt.minute, rs_parsed.0.minute(), "Minute mismatch f'or' {}", s); + assert_eq!(pdt.second, rs_parsed.0.second(), "Second mismatch for '{}'", s); + assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for {}", s); + assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for {}", s); + assert_eq!(ptokens, rs_parsed.2, "Fuzzy mismatch for {}", s); +} + macro_rules! 
rs_tzinfo_map { () => ({ let mut h = HashMap::new(); @@ -364,6 +409,19 @@ fn test_parse_default_ignore{i}() {{ Some(default_rsdate), false, HashMap::new()); }}\n''' +TEST_FUZZY = ''' +#[test] +fn test_fuzzy{i}() {{ + let info = ParserInfo::default(); + let pdt = PyDateTime {{ + year: {d.year}, month: {d.month}, day: {d.day}, + hour: {d.hour}, minute: {d.minute}, second: {d.second}, + micros: {d.microsecond}, tzo: None + }}; + parse_fuzzy_and_assert(pdt, None, info, "{s}", None, None, true, false, + None, false, HashMap::new()); +}}\n''' + if __name__ == '__main__': main() \ No newline at end of file diff --git a/tests/pycompat.rs b/tests/pycompat.rs index 8e7805c..319f714 100644 --- a/tests/pycompat.rs +++ b/tests/pycompat.rs @@ -72,6 +72,42 @@ fn parse_and_assert_simple( assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for {}", s); } +fn parse_fuzzy_and_assert( + pdt: PyDateTime, + ptokens: Option>, + info: ParserInfo, + s: &str, + dayfirst: Option, + yearfirst: Option, + fuzzy: bool, + fuzzy_with_tokens: bool, + default: Option<&NaiveDateTime>, + ignoretz: bool, + tzinfos: HashMap, +) { + + let mut parser = Parser::new(info); + let rs_parsed = parser.parse( + s, + dayfirst, + yearfirst, + fuzzy, + fuzzy_with_tokens, + default, + ignoretz, + tzinfos).expect(&format!("Unable to parse date in Rust '{}'", s)); + + assert_eq!(pdt.year, rs_parsed.0.year(), "Year mismatch for '{}'", s); + assert_eq!(pdt.month, rs_parsed.0.month(), "Month mismatch for '{}'", s); + assert_eq!(pdt.day, rs_parsed.0.day(), "Day mismatch for '{}'", s); + assert_eq!(pdt.hour, rs_parsed.0.hour(), "Hour mismatch for '{}'", s); + assert_eq!(pdt.minute, rs_parsed.0.minute(), "Minute mismatch f'or' {}", s); + assert_eq!(pdt.second, rs_parsed.0.second(), "Second mismatch for '{}'", s); + assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for {}", s); + assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for {}", s); + assert_eq!(ptokens, rs_parsed.2, "Fuzzy mismatch for {}", s); +} + macro_rules! rs_tzinfo_map { () => ({ let mut h = HashMap::new(); @@ -1697,3 +1733,15 @@ fn test_parse_ignoretz7() { parse_and_assert(pdt, info, "Tue Apr 4 00:22:12 PDT 1995", None, None, false, false, None, true, HashMap::new()); } + +#[test] +fn test_fuzzy0() { + let info = ParserInfo::default(); + let pdt = PyDateTime { + year: 2003, month: 9, day: 25, + hour: 10, minute: 49, second: 41, + micros: 0, tzo: None + }; + parse_fuzzy_and_assert(pdt, None, info, "Today is 25 of September of 2003, exactly at 10:49:41 with timezone -03:00.", None, None, true, false, + None, false, HashMap::new()); +} From c566c5b7c8ee6beb0d37073a40a1113b91e2439e Mon Sep 17 00:00:00 2001 From: Bradlee Speice Date: Mon, 2 Jul 2018 22:54:20 -0400 Subject: [PATCH 03/12] Remove an extraneous TODO --- src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index c05597d..7003a94 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -566,7 +566,6 @@ impl Parser { let default_ts = NaiveDateTime::new(default_date, NaiveTime::from_hms(0, 0, 0)); - // TODO: What should be done with the tokens? let (res, tokens) = self.parse_with_tokens(timestr, dayfirst, yearfirst, fuzzy, fuzzy_with_tokens)?; From 2b90bf6ed73330d9ccd5cdb68a76454d07f2ec1b Mon Sep 17 00:00:00 2001 From: Bradlee Speice Date: Mon, 2 Jul 2018 23:00:45 -0400 Subject: [PATCH 04/12] ...I may be an idiot. 
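The fuzzy test failure came down to operator precedence, not the fuzzy
handling itself. The jump-word check inside `impl Parser` read:

    } else if !self.info.get_jump(&l[i]) || fuzzy {
        return Err(ParseError::UnrecognizedToken(l[i].clone()));

Since `!` binds tighter than `||`, the condition is always true whenever
`fuzzy` is set, so fuzzy parsing failed on the first token that fell
through to this branch. The fix wraps the whole disjunction, erroring only
when the token is neither a jump word nor being parsed fuzzily:

    } else if !(self.info.get_jump(&l[i]) || fuzzy) {
        return Err(ParseError::UnrecognizedToken(l[i].clone()));

The test expectation changes alongside the fix: dateutil returns an aware
datetime for this string, so the generated test is renamed to
test_fuzzy_tzinfo and asserts tzo: Some(-10800) (the -03:00 offset
expressed in seconds) rather than None.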
--- build_pycompat.py | 10 +++++----- src/lib.rs | 2 +- tests/pycompat.rs | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/build_pycompat.py b/build_pycompat.py index 572c5b0..acd48be 100644 --- a/build_pycompat.py +++ b/build_pycompat.py @@ -80,7 +80,7 @@ tests = { '1994-11-05T08:15:30Z', '1976-07-04T00:01:02Z', 'Tue Apr 4 00:22:12 PDT 1995' ], - 'test_fuzzy': [ + 'test_fuzzy_tzinfo': [ 'Today is 25 of September of 2003, exactly at 10:49:41 with timezone -03:00.' ], 'test_parse_default_ignore': [ @@ -153,10 +153,10 @@ def test_parse_default_ignore(i, s): return TEST_PARSE_DEFAULT_IGNORE.format(i=i, d=d, s=s) -def test_fuzzy(i, s): +def test_fuzzy_tzinfo(i, s): d = parse(s, fuzzy=True) - return TEST_FUZZY.format(i=i, d=d, s=s) + return TEST_FUZZY_TZINFO.format(i=i, d=d, s=s, offset=int(d.tzinfo._offset.total_seconds())) # Here lies all the ugly junk. TEST_HEADER = ''' @@ -409,14 +409,14 @@ fn test_parse_default_ignore{i}() {{ Some(default_rsdate), false, HashMap::new()); }}\n''' -TEST_FUZZY = ''' +TEST_FUZZY_TZINFO = ''' #[test] fn test_fuzzy{i}() {{ let info = ParserInfo::default(); let pdt = PyDateTime {{ year: {d.year}, month: {d.month}, day: {d.day}, hour: {d.hour}, minute: {d.minute}, second: {d.second}, - micros: {d.microsecond}, tzo: None + micros: {d.microsecond}, tzo: Some({offset}) }}; parse_fuzzy_and_assert(pdt, None, info, "{s}", None, None, true, false, None, false, HashMap::new()); diff --git a/src/lib.rs b/src/lib.rs index 7003a94..46266be 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -718,7 +718,7 @@ impl Parser { } i += 1; - } else if !self.info.get_jump(&l[i]) || fuzzy { + } else if !(self.info.get_jump(&l[i]) || fuzzy) { return Err(ParseError::UnrecognizedToken(l[i].clone())); } else { skipped_idxs.push(i); diff --git a/tests/pycompat.rs b/tests/pycompat.rs index 319f714..7efe2d2 100644 --- a/tests/pycompat.rs +++ b/tests/pycompat.rs @@ -1740,7 +1740,7 @@ fn test_fuzzy0() { let pdt = PyDateTime { year: 2003, month: 9, day: 25, hour: 10, minute: 49, second: 41, - micros: 0, tzo: None + micros: 0, tzo: Some(-10800) }; parse_fuzzy_and_assert(pdt, None, info, "Today is 25 of September of 2003, exactly at 10:49:41 with timezone -03:00.", None, None, true, false, None, false, HashMap::new()); From 9135962839f3b83924902c4306bfdc266c96224b Mon Sep 17 00:00:00 2001 From: Bradlee Speice Date: Tue, 3 Jul 2018 01:02:27 -0400 Subject: [PATCH 05/12] Lots of fixes, but it turns out tokenization is broken --- build_pycompat.py | 56 ++++++++++++++++++++++++-------- src/lib.rs | 83 ++++++++++++++++++++++++++--------------------- src/tokenize.rs | 12 +++++++ src/weekday.rs | 2 -- tests/pycompat.rs | 41 +++++++++++++++-------- 5 files changed, 127 insertions(+), 67 deletions(-) diff --git a/build_pycompat.py b/build_pycompat.py index acd48be..641494e 100644 --- a/build_pycompat.py +++ b/build_pycompat.py @@ -83,6 +83,9 @@ tests = { 'test_fuzzy_tzinfo': [ 'Today is 25 of September of 2003, exactly at 10:49:41 with timezone -03:00.' ], + 'test_fuzzy_tokens_tzinfo': [ + 'Today is 25 of September of 2003, exactly at 10:49:41 with timezone -03:00.' 
+ ], 'test_parse_default_ignore': [ ], } @@ -158,6 +161,17 @@ def test_fuzzy_tzinfo(i, s): return TEST_FUZZY_TZINFO.format(i=i, d=d, s=s, offset=int(d.tzinfo._offset.total_seconds())) + +def test_fuzzy_tokens_tzinfo(i, s): + d, tokens = parse(s, fuzzy_with_tokens=True) + + r_tokens = ", ".join(list(map(lambda s: f'"{s}".to_owned()', tokens))) + + return TEST_FUZZY_TOKENS_TZINFO.format( + i=i, d=d, s=s, offset=int(d.tzinfo._offset.total_seconds()), + tokens=r_tokens + ) + # Here lies all the ugly junk. TEST_HEADER = ''' extern crate chrono; @@ -214,8 +228,8 @@ fn parse_and_assert( assert_eq!(pdt.hour, rs_parsed.0.hour(), "Hour mismatch for '{}'", s); assert_eq!(pdt.minute, rs_parsed.0.minute(), "Minute mismatch f'or' {}", s); assert_eq!(pdt.second, rs_parsed.0.second(), "Second mismatch for '{}'", s); - assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for {}", s); - assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for {}", s); + assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for '{}'", s); + assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for '{}'", s); } fn parse_and_assert_simple( @@ -223,14 +237,14 @@ fn parse_and_assert_simple( s: &str, ) { let rs_parsed = dtparse::parse(s).expect(&format!("Unable to parse date in Rust '{}'", s)); - assert_eq!(pdt.year, rs_parsed.0.year(), "Year mismatch for {}", s); - assert_eq!(pdt.month, rs_parsed.0.month(), "Month mismatch for {}", s); - assert_eq!(pdt.day, rs_parsed.0.day(), "Day mismatch for {}", s); - assert_eq!(pdt.hour, rs_parsed.0.hour(), "Hour mismatch for {}", s); - assert_eq!(pdt.minute, rs_parsed.0.minute(), "Minute mismatch for {}", s); - assert_eq!(pdt.second, rs_parsed.0.second(), "Second mismatch for {}", s); - assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for {}", s); - assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for {}", s); + assert_eq!(pdt.year, rs_parsed.0.year(), "Year mismatch for '{}'", s); + assert_eq!(pdt.month, rs_parsed.0.month(), "Month mismatch for '{}'", s); + assert_eq!(pdt.day, rs_parsed.0.day(), "Day mismatch for '{}'", s); + assert_eq!(pdt.hour, rs_parsed.0.hour(), "Hour mismatch for '{}'", s); + assert_eq!(pdt.minute, rs_parsed.0.minute(), "Minute mismatch for '{}'", s); + assert_eq!(pdt.second, rs_parsed.0.second(), "Second mismatch for '{}'", s); + assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for '{}'", s); + assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for '{}'", s); } fn parse_fuzzy_and_assert( @@ -264,9 +278,9 @@ fn parse_fuzzy_and_assert( assert_eq!(pdt.hour, rs_parsed.0.hour(), "Hour mismatch for '{}'", s); assert_eq!(pdt.minute, rs_parsed.0.minute(), "Minute mismatch f'or' {}", s); assert_eq!(pdt.second, rs_parsed.0.second(), "Second mismatch for '{}'", s); - assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for {}", s); - assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for {}", s); - assert_eq!(ptokens, rs_parsed.2, "Fuzzy mismatch for {}", s); + assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for '{}'", s); + assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for '{}'", s); + assert_eq!(ptokens, rs_parsed.2, "Tokens mismatch for '{}'", s); } 
macro_rules! rs_tzinfo_map { @@ -411,7 +425,7 @@ fn test_parse_default_ignore{i}() {{ TEST_FUZZY_TZINFO = ''' #[test] -fn test_fuzzy{i}() {{ +fn test_fuzzy_tzinfo{i}() {{ let info = ParserInfo::default(); let pdt = PyDateTime {{ year: {d.year}, month: {d.month}, day: {d.day}, @@ -422,6 +436,20 @@ fn test_fuzzy{i}() {{ None, false, HashMap::new()); }}\n''' +TEST_FUZZY_TOKENS_TZINFO = ''' +#[test] +fn test_fuzzy_tokens_tzinfo{i}() {{ + let info = ParserInfo::default(); + let pdt = PyDateTime {{ + year: {d.year}, month: {d.month}, day: {d.day}, + hour: {d.hour}, minute: {d.minute}, second: {d.second}, + micros: {d.microsecond}, tzo: Some({offset}) + }}; + let tokens = vec![{tokens}]; + parse_fuzzy_and_assert(pdt, Some(tokens), info, "{s}", None, None, true, true, + None, false, HashMap::new()); +}}\n''' + if __name__ == '__main__': main() \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 46266be..1590974 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,3 @@ -#![allow(dead_code)] -#![allow(unused)] - #[macro_use] extern crate lazy_static; @@ -8,7 +5,6 @@ extern crate chrono; extern crate num_traits; extern crate rust_decimal; -use chrono::DateTime; use chrono::Datelike; use chrono::Duration; use chrono::FixedOffset; @@ -17,7 +13,6 @@ use chrono::NaiveDate; use chrono::NaiveDateTime; use chrono::NaiveTime; use chrono::Timelike; -use chrono::Utc; use num_traits::cast::ToPrimitive; use rust_decimal::Decimal; use rust_decimal::Error as DecimalError; @@ -33,7 +28,6 @@ mod weekday; #[cfg(test)] mod tests; -use tokenize::ParseState; use tokenize::Tokenizer; use weekday::day_of_week; use weekday::DayOfWeek; @@ -59,13 +53,13 @@ pub enum ParseInternalError { } impl From for ParseInternalError { - fn from(err: DecimalError) -> Self { + fn from(_err: DecimalError) -> Self { ParseInternalError::InvalidDecimal } } impl From for ParseInternalError { - fn from(err: ParseIntError) -> Self { + fn from(_err: ParseIntError) -> Self { ParseInternalError::InvalidInteger } } @@ -294,11 +288,6 @@ struct YMD { ystridx: Option, } -enum YMDAppendEither { - Number(i32), - Stringy(String), -} - impl YMD { fn len(&self) -> usize { self._ymd.len() @@ -388,7 +377,7 @@ impl YMD { Ok(()) } } - None => Err(ParseInternalError::ValueError("Missing label.".to_owned())), + None => Ok(()), } } @@ -613,34 +602,34 @@ impl Parser { while i < len_l { let value_repr = l[i].clone(); - if let Ok(v) = Decimal::from_str(&value_repr) { + if let Ok(_v) = Decimal::from_str(&value_repr) { i = self.parse_numeric_token(&l, i, &self.info, &mut ymd, &mut res, fuzzy)?; } else if let Some(value) = self.info.get_weekday(&l[i]) { res.weekday = Some(value); } else if let Some(value) = self.info.get_month(&l[i]) { - ymd.append(value as i32, &l[i], Some(YMDLabel::Month)); + ymd.append(value as i32, &l[i], Some(YMDLabel::Month))?; if i + 1 < len_l { if l[i + 1] == "-" || l[i + 1] == "/" { // Jan-01[-99] let sep = &l[i + 1]; // TODO: This seems like a very unsafe unwrap - ymd.append(l[i + 2].parse::().unwrap(), &l[i + 2], None); + ymd.append(l[i + 2].parse::().unwrap(), &l[i + 2], None)?; if i + 3 < len_l && &l[i + 3] == sep { // Jan-01-99 - ymd.append(l[i + 4].parse::().unwrap(), &l[i + 4], None); + ymd.append(l[i + 4].parse::().unwrap(), &l[i + 4], None)?; i += 2; } i += 2; - } else if (i + 4 < len_l && l[i + 1] == l[i + 3] && l[i + 3] == " " - && self.info.get_pertain(&l[i + 2])) + } else if i + 4 < len_l && l[i + 1] == l[i + 3] && l[i + 3] == " " + && self.info.get_pertain(&l[i + 2]) { // Jan of 01 if let Some(value) = l[i + 
4].parse::().ok() { let year = self.info.convertyear(value, false); - ymd.append(year, &l[i + 4], Some(YMDLabel::Year)); + ymd.append(year, &l[i + 4], Some(YMDLabel::Year))?; } i += 4; @@ -737,7 +726,7 @@ impl Parser { if !self.info.validate(&mut res) { Err(ParseError::InvalidParseResult(res)) } else if fuzzy_with_tokens { - let skipped_tokens = skipped_idxs.into_iter().map(|i| l[i].clone()).collect(); + let skipped_tokens = self.recombine_skipped(skipped_idxs, l); Ok((res, Some(skipped_tokens))) } else { Ok((res, None)) @@ -797,7 +786,7 @@ impl Parser { }; // TODO: Change month/day to u32 - let mut d = NaiveDate::from_ymd( + let d = NaiveDate::from_ymd( y, m, min(res.day.unwrap_or(default.day() as i32) as u32, days_in_month(y, m as i32)?) @@ -818,7 +807,7 @@ impl Parser { fn build_tzaware( &self, - dt: &NaiveDateTime, + _dt: &NaiveDateTime, res: &ParsingResult, tzinfos: HashMap, ) -> ParseResult> { @@ -877,9 +866,9 @@ impl Parser { let s = &tokens[idx]; if ymd.len() == 0 && tokens[idx].find(".") == None { - ymd.append(s[0..2].parse::().unwrap(), &s[0..2], None); - ymd.append(s[2..4].parse::().unwrap(), &s[2..4], None); - ymd.append(s[4..6].parse::().unwrap(), &s[4..6], None); + ymd.append(s[0..2].parse::().unwrap(), &s[0..2], None)?; + ymd.append(s[2..4].parse::().unwrap(), &s[2..4], None)?; + ymd.append(s[4..6].parse::().unwrap(), &s[4..6], None)?; } else { // 19990101T235959[.59] res.hour = s[0..2].parse::().ok(); @@ -892,9 +881,9 @@ impl Parser { } else if vec![8, 12, 14].contains(&len_li) { // YYMMDD let s = &tokens[idx]; - ymd.append(s[..4].parse::().unwrap(), &s[..4], Some(YMDLabel::Year)); - ymd.append(s[4..6].parse::().unwrap(), &s[4..6], None); - ymd.append(s[6..8].parse::().unwrap(), &s[6..8], None); + ymd.append(s[..4].parse::().unwrap(), &s[..4], Some(YMDLabel::Year))?; + ymd.append(s[4..6].parse::().unwrap(), &s[4..6], None)?; + ymd.append(s[6..8].parse::().unwrap(), &s[6..8], None)?; if len_li > 8 { res.hour = Some(s[8..10].parse::()?); @@ -936,20 +925,20 @@ impl Parser { { // TODO: There's got to be a better way of handling the condition above let sep = &tokens[idx + 1]; - ymd.append(value_repr.parse::().unwrap(), &value_repr, None); + ymd.append(value_repr.parse::().unwrap(), &value_repr, None)?; if idx + 2 < len_l && !info.get_jump(&tokens[idx + 2]) { if let Ok(val) = tokens[idx + 2].parse::() { - ymd.append(val, &tokens[idx + 2], None); + ymd.append(val, &tokens[idx + 2], None)?; } else if let Some(val) = info.get_month(&tokens[idx + 2]) { - ymd.append(val as i32, &tokens[idx + 2], Some(YMDLabel::Month)); + ymd.append(val as i32, &tokens[idx + 2], Some(YMDLabel::Month))?; } if idx + 3 < len_l && &tokens[idx + 3] == sep { if let Some(value) = info.get_month(&tokens[idx + 4]) { - ymd.append(value as i32, &tokens[idx + 4], Some(YMDLabel::Month)); + ymd.append(value as i32, &tokens[idx + 4], Some(YMDLabel::Month))?; } else { - ymd.append(tokens[idx + 4].parse::().unwrap(), &tokens[idx + 4], None); + ymd.append(tokens[idx + 4].parse::().unwrap(), &tokens[idx + 4], None)?; } idx += 2; @@ -965,7 +954,7 @@ impl Parser { let ampm = info.get_ampm(&tokens[idx + 2]).unwrap(); res.hour = Some(self.adjust_ampm(hour, ampm)); } else { - ymd.append(value.floor().to_i64().unwrap() as i32, &value_repr, None); + ymd.append(value.floor().to_i64().unwrap() as i32, &value_repr, None)?; } } else if info.get_ampm(&tokens[idx + 1]).is_some() && (*ZERO <= value && value < *TWENTY_FOUR) @@ -975,7 +964,7 @@ impl Parser { res.hour = Some(self.adjust_ampm(hour, info.get_ampm(&tokens[idx + 
1]).unwrap())); idx += 1; } else if ymd.could_be_day(value.to_i64().unwrap() as i32) { - ymd.append(value.to_i64().unwrap() as i32, &value_repr, None); + ymd.append(value.to_i64().unwrap() as i32, &value_repr, None)?; } else if !fuzzy { return Err(ParseInternalError::ValueError("".to_owned())); } @@ -1106,6 +1095,26 @@ impl Parser { (minute, second) } + + fn recombine_skipped(&self, skipped_idxs: Vec, tokens: Vec) -> Vec { + let mut skipped_tokens: Vec = vec![]; + + let mut sorted_idxs = skipped_idxs.clone(); + sorted_idxs.sort(); + + for (i, idx) in sorted_idxs.iter().enumerate() { + if i > 0 && idx - 1 == skipped_idxs[i - 1] { + // UNWRAP: Having an initial value and unconditional push at end guarantees value + let mut t = skipped_tokens.pop().unwrap(); + t.push_str(tokens[idx.clone()].as_ref()); + skipped_tokens.push(t); + } else { + skipped_tokens.push(tokens[idx.clone()].to_owned()); + } + } + + skipped_tokens + } } fn close_to_integer(value: &Decimal) -> bool { diff --git a/src/tokenize.rs b/src/tokenize.rs index 2a44d25..38091a8 100644 --- a/src/tokenize.rs +++ b/src/tokenize.rs @@ -201,3 +201,15 @@ fn decimal_split(characters: &str, cast_period: bool) -> Vec { token_stack } + +#[cfg(test)] +mod tests { + + use Tokenizer; + + #[test] + fn test_basic() { + let tokens: Vec = Tokenizer::new("September of 2003,".to_owned()).collect(); + assert_eq!(tokens, vec!["September", " ", "of", " ", "2003", ","]); + } +} diff --git a/src/weekday.rs b/src/weekday.rs index 516d452..d92c758 100644 --- a/src/weekday.rs +++ b/src/weekday.rs @@ -1,5 +1,3 @@ -use std::cmp::max; - use ParseResult; use ParseError; diff --git a/tests/pycompat.rs b/tests/pycompat.rs index 7efe2d2..8ca6e6f 100644 --- a/tests/pycompat.rs +++ b/tests/pycompat.rs @@ -53,8 +53,8 @@ fn parse_and_assert( assert_eq!(pdt.hour, rs_parsed.0.hour(), "Hour mismatch for '{}'", s); assert_eq!(pdt.minute, rs_parsed.0.minute(), "Minute mismatch f'or' {}", s); assert_eq!(pdt.second, rs_parsed.0.second(), "Second mismatch for '{}'", s); - assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for {}", s); - assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for {}", s); + assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for '{}'", s); + assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for '{}'", s); } fn parse_and_assert_simple( @@ -62,14 +62,14 @@ fn parse_and_assert_simple( s: &str, ) { let rs_parsed = dtparse::parse(s).expect(&format!("Unable to parse date in Rust '{}'", s)); - assert_eq!(pdt.year, rs_parsed.0.year(), "Year mismatch for {}", s); - assert_eq!(pdt.month, rs_parsed.0.month(), "Month mismatch for {}", s); - assert_eq!(pdt.day, rs_parsed.0.day(), "Day mismatch for {}", s); - assert_eq!(pdt.hour, rs_parsed.0.hour(), "Hour mismatch for {}", s); - assert_eq!(pdt.minute, rs_parsed.0.minute(), "Minute mismatch for {}", s); - assert_eq!(pdt.second, rs_parsed.0.second(), "Second mismatch for {}", s); - assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for {}", s); - assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for {}", s); + assert_eq!(pdt.year, rs_parsed.0.year(), "Year mismatch for '{}'", s); + assert_eq!(pdt.month, rs_parsed.0.month(), "Month mismatch for '{}'", s); + assert_eq!(pdt.day, rs_parsed.0.day(), "Day mismatch for '{}'", s); + assert_eq!(pdt.hour, rs_parsed.0.hour(), "Hour mismatch for '{}'", s); + 
assert_eq!(pdt.minute, rs_parsed.0.minute(), "Minute mismatch for '{}'", s); + assert_eq!(pdt.second, rs_parsed.0.second(), "Second mismatch for '{}'", s); + assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for '{}'", s); + assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for '{}'", s); } fn parse_fuzzy_and_assert( @@ -103,9 +103,9 @@ fn parse_fuzzy_and_assert( assert_eq!(pdt.hour, rs_parsed.0.hour(), "Hour mismatch for '{}'", s); assert_eq!(pdt.minute, rs_parsed.0.minute(), "Minute mismatch f'or' {}", s); assert_eq!(pdt.second, rs_parsed.0.second(), "Second mismatch for '{}'", s); - assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for {}", s); - assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for {}", s); - assert_eq!(ptokens, rs_parsed.2, "Fuzzy mismatch for {}", s); + assert_eq!(pdt.micros, rs_parsed.0.timestamp_subsec_micros(), "Microsecond mismatch for '{}'", s); + assert_eq!(pdt.tzo, rs_parsed.1.map(|u| u.local_minus_utc()), "Timezone Offset mismatch for '{}'", s); + assert_eq!(ptokens, rs_parsed.2, "Tokens mismatch for '{}'", s); } macro_rules! rs_tzinfo_map { @@ -1735,7 +1735,7 @@ fn test_parse_ignoretz7() { } #[test] -fn test_fuzzy0() { +fn test_fuzzy_tzinfo0() { let info = ParserInfo::default(); let pdt = PyDateTime { year: 2003, month: 9, day: 25, @@ -1745,3 +1745,16 @@ fn test_fuzzy0() { parse_fuzzy_and_assert(pdt, None, info, "Today is 25 of September of 2003, exactly at 10:49:41 with timezone -03:00.", None, None, true, false, None, false, HashMap::new()); } + +#[test] +fn test_fuzzy_tokens_tzinfo0() { + let info = ParserInfo::default(); + let pdt = PyDateTime { + year: 2003, month: 9, day: 25, + hour: 10, minute: 49, second: 41, + micros: 0, tzo: Some(-10800) + }; + let tokens = vec!["Today is ".to_owned(), "of ".to_owned(), ", exactly at ".to_owned(), " with timezone ".to_owned(), ".".to_owned()]; + parse_fuzzy_and_assert(pdt, Some(tokens), info, "Today is 25 of September of 2003, exactly at 10:49:41 with timezone -03:00.", None, None, true, true, + None, false, HashMap::new()); +} From e049618fffa70a8f217cc90ec981a00845a1a56d Mon Sep 17 00:00:00 2001 From: Bradlee Speice Date: Sat, 7 Jul 2018 23:37:02 -0400 Subject: [PATCH 06/12] Redo the tokenization Still has issues with one test case for fuzzy --- build_pycompat.py | 11 +- build_pycompat_tokenizer.py | 35 + src/lib.rs | 2 +- src/{tests.rs => tests/fuzzing.rs} | 0 src/tests/mod.rs | 3 + .../tests/pycompat_parser.rs | 9 +- src/tests/pycompat_tokenizer.rs | 865 ++++++++++++++++++ src/tokenize.rs | 257 +++--- src/weekday.rs | 2 + 9 files changed, 1029 insertions(+), 155 deletions(-) create mode 100644 build_pycompat_tokenizer.py rename src/{tests.rs => tests/fuzzing.rs} (100%) create mode 100644 src/tests/mod.rs rename tests/pycompat.rs => src/tests/pycompat_parser.rs (99%) create mode 100644 src/tests/pycompat_tokenizer.rs diff --git a/build_pycompat.py b/build_pycompat.py index 641494e..036233f 100644 --- a/build_pycompat.py +++ b/build_pycompat.py @@ -91,7 +91,7 @@ tests = { } def main(): - with open('tests/pycompat.rs', 'w+') as handle: + with open('src/tests/pycompat_parser.rs', 'w+') as handle: handle.write(TEST_HEADER) for test_name, test_strings in tests.items(): @@ -182,10 +182,9 @@ use chrono::NaiveDateTime; use chrono::Timelike; use std::collections::HashMap; -extern crate dtparse; - -use dtparse::Parser; -use dtparse::ParserInfo; +use Parser; +use ParserInfo; +use 
parse; struct PyDateTime { year: i32, @@ -236,7 +235,7 @@ fn parse_and_assert_simple( pdt: PyDateTime, s: &str, ) { - let rs_parsed = dtparse::parse(s).expect(&format!("Unable to parse date in Rust '{}'", s)); + let rs_parsed = parse(s).expect(&format!("Unable to parse date in Rust '{}'", s)); assert_eq!(pdt.year, rs_parsed.0.year(), "Year mismatch for '{}'", s); assert_eq!(pdt.month, rs_parsed.0.month(), "Month mismatch for '{}'", s); assert_eq!(pdt.day, rs_parsed.0.day(), "Day mismatch for '{}'", s); diff --git a/build_pycompat_tokenizer.py b/build_pycompat_tokenizer.py new file mode 100644 index 0000000..ea0ed8c --- /dev/null +++ b/build_pycompat_tokenizer.py @@ -0,0 +1,35 @@ +from dateutil.parser import _timelex + +from build_pycompat import tests + +def main(): + with open('src/tests/pycompat_tokenizer.rs', 'w+') as handle: + handle.write(TEST_HEADER) + + counter = 0 + for _, test_strings in tests.items(): + for s in test_strings: + handle.write(build_test(counter, s)) + counter += 1 + +def build_test(i, test_string): + python_tokens = list(_timelex(test_string)) + formatted_tokens = 'vec!["' + '", "'.join(python_tokens) + '"]' + return f''' +#[test] +fn test_tokenize{i}() {{ + let comp = {formatted_tokens}; + tokenize_assert("{test_string}", comp); +}}\n''' + + +TEST_HEADER = ''' +use tokenize::Tokenizer; + +fn tokenize_assert(test_str: &str, comparison: Vec<&str>) { + let tokens: Vec = Tokenizer::new(test_str).collect(); + assert_eq!(tokens, comparison, "Tokenizing mismatch for `{}`", test_str); +}\n''' + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 1590974..02c6a1a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -87,7 +87,7 @@ type ParseResult = Result; type ParseIResult = Result; pub fn tokenize(parse_string: &str) -> Vec { - let tokenizer = Tokenizer::new(parse_string.to_owned()); + let tokenizer = Tokenizer::new(parse_string); tokenizer.collect() } diff --git a/src/tests.rs b/src/tests/fuzzing.rs similarity index 100% rename from src/tests.rs rename to src/tests/fuzzing.rs diff --git a/src/tests/mod.rs b/src/tests/mod.rs new file mode 100644 index 0000000..1776124 --- /dev/null +++ b/src/tests/mod.rs @@ -0,0 +1,3 @@ +mod fuzzing; +mod pycompat_parser; +mod pycompat_tokenizer; diff --git a/tests/pycompat.rs b/src/tests/pycompat_parser.rs similarity index 99% rename from tests/pycompat.rs rename to src/tests/pycompat_parser.rs index 8ca6e6f..647f7a5 100644 --- a/tests/pycompat.rs +++ b/src/tests/pycompat_parser.rs @@ -7,10 +7,9 @@ use chrono::NaiveDateTime; use chrono::Timelike; use std::collections::HashMap; -extern crate dtparse; - -use dtparse::Parser; -use dtparse::ParserInfo; +use Parser; +use ParserInfo; +use parse; struct PyDateTime { year: i32, @@ -61,7 +60,7 @@ fn parse_and_assert_simple( pdt: PyDateTime, s: &str, ) { - let rs_parsed = dtparse::parse(s).expect(&format!("Unable to parse date in Rust '{}'", s)); + let rs_parsed = parse(s).expect(&format!("Unable to parse date in Rust '{}'", s)); assert_eq!(pdt.year, rs_parsed.0.year(), "Year mismatch for '{}'", s); assert_eq!(pdt.month, rs_parsed.0.month(), "Month mismatch for '{}'", s); assert_eq!(pdt.day, rs_parsed.0.day(), "Day mismatch for '{}'", s); diff --git a/src/tests/pycompat_tokenizer.rs b/src/tests/pycompat_tokenizer.rs new file mode 100644 index 0000000..6ba6a21 --- /dev/null +++ b/src/tests/pycompat_tokenizer.rs @@ -0,0 +1,865 @@ + +use tokenize::Tokenizer; + +fn tokenize_assert(test_str: &str, comparison: Vec<&str>) { + let tokens: Vec = 
Tokenizer::new(test_str).collect(); + assert_eq!(tokens, comparison, "Tokenizing mismatch for `{}`", test_str); +} + +#[test] +fn test_tokenize0() { + let comp = vec!["Thu", " ", "Sep", " ", "25", " ", "10", ":", "36", ":", "28"]; + tokenize_assert("Thu Sep 25 10:36:28", comp); +} + +#[test] +fn test_tokenize1() { + let comp = vec!["Sep", " ", "10", ":", "36", ":", "28"]; + tokenize_assert("Sep 10:36:28", comp); +} + +#[test] +fn test_tokenize2() { + let comp = vec!["10", ":", "36", ":", "28"]; + tokenize_assert("10:36:28", comp); +} + +#[test] +fn test_tokenize3() { + let comp = vec!["10", ":", "36"]; + tokenize_assert("10:36", comp); +} + +#[test] +fn test_tokenize4() { + let comp = vec!["Sep", " ", "2003"]; + tokenize_assert("Sep 2003", comp); +} + +#[test] +fn test_tokenize5() { + let comp = vec!["Sep"]; + tokenize_assert("Sep", comp); +} + +#[test] +fn test_tokenize6() { + let comp = vec!["2003"]; + tokenize_assert("2003", comp); +} + +#[test] +fn test_tokenize7() { + let comp = vec!["10", "h", "36", "m", "28.5", "s"]; + tokenize_assert("10h36m28.5s", comp); +} + +#[test] +fn test_tokenize8() { + let comp = vec!["10", "h", "36", "m", "28", "s"]; + tokenize_assert("10h36m28s", comp); +} + +#[test] +fn test_tokenize9() { + let comp = vec!["10", "h", "36", "m"]; + tokenize_assert("10h36m", comp); +} + +#[test] +fn test_tokenize10() { + let comp = vec!["10", "h"]; + tokenize_assert("10h", comp); +} + +#[test] +fn test_tokenize11() { + let comp = vec!["10", " ", "h", " ", "36"]; + tokenize_assert("10 h 36", comp); +} + +#[test] +fn test_tokenize12() { + let comp = vec!["10", " ", "h", " ", "36.5"]; + tokenize_assert("10 h 36.5", comp); +} + +#[test] +fn test_tokenize13() { + let comp = vec!["36", " ", "m", " ", "5"]; + tokenize_assert("36 m 5", comp); +} + +#[test] +fn test_tokenize14() { + let comp = vec!["36", " ", "m", " ", "5", " ", "s"]; + tokenize_assert("36 m 5 s", comp); +} + +#[test] +fn test_tokenize15() { + let comp = vec!["36", " ", "m", " ", "05"]; + tokenize_assert("36 m 05", comp); +} + +#[test] +fn test_tokenize16() { + let comp = vec!["36", " ", "m", " ", "05", " ", "s"]; + tokenize_assert("36 m 05 s", comp); +} + +#[test] +fn test_tokenize17() { + let comp = vec!["10", "h", " ", "am"]; + tokenize_assert("10h am", comp); +} + +#[test] +fn test_tokenize18() { + let comp = vec!["10", "h", " ", "pm"]; + tokenize_assert("10h pm", comp); +} + +#[test] +fn test_tokenize19() { + let comp = vec!["10", "am"]; + tokenize_assert("10am", comp); +} + +#[test] +fn test_tokenize20() { + let comp = vec!["10", "pm"]; + tokenize_assert("10pm", comp); +} + +#[test] +fn test_tokenize21() { + let comp = vec!["10", ":", "00", " ", "am"]; + tokenize_assert("10:00 am", comp); +} + +#[test] +fn test_tokenize22() { + let comp = vec!["10", ":", "00", " ", "pm"]; + tokenize_assert("10:00 pm", comp); +} + +#[test] +fn test_tokenize23() { + let comp = vec!["10", ":", "00", "am"]; + tokenize_assert("10:00am", comp); +} + +#[test] +fn test_tokenize24() { + let comp = vec!["10", ":", "00", "pm"]; + tokenize_assert("10:00pm", comp); +} + +#[test] +fn test_tokenize25() { + let comp = vec!["10", ":", "00", "a", ".", "m"]; + tokenize_assert("10:00a.m", comp); +} + +#[test] +fn test_tokenize26() { + let comp = vec!["10", ":", "00", "p", ".", "m"]; + tokenize_assert("10:00p.m", comp); +} + +#[test] +fn test_tokenize27() { + let comp = vec!["10", ":", "00", "a", ".", "m", "."]; + tokenize_assert("10:00a.m.", comp); +} + +#[test] +fn test_tokenize28() { + let comp = vec!["10", ":", "00", "p", ".", "m", "."]; + 
tokenize_assert("10:00p.m.", comp); +} + +#[test] +fn test_tokenize29() { + let comp = vec!["October"]; + tokenize_assert("October", comp); +} + +#[test] +fn test_tokenize30() { + let comp = vec!["31", "-", "Dec", "-", "00"]; + tokenize_assert("31-Dec-00", comp); +} + +#[test] +fn test_tokenize31() { + let comp = vec!["0", ":", "01", ":", "02"]; + tokenize_assert("0:01:02", comp); +} + +#[test] +fn test_tokenize32() { + let comp = vec!["12", "h", " ", "01", "m", "02", "s", " ", "am"]; + tokenize_assert("12h 01m02s am", comp); +} + +#[test] +fn test_tokenize33() { + let comp = vec!["12", ":", "08", " ", "PM"]; + tokenize_assert("12:08 PM", comp); +} + +#[test] +fn test_tokenize34() { + let comp = vec!["01", "h", "02", "m", "03"]; + tokenize_assert("01h02m03", comp); +} + +#[test] +fn test_tokenize35() { + let comp = vec!["01", "h", "02"]; + tokenize_assert("01h02", comp); +} + +#[test] +fn test_tokenize36() { + let comp = vec!["01", "h", "02", "s"]; + tokenize_assert("01h02s", comp); +} + +#[test] +fn test_tokenize37() { + let comp = vec!["01", "m", "02"]; + tokenize_assert("01m02", comp); +} + +#[test] +fn test_tokenize38() { + let comp = vec!["01", "m", "02", "h"]; + tokenize_assert("01m02h", comp); +} + +#[test] +fn test_tokenize39() { + let comp = vec!["2004", " ", "10", " ", "Apr", " ", "11", "h", "30", "m"]; + tokenize_assert("2004 10 Apr 11h30m", comp); +} + +#[test] +fn test_tokenize40() { + let comp = vec!["Sep", " ", "03"]; + tokenize_assert("Sep 03", comp); +} + +#[test] +fn test_tokenize41() { + let comp = vec!["Sep", " ", "of", " ", "03"]; + tokenize_assert("Sep of 03", comp); +} + +#[test] +fn test_tokenize42() { + let comp = vec!["02", ":", "17", "NOV", "2017"]; + tokenize_assert("02:17NOV2017", comp); +} + +#[test] +fn test_tokenize43() { + let comp = vec!["Thu", " ", "Sep", " ", "10", ":", "36", ":", "28"]; + tokenize_assert("Thu Sep 10:36:28", comp); +} + +#[test] +fn test_tokenize44() { + let comp = vec!["Thu", " ", "10", ":", "36", ":", "28"]; + tokenize_assert("Thu 10:36:28", comp); +} + +#[test] +fn test_tokenize45() { + let comp = vec!["Wed"]; + tokenize_assert("Wed", comp); +} + +#[test] +fn test_tokenize46() { + let comp = vec!["Wednesday"]; + tokenize_assert("Wednesday", comp); +} + +#[test] +fn test_tokenize47() { + let comp = vec!["Thu", " ", "Sep", " ", "25", " ", "10", ":", "36", ":", "28", " ", "2003"]; + tokenize_assert("Thu Sep 25 10:36:28 2003", comp); +} + +#[test] +fn test_tokenize48() { + let comp = vec!["Thu", " ", "Sep", " ", "25", " ", "2003"]; + tokenize_assert("Thu Sep 25 2003", comp); +} + +#[test] +fn test_tokenize49() { + let comp = vec!["2003", "-", "09", "-", "25", "T", "10", ":", "49", ":", "41"]; + tokenize_assert("2003-09-25T10:49:41", comp); +} + +#[test] +fn test_tokenize50() { + let comp = vec!["2003", "-", "09", "-", "25", "T", "10", ":", "49"]; + tokenize_assert("2003-09-25T10:49", comp); +} + +#[test] +fn test_tokenize51() { + let comp = vec!["2003", "-", "09", "-", "25", "T", "10"]; + tokenize_assert("2003-09-25T10", comp); +} + +#[test] +fn test_tokenize52() { + let comp = vec!["2003", "-", "09", "-", "25"]; + tokenize_assert("2003-09-25", comp); +} + +#[test] +fn test_tokenize53() { + let comp = vec!["20030925", "T", "104941"]; + tokenize_assert("20030925T104941", comp); +} + +#[test] +fn test_tokenize54() { + let comp = vec!["20030925", "T", "1049"]; + tokenize_assert("20030925T1049", comp); +} + +#[test] +fn test_tokenize55() { + let comp = vec!["20030925", "T", "10"]; + tokenize_assert("20030925T10", comp); +} + +#[test] +fn 
test_tokenize56() { + let comp = vec!["20030925"]; + tokenize_assert("20030925", comp); +} + +#[test] +fn test_tokenize57() { + let comp = vec!["2003", "-", "09", "-", "25", " ", "10", ":", "49", ":", "41.502"]; + tokenize_assert("2003-09-25 10:49:41,502", comp); +} + +#[test] +fn test_tokenize58() { + let comp = vec!["199709020908"]; + tokenize_assert("199709020908", comp); +} + +#[test] +fn test_tokenize59() { + let comp = vec!["19970902090807"]; + tokenize_assert("19970902090807", comp); +} + +#[test] +fn test_tokenize60() { + let comp = vec!["2003", "-", "09", "-", "25"]; + tokenize_assert("2003-09-25", comp); +} + +#[test] +fn test_tokenize61() { + let comp = vec!["09", "-", "25", "-", "2003"]; + tokenize_assert("09-25-2003", comp); +} + +#[test] +fn test_tokenize62() { + let comp = vec!["25", "-", "09", "-", "2003"]; + tokenize_assert("25-09-2003", comp); +} + +#[test] +fn test_tokenize63() { + let comp = vec!["10", "-", "09", "-", "2003"]; + tokenize_assert("10-09-2003", comp); +} + +#[test] +fn test_tokenize64() { + let comp = vec!["10", "-", "09", "-", "03"]; + tokenize_assert("10-09-03", comp); +} + +#[test] +fn test_tokenize65() { + let comp = vec!["2003", ".", "09", ".", "25"]; + tokenize_assert("2003.09.25", comp); +} + +#[test] +fn test_tokenize66() { + let comp = vec!["09", ".", "25", ".", "2003"]; + tokenize_assert("09.25.2003", comp); +} + +#[test] +fn test_tokenize67() { + let comp = vec!["25", ".", "09", ".", "2003"]; + tokenize_assert("25.09.2003", comp); +} + +#[test] +fn test_tokenize68() { + let comp = vec!["10", ".", "09", ".", "2003"]; + tokenize_assert("10.09.2003", comp); +} + +#[test] +fn test_tokenize69() { + let comp = vec!["10", ".", "09", ".", "03"]; + tokenize_assert("10.09.03", comp); +} + +#[test] +fn test_tokenize70() { + let comp = vec!["2003", "/", "09", "/", "25"]; + tokenize_assert("2003/09/25", comp); +} + +#[test] +fn test_tokenize71() { + let comp = vec!["09", "/", "25", "/", "2003"]; + tokenize_assert("09/25/2003", comp); +} + +#[test] +fn test_tokenize72() { + let comp = vec!["25", "/", "09", "/", "2003"]; + tokenize_assert("25/09/2003", comp); +} + +#[test] +fn test_tokenize73() { + let comp = vec!["10", "/", "09", "/", "2003"]; + tokenize_assert("10/09/2003", comp); +} + +#[test] +fn test_tokenize74() { + let comp = vec!["10", "/", "09", "/", "03"]; + tokenize_assert("10/09/03", comp); +} + +#[test] +fn test_tokenize75() { + let comp = vec!["2003", " ", "09", " ", "25"]; + tokenize_assert("2003 09 25", comp); +} + +#[test] +fn test_tokenize76() { + let comp = vec!["09", " ", "25", " ", "2003"]; + tokenize_assert("09 25 2003", comp); +} + +#[test] +fn test_tokenize77() { + let comp = vec!["25", " ", "09", " ", "2003"]; + tokenize_assert("25 09 2003", comp); +} + +#[test] +fn test_tokenize78() { + let comp = vec!["10", " ", "09", " ", "2003"]; + tokenize_assert("10 09 2003", comp); +} + +#[test] +fn test_tokenize79() { + let comp = vec!["10", " ", "09", " ", "03"]; + tokenize_assert("10 09 03", comp); +} + +#[test] +fn test_tokenize80() { + let comp = vec!["25", " ", "09", " ", "03"]; + tokenize_assert("25 09 03", comp); +} + +#[test] +fn test_tokenize81() { + let comp = vec!["03", " ", "25", " ", "Sep"]; + tokenize_assert("03 25 Sep", comp); +} + +#[test] +fn test_tokenize82() { + let comp = vec!["25", " ", "03", " ", "Sep"]; + tokenize_assert("25 03 Sep", comp); +} + +#[test] +fn test_tokenize83() { + let comp = vec![" ", " ", "July", " ", " ", " ", "4", " ", ",", " ", " ", "1976", " ", " ", " ", "12", ":", "01", ":", "02", " ", " ", " ", 
"am", " ", " "]; + tokenize_assert(" July 4 , 1976 12:01:02 am ", comp); +} + +#[test] +fn test_tokenize84() { + let comp = vec!["Wed", ",", " ", "July", " ", "10", ",", " ", "'", "96"]; + tokenize_assert("Wed, July 10, '96", comp); +} + +#[test] +fn test_tokenize85() { + let comp = vec!["1996", ".", "July", ".", "10", " ", "AD", " ", "12", ":", "08", " ", "PM"]; + tokenize_assert("1996.July.10 AD 12:08 PM", comp); +} + +#[test] +fn test_tokenize86() { + let comp = vec!["July", " ", "4", ",", " ", "1976"]; + tokenize_assert("July 4, 1976", comp); +} + +#[test] +fn test_tokenize87() { + let comp = vec!["7", " ", "4", " ", "1976"]; + tokenize_assert("7 4 1976", comp); +} + +#[test] +fn test_tokenize88() { + let comp = vec!["4", " ", "jul", " ", "1976"]; + tokenize_assert("4 jul 1976", comp); +} + +#[test] +fn test_tokenize89() { + let comp = vec!["7", "-", "4", "-", "76"]; + tokenize_assert("7-4-76", comp); +} + +#[test] +fn test_tokenize90() { + let comp = vec!["19760704"]; + tokenize_assert("19760704", comp); +} + +#[test] +fn test_tokenize91() { + let comp = vec!["0", ":", "01", ":", "02", " ", "on", " ", "July", " ", "4", ",", " ", "1976"]; + tokenize_assert("0:01:02 on July 4, 1976", comp); +} + +#[test] +fn test_tokenize92() { + let comp = vec!["0", ":", "01", ":", "02", " ", "on", " ", "July", " ", "4", ",", " ", "1976"]; + tokenize_assert("0:01:02 on July 4, 1976", comp); +} + +#[test] +fn test_tokenize93() { + let comp = vec!["July", " ", "4", ",", " ", "1976", " ", "12", ":", "01", ":", "02", " ", "am"]; + tokenize_assert("July 4, 1976 12:01:02 am", comp); +} + +#[test] +fn test_tokenize94() { + let comp = vec!["Mon", " ", "Jan", " ", " ", "2", " ", "04", ":", "24", ":", "27", " ", "1995"]; + tokenize_assert("Mon Jan 2 04:24:27 1995", comp); +} + +#[test] +fn test_tokenize95() { + let comp = vec!["04", ".", "04", ".", "95", " ", "00", ":", "22"]; + tokenize_assert("04.04.95 00:22", comp); +} + +#[test] +fn test_tokenize96() { + let comp = vec!["Jan", " ", "1", " ", "1999", " ", "11", ":", "23", ":", "34.578"]; + tokenize_assert("Jan 1 1999 11:23:34.578", comp); +} + +#[test] +fn test_tokenize97() { + let comp = vec!["950404", " ", "122212"]; + tokenize_assert("950404 122212", comp); +} + +#[test] +fn test_tokenize98() { + let comp = vec!["3", "rd", " ", "of", " ", "May", " ", "2001"]; + tokenize_assert("3rd of May 2001", comp); +} + +#[test] +fn test_tokenize99() { + let comp = vec!["5", "th", " ", "of", " ", "March", " ", "2001"]; + tokenize_assert("5th of March 2001", comp); +} + +#[test] +fn test_tokenize100() { + let comp = vec!["1", "st", " ", "of", " ", "May", " ", "2003"]; + tokenize_assert("1st of May 2003", comp); +} + +#[test] +fn test_tokenize101() { + let comp = vec!["0099", "-", "01", "-", "01", "T", "00", ":", "00", ":", "00"]; + tokenize_assert("0099-01-01T00:00:00", comp); +} + +#[test] +fn test_tokenize102() { + let comp = vec!["0031", "-", "01", "-", "01", "T", "00", ":", "00", ":", "00"]; + tokenize_assert("0031-01-01T00:00:00", comp); +} + +#[test] +fn test_tokenize103() { + let comp = vec!["20080227", "T", "21", ":", "26", ":", "01.123456789"]; + tokenize_assert("20080227T21:26:01.123456789", comp); +} + +#[test] +fn test_tokenize104() { + let comp = vec!["13", "NOV", "2017"]; + tokenize_assert("13NOV2017", comp); +} + +#[test] +fn test_tokenize105() { + let comp = vec!["0003", "-", "03", "-", "04"]; + tokenize_assert("0003-03-04", comp); +} + +#[test] +fn test_tokenize106() { + let comp = vec!["December", ".", "0031", ".", "30"]; + 
tokenize_assert("December.0031.30", comp); +} + +#[test] +fn test_tokenize107() { + let comp = vec!["090107"]; + tokenize_assert("090107", comp); +} + +#[test] +fn test_tokenize108() { + let comp = vec!["2015", "-", "15", "-", "May"]; + tokenize_assert("2015-15-May", comp); +} + +#[test] +fn test_tokenize109() { + let comp = vec!["Thu", " ", "Sep", " ", "25", " ", "10", ":", "36", ":", "28", " ", "BRST", " ", "2003"]; + tokenize_assert("Thu Sep 25 10:36:28 BRST 2003", comp); +} + +#[test] +fn test_tokenize110() { + let comp = vec!["2003", " ", "10", ":", "36", ":", "28", " ", "BRST", " ", "25", " ", "Sep", " ", "Thu"]; + tokenize_assert("2003 10:36:28 BRST 25 Sep Thu", comp); +} + +#[test] +fn test_tokenize111() { + let comp = vec!["Thu", ",", " ", "25", " ", "Sep", " ", "2003", " ", "10", ":", "49", ":", "41", " ", "-", "0300"]; + tokenize_assert("Thu, 25 Sep 2003 10:49:41 -0300", comp); +} + +#[test] +fn test_tokenize112() { + let comp = vec!["2003", "-", "09", "-", "25", "T", "10", ":", "49", ":", "41.5", "-", "03", ":", "00"]; + tokenize_assert("2003-09-25T10:49:41.5-03:00", comp); +} + +#[test] +fn test_tokenize113() { + let comp = vec!["2003", "-", "09", "-", "25", "T", "10", ":", "49", ":", "41", "-", "03", ":", "00"]; + tokenize_assert("2003-09-25T10:49:41-03:00", comp); +} + +#[test] +fn test_tokenize114() { + let comp = vec!["20030925", "T", "104941.5", "-", "0300"]; + tokenize_assert("20030925T104941.5-0300", comp); +} + +#[test] +fn test_tokenize115() { + let comp = vec!["20030925", "T", "104941", "-", "0300"]; + tokenize_assert("20030925T104941-0300", comp); +} + +#[test] +fn test_tokenize116() { + let comp = vec!["10", "-", "09", "-", "2003"]; + tokenize_assert("10-09-2003", comp); +} + +#[test] +fn test_tokenize117() { + let comp = vec!["10", ".", "09", ".", "2003"]; + tokenize_assert("10.09.2003", comp); +} + +#[test] +fn test_tokenize118() { + let comp = vec!["10", "/", "09", "/", "2003"]; + tokenize_assert("10/09/2003", comp); +} + +#[test] +fn test_tokenize119() { + let comp = vec!["10", " ", "09", " ", "2003"]; + tokenize_assert("10 09 2003", comp); +} + +#[test] +fn test_tokenize120() { + let comp = vec!["090107"]; + tokenize_assert("090107", comp); +} + +#[test] +fn test_tokenize121() { + let comp = vec!["2015", " ", "09", " ", "25"]; + tokenize_assert("2015 09 25", comp); +} + +#[test] +fn test_tokenize122() { + let comp = vec!["10", "-", "09", "-", "03"]; + tokenize_assert("10-09-03", comp); +} + +#[test] +fn test_tokenize123() { + let comp = vec!["10", ".", "09", ".", "03"]; + tokenize_assert("10.09.03", comp); +} + +#[test] +fn test_tokenize124() { + let comp = vec!["10", "/", "09", "/", "03"]; + tokenize_assert("10/09/03", comp); +} + +#[test] +fn test_tokenize125() { + let comp = vec!["10", " ", "09", " ", "03"]; + tokenize_assert("10 09 03", comp); +} + +#[test] +fn test_tokenize126() { + let comp = vec!["090107"]; + tokenize_assert("090107", comp); +} + +#[test] +fn test_tokenize127() { + let comp = vec!["2015", " ", "09", " ", "25"]; + tokenize_assert("2015 09 25", comp); +} + +#[test] +fn test_tokenize128() { + let comp = vec!["090107"]; + tokenize_assert("090107", comp); +} + +#[test] +fn test_tokenize129() { + let comp = vec!["2015", " ", "09", " ", "25"]; + tokenize_assert("2015 09 25", comp); +} + +#[test] +fn test_tokenize130() { + let comp = vec!["April", " ", "2009"]; + tokenize_assert("April 2009", comp); +} + +#[test] +fn test_tokenize131() { + let comp = vec!["Feb", " ", "2007"]; + tokenize_assert("Feb 2007", comp); +} + +#[test] +fn 
test_tokenize132() { + let comp = vec!["Feb", " ", "2008"]; + tokenize_assert("Feb 2008", comp); +} + +#[test] +fn test_tokenize133() { + let comp = vec!["Thu", " ", "Sep", " ", "25", " ", "10", ":", "36", ":", "28", " ", "BRST", " ", "2003"]; + tokenize_assert("Thu Sep 25 10:36:28 BRST 2003", comp); +} + +#[test] +fn test_tokenize134() { + let comp = vec!["1996", ".", "07", ".", "10", " ", "AD", " ", "at", " ", "15", ":", "08", ":", "56", " ", "PDT"]; + tokenize_assert("1996.07.10 AD at 15:08:56 PDT", comp); +} + +#[test] +fn test_tokenize135() { + let comp = vec!["Tuesday", ",", " ", "April", " ", "12", ",", " ", "1952", " ", "AD", " ", "3", ":", "30", ":", "42", "pm", " ", "PST"]; + tokenize_assert("Tuesday, April 12, 1952 AD 3:30:42pm PST", comp); +} + +#[test] +fn test_tokenize136() { + let comp = vec!["November", " ", "5", ",", " ", "1994", ",", " ", "8", ":", "15", ":", "30", " ", "am", " ", "EST"]; + tokenize_assert("November 5, 1994, 8:15:30 am EST", comp); +} + +#[test] +fn test_tokenize137() { + let comp = vec!["1994", "-", "11", "-", "05", "T", "08", ":", "15", ":", "30", "-", "05", ":", "00"]; + tokenize_assert("1994-11-05T08:15:30-05:00", comp); +} + +#[test] +fn test_tokenize138() { + let comp = vec!["1994", "-", "11", "-", "05", "T", "08", ":", "15", ":", "30", "Z"]; + tokenize_assert("1994-11-05T08:15:30Z", comp); +} + +#[test] +fn test_tokenize139() { + let comp = vec!["1976", "-", "07", "-", "04", "T", "00", ":", "01", ":", "02", "Z"]; + tokenize_assert("1976-07-04T00:01:02Z", comp); +} + +#[test] +fn test_tokenize140() { + let comp = vec!["Tue", " ", "Apr", " ", "4", " ", "00", ":", "22", ":", "12", " ", "PDT", " ", "1995"]; + tokenize_assert("Tue Apr 4 00:22:12 PDT 1995", comp); +} + +#[test] +fn test_tokenize141() { + let comp = vec!["Today", " ", "is", " ", "25", " ", "of", " ", "September", " ", "of", " ", "2003", ",", " ", "exactly", " ", "at", " ", "10", ":", "49", ":", "41", " ", "with", " ", "timezone", " ", "-", "03", ":", "00", "."]; + tokenize_assert("Today is 25 of September of 2003, exactly at 10:49:41 with timezone -03:00.", comp); +} + +#[test] +fn test_tokenize142() { + let comp = vec!["Today", " ", "is", " ", "25", " ", "of", " ", "September", " ", "of", " ", "2003", ",", " ", "exactly", " ", "at", " ", "10", ":", "49", ":", "41", " ", "with", " ", "timezone", " ", "-", "03", ":", "00", "."]; + tokenize_assert("Today is 25 of September of 2003, exactly at 10:49:41 with timezone -03:00.", comp); +} diff --git a/src/tokenize.rs b/src/tokenize.rs index 38091a8..bb982a0 100644 --- a/src/tokenize.rs +++ b/src/tokenize.rs @@ -1,5 +1,6 @@ pub(crate) struct Tokenizer { token_stack: Vec, + // TODO: Should this be more generic? io::Read for example? parse_string: String, } @@ -13,12 +14,49 @@ pub(crate) enum ParseState { } impl Tokenizer { - pub(crate) fn new(parse_string: String) -> Self { + + pub(crate) fn new(parse_string: &str) -> Self { Tokenizer { - token_stack: Vec::new(), + token_stack: vec![], parse_string: parse_string.chars().rev().collect(), } } + + fn isword(&self, c: char) -> bool { + c.is_alphabetic() + } + + fn isnum(&self, c: char) -> bool { + c.is_numeric() + } + + fn isspace(&self, c: char) -> bool { + c.is_whitespace() + } + + fn decimal_split(&self, s: &str) -> Vec { + // Handles the same thing as Python's re.split() + let mut tokens: Vec = vec!["".to_owned()]; + + for c in s.chars() { + if c == '.' 
|| c == ',' { + tokens.push(c.to_string()); + tokens.push("".to_owned()); + } else { + // UNWRAP: Initial setup guarantees we always have an item + let mut t = tokens.pop().unwrap(); + t.push(c); + tokens.push(t); + } + } + + // TODO: Do I really have to use &String instead of &str? + if tokens.last() == Some(&"".to_owned()) { + tokens.pop(); + } + + tokens + } } impl Iterator for Tokenizer { @@ -26,182 +64,115 @@ impl Iterator for Tokenizer { fn next(&mut self) -> Option { if !self.token_stack.is_empty() { - return Some(self.token_stack.pop().unwrap()); - }; - if self.parse_string.is_empty() { - return None; - }; + return Some(self.token_stack.remove(0)); + } - let mut char_stack: Vec = Vec::new(); - let mut seen_letters = false; + let mut seenletters = false; + let mut token: Option = None; let mut state = ParseState::Empty; - while let Some(next) = self.parse_string.pop() { + while !self.parse_string.is_empty() { + // Dateutil uses a separate `charstack` to manage the incoming stream. + // Because parse_string can have things pushed back onto it, we skip + // a couple of steps related to the `charstack`. + + // UNWRAP: Just checked that parse_string isn't empty + let nextchar = self.parse_string.pop().unwrap(); + match state { ParseState::Empty => { - if next.is_numeric() { - state = ParseState::Numeric; - char_stack.push(next); - } else if next.is_alphabetic() { + token = Some(nextchar.to_string()); + if self.isword(nextchar) { state = ParseState::Alpha; - seen_letters = true; - char_stack.push(next); - } else if next.is_whitespace() { - char_stack.push(' '); + } else if self.isnum(nextchar) { + state = ParseState::Numeric; + } else if self.isspace(nextchar) { + token = Some(" ".to_owned()); break; } else { - char_stack.push(next); break; } - } + }, ParseState::Alpha => { - if next.is_alphabetic() { - char_stack.push(next); - } else if next == '.' { + seenletters = true; + if self.isword(nextchar) { + // UNWRAP: Because we're in non-empty parse state, we're guaranteed to have a token + token.as_mut().unwrap().push(nextchar); + } else if nextchar == '.' { + token.as_mut().unwrap().push(nextchar); state = ParseState::AlphaDecimal; - char_stack.push(next); } else { - // We don't recognize the character, so push it back - // to be handled later. - self.parse_string.push(next); + self.parse_string.push(nextchar); break; } - } - ParseState::AlphaDecimal => { - if next == '.' || next.is_alphabetic() { - char_stack.push(next); - } else if next.is_numeric() && char_stack.last().unwrap().clone() == '.' { - char_stack.push(next); - state = ParseState::NumericDecimal; - } else { - self.parse_string.push(next); - break; - } - } + }, ParseState::Numeric => { - if next.is_numeric() { - char_stack.push(next); - } else if next == '.' || (next == ',' && char_stack.len() >= 2) { - char_stack.push(next); + if self.isnum(nextchar) { + // UNWRAP: Because we're in non-empty parse state, we're guaranteed to have a token + token.as_mut().unwrap().push(nextchar); + } else if nextchar == '.' || (nextchar == ',' && token.as_ref().unwrap().len() >= 2) { + token.as_mut().unwrap().push(nextchar); state = ParseState::NumericDecimal; } else { - // We don't recognize the character, so push it back - // to be handled later - self.parse_string.push(next); + self.parse_string.push(nextchar); break; } - } + }, + ParseState::AlphaDecimal => { + seenletters = true; + if nextchar == '.' 
|| self.isword(nextchar) { + // UNWRAP: Because we're in non-empty parse state, we're guaranteed to have a token + token.as_mut().unwrap().push(nextchar); + } else if self.isnum(nextchar) && token.as_ref().unwrap().chars().last() == Some('.') { + token.as_mut().unwrap().push(nextchar); + state = ParseState::NumericDecimal; + } else { + self.parse_string.push(nextchar); + break; + } + }, ParseState::NumericDecimal => { - if next == '.' || next.is_numeric() { - char_stack.push(next); - } else if next.is_alphabetic() && char_stack.last().unwrap().clone() == '.' { - char_stack.push(next); + if nextchar == '.' || self.isnum(nextchar) { + // UNWRAP: Because we're in non-empty parse state, we're guaranteed to have a token + token.as_mut().unwrap().push(nextchar); + } else if self.isword(nextchar) && token.as_ref().unwrap().chars().last() == Some('.') { + token.as_mut().unwrap().push(nextchar); state = ParseState::AlphaDecimal; } else { - self.parse_string.push(next); + self.parse_string.push(nextchar); break; } } } } - // I like Python's version of this much better: - // needs_split = seen_letters or char_stack.count('.') > 1 or char_stack[-1] in '.,' - let dot_count = char_stack.iter().fold(0, |count, character| { - count + (if character == &'.' { 1 } else { 0 }) - }); - let needs_split = seen_letters || dot_count > 1 || char_stack.last().unwrap() == &'.' - || char_stack.last().unwrap() == &','; - let final_string: String = char_stack.into_iter().collect(); - - let mut tokens = match state { - ParseState::Empty => vec![final_string], - ParseState::Alpha => vec![final_string], - ParseState::Numeric => vec![final_string], - ParseState::AlphaDecimal => { - if needs_split { - decimal_split(&final_string, false) - } else { - vec![final_string] + // Python uses the state to short-circuit and make sure it doesn't run into issues with None + // We do something slightly different to express the same logic + if state == ParseState::AlphaDecimal || state == ParseState::NumericDecimal { + // UNWRAP: The state check guarantees that we have a value + let dot_count = token.as_ref().unwrap().chars().filter(|c| *c == '.').count(); + let last_char = token.as_ref().unwrap().chars().last(); + let last_splittable = last_char == Some('.') || last_char == Some(','); + + if seenletters || dot_count > 1 || last_splittable { + let mut l = self.decimal_split(token.as_ref().unwrap()); + let remaining = l.split_off(1); + + token = Some(l[0].clone()); + for t in remaining { + self.token_stack.push(t); } } - ParseState::NumericDecimal => { - if needs_split { - decimal_split(&final_string, dot_count == 0) - } else { - vec![final_string] - } + + if state == ParseState::NumericDecimal && dot_count == 0 { + token = Some(token.unwrap().replace(',', ".")); } - }.into_iter() - .rev() - .collect(); - - self.token_stack.append(&mut tokens); - // UNWRAP: Previous match guaranteed that at least one token was added - let token = self.token_stack.pop().unwrap(); - if state == ParseState::NumericDecimal && !token.contains(".") { - Some(token.replace(",", ".")) - } else { - Some(token) } + + token } } -fn decimal_split(characters: &str, cast_period: bool) -> Vec { - let mut token_stack: Vec = Vec::new(); - let mut char_stack: Vec = Vec::new(); - let mut state = ParseState::Empty; - - for c in characters.chars() { - match state { - ParseState::Empty => { - if c.is_alphabetic() { - char_stack.push(c); - state = ParseState::Alpha; - } else if c.is_numeric() { - char_stack.push(c); - state = ParseState::Numeric; - } else { - let character = if 
From c954a533c3ba94a6213f2c20fa395a2b169a8ea4 Mon Sep 17 00:00:00 2001
From: Bradlee Speice
Date: Sun, 8 Jul 2018 15:11:29 -0400
Subject: [PATCH 07/12] It's working! Still need to add more tests, but I
 think we're mostly good to go

---
 src/lib.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/lib.rs b/src/lib.rs
index 02c6a1a..334bae2 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -953,9 +953,12 @@ impl Parser {
                     let hour = value.to_i64().unwrap() as i32;
                     let ampm = info.get_ampm(&tokens[idx + 2]).unwrap();
                     res.hour = Some(self.adjust_ampm(hour, ampm));
+                    idx += 1;
                 } else {
                     ymd.append(value.floor().to_i64().unwrap() as i32, &value_repr, None)?;
                 }
+
+                idx += 1;
             } else if info.get_ampm(&tokens[idx + 1]).is_some()
                 && (*ZERO <= value && value < *TWENTY_FOUR)
             {
@@ -1098,6 +1101,7 @@ impl Parser {
 
     fn recombine_skipped(&self, skipped_idxs: Vec<usize>, tokens: Vec<String>) -> Vec<String> {
         let mut skipped_tokens: Vec<String> = vec![];
+        println!("idxs: {:?}, tokens: {:?}", skipped_idxs, tokens);
 
         let mut sorted_idxs = skipped_idxs.clone();
         sorted_idxs.sort();
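
For context on the `recombine_skipped` function touched above: it merges runs of adjacent skipped-token indices back into whole strings, so fuzzy parsing can report the text it ignored. A self-contained sketch consistent with the diff (simplified to a free function; not the library's exact code):

```rust
// Adjacent skipped indices extend the previous fragment; gaps start a new one.
fn recombine_skipped(skipped_idxs: Vec<usize>, tokens: Vec<String>) -> Vec<String> {
    let mut skipped_tokens: Vec<String> = vec![];

    let mut sorted_idxs = skipped_idxs.clone();
    sorted_idxs.sort();

    for (i, idx) in sorted_idxs.iter().enumerate() {
        if i > 0 && *idx == sorted_idxs[i - 1] + 1 {
            // Consecutive index: append to the fragment we're building
            let last = skipped_tokens.len() - 1;
            let merged = format!("{}{}", skipped_tokens[last], tokens[*idx]);
            skipped_tokens[last] = merged;
        } else {
            skipped_tokens.push(tokens[*idx].clone());
        }
    }

    skipped_tokens
}

fn main() {
    let tokens: Vec<String> = vec!["Today", " ", "is", " ", "25"]
        .into_iter().map(String::from).collect();
    // Indices 0..=2 are adjacent, so they merge into one fragment
    assert_eq!(recombine_skipped(vec![0, 1, 2], tokens), vec!["Today is".to_string()]);
}
```
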
From c03817858302d05eecf1bb5cb16731d8bd27151c Mon Sep 17 00:00:00 2001
From: Bradlee Speice
Date: Sun, 8 Jul 2018 15:23:51 -0400
Subject: [PATCH 08/12] testFuzzyAMPMProblem is causing us issues

---
 build_pycompat.py            | 28 ++++++++++++++
 src/tests/pycompat_parser.rs | 72 ++++++++++++++++++++++++++++++++++++
 2 files changed, 100 insertions(+)

diff --git a/build_pycompat.py b/build_pycompat.py
index 036233f..a63cf54 100644
--- a/build_pycompat.py
+++ b/build_pycompat.py
@@ -86,6 +86,14 @@ tests = {
     'test_fuzzy_tokens_tzinfo': [
         'Today is 25 of September of 2003, exactly at 10:49:41 with timezone -03:00.'
     ],
+    'test_fuzzy_simple': [
+        'I have a meeting on March 1, 1974', # testFuzzyAMPMProblem
+        'On June 8th, 2020, I am going to be the first man on Mars', # testFuzzyAMPMProblem
+        'Meet me at the AM/PM on Sunset at 3:00 AM on December 3rd, 2003', # testFuzzyAMPMProblem
+        'Meet me at 3:00 AM on December 3rd, 2003 at the AM/PM on Sunset', # testFuzzyAMPMProblem
+        'Jan 29, 1945 14:45 AM I going to see you there?', # testFuzzyIgnoreAMPM
+        '2017-07-17 06:15:', # test_idx_check
+    ],
     'test_parse_default_ignore': [
     ],
 }
@@ -172,6 +180,13 @@ def test_fuzzy_tokens_tzinfo(i, s):
         tokens=r_tokens
     )
 
+
+def test_fuzzy_simple(i, s):
+    d = parse(s, fuzzy=True)
+
+    return TEST_FUZZY_SIMPLE.format(i=i, d=d, s=s)
+
+
 # Here lies all the ugly junk.
 TEST_HEADER = '''
 extern crate chrono;
@@ -449,6 +464,19 @@ fn test_fuzzy_tokens_tzinfo{i}() {{
         None, false, HashMap::new());
 }}\n'''
 
+TEST_FUZZY_SIMPLE = '''
+#[test]
+fn test_fuzzy_simple{i}() {{
+    let info = ParserInfo::default();
+    let pdt = PyDateTime {{
+        year: {d.year}, month: {d.month}, day: {d.day},
+        hour: {d.hour}, minute: {d.minute}, second: {d.second},
+        micros: {d.microsecond}, tzo: None
+    }};
+    parse_fuzzy_and_assert(pdt, None, info, "{s}", None, None, true, false,
+        None, false, HashMap::new());
+}}\n'''
+
 
 if __name__ == '__main__':
     main()
\ No newline at end of file
diff --git a/src/tests/pycompat_parser.rs b/src/tests/pycompat_parser.rs
index 647f7a5..624ee7d 100644
--- a/src/tests/pycompat_parser.rs
+++ b/src/tests/pycompat_parser.rs
@@ -1757,3 +1757,75 @@ fn test_fuzzy_tokens_tzinfo0() {
     parse_fuzzy_and_assert(pdt, Some(tokens), info, "Today is 25 of September of 2003, exactly at 10:49:41 with timezone -03:00.", None, None, true, true,
         None, false, HashMap::new());
 }
+
+#[test]
+fn test_fuzzy_simple0() {
+    let info = ParserInfo::default();
+    let pdt = PyDateTime {
+        year: 1974, month: 3, day: 1,
+        hour: 0, minute: 0, second: 0,
+        micros: 0, tzo: None
+    };
+    parse_fuzzy_and_assert(pdt, None, info, "I have a meeting on March 1, 1974", None, None, true, false,
+        None, false, HashMap::new());
+}
+
+#[test]
+fn test_fuzzy_simple1() {
+    let info = ParserInfo::default();
+    let pdt = PyDateTime {
+        year: 2020, month: 6, day: 8,
+        hour: 0, minute: 0, second: 0,
+        micros: 0, tzo: None
+    };
+    parse_fuzzy_and_assert(pdt, None, info, "On June 8th, 2020, I am going to be the first man on Mars", None, None, true, false,
+        None, false, HashMap::new());
+}
+
+#[test]
+fn test_fuzzy_simple2() {
+    let info = ParserInfo::default();
+    let pdt = PyDateTime {
+        year: 2003, month: 12, day: 3,
+        hour: 3, minute: 0, second: 0,
+        micros: 0, tzo: None
+    };
+    parse_fuzzy_and_assert(pdt, None, info, "Meet me at the AM/PM on Sunset at 3:00 AM on December 3rd, 2003", None, None, true, false,
+        None, false, HashMap::new());
+}
+
+#[test]
+fn test_fuzzy_simple3() {
+    let info = ParserInfo::default();
+    let pdt = PyDateTime {
+        year: 2003, month: 12, day: 3,
+        hour: 3, minute: 0, second: 0,
+        micros: 0, tzo: None
+    };
+    parse_fuzzy_and_assert(pdt, None, info, "Meet me at 3:00 AM on December 3rd, 2003 at the AM/PM on Sunset", None, None, true, false,
+        None, false, HashMap::new());
+}
+
+#[test]
+fn test_fuzzy_simple4() {
+    let info = ParserInfo::default();
+    let pdt = PyDateTime {
+        year: 1945, month: 1, day: 29,
+        hour: 14, minute: 45, second: 0,
+        micros: 0, tzo: None
+    };
+    parse_fuzzy_and_assert(pdt, None, info, "Jan 29, 1945 14:45 AM I going to see you there?", None, None, true, false,
+        None, false, HashMap::new());
+}
+
+#[test]
+fn test_fuzzy_simple5() {
+    let info = ParserInfo::default();
+    let pdt = PyDateTime {
+        year: 2017, month: 7, day: 17,
+        hour: 6, minute: 15, second: 0,
+        micros: 0, tzo: None
+    };
+    parse_fuzzy_and_assert(pdt, None, info, "2017-07-17 06:15:", None, None, true, false,
+        None, false, HashMap::new());
+}
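
All of the generated `test_fuzzy_simple*` cases above funnel through the same `Parser::parse` call. A hedged usage sketch, with the argument order taken from the test harness and the crate assumed to be importable as `dtparse`:

```rust
extern crate dtparse;

use dtparse::{Parser, ParserInfo};
use std::collections::HashMap;

fn main() {
    let mut parser = Parser::new(ParserInfo::default());

    // fuzzy=true skips tokens the parser can't interpret ("I have a meeting on...")
    // instead of erroring; fuzzy_with_tokens=false discards the skipped text.
    let result = parser.parse(
        "I have a meeting on March 1, 1974",
        None,           // dayfirst
        None,           // yearfirst
        true,           // fuzzy
        false,          // fuzzy_with_tokens
        None,           // default datetime
        false,          // ignoretz
        HashMap::new(), // tzinfos
    ).expect("fuzzy parse should succeed");

    // result is (datetime, timezone offset, skipped tokens)
    println!("{:?}", result.0);
}
```
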
From 9008ee8339bce63f645d7b44bb3f4a36b416e405 Mon Sep 17 00:00:00 2001
From: Bradlee Speice
Date: Sun, 8 Jul 2018 21:16:43 -0400
Subject: [PATCH 09/12] Regenerate tests

Need to make this automated
---
 src/lib.rs                      |  2 +-
 src/tests/pycompat_tokenizer.rs | 36 +++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/src/lib.rs b/src/lib.rs
index 334bae2..695667f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1068,7 +1068,6 @@ impl Parser {
             if hms == 0 {
                 res.hour = Some(value.to_i64().unwrap() as i32);
                 if !close_to_integer(&value) {
-                    // TODO: High probability of issues with rounding here.
                     res.minute = Some((*SIXTY * (value % *ONE)).to_i64().unwrap() as i32);
                 }
             } else if hms == 1 {
@@ -1088,6 +1087,7 @@ impl Parser {
     }
 
     fn parse_min_sec(&self, value: Decimal) -> (i32, Option<f64>) {
+        // UNWRAP: i64 guaranteed to be fine because of preceding floor
        let minute = value.floor().to_i64().unwrap() as i32;
        let mut second = None;
 
diff --git a/src/tests/pycompat_tokenizer.rs b/src/tests/pycompat_tokenizer.rs
index 6ba6a21..fbf35c8 100644
--- a/src/tests/pycompat_tokenizer.rs
+++ b/src/tests/pycompat_tokenizer.rs
@@ -863,3 +863,39 @@ fn test_tokenize142() {
     let comp = vec!["Today", " ", "is", " ", "25", " ", "of", " ", "September", " ", "of", " ", "2003", ",", " ", "exactly", " ", "at", " ", "10", ":", "49", ":", "41", " ", "with", " ", "timezone", " ", "-", "03", ":", "00", "."];
     tokenize_assert("Today is 25 of September of 2003, exactly at 10:49:41 with timezone -03:00.", comp);
 }
+
+#[test]
+fn test_tokenize143() {
+    let comp = vec!["I", " ", "have", " ", "a", " ", "meeting", " ", "on", " ", "March", " ", "1", ",", " ", "1974"];
+    tokenize_assert("I have a meeting on March 1, 1974", comp);
+}
+
+#[test]
+fn test_tokenize144() {
+    let comp = vec!["On", " ", "June", " ", "8", "th", ",", " ", "2020", ",", " ", "I", " ", "am", " ", "going", " ", "to", " ", "be", " ", "the", " ", "first", " ", "man", " ", "on", " ", "Mars"];
+    tokenize_assert("On June 8th, 2020, I am going to be the first man on Mars", comp);
+}
+
+#[test]
+fn test_tokenize145() {
+    let comp = vec!["Meet", " ", "me", " ", "at", " ", "the", " ", "AM", "/", "PM", " ", "on", " ", "Sunset", " ", "at", " ", "3", ":", "00", " ", "AM", " ", "on", " ", "December", " ", "3", "rd", ",", " ", "2003"];
+    tokenize_assert("Meet me at the AM/PM on Sunset at 3:00 AM on December 3rd, 2003", comp);
+}
+
+#[test]
+fn test_tokenize146() {
+    let comp = vec!["Meet", " ", "me", " ", "at", " ", "3", ":", "00", " ", "AM", " ", "on", " ", "December", " ", "3", "rd", ",", " ", "2003", " ", "at", " ", "the", " ", "AM", "/", "PM", " ", "on", " ", "Sunset"];
+    tokenize_assert("Meet me at 3:00 AM on December 3rd, 2003 at the AM/PM on Sunset", comp);
+}
+
+#[test]
+fn test_tokenize147() {
+    let comp = vec!["Jan", " ", "29", ",", " ", "1945", " ", "14", ":", "45", " ", "AM", " ", "I", " ", "going", " ", "to", " ", "see", " ", "you", " ", "there", "?"];
+    tokenize_assert("Jan 29, 1945 14:45 AM I going to see you there?", comp);
+}
+
+#[test]
+fn test_tokenize148() {
+    let comp = vec!["2017", "-", "07", "-", "17", " ", "06", ":", "15", ":"];
+    tokenize_assert("2017-07-17 06:15:", comp);
+}
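
The `parse_min_sec` function annotated above splits a fractional minute value into whole minutes plus derived seconds. A worked model of the arithmetic in plain `f64` (the library itself operates on `Decimal`):

```rust
// f64 stand-in for the Decimal-based parse_min_sec: the floor becomes the
// minute, and any fractional remainder scales by 60 into seconds. The floor
// is also what makes the integer cast safe, per the UNWRAP comment above.
fn parse_min_sec(value: f64) -> (i32, Option<f64>) {
    let minute = value.floor() as i32;
    let remainder = value - value.floor();

    let second = if remainder > 0.0 {
        Some(60.0 * remainder)
    } else {
        None
    };

    (minute, second)
}

fn main() {
    assert_eq!(parse_min_sec(15.5), (15, Some(30.0))); // e.g. "06:15.5" -> 15m 30s
    assert_eq!(parse_min_sec(15.0), (15, None));
}
```
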
From 5152ced2f13a751bcfa981f29ccf0d0dea2d4040 Mon Sep 17 00:00:00 2001
From: Bradlee Speice
Date: Sun, 8 Jul 2018 21:31:18 -0400
Subject: [PATCH 10/12] One last test case, then version 0.9!

---
 src/lib.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 695667f..422b597 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -196,7 +196,9 @@ impl ParserInfo {
 
     fn get_ampm(&self, name: &str) -> Option<bool> {
         if let Some(v) = self.ampm.get(&name.to_lowercase()) {
-            Some(v.to_owned() == 1)
+            // Python technically uses numbers here, but given that the numbers are
+            // only 0 and 1, it's easier to use booleans
+            Some(*v == 1)
         } else {
             None
         }
@@ -639,7 +641,7 @@ impl Parser {
             let is_ampm = self.ampm_valid(res.hour, res.ampm, fuzzy);
 
             if is_ampm.is_ok() {
-                res.hour = Some(self.adjust_ampm(res.hour.unwrap(), value));
+                res.hour = res.hour.map(|h| self.adjust_ampm(h, value));
                 res.ampm = Some(value);
             } else if fuzzy {
                 skipped_idxs.push(i);

From d55e67830f42c37916659b4a44f78631d217e350 Mon Sep 17 00:00:00 2001
From: Bradlee Speice
Date: Sun, 8 Jul 2018 21:51:02 -0400
Subject: [PATCH 11/12] All tests are working!

---
 src/lib.rs | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 422b597..53bc165 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -640,7 +640,7 @@ impl Parser {
             } else if let Some(value) = self.info.get_ampm(&l[i]) {
                 let is_ampm = self.ampm_valid(res.hour, res.ampm, fuzzy);
 
-                if is_ampm.is_ok() {
+                if is_ampm == Ok(true) {
                     res.hour = res.hour.map(|h| self.adjust_ampm(h, value));
                     res.ampm = Some(value);
                 } else if fuzzy {
@@ -750,25 +750,27 @@ impl Parser {
     }
 
     fn ampm_valid(&self, hour: Option<i32>, ampm: Option<bool>, fuzzy: bool) -> ParseResult<bool> {
-        if fuzzy && ampm == Some(true) {
-            return Ok(false);
+        let mut val_is_ampm = true;
+
+        if fuzzy && ampm.is_some() {
+            val_is_ampm = false;
         }
 
         if hour.is_none() {
             if fuzzy {
-                Ok(false)
+                val_is_ampm = false;
             } else {
-                Err(ParseError::AmPmWithoutHour)
+                return Err(ParseError::AmPmWithoutHour);
             }
         } else if !(0 <= hour.unwrap() && hour.unwrap() <= 12) {
             if fuzzy {
-                Ok(false)
+                val_is_ampm = false;
             } else {
-                Err(ParseError::InvalidHour)
+                return Err(ParseError::InvalidHour);
             }
-        } else {
-            Ok(false)
         }
+
+        Ok(val_is_ampm)
     }
 
     fn build_naive(&self, res: &ParsingResult, default: &NaiveDateTime) -> ParseResult<NaiveDateTime> {
@@ -1103,7 +1105,6 @@ impl Parser {
 
     fn recombine_skipped(&self, skipped_idxs: Vec<usize>, tokens: Vec<String>) -> Vec<String> {
         let mut skipped_tokens: Vec<String> = vec![];
-        println!("idxs: {:?}, tokens: {:?}", skipped_idxs, tokens);
 
         let mut sorted_idxs = skipped_idxs.clone();
         sorted_idxs.sort();
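
Once `ampm_valid` returns `Ok(true)`, the hour is adjusted by `adjust_ampm`. A sketch of the 12-hour to 24-hour conversion consistent with the tests above (an illustrative free function, not copied from the library):

```rust
// PM adds 12 to hours below 12, and 12 AM wraps to 0; anything else passes
// through. Out-of-range hours like the "14:45 AM" in test_fuzzy_simple4
// never reach this step in fuzzy mode, because ampm_valid rejects them.
fn adjust_ampm(hour: i32, ampm: bool) -> i32 {
    if hour < 12 && ampm {
        hour + 12
    } else if hour == 12 && !ampm {
        0
    } else {
        hour
    }
}

fn main() {
    assert_eq!(adjust_ampm(3, true), 15);  // 3:00 PM -> 15:00
    assert_eq!(adjust_ampm(12, false), 0); // 12:00 AM -> 00:00
    assert_eq!(adjust_ampm(10, false), 10);
}
```
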
From ac920949d292b85d32372776214d36c54dcbaadb Mon Sep 17 00:00:00 2001
From: Bradlee Speice
Date: Sun, 8 Jul 2018 21:54:28 -0400
Subject: [PATCH 12/12] Fuzzy mode is now tested

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 721d85a..7a37ddd 100644
--- a/README.md
+++ b/README.md
@@ -17,8 +17,8 @@ Supported in v0.8
    theoretically would provide support, but I'd also like some helper things available (e.g. "EST" is not
    a named zone in `chrono-tz`). Explicit time zones (i.e. "00:00:00 -0300") are working as expected.
-3. "Fuzzy" and "Fuzzy with tokens" modes haven't been tested. The code should work, but I need to get the
-test cases added to the auto-generation suite
+3. ~~"Fuzzy" and "Fuzzy with tokens" modes haven't been tested. The code should work, but I need to get the
+test cases added to the auto-generation suite~~
 
 **Non-functional**: This library is intended to be a direct port from Python, and thus the code looks
 a lot more like Python than it does Rust. There are a ton of `TODO` comments in the code