From e049618fffa70a8f217cc90ec981a00845a1a56d Mon Sep 17 00:00:00 2001
From: Bradlee Speice
Date: Sat, 7 Jul 2018 23:37:02 -0400
Subject: [PATCH] Redo the tokenization

Still has issues with one test case for fuzzy
---
 build_pycompat.py                  |  11 +-
 build_pycompat_tokenizer.py        |  35 +
 src/lib.rs                         |   2 +-
 src/{tests.rs => tests/fuzzing.rs} |   0
 src/tests/mod.rs                   |   3 +
 .../tests/pycompat_parser.rs       |   9 +-
 src/tests/pycompat_tokenizer.rs    | 865 ++++++++++++++++++
 src/tokenize.rs                    | 257 +++---
 src/weekday.rs                     |   2 +
 9 files changed, 1029 insertions(+), 155 deletions(-)
 create mode 100644 build_pycompat_tokenizer.py
 rename src/{tests.rs => tests/fuzzing.rs} (100%)
 create mode 100644 src/tests/mod.rs
 rename tests/pycompat.rs => src/tests/pycompat_parser.rs (99%)
 create mode 100644 src/tests/pycompat_tokenizer.rs

diff --git a/build_pycompat.py b/build_pycompat.py
index 641494e..036233f 100644
--- a/build_pycompat.py
+++ b/build_pycompat.py
@@ -91,7 +91,7 @@ tests = {
 }
 
 def main():
-    with open('tests/pycompat.rs', 'w+') as handle:
+    with open('src/tests/pycompat_parser.rs', 'w+') as handle:
         handle.write(TEST_HEADER)
 
     for test_name, test_strings in tests.items():
@@ -182,10 +182,9 @@ use chrono::NaiveDateTime;
 use chrono::Timelike;
 use std::collections::HashMap;
 
-extern crate dtparse;
-
-use dtparse::Parser;
-use dtparse::ParserInfo;
+use Parser;
+use ParserInfo;
+use parse;
 
 struct PyDateTime {
     year: i32,
@@ -236,7 +235,7 @@ fn parse_and_assert_simple(
     pdt: PyDateTime,
     s: &str,
 ) {
-    let rs_parsed = dtparse::parse(s).expect(&format!("Unable to parse date in Rust '{}'", s));
+    let rs_parsed = parse(s).expect(&format!("Unable to parse date in Rust '{}'", s));
     assert_eq!(pdt.year, rs_parsed.0.year(), "Year mismatch for '{}'", s);
     assert_eq!(pdt.month, rs_parsed.0.month(), "Month mismatch for '{}'", s);
     assert_eq!(pdt.day, rs_parsed.0.day(), "Day mismatch for '{}'", s);
diff --git a/build_pycompat_tokenizer.py b/build_pycompat_tokenizer.py
new file mode 100644
index 0000000..ea0ed8c
--- /dev/null
+++ b/build_pycompat_tokenizer.py
@@ -0,0 +1,35 @@
+from dateutil.parser import _timelex
+
+from build_pycompat import tests
+
+def main():
+    with open('src/tests/pycompat_tokenizer.rs', 'w+') as handle:
+        handle.write(TEST_HEADER)
+
+        counter = 0
+        for _, test_strings in tests.items():
+            for s in test_strings:
+                handle.write(build_test(counter, s))
+                counter += 1
+
+def build_test(i, test_string):
+    python_tokens = list(_timelex(test_string))
+    formatted_tokens = 'vec!["' + '", "'.join(python_tokens) + '"]'
+    return f'''
+#[test]
+fn test_tokenize{i}() {{
+    let comp = {formatted_tokens};
+    tokenize_assert("{test_string}", comp);
+}}\n'''
+
+
+TEST_HEADER = '''
+use tokenize::Tokenizer;
+
+fn tokenize_assert(test_str: &str, comparison: Vec<&str>) {
+    let tokens: Vec<String> = Tokenizer::new(test_str).collect();
+    assert_eq!(tokens, comparison, "Tokenizing mismatch for `{}`", test_str);
+}\n'''
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs
index 1590974..02c6a1a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -87,7 +87,7 @@ type ParseResult<I> = Result<I, ParseError>;
 type ParseIResult<I> = Result<I, ParseInternalError>;
 
 pub fn tokenize(parse_string: &str) -> Vec<String> {
-    let tokenizer = Tokenizer::new(parse_string.to_owned());
+    let tokenizer = Tokenizer::new(parse_string);
     tokenizer.collect()
 }
 
diff --git a/src/tests.rs b/src/tests/fuzzing.rs
similarity index 100%
rename from src/tests.rs
rename to src/tests/fuzzing.rs
diff --git a/src/tests/mod.rs b/src/tests/mod.rs
new file mode 100644
index 0000000..1776124
--- /dev/null
+++ b/src/tests/mod.rs
@@ -0,0 +1,3 @@
+mod fuzzing;
+mod pycompat_parser;
+mod pycompat_tokenizer;
diff --git a/tests/pycompat.rs b/src/tests/pycompat_parser.rs
similarity index 99%
rename from tests/pycompat.rs
rename to src/tests/pycompat_parser.rs
index 8ca6e6f..647f7a5 100644
--- a/tests/pycompat.rs
+++ b/src/tests/pycompat_parser.rs
@@ -7,10 +7,9 @@ use chrono::NaiveDateTime;
 use chrono::Timelike;
 use std::collections::HashMap;
 
-extern crate dtparse;
-
-use dtparse::Parser;
-use dtparse::ParserInfo;
+use Parser;
+use ParserInfo;
+use parse;
 
 struct PyDateTime {
     year: i32,
@@ -61,7 +60,7 @@ fn parse_and_assert_simple(
     pdt: PyDateTime,
     s: &str,
 ) {
-    let rs_parsed = dtparse::parse(s).expect(&format!("Unable to parse date in Rust '{}'", s));
+    let rs_parsed = parse(s).expect(&format!("Unable to parse date in Rust '{}'", s));
     assert_eq!(pdt.year, rs_parsed.0.year(), "Year mismatch for '{}'", s);
     assert_eq!(pdt.month, rs_parsed.0.month(), "Month mismatch for '{}'", s);
     assert_eq!(pdt.day, rs_parsed.0.day(), "Day mismatch for '{}'", s);
diff --git a/src/tests/pycompat_tokenizer.rs b/src/tests/pycompat_tokenizer.rs
new file mode 100644
index 0000000..6ba6a21
--- /dev/null
+++ b/src/tests/pycompat_tokenizer.rs
@@ -0,0 +1,865 @@
+
+use tokenize::Tokenizer;
+
+fn tokenize_assert(test_str: &str, comparison: Vec<&str>) {
+    let tokens: Vec<String> = Tokenizer::new(test_str).collect();
+    assert_eq!(tokens, comparison, "Tokenizing mismatch for `{}`", test_str);
+}
+
+#[test]
+fn test_tokenize0() {
+    let comp = vec!["Thu", " ", "Sep", " ", "25", " ", "10", ":", "36", ":", "28"];
+    tokenize_assert("Thu Sep 25 10:36:28", comp);
+}
+
+#[test]
+fn test_tokenize1() {
+    let comp = vec!["Sep", " ", "10", ":", "36", ":", "28"];
+    tokenize_assert("Sep 10:36:28", comp);
+}
+
+#[test]
+fn test_tokenize2() {
+    let comp = vec!["10", ":", "36", ":", "28"];
+    tokenize_assert("10:36:28", comp);
+}
+
+#[test]
+fn test_tokenize3() {
+    let comp = vec!["10", ":", "36"];
+    tokenize_assert("10:36", comp);
+}
+
+#[test]
+fn test_tokenize4() {
+    let comp = vec!["Sep", " ", "2003"];
+    tokenize_assert("Sep 2003", comp);
+}
+
+#[test]
+fn test_tokenize5() {
+    let comp = vec!["Sep"];
+    tokenize_assert("Sep", comp);
+}
+
+#[test]
+fn test_tokenize6() {
+    let comp = vec!["2003"];
+    tokenize_assert("2003", comp);
+}
+
+#[test]
+fn test_tokenize7() {
+    let comp = vec!["10", "h", "36", "m", "28.5", "s"];
+    tokenize_assert("10h36m28.5s", comp);
+}
+
+#[test]
+fn test_tokenize8() {
+    let comp = vec!["10", "h", "36", "m", "28", "s"];
+    tokenize_assert("10h36m28s", comp);
+}
+
+#[test]
+fn test_tokenize9() {
+    let comp = vec!["10", "h", "36", "m"];
+    tokenize_assert("10h36m", comp);
+}
+
+#[test]
+fn test_tokenize10() {
+    let comp = vec!["10", "h"];
+    tokenize_assert("10h", comp);
+}
+
+#[test]
+fn test_tokenize11() {
+    let comp = vec!["10", " ", "h", " ", "36"];
+    tokenize_assert("10 h 36", comp);
+}
+
+#[test]
+fn test_tokenize12() {
+    let comp = vec!["10", " ", "h", " ", "36.5"];
+    tokenize_assert("10 h 36.5", comp);
+}
+
+#[test]
+fn test_tokenize13() {
+    let comp = vec!["36", " ", "m", " ", "5"];
+    tokenize_assert("36 m 5", comp);
+}
+
+#[test]
+fn test_tokenize14() {
+    let comp = vec!["36", " ", "m", " ", "5", " ", "s"];
+    tokenize_assert("36 m 5 s", comp);
+}
+
+#[test]
+fn test_tokenize15() {
+    let comp = vec!["36", " ", "m", " ", "05"];
+    tokenize_assert("36 m 05", comp);
+}
+
+#[test]
+fn test_tokenize16() {
+    let comp = vec!["36", " ", "m", " ", "05", " ", "s"];
+    tokenize_assert("36 m 05 s", comp);
+}
+
+#[test]
+fn test_tokenize17() {
+ let comp = vec!["10", "h", " ", "am"]; + tokenize_assert("10h am", comp); +} + +#[test] +fn test_tokenize18() { + let comp = vec!["10", "h", " ", "pm"]; + tokenize_assert("10h pm", comp); +} + +#[test] +fn test_tokenize19() { + let comp = vec!["10", "am"]; + tokenize_assert("10am", comp); +} + +#[test] +fn test_tokenize20() { + let comp = vec!["10", "pm"]; + tokenize_assert("10pm", comp); +} + +#[test] +fn test_tokenize21() { + let comp = vec!["10", ":", "00", " ", "am"]; + tokenize_assert("10:00 am", comp); +} + +#[test] +fn test_tokenize22() { + let comp = vec!["10", ":", "00", " ", "pm"]; + tokenize_assert("10:00 pm", comp); +} + +#[test] +fn test_tokenize23() { + let comp = vec!["10", ":", "00", "am"]; + tokenize_assert("10:00am", comp); +} + +#[test] +fn test_tokenize24() { + let comp = vec!["10", ":", "00", "pm"]; + tokenize_assert("10:00pm", comp); +} + +#[test] +fn test_tokenize25() { + let comp = vec!["10", ":", "00", "a", ".", "m"]; + tokenize_assert("10:00a.m", comp); +} + +#[test] +fn test_tokenize26() { + let comp = vec!["10", ":", "00", "p", ".", "m"]; + tokenize_assert("10:00p.m", comp); +} + +#[test] +fn test_tokenize27() { + let comp = vec!["10", ":", "00", "a", ".", "m", "."]; + tokenize_assert("10:00a.m.", comp); +} + +#[test] +fn test_tokenize28() { + let comp = vec!["10", ":", "00", "p", ".", "m", "."]; + tokenize_assert("10:00p.m.", comp); +} + +#[test] +fn test_tokenize29() { + let comp = vec!["October"]; + tokenize_assert("October", comp); +} + +#[test] +fn test_tokenize30() { + let comp = vec!["31", "-", "Dec", "-", "00"]; + tokenize_assert("31-Dec-00", comp); +} + +#[test] +fn test_tokenize31() { + let comp = vec!["0", ":", "01", ":", "02"]; + tokenize_assert("0:01:02", comp); +} + +#[test] +fn test_tokenize32() { + let comp = vec!["12", "h", " ", "01", "m", "02", "s", " ", "am"]; + tokenize_assert("12h 01m02s am", comp); +} + +#[test] +fn test_tokenize33() { + let comp = vec!["12", ":", "08", " ", "PM"]; + tokenize_assert("12:08 PM", comp); +} + +#[test] +fn test_tokenize34() { + let comp = vec!["01", "h", "02", "m", "03"]; + tokenize_assert("01h02m03", comp); +} + +#[test] +fn test_tokenize35() { + let comp = vec!["01", "h", "02"]; + tokenize_assert("01h02", comp); +} + +#[test] +fn test_tokenize36() { + let comp = vec!["01", "h", "02", "s"]; + tokenize_assert("01h02s", comp); +} + +#[test] +fn test_tokenize37() { + let comp = vec!["01", "m", "02"]; + tokenize_assert("01m02", comp); +} + +#[test] +fn test_tokenize38() { + let comp = vec!["01", "m", "02", "h"]; + tokenize_assert("01m02h", comp); +} + +#[test] +fn test_tokenize39() { + let comp = vec!["2004", " ", "10", " ", "Apr", " ", "11", "h", "30", "m"]; + tokenize_assert("2004 10 Apr 11h30m", comp); +} + +#[test] +fn test_tokenize40() { + let comp = vec!["Sep", " ", "03"]; + tokenize_assert("Sep 03", comp); +} + +#[test] +fn test_tokenize41() { + let comp = vec!["Sep", " ", "of", " ", "03"]; + tokenize_assert("Sep of 03", comp); +} + +#[test] +fn test_tokenize42() { + let comp = vec!["02", ":", "17", "NOV", "2017"]; + tokenize_assert("02:17NOV2017", comp); +} + +#[test] +fn test_tokenize43() { + let comp = vec!["Thu", " ", "Sep", " ", "10", ":", "36", ":", "28"]; + tokenize_assert("Thu Sep 10:36:28", comp); +} + +#[test] +fn test_tokenize44() { + let comp = vec!["Thu", " ", "10", ":", "36", ":", "28"]; + tokenize_assert("Thu 10:36:28", comp); +} + +#[test] +fn test_tokenize45() { + let comp = vec!["Wed"]; + tokenize_assert("Wed", comp); +} + +#[test] +fn test_tokenize46() { + let comp = vec!["Wednesday"]; 
+ tokenize_assert("Wednesday", comp); +} + +#[test] +fn test_tokenize47() { + let comp = vec!["Thu", " ", "Sep", " ", "25", " ", "10", ":", "36", ":", "28", " ", "2003"]; + tokenize_assert("Thu Sep 25 10:36:28 2003", comp); +} + +#[test] +fn test_tokenize48() { + let comp = vec!["Thu", " ", "Sep", " ", "25", " ", "2003"]; + tokenize_assert("Thu Sep 25 2003", comp); +} + +#[test] +fn test_tokenize49() { + let comp = vec!["2003", "-", "09", "-", "25", "T", "10", ":", "49", ":", "41"]; + tokenize_assert("2003-09-25T10:49:41", comp); +} + +#[test] +fn test_tokenize50() { + let comp = vec!["2003", "-", "09", "-", "25", "T", "10", ":", "49"]; + tokenize_assert("2003-09-25T10:49", comp); +} + +#[test] +fn test_tokenize51() { + let comp = vec!["2003", "-", "09", "-", "25", "T", "10"]; + tokenize_assert("2003-09-25T10", comp); +} + +#[test] +fn test_tokenize52() { + let comp = vec!["2003", "-", "09", "-", "25"]; + tokenize_assert("2003-09-25", comp); +} + +#[test] +fn test_tokenize53() { + let comp = vec!["20030925", "T", "104941"]; + tokenize_assert("20030925T104941", comp); +} + +#[test] +fn test_tokenize54() { + let comp = vec!["20030925", "T", "1049"]; + tokenize_assert("20030925T1049", comp); +} + +#[test] +fn test_tokenize55() { + let comp = vec!["20030925", "T", "10"]; + tokenize_assert("20030925T10", comp); +} + +#[test] +fn test_tokenize56() { + let comp = vec!["20030925"]; + tokenize_assert("20030925", comp); +} + +#[test] +fn test_tokenize57() { + let comp = vec!["2003", "-", "09", "-", "25", " ", "10", ":", "49", ":", "41.502"]; + tokenize_assert("2003-09-25 10:49:41,502", comp); +} + +#[test] +fn test_tokenize58() { + let comp = vec!["199709020908"]; + tokenize_assert("199709020908", comp); +} + +#[test] +fn test_tokenize59() { + let comp = vec!["19970902090807"]; + tokenize_assert("19970902090807", comp); +} + +#[test] +fn test_tokenize60() { + let comp = vec!["2003", "-", "09", "-", "25"]; + tokenize_assert("2003-09-25", comp); +} + +#[test] +fn test_tokenize61() { + let comp = vec!["09", "-", "25", "-", "2003"]; + tokenize_assert("09-25-2003", comp); +} + +#[test] +fn test_tokenize62() { + let comp = vec!["25", "-", "09", "-", "2003"]; + tokenize_assert("25-09-2003", comp); +} + +#[test] +fn test_tokenize63() { + let comp = vec!["10", "-", "09", "-", "2003"]; + tokenize_assert("10-09-2003", comp); +} + +#[test] +fn test_tokenize64() { + let comp = vec!["10", "-", "09", "-", "03"]; + tokenize_assert("10-09-03", comp); +} + +#[test] +fn test_tokenize65() { + let comp = vec!["2003", ".", "09", ".", "25"]; + tokenize_assert("2003.09.25", comp); +} + +#[test] +fn test_tokenize66() { + let comp = vec!["09", ".", "25", ".", "2003"]; + tokenize_assert("09.25.2003", comp); +} + +#[test] +fn test_tokenize67() { + let comp = vec!["25", ".", "09", ".", "2003"]; + tokenize_assert("25.09.2003", comp); +} + +#[test] +fn test_tokenize68() { + let comp = vec!["10", ".", "09", ".", "2003"]; + tokenize_assert("10.09.2003", comp); +} + +#[test] +fn test_tokenize69() { + let comp = vec!["10", ".", "09", ".", "03"]; + tokenize_assert("10.09.03", comp); +} + +#[test] +fn test_tokenize70() { + let comp = vec!["2003", "/", "09", "/", "25"]; + tokenize_assert("2003/09/25", comp); +} + +#[test] +fn test_tokenize71() { + let comp = vec!["09", "/", "25", "/", "2003"]; + tokenize_assert("09/25/2003", comp); +} + +#[test] +fn test_tokenize72() { + let comp = vec!["25", "/", "09", "/", "2003"]; + tokenize_assert("25/09/2003", comp); +} + +#[test] +fn test_tokenize73() { + let comp = vec!["10", "/", "09", "/", 
"2003"]; + tokenize_assert("10/09/2003", comp); +} + +#[test] +fn test_tokenize74() { + let comp = vec!["10", "/", "09", "/", "03"]; + tokenize_assert("10/09/03", comp); +} + +#[test] +fn test_tokenize75() { + let comp = vec!["2003", " ", "09", " ", "25"]; + tokenize_assert("2003 09 25", comp); +} + +#[test] +fn test_tokenize76() { + let comp = vec!["09", " ", "25", " ", "2003"]; + tokenize_assert("09 25 2003", comp); +} + +#[test] +fn test_tokenize77() { + let comp = vec!["25", " ", "09", " ", "2003"]; + tokenize_assert("25 09 2003", comp); +} + +#[test] +fn test_tokenize78() { + let comp = vec!["10", " ", "09", " ", "2003"]; + tokenize_assert("10 09 2003", comp); +} + +#[test] +fn test_tokenize79() { + let comp = vec!["10", " ", "09", " ", "03"]; + tokenize_assert("10 09 03", comp); +} + +#[test] +fn test_tokenize80() { + let comp = vec!["25", " ", "09", " ", "03"]; + tokenize_assert("25 09 03", comp); +} + +#[test] +fn test_tokenize81() { + let comp = vec!["03", " ", "25", " ", "Sep"]; + tokenize_assert("03 25 Sep", comp); +} + +#[test] +fn test_tokenize82() { + let comp = vec!["25", " ", "03", " ", "Sep"]; + tokenize_assert("25 03 Sep", comp); +} + +#[test] +fn test_tokenize83() { + let comp = vec![" ", " ", "July", " ", " ", " ", "4", " ", ",", " ", " ", "1976", " ", " ", " ", "12", ":", "01", ":", "02", " ", " ", " ", "am", " ", " "]; + tokenize_assert(" July 4 , 1976 12:01:02 am ", comp); +} + +#[test] +fn test_tokenize84() { + let comp = vec!["Wed", ",", " ", "July", " ", "10", ",", " ", "'", "96"]; + tokenize_assert("Wed, July 10, '96", comp); +} + +#[test] +fn test_tokenize85() { + let comp = vec!["1996", ".", "July", ".", "10", " ", "AD", " ", "12", ":", "08", " ", "PM"]; + tokenize_assert("1996.July.10 AD 12:08 PM", comp); +} + +#[test] +fn test_tokenize86() { + let comp = vec!["July", " ", "4", ",", " ", "1976"]; + tokenize_assert("July 4, 1976", comp); +} + +#[test] +fn test_tokenize87() { + let comp = vec!["7", " ", "4", " ", "1976"]; + tokenize_assert("7 4 1976", comp); +} + +#[test] +fn test_tokenize88() { + let comp = vec!["4", " ", "jul", " ", "1976"]; + tokenize_assert("4 jul 1976", comp); +} + +#[test] +fn test_tokenize89() { + let comp = vec!["7", "-", "4", "-", "76"]; + tokenize_assert("7-4-76", comp); +} + +#[test] +fn test_tokenize90() { + let comp = vec!["19760704"]; + tokenize_assert("19760704", comp); +} + +#[test] +fn test_tokenize91() { + let comp = vec!["0", ":", "01", ":", "02", " ", "on", " ", "July", " ", "4", ",", " ", "1976"]; + tokenize_assert("0:01:02 on July 4, 1976", comp); +} + +#[test] +fn test_tokenize92() { + let comp = vec!["0", ":", "01", ":", "02", " ", "on", " ", "July", " ", "4", ",", " ", "1976"]; + tokenize_assert("0:01:02 on July 4, 1976", comp); +} + +#[test] +fn test_tokenize93() { + let comp = vec!["July", " ", "4", ",", " ", "1976", " ", "12", ":", "01", ":", "02", " ", "am"]; + tokenize_assert("July 4, 1976 12:01:02 am", comp); +} + +#[test] +fn test_tokenize94() { + let comp = vec!["Mon", " ", "Jan", " ", " ", "2", " ", "04", ":", "24", ":", "27", " ", "1995"]; + tokenize_assert("Mon Jan 2 04:24:27 1995", comp); +} + +#[test] +fn test_tokenize95() { + let comp = vec!["04", ".", "04", ".", "95", " ", "00", ":", "22"]; + tokenize_assert("04.04.95 00:22", comp); +} + +#[test] +fn test_tokenize96() { + let comp = vec!["Jan", " ", "1", " ", "1999", " ", "11", ":", "23", ":", "34.578"]; + tokenize_assert("Jan 1 1999 11:23:34.578", comp); +} + +#[test] +fn test_tokenize97() { + let comp = vec!["950404", " ", "122212"]; + 
tokenize_assert("950404 122212", comp); +} + +#[test] +fn test_tokenize98() { + let comp = vec!["3", "rd", " ", "of", " ", "May", " ", "2001"]; + tokenize_assert("3rd of May 2001", comp); +} + +#[test] +fn test_tokenize99() { + let comp = vec!["5", "th", " ", "of", " ", "March", " ", "2001"]; + tokenize_assert("5th of March 2001", comp); +} + +#[test] +fn test_tokenize100() { + let comp = vec!["1", "st", " ", "of", " ", "May", " ", "2003"]; + tokenize_assert("1st of May 2003", comp); +} + +#[test] +fn test_tokenize101() { + let comp = vec!["0099", "-", "01", "-", "01", "T", "00", ":", "00", ":", "00"]; + tokenize_assert("0099-01-01T00:00:00", comp); +} + +#[test] +fn test_tokenize102() { + let comp = vec!["0031", "-", "01", "-", "01", "T", "00", ":", "00", ":", "00"]; + tokenize_assert("0031-01-01T00:00:00", comp); +} + +#[test] +fn test_tokenize103() { + let comp = vec!["20080227", "T", "21", ":", "26", ":", "01.123456789"]; + tokenize_assert("20080227T21:26:01.123456789", comp); +} + +#[test] +fn test_tokenize104() { + let comp = vec!["13", "NOV", "2017"]; + tokenize_assert("13NOV2017", comp); +} + +#[test] +fn test_tokenize105() { + let comp = vec!["0003", "-", "03", "-", "04"]; + tokenize_assert("0003-03-04", comp); +} + +#[test] +fn test_tokenize106() { + let comp = vec!["December", ".", "0031", ".", "30"]; + tokenize_assert("December.0031.30", comp); +} + +#[test] +fn test_tokenize107() { + let comp = vec!["090107"]; + tokenize_assert("090107", comp); +} + +#[test] +fn test_tokenize108() { + let comp = vec!["2015", "-", "15", "-", "May"]; + tokenize_assert("2015-15-May", comp); +} + +#[test] +fn test_tokenize109() { + let comp = vec!["Thu", " ", "Sep", " ", "25", " ", "10", ":", "36", ":", "28", " ", "BRST", " ", "2003"]; + tokenize_assert("Thu Sep 25 10:36:28 BRST 2003", comp); +} + +#[test] +fn test_tokenize110() { + let comp = vec!["2003", " ", "10", ":", "36", ":", "28", " ", "BRST", " ", "25", " ", "Sep", " ", "Thu"]; + tokenize_assert("2003 10:36:28 BRST 25 Sep Thu", comp); +} + +#[test] +fn test_tokenize111() { + let comp = vec!["Thu", ",", " ", "25", " ", "Sep", " ", "2003", " ", "10", ":", "49", ":", "41", " ", "-", "0300"]; + tokenize_assert("Thu, 25 Sep 2003 10:49:41 -0300", comp); +} + +#[test] +fn test_tokenize112() { + let comp = vec!["2003", "-", "09", "-", "25", "T", "10", ":", "49", ":", "41.5", "-", "03", ":", "00"]; + tokenize_assert("2003-09-25T10:49:41.5-03:00", comp); +} + +#[test] +fn test_tokenize113() { + let comp = vec!["2003", "-", "09", "-", "25", "T", "10", ":", "49", ":", "41", "-", "03", ":", "00"]; + tokenize_assert("2003-09-25T10:49:41-03:00", comp); +} + +#[test] +fn test_tokenize114() { + let comp = vec!["20030925", "T", "104941.5", "-", "0300"]; + tokenize_assert("20030925T104941.5-0300", comp); +} + +#[test] +fn test_tokenize115() { + let comp = vec!["20030925", "T", "104941", "-", "0300"]; + tokenize_assert("20030925T104941-0300", comp); +} + +#[test] +fn test_tokenize116() { + let comp = vec!["10", "-", "09", "-", "2003"]; + tokenize_assert("10-09-2003", comp); +} + +#[test] +fn test_tokenize117() { + let comp = vec!["10", ".", "09", ".", "2003"]; + tokenize_assert("10.09.2003", comp); +} + +#[test] +fn test_tokenize118() { + let comp = vec!["10", "/", "09", "/", "2003"]; + tokenize_assert("10/09/2003", comp); +} + +#[test] +fn test_tokenize119() { + let comp = vec!["10", " ", "09", " ", "2003"]; + tokenize_assert("10 09 2003", comp); +} + +#[test] +fn test_tokenize120() { + let comp = vec!["090107"]; + tokenize_assert("090107", comp); +} + 
+#[test]
+fn test_tokenize121() {
+    let comp = vec!["2015", " ", "09", " ", "25"];
+    tokenize_assert("2015 09 25", comp);
+}
+
+#[test]
+fn test_tokenize122() {
+    let comp = vec!["10", "-", "09", "-", "03"];
+    tokenize_assert("10-09-03", comp);
+}
+
+#[test]
+fn test_tokenize123() {
+    let comp = vec!["10", ".", "09", ".", "03"];
+    tokenize_assert("10.09.03", comp);
+}
+
+#[test]
+fn test_tokenize124() {
+    let comp = vec!["10", "/", "09", "/", "03"];
+    tokenize_assert("10/09/03", comp);
+}
+
+#[test]
+fn test_tokenize125() {
+    let comp = vec!["10", " ", "09", " ", "03"];
+    tokenize_assert("10 09 03", comp);
+}
+
+#[test]
+fn test_tokenize126() {
+    let comp = vec!["090107"];
+    tokenize_assert("090107", comp);
+}
+
+#[test]
+fn test_tokenize127() {
+    let comp = vec!["2015", " ", "09", " ", "25"];
+    tokenize_assert("2015 09 25", comp);
+}
+
+#[test]
+fn test_tokenize128() {
+    let comp = vec!["090107"];
+    tokenize_assert("090107", comp);
+}
+
+#[test]
+fn test_tokenize129() {
+    let comp = vec!["2015", " ", "09", " ", "25"];
+    tokenize_assert("2015 09 25", comp);
+}
+
+#[test]
+fn test_tokenize130() {
+    let comp = vec!["April", " ", "2009"];
+    tokenize_assert("April 2009", comp);
+}
+
+#[test]
+fn test_tokenize131() {
+    let comp = vec!["Feb", " ", "2007"];
+    tokenize_assert("Feb 2007", comp);
+}
+
+#[test]
+fn test_tokenize132() {
+    let comp = vec!["Feb", " ", "2008"];
+    tokenize_assert("Feb 2008", comp);
+}
+
+#[test]
+fn test_tokenize133() {
+    let comp = vec!["Thu", " ", "Sep", " ", "25", " ", "10", ":", "36", ":", "28", " ", "BRST", " ", "2003"];
+    tokenize_assert("Thu Sep 25 10:36:28 BRST 2003", comp);
+}
+
+#[test]
+fn test_tokenize134() {
+    let comp = vec!["1996", ".", "07", ".", "10", " ", "AD", " ", "at", " ", "15", ":", "08", ":", "56", " ", "PDT"];
+    tokenize_assert("1996.07.10 AD at 15:08:56 PDT", comp);
+}
+
+#[test]
+fn test_tokenize135() {
+    let comp = vec!["Tuesday", ",", " ", "April", " ", "12", ",", " ", "1952", " ", "AD", " ", "3", ":", "30", ":", "42", "pm", " ", "PST"];
+    tokenize_assert("Tuesday, April 12, 1952 AD 3:30:42pm PST", comp);
+}
+
+#[test]
+fn test_tokenize136() {
+    let comp = vec!["November", " ", "5", ",", " ", "1994", ",", " ", "8", ":", "15", ":", "30", " ", "am", " ", "EST"];
+    tokenize_assert("November 5, 1994, 8:15:30 am EST", comp);
+}
+
+#[test]
+fn test_tokenize137() {
+    let comp = vec!["1994", "-", "11", "-", "05", "T", "08", ":", "15", ":", "30", "-", "05", ":", "00"];
+    tokenize_assert("1994-11-05T08:15:30-05:00", comp);
+}
+
+#[test]
+fn test_tokenize138() {
+    let comp = vec!["1994", "-", "11", "-", "05", "T", "08", ":", "15", ":", "30", "Z"];
+    tokenize_assert("1994-11-05T08:15:30Z", comp);
+}
+
+#[test]
+fn test_tokenize139() {
+    let comp = vec!["1976", "-", "07", "-", "04", "T", "00", ":", "01", ":", "02", "Z"];
+    tokenize_assert("1976-07-04T00:01:02Z", comp);
+}
+
+#[test]
+fn test_tokenize140() {
+    let comp = vec!["Tue", " ", "Apr", " ", "4", " ", "00", ":", "22", ":", "12", " ", "PDT", " ", "1995"];
+    tokenize_assert("Tue Apr 4 00:22:12 PDT 1995", comp);
+}
+
+#[test]
+fn test_tokenize141() {
+    let comp = vec!["Today", " ", "is", " ", "25", " ", "of", " ", "September", " ", "of", " ", "2003", ",", " ", "exactly", " ", "at", " ", "10", ":", "49", ":", "41", " ", "with", " ", "timezone", " ", "-", "03", ":", "00", "."];
+    tokenize_assert("Today is 25 of September of 2003, exactly at 10:49:41 with timezone -03:00.", comp);
+}
+
+#[test]
+fn test_tokenize142() {
"of", " ", "2003", ",", " ", "exactly", " ", "at", " ", "10", ":", "49", ":", "41", " ", "with", " ", "timezone", " ", "-", "03", ":", "00", "."]; + tokenize_assert("Today is 25 of September of 2003, exactly at 10:49:41 with timezone -03:00.", comp); +} diff --git a/src/tokenize.rs b/src/tokenize.rs index 38091a8..bb982a0 100644 --- a/src/tokenize.rs +++ b/src/tokenize.rs @@ -1,5 +1,6 @@ pub(crate) struct Tokenizer { token_stack: Vec, + // TODO: Should this be more generic? io::Read for example? parse_string: String, } @@ -13,12 +14,49 @@ pub(crate) enum ParseState { } impl Tokenizer { - pub(crate) fn new(parse_string: String) -> Self { + + pub(crate) fn new(parse_string: &str) -> Self { Tokenizer { - token_stack: Vec::new(), + token_stack: vec![], parse_string: parse_string.chars().rev().collect(), } } + + fn isword(&self, c: char) -> bool { + c.is_alphabetic() + } + + fn isnum(&self, c: char) -> bool { + c.is_numeric() + } + + fn isspace(&self, c: char) -> bool { + c.is_whitespace() + } + + fn decimal_split(&self, s: &str) -> Vec { + // Handles the same thing as Python's re.split() + let mut tokens: Vec = vec!["".to_owned()]; + + for c in s.chars() { + if c == '.' || c == ',' { + tokens.push(c.to_string()); + tokens.push("".to_owned()); + } else { + // UNWRAP: Initial setup guarantees we always have an item + let mut t = tokens.pop().unwrap(); + t.push(c); + tokens.push(t); + } + } + + // TODO: Do I really have to use &String instead of &str? + if tokens.last() == Some(&"".to_owned()) { + tokens.pop(); + } + + tokens + } } impl Iterator for Tokenizer { @@ -26,182 +64,115 @@ impl Iterator for Tokenizer { fn next(&mut self) -> Option { if !self.token_stack.is_empty() { - return Some(self.token_stack.pop().unwrap()); - }; - if self.parse_string.is_empty() { - return None; - }; + return Some(self.token_stack.remove(0)); + } - let mut char_stack: Vec = Vec::new(); - let mut seen_letters = false; + let mut seenletters = false; + let mut token: Option = None; let mut state = ParseState::Empty; - while let Some(next) = self.parse_string.pop() { + while !self.parse_string.is_empty() { + // Dateutil uses a separate `charstack` to manage the incoming stream. + // Because parse_string can have things pushed back onto it, we skip + // a couple of steps related to the `charstack`. + + // UNWRAP: Just checked that parse_string isn't empty + let nextchar = self.parse_string.pop().unwrap(); + match state { ParseState::Empty => { - if next.is_numeric() { - state = ParseState::Numeric; - char_stack.push(next); - } else if next.is_alphabetic() { + token = Some(nextchar.to_string()); + if self.isword(nextchar) { state = ParseState::Alpha; - seen_letters = true; - char_stack.push(next); - } else if next.is_whitespace() { - char_stack.push(' '); + } else if self.isnum(nextchar) { + state = ParseState::Numeric; + } else if self.isspace(nextchar) { + token = Some(" ".to_owned()); break; } else { - char_stack.push(next); break; } - } + }, ParseState::Alpha => { - if next.is_alphabetic() { - char_stack.push(next); - } else if next == '.' { + seenletters = true; + if self.isword(nextchar) { + // UNWRAP: Because we're in non-empty parse state, we're guaranteed to have a token + token.as_mut().unwrap().push(nextchar); + } else if nextchar == '.' { + token.as_mut().unwrap().push(nextchar); state = ParseState::AlphaDecimal; - char_stack.push(next); } else { - // We don't recognize the character, so push it back - // to be handled later. 
-                        self.parse_string.push(next);
+                        self.parse_string.push(nextchar);
                         break;
                     }
-                }
-                ParseState::AlphaDecimal => {
-                    if next == '.' || next.is_alphabetic() {
-                        char_stack.push(next);
-                    } else if next.is_numeric() && char_stack.last().unwrap().clone() == '.' {
-                        char_stack.push(next);
-                        state = ParseState::NumericDecimal;
-                    } else {
-                        self.parse_string.push(next);
-                        break;
-                    }
-                }
+                },
                 ParseState::Numeric => {
-                    if next.is_numeric() {
-                        char_stack.push(next);
-                    } else if next == '.' || (next == ',' && char_stack.len() >= 2) {
-                        char_stack.push(next);
+                    if self.isnum(nextchar) {
+                        // UNWRAP: Because we're in non-empty parse state, we're guaranteed to have a token
+                        token.as_mut().unwrap().push(nextchar);
+                    } else if nextchar == '.' || (nextchar == ',' && token.as_ref().unwrap().len() >= 2) {
+                        token.as_mut().unwrap().push(nextchar);
                         state = ParseState::NumericDecimal;
                     } else {
-                        // We don't recognize the character, so push it back
-                        // to be handled later
-                        self.parse_string.push(next);
+                        self.parse_string.push(nextchar);
                         break;
                     }
-                }
+                },
+                ParseState::AlphaDecimal => {
+                    seenletters = true;
+                    if nextchar == '.' || self.isword(nextchar) {
+                        // UNWRAP: Because we're in non-empty parse state, we're guaranteed to have a token
+                        token.as_mut().unwrap().push(nextchar);
+                    } else if self.isnum(nextchar) && token.as_ref().unwrap().chars().last() == Some('.') {
+                        token.as_mut().unwrap().push(nextchar);
+                        state = ParseState::NumericDecimal;
+                    } else {
+                        self.parse_string.push(nextchar);
+                        break;
+                    }
+                },
                 ParseState::NumericDecimal => {
-                    if next == '.' || next.is_numeric() {
-                        char_stack.push(next);
-                    } else if next.is_alphabetic() && char_stack.last().unwrap().clone() == '.' {
-                        char_stack.push(next);
+                    if nextchar == '.' || self.isnum(nextchar) {
+                        // UNWRAP: Because we're in non-empty parse state, we're guaranteed to have a token
+                        token.as_mut().unwrap().push(nextchar);
+                    } else if self.isword(nextchar) && token.as_ref().unwrap().chars().last() == Some('.') {
+                        token.as_mut().unwrap().push(nextchar);
                         state = ParseState::AlphaDecimal;
                     } else {
-                        self.parse_string.push(next);
+                        self.parse_string.push(nextchar);
                         break;
                     }
                 }
             }
         }
 
-        // I like Python's version of this much better:
-        // needs_split = seen_letters or char_stack.count('.') > 1 or char_stack[-1] in '.,'
-        let dot_count = char_stack.iter().fold(0, |count, character| {
-            count + (if character == &'.' { 1 } else { 0 })
-        });
-        let needs_split = seen_letters || dot_count > 1 || char_stack.last().unwrap() == &'.'
-            || char_stack.last().unwrap() == &',';
-        let final_string: String = char_stack.into_iter().collect();
-
-        let mut tokens = match state {
-            ParseState::Empty => vec![final_string],
-            ParseState::Alpha => vec![final_string],
-            ParseState::Numeric => vec![final_string],
-            ParseState::AlphaDecimal => {
-                if needs_split {
-                    decimal_split(&final_string, false)
-                } else {
-                    vec![final_string]
+        // Python uses the state to short-circuit and make sure it doesn't run into issues with None
+        // We do something slightly different to express the same logic
+        if state == ParseState::AlphaDecimal || state == ParseState::NumericDecimal {
+            // UNWRAP: The state check guarantees that we have a value
+            let dot_count = token.as_ref().unwrap().chars().filter(|c| *c == '.').count();
+            let last_char = token.as_ref().unwrap().chars().last();
+            let last_splittable = last_char == Some('.') || last_char == Some(',');
+
+            if seenletters || dot_count > 1 || last_splittable {
+                let mut l = self.decimal_split(token.as_ref().unwrap());
+                let remaining = l.split_off(1);
+
+                token = Some(l[0].clone());
+                for t in remaining {
+                    self.token_stack.push(t);
                 }
             }
-            ParseState::NumericDecimal => {
-                if needs_split {
-                    decimal_split(&final_string, dot_count == 0)
-                } else {
-                    vec![final_string]
+
+            if state == ParseState::NumericDecimal && dot_count == 0 {
+                token = Some(token.unwrap().replace(',', "."));
             }
-        }.into_iter()
-            .rev()
-            .collect();
-
-        self.token_stack.append(&mut tokens);
-        // UNWRAP: Previous match guaranteed that at least one token was added
-        let token = self.token_stack.pop().unwrap();
-        if state == ParseState::NumericDecimal && !token.contains(".") {
-            Some(token.replace(",", "."))
-        } else {
-            Some(token)
         }
+
+        token
     }
 }
 
-fn decimal_split(characters: &str, cast_period: bool) -> Vec<String> {
-    let mut token_stack: Vec<String> = Vec::new();
-    let mut char_stack: Vec<char> = Vec::new();
-    let mut state = ParseState::Empty;
-
-    for c in characters.chars() {
-        match state {
-            ParseState::Empty => {
-                if c.is_alphabetic() {
-                    char_stack.push(c);
-                    state = ParseState::Alpha;
-                } else if c.is_numeric() {
-                    char_stack.push(c);
-                    state = ParseState::Numeric;
-                } else {
-                    let character = if cast_period { '.' } else { c };
-                    token_stack.push(character.to_string());
-                }
-            }
-            ParseState::Alpha => {
-                if c.is_alphabetic() {
-                    char_stack.push(c);
-                } else {
-                    token_stack.push(char_stack.iter().collect());
-                    char_stack.clear();
-                    let character = if cast_period { '.' } else { c };
-                    token_stack.push(character.to_string());
-                    state = ParseState::Empty;
-                }
-            }
-            ParseState::Numeric => {
-                if c.is_numeric() {
-                    char_stack.push(c);
-                } else {
-                    token_stack.push(char_stack.iter().collect());
-                    char_stack.clear();
-                    let character = if cast_period { '.' } else { c };
-                    token_stack.push(character.to_string());
-                    state = ParseState::Empty;
-                }
-            }
-            _ => panic!("Invalid parse state during decimal_split()"),
-        }
-    }
-
-    match state {
-        ParseState::Alpha => token_stack.push(char_stack.iter().collect()),
-        ParseState::Numeric => token_stack.push(char_stack.iter().collect()),
-        ParseState::Empty => (),
-        _ => panic!("Invalid parse state during decimal_split()"),
-    }
-
-    token_stack
-}
-
 #[cfg(test)]
 mod tests {
 
@@ -209,7 +180,7 @@ mod tests {
 
     #[test]
     fn test_basic() {
-        let tokens: Vec<String> = Tokenizer::new("September of 2003,".to_owned()).collect();
+        let tokens: Vec<String> = Tokenizer::new("September of 2003,").collect();
         assert_eq!(tokens, vec!["September", " ", "of", " ", "2003", ","]);
     }
 }
diff --git a/src/weekday.rs b/src/weekday.rs
index d92c758..f874fcf 100644
--- a/src/weekday.rs
+++ b/src/weekday.rs
@@ -99,6 +99,8 @@ pub fn day_of_week(year: u32, month: u32, day: u32) -> ParseResult<DayOfWeek> {
     }
 }
 
+// Rust warns about unused imports here, but they're definitely used.
+#[allow(unused_imports)]
 mod test {
 
     use weekday::day_of_week;
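
For reference, a minimal sketch of driving the reworked tokenizer through the
public `tokenize` entry point shown in src/lib.rs above. The expected token
streams are copied from test_tokenize112 and test_tokenize57; nothing outside
this patch is assumed except that the crate builds under its `dtparse` name:

extern crate dtparse;

fn main() {
    // Tokenizer::new now borrows its input; `tokenize` collects the stream.
    let tokens: Vec<String> = dtparse::tokenize("2003-09-25T10:49:41.5-03:00");
    assert_eq!(
        tokens,
        vec!["2003", "-", "09", "-", "25", "T", "10", ":", "49", ":", "41.5",
             "-", "03", ":", "00"]
    );

    // A NumericDecimal token containing ',' but no '.' has the comma
    // normalized to a period, so "10:49:41,502" ends in "41.502"
    // (cf. test_tokenize57 above).
    let comma = dtparse::tokenize("10:49:41,502");
    assert_eq!(comma.last().map(|s| s.as_str()), Some("41.502"));
}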