Redo the tokenization

One of the fuzzy test cases still has issues
2025-10-27 15:40:32 -04:00 · 2018-07-07 23:37:02 -04:00
parent 9135962839
commit e049618fff
9 changed files with 1029 additions and 155 deletions
--- a/build_pycompat.py
+++ b/build_pycompat.py
@ -91,7 +91,7 @@ tests = {
 }

 def main():
-    with open('tests/pycompat.rs', 'w+') as handle:
+    with open('src/tests/pycompat_parser.rs', 'w+') as handle:
        handle.write(TEST_HEADER)

        for test_name, test_strings in tests.items():
@ -182,10 +182,9 @@ use chrono::NaiveDateTime;
 use chrono::Timelike;
 use std::collections::HashMap;

-extern crate dtparse;
-
-use dtparse::Parser;
-use dtparse::ParserInfo;
+use Parser;
+use ParserInfo;
+use parse;

 struct PyDateTime {
    year: i32,
@ -236,7 +235,7 @@ fn parse_and_assert_simple(
    pdt: PyDateTime,
    s: &str,
 ) {
-    let rs_parsed = dtparse::parse(s).expect(&format!("Unable to parse date in Rust '{}'", s));
+    let rs_parsed = parse(s).expect(&format!("Unable to parse date in Rust '{}'", s));
    assert_eq!(pdt.year, rs_parsed.0.year(), "Year mismatch for '{}'", s);
    assert_eq!(pdt.month, rs_parsed.0.month(), "Month mismatch for '{}'", s);
    assert_eq!(pdt.day, rs_parsed.0.day(), "Day mismatch for '{}'", s);
--- a/build_pycompat_tokenizer.py
+++ b/build_pycompat_tokenizer.py
@ -0,0 +1,35 @@
+from dateutil.parser import _timelex
+
+from build_pycompat import tests
+
+def main():
+    with open('src/tests/pycompat_tokenizer.rs', 'w+') as handle:
+        handle.write(TEST_HEADER)
+
+        counter = 0
+        for _, test_strings in tests.items():
+            for s in test_strings:
+                handle.write(build_test(counter, s))
+                counter += 1
+
+def build_test(i, test_string):
+    python_tokens = list(_timelex(test_string))
+    formatted_tokens = 'vec!["' + '", "'.join(python_tokens) + '"]'
+    return f'''
+#[test]
+fn test_tokenize{i}() {{
+    let comp = {formatted_tokens};
+    tokenize_assert("{test_string}", comp);
+}}\n'''
+
+
+TEST_HEADER = '''
+use tokenize::Tokenizer;
+
+fn tokenize_assert(test_str: &str, comparison: Vec<&str>) {
+    let tokens: Vec<String> = Tokenizer::new(test_str).collect();
+    assert_eq!(tokens, comparison, "Tokenizing mismatch for `{}`", test_str);
+}\n'''
+
+if __name__ == '__main__':
+    main()
--- a/src/lib.rs
+++ b/src/lib.rs
@ -87,7 +87,7 @@ type ParseResult<I> = Result<I, ParseError>;
 type ParseIResult<I> = Result<I, ParseInternalError>;

 pub fn tokenize(parse_string: &str) -> Vec<String> {
-    let tokenizer = Tokenizer::new(parse_string.to_owned());
+    let tokenizer = Tokenizer::new(parse_string);
    tokenizer.collect()
 }

--- a/src/tests/fuzzing.rs
+++ b/src/tests/fuzzing.rs
--- a/src/tests/mod.rs
+++ b/src/tests/mod.rs
@ -0,0 +1,3 @@
+mod fuzzing;
+mod pycompat_parser;
+mod pycompat_tokenizer;
--- a/src/tests/pycompat_parser.rs
+++ b/src/tests/pycompat_parser.rs
@ -7,10 +7,9 @@ use chrono::NaiveDateTime;
 use chrono::Timelike;
 use std::collections::HashMap;

-extern crate dtparse;
-
-use dtparse::Parser;
-use dtparse::ParserInfo;
+use Parser;
+use ParserInfo;
+use parse;

 struct PyDateTime {
    year: i32,
@ -61,7 +60,7 @@ fn parse_and_assert_simple(
    pdt: PyDateTime,
    s: &str,
 ) {
-    let rs_parsed = dtparse::parse(s).expect(&format!("Unable to parse date in Rust '{}'", s));
+    let rs_parsed = parse(s).expect(&format!("Unable to parse date in Rust '{}'", s));
    assert_eq!(pdt.year, rs_parsed.0.year(), "Year mismatch for '{}'", s);
    assert_eq!(pdt.month, rs_parsed.0.month(), "Month mismatch for '{}'", s);
    assert_eq!(pdt.day, rs_parsed.0.day(), "Day mismatch for '{}'", s);
--- a/src/tests/pycompat_tokenizer.rs
+++ b/src/tests/pycompat_tokenizer.rs
@ -0,0 +1,865 @@
+
+use tokenize::Tokenizer;
+
+fn tokenize_assert(test_str: &str, comparison: Vec<&str>) {
+    let tokens: Vec<String> = Tokenizer::new(test_str).collect();
+    assert_eq!(tokens, comparison, "Tokenizing mismatch for `{}`", test_str);
+}
+
+#[test]
+fn test_tokenize0() {
+    let comp = vec!["Thu", " ", "Sep", " ", "25", " ", "10", ":", "36", ":", "28"];
+    tokenize_assert("Thu Sep 25 10:36:28", comp);
+}
+
+#[test]
+fn test_tokenize1() {
+    let comp = vec!["Sep", " ", "10", ":", "36", ":", "28"];
+    tokenize_assert("Sep 10:36:28", comp);
+}
+
+#[test]
+fn test_tokenize2() {
+    let comp = vec!["10", ":", "36", ":", "28"];
+    tokenize_assert("10:36:28", comp);
+}
+
+#[test]
+fn test_tokenize3() {
+    let comp = vec!["10", ":", "36"];
+    tokenize_assert("10:36", comp);
+}
+
+#[test]
+fn test_tokenize4() {
+    let comp = vec!["Sep", " ", "2003"];
+    tokenize_assert("Sep 2003", comp);
+}
+
+#[test]
+fn test_tokenize5() {
+    let comp = vec!["Sep"];
+    tokenize_assert("Sep", comp);
+}
+
+#[test]
+fn test_tokenize6() {
+    let comp = vec!["2003"];
+    tokenize_assert("2003", comp);
+}
+
+#[test]
+fn test_tokenize7() {
+    let comp = vec!["10", "h", "36", "m", "28.5", "s"];
+    tokenize_assert("10h36m28.5s", comp);
+}
+
+#[test]
+fn test_tokenize8() {
+    let comp = vec!["10", "h", "36", "m", "28", "s"];
+    tokenize_assert("10h36m28s", comp);
+}
+
+#[test]
+fn test_tokenize9() {
+    let comp = vec!["10", "h", "36", "m"];
+    tokenize_assert("10h36m", comp);
+}
+
+#[test]
+fn test_tokenize10() {
+    let comp = vec!["10", "h"];
+    tokenize_assert("10h", comp);
+}
+
+#[test]
+fn test_tokenize11() {
+    let comp = vec!["10", " ", "h", " ", "36"];
+    tokenize_assert("10 h 36", comp);
+}
+
+#[test]
+fn test_tokenize12() {
+    let comp = vec!["10", " ", "h", " ", "36.5"];
+    tokenize_assert("10 h 36.5", comp);
+}
+
+#[test]
+fn test_tokenize13() {
+    let comp = vec!["36", " ", "m", " ", "5"];
+    tokenize_assert("36 m 5", comp);
+}
+
+#[test]
+fn test_tokenize14() {
+    let comp = vec!["36", " ", "m", " ", "5", " ", "s"];
+    tokenize_assert("36 m 5 s", comp);
+}
+
+#[test]
+fn test_tokenize15() {
+    let comp = vec!["36", " ", "m", " ", "05"];
+    tokenize_assert("36 m 05", comp);
+}
+
+#[test]
+fn test_tokenize16() {
+    let comp = vec!["36", " ", "m", " ", "05", " ", "s"];
+    tokenize_assert("36 m 05 s", comp);
+}
+
+#[test]
+fn test_tokenize17() {
+    let comp = vec!["10", "h", " ", "am"];
+    tokenize_assert("10h am", comp);
+}
+
+#[test]
+fn test_tokenize18() {
+    let comp = vec!["10", "h", " ", "pm"];
+    tokenize_assert("10h pm", comp);
+}
+
+#[test]
+fn test_tokenize19() {
+    let comp = vec!["10", "am"];
+    tokenize_assert("10am", comp);
+}
+
+#[test]
+fn test_tokenize20() {
+    let comp = vec!["10", "pm"];
+    tokenize_assert("10pm", comp);
+}
+
+#[test]
+fn test_tokenize21() {
+    let comp = vec!["10", ":", "00", " ", "am"];
+    tokenize_assert("10:00 am", comp);
+}
+
+#[test]
+fn test_tokenize22() {
+    let comp = vec!["10", ":", "00", " ", "pm"];
+    tokenize_assert("10:00 pm", comp);
+}
+
+#[test]
+fn test_tokenize23() {
+    let comp = vec!["10", ":", "00", "am"];
+    tokenize_assert("10:00am", comp);
+}
+
+#[test]
+fn test_tokenize24() {
+    let comp = vec!["10", ":", "00", "pm"];
+    tokenize_assert("10:00pm", comp);
+}
+
+#[test]
+fn test_tokenize25() {
+    let comp = vec!["10", ":", "00", "a", ".", "m"];
+    tokenize_assert("10:00a.m", comp);
+}
+
+#[test]
+fn test_tokenize26() {
+    let comp = vec!["10", ":", "00", "p", ".", "m"];
+    tokenize_assert("10:00p.m", comp);
+}
+
+#[test]
+fn test_tokenize27() {
+    let comp = vec!["10", ":", "00", "a", ".", "m", "."];
+    tokenize_assert("10:00a.m.", comp);
+}
+
+#[test]
+fn test_tokenize28() {
+    let comp = vec!["10", ":", "00", "p", ".", "m", "."];
+    tokenize_assert("10:00p.m.", comp);
+}
+
+#[test]
+fn test_tokenize29() {
+    let comp = vec!["October"];
+    tokenize_assert("October", comp);
+}
+
+#[test]
+fn test_tokenize30() {
+    let comp = vec!["31", "-", "Dec", "-", "00"];
+    tokenize_assert("31-Dec-00", comp);
+}
+
+#[test]
+fn test_tokenize31() {
+    let comp = vec!["0", ":", "01", ":", "02"];
+    tokenize_assert("0:01:02", comp);
+}
+
+#[test]
+fn test_tokenize32() {
+    let comp = vec!["12", "h", " ", "01", "m", "02", "s", " ", "am"];
+    tokenize_assert("12h 01m02s am", comp);
+}
+
+#[test]
+fn test_tokenize33() {
+    let comp = vec!["12", ":", "08", " ", "PM"];
+    tokenize_assert("12:08 PM", comp);
+}
+
+#[test]
+fn test_tokenize34() {
+    let comp = vec!["01", "h", "02", "m", "03"];
+    tokenize_assert("01h02m03", comp);
+}
+
+#[test]
+fn test_tokenize35() {
+    let comp = vec!["01", "h", "02"];
+    tokenize_assert("01h02", comp);
+}
+
+#[test]
+fn test_tokenize36() {
+    let comp = vec!["01", "h", "02", "s"];
+    tokenize_assert("01h02s", comp);
+}
+
+#[test]
+fn test_tokenize37() {
+    let comp = vec!["01", "m", "02"];
+    tokenize_assert("01m02", comp);
+}
+
+#[test]
+fn test_tokenize38() {
+    let comp = vec!["01", "m", "02", "h"];
+    tokenize_assert("01m02h", comp);
+}
+
+#[test]
+fn test_tokenize39() {
+    let comp = vec!["2004", " ", "10", " ", "Apr", " ", "11", "h", "30", "m"];
+    tokenize_assert("2004 10 Apr 11h30m", comp);
+}
+
+#[test]
+fn test_tokenize40() {
+    let comp = vec!["Sep", " ", "03"];
+    tokenize_assert("Sep 03", comp);
+}
+
+#[test]
+fn test_tokenize41() {
+    let comp = vec!["Sep", " ", "of", " ", "03"];
+    tokenize_assert("Sep of 03", comp);
+}
+
+#[test]
+fn test_tokenize42() {
+    let comp = vec!["02", ":", "17", "NOV", "2017"];
+    tokenize_assert("02:17NOV2017", comp);
+}
+
+#[test]
+fn test_tokenize43() {
+    let comp = vec!["Thu", " ", "Sep", " ", "10", ":", "36", ":", "28"];
+    tokenize_assert("Thu Sep 10:36:28", comp);
+}
+
+#[test]
+fn test_tokenize44() {
+    let comp = vec!["Thu", " ", "10", ":", "36", ":", "28"];
+    tokenize_assert("Thu 10:36:28", comp);
+}
+
+#[test]
+fn test_tokenize45() {
+    let comp = vec!["Wed"];
+    tokenize_assert("Wed", comp);
+}
+
+#[test]
+fn test_tokenize46() {
+    let comp = vec!["Wednesday"];
+    tokenize_assert("Wednesday", comp);
+}
+
+#[test]
+fn test_tokenize47() {
+    let comp = vec!["Thu", " ", "Sep", " ", "25", " ", "10", ":", "36", ":", "28", " ", "2003"];
+    tokenize_assert("Thu Sep 25 10:36:28 2003", comp);
+}
+
+#[test]
+fn test_tokenize48() {
+    let comp = vec!["Thu", " ", "Sep", " ", "25", " ", "2003"];
+    tokenize_assert("Thu Sep 25 2003", comp);
+}
+
+#[test]
+fn test_tokenize49() {
+    let comp = vec!["2003", "-", "09", "-", "25", "T", "10", ":", "49", ":", "41"];
+    tokenize_assert("2003-09-25T10:49:41", comp);
+}
+
+#[test]
+fn test_tokenize50() {
+    let comp = vec!["2003", "-", "09", "-", "25", "T", "10", ":", "49"];
+    tokenize_assert("2003-09-25T10:49", comp);
+}
+
+#[test]
+fn test_tokenize51() {
+    let comp = vec!["2003", "-", "09", "-", "25", "T", "10"];
+    tokenize_assert("2003-09-25T10", comp);
+}
+
+#[test]
+fn test_tokenize52() {
+    let comp = vec!["2003", "-", "09", "-", "25"];
+    tokenize_assert("2003-09-25", comp);
+}
+
+#[test]
+fn test_tokenize53() {
+    let comp = vec!["20030925", "T", "104941"];
+    tokenize_assert("20030925T104941", comp);
+}
+
+#[test]
+fn test_tokenize54() {
+    let comp = vec!["20030925", "T", "1049"];
+    tokenize_assert("20030925T1049", comp);
+}
+
+#[test]
+fn test_tokenize55() {
+    let comp = vec!["20030925", "T", "10"];
+    tokenize_assert("20030925T10", comp);
+}
+
+#[test]
+fn test_tokenize56() {
+    let comp = vec!["20030925"];
+    tokenize_assert("20030925", comp);
+}
+
+#[test]
+fn test_tokenize57() {
+    let comp = vec!["2003", "-", "09", "-", "25", " ", "10", ":", "49", ":", "41.502"];
+    tokenize_assert("2003-09-25 10:49:41,502", comp);
+}
+
+#[test]
+fn test_tokenize58() {
+    let comp = vec!["199709020908"];
+    tokenize_assert("199709020908", comp);
+}
+
+#[test]
+fn test_tokenize59() {
+    let comp = vec!["19970902090807"];
+    tokenize_assert("19970902090807", comp);
+}
+
+#[test]
+fn test_tokenize60() {
+    let comp = vec!["2003", "-", "09", "-", "25"];
+    tokenize_assert("2003-09-25", comp);
+}
+
+#[test]
+fn test_tokenize61() {
+    let comp = vec!["09", "-", "25", "-", "2003"];
+    tokenize_assert("09-25-2003", comp);
+}
+
+#[test]
+fn test_tokenize62() {
+    let comp = vec!["25", "-", "09", "-", "2003"];
+    tokenize_assert("25-09-2003", comp);
+}
+
+#[test]
+fn test_tokenize63() {
+    let comp = vec!["10", "-", "09", "-", "2003"];
+    tokenize_assert("10-09-2003", comp);
+}
+
+#[test]
+fn test_tokenize64() {
+    let comp = vec!["10", "-", "09", "-", "03"];
+    tokenize_assert("10-09-03", comp);
+}
+
+#[test]
+fn test_tokenize65() {
+    let comp = vec!["2003", ".", "09", ".", "25"];
+    tokenize_assert("2003.09.25", comp);
+}
+
+#[test]
+fn test_tokenize66() {
+    let comp = vec!["09", ".", "25", ".", "2003"];
+    tokenize_assert("09.25.2003", comp);
+}
+
+#[test]
+fn test_tokenize67() {
+    let comp = vec!["25", ".", "09", ".", "2003"];
+    tokenize_assert("25.09.2003", comp);
+}
+
+#[test]
+fn test_tokenize68() {
+    let comp = vec!["10", ".", "09", ".", "2003"];
+    tokenize_assert("10.09.2003", comp);
+}
+
+#[test]
+fn test_tokenize69() {
+    let comp = vec!["10", ".", "09", ".", "03"];
+    tokenize_assert("10.09.03", comp);
+}
+
+#[test]
+fn test_tokenize70() {
+    let comp = vec!["2003", "/", "09", "/", "25"];
+    tokenize_assert("2003/09/25", comp);
+}
+
+#[test]
+fn test_tokenize71() {
+    let comp = vec!["09", "/", "25", "/", "2003"];
+    tokenize_assert("09/25/2003", comp);
+}
+
+#[test]
+fn test_tokenize72() {
+    let comp = vec!["25", "/", "09", "/", "2003"];
+    tokenize_assert("25/09/2003", comp);
+}
+
+#[test]
+fn test_tokenize73() {
+    let comp = vec!["10", "/", "09", "/", "2003"];
+    tokenize_assert("10/09/2003", comp);
+}
+
+#[test]
+fn test_tokenize74() {
+    let comp = vec!["10", "/", "09", "/", "03"];
+    tokenize_assert("10/09/03", comp);
+}
+
+#[test]
+fn test_tokenize75() {
+    let comp = vec!["2003", " ", "09", " ", "25"];
+    tokenize_assert("2003 09 25", comp);
+}
+
+#[test]
+fn test_tokenize76() {
+    let comp = vec!["09", " ", "25", " ", "2003"];
+    tokenize_assert("09 25 2003", comp);
+}
+
+#[test]
+fn test_tokenize77() {
+    let comp = vec!["25", " ", "09", " ", "2003"];
+    tokenize_assert("25 09 2003", comp);
+}
+
+#[test]
+fn test_tokenize78() {
+    let comp = vec!["10", " ", "09", " ", "2003"];
+    tokenize_assert("10 09 2003", comp);
+}
+
+#[test]
+fn test_tokenize79() {
+    let comp = vec!["10", " ", "09", " ", "03"];
+    tokenize_assert("10 09 03", comp);
+}
+
+#[test]
+fn test_tokenize80() {
+    let comp = vec!["25", " ", "09", " ", "03"];
+    tokenize_assert("25 09 03", comp);
+}
+
+#[test]
+fn test_tokenize81() {
+    let comp = vec!["03", " ", "25", " ", "Sep"];
+    tokenize_assert("03 25 Sep", comp);
+}
+
+#[test]
+fn test_tokenize82() {
+    let comp = vec!["25", " ", "03", " ", "Sep"];
+    tokenize_assert("25 03 Sep", comp);
+}
+
+#[test]
+fn test_tokenize83() {
+    let comp = vec![" ", " ", "July", " ", " ", " ", "4", " ", ",", " ", " ", "1976", " ", " ", " ", "12", ":", "01", ":", "02", " ", " ", " ", "am", " ", " "];
+    tokenize_assert("  July   4 ,  1976   12:01:02   am  ", comp);
+}
+
+#[test]
+fn test_tokenize84() {
+    let comp = vec!["Wed", ",", " ", "July", " ", "10", ",", " ", "'", "96"];
+    tokenize_assert("Wed, July 10, '96", comp);
+}
+
+#[test]
+fn test_tokenize85() {
+    let comp = vec!["1996", ".", "July", ".", "10", " ", "AD", " ", "12", ":", "08", " ", "PM"];
+    tokenize_assert("1996.July.10 AD 12:08 PM", comp);
+}
+
+#[test]
+fn test_tokenize86() {
+    let comp = vec!["July", " ", "4", ",", " ", "1976"];
+    tokenize_assert("July 4, 1976", comp);
+}
+
+#[test]
+fn test_tokenize87() {
+    let comp = vec!["7", " ", "4", " ", "1976"];
+    tokenize_assert("7 4 1976", comp);
+}
+
+#[test]
+fn test_tokenize88() {
+    let comp = vec!["4", " ", "jul", " ", "1976"];
+    tokenize_assert("4 jul 1976", comp);
+}
+
+#[test]
+fn test_tokenize89() {
+    let comp = vec!["7", "-", "4", "-", "76"];
+    tokenize_assert("7-4-76", comp);
+}
+
+#[test]
+fn test_tokenize90() {
+    let comp = vec!["19760704"];
+    tokenize_assert("19760704", comp);
+}
+
+#[test]
+fn test_tokenize91() {
+    let comp = vec!["0", ":", "01", ":", "02", " ", "on", " ", "July", " ", "4", ",", " ", "1976"];
+    tokenize_assert("0:01:02 on July 4, 1976", comp);
+}
+
+#[test]
+fn test_tokenize92() {
+    let comp = vec!["0", ":", "01", ":", "02", " ", "on", " ", "July", " ", "4", ",", " ", "1976"];
+    tokenize_assert("0:01:02 on July 4, 1976", comp);
+}
+
+#[test]
+fn test_tokenize93() {
+    let comp = vec!["July", " ", "4", ",", " ", "1976", " ", "12", ":", "01", ":", "02", " ", "am"];
+    tokenize_assert("July 4, 1976 12:01:02 am", comp);
+}
+
+#[test]
+fn test_tokenize94() {
+    let comp = vec!["Mon", " ", "Jan", " ", " ", "2", " ", "04", ":", "24", ":", "27", " ", "1995"];
+    tokenize_assert("Mon Jan  2 04:24:27 1995", comp);
+}
+
+#[test]
+fn test_tokenize95() {
+    let comp = vec!["04", ".", "04", ".", "95", " ", "00", ":", "22"];
+    tokenize_assert("04.04.95 00:22", comp);
+}
+
+#[test]
+fn test_tokenize96() {
+    let comp = vec!["Jan", " ", "1", " ", "1999", " ", "11", ":", "23", ":", "34.578"];
+    tokenize_assert("Jan 1 1999 11:23:34.578", comp);
+}
+
+#[test]
+fn test_tokenize97() {
+    let comp = vec!["950404", " ", "122212"];
+    tokenize_assert("950404 122212", comp);
+}
+
+#[test]
+fn test_tokenize98() {
+    let comp = vec!["3", "rd", " ", "of", " ", "May", " ", "2001"];
+    tokenize_assert("3rd of May 2001", comp);
+}
+
+#[test]
+fn test_tokenize99() {
+    let comp = vec!["5", "th", " ", "of", " ", "March", " ", "2001"];
+    tokenize_assert("5th of March 2001", comp);
+}
+
+#[test]
+fn test_tokenize100() {
+    let comp = vec!["1", "st", " ", "of", " ", "May", " ", "2003"];
+    tokenize_assert("1st of May 2003", comp);
+}
+
+#[test]
+fn test_tokenize101() {
+    let comp = vec!["0099", "-", "01", "-", "01", "T", "00", ":", "00", ":", "00"];
+    tokenize_assert("0099-01-01T00:00:00", comp);
+}
+
+#[test]
+fn test_tokenize102() {
+    let comp = vec!["0031", "-", "01", "-", "01", "T", "00", ":", "00", ":", "00"];
+    tokenize_assert("0031-01-01T00:00:00", comp);
+}
+
+#[test]
+fn test_tokenize103() {
+    let comp = vec!["20080227", "T", "21", ":", "26", ":", "01.123456789"];
+    tokenize_assert("20080227T21:26:01.123456789", comp);
+}
+
+#[test]
+fn test_tokenize104() {
+    let comp = vec!["13", "NOV", "2017"];
+    tokenize_assert("13NOV2017", comp);
+}
+
+#[test]
+fn test_tokenize105() {
+    let comp = vec!["0003", "-", "03", "-", "04"];
+    tokenize_assert("0003-03-04", comp);
+}
+
+#[test]
+fn test_tokenize106() {
+    let comp = vec!["December", ".", "0031", ".", "30"];
+    tokenize_assert("December.0031.30", comp);
+}
+
+#[test]
+fn test_tokenize107() {
+    let comp = vec!["090107"];
+    tokenize_assert("090107", comp);
+}
+
+#[test]
+fn test_tokenize108() {
+    let comp = vec!["2015", "-", "15", "-", "May"];
+    tokenize_assert("2015-15-May", comp);
+}
+
+#[test]
+fn test_tokenize109() {
+    let comp = vec!["Thu", " ", "Sep", " ", "25", " ", "10", ":", "36", ":", "28", " ", "BRST", " ", "2003"];
+    tokenize_assert("Thu Sep 25 10:36:28 BRST 2003", comp);
+}
+
+#[test]
+fn test_tokenize110() {
+    let comp = vec!["2003", " ", "10", ":", "36", ":", "28", " ", "BRST", " ", "25", " ", "Sep", " ", "Thu"];
+    tokenize_assert("2003 10:36:28 BRST 25 Sep Thu", comp);
+}
+
+#[test]
+fn test_tokenize111() {
+    let comp = vec!["Thu", ",", " ", "25", " ", "Sep", " ", "2003", " ", "10", ":", "49", ":", "41", " ", "-", "0300"];
+    tokenize_assert("Thu, 25 Sep 2003 10:49:41 -0300", comp);
+}
+
+#[test]
+fn test_tokenize112() {
+    let comp = vec!["2003", "-", "09", "-", "25", "T", "10", ":", "49", ":", "41.5", "-", "03", ":", "00"];
+    tokenize_assert("2003-09-25T10:49:41.5-03:00", comp);
+}
+
+#[test]
+fn test_tokenize113() {
+    let comp = vec!["2003", "-", "09", "-", "25", "T", "10", ":", "49", ":", "41", "-", "03", ":", "00"];
+    tokenize_assert("2003-09-25T10:49:41-03:00", comp);
+}
+
+#[test]
+fn test_tokenize114() {
+    let comp = vec!["20030925", "T", "104941.5", "-", "0300"];
+    tokenize_assert("20030925T104941.5-0300", comp);
+}
+
+#[test]
+fn test_tokenize115() {
+    let comp = vec!["20030925", "T", "104941", "-", "0300"];
+    tokenize_assert("20030925T104941-0300", comp);
+}
+
+#[test]
+fn test_tokenize116() {
+    let comp = vec!["10", "-", "09", "-", "2003"];
+    tokenize_assert("10-09-2003", comp);
+}
+
+#[test]
+fn test_tokenize117() {
+    let comp = vec!["10", ".", "09", ".", "2003"];
+    tokenize_assert("10.09.2003", comp);
+}
+
+#[test]
+fn test_tokenize118() {
+    let comp = vec!["10", "/", "09", "/", "2003"];
+    tokenize_assert("10/09/2003", comp);
+}
+
+#[test]
+fn test_tokenize119() {
+    let comp = vec!["10", " ", "09", " ", "2003"];
+    tokenize_assert("10 09 2003", comp);
+}
+
+#[test]
+fn test_tokenize120() {
+    let comp = vec!["090107"];
+    tokenize_assert("090107", comp);
+}
+
+#[test]
+fn test_tokenize121() {
+    let comp = vec!["2015", " ", "09", " ", "25"];
+    tokenize_assert("2015 09 25", comp);
+}
+
+#[test]
+fn test_tokenize122() {
+    let comp = vec!["10", "-", "09", "-", "03"];
+    tokenize_assert("10-09-03", comp);
+}
+
+#[test]
+fn test_tokenize123() {
+    let comp = vec!["10", ".", "09", ".", "03"];
+    tokenize_assert("10.09.03", comp);
+}
+
+#[test]
+fn test_tokenize124() {
+    let comp = vec!["10", "/", "09", "/", "03"];
+    tokenize_assert("10/09/03", comp);
+}
+
+#[test]
+fn test_tokenize125() {
+    let comp = vec!["10", " ", "09", " ", "03"];
+    tokenize_assert("10 09 03", comp);
+}
+
+#[test]
+fn test_tokenize126() {
+    let comp = vec!["090107"];
+    tokenize_assert("090107", comp);
+}
+
+#[test]
+fn test_tokenize127() {
+    let comp = vec!["2015", " ", "09", " ", "25"];
+    tokenize_assert("2015 09 25", comp);
+}
+
+#[test]
+fn test_tokenize128() {
+    let comp = vec!["090107"];
+    tokenize_assert("090107", comp);
+}
+
+#[test]
+fn test_tokenize129() {
+    let comp = vec!["2015", " ", "09", " ", "25"];
+    tokenize_assert("2015 09 25", comp);
+}
+
+#[test]
+fn test_tokenize130() {
+    let comp = vec!["April", " ", "2009"];
+    tokenize_assert("April 2009", comp);
+}
+
+#[test]
+fn test_tokenize131() {
+    let comp = vec!["Feb", " ", "2007"];
+    tokenize_assert("Feb 2007", comp);
+}
+
+#[test]
+fn test_tokenize132() {
+    let comp = vec!["Feb", " ", "2008"];
+    tokenize_assert("Feb 2008", comp);
+}
+
+#[test]
+fn test_tokenize133() {
+    let comp = vec!["Thu", " ", "Sep", " ", "25", " ", "10", ":", "36", ":", "28", " ", "BRST", " ", "2003"];
+    tokenize_assert("Thu Sep 25 10:36:28 BRST 2003", comp);
+}
+
+#[test]
+fn test_tokenize134() {
+    let comp = vec!["1996", ".", "07", ".", "10", " ", "AD", " ", "at", " ", "15", ":", "08", ":", "56", " ", "PDT"];
+    tokenize_assert("1996.07.10 AD at 15:08:56 PDT", comp);
+}
+
+#[test]
+fn test_tokenize135() {
+    let comp = vec!["Tuesday", ",", " ", "April", " ", "12", ",", " ", "1952", " ", "AD", " ", "3", ":", "30", ":", "42", "pm", " ", "PST"];
+    tokenize_assert("Tuesday, April 12, 1952 AD 3:30:42pm PST", comp);
+}
+
+#[test]
+fn test_tokenize136() {
+    let comp = vec!["November", " ", "5", ",", " ", "1994", ",", " ", "8", ":", "15", ":", "30", " ", "am", " ", "EST"];
+    tokenize_assert("November 5, 1994, 8:15:30 am EST", comp);
+}
+
+#[test]
+fn test_tokenize137() {
+    let comp = vec!["1994", "-", "11", "-", "05", "T", "08", ":", "15", ":", "30", "-", "05", ":", "00"];
+    tokenize_assert("1994-11-05T08:15:30-05:00", comp);
+}
+
+#[test]
+fn test_tokenize138() {
+    let comp = vec!["1994", "-", "11", "-", "05", "T", "08", ":", "15", ":", "30", "Z"];
+    tokenize_assert("1994-11-05T08:15:30Z", comp);
+}
+
+#[test]
+fn test_tokenize139() {
+    let comp = vec!["1976", "-", "07", "-", "04", "T", "00", ":", "01", ":", "02", "Z"];
+    tokenize_assert("1976-07-04T00:01:02Z", comp);
+}
+
+#[test]
+fn test_tokenize140() {
+    let comp = vec!["Tue", " ", "Apr", " ", "4", " ", "00", ":", "22", ":", "12", " ", "PDT", " ", "1995"];
+    tokenize_assert("Tue Apr 4 00:22:12 PDT 1995", comp);
+}
+
+#[test]
+fn test_tokenize141() {
+    let comp = vec!["Today", " ", "is", " ", "25", " ", "of", " ", "September", " ", "of", " ", "2003", ",", " ", "exactly", " ", "at", " ", "10", ":", "49", ":", "41", " ", "with", " ", "timezone", " ", "-", "03", ":", "00", "."];
+    tokenize_assert("Today is 25 of September of 2003, exactly at 10:49:41 with timezone -03:00.", comp);
+}
+
+#[test]
+fn test_tokenize142() {
+    let comp = vec!["Today", " ", "is", " ", "25", " ", "of", " ", "September", " ", "of", " ", "2003", ",", " ", "exactly", " ", "at", " ", "10", ":", "49", ":", "41", " ", "with", " ", "timezone", " ", "-", "03", ":", "00", "."];
+    tokenize_assert("Today is 25 of September of 2003, exactly at 10:49:41 with timezone -03:00.", comp);
+}
--- a/src/tokenize.rs
+++ b/src/tokenize.rs
@ -1,5 +1,6 @@
 pub(crate) struct Tokenizer {
    token_stack: Vec<String>,
+    // TODO: Should this be more generic? io::Read for example?
    parse_string: String,
 }

@ -13,12 +14,49 @@ pub(crate) enum ParseState {
 }

 impl Tokenizer {
-    pub(crate) fn new(parse_string: String) -> Self {
+
+    pub(crate) fn new(parse_string: &str) -> Self {
        Tokenizer {
-            token_stack: Vec::new(),
+            token_stack: vec![],
            parse_string: parse_string.chars().rev().collect(),
        }
    }
+
+    fn isword(&self, c: char) -> bool {
+        c.is_alphabetic()
+    }
+
+    fn isnum(&self, c: char) -> bool {
+        c.is_numeric()
+    }
+
+    fn isspace(&self, c: char) -> bool {
+        c.is_whitespace()
+    }
+
+    fn decimal_split(&self, s: &str) -> Vec<String> {
+        // Handles the same thing as Python's re.split()
+        let mut tokens: Vec<String> = vec!["".to_owned()];
+
+        for c in s.chars() {
+            if c == '.' || c == ',' {
+                tokens.push(c.to_string());
+                tokens.push("".to_owned());
+            } else {
+                // UNWRAP: Initial setup guarantees we always have an item
+                let mut t = tokens.pop().unwrap();
+                t.push(c);
+                tokens.push(t);
+            }
+        }
+
+        // TODO: Do I really have to use &String instead of &str?
+        if tokens.last() == Some(&"".to_owned()) {
+            tokens.pop();
+        }
+
+        tokens
+    }
 }

 impl Iterator for Tokenizer {
@ -26,182 +64,115 @@ impl Iterator for Tokenizer {

    fn next(&mut self) -> Option<Self::Item> {
+        // Returns the next token, or `None` once both the queued tokens and the
+        // input are exhausted. Tokens split off by an earlier call are served
+        // first, in the order they were queued.
        if !self.token_stack.is_empty() {
-            return Some(self.token_stack.pop().unwrap());
-        };
-        if self.parse_string.is_empty() {
-            return None;
-        };
+            return Some(self.token_stack.remove(0));
+        }

-        let mut char_stack: Vec<char> = Vec::new();
-        let mut seen_letters = false;
+        let mut seenletters = false;
+        let mut token: Option<String> = None;
        let mut state = ParseState::Empty;

-        while let Some(next) = self.parse_string.pop() {
+        // Dateutil uses a separate `charstack` to manage the incoming stream.
+        // Because parse_string can have things pushed back onto it, we skip
+        // a couple of steps related to the `charstack`.
+        while let Some(nextchar) = self.parse_string.pop() {
            match state {
                ParseState::Empty => {
-                    if next.is_numeric() {
-                        state = ParseState::Numeric;
-                        char_stack.push(next);
-                    } else if next.is_alphabetic() {
+                    token = Some(nextchar.to_string());
+                    if self.isword(nextchar) {
                        state = ParseState::Alpha;
-                        seen_letters = true;
-                        char_stack.push(next);
-                    } else if next.is_whitespace() {
-                        char_stack.push(' ');
+                    } else if self.isnum(nextchar) {
+                        state = ParseState::Numeric;
+                    } else if self.isspace(nextchar) {
+                        token = Some(" ".to_owned());
                        break;
                    } else {
-                        char_stack.push(next);
                        break;
                    }
-                }
+                },
                ParseState::Alpha => {
-                    if next.is_alphabetic() {
-                        char_stack.push(next);
-                    } else if next == '.' {
+                    seenletters = true;
+                    if self.isword(nextchar) {
+                        // UNWRAP: Because we're in non-empty parse state, we're guaranteed to have a token
+                        token.as_mut().unwrap().push(nextchar);
+                    } else if nextchar == '.' {
+                        token.as_mut().unwrap().push(nextchar);
                        state = ParseState::AlphaDecimal;
-                        char_stack.push(next);
                    } else {
-                        // We don't recognize the character, so push it back
-                        // to be handled later.
-                        self.parse_string.push(next);
+                        self.parse_string.push(nextchar);
                        break;
                    }
-                }
-                ParseState::AlphaDecimal => {
-                    if next == '.' || next.is_alphabetic() {
-                        char_stack.push(next);
-                    } else if next.is_numeric() && char_stack.last().unwrap().clone() == '.' {
-                        char_stack.push(next);
-                        state = ParseState::NumericDecimal;
-                    } else {
-                        self.parse_string.push(next);
-                        break;
-                    }
-                }
+                },
                ParseState::Numeric => {
-                    if next.is_numeric() {
-                        char_stack.push(next);
-                    } else if next == '.' || (next == ',' && char_stack.len() >= 2) {
-                        char_stack.push(next);
+                    // A comma only starts a decimal part once the token holds at
+                    // least two characters. Count characters, not bytes:
+                    // `String::len()` is a byte length and `isnum` accepts
+                    // multi-byte numerics, so a single such digit must not qualify.
+                    if self.isnum(nextchar) {
+                        // UNWRAP: Because we're in non-empty parse state, we're guaranteed to have a token
+                        token.as_mut().unwrap().push(nextchar);
+                    } else if nextchar == '.' || (nextchar == ',' && token.as_ref().unwrap().chars().count() >= 2) {
+                        token.as_mut().unwrap().push(nextchar);
                        state = ParseState::NumericDecimal;
                    } else {
-                        // We don't recognize the character, so push it back
-                        // to be handled later
-                        self.parse_string.push(next);
+                        self.parse_string.push(nextchar);
                        break;
                    }
-                }
+                },
+                ParseState::AlphaDecimal => {
+                    seenletters = true;
+                    if nextchar == '.' || self.isword(nextchar) {
+                        // UNWRAP: Because we're in non-empty parse state, we're guaranteed to have a token
+                        token.as_mut().unwrap().push(nextchar);
+                    } else if self.isnum(nextchar) && token.as_ref().unwrap().chars().last() == Some('.') {
+                        token.as_mut().unwrap().push(nextchar);
+                        state = ParseState::NumericDecimal;
+                    } else {
+                        self.parse_string.push(nextchar);
+                        break;
+                    }
+                },
                ParseState::NumericDecimal => {
-                    if next == '.' || next.is_numeric() {
-                        char_stack.push(next);
-                    } else if next.is_alphabetic() && char_stack.last().unwrap().clone() == '.' {
-                        char_stack.push(next);
+                    if nextchar == '.' || self.isnum(nextchar) {
+                        // UNWRAP: Because we're in non-empty parse state, we're guaranteed to have a token
+                        token.as_mut().unwrap().push(nextchar);
+                    } else if self.isword(nextchar) && token.as_ref().unwrap().chars().last() == Some('.') {
+                        token.as_mut().unwrap().push(nextchar);
                        state = ParseState::AlphaDecimal;
                    } else {
-                        self.parse_string.push(next);
+                        self.parse_string.push(nextchar);
                        break;
                    }
                }
            }
        }

-        // I like Python's version of this much better:
-        // needs_split = seen_letters or char_stack.count('.') > 1 or char_stack[-1] in '.,'
-        let dot_count = char_stack.iter().fold(0, |count, character| {
-            count + (if character == &'.' { 1 } else { 0 })
-        });
-        let needs_split = seen_letters || dot_count > 1 || char_stack.last().unwrap() == &'.'
-            || char_stack.last().unwrap() == &',';
-        let final_string: String = char_stack.into_iter().collect();
-
-        let mut tokens = match state {
-            ParseState::Empty => vec![final_string],
-            ParseState::Alpha => vec![final_string],
-            ParseState::Numeric => vec![final_string],
-            ParseState::AlphaDecimal => {
-                if needs_split {
-                    decimal_split(&final_string, false)
-                } else {
-                    vec![final_string]
+        // Python uses the state to short-circuit and make sure it doesn't run into issues with None
+        // We do something slightly different to express the same logic
+        if state == ParseState::AlphaDecimal || state == ParseState::NumericDecimal {
+            // UNWRAP: The state check guarantees that we have a value
+            let dot_count = token.as_ref().unwrap().chars().filter(|c| *c == '.').count();
+            let last_char = token.as_ref().unwrap().chars().last();
+            let last_splittable = last_char == Some('.') || last_char == Some(',');
+
+            if seenletters || dot_count > 1 || last_splittable {
+                let mut l = self.decimal_split(token.as_ref().unwrap());
+                let remaining = l.split_off(1);
+
+                token = Some(l[0].clone());
+                // Adjacent separators (e.g. "..") leave empty pieces behind;
+                // dateutil skips them (`if tok:`), so never queue an empty token.
+                for t in remaining.into_iter().filter(|t| !t.is_empty()) {
+                    self.token_stack.push(t);
                }
            }
-            ParseState::NumericDecimal => {
-                if needs_split {
-                    decimal_split(&final_string, dot_count == 0)
-                } else {
-                    vec![final_string]
-                }
+
+            // A number written with a decimal comma and no dot is normalised to a dot.
+            if state == ParseState::NumericDecimal && dot_count == 0 {
+                token = Some(token.unwrap().replace(',', "."));
            }
-        }.into_iter()
-            .rev()
-            .collect();
-
-        self.token_stack.append(&mut tokens);
-        // UNWRAP: Previous match guaranteed that at least one token was added
-        let token = self.token_stack.pop().unwrap();
-        if state == ParseState::NumericDecimal && !token.contains(".") {
-            Some(token.replace(",", "."))
-        } else {
-            Some(token)
        }
+
+        token
    }
 }

-fn decimal_split(characters: &str, cast_period: bool) -> Vec<String> {
-    let mut token_stack: Vec<String> = Vec::new();
-    let mut char_stack: Vec<char> = Vec::new();
-    let mut state = ParseState::Empty;
-
-    for c in characters.chars() {
-        match state {
-            ParseState::Empty => {
-                if c.is_alphabetic() {
-                    char_stack.push(c);
-                    state = ParseState::Alpha;
-                } else if c.is_numeric() {
-                    char_stack.push(c);
-                    state = ParseState::Numeric;
-                } else {
-                    let character = if cast_period { '.' } else { c };
-                    token_stack.push(character.to_string());
-                }
-            }
-            ParseState::Alpha => {
-                if c.is_alphabetic() {
-                    char_stack.push(c);
-                } else {
-                    token_stack.push(char_stack.iter().collect());
-                    char_stack.clear();
-                    let character = if cast_period { '.' } else { c };
-                    token_stack.push(character.to_string());
-                    state = ParseState::Empty;
-                }
-            }
-            ParseState::Numeric => {
-                if c.is_numeric() {
-                    char_stack.push(c);
-                } else {
-                    token_stack.push(char_stack.iter().collect());
-                    char_stack.clear();
-                    let character = if cast_period { '.' } else { c };
-                    token_stack.push(character.to_string());
-                    state = ParseState::Empty;
-                }
-            }
-            _ => panic!("Invalid parse state during decimal_split()"),
-        }
-    }
-
-    match state {
-        ParseState::Alpha => token_stack.push(char_stack.iter().collect()),
-        ParseState::Numeric => token_stack.push(char_stack.iter().collect()),
-        ParseState::Empty => (),
-        _ => panic!("Invalid parse state during decimal_split()"),
-    }
-
-    token_stack
-}
-
 #[cfg(test)]
 mod tests {

@ -209,7 +180,7 @@ mod tests {

    #[test]
    fn test_basic() {
+        // Words, single spaces, the number and the trailing comma each come out as separate tokens.
-        let tokens: Vec<String> = Tokenizer::new("September of 2003,".to_owned()).collect();
+        let tokens: Vec<String> = Tokenizer::new("September of 2003,").collect();
        assert_eq!(tokens, vec!["September", " ", "of", " ", "2003", ","]);
    }
 }
--- a/src/weekday.rs
+++ b/src/weekday.rs
@ -99,6 +99,8 @@ pub fn day_of_week(year: u32, month: u32, day: u32) -> ParseResult<DayOfWeek> {
    }
 }

+// Rust warns about unused imports here, but they're definitely used.
+#[allow(unused_imports)]
 mod test {

    use weekday::day_of_week;