mirror of
https://github.com/bspeice/dtparse
synced 2024-12-22 12:28:08 -05:00
Add a custom parsing example in Russian
This commit is contained in:
parent
b966c02d04
commit
28b7bec91d
@ -63,6 +63,7 @@ assert_eq!(
|
|||||||
);
|
);
|
||||||
```
|
```
|
||||||
|
|
||||||
Further examples can be found in the `examples` directory on international usage.
|
Further examples can be found in the [examples](examples) directory on international usage.
|
||||||
|
|
||||||
[dateutil]: https://github.com/dateutil/dateutil
|
[dateutil]: https://github.com/dateutil/dateutil
|
||||||
|
[examples]: https://github.com/bspeice/dtparse/tree/master/examples
|
||||||
|
39
examples/russian.rs
Normal file
39
examples/russian.rs
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
extern crate chrono;
|
||||||
|
extern crate dtparse;
|
||||||
|
|
||||||
|
use chrono::NaiveDate;
|
||||||
|
use dtparse::parse_info;
|
||||||
|
use dtparse::Parser;
|
||||||
|
use dtparse::ParserInfo;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
|
||||||
|
// In this example, we'll just swap the default "months" parameter
|
||||||
|
// with a version in Russian. Lovingly taken from:
|
||||||
|
// https://github.com/dateutil/dateutil/blob/99f5770e7c63aa049b28abe465d7f1cc25b63fd2/dateutil/test/test_parser.py#L244
|
||||||
|
|
||||||
|
let mut info = ParserInfo::default();
|
||||||
|
info.months = parse_info(vec![
|
||||||
|
vec!["янв", "Январь"],
|
||||||
|
vec!["фев", "Февраль"],
|
||||||
|
vec!["мар", "Март"],
|
||||||
|
vec!["апр", "Апрель"],
|
||||||
|
vec!["май", "Май"],
|
||||||
|
vec!["июн", "Июнь"],
|
||||||
|
vec!["июл", "Июль"],
|
||||||
|
vec!["авг", "Август"],
|
||||||
|
vec!["сен", "Сентябрь"],
|
||||||
|
vec!["окт", "Октябрь"],
|
||||||
|
vec!["ноя", "Ноябрь"],
|
||||||
|
vec!["дек", "Декабрь"]
|
||||||
|
]);
|
||||||
|
|
||||||
|
let mut p = Parser::new(info);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
p.parse("10 Сентябрь 2015 10:20", None, None, false, false, None, false, &HashMap::new())
|
||||||
|
.unwrap().0,
|
||||||
|
NaiveDate::from_ymd(2015, 9, 10).and_hms(10, 20, 0)
|
||||||
|
);
|
||||||
|
}
|
103
src/lib.rs
103
src/lib.rs
@ -184,18 +184,33 @@ pub fn parse_info(vec: Vec<Vec<&str>>) -> HashMap<String, usize> {
|
|||||||
/// and will be parsed as "YDM"
|
/// and will be parsed as "YDM"
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub struct ParserInfo {
|
pub struct ParserInfo {
|
||||||
jump: HashMap<String, usize>,
|
/// Tokens that can be safely ignored
|
||||||
weekday: HashMap<String, usize>,
|
pub jump: HashMap<String, usize>,
|
||||||
months: HashMap<String, usize>,
|
/// Names of all seven weekdays
|
||||||
hms: HashMap<String, usize>,
|
pub weekday: HashMap<String, usize>,
|
||||||
ampm: HashMap<String, usize>,
|
/// Names of all twelve months
|
||||||
utczone: HashMap<String, usize>,
|
pub months: HashMap<String, usize>,
|
||||||
pertain: HashMap<String, usize>,
|
/// Tokens to indicate a value is in units of hours, minutes, or seconds
|
||||||
tzoffset: HashMap<String, usize>,
|
pub hms: HashMap<String, usize>,
|
||||||
dayfirst: bool,
|
/// Tokens to indicate a value refers to AM or PM time
|
||||||
yearfirst: bool,
|
pub ampm: HashMap<String, usize>,
|
||||||
year: i32,
|
/// Tokens to indicate our timestamp is in the UTC timezone
|
||||||
century: i32,
|
pub utczone: HashMap<String, usize>,
|
||||||
|
/// Tokens to indicate values "belonging" to other tokens (e.g. 3rd *of* March)
|
||||||
|
pub pertain: HashMap<String, usize>,
|
||||||
|
/// Map of timezone names to their offset in seconds
|
||||||
|
pub tzoffset: HashMap<String, usize>,
|
||||||
|
/// For ambiguous year/month/day values, and `dayfirst` was not specified as
|
||||||
|
/// an argument to `Parser`, treat the first observed value as the day.
|
||||||
|
pub dayfirst: bool,
|
||||||
|
/// For ambiguous year/month/day values, and `dayfirst` was not specified as
|
||||||
|
/// an argument to `Parser`, treat the first observed value as the day.
|
||||||
|
/// Takes priority over `dayfirst`
|
||||||
|
pub yearfirst: bool,
|
||||||
|
/// The current year we are parsing values for
|
||||||
|
pub year: i32,
|
||||||
|
/// The current year we are parsing values for *modulo* 100
|
||||||
|
pub century: i32,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for ParserInfo {
|
impl Default for ParserInfo {
|
||||||
@ -252,23 +267,23 @@ impl Default for ParserInfo {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl ParserInfo {
|
impl ParserInfo {
|
||||||
fn get_jump(&self, name: &str) -> bool {
|
fn jump_index(&self, name: &str) -> bool {
|
||||||
self.jump.contains_key(&name.to_lowercase())
|
self.jump.contains_key(&name.to_lowercase())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_weekday(&self, name: &str) -> Option<usize> {
|
fn weekday_index(&self, name: &str) -> Option<usize> {
|
||||||
self.weekday.get(&name.to_lowercase()).map(|i| *i)
|
self.weekday.get(&name.to_lowercase()).map(|i| *i)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_month(&self, name: &str) -> Option<usize> {
|
fn month_index(&self, name: &str) -> Option<usize> {
|
||||||
self.months.get(&name.to_lowercase()).map(|u| u + 1)
|
self.months.get(&name.to_lowercase()).map(|u| u + 1)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_hms(&self, name: &str) -> Option<usize> {
|
fn hms_index(&self, name: &str) -> Option<usize> {
|
||||||
self.hms.get(&name.to_lowercase()).map(|i| *i)
|
self.hms.get(&name.to_lowercase()).map(|i| *i)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_ampm(&self, name: &str) -> Option<bool> {
|
fn ampm_index(&self, name: &str) -> Option<bool> {
|
||||||
if let Some(v) = self.ampm.get(&name.to_lowercase()) {
|
if let Some(v) = self.ampm.get(&name.to_lowercase()) {
|
||||||
// Python technically uses numbers here, but given that the numbers are
|
// Python technically uses numbers here, but given that the numbers are
|
||||||
// only 0 and 1, it's easier to use booleans
|
// only 0 and 1, it's easier to use booleans
|
||||||
@ -278,15 +293,15 @@ impl ParserInfo {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_pertain(&self, name: &str) -> bool {
|
fn pertain_index(&self, name: &str) -> bool {
|
||||||
self.pertain.contains_key(&name.to_lowercase())
|
self.pertain.contains_key(&name.to_lowercase())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_utczone(&self, name: &str) -> bool {
|
fn utczone_index(&self, name: &str) -> bool {
|
||||||
self.utczone.contains_key(&name.to_lowercase())
|
self.utczone.contains_key(&name.to_lowercase())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_tzoffset(&self, name: &str) -> Option<usize> {
|
fn tzoffset_index(&self, name: &str) -> Option<usize> {
|
||||||
if self.utczone.contains_key(&name.to_lowercase()) {
|
if self.utczone.contains_key(&name.to_lowercase()) {
|
||||||
Some(0)
|
Some(0)
|
||||||
} else {
|
} else {
|
||||||
@ -319,7 +334,7 @@ impl ParserInfo {
|
|||||||
res.tzname = Some("UTC".to_owned());
|
res.tzname = Some("UTC".to_owned());
|
||||||
res.tzoffset = Some(0);
|
res.tzoffset = Some(0);
|
||||||
} else if res.tzoffset != Some(0) && res.tzname.is_some()
|
} else if res.tzoffset != Some(0) && res.tzname.is_some()
|
||||||
&& self.get_utczone(res.tzname.as_ref().unwrap())
|
&& self.utczone_index(res.tzname.as_ref().unwrap())
|
||||||
{
|
{
|
||||||
res.tzoffset = Some(0);
|
res.tzoffset = Some(0);
|
||||||
}
|
}
|
||||||
@ -709,9 +724,9 @@ impl Parser {
|
|||||||
|
|
||||||
if let Ok(_v) = Decimal::from_str(&value_repr) {
|
if let Ok(_v) = Decimal::from_str(&value_repr) {
|
||||||
i = self.parse_numeric_token(&l, i, &self.info, &mut ymd, &mut res, fuzzy)?;
|
i = self.parse_numeric_token(&l, i, &self.info, &mut ymd, &mut res, fuzzy)?;
|
||||||
} else if let Some(value) = self.info.get_weekday(&l[i]) {
|
} else if let Some(value) = self.info.weekday_index(&l[i]) {
|
||||||
res.weekday = Some(value);
|
res.weekday = Some(value);
|
||||||
} else if let Some(value) = self.info.get_month(&l[i]) {
|
} else if let Some(value) = self.info.month_index(&l[i]) {
|
||||||
ymd.append(value as i32, &l[i], Some(YMDLabel::Month))?;
|
ymd.append(value as i32, &l[i], Some(YMDLabel::Month))?;
|
||||||
|
|
||||||
if i + 1 < len_l {
|
if i + 1 < len_l {
|
||||||
@ -729,7 +744,7 @@ impl Parser {
|
|||||||
|
|
||||||
i += 2;
|
i += 2;
|
||||||
} else if i + 4 < len_l && l[i + 1] == l[i + 3] && l[i + 3] == " "
|
} else if i + 4 < len_l && l[i + 1] == l[i + 3] && l[i + 3] == " "
|
||||||
&& self.info.get_pertain(&l[i + 2])
|
&& self.info.pertain_index(&l[i + 2])
|
||||||
{
|
{
|
||||||
// Jan of 01
|
// Jan of 01
|
||||||
if let Some(value) = l[i + 4].parse::<i32>().ok() {
|
if let Some(value) = l[i + 4].parse::<i32>().ok() {
|
||||||
@ -740,7 +755,7 @@ impl Parser {
|
|||||||
i += 4;
|
i += 4;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if let Some(value) = self.info.get_ampm(&l[i]) {
|
} else if let Some(value) = self.info.ampm_index(&l[i]) {
|
||||||
let is_ampm = self.ampm_valid(res.hour, res.ampm, fuzzy);
|
let is_ampm = self.ampm_valid(res.hour, res.ampm, fuzzy);
|
||||||
|
|
||||||
if is_ampm == Ok(true) {
|
if is_ampm == Ok(true) {
|
||||||
@ -753,7 +768,7 @@ impl Parser {
|
|||||||
res.tzname = Some(l[i].clone());
|
res.tzname = Some(l[i].clone());
|
||||||
|
|
||||||
let tzname = res.tzname.clone().unwrap();
|
let tzname = res.tzname.clone().unwrap();
|
||||||
res.tzoffset = self.info.get_tzoffset(&tzname).map(|t| t as i32);
|
res.tzoffset = self.info.tzoffset_index(&tzname).map(|t| t as i32);
|
||||||
|
|
||||||
if i + 1 < len_l && (l[i + 1] == "+" || l[i + 1] == "-") {
|
if i + 1 < len_l && (l[i + 1] == "+" || l[i + 1] == "-") {
|
||||||
// GMT+3
|
// GMT+3
|
||||||
@ -771,7 +786,7 @@ impl Parser {
|
|||||||
|
|
||||||
res.tzoffset = None;
|
res.tzoffset = None;
|
||||||
|
|
||||||
if self.info.get_utczone(&tzname) {
|
if self.info.utczone_index(&tzname) {
|
||||||
res.tzname = None;
|
res.tzname = None;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -802,7 +817,7 @@ impl Parser {
|
|||||||
Some(signal * (hour_offset.unwrap() * 3600 + min_offset.unwrap() * 60));
|
Some(signal * (hour_offset.unwrap() * 3600 + min_offset.unwrap() * 60));
|
||||||
|
|
||||||
let tzname = res.tzname.clone();
|
let tzname = res.tzname.clone();
|
||||||
if i + 5 < len_l && self.info.get_jump(&l[i + 2]) && l[i + 3] == "("
|
if i + 5 < len_l && self.info.jump_index(&l[i + 2]) && l[i + 3] == "("
|
||||||
&& l[i + 5] == ")" && 3 <= l[i + 4].len()
|
&& l[i + 5] == ")" && 3 <= l[i + 4].len()
|
||||||
&& self.could_be_tzname(res.hour, tzname, None, &l[i + 4])
|
&& self.could_be_tzname(res.hour, tzname, None, &l[i + 4])
|
||||||
{
|
{
|
||||||
@ -812,7 +827,7 @@ impl Parser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
i += 1;
|
i += 1;
|
||||||
} else if !(self.info.get_jump(&l[i]) || fuzzy) {
|
} else if !(self.info.jump_index(&l[i]) || fuzzy) {
|
||||||
return Err(ParseError::UnrecognizedToken(l[i].clone()));
|
return Err(ParseError::UnrecognizedToken(l[i].clone()));
|
||||||
} else {
|
} else {
|
||||||
skipped_idxs.push(i);
|
skipped_idxs.push(i);
|
||||||
@ -970,7 +985,7 @@ impl Parser {
|
|||||||
// TODO: Decompose this logic a bit
|
// TODO: Decompose this logic a bit
|
||||||
if ymd.len() == 3 && (len_li == 2 || len_li == 4) && res.hour.is_none()
|
if ymd.len() == 3 && (len_li == 2 || len_li == 4) && res.hour.is_none()
|
||||||
&& (idx + 1 >= len_l
|
&& (idx + 1 >= len_l
|
||||||
|| (tokens[idx + 1] != ":" && info.get_hms(&tokens[idx + 1]).is_none()))
|
|| (tokens[idx + 1] != ":" && info.hms_index(&tokens[idx + 1]).is_none()))
|
||||||
{
|
{
|
||||||
// 1990101T32[59]
|
// 1990101T32[59]
|
||||||
let s = &tokens[idx];
|
let s = &tokens[idx];
|
||||||
@ -1045,15 +1060,15 @@ impl Parser {
|
|||||||
let sep = &tokens[idx + 1];
|
let sep = &tokens[idx + 1];
|
||||||
ymd.append(value_repr.parse::<i32>().unwrap(), &value_repr, None)?;
|
ymd.append(value_repr.parse::<i32>().unwrap(), &value_repr, None)?;
|
||||||
|
|
||||||
if idx + 2 < len_l && !info.get_jump(&tokens[idx + 2]) {
|
if idx + 2 < len_l && !info.jump_index(&tokens[idx + 2]) {
|
||||||
if let Ok(val) = tokens[idx + 2].parse::<i32>() {
|
if let Ok(val) = tokens[idx + 2].parse::<i32>() {
|
||||||
ymd.append(val, &tokens[idx + 2], None)?;
|
ymd.append(val, &tokens[idx + 2], None)?;
|
||||||
} else if let Some(val) = info.get_month(&tokens[idx + 2]) {
|
} else if let Some(val) = info.month_index(&tokens[idx + 2]) {
|
||||||
ymd.append(val as i32, &tokens[idx + 2], Some(YMDLabel::Month))?;
|
ymd.append(val as i32, &tokens[idx + 2], Some(YMDLabel::Month))?;
|
||||||
}
|
}
|
||||||
|
|
||||||
if idx + 3 < len_l && &tokens[idx + 3] == sep {
|
if idx + 3 < len_l && &tokens[idx + 3] == sep {
|
||||||
if let Some(value) = info.get_month(&tokens[idx + 4]) {
|
if let Some(value) = info.month_index(&tokens[idx + 4]) {
|
||||||
ymd.append(value as i32, &tokens[idx + 4], Some(YMDLabel::Month))?;
|
ymd.append(value as i32, &tokens[idx + 4], Some(YMDLabel::Month))?;
|
||||||
} else {
|
} else {
|
||||||
if let Ok(val) = tokens[idx + 4].parse::<i32>() {
|
if let Ok(val) = tokens[idx + 4].parse::<i32>() {
|
||||||
@ -1070,10 +1085,10 @@ impl Parser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
idx += 1
|
idx += 1
|
||||||
} else if idx + 1 >= len_l || info.get_jump(&tokens[idx + 1]) {
|
} else if idx + 1 >= len_l || info.jump_index(&tokens[idx + 1]) {
|
||||||
if idx + 2 < len_l && info.get_ampm(&tokens[idx + 2]).is_some() {
|
if idx + 2 < len_l && info.ampm_index(&tokens[idx + 2]).is_some() {
|
||||||
let hour = value.to_i64().unwrap() as i32;
|
let hour = value.to_i64().unwrap() as i32;
|
||||||
let ampm = info.get_ampm(&tokens[idx + 2]).unwrap();
|
let ampm = info.ampm_index(&tokens[idx + 2]).unwrap();
|
||||||
res.hour = Some(self.adjust_ampm(hour, ampm));
|
res.hour = Some(self.adjust_ampm(hour, ampm));
|
||||||
idx += 1;
|
idx += 1;
|
||||||
} else {
|
} else {
|
||||||
@ -1081,12 +1096,12 @@ impl Parser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
idx += 1;
|
idx += 1;
|
||||||
} else if info.get_ampm(&tokens[idx + 1]).is_some()
|
} else if info.ampm_index(&tokens[idx + 1]).is_some()
|
||||||
&& (*ZERO <= value && value < *TWENTY_FOUR)
|
&& (*ZERO <= value && value < *TWENTY_FOUR)
|
||||||
{
|
{
|
||||||
// 12am
|
// 12am
|
||||||
let hour = value.to_i64().unwrap() as i32;
|
let hour = value.to_i64().unwrap() as i32;
|
||||||
res.hour = Some(self.adjust_ampm(hour, info.get_ampm(&tokens[idx + 1]).unwrap()));
|
res.hour = Some(self.adjust_ampm(hour, info.ampm_index(&tokens[idx + 1]).unwrap()));
|
||||||
idx += 1;
|
idx += 1;
|
||||||
} else if ymd.could_be_day(value.to_i64().unwrap() as i32) {
|
} else if ymd.could_be_day(value.to_i64().unwrap() as i32) {
|
||||||
ymd.append(value.to_i64().unwrap() as i32, &value_repr, None)?;
|
ymd.append(value.to_i64().unwrap() as i32, &value_repr, None)?;
|
||||||
@ -1145,16 +1160,16 @@ impl Parser {
|
|||||||
0
|
0
|
||||||
};
|
};
|
||||||
|
|
||||||
if idx + 1 < len_l && info.get_hms(&tokens[idx + 1]).is_some() {
|
if idx + 1 < len_l && info.hms_index(&tokens[idx + 1]).is_some() {
|
||||||
hms_idx = Some(idx + 1)
|
hms_idx = Some(idx + 1)
|
||||||
} else if allow_jump && idx + 2 < len_l && tokens[idx + 1] == " "
|
} else if allow_jump && idx + 2 < len_l && tokens[idx + 1] == " "
|
||||||
&& info.get_hms(&tokens[idx + 2]).is_some()
|
&& info.hms_index(&tokens[idx + 2]).is_some()
|
||||||
{
|
{
|
||||||
hms_idx = Some(idx + 2)
|
hms_idx = Some(idx + 2)
|
||||||
} else if idx > 0 && info.get_hms(&tokens[idx - 1]).is_some() {
|
} else if idx > 0 && info.hms_index(&tokens[idx - 1]).is_some() {
|
||||||
hms_idx = Some(idx - 1)
|
hms_idx = Some(idx - 1)
|
||||||
} else if len_l > 0 && idx > 0 && idx == len_l - 1 && tokens[idx - 1] == " "
|
} else if len_l > 0 && idx > 0 && idx == len_l - 1 && tokens[idx - 1] == " "
|
||||||
&& info.get_hms(&tokens[idx_minus_two]).is_some()
|
&& info.hms_index(&tokens[idx_minus_two]).is_some()
|
||||||
{
|
{
|
||||||
hms_idx = Some(idx - 2)
|
hms_idx = Some(idx - 2)
|
||||||
}
|
}
|
||||||
@ -1174,12 +1189,12 @@ impl Parser {
|
|||||||
} else if hms_index.unwrap() > idx {
|
} else if hms_index.unwrap() > idx {
|
||||||
(
|
(
|
||||||
hms_index.unwrap(),
|
hms_index.unwrap(),
|
||||||
info.get_hms(&tokens[hms_index.unwrap()]),
|
info.hms_index(&tokens[hms_index.unwrap()]),
|
||||||
)
|
)
|
||||||
} else {
|
} else {
|
||||||
(
|
(
|
||||||
idx,
|
idx,
|
||||||
info.get_hms(&tokens[hms_index.unwrap()]).map(|u| u + 1),
|
info.hms_index(&tokens[hms_index.unwrap()]).map(|u| u + 1),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user