-
Notifications
You must be signed in to change notification settings - Fork 5
Open
Description
I'm still working on the ISO 8601 parser mentioned in #15.
I've made some good progress, but there is an issue with a rule that is unambiguous overall yet initially matches more than one production. In the example below, I expected the parser to do the following:
- Find digit4 (it does that correctly)
- Assign the digit4 to yearnum (it does that correctly)
- Find digit2 (it doesn't do that)
- Assign the digit2 to monthnum
- Find digit2
- Assign the digit2 to mdaynum
I'm not sure why it is finding digit3 instead of digit2. It should only find digit3 if that is the end of the string. Below is the code; the issue is with the first call to parser$parse().
library(rly)
# Names of the token types produced by the lexer's t_* rules.
TOKENS <- c("DIGIT", "DECIMALPOINT")
# Single-character literals that grammar rules may reference directly
# (e.g. 'W', 'T', ':' and '-'). Each literal is listed exactly once.
LITERALS <- c("W", "Z", "Q", "T", ":", "-")
# Concatenate the values of several symbols of a production into one string.
#
# x: vector of positions, each passed to p$get().
# p: production object exposing get(index); every requested value must be a
#    single character string.
# Returns a length-one character string ("" when x is empty).
p_collapse <- function(x, p) {
  # vapply() pins the result to one string per position, so an unexpected
  # value (NULL, list, longer vector) fails here instead of being pasted.
  pieces <- vapply(X = x, FUN = p$get, FUN.VALUE = character(1), USE.NAMES = FALSE)
  paste0(pieces, collapse = "")
}
# Merge the named lists held by the right-hand-side symbols of a production.
#
# p: production object exposing length() and get(index). Positions 2 up to
#    p$length() are read; position 1 is not read.
# Returns one list holding every named field seen, in first-seen order.
# Stops with a "mismatch with ..." error (after printing both lists) when the
# same field name appears with two different values.
set_value <- function(p) {
  merged <- list()
  n_symbols <- p$length()
  for (pos in seq_len(n_symbols - 1) + 1) {
    piece <- p$get(pos)
    for (key in names(piece)) {
      # First sighting of this field: record it and move on.
      if (!(key %in% names(merged))) {
        merged[[key]] <- piece[[key]]
        next
      }
      # Field already recorded: the two values have to agree.
      if (merged[[key]] != piece[[key]]) {
        print(merged)
        print(piece)
        stop(sprintf("mismatch with %s: %s vs %s", key, merged[[key]], piece[[key]]))
      }
    }
  }
  merged
}
# Lexer ####
# Lexer specification passed to rly::lex(). `tokens` and `literals` come from
# the file-level TOKENS / LITERALS vectors; each t_* member describes one
# lexing rule.
Lexer <-
R6::R6Class(
"Lexer",
public=list(
tokens=TOKENS,
literals=LITERALS,
# DIGIT: a single character in 0-9, so multi-digit numbers arrive as
# several DIGIT tokens.
t_DIGIT="[0-9]",
# DECIMALPOINT: character class covering '.' and ','.
t_DECIMALPOINT="[\\.,]",
#t_ignore = " \t",
# One or more newlines: advance the lexer's line counter by the number of
# characters matched, then return NULL (presumably so that no token is
# emitted for the newline -- confirm against the rly documentation).
t_newline = function(re='\\n+', t) {
t$lexer$lineno <- t$lexer$lineno + nchar(t$value)
return(NULL)
},
# Error rule: print the first element of the offending value, skip one
# character of input and return the token.
t_error = function(t) {
cat(sprintf("Illegal character '%s'", t$value[1]))
t$lexer$skip(1)
return(t)
}
)
)
# General parser support functions ####
# Rules shared by every parser variant: token/literal declarations, the
# digit-run helpers and the syntax-error handler. Each p_* function carries
# its production in the default value of `doc`.
l_parser_general <-
  list(
    tokens = TOKENS,
    literals = LITERALS,
    ## Digit helpers ####
    # Decimal separator followed by a digit run; stored under `fraction`.
    p_fraction = function(doc = "fraction : DECIMALPOINT multi_digit", p) {
      message("fraction")
      digits <- p$get(3)
      p$set(1, list(fraction = digits))
    },
    # A run of one to four digits; the value is passed through unchanged.
    p_multi_digit = function(doc = "multi_digit : DIGIT\n| digit2\n| digit3\n| digit4", p) {
      message("multi_digit")
      run <- p$get(2)
      p$set(1, run)
    },
    # Four digits: a three-digit run plus one more digit, pasted together.
    p_digit4 = function(doc = "digit4 : digit3 DIGIT", p) {
      message("digit4")
      joined <- p_collapse(2:3, p)
      p$set(1, joined)
    },
    # Three digits: a two-digit run plus one more digit, pasted together.
    p_digit3 = function(doc = "digit3 : digit2 DIGIT", p) {
      message("digit3")
      joined <- p_collapse(2:3, p)
      p$set(1, joined)
    },
    # Two digits pasted together.
    p_digit2 = function(doc = "digit2 : DIGIT DIGIT", p) {
      message("digit2")
      joined <- p_collapse(2:3, p)
      p$set(1, joined)
    },
    # Empty production: consumes nothing and tags the format as "basic".
    p_basic = function(doc = "basic : ", p) {
      p$set(1, list(iso_8601_format = "basic"))
    },
    ## Error reporting ####
    # With a token: print its value, the lexer input and a caret line padded
    # by lexpos - 1 spaces. With NULL: report an error at end of input.
    p_error = function(p) {
      if (!is.null(p)) {
        caret_pad <- strrep(" ", p$lexpos - 1)
        report <- sprintf(
          "Syntax error at '%s'\n%s\n%s^",
          p$value, p$lexer$lexdata, caret_pad
        )
        cat(report)
      } else {
        cat("Syntax error at EOF")
      }
    }
  )
# Specific numbers ####
# Rules that give a digit run its meaning. Every rule logs its own name with
# message() and wraps the matched digits in a one-element named list.
l_specific_numbers <-
  list(
    # Four digits stored as `year`.
    p_yearnum = function(doc = "yearnum : digit4", p) {
      message("yearnum")
      digits <- p$get(2)
      p$set(1, list(year = digits))
    },
    # Two digits stored as `month`.
    p_monthnum = function(doc = "monthnum : digit2", p) {
      message("monthnum")
      digits <- p$get(2)
      p$set(1, list(month = digits))
    },
    # Two digits stored as `mday`.
    p_mdaynum = function(doc = "mdaynum : digit2", p) {
      message("mdaynum")
      digits <- p$get(2)
      p$set(1, list(mday = digits))
    },
    # Two digits stored as `week`.
    p_weeknum = function(doc = "weeknum : digit2", p) {
      message("weeknum")
      digits <- p$get(2)
      p$set(1, list(week = digits))
    },
    # One digit stored as `weekday`.
    p_weekdaynum = function(doc = "weekdaynum : DIGIT", p) {
      message("weekdaynum")
      digits <- p$get(2)
      p$set(1, list(weekday = digits))
    },
    # Three digits stored as `oday`.
    p_odaynum = function(doc = "odaynum : digit3", p) {
      message("odaynum")
      digits <- p$get(2)
      p$set(1, list(oday = digits))
    },
    # Two digits stored as `hour`.
    p_hournum = function(doc = "hournum : digit2", p) {
      message("hournum")
      digits <- p$get(2)
      p$set(1, list(hour = digits))
    },
    # Two digits stored as `minute`.
    p_minutenum = function(doc = "minutenum : digit2", p) {
      message("minutenum")
      digits <- p$get(2)
      p$set(1, list(minute = digits))
    },
    # Two digits stored as `second`.
    p_secondnum = function(doc = "secondnum : digit2", p) {
      message("secondnum")
      digits <- p$get(2)
      p$set(1, list(second = digits))
    }
  )
# Extended Parser ####
# Grammar rules for ISO 8601 dates and times. Every p_* function carries its
# production(s) in the default value of `doc`, logs its own name with
# message(), and stores its result in position 1 of `p`. Most rules store
# set_value(p), i.e. the merged named lists of their right-hand-side symbols.
#
# NOTE(review): for basic-format input such as "20201101" the trace printed by
# the reprex shows digit2 -> digit3 followed by a syntax error. This looks like
# a shift/reduce conflict: after `yearnum basic` and two digits, one token of
# lookahead cannot distinguish `monthnum : digit2` from the start of
# `digit3 : digit2 DIGIT` (needed by `odaynum`), and the parser appears to
# shift. TODO confirm with rly's debug output.
l_extended_iso8601 <-
list(
# Top-level rule; it is the first entry of this list (presumably what makes it
# the start symbol -- confirm against the rly documentation).
p_date=function(doc="date : year", p) {
part <- "date"
message(part)
p$set(1, set_value(p))
},
# A year alone, with a fraction, or followed by a sub-year part in basic
# (no separator) or extended ('-') form.
p_year=function(doc="year : yearnum
| yearnum fraction
| yearnum basic subyear
| yearnum dash subyear", p) {
part <- "year"
message(part)
p$set(1, set_value(p))
},
# What may follow the year: month, week or ordinal day.
p_subyear=function(doc="subyear : month
| week
| oday", p) {
part <- "subyear"
message(part)
p$set(1, set_value(p))
},
# A month alone, with a fraction, or followed by a day of the month.
p_month=function(doc="month : monthnum
| monthnum fraction
| monthnum basic mday
| monthnum dash mday", p) {
part <- "month"
message(part)
p$set(1, set_value(p))
},
# A day of the month alone, with a fraction, or followed by a time part.
p_mday=function(doc="mday : mdaynum
| mdaynum fraction
| mdaynum subday", p) {
part <- "mday"
message(part)
p$set(1, set_value(p))
},
# A 'W'-prefixed week alone, with a fraction, or followed by a weekday.
p_week=function(doc="week : week_w weeknum
| week_w weeknum fraction
| week_w weeknum basic weekday
| week_w weeknum dash weekday", p) {
part <- "week"
message(part)
p$set(1, set_value(p))
},
# The literal 'W'; stores an empty list, so it adds no fields to the result.
p_week_w=function(doc="week_w : 'W'", p) {
part <- "week_w"
message(part)
p$set(1, list())
},
# A weekday alone, with a fraction, or followed by a time part.
p_weekday=function(doc="weekday : weekdaynum
| weekdaynum fraction
| weekdaynum subday", p) {
part <- "weekday"
message(part)
p$set(1, set_value(p))
},
# An ordinal day alone, with a fraction, or followed by a time part.
p_oday=function(doc="oday : odaynum
| odaynum fraction
| odaynum subday", p) {
part <- "oday"
message(part)
p$set(1, set_value(p))
},
# The time part of a date-time; only the 'T'-prefixed form is accepted here.
p_subday=function(doc="subday : time_with_t", p) {
part <- "subday"
message(part)
p$set(1, set_value(p))
},
# NOTE(review): no other production in this list references `time`, which
# matches the "Rule time defined, but not used" warning shown in the reprex.
p_time=function(doc="time : time_with_t
| time_without_t", p) {
# if just hour is given, it must be preceded by 'T'
part <- "time"
message(part)
p$set(1, set_value(p))
},
# 'T' followed by an hour, an hour with a fraction, or a longer time.
p_time_with_t=function(doc="time_with_t : time_t hournum
| time_t hournum fraction
| time_t time_without_t", p) {
part <- "time_with_t"
message(part)
p$set(1, set_value(p))
},
# The literal 'T'; stores an empty list, so it adds no fields to the result.
p_time_t=function(doc="time_t : 'T'", p) {
part <- "time_t"
message(part)
p$set(1, list())
},
# Hour followed by minutes, in basic or extended (':') form.
p_time_without_t=function(doc="time_without_t : hournum basic minute
| hournum colon minute", p) {
part <- "time_without_t"
message(part)
p$set(1, set_value(p))
},
# Minutes alone, with a fraction, or followed by seconds.
p_minute=function(doc="minute : minutenum
| minutenum fraction
| minutenum basic second
| minutenum colon second", p) {
part <- "minute"
message(part)
p$set(1, set_value(p))
},
# Seconds alone or with a fraction.
p_second=function(doc="second : secondnum
| secondnum fraction", p) {
part <- "second"
message(part)
p$set(1, set_value(p))
},
# The '-' separator; tags the input as extended format.
p_dash=function(doc="dash : '-'", p) {
part <- "dash"
message(part)
p$set(1, list(iso_8601_format="extended"))
},
# The ':' separator; tags the input as extended format.
p_colon=function(doc="colon : ':'", p) {
part <- "colon"
message(part)
p$set(1, list(iso_8601_format="extended"))
}
)
# Parser specification passed to rly::yacc(): the public members are the three
# rule lists joined in order (date/time grammar, number rules, shared helpers).
Parser <-
  R6::R6Class(
    "Basic Parser",
    public = c(l_extended_iso8601, l_specific_numbers, l_parser_general)
  )
# Build the lexer and the parser from the two R6 class definitions. The
# parser build is what emits the "Rule time defined, but not used" warnings.
lexer <- rly::lex(Lexer)
parser <- rly::yacc(Parser)
#> WARN [2021-11-12 13:45:33] Rule time defined, but not used
#> WARN [2021-11-12 13:45:33] There is 1 unused rule
#> WARN [2021-11-12 13:45:33] Symbol time is unreachable
# Basic-format calendar date (YYYYMMDD). This is the failing case: the trace
# goes digit2 -> digit3 after the year and ends in a syntax error.
parser$parse("20201101", lexer)
#> digit2
#> digit3
#> digit4
#> yearnum
#> digit2
#> digit3
#> Syntax error at '1'
#> 20201101
#> ^
#> NULL
# Basic-format ordinal date (YYYYDDD): parses into year and oday.
parser$parse("2020110", lexer)
#> digit2
#> digit3
#> digit4
#> yearnum
#> digit2
#> digit3
#> odaynum
#> oday
#> subyear
#> year
#> date
#> $year
#> [1] "2020"
#>
#> $iso_8601_format
#> [1] "basic"
#>
#> $oday
#> [1] "110"
# Extended-format calendar date (YYYY-MM-DD): parses into year, month and mday.
parser$parse("2020-11-01", lexer)
#> digit2
#> digit3
#> digit4
#> yearnum
#> dash
#> digit2
#> monthnum
#> dash
#> digit2
#> mdaynum
#> mday
#> month
#> subyear
#> year
#> date
#> $year
#> [1] "2020"
#>
#> $iso_8601_format
#> [1] "extended"
#>
#> $month
#> [1] "11"
#>
#> $mday
#> [1] "01"
# Extended-format ordinal date (YYYY-DDD): parses into year and oday.
parser$parse("2020-110", lexer)
#> digit2
#> digit3
#> digit4
#> yearnum
#> dash
#> digit2
#> digit3
#> odaynum
#> oday
#> subyear
#> year
#> date
#> $year
#> [1] "2020"
#>
#> $iso_8601_format
#> [1] "extended"
#>
#> $oday
#> [1] "110"

Created on 2021-11-12 by the reprex package (v2.0.1)
Metadata
Metadata
Assignees
Labels
No labels