Skip to content

Commit 214135d

Browse files
dragosmgjonkeane
andcommitted
ARROW-14848: [R] Implement bindings for lubridate's parse_date_time
This PR adds a partial implementation of `parse_date_time()`: * only parses the year, month, and day components (no hours, minutes and seconds yet) * does not support parsing of strings without separators (e.g. `"220912"` to `2022-09-12`) * `lubridate::parse_date_time()` infers the most likely `format` given `orders` (via `guess_formats()`), while the Arrow binding does not do any inference. Closes #12589 from dragosmg/parse_date_time Lead-authored-by: Dragoș Moldovan-Grünfeld <dragos.mold@gmail.com> Co-authored-by: Jonathan Keane <jkeane@gmail.com> Signed-off-by: Jonathan Keane <jkeane@gmail.com>
1 parent b264dca commit 214135d

4 files changed

Lines changed: 182 additions & 1 deletion

File tree

r/NEWS.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@
1616
specific language governing permissions and limitations
1717
under the License.
1818
-->
19+
# development version
20+
21+
* `lubridate::parse_date_time()` datetime parser:
22+
* currently parses only `orders` with year, month, and day components. In a future release `orders` support for other datetime components (such as hours, minutes, seconds, etc) will be added.
23+
* strings with no separators (e.g. `"20210917"`) could be ambiguous and are not yet supported.
24+
* the `orders` argument in the Arrow binding works as follows: `orders` are transformed into `formats`, which are then applied in turn. There is no `select_formats` parameter and, unlike in `lubridate::parse_date_time()`, no inference takes place.
1925

2026
# arrow 8.0.0.9000
2127

r/R/dplyr-datetime-helpers.R

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,3 +154,48 @@ binding_as_date_numeric <- function(x, origin = "1970-01-01") {
154154

155155
x
156156
}
157+
158+
# Translate the `orders` passed to `parse_date_time()` into the strptime
# formats that should be attempted.
#
# `orders` is a character vector of lubridate-style orders (e.g. "ymd") or
# strptime-like formats (e.g. "%d/%m/%Y"). Only orders made of a year, a month
# and a day component are supported.
#
# Returns an unnamed character vector of "-"-separated strptime formats, in the
# order in which they should be tried. Unsupported orders are ignored as long
# as at least one supported order was passed; otherwise the problem is
# signalled through `arrow_not_supported()`.
build_formats <- function(orders) {
  # only keep the letters and the underscore as separator -> allow the users to
  # pass strptime-like formats (with "%"). Processing is needed (instead of
  # passing formats as-is) due to the processing of the character vector in
  # parse_date_time()
  orders <- gsub("[^A-Za-z_]", "", orders)
  orders <- gsub("Y", "y", orders)

  supported_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym")
  unsupported_passed_orders <- setdiff(orders, supported_orders)
  # intersect() keeps the order of its first argument and drops duplicates, so
  # e.g. c("ymd", "%Y-%m-%d") yields a single "ymd"
  supported_passed_orders <- intersect(orders, supported_orders)

  # error only if there isn't at least one valid order we can try
  if (length(supported_passed_orders) == 0) {
    arrow_not_supported(
      paste0(
        oxford_paste(
          unsupported_passed_orders
        ),
        " `orders`"
      )
    )
  }

  # expand only the orders we know how to handle; expanding the raw `orders`
  # would depend on the helper returning nothing for unknown input and would
  # repeat the formats of duplicated orders
  formats_list <- lapply(supported_passed_orders, build_format_from_order)
  unlist(formats_list, use.names = FALSE)
}
184+
185+
# Expand one year/month/day order into every strptime format to attempt.
#
# `order` is a single string such as "ymd" or "dmy". Each letter is replaced by
# its candidate conversion specifications and every combination is returned,
# joined by "-", with the leftmost component varying fastest. Any string that
# is not one of the six year/month/day permutations yields `character(0)`.
build_format_from_order <- function(order) {
  known_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym")
  if (!order %in% known_orders) {
    return(character(0))
  }

  # candidate conversion specifications for each component letter
  candidates <- list(
    y = c("%y", "%Y"),
    m = c("%m", "%B", "%b"),
    d = "%d"
  )

  # pick the candidates in the sequence given by `order`, then build the grid
  # of all combinations (first column varies fastest)
  components <- strsplit(order, "")[[1]]
  combos <- expand.grid(candidates[components], stringsAsFactors = FALSE)

  paste(combos[[1]], combos[[2]], combos[[3]], sep = "-")
}

r/R/dplyr-funcs-datetime.R

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ register_bindings_datetime <- function() {
2424
register_bindings_duration()
2525
register_bindings_duration_constructor()
2626
register_bindings_duration_helpers()
27+
register_bindings_datetime_parsers()
2728
}
2829

2930
register_bindings_datetime_utility <- function() {
@@ -485,3 +486,38 @@ register_bindings_duration_helpers <- function() {
485486
abort("Duration in picoseconds not supported in Arrow.")
486487
})
487488
}
489+
490+
# Register the Arrow bindings for lubridate's datetime parsers.
register_bindings_datetime_parsers <- function() {
  # Partial binding for lubridate::parse_date_time():
  #   x      - the strings to parse
  #   orders - orders / strptime-like formats, expanded by build_formats()
  #   tz     - time zone attached to the parsed values (default "UTC")
  # Every format is attempted in turn and the attempts are combined with
  # "coalesce"; no format inference takes place.
  register_binding("parse_date_time", function(x,
                                               orders,
                                               tz = "UTC") {

    # each order is translated into possible formats
    formats <- build_formats(orders)

    # turn every run of separators (non-letters and non-numbers) into a single
    # "-", matching the "-"-separated formats, in one regex pass
    x <- call_binding("gsub", "[^A-Za-z0-9]+", "-", x)

    # TODO figure out how to parse strings that have no separators
    # https://issues.apache.org/jira/browse/ARROW-16446
    # we could insert separators at the "likely" positions, but it might be
    # tricky given the possible combinations between dmy formats + locale

    # build one strptime expression per format; `error_is_null = TRUE`
    # presumably turns a failed parse into null so the next format can be tried
    # NOTE(review): `unit = 0L` is presumably the "second" time unit - confirm
    parse_attempt_expressions <- lapply(unname(formats), function(format) {
      build_expr(
        "strptime",
        x,
        options = list(format = format, unit = 0L, error_is_null = TRUE)
      )
    })

    coalesce_output <- build_expr("coalesce", args = parse_attempt_expressions)

    build_expr("assume_timezone", coalesce_output, options = list(timezone = tz))
  })
}

r/tests/testthat/test-dplyr-funcs-datetime.R

Lines changed: 95 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1429,7 +1429,7 @@ test_that("make_difftime()", {
14291429
) %>%
14301430
collect(),
14311431
paste0("named `difftime` units other than: `second`, `minute`, `hour`,",
1432-
" `day`, and `week` not supported in Arrow.")
1432+
" `day`, and `week` not supported in Arrow.")
14331433
)
14341434
)
14351435

@@ -1621,3 +1621,97 @@ test_that("`as_datetime()`", {
16211621
regexp = "Float value 10.1 was truncated converting to int64"
16221622
)
16231623
})
1624+
1625+
# Checks parse_date_time() with the "ymd", "dmy" and "mdy" orders on strings
# that use a variety of separators, first with numeric months and then with
# month names. compare_dplyr_binding() is presumably the helper that runs the
# pipeline through both dplyr and Arrow and compares the results - confirm.
test_that("parse_date_time() works with year, month, and date components", {
1626+
# string processing requires RE2 library (not available on Windows with R 3.6)
1627+
skip_if_not_available("re2")
1628+
compare_dplyr_binding(
1629+
.input %>%
1630+
mutate(
1631+
parsed_date_ymd = parse_date_time(string_ymd, orders = "ymd"),
1632+
parsed_date_dmy = parse_date_time(string_dmy, orders = "dmy"),
1633+
parsed_date_mdy = parse_date_time(string_mdy, orders = "mdy")
1634+
) %>%
1635+
collect(),
1636+
tibble::tibble(
1637+
string_ymd = c(
1638+
"2021-09-1", "2021/09///2", "2021.09.03", "2021,09,4", "2021:09::5",
1639+
"2021 09 6", "21-09-07", "21/09/08", "21.09.9", "21,09,10", "21:09:11",
1640+
# not yet working for strings with no separators, like "20210917", "210918" or "2021Sep19";
1641+
# no separators combined with %b or %B are even more complicated (although
1642+
# they work in lubridate), not to mention locale
1643+
NA
1644+
),
1645+
string_dmy = c(
1646+
"1-09-2021", "2/09//2021", "03.09.2021", "04,09,2021", "5:::09:2021",
1647+
"6 09 2021", "07-09-21", "08/09/21", "9.09.21", "10,09,21", "11:09:21",
1648+
# not yet working for strings with no separators, like "10092021" or "100921"
1649+
NA
1650+
),
1651+
string_mdy = c(
1652+
"09-01-2021", "09/2/2021", "09.3.2021", "09,04,2021", "09:05:2021",
1653+
"09 6 2021", "09-7-21", "09/08/21", "09.9.21", "09,10,21", "09:11:21",
1654+
# not yet working for strings with no separators, like "09102021" or "091021"
1655+
NA
1656+
)
1657+
)
1658+
)
1659+
1660+
# locale (affecting "%b" and "%B" formats) does not work properly on Windows
1661+
# TODO revisit once https://issues.apache.org/jira/browse/ARROW-16443 is done
1662+
skip_on_os("windows")
1663+
compare_dplyr_binding(
1664+
.input %>%
1665+
mutate(
1666+
parsed_date_ymd = parse_date_time(string_ymd, orders = "ymd"),
1667+
parsed_date_dmy = parse_date_time(string_dmy, orders = "dmy"),
1668+
parsed_date_mdy = parse_date_time(string_mdy, orders = "mdy")
1669+
) %>%
1670+
collect(),
1671+
tibble::tibble(
1672+
string_ymd = c(
1673+
"2021 Sep 12", "2021 September 13", "21 Sep 14", "21 September 15", NA
1674+
),
1675+
string_dmy = c(
1676+
"12 Sep 2021", "13 September 2021", "14 Sep 21", "15 September 21", NA
1677+
),
1678+
string_mdy = c(
1679+
"Sep 12 2021", "September 13 2021", "Sep 14 21", "September 15 21", NA
1680+
)
1681+
)
1682+
)
1683+
})
1684+
1685+
# Checks that several `orders` can be passed at once, mixing a lubridate-style
# order with strptime-like formats, so that each input string is parsed by
# whichever of them matches.
test_that("parse_date_time() works with a mix of formats and orders", {
1686+
# string processing requires RE2 library (not available on Windows with R 3.6)
1687+
skip_if_not_available("re2")
1688+
# one string per order (ymd, dmy, mdy), each using different separators
test_df <- tibble(
1689+
string_combi = c("2021-09-1", "2/09//2021", "09.3.2021")
1690+
)
1691+
1692+
compare_dplyr_binding(
1693+
.input %>%
1694+
mutate(
1695+
date_from_string = parse_date_time(
1696+
string_combi,
1697+
# build_formats() strips everything but letters and "_" and lower-cases
# "Y", so the two strptime-like formats are treated as "dmy" and "mdy"
orders = c("ymd", "%d/%m//%Y", "%m.%d.%Y")
1698+
)
1699+
) %>%
1700+
collect(),
1701+
test_df
1702+
)
1703+
})
1704+
1705+
# Checks that an order with time components is reported as unsupported:
# "ymd_HMS" is not one of the year/month/day orders accepted by
# build_formats(), which then calls arrow_not_supported().
test_that("parse_date_time() doesn't work with hour, minutes, and second components", {
1706+
test_dates_times <- tibble(
1707+
date_times = c("09-01-17 12:34:56", NA)
1708+
)
1709+
1710+
# NOTE(review): the condition is expected as a warning rather than an error,
# presumably because mutate() on an Arrow table reports it that way - confirm
expect_warning(
1711+
test_dates_times %>%
1712+
arrow_table() %>%
1713+
mutate(parsed_date_ymd = parse_date_time(date_times, orders = "ymd_HMS")) %>%
1714+
collect(),
1715+
'"ymd_HMS" `orders` not supported in Arrow'
1716+
)
1717+
})

0 commit comments

Comments
 (0)