Title: | Clean and Standardize Epidemiological Data |
---|---|
Description: | A package for cleaning and standardizing tabular data, tailored specifically for curating epidemiological data. It streamlines various data cleaning tasks that are typically expected when working with datasets in epidemiology. It returns the processed data in the same format, and generates a comprehensive report detailing the outcomes of each cleaning task. |
Authors: | Karim Mané [aut, cre] |
Maintainer: | Karim Mané <[email protected]> |
License: | MIT + file LICENSE |
Version: | 1.1.0.9000 |
Built: | 2025-03-21 12:28:39 UTC |
Source: | https://github.com/epiverse-trace/cleanepi |
Add an element to the data dictionary
add_to_dictionary(dictionary, option, value, grp, order = NULL)
add_to_dictionary(dictionary, option, value, grp, order = NULL)
dictionary |
A
|
option |
A |
value |
A |
grp |
A |
order |
A |
A <data.frame>
. This is the new data dictionary with
an additional line that contains the details about the new options.
test <- add_to_dictionary( dictionary = readRDS( system.file("extdata", "test_dict.RDS", package = "cleanepi") ), option = "ml", value = "male", grp = "gender", order = NULL )
test <- add_to_dictionary( dictionary = readRDS( system.file("extdata", "test_dict.RDS", package = "cleanepi") ), option = "ml", value = "male", grp = "gender", order = NULL )
Add an element to the report object
add_to_report(x, key, value = NULL)
add_to_report(x, key, value = NULL)
x |
A |
key |
A |
value |
The object to add to the report object |
The input <data.frame>
or <linelist>
with an
additional element to the report.
# scan through the data scan_res <- scan_data( data = readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")) ) # Perform data cleaning cleaned_data <- clean_data( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), to_numeric = list(target_columns = "sex", lang = "en"), dictionary = NULL ) # add the data scanning result to the report cleaned_data <- add_to_report( x = cleaned_data, key = "scanning_result", value = scan_res )
# scan through the data scan_res <- scan_data( data = readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")) ) # Perform data cleaning cleaned_data <- clean_data( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), to_numeric = list(target_columns = "sex", lang = "en"), dictionary = NULL ) # add the data scanning result to the report cleaned_data <- add_to_report( x = cleaned_data, key = "scanning_result", value = scan_res )
Checks whether a date sequence in a vector of specified columns is in chronological order or not.
check_date_sequence(data, target_columns)
check_date_sequence(data, target_columns)
data |
The input |
target_columns |
A |
The input dataset. When found, the incorrect date sequences will be
stored in the report and can be accessed using
attr(data, "report")
.
# import the data data <- readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")) # standardize the date values data <- data %>% standardize_dates( target_columns = c("date_first_pcr_positive_test", "date.of.admission"), error_tolerance = 0.4, format = NULL, timeframe = NULL ) # check the date sequence in two columns good_date_sequence <- check_date_sequence( data = data, target_columns = c("date_first_pcr_positive_test", "date.of.admission") )
# import the data data <- readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")) # standardize the date values data <- data %>% standardize_dates( target_columns = c("date_first_pcr_positive_test", "date.of.admission"), error_tolerance = 0.4, format = NULL, timeframe = NULL ) # check the date sequence in two columns good_date_sequence <- check_date_sequence( data = data, target_columns = c("date_first_pcr_positive_test", "date.of.admission") )
Check whether the subject IDs comply with the expected format. When incorrect
IDs are found, the function sends a warning and the user can call the
correct_subject_ids
function to correct them.
check_subject_ids( data, target_columns, prefix = NULL, suffix = NULL, range = NULL, nchar = NULL )
check_subject_ids( data, target_columns, prefix = NULL, suffix = NULL, range = NULL, nchar = NULL )
data |
The input |
target_columns |
A |
prefix |
A |
suffix |
A |
range |
A |
nchar |
An |
The input dataset with a warning if incorrect subject ids were found
dat <- check_subject_ids( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), target_columns = "study_id", prefix = "PS", suffix = "P2", range = c(1, 100), nchar = 7 )
dat <- check_subject_ids( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), target_columns = "study_id", prefix = "PS", suffix = "P2", range = c(1, 100), nchar = 7 )
Cleans up messy data frames by performing several operations. These include among others: cleaning of column names, detecting and removing duplicates, empty records and columns, constant columns, replacing missing values by NA, converting character columns into dates when they contain a certain number of date values, detecting subject IDs with wrong formats, etc.
clean_data(data, ...)
clean_data(data, ...)
data |
The input |
... |
A
|
The cleaned input data according to the user-specified parameters.
This is associated with a data cleaning report that can be accessed using
attr(cleaned_data, "report")
# Parameters for column names standardization standardize_column_names <- list(keep = NULL, rename = NULL) # parameters to remove constant columns, empty rows and columns remove_constants <- list(cutoff = 1) # Parameters for substituting missing values with NA: replace_missing_values <- list(target_columns = NULL, na_strings = "-99") # Parameters for duplicates removal across all columns remove_duplicates <- list(target_columns = NULL) # Parameters for dates standardization standardize_dates <- list( target_columns = NULL, error_tolerance = 0.4, format = NULL, timeframe = as.Date(c("1973-05-29", "2023-05-29")), orders = list( world_named_months = c("Ybd", "dby"), world_digit_months = c("dmy", "Ymd"), US_formats = c("Omdy", "YOmd") ) ) # Parameters for subject IDs standardization standardize_subject_ids <- list( target_columns = "study_id", prefix = "PS", suffix = "P2", range = c(1, 100), nchar = 7 ) # convert the 'sex' column into numeric to_numeric <- list(target_columns = "sex", lang = "en") # the dictionary-based cleaning will not be performed here dictionary = NULL # no need to check for the sequence of date events check_date_sequence <- NULL cleaned_data <- clean_data( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), standardize_column_names = standardize_column_names, remove_constants = remove_constants, replace_missing_values = replace_missing_values, remove_duplicates = remove_duplicates, standardize_dates = standardize_dates, standardize_subject_ids = standardize_subject_ids, to_numeric = to_numeric, dictionary = NULL, check_date_sequence = NULL )
# Parameters for column names standardization standardize_column_names <- list(keep = NULL, rename = NULL) # parameters to remove constant columns, empty rows and columns remove_constants <- list(cutoff = 1) # Parameters for substituting missing values with NA: replace_missing_values <- list(target_columns = NULL, na_strings = "-99") # Parameters for duplicates removal across all columns remove_duplicates <- list(target_columns = NULL) # Parameters for dates standardization standardize_dates <- list( target_columns = NULL, error_tolerance = 0.4, format = NULL, timeframe = as.Date(c("1973-05-29", "2023-05-29")), orders = list( world_named_months = c("Ybd", "dby"), world_digit_months = c("dmy", "Ymd"), US_formats = c("Omdy", "YOmd") ) ) # Parameters for subject IDs standardization standardize_subject_ids <- list( target_columns = "study_id", prefix = "PS", suffix = "P2", range = c(1, 100), nchar = 7 ) # convert the 'sex' column into numeric to_numeric <- list(target_columns = "sex", lang = "en") # the dictionary-based cleaning will not be performed here dictionary = NULL # no need to check for the sequence of date events check_date_sequence <- NULL cleaned_data <- clean_data( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), standardize_column_names = standardize_column_names, remove_constants = remove_constants, replace_missing_values = replace_missing_values, remove_duplicates = remove_duplicates, standardize_dates = standardize_dates, standardize_subject_ids = standardize_subject_ids, to_numeric = to_numeric, dictionary = NULL, check_date_sequence = NULL )
Perform dictionary-based cleaning
clean_using_dictionary(data, dictionary)
clean_using_dictionary(data, dictionary)
data |
The input |
dictionary |
A
|
A <data.frame>
or <linelist>
where the target options
have been replaced with their corresponding values in the columns
specified in the data dictionary.
data <- readRDS( system.file("extdata", "messy_data.RDS", package = "cleanepi") ) dictionary <- readRDS( system.file("extdata", "test_dict.RDS", package = "cleanepi") ) # adding an option that is not defined in the dictionary to the 'gender' # column data$gender[2] <- "homme" cleaned_df <- clean_using_dictionary( data = data, dictionary = dictionary )
data <- readRDS( system.file("extdata", "messy_data.RDS", package = "cleanepi") ) dictionary <- readRDS( system.file("extdata", "test_dict.RDS", package = "cleanepi") ) # adding an option that is not defined in the dictionary to the 'gender' # column data$gender[2] <- "homme" cleaned_df <- clean_using_dictionary( data = data, dictionary = dictionary )
This vector contains common values of NA (missing) and is intended for
use within the {cleanepi} function replace_missing_values()
.
The current list of strings used can be found by printing out
common_na_strings
. It serves as a helpful tool to explore your data
for possible missing values. However, I strongly caution against using
this to replace NA
values without meticulously examining the
incidence for each case. Please note that common_na_strings
utilizes
\\
around the "?", ".", and "*" characters to prevent their wildcard
interpretation in regular expressions.
common_na_strings
common_na_strings
A vector of 35 character strings.
This vector is a combination of naniar::common_na_strings
(https://github.com/njtierney/naniar/) and other strings found in the
literature.
Convert numeric to date
convert_numeric_to_date(data, target_columns, ref_date, forward = TRUE)
convert_numeric_to_date(data, target_columns, ref_date, forward = TRUE)
data |
The input |
target_columns |
A |
ref_date |
A |
forward |
A |
A <data.frame>
or <linelist>
where the columns of
interest are updated.
data <- readRDS(system.file("extdata", "test_df1.RDS", package = "cleanepi")) data <- convert_numeric_to_date( data = data, target_columns = "recruited_on_day", ref_date = as.Date("2022-10-13"), forward = TRUE )
data <- readRDS(system.file("extdata", "test_df1.RDS", package = "cleanepi")) data <- convert_numeric_to_date( data = data, target_columns = "recruited_on_day", ref_date = as.Date("2022-10-13"), forward = TRUE )
When this function is invoked without specifying the column names to be
converted, the target columns are the ones returned by the scan_data()
function. Furthermore, it identifies columns where the proportion of numeric
values is at least twice the percentage of character values and performs the
conversion in them. The function internally calls the main function
from the {numberize} package.
convert_to_numeric(data, target_columns = NULL, lang = c("en", "fr", "es"))
convert_to_numeric(data, target_columns = NULL, lang = c("en", "fr", "es"))
data |
The input |
target_columns |
A |
lang |
A |
A <data.frame>
or <linelist>
wherein all the specified
or detected columns have been transformed into numeric format after the
conversion process.
dat <- convert_to_numeric( data = readRDS( system.file("extdata", "messy_data.RDS", package = "cleanepi") ), target_columns = "age", lang = "en" )
dat <- convert_to_numeric( data = readRDS( system.file("extdata", "messy_data.RDS", package = "cleanepi") ), target_columns = "age", lang = "en" )
After detecting incorrect subject IDs from the check_subject_ids()
function, use this function to provide the correct IDs and perform the
substitution.
correct_subject_ids(data, target_columns, correction_table)
correct_subject_ids(data, target_columns, correction_table)
data |
The input |
target_columns |
A |
correction_table |
A
|
The input dataset where all subject ids comply with the expected format.
# detect the incorrect subject ids dat <- check_subject_ids( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), target_columns = "study_id", prefix = "PS", suffix = "P2", range = c(1, 100), nchar = 7 ) # generate the correction table correction_table <- data.frame( from = c("P0005P2", "PB500P2", "PS004P2-1"), to = c("PB005P2", "PB050P2", "PS004P2") ) # perform the correction dat <- correct_subject_ids( data = dat, target_columns = "study_id", correction_table = correction_table )
# detect the incorrect subject ids dat <- check_subject_ids( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), target_columns = "study_id", prefix = "PS", suffix = "P2", range = c(1, 100), nchar = 7 ) # generate the correction table correction_table <- data.frame( from = c("P0005P2", "PB500P2", "PS004P2-1"), to = c("PB005P2", "PB050P2", "PS004P2") ) # perform the correction dat <- correct_subject_ids( data = dat, target_columns = "study_id", correction_table = correction_table )
Identify and return duplicated rows in a data frame or linelist.
find_duplicates(data, target_columns = NULL)
find_duplicates(data, target_columns = NULL)
data |
The input |
target_columns |
A |
A <data.frame>
or <linelist>
of all duplicated rows
with the following 2 additional columns:
The indices of the duplicated rows from the input data. Users can choose from these indices, which row they consider as redundant in each group of duplicates.
a unique identifier associated to each group of duplicates.
dups <- find_duplicates( data = readRDS( system.file("extdata", "test_linelist.RDS", package = "cleanepi") ), target_columns = c("dt_onset", "dt_report", "sex", "outcome") )
dups <- find_duplicates( data = readRDS( system.file("extdata", "test_linelist.RDS", package = "cleanepi") ), target_columns = c("dt_onset", "dt_report", "sex", "outcome") )
clean_data default parameters
When the clean_data()
function is called without any argument, these
default values provided to the function's arguments will be applied on the
input data. By default, operations that require the target columns to be
specified by the user will not be performed. The default cleaning operations
include: i) standardizing column names, ii) detecting and removing
duplicates, and iii) removing constant data.
get_default_params()
get_default_params()
A <list>
of the default cleaning parameters.
default_params <- get_default_params()
default_params <- get_default_params()
Generate report from data cleaning operations
print_report( data, report_title = "{cleanepi} data cleaning report", output_file_name = NULL, format = "html", print = TRUE )
print_report( data, report_title = "{cleanepi} data cleaning report", output_file_name = NULL, format = "html", print = TRUE )
data |
A |
report_title |
A |
output_file_name |
A |
format |
A |
print |
A |
A <character>
containing the name and path of the saved
report
data <- readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")) test_dictionary <- readRDS( system.file("extdata", "test_dictionary.RDS", package = "cleanepi") ) # scan through the data scan_res <- scan_data(data) # Perform data cleaning cleaned_data <- data %>% standardize_column_names(keep = NULL, rename = c("DOB" = "dateOfBirth")) %>% replace_missing_values(target_columns = NULL, na_strings = "-99") %>% remove_constants(cutoff = 1.0) %>% remove_duplicates(target_columns = NULL) %>% standardize_dates( target_columns = NULL, error_tolerance = 0.4, format = NULL, timeframe = as.Date(c("1973-05-29", "2023-05-29")) ) %>% check_subject_ids( target_columns = "study_id", prefix = "PS", suffix = "P2", range = c(1L, 100L), nchar = 7L ) %>% convert_to_numeric(target_columns = "sex", lang = "en") %>% clean_using_dictionary(dictionary = test_dictionary) # add the data scanning result to the report cleaned_data <- add_to_report( x = cleaned_data, key = "scanning_result", value = scan_res ) # save a report in the current directory using the previously-created objects print_report( data = cleaned_data, report_title = "{cleanepi} data cleaning report", output_file_name = NULL, format = "html", print = TRUE )
data <- readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")) test_dictionary <- readRDS( system.file("extdata", "test_dictionary.RDS", package = "cleanepi") ) # scan through the data scan_res <- scan_data(data) # Perform data cleaning cleaned_data <- data %>% standardize_column_names(keep = NULL, rename = c("DOB" = "dateOfBirth")) %>% replace_missing_values(target_columns = NULL, na_strings = "-99") %>% remove_constants(cutoff = 1.0) %>% remove_duplicates(target_columns = NULL) %>% standardize_dates( target_columns = NULL, error_tolerance = 0.4, format = NULL, timeframe = as.Date(c("1973-05-29", "2023-05-29")) ) %>% check_subject_ids( target_columns = "study_id", prefix = "PS", suffix = "P2", range = c(1L, 100L), nchar = 7L ) %>% convert_to_numeric(target_columns = "sex", lang = "en") %>% clean_using_dictionary(dictionary = test_dictionary) # add the data scanning result to the report cleaned_data <- add_to_report( x = cleaned_data, key = "scanning_result", value = scan_res ) # save a report in the current directory using the previously-created objects print_report( data = cleaned_data, report_title = "{cleanepi} data cleaning report", output_file_name = NULL, format = "html", print = TRUE )
The function iteratively removes constant data until none remain. It records details of the removed constant data as a data frame within the report object.
remove_constants(data, cutoff = 1)
remove_constants(data, cutoff = 1)
data |
The input |
cutoff |
A |
The input dataset with empty rows, empty columns, and constant columns removed.
data <- readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")) # introduce an empty column data$empty_column <- NA # inject some missing values across some columns data$study_id[3] = NA_character_ data$date.of.admission[3] = NA_character_ data$date.of.admission[4] = NA_character_ data$dateOfBirth[3] = NA_character_ data$dateOfBirth[4] = NA_character_ data$dateOfBirth[5] = NA_character_ # with cutoff = 1, line 3, 4, and 5 are not removed test <- remove_constants( data = data, cutoff = 1 ) # drop rows or columns with a percentage of constant values # equal to or more than 50% test <- remove_constants( data = test, cutoff = 0.5 ) # drop rows or columns with a percentage of constant values # equal to or more than 25% test <- remove_constants( data = test, cutoff = 0.25 ) # drop rows or columns with a percentage of constant values # equal to or more than 15% test <- remove_constants( data = test, cutoff = 0.15 ) # check the report to see what has happened report <- attr(test, "report") report$constant_data
data <- readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")) # introduce an empty column data$empty_column <- NA # inject some missing values across some columns data$study_id[3] = NA_character_ data$date.of.admission[3] = NA_character_ data$date.of.admission[4] = NA_character_ data$dateOfBirth[3] = NA_character_ data$dateOfBirth[4] = NA_character_ data$dateOfBirth[5] = NA_character_ # with cutoff = 1, line 3, 4, and 5 are not removed test <- remove_constants( data = data, cutoff = 1 ) # drop rows or columns with a percentage of constant values # equal to or more than 50% test <- remove_constants( data = test, cutoff = 0.5 ) # drop rows or columns with a percentage of constant values # equal to or more than 25% test <- remove_constants( data = test, cutoff = 0.25 ) # drop rows or columns with a percentage of constant values # equal to or more than 15% test <- remove_constants( data = test, cutoff = 0.15 ) # check the report to see what has happened report <- attr(test, "report") report$constant_data
When removing duplicates, users can specify a set of columns to consider with
the target_columns
argument.
remove_duplicates(data, target_columns = NULL)
remove_duplicates(data, target_columns = NULL)
data |
The input |
target_columns |
A |
The input data <data.frame>
or <linelist>
without the
duplicated rows identified from all or the specified columns.
no_dups <- remove_duplicates( data = readRDS( system.file("extdata", "test_linelist.RDS", package = "cleanepi") ), target_columns = "linelist_tags" )
no_dups <- remove_duplicates( data = readRDS( system.file("extdata", "test_linelist.RDS", package = "cleanepi") ), target_columns = "linelist_tags" )
Replace missing values with NA
replace_missing_values( data, target_columns = NULL, na_strings = cleanepi::common_na_strings )
replace_missing_values( data, target_columns = NULL, na_strings = cleanepi::common_na_strings )
data |
The input |
target_columns |
A |
na_strings |
A |
The input data where missing values are replaced by NA
.
cleaned_data <- replace_missing_values( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), target_columns = "sex", na_strings = "-99" )
cleaned_data <- replace_missing_values( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), target_columns = "sex", na_strings = "-99" )
Scan the character columns of a data frame to determine the proportion of missing, numeric, Date, character, and logical values.
The function checks for the existence of character columns in the data. When found, it reports back the proportion of the data types mentioned above in those columns. See the details section to know more about how it works.
scan_data(data)
scan_data(data)
data |
A |
How does it work?
The <character>
columns are identified first. If no <character>
columns are found, the function returns a message.
For each <character>
column, the function counts:
The number of missing values (NA
).
The number of numeric values. A process is initiated to detect valid dates
among these numeric values using lubridate::as_date()
and
date_guess()
functions. If valid dates are found, a warning is
triggered to alert about ambiguous numeric values potentially representing
dates. Note: A date is considered valid if it falls within the range
from today's date to 50 years in the past.
The detection of <Date>
values from non-numeric data using the
date_guess()
function. The total date count includes dates detected
from both numeric and non-numeric values. Due to overlap, the
sum of counts across rows in the scanning result may exceed 1.
The count of <logical>
values.
Remaining values are categorized as <character>
.
A <data.frame>
if the input data contains columns of type
character. It invisibly returns NA
otherwise. The returned data
frame will have the same number of rows as the number of character
columns, and six columns representing their column names, proportion of
missing, numeric, date, character, and logical values.
# scan through a data frame of characters scan_result <- scan_data( data = readRDS( system.file("extdata", "messy_data.RDS", package = "cleanepi") ) ) # scan through a data frame with two character columns scan_result <- scan_data( data = readRDS(system.file("extdata", "test_linelist.RDS", package = "cleanepi")) ) # scan through a data frame with no character columns data(iris) iris[["fct"]] <- as.factor(sample(c("gray", "orange"), nrow(iris), replace = TRUE)) iris[["lgl"]] <- sample(c(TRUE, FALSE), nrow(iris), replace = TRUE) iris[["date"]] <- as.Date(seq.Date(from = as.Date("2024-01-01"), to = as.Date("2024-08-30"), length.out = nrow(iris))) iris[["posit_ct"]] <- as.POSIXct(iris[["date"]]) scan_result <- scan_data(data = iris)
# scan through a data frame of characters scan_result <- scan_data( data = readRDS( system.file("extdata", "messy_data.RDS", package = "cleanepi") ) ) # scan through a data frame with two character columns scan_result <- scan_data( data = readRDS(system.file("extdata", "test_linelist.RDS", package = "cleanepi")) ) # scan through a data frame with no character columns data(iris) iris[["fct"]] <- as.factor(sample(c("gray", "orange"), nrow(iris), replace = TRUE)) iris[["lgl"]] <- sample(c(TRUE, FALSE), nrow(iris), replace = TRUE) iris[["date"]] <- as.Date(seq.Date(from = as.Date("2024-01-01"), to = as.Date("2024-08-30"), length.out = nrow(iris))) iris[["posit_ct"]] <- as.POSIXct(iris[["date"]]) scan_result <- scan_data(data = iris)
All column names will be reformatted to snake_case. When the
conversion to snake_case does not work as expected, use the keep
and/or
rename
arguments to reformat the column name properly.
standardize_column_names(data, keep = NULL, rename = NULL)
standardize_column_names(data, keep = NULL, rename = NULL)
data |
The input |
keep |
A |
rename |
A named |
A <data.frame>
or <linelist>
with easy to work with
column names.
# do not rename 'date.of.admission' cleaned_data <- standardize_column_names( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), keep = "date.of.admission" ) # do not rename 'date.of.admission', but rename 'dateOfBirth' and 'sex' to # 'DOB' and 'gender' respectively cleaned_data <- standardize_column_names( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), keep = "date.of.admission", rename = c(DOB = "dateOfBirth", gender = "sex") )
# do not rename 'date.of.admission' cleaned_data <- standardize_column_names( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), keep = "date.of.admission" ) # do not rename 'date.of.admission', but rename 'dateOfBirth' and 'sex' to # 'DOB' and 'gender' respectively cleaned_data <- standardize_column_names( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), keep = "date.of.admission", rename = c(DOB = "dateOfBirth", gender = "sex") )
When the format of the values in a column and/or the target columns are not
defined, we strongly recommend checking a few converted dates manually to
make sure that the dates extracted from a character
vector or a factor
are correct.
standardize_dates( data, target_columns = NULL, format = NULL, timeframe = NULL, error_tolerance = 0.4, orders = list(world_named_months = c("Ybd", "dby"), world_digit_months = c("dmy", "Ymd"), US_formats = c("Omdy", "YOmd")) )
standardize_dates( data, target_columns = NULL, format = NULL, timeframe = NULL, error_tolerance = 0.4, orders = list(world_named_months = c("Ybd", "dby"), world_digit_months = c("dmy", "Ymd"), US_formats = c("Omdy", "YOmd")) )
data |
The input |
target_columns |
A |
format |
A |
timeframe |
A |
error_tolerance |
A |
orders |
A list( quarter_partial_dates = c("Y", "Ym", "Yq"), world_digit_months = c("Yq", "ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm", "dmY", "mdY", "mYd", "dYm"), world_named_months = c("dby", "dyb", "bdy", "byd", "ybd", "ydb", "dbY", "dYb", "bdY", "bYd", "Ybd", "Ydb"), us_format = c("Omdy", "YOmd") ) |
Check for the presence of date values that could have multiple formats
from the $multi_format_dates
element of the report
.
Converting ambiguous character strings to dates is difficult for many reasons:
dates may not use the standard Ymd format
within the same variable, dates may follow different formats
dates may be mixed with things that are not dates
the behavior of as.Date
in the presence of non-date values is hard to predict,
sometimes returning NA
, sometimes issuing an error.
This function tries to address all the above issues. Dates with the following formats should be automatically detected, irrespective of separators (e.g. "-", " ", "/") and surrounding text:
"19 09 2018"
"2018 09 19"
"19 Sep 2018"
"2018 Sep 19"
"Sep 19 2018"
This function relies heavily on lubridate::parse_date_time()
, which is an
extremely flexible date parser that works well for consistent date formats,
but can quickly become unwieldy and may produce spurious results.
standardize_dates()
will use a list of formats in the orders
argument to
run parse_date_time()
with each format vector separately and take the first
correctly parsed date from all the trials.
With the default orders shown above, the dates 03 Jan 2018, 07/03/1982, and
08/20/85 are correctly interpreted as 2018-01-03, 1982-03-07, and 1985-08-20.
The examples section will show how you can manipulate the orders
to be
customized for your situation.
The input dataset where the date columns have been standardized. The date values that are out of the specified timeframe will be reported in the report. Similarly, date values that comply with multiple formats will also be featured in the report object.
x <- c("03 Jan 2018", "07/03/1982", "08/20/85") # The below will coerce values where the month is written in letters only # into Date. as.Date(lubridate::parse_date_time(x, orders = c("Ybd", "dby"))) # coerce values where the month is written in letters or numbers into Date. as.Date(lubridate::parse_date_time(x, orders = c("dmy", "Ymd"))) # How to use standardize_dates() dat <- standardize_dates( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), target_columns = "date_first_pcr_positive_test", format = NULL, timeframe = NULL, error_tolerance = 0.4, orders = list( world_named_months = c("Ybd", "dby"), world_digit_months = c("dmy", "Ymd"), US_format = c("Omdy", "YOmd") ) )
x <- c("03 Jan 2018", "07/03/1982", "08/20/85") # The below will coerce values where the month is written in letters only # into Date. as.Date(lubridate::parse_date_time(x, orders = c("Ybd", "dby"))) # coerce values where the month is written in letters or numbers into Date. as.Date(lubridate::parse_date_time(x, orders = c("dmy", "Ymd"))) # How to use standardize_dates() dat <- standardize_dates( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), target_columns = "date_first_pcr_positive_test", format = NULL, timeframe = NULL, error_tolerance = 0.4, orders = list( world_named_months = c("Ybd", "dby"), world_digit_months = c("dmy", "Ymd"), US_format = c("Omdy", "YOmd") ) )
Calculate time span between dates
timespan( data, target_column = NULL, end_date = Sys.Date(), span_unit = c("years", "months", "weeks", "days"), span_column_name = "span", span_remainder_unit = NULL )
timespan( data, target_column = NULL, end_date = Sys.Date(), span_unit = c("years", "months", "weeks", "days"), span_column_name = "span", span_remainder_unit = NULL )
data |
The input |
target_column |
A |
end_date |
The end date. It can be either a |
span_unit |
A |
span_column_name |
A |
span_remainder_unit |
A |
The input <data.frame>
with one or two additional columns:
span (or any other name chosen by the user): this column will contain the calculated time span in the desired units.
a column with the number of the remaining days or weeks or months depending on the value of the 'span_remainder_unit' parameter. The star represents here the value of the 'span_column_name' argument.
# In the below example, this function is used to calculate patient's age from # their dates of birth # import the data, replace missing values with NA and convert date into ISO # format data <- readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")) data <- data %>% replace_missing_values(target_columns = "dateOfBirth", na_strings = "-99") %>% standardize_dates(target_columns = "dateOfBirth", error_tolerance = 0.0) # calculate the age in 'years' and return the remainder in 'months' age <- timespan( data = data, target_column = "dateOfBirth", end_date = Sys.Date(), span_unit = "years", span_column_name = "age_in_years", span_remainder_unit = "months" )
# In the below example, this function is used to calculate patient's age from # their dates of birth # import the data, replace missing values with NA and convert date into ISO # format data <- readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")) data <- data %>% replace_missing_values(target_columns = "dateOfBirth", na_strings = "-99") %>% standardize_dates(target_columns = "dateOfBirth", error_tolerance = 0.0) # calculate the age in 'years' and return the remainder in 'months' age <- timespan( data = data, target_column = "dateOfBirth", end_date = Sys.Date(), span_unit = "years", span_column_name = "age_in_years", span_remainder_unit = "months" )