Title: | Clean and Standardize Epidemiological Data |
---|---|
Description: | A package for cleaning and standardizing tabular data, tailored specifically for curating epidemiological data. It streamlines various data cleaning tasks that are typically expected when working with datasets in epidemiology. It returns the processed data in the same format, ensuring seamless integration into existing workflows. Additionally, it generates a comprehensive report detailing the outcomes of each cleaning task. |
Authors: | Karim Mané [aut, cre] , Thibaut Jombart [ctb] (Thibaut contributed to the development of date_guess().), Abdoelnaser Degoot [aut] , Bankolé Ahadzie [aut], Nuredin Mohammed [aut], Bubacarr Bah [aut] , Hugo Gruson [ctb, rev] , Pratik R. Gupte [rev] , James M. Azam [rev] , Joshua W. Lambert [rev] , Chris Hartgerink [rev] , Andree Valle-Campos [rev, ctb], London School of Hygiene and Tropical Medicine, LSHTM [cph], data.org [fnd] |
Maintainer: | Karim Mané <[email protected]> |
License: | MIT + file LICENSE |
Version: | 1.0.2.9000 |
Built: | 2024-10-25 15:15:59 UTC |
Source: | https://github.com/epiverse-trace/cleanepi |
Add an element to the data dictionary
add_to_dictionary(dictionary, option, value, grp, order = NULL)
add_to_dictionary(dictionary, option, value, grp, order = NULL)
dictionary |
A data dictionary in a form of a data frame |
option |
A vector of strings with the new options that need to be added to the dictionary. |
value |
A vector with the values to be used when replacing the new options. |
grp |
A vector with the name of the column that contains the option of interest. |
order |
A numeric with the order of the new option. |
An object of type data frame. This is the new data dictionary with an additional line that contains the details about the new options.
test <- add_to_dictionary( dictionary = readRDS( system.file("extdata", "test_dict.RDS", package = "cleanepi") ), option = "ml", value = "male", grp = "gender", order = NULL )
test <- add_to_dictionary( dictionary = readRDS( system.file("extdata", "test_dict.RDS", package = "cleanepi") ), option = "ml", value = "male", grp = "gender", order = NULL )
Add an element to the report object
add_to_report(x, key, value = NULL)
add_to_report(x, key, value = NULL)
x |
A data frame or linelist |
key |
The name of the cleaning operation |
value |
The object to add to the report object |
The input report object with an additional element
# scan through the data scan_res <- scan_data( data = readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")) ) # Perform data cleaning cleaned_data <- clean_data( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), to_numeric = list(target_columns = "sex", lang = "en"), dictionary = NULL ) # add the data scanning result to the report cleaned_data <- add_to_report( x = cleaned_data, key = "scanning_result", value = scan_res )
# scan through the data scan_res <- scan_data( data = readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")) ) # Perform data cleaning cleaned_data <- clean_data( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), to_numeric = list(target_columns = "sex", lang = "en"), dictionary = NULL ) # add the data scanning result to the report cleaned_data <- add_to_report( x = cleaned_data, key = "scanning_result", value = scan_res )
Checks whether a date sequence in a vector of specified columns is in order or not.
check_date_sequence(data, target_columns)
check_date_sequence(data, target_columns)
data |
The input data frame or linelist |
target_columns |
A vector of column names for events. Users should
specify at least 2 column names in the expected order. For example:
target_columns = c("date_first_pcr_positive_test", "date.of.admission"). |
The input dataset. When found, the incorrect date sequences will be
stored in the report and can be accessed using
attr(data, "report")
.
# import the data data <- readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")) # standardize the date values data <- data %>% standardize_dates( target_columns = c("date_first_pcr_positive_test", "date.of.admission"), error_tolerance = 0.4, format = NULL, timeframe = NULL ) # check the date sequence in two columns good_date_sequence <- check_date_sequence( data = data, target_columns = c("date_first_pcr_positive_test", "date.of.admission") )
# import the data data <- readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")) # standardize the date values data <- data %>% standardize_dates( target_columns = c("date_first_pcr_positive_test", "date.of.admission"), error_tolerance = 0.4, format = NULL, timeframe = NULL ) # check the date sequence in two columns good_date_sequence <- check_date_sequence( data = data, target_columns = c("date_first_pcr_positive_test", "date.of.admission") )
Check whether the subject IDs comply with the expected format. When incorrect
IDs are found, the function sends a warning and the user can call the
correct_subject_ids()
function to correct them.
check_subject_ids( data, target_columns, prefix = NULL, suffix = NULL, range = NULL, nchar = NULL )
check_subject_ids( data, target_columns, prefix = NULL, suffix = NULL, range = NULL, nchar = NULL )
data |
The input data frame or linelist |
target_columns |
A vector of column names with the subject ids. |
prefix |
A prefix used in the subject IDs |
suffix |
A suffix used in the subject IDs |
range |
A vector with the range of numbers in the sample IDs |
nchar |
An integer that represents the expected number of characters in the subject ids. |
The input dataset with a warning if incorrect subject ids were found
dat <- check_subject_ids( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), target_columns = "study_id", prefix = "PS", suffix = "P2", range = c(1, 100), nchar = 7 )
dat <- check_subject_ids( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), target_columns = "study_id", prefix = "PS", suffix = "P2", range = c(1, 100), nchar = 7 )
Cleans up messy data frames by performing several operations. These include among others: cleaning of column names, detecting and removing duplicates, empty records and columns, constant columns, replacing missing values by NA, converting character columns into dates when they contain a certain number of date values, detecting subject IDs with wrong formats, etc.
clean_data(data, ...)
clean_data(data, ...)
data |
The input data frame or linelist |
... |
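A list of cleaning operations to be applied on the input data.
The acceptable arguments for ... are named after the cleaning operations, as illustrated in the example below: standardize_column_names, remove_constants, replace_missing_values, remove_duplicates, standardize_dates, standardize_subject_ids, to_numeric, dictionary, and check_date_sequence.
|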
The cleaned input data according to the user-specified parameters.
This is associated with a data cleaning report that can be accessed using
attr(cleaned_data, "report")
# Parameters for column names standardization standardize_column_names <- list(keep = NULL, rename = NULL) # parameters to remove constant columns, empty rows and columns remove_constants <- list(cutoff = 1) # Parameters for substituting missing values with NA: replace_missing_values <- list(target_columns = NULL, na_strings = "-99") # Parameters for duplicates removal across all columns remove_duplicates <- list(target_columns = NULL) # Parameters for dates standardization standardize_dates <- list( target_columns = NULL, error_tolerance = 0.4, format = NULL, timeframe = as.Date(c("1973-05-29", "2023-05-29")), orders = list( world_named_months = c("Ybd", "dby"), world_digit_months = c("dmy", "Ymd"), US_formats = c("Omdy", "YOmd") ) ) # Parameters for subject IDs standardization standardize_subject_ids <- list( target_columns = "study_id", prefix = "PS", suffix = "P2", range = c(1, 100), nchar = 7 ) # convert the 'sex' column into numeric to_numeric <- list(target_columns = "sex", lang = "en") # the dictionary-based cleaning will not be performed here dictionary = NULL # no need to check for the sequence of date events check_date_sequence <- NULL cleaned_data <- clean_data( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), standardize_column_names = standardize_column_names, remove_constants = remove_constants, replace_missing_values = replace_missing_values, remove_duplicates = remove_duplicates, standardize_dates = standardize_dates, standardize_subject_ids = standardize_subject_ids, to_numeric = to_numeric, dictionary = NULL, check_date_sequence = NULL )
# Parameters for column names standardization standardize_column_names <- list(keep = NULL, rename = NULL) # parameters to remove constant columns, empty rows and columns remove_constants <- list(cutoff = 1) # Parameters for substituting missing values with NA: replace_missing_values <- list(target_columns = NULL, na_strings = "-99") # Parameters for duplicates removal across all columns remove_duplicates <- list(target_columns = NULL) # Parameters for dates standardization standardize_dates <- list( target_columns = NULL, error_tolerance = 0.4, format = NULL, timeframe = as.Date(c("1973-05-29", "2023-05-29")), orders = list( world_named_months = c("Ybd", "dby"), world_digit_months = c("dmy", "Ymd"), US_formats = c("Omdy", "YOmd") ) ) # Parameters for subject IDs standardization standardize_subject_ids <- list( target_columns = "study_id", prefix = "PS", suffix = "P2", range = c(1, 100), nchar = 7 ) # convert the 'sex' column into numeric to_numeric <- list(target_columns = "sex", lang = "en") # the dictionary-based cleaning will not be performed here dictionary = NULL # no need to check for the sequence of date events check_date_sequence <- NULL cleaned_data <- clean_data( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), standardize_column_names = standardize_column_names, remove_constants = remove_constants, replace_missing_values = replace_missing_values, remove_duplicates = remove_duplicates, standardize_dates = standardize_dates, standardize_subject_ids = standardize_subject_ids, to_numeric = to_numeric, dictionary = NULL, check_date_sequence = NULL )
Perform dictionary-based cleaning
clean_using_dictionary(data, dictionary)
clean_using_dictionary(data, dictionary)
data |
The input data frame or linelist |
dictionary |
A data dictionary associated with the input data |
A data frame with cleaned values in the target columns specified in the data dictionary.
data <- readRDS( system.file("extdata", "messy_data.RDS", package = "cleanepi") ) dictionary <- readRDS( system.file("extdata", "test_dict.RDS", package = "cleanepi") ) data$gender[2] <- "homme" cleaned_df <- clean_using_dictionary( data = data, dictionary = dictionary )
data <- readRDS( system.file("extdata", "messy_data.RDS", package = "cleanepi") ) dictionary <- readRDS( system.file("extdata", "test_dict.RDS", package = "cleanepi") ) data$gender[2] <- "homme" cleaned_df <- clean_using_dictionary( data = data, dictionary = dictionary )
This vector contains common values of NA (missing) and is intended for
use within {cleanepi} functions replace_missing_values()
.
The current list of strings used can be found by printing out
common_na_strings
. It serves as a helpful tool to explore your data
for possible missing values. However, I strongly caution against using
this to replace NA
values without meticulously examining the
incidence for each case. Please note that common_na_strings
utilizes
\\
around the "?", ".", and "*" characters to prevent them from being interpreted as wildcards during pattern matching.
common_na_strings
common_na_strings
A vector of 35 character strings.
This vector is a combination of naniar::common_na_strings
(https://github.com/njtierney/naniar/) and other strings found in the
literature.
Convert numeric to date
convert_numeric_to_date(data, target_columns, ref_date, forward = TRUE)
convert_numeric_to_date(data, target_columns, ref_date, forward = TRUE)
data |
The input data frame or linelist |
target_columns |
A vector of columns names to be converted from numeric
to date. When the input data is a |
ref_date |
A reference date. This can also be a character string with the name of the reference column. |
forward |
A Boolean to indicate whether the counts started after the
reference date (TRUE) or not (FALSE). Default is TRUE. |
A data frame where the columns of interest are updated
data <- readRDS(system.file("extdata", "test_df1.RDS", package = "cleanepi")) data <- convert_numeric_to_date( data = data, target_columns = "recruted_on_day", ref_date = as.Date("2022-10-13"), forward = TRUE )
data <- readRDS(system.file("extdata", "test_df1.RDS", package = "cleanepi")) data <- convert_numeric_to_date( data = data, target_columns = "recruted_on_day", ref_date = as.Date("2022-10-13"), forward = TRUE )
When the function is invoked without specifying the column names to be
converted, the target columns are the ones returned by the scan_data()
function. Furthermore, it identifies columns where the proportion of numeric
values is at least twice the percentage of character values and performs the
conversion in them.
convert_to_numeric(data, target_columns = NULL, lang = c("en", "fr", "es"))
convert_to_numeric(data, target_columns = NULL, lang = c("en", "fr", "es"))
data |
The input data frame or linelist |
target_columns |
A vector of the target column names. When the input
data is a |
lang |
The text's language. Currently one of "en", "fr", or "es". |
A data frame wherein all the specified or detected columns have been transformed into numeric format after the conversion process.
dat <- convert_to_numeric( data = readRDS( system.file("extdata", "messy_data.RDS", package = "cleanepi") ), target_columns = "age", lang = "en" )
dat <- convert_to_numeric( data = readRDS( system.file("extdata", "messy_data.RDS", package = "cleanepi") ), target_columns = "age", lang = "en" )
After detecting incorrect subject IDs from the check_subject_ids()
function, use this function to provide the correct IDs and perform the
substitution.
correct_subject_ids(data, target_columns, correction_table)
correct_subject_ids(data, target_columns, correction_table)
data |
The input data frame or linelist |
target_columns |
A vector of column names with the subject ids. |
correction_table |
A data frame with the following two columns:
|
The input dataset where all subject ids comply with the expected format.
# detect the incorrect subject ids dat <- check_subject_ids( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), target_columns = "study_id", prefix = "PS", suffix = "P2", range = c(1, 100), nchar = 7 ) # generate the correction table correction_table <- data.frame( from = c("P0005P2", "PB500P2", "PS004P2-1"), to = c("PB005P2", "PB050P2", "PS004P2") ) # perform the correction dat <- correct_subject_ids( data = dat, target_columns = "study_id", correction_table = correction_table )
# detect the incorrect subject ids dat <- check_subject_ids( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), target_columns = "study_id", prefix = "PS", suffix = "P2", range = c(1, 100), nchar = 7 ) # generate the correction table correction_table <- data.frame( from = c("P0005P2", "PB500P2", "PS004P2-1"), to = c("PB005P2", "PB050P2", "PS004P2") ) # perform the correction dat <- correct_subject_ids( data = dat, target_columns = "study_id", correction_table = correction_table )
Identify and return duplicated rows in a data frame or linelist.
find_duplicates(data, target_columns = NULL)
find_duplicates(data, target_columns = NULL)
data |
A data frame or linelist. |
target_columns |
A vector of columns names or indices to consider when
looking for duplicates. When the input data is a |
A data frame or linelist of all duplicated rows with the following 2 additional columns:
row_id
: the indices of the duplicated rows from the input data.
Users can choose from these indices, which row they consider as
redundant in each group of duplicates.
group_id
: a unique identifier associated to each group of
duplicates.
dups <- find_duplicates( data = readRDS( system.file("extdata", "test_linelist.RDS", package = "cleanepi") ), target_columns = c("dt_onset", "dt_report", "sex", "outcome") )
dups <- find_duplicates( data = readRDS( system.file("extdata", "test_linelist.RDS", package = "cleanepi") ), target_columns = c("dt_onset", "dt_report", "sex", "outcome") )
Generate report from data cleaning operations
print_report( data, report_title = "{cleanepi} data cleaning report", output_file_name = NULL, format = "html", print = TRUE )
print_report( data, report_title = "{cleanepi} data cleaning report", output_file_name = NULL, format = "html", print = TRUE )
data |
A data frame or linelist object returned from the
|
report_title |
The title to appear on the report |
output_file_name |
A string specifying the name of the report file,
excluding any file extension. If no file name is supplied, one will be
automatically generated with the format |
format |
The file format of the report. Currently only "html" is supported. |
print |
A logical that specifies whether to print the generated HTML
file or not. Default is TRUE. |
A string containing the name and path of the saved report
data <- readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")) test_dictionary <- readRDS( system.file("extdata", "test_dictionary.RDS", package = "cleanepi") ) # scan through the data scan_res <- scan_data(data) # Perform data cleaning cleaned_data <- data %>% standardize_column_names(keep = NULL, rename = c("DOB" = "dateOfBirth")) %>% replace_missing_values(target_columns = NULL, na_strings = "-99") %>% remove_constants(cutoff = 1.0) %>% remove_duplicates(target_columns = NULL) %>% standardize_dates( target_columns = NULL, error_tolerance = 0.4, format = NULL, timeframe = as.Date(c("1973-05-29", "2023-05-29")) ) %>% check_subject_ids( target_columns = "study_id", prefix = "PS", suffix = "P2", range = c(1L, 100L), nchar = 7L ) %>% convert_to_numeric(target_columns = "sex", lang = "en") %>% clean_using_dictionary(dictionary = test_dictionary) # add the data scanning result to the report cleaned_data <- add_to_report( x = cleaned_data, key = "scanning_result", value = scan_res ) # save a report in the current directory using the previously-created objects print_report( data = cleaned_data, report_title = "{cleanepi} data cleaning report", output_file_name = NULL, format = "html", print = TRUE )
data <- readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")) test_dictionary <- readRDS( system.file("extdata", "test_dictionary.RDS", package = "cleanepi") ) # scan through the data scan_res <- scan_data(data) # Perform data cleaning cleaned_data <- data %>% standardize_column_names(keep = NULL, rename = c("DOB" = "dateOfBirth")) %>% replace_missing_values(target_columns = NULL, na_strings = "-99") %>% remove_constants(cutoff = 1.0) %>% remove_duplicates(target_columns = NULL) %>% standardize_dates( target_columns = NULL, error_tolerance = 0.4, format = NULL, timeframe = as.Date(c("1973-05-29", "2023-05-29")) ) %>% check_subject_ids( target_columns = "study_id", prefix = "PS", suffix = "P2", range = c(1L, 100L), nchar = 7L ) %>% convert_to_numeric(target_columns = "sex", lang = "en") %>% clean_using_dictionary(dictionary = test_dictionary) # add the data scanning result to the report cleaned_data <- add_to_report( x = cleaned_data, key = "scanning_result", value = scan_res ) # save a report in the current directory using the previously-created objects print_report( data = cleaned_data, report_title = "{cleanepi} data cleaning report", output_file_name = NULL, format = "html", print = TRUE )
The function iteratively removes the constant data until none are found anymore. It stores the details about the removed constant data in the form of a data frame within the report object.
remove_constants(data, cutoff = 1)
remove_constants(data, cutoff = 1)
data |
The input data frame or linelist |
cutoff |
The cut-off for empty rows and columns removal. If provided, only rows and columns where the percent of missing data is greater than this cut-off will be removed. Default is 1. |
The input dataset without the empty rows and columns and the constant columns.
data <- readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")) # introduce an empty column data$empty_column <- NA # introduce some missing values across some columns data$study_id[3] = NA_character_ data$date.of.admission[3] = NA_character_ data$date.of.admission[4] = NA_character_ data$dateOfBirth[3] = NA_character_ data$dateOfBirth[4] = NA_character_ data$dateOfBirth[5] = NA_character_ # with cutoff = 1, line 3, 4, and 5 are not removed test <- cleanepi::remove_constants( data = data, cutoff = 1 ) # drop rows or columns with a percentage of constant values # equal to or more than 50% test <- cleanepi::remove_constants( data = test, cutoff = 0.5 ) # drop rows or columns with a percentage of constant values # equal to or more than 25% test <- cleanepi::remove_constants( data = test, cutoff = 0.25 ) # drop rows or columns with a percentage of constant values # equal to or more than 15% test <- cleanepi::remove_constants( data = test, cutoff = 0.15 ) # check the report to see what has happened report <- attr(test, "report") report$constant_data
data <- readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")) # introduce an empty column data$empty_column <- NA # introduce some missing values across some columns data$study_id[3] = NA_character_ data$date.of.admission[3] = NA_character_ data$date.of.admission[4] = NA_character_ data$dateOfBirth[3] = NA_character_ data$dateOfBirth[4] = NA_character_ data$dateOfBirth[5] = NA_character_ # with cutoff = 1, line 3, 4, and 5 are not removed test <- cleanepi::remove_constants( data = data, cutoff = 1 ) # drop rows or columns with a percentage of constant values # equal to or more than 50% test <- cleanepi::remove_constants( data = test, cutoff = 0.5 ) # drop rows or columns with a percentage of constant values # equal to or more than 25% test <- cleanepi::remove_constants( data = test, cutoff = 0.25 ) # drop rows or columns with a percentage of constant values # equal to or more than 15% test <- cleanepi::remove_constants( data = test, cutoff = 0.15 ) # check the report to see what has happened report <- attr(test, "report") report$constant_data
When removing duplicates, users can specify a set of columns to consider with
the target_columns
argument.
remove_duplicates(data, target_columns = NULL)
remove_duplicates(data, target_columns = NULL)
data |
The input data frame or linelist. |
target_columns |
A vector of column names to use when looking for
duplicates. When the input data is a |
A data frame or linelist without the duplicated rows identified from all or the specified columns.
no_dups <- remove_duplicates( data = readRDS( system.file("extdata", "test_linelist.RDS", package = "cleanepi") ), target_columns = "linelist_tags" )
no_dups <- remove_duplicates( data = readRDS( system.file("extdata", "test_linelist.RDS", package = "cleanepi") ), target_columns = "linelist_tags" )
NA
Replace missing values with NA
replace_missing_values( data, target_columns = NULL, na_strings = cleanepi::common_na_strings )
replace_missing_values( data, target_columns = NULL, na_strings = cleanepi::common_na_strings )
data |
A data frame or linelist |
target_columns |
A vector of column names. If provided, the substitution
of missing values will only be executed in those specified columns. When
the input data is a |
na_strings |
This is a vector of strings that represents the missing
values in the columns of interest. By default, it utilizes
cleanepi::common_na_strings. |
The input data where missing values are replaced by NA
.
cleaned_data <- replace_missing_values( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), target_columns = "sex", na_strings = "-99" )
cleaned_data <- replace_missing_values( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), target_columns = "sex", na_strings = "-99" )
Scan through all character columns of a data frame to determine the proportion of
missing, numeric, Date, character, and logical values.
The function checks for the existence of character columns in the data. When found, it reports back the proportion of the data types mentioned above in those columns. See the details section to know more about how it works.
scan_data(data)
scan_data(data)
data |
A data frame or linelist |
How does it work?
The character
columns are identified first. When there is no
character column the function returns a message.
For every character column, we count:
the number of missing data NA
the number of numeric values. A process of detecting valid dates among the
numeric values is then initiated using lubridate::as_date()
and
date_guess()
functions. If found, a warning is triggered to alert on
the presence of ambiguous values (numeric values that are potentially dates).
NOTE: A date is considered valid in this case if it falls within the interval
of today's date and 50 years back from today.
detect the Date values from the non-numeric using the date_guess()
function. The date count is the sum of dates identified from numeric and
non-numeric values. Because of the overlap between numeric and date, the sum
across the rows in the scanning result might be greater than 1.
count the logical values. The remaining values will be those of type characters.
A data frame if the input data contains columns of type character.
It invisibly returns NA
otherwise. The returned data frame will
have the same number of rows as the number of character columns, and six
columns representing their column names, proportion of missing, numeric,
date, character, and logical values.
# scan through a data frame of characters scan_result <- scan_data( data = readRDS( system.file("extdata", "messy_data.RDS", package = "cleanepi") ) ) # scan through a data frame with two character columns scan_result <- scan_data( data = readRDS(system.file("extdata", "test_linelist.RDS", package = "cleanepi")) ) # scan through a data frame with no character columns data(iris) iris[["fct"]] <- as.factor(sample(c("gray", "orange"), nrow(iris), replace = TRUE)) iris[["lgl"]] <- sample(c(TRUE, FALSE), nrow(iris), replace = TRUE) iris[["date"]] <- as.Date(seq.Date(from = as.Date("2024-01-01"), to = as.Date("2024-08-30"), length.out = nrow(iris))) iris[["posit_ct"]] <- as.POSIXct(iris[["date"]]) scan_result <- scan_data(data = iris)
# scan through a data frame of characters scan_result <- scan_data( data = readRDS( system.file("extdata", "messy_data.RDS", package = "cleanepi") ) ) # scan through a data frame with two character columns scan_result <- scan_data( data = readRDS(system.file("extdata", "test_linelist.RDS", package = "cleanepi")) ) # scan through a data frame with no character columns data(iris) iris[["fct"]] <- as.factor(sample(c("gray", "orange"), nrow(iris), replace = TRUE)) iris[["lgl"]] <- sample(c(TRUE, FALSE), nrow(iris), replace = TRUE) iris[["date"]] <- as.Date(seq.Date(from = as.Date("2024-01-01"), to = as.Date("2024-08-30"), length.out = nrow(iris))) iris[["posit_ct"]] <- as.POSIXct(iris[["date"]]) scan_result <- scan_data(data = iris)
All column names will be reformatted to use snake case. When the
conversion to snakecase does not work as expected, use the keep
and/or
rename
arguments to reformat the column name properly.
standardize_column_names(data, keep = NULL, rename = NULL)
standardize_column_names(data, keep = NULL, rename = NULL)
data |
The input data frame or linelist. |
keep |
A vector of column names to maintain as they are. When dealing
with a linelist, this can be set to |
rename |
A named vector of column names to be renamed. This should be in
the form of c(new_name = "old_name"). |
A data frame or linelist with easy to work with column names.
# do not rename 'date.of.admission' cleaned_data <- standardize_column_names( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), keep = "date.of.admission" ) # do not rename 'date.of.admission', but rename 'dateOfBirth' and 'sex' to # 'DOB' and 'gender' respectively cleaned_data <- standardize_column_names( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), keep = "date.of.admission", rename = c(DOB = "dateOfBirth", gender = "sex") )
# do not rename 'date.of.admission' cleaned_data <- standardize_column_names( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), keep = "date.of.admission" ) # do not rename 'date.of.admission', but rename 'dateOfBirth' and 'sex' to # 'DOB' and 'gender' respectively cleaned_data <- standardize_column_names( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), keep = "date.of.admission", rename = c(DOB = "dateOfBirth", gender = "sex") )
When the format of the values in a column and/or the target columns are not
defined, we strongly recommend checking a few converted dates manually to
make sure that the dates extracted from a character
vector or a factor
are correct.
standardize_dates( data, target_columns = NULL, format = NULL, timeframe = NULL, error_tolerance = 0.5, orders = NULL )
standardize_dates( data, target_columns = NULL, format = NULL, timeframe = NULL, error_tolerance = 0.5, orders = NULL )
data |
A data frame or linelist |
target_columns |
A vector of the target date column names. When the
input data is a |
format |
A vector of the expected formats in the date values from the
date columns. Default is NULL. |
timeframe |
A vector of 2 values of type date. If provided, date values
that do not fall within this timeframe will be set to NA. |
error_tolerance |
A number between 0 and 1 indicating the proportion of entries which cannot be identified as dates to be tolerated; if this proportion is exceeded, the original vector is returned, and a message is issued; defaults to 0.5 (50 percent), as shown in the usage above. |
orders |
A list or character vector with the date codes for fine-grained
parsing of dates. This allows for parsing of mixed dates. If a list is
supplied, that list will be used for successive tries in parsing. When
this is not provided (orders = NULL), the following default orders are used: list( quarter_partial_dates = c("Y", "Ym", "Yq"), world_digit_months = c("Yq", "ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm", "dmY", "mdY", "mYd", "dYm"), world_named_months = c("dby", "dyb", "bdy", "byd", "ybd", "ydb", "dbY", "dYb", "bdY", "bYd", "Ybd", "Ydb"), us_format = c("Omdy", "YOmd") ) |
Check for the presence of date values that could have multiple formats
from the $multi_format_dates
element of the report
.
Converting ambiguous character strings to dates is difficult for many reasons:
dates may not use the standard Ymd format
within the same variable, dates may follow different formats
dates may be mixed with things that are not dates
the behavior of `as.Date` in the presence of non-date values is hard to predict, sometimes returning `NA`, sometimes issuing an error.
This function tries to address all the above issues. Dates in the following formats should be automatically detected, irrespective of separators (e.g. "-", " ", "/") and surrounding text:
"19 09 2018"
"2018 09 19"
"19 Sep 2018"
"2018 Sep 19"
"Sep 19 2018"
This function relies heavily on lubridate::parse_date_time()
, which is an
extremely flexible date parser that works well for consistent date formats,
but can quickly become unwieldy and may produce spurious results.
standardize_dates()
will use a list of formats in the orders
argument to
run parse_date_time()
with each format vector separately and take the first
correctly parsed date from all the trials.
With the default orders shown above, the dates 03 Jan 2018, 07/03/1982, and
08/20/85 are correctly interpreted as 2018-01-03, 1982-03-07, and 1985-08-20.
The examples section will show how you can manipulate the orders
to be
customized for your situation.
The input dataset where the date columns have been standardized. The date values that are out of the specified timeframe will be reported in the report. Similarly, date values that comply with multiple formats will also be featured in the report object.
x <- c("03 Jan 2018", "07/03/1982", "08/20/85") # The below will coerce values where the month is written in letters only # into Date. as.Date(lubridate::parse_date_time(x, orders = c("Ybd", "dby"))) # coerce values where the month is written in letters or numbers into Date. as.Date(lubridate::parse_date_time(x, orders = c("dmy", "Ymd"))) # How to use standardize_dates() dat <- standardize_dates( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), target_columns = "date_first_pcr_positive_test", format = NULL, timeframe = NULL, error_tolerance = 0.4, orders = list( world_named_months = c("Ybd", "dby"), world_digit_months = c("dmy", "Ymd"), US_format = c("Omdy", "YOmd") ) )
x <- c("03 Jan 2018", "07/03/1982", "08/20/85") # The below will coerce values where the month is written in letters only # into Date. as.Date(lubridate::parse_date_time(x, orders = c("Ybd", "dby"))) # coerce values where the month is written in letters or numbers into Date. as.Date(lubridate::parse_date_time(x, orders = c("dmy", "Ymd"))) # How to use standardize_dates() dat <- standardize_dates( data = readRDS( system.file("extdata", "test_df.RDS", package = "cleanepi") ), target_columns = "date_first_pcr_positive_test", format = NULL, timeframe = NULL, error_tolerance = 0.4, orders = list( world_named_months = c("Ybd", "dby"), world_digit_months = c("dmy", "Ymd"), US_format = c("Omdy", "YOmd") ) )
Calculate time span between dates
timespan( data, target_column = NULL, end_date = Sys.Date(), span_unit = c("years", "months", "weeks", "days"), span_column_name = "span", span_remainder_unit = NULL )
timespan( data, target_column = NULL, end_date = Sys.Date(), span_unit = c("years", "months", "weeks", "days"), span_column_name = "span", span_remainder_unit = NULL )
data |
The input data frame or linelist |
target_column |
A string used to specify the name of the date column of interest. The values in this column should be of type 'Date' in ISO8601 format ("2024-01-31"). |
end_date |
An end date. It can be either a character that is the name of
another column of type 'Date' from the input data or a vector of Dates or
a single Date value. This should also be in the ISO8601 format
("2024-01-31"). Default is today's date (`Sys.Date()`). |
span_unit |
A string that specifies the units in which the time span between the dates will be returned. The possible units are: 'years', 'months', 'weeks' or 'days'. |
span_column_name |
A string for the name of the new column to be used to store the calculated time span in the input data frame. |
span_remainder_unit |
A string for the unit in which the remainder of
the time span should be calculated. May be one of "months", "weeks",
and "days". Remainders requested in the same unit as the time span (`span_unit`) will return
values of 0. Default is `NULL`. |
The input data frame with one or two additional columns:
"span" or any other name chosen by the user. This will contain the calculated time span in the desired units.
"<span_column_name>_remainder": a column with the number of the remaining days or weeks or months depending on the value of the 'span_remainder_unit' parameter. Here '<span_column_name>' represents the value of the 'span_column_name' argument.
# In the below example, this function is used to calculate patient's age from # their dates of birth # import the data, replace missing values with NA and convert date into ISO # format data <- readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")) data <- data %>% replace_missing_values(target_columns = "dateOfBirth", na_strings = "-99") %>% standardize_dates(target_columns = "dateOfBirth", error_tolerance = 0.0) # calculate the age in 'years' and return the remainder in 'months' age <- timespan( data = data, target_column = "dateOfBirth", end_date = Sys.Date(), span_unit = "years", span_column_name = "age_in_years", span_remainder_unit = "months" )
# In the below example, this function is used to calculate patient's age from # their dates of birth # import the data, replace missing values with NA and convert date into ISO # format data <- readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")) data <- data %>% replace_missing_values(target_columns = "dateOfBirth", na_strings = "-99") %>% standardize_dates(target_columns = "dateOfBirth", error_tolerance = 0.0) # calculate the age in 'years' and return the remainder in 'months' age <- timespan( data = data, target_column = "dateOfBirth", end_date = Sys.Date(), span_unit = "years", span_column_name = "age_in_years", span_remainder_unit = "months" )