#' @title Run Text Classification Analyses #' #' @description A wrapper using \code{train_models} meant to run for each dataset \code{x} specified. #' #' #' @param x A data.frame containing only features and label \code{y} (Required) #' @param y A character string of the column in \code{x} containing labels (Required) #' @param filter A character string of a pattern to subset featuers in \code{x} (Default = \code{NULL}) #' @param analyses_labels A character vector of labels representing names of models. Should be equal to \code{length(x)} (Default = \code{NULL}) #' @param ... Additional arguments to pass pass to \code{train_model}. #' #' @return A list of trained \code{carat} models length of \code(x * classifier) #' @export #' @examples #' train_data <- list(iris, iris[-2]) #' #' # run analysis with only variables containing the word Width #' #' trained_models <- run_analysis(train_data, "Species", "Width") #' #' # the svm model for the first data set #' str(trained_models[[1]]) #' #' # the xgboost model for the second data set #' str(trained_models[[4]]) run_analysis <- function(x, y, filter = NULL, analyses_labels = NULL, ...) { UseMethod("run_analysis", x) } #' @export run_analysis.default <- function(x, y, filter = NULL, analyses_labels = NULL, ...) { .filter_data <- function(x, y, .filter) { if (!is.null(.filter)) { .prds <- names(x)[grepl(.filter, names(x), ignore.case = TRUE)] if (length(.prds) != 0) { x <- x[unique(c(y, .prds))] } } x[,y] <- as.factor(x[,y]) return(x) } if (!inherits(x, "data.frame")) { stop("x must be a data.frame or list of data.frames", call = FALSE) } x <- .filter_data(x, y, filter) mdl <- train_model(x, y, ...) return(mdl) } #' @export run_analysis.list <- function(x, y, filter = NULL, analyses_labels = NULL, simplify = TRUE, ...) { .filter_data <- function(x, y, .filter) { if (!is.null(.filter)) { .prds <- names(x)[grepl(.filter, names(x), ignore.case = TRUE)] if (length(.prds) != 0) { x <- x[unique(c(y, .prds))] } } x[,y] <- as.factor(x[,y]) return(x) } if (!inherits(x[[1]], "data.frame")) { stop("x must be a list of data.frames", call = FALSE) } x <- lapply(x, function(xx) { xx <- .filter_data(xx, y, filter) }) mdls <- lapply(x, function(xxx){ xxx <- train_model(xxx, y, ...) }) if (!is.null(analyses_labels)) { names(mdls) <- analyses_labels } if (simplify) { mdls <- unlist(mdls, recursive = FALSE) } return(mdls) } #' @title Train Caret Model #' #' @description A \code{caret} abstraction used in \code{run_analysis} meant to run for each \code{classifier} specified. #' #' @import caret #' @importFrom future availableCores #' @importFrom doParallel registerDoParallel #' @importFrom parallel makeCluster #' #' @param x A data.frame containing only features and label \code{y} (Required) #' @param y A character string of the column in \code{x} containing labels (Required) #' @param classifier A vector of \code{caret::} classification models (Default = \code{c("svmLinear2", "xgbTree")}). See \url{topepo.github.io/caret/train-models-by-tag.html} for more info. #' @param control A list of training arguments to pass to \code{\link[caret]{trainControl}()}. #' @param metric The metric to evaluate training method (Default = \code{"Accuracy"}). #' @param SEED An integer used for setting random seed (Default = \code{46}). #' @param ... Additional arguments to pass pass to \code{\link[caret]{train}()}. #' #' @return A trained caret model for each \code{classifier} specified. #' @export #' @examples #' train_data <- iris #' #' trained_models <- train_model(train_data, "Species") #' #' # the svm model #' str(trained_models[[1]]) #' #' # the xgboost model #' str(trained_models[[2]]) train_model <- function(x, y, classifier = c("svmLinear2", "xgbTree"), control = list( method = "cv", number = 7, allowParallel = TRUE, classProbs = TRUE ), metric = "Accuracy", SEED = 42, ...) { control_args <- do.call(caret::trainControl, control) # set-up parallel processing (if allowed) if (control_args$allowParallel) { .clust <- parallel::makeCluster(future::availableCores() - 1) doParallel::registerDoParallel(.clust) } fmla <- paste0(y, " ~ .") # set-up random seeds set.seed(SEED) if (length(classifier) == 1L) { mdls <- caret::train( form = as.formula(fmla), data = x, method = classifier, metric = metric, trControl = control_args, ...) } else { mdls <- lapply(classifier, function(algs) { mdls <- caret::train( form = as.formula(fmla), data = x, method = algs, metric = metric, trControl = control_args, ... ) }) } parallel::stopCluster(.clust) return(mdls) }