#' Flag duplicate strings in a character vector
#'
#' @param x Required character vector of words, sentences, or documents to identify duplicates within.
#' @param make_first_case_unique Treat the first occurrence of a duplicate as unique? (default: TRUE)
#' @param consider_apostrophes Consider apostrophes when flagging duplicates? (default: FALSE)
#' @param consider_hyphens Consider hyphens when flagging duplicates? (default: FALSE)
#' @param ... Additional arguments passed to \code{gsub()} before flagging duplicates
#'
#' @return A logical vector of \code{length(x)} indicating duplicate elements of \code{x}
#' @export
#'
#' @examples
flag_duplicates <- function(x,
                            make_first_case_unique = TRUE,
                            consider_apostrophes = FALSE,
                            consider_hyphens = FALSE,
                            ...) {
  stopifnot(exprs = {
    is.character(x)
    all(vapply(
      c(make_first_case_unique, consider_apostrophes, consider_hyphens),
      is.logical,
      FUN.VALUE = logical(1)
    ))
  })

  # Build a pattern of characters to strip: everything except lower-case
  # letters and digits, optionally keeping apostrophes and/or hyphens.
  keep_pattern <- c("a-z0-9", "'", "-")
  keep_pattern <- sprintf(
    "[^%s]+",
    paste0(keep_pattern[c(TRUE, consider_apostrophes, consider_hyphens)], collapse = "")
  )

  x <- gsub(keep_pattern, "", tolower(x), ...)
  dup_x <- duplicated(x)

  if (make_first_case_unique) {
    return(dup_x)
  }
  # Flag every occurrence of a duplicated value, including the first.
  return(x %in% x[dup_x])
}

#' Extract Factor Loadings from a psych::fa Object
#'
#' @param fa_model A factor analysis model object from \code{psych::fa()}
#' @param append_model_info_to_names Append model information (factoring method
#'   and rotation) to the loading matrix column names? (default: TRUE)
#'
#' @return A data.frame of factor loadings
#' @export
#'
#' @importFrom psych fa
#' @examples
extract_efa_loadings_matrix <- function(fa_model, append_model_info_to_names = TRUE) {
  stopifnot(exprs = {
    inherits(fa_model, c("psych", "fa"))
    "loadings" %in% names(fa_model)
  })

  # Drop the "loadings" class so the matrix can be coerced to a data.frame.
  res <- as.data.frame(unclass(fa_model[["loadings"]]))

  if (append_model_info_to_names) {
    names(res) <- paste0(names(res), "_", fa_model[["fm"]], "_", fa_model[["rotation"]])
  }
  return(res)
}
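# Usage sketch (illustrative only, wrapped in `if (FALSE)` so nothing runs when
# this file is sourced). Duplicates are detected case-insensitively after
# stripping punctuation, so "Cat" and "cat!" collapse to the same token.
# `your_data` and `efa_fit` below are hypothetical placeholders.
if (FALSE) {
  flag_duplicates(c("Cat", "cat!", "dog"))
  #> FALSE  TRUE FALSE
  flag_duplicates(c("Cat", "cat!", "dog"), make_first_case_unique = FALSE)
  #>  TRUE  TRUE FALSE

  # Assuming the psych package is installed and `your_data` is a numeric data.frame:
  efa_fit <- psych::fa(your_data, nfactors = 3)
  extract_efa_loadings_matrix(efa_fit)
}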
#' Calculate the consensus label among a matrix of labels
#'
#' Used to calculate the consensus among raters.
#'
#' @param x A data.frame or matrix of label counts where each row represents a
#'   document and each column a label. Alternatively, a character vector of
#'   labels; if so, the argument \code{doc_ids} must be supplied.
#' @param doc_ids If \code{x} is a character vector of predicted labels,
#'   \code{doc_ids} must be an equal-length character vector identifying the
#'   document corresponding to each prediction.
#' @param ... Additional arguments passed on to methods (e.g. \code{doc_ids}).
#'
#' @return A vector of consensus labels
#'
#' @examples
#'
#' @export
calculate_consensus_ratings <- function(x, ...) {
  if (is.factor(x)) x <- as.character(x)
  UseMethod("calculate_consensus_ratings", x)
}

#' @method calculate_consensus_ratings character
#' @export
calculate_consensus_ratings.character <- function(x, doc_ids, ...) {
  stopifnot(exprs = {
    length(x) == length(doc_ids)
    any(duplicated(doc_ids))
  })

  # Count how often each label was assigned to each document, then keep the
  # most frequent label per document (ties broken at random).
  rating_counts <- tapply(x, doc_ids, table)
  res <- vapply(rating_counts, function(counts_i) {
    find_top_count(counts_i, names(counts_i))
  }, FUN.VALUE = character(1L))
  return(res)
}

#' @method calculate_consensus_ratings default
#' @export
calculate_consensus_ratings.default <- function(x, ...) {
  stopifnot(inherits(x, c("matrix", "data.frame")))

  number_col <- vapply(x, is.numeric, FUN.VALUE = logical(1L))
  if (!all(number_col)) stop("x must be a data.frame or matrix of only numeric columns")

  # For each row (document), keep the label of the column with the highest count.
  vlabs <- colnames(x)
  res <- apply(x, MARGIN = 1, function(row_i) {
    find_top_count(row_i, vlabs)
  }, simplify = TRUE)
  return(res)
}

#' Helper used in \code{\link{calculate_consensus_ratings}}
#'
#' @param x A numeric vector of counts
#' @param xn A character vector of names corresponding to \code{x} (usually labels)
#'
#' @return A character scalar: the element of \code{xn} at the maximum of
#'   \code{x}, with ties broken at random
#' @export
#'
#' @examples
find_top_count <- function(x, xn) {
  stopifnot(exprs = {
    is.numeric(x)
    is.character(xn)
  })

  max_indx <- which(x == max(x, na.rm = TRUE))
  if (length(max_indx) > 1L) {
    # Break ties by sampling one of the tied maxima.
    return(xn[sample(max_indx, 1L)])
  }
  return(xn[max_indx])
}

#' Flag Items based on EFA Factor Loadings
#'
#' @param m A data.frame or matrix of factor loadings (produced by a function like \code{\link[psych]{fa}()})
#'   where each row represents an item and each column a generic factor.
#' @param thresh Minimum absolute primary loading required for an item to count
#'   as loading on any factor (default: 0.40)
#' @param cross_load_prop Ratio of the second-highest to the highest absolute
#'   loading at or above which an item is flagged as cross-loading (default: 0.75)
#'
#' @return A list of three logical vectors, each of length \code{nrow(m)}:
#'   \code{no_loading}, \code{cross_loading}, and \code{overall_flag}
#' @export
#'
#' @examples
flag_criteria <- function(m, thresh = .40, cross_load_prop = .75) {
  stopifnot(inherits(m, c("data.frame", "matrix")))

  # Sort each item's absolute loadings from highest to lowest.
  m_ordered <- t(apply(m, MARGIN = 1, FUN = function(i) {
    abs(i)[order(abs(i), decreasing = TRUE)]
  }))

  min_load <- m_ordered[, 1] < thresh
  cross_load <- (m_ordered[, 2] / m_ordered[, 1]) >= cross_load_prop & !min_load

  return(list(
    no_loading = min_load,
    cross_loading = cross_load,
    overall_flag = min_load | cross_load
  ))
}

#' Automatically Determine Factor Labels from an EFA
#'
#' @param m A data.frame or matrix of factor loadings where each row represents
#'   an item and each column a factor
#' @param ground_truth A character vector of known labels, one per row of \code{m}
#'
#' @return A named character vector mapping each factor (column of \code{m}) to
#'   the ground-truth label with the highest mean absolute loading on it
#' @export
#'
#' @examples
determine_efa_factor_labels <- function(m, ground_truth) {
  stopifnot(exprs = {
    inherits(m, c("data.frame", "matrix"))
    is.character(ground_truth) && length(ground_truth) == nrow(m)
  })

  # Mean absolute loading of each ground-truth label on each factor.
  indx_mat <- apply(m, MARGIN = 2, FUN = function(x) {
    tapply(x, ground_truth, FUN = function(xi) {
      mean(abs(xi))
    })
  })

  # Assign each factor the label with the highest mean absolute loading.
  mapped_names <- row.names(indx_mat)
  res <- sapply(as.data.frame(indx_mat), function(x) mapped_names[which.max(x)])

  if (anyDuplicated(res) != 0L) {
    stop("Unable to identify correct factors from ground_truth. Please manually inspect.", call. = FALSE)
  }
  res
}
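# Usage sketch (illustrative only, wrapped in `if (FALSE)` so nothing runs when
# this file is sourced). The character method aggregates repeated ratings per
# document; the default method expects a numeric matrix/data.frame of label
# counts. The small inputs below are made-up illustrations.
if (FALSE) {
  # Three raters labelled two documents:
  ratings <- c("pos", "pos", "neg", "neg", "neg", "pos")
  docs <- c("d1", "d1", "d1", "d2", "d2", "d2")
  calculate_consensus_ratings(ratings, doc_ids = docs)
  #>    d1    d2
  #> "pos" "neg"

  # Same idea, but starting from pre-tabulated label counts per document:
  counts <- data.frame(pos = c(2, 0), neg = c(1, 3))
  calculate_consensus_ratings(counts)  # row-wise consensus: "pos", then "neg"

  # Flag weakly loading and cross-loading items from a loading matrix:
  loadings <- data.frame(F1 = c(0.72, 0.31, 0.55), F2 = c(0.10, 0.28, 0.50))
  flag_criteria(loadings)
  # no_loading:    FALSE  TRUE FALSE
  # cross_loading: FALSE FALSE  TRUE
  # overall_flag:  FALSE  TRUE  TRUE
}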
#' Generate grammatically incorrect items
#'
#' @param x A character vector of items to base generation on.
#' @param split_pattern A valid regex pattern (or character) used to split \code{x} into tokens
#' @param num_to_generate Number of fake items to generate.
#' @param remove_words Number of words to remove from each item before generating the fake item.
#' @param add_sample_word Logical. Add a word randomly sampled from the vocabulary to each generated item? (default: TRUE)
#'
#' @return A character vector of \code{num_to_generate} shuffled, denatured items
#' @export
#'
#' @examples
generate_denatured_text <- function(x,
                                    split_pattern = "\\s+",
                                    num_to_generate = length(x) %/% 4L,
                                    remove_words = 1L,
                                    add_sample_word = TRUE) {
  stopifnot(is.character(x))

  # Drop `n` randomly chosen tokens from a tokenised item.
  filter_fun <- \(s, n) {
    s[-c(sample(seq_along(s), n))]
  }

  x <- strsplit(x, split = split_pattern)
  word_corpus <- unique(unlist(x))

  x <- lapply(x, FUN = function(i) {
    .i <- filter_fun(i, n = remove_words)
    if (add_sample_word) {
      # Optionally inject one word drawn from the full vocabulary.
      .i <- c(.i, sample(word_corpus, 1L))
    }
    # Shuffle the remaining tokens to break the grammar.
    sample(.i)
  })

  x <- sapply(x, paste0, collapse = " ")
  return(sample(x, num_to_generate))
}
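# Usage sketch (illustrative only, wrapped in `if (FALSE)` so nothing runs when
# this file is sourced). Output is random because words are dropped, injected,
# and shuffled; set a seed for reproducibility. The items below are made up.
if (FALSE) {
  set.seed(123)
  items <- c(
    "I enjoy working with my colleagues",
    "My manager communicates goals clearly",
    "I have the tools I need to do my job",
    "I feel valued at work"
  )
  # Returns two shuffled, denatured versions of randomly selected items.
  generate_denatured_text(items, num_to_generate = 2L)
}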