#' @title Manually create document embeddings
#'
#' @description Customized way to create document embeddings from pre-trained
#' word embeddings.
#'
#' @details
#' Common methods to convert word embeddings into document embeddings are
#' \code{mean} (the default), \code{sum}, \code{range}, and many more
#' (\href{https://arxiv.org/abs/1711.08609}{see here}).
#'
#' \code{FUNC} is applied to every embedding column of each document. When it
#' returns more than one value per column (as \code{range} does), the values
#' are concatenated statistic by statistic into a single row per document,
#' e.g. all minimums followed by all maximums.
#'
#' \code{rnorm} is the only valid function that can be passed to
#' \code{oov_vec}, otherwise use a numeric vector.
#'
#' @param x Required character vector of words, sentences, or documents to
#'   embed.
#' @param embedding_data Required. A \code{data.frame} or matrix with 300
#'   variables where each \code{row.name} is a word. Use
#'   \code{\link{load_fasttext_data}} or \code{\link{load_glove_data}} to
#'   generate embedding data.
#' @param FUNC Optional. A function used to convert word embeddings into a
#'   single document embedding. See details.
#' @param oov_vec Optional. Either a function used to generate missing word
#'   embeddings for out of vocabulary words, or a numeric vector of length 300
#'   that will be used as a constant for out of vocabulary words. If
#'   \code{rnorm}, the mean and sd of the random normal distribution are based
#'   on all in vocabulary word embeddings.
#' @param ignore.case Optional. Ignore word case when looking for \code{x} in
#'   row names of \code{embedding_data}? \code{TRUE} by default.
#' @param word_weights Optional. A numeric vector of \code{length(x)} of
#'   weights to give each word. If \code{NULL} (the default) no weighting is
#'   applied.
#' @param stop_words Optional. Character. A vector of words to remove. May use
#'   custom words. If \code{NULL} (the default), no stop-word list is supplied.
#' @param verbose Optional. Return raw embeddings without applying
#'   \code{FUNC}? \code{FALSE} by default.
#' @param force If \code{TRUE} (\code{FALSE} by default), will replace dropped
#'   documents with vector assigned to \code{oov_vec}.
#'
#' @return A \code{data.frame} of document embeddings with one row per
#'   document; row names are the document ids. If \code{verbose = TRUE}, the
#'   unaggregated result of the word-matching step is returned instead.
#'
#' @examples
#' \dontrun{
#' ## basic usage with glove (mean aggregate)
#' emb_data <- load_glove_data()
#'
#' text <- c(
#'   "How are you doing today?",
#'   "Sentence 2 comes next",
#'   "Stare at the ocean."
#' )
#'
#' agg_doc_embeddings <- embed_documents(text, emb_data)
#' }
#'
#' @export
embed_documents <- function(x,
                            embedding_data,
                            FUNC = mean,
                            oov_vec = runif(300L, -.25, .25),
                            ignore.case = TRUE,
                            word_weights = NULL,
                            stop_words = NULL,
                            verbose = FALSE,
                            force = FALSE) {
  stopifnot(
    exprs = {
      is.character(x)
      is.function(FUNC)
      is.logical(ignore.case)
      # a generator function (documented: rnorm) or a constant numeric vector
      is.function(oov_vec) || (is.atomic(oov_vec) && is.numeric(oov_vec))
      is.logical(verbose)
      is.logical(force)
    }
  )

  embedding_data <- .validate_embedding_data(.embedding_data = embedding_data)

  # keep a single embedding row per (optionally case-folded) word
  keep_rows <- .make_unique_embeddings(
    .embedding_data_words = row.names(embedding_data),
    .ignore.case = ignore.case
  )
  embedding_data <- embedding_data[keep_rows, ]

  res <- .match_embeddings_custom(
    .x = x,
    .embedding_data = embedding_data,
    .oov_vec = oov_vec,
    .ignore.case = ignore.case,
    .stop_words = stop_words,
    .verbose = verbose,
    .force = force
  )

  # verbose mode hands back the word-level result untouched
  if (verbose) {
    return(res)
  }

  if (!is.null(word_weights)) {
    res <- .weight_embeddings(res, word_weights)
  }

  # Collapse the word rows of one document into a single numeric vector.
  aggregate_doc <- function(doc_rows) {
    agg <- apply(doc_rows, MARGIN = 2, FUNC)
    # FUNC returned several values per column (e.g. range): apply() then
    # yields a matrix, which would become several rows per document and break
    # the row-name assignment below. Flatten it row by row instead.
    if (is.matrix(agg)) {
      dims <- colnames(agg)
      n_stats <- nrow(agg)
      agg <- as.vector(t(agg))
      if (!is.null(dims)) {
        names(agg) <- paste0(
          rep(dims, times = n_stats),
          "_",
          rep(seq_len(n_stats), each = length(dims))
        )
      }
    }
    agg
  }

  # column 1 holds the document id, the remaining columns the embedding
  # values; split() orders the groups by the sorted ids
  doc_vectors <- lapply(split(res[-1], res[1]), aggregate_doc)

  doc_ids <- names(doc_vectors)
  res <- as.data.frame(do.call(rbind, doc_vectors))
  row.names(res) <- doc_ids
  res
}

#' @title Validate document embeddings helper
#'
#' @description
#' Helper function used in \code{\link{embed_documents_custom}}. This function
#' is not to be used directly by users.
#' for more information.
#' #' @keywords internal #' #' @family embeddings #' #' @export .validate_embedding_data <- function(.embedding_data) { .check_df <- try(as.data.frame(.embedding_data, stringsAsFactors = FALSE)) if (inherits(.check_df, "try-error")) { stop("embedding data must be a data.frame or coerced to a data.frame") } .temp_data <- c(row.names(.embedding_data)[1L], .embedding_data[1L, ]) .check_letters <- grepl("(?