% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/fastml.R
\name{fastml}
\alias{fastml}
\title{Fast Machine Learning Function}
\usage{
fastml(
  data = NULL,
  train_data = NULL,
  test_data = NULL,
  label,
  algorithms = "all",
  task = "auto",
  test_size = 0.2,
  resampling_method = if (identical(task, "survival")) "none" else "cv",
  folds = ifelse(grepl("cv", resampling_method), 10, 25),
  repeats = NULL,
  group_cols = NULL,
  block_col = NULL,
  block_size = NULL,
  initial_window = NULL,
  assess_window = NULL,
  skip = 0,
  outer_folds = NULL,
  event_class = "first",
  exclude = NULL,
  recipe = NULL,
  tune_params = NULL,
  engine_params = list(),
  metric = NULL,
  class_threshold = "auto",
  algorithm_engines = NULL,
  use_parsnip_defaults = FALSE,
  warn_engine_defaults = TRUE,
  n_cores = 1,
  stratify = TRUE,
  impute_method = "error",
  encode_categoricals = TRUE,
  scaling_methods = c("center", "scale"),
  balance_method = "none",
  resamples = NULL,
  summaryFunction = NULL,
  use_default_tuning = FALSE,
  tuning_strategy = "grid",
  tuning_iterations = 10,
  tuning_complexity = "balanced",
  grid_levels = NULL,
  early_stopping = FALSE,
  adaptive = FALSE,
  learning_curve = FALSE,
  seed = 123,
  verbose = FALSE,
  eval_times = NULL,
  survival_metric_convention = "fastml",
  bootstrap_ci = TRUE,
  bootstrap_samples = 500,
  bootstrap_seed = NULL,
  at_risk_threshold = 0.1,
  audit_mode = FALSE,
  multiclass_auc = "macro",
  store_fold_models = FALSE
)
}
\arguments{
\item{data}{A data frame containing the complete dataset. If both \code{train_data} and \code{test_data} are \code{NULL}, \code{fastml()} will split this into training and testing sets according to \code{test_size} and \code{stratify}. When \code{group_cols} is supplied, the holdout keeps groups intact; when \code{block_col} is supplied, the holdout uses the last rows in time order. Defaults to \code{NULL}.}

\item{train_data}{A data frame pre-split for model training. If provided, \code{test_data} must also be supplied, and no internal splitting will occur. Defaults to \code{NULL}.}

\item{test_data}{A data frame pre-split for model evaluation. If provided, \code{train_data} must also be supplied, and no internal splitting will occur. Defaults to \code{NULL}.}

\item{label}{A string specifying the name of the target variable. For
survival analysis, supply a character vector with the names of the time and
status columns.}

\item{algorithms}{A vector of algorithm names to use. Default is \code{"all"} to run all supported algorithms.}

\item{task}{Character string specifying model type selection. Use \code{"auto"} to
let the function detect whether the target is for classification, regression,
or survival based on the data. Survival is detected when \code{label} is a
character vector of length 2 that matches time and status columns in the data.
You may also explicitly set it to \code{"classification"}, \code{"regression"}, or \code{"survival"}.}

\item{test_size}{A numeric value between 0 and 1 indicating the proportion of the data to use for testing. For grouped holdout, this is applied to groups; for time-ordered holdout, it selects the final proportion of rows. Default is \code{0.2}.}

\item{resampling_method}{A string specifying the resampling method for model evaluation. Default is \code{"cv"}
(cross-validation) for classification/regression. Other options include \code{"none"}, \code{"boot"},
\code{"repeatedcv"}, \code{"grouped_cv"}, \code{"blocked_cv"}, \code{"rolling_origin"}, and \code{"nested_cv"}.
For survival tasks, resampling is supported for parsnip-compatible engines (e.g., censored/ranger, glmnet).
Native survival engines (flexsurv/rstpm2/custom xgboost) ignore resampling and will error if custom resamples
are supplied. When the task auto-detects survival and \code{resampling_method} is omitted, it defaults to
\code{"none"} so native engines continue to run; set it explicitly to enable resampling for parsnip survival fits.}

\item{folds}{An integer specifying the number of folds for cross-validation. Default is \code{10} for methods containing "cv" and \code{25} otherwise.}

\item{repeats}{Number of times to repeat cross-validation (only applicable for methods like "repeatedcv").}

\item{group_cols}{Character vector naming one or more grouping columns used when
\code{resampling_method = "grouped_cv"} or when grouped nested cross-validation is desired.
All rows that share the same combination of values remain together in every fold. Columns must exist
in the training data and cannot contain missing values.}

\item{block_col}{Single column name that defines the ordering variable for
\code{resampling_method = "blocked_cv"} or \code{"rolling_origin"}. Data must already be sorted in
ascending order by this column to avoid leakage from future observations.}

\item{block_size}{Positive integer specifying the block size for \code{"blocked_cv"}.}

\item{initial_window}{Positive integer giving the number of observations in the initial training
window for \code{"rolling_origin"} resampling.}

\item{assess_window}{Positive integer giving the number of observations in each assessment window for
\code{"rolling_origin"} resampling.}

\item{skip}{Non-negative integer specifying how many potential rolling windows to skip between
successive resamples when \code{resampling_method = "rolling_origin"}.}

\item{outer_folds}{Positive integer giving the number of outer folds to use when
\code{resampling_method = "nested_cv"} and no custom \code{resamples} object is supplied.}

\item{event_class}{A single string. Either "first" or "second" to specify which
level of the binary outcome factor to treat as the positive class (the "event").
For binary classification, "first" treats the first factor level as the positive
class, "second" treats the second level as positive. Use
\code{levels(your_data$outcome)} to check level order before training. Default is "first".}

\item{exclude}{A character vector specifying the names of the columns to be excluded from the training process.}

\item{recipe}{A user-defined \code{recipe} object for custom preprocessing. If provided, internal recipe steps (imputation, encoding, scaling) are skipped.}

\item{tune_params}{A named list of tuning ranges for each algorithm and engine
pair. Example: \code{list(rand_forest = list(ranger = list(mtry = c(1, 3))))}
will override the defaults for the ranger engine. Default is \code{NULL}.}

\item{engine_params}{A named list of engine-level arguments to pass directly
to the underlying model fitting functions. Use this for fixed settings that
should apply whenever an engine is fitted (for example,
\code{list(royston_parmar = list(rstpm2 = list(link = "PO")))},
\code{list(cox_ph = list(survival = list(ties = "breslow")))}, or
\code{list(rand_forest = list(ranger = list(importance = "impurity")))}).
These arguments are distinct from \code{tune_params}, which define ranges of
hyperparameters to explore during tuning. Default is an empty list.}

\item{metric}{The performance metric to optimize during training. For
classification, options include \code{"accuracy"}, \code{"roc_auc"},
\code{"logloss"}, \code{"brier_score"}, and \code{"ece"} (plus other class metrics).}

\item{class_threshold}{For binary classification, controls how class probabilities
are converted into hard class predictions during holdout evaluation. Numeric
values in (0, 1) set a fixed threshold. The default \code{"auto"} tunes a
threshold on the training data to maximize F1; use \code{"model"} to keep
the model's default threshold.}

\item{algorithm_engines}{A named list specifying the engine to use for each algorithm.}

\item{use_parsnip_defaults}{Logical. If \code{TRUE}, fastml uses parsnip's default
engines instead of fastml's optimized defaults. This provides compatibility with
standard tidymodels behavior. If \code{FALSE} (default), fastml uses its own
curated engine defaults which may differ from parsnip. When engines differ,
a warning is issued unless \code{algorithm_engines} explicitly specifies the engine.
Use \code{print_default_differences()} to see all differences.}

\item{warn_engine_defaults}{Logical. If \code{TRUE} (default), warns when fastml's
default engine differs from parsnip's default. Set to \code{FALSE} to suppress
these warnings. Warnings are only shown once per algorithm per session.}

\item{n_cores}{An integer specifying the number of CPU cores to use for parallel processing. Default is \code{1}.}

\item{stratify}{Logical indicating whether to use stratified sampling when splitting the data. Only applied to random holdout splitting. Default is \code{TRUE} for classification and \code{FALSE} for regression.}

\item{impute_method}{Method for handling missing values. Options include:
\describe{
  \item{\code{"medianImpute"}}{Impute missing values using median imputation (recipe-based).}
  \item{\code{"knnImpute"}}{Impute missing values using k-nearest neighbors (recipe-based).}
  \item{\code{"bagImpute"}}{Impute missing values using bagging (recipe-based).}
  \item{\code{"remove"}}{Remove rows with missing values from the data (recipe-based).}
  \item{\code{"error"}}{Do not perform imputation; if missing values are detected, stop execution with an error.}
  \item{\code{NULL}}{Equivalent to \code{"error"}. No imputation is performed, and the function will stop if missing values are present.}
}
All imputation occurs inside the recipe so the same trained preprocessing
can be applied at prediction time. Default is \code{"error"}.}

\item{encode_categoricals}{Logical indicating whether to encode categorical variables. Default is \code{TRUE}.}

\item{scaling_methods}{Vector of scaling methods to apply. Default is \code{c("center", "scale")}.}

\item{balance_method}{Method to handle class imbalance. One of \code{"none"},
\code{"upsample"}, or \code{"downsample"}. Applied inside the preprocessing
recipe so each resampling split is balanced independently (requires the
\code{themis} package when enabled). Default is \code{"none"}.}

\item{resamples}{Optional rsample object providing custom resampling splits.
If supplied, \code{resampling_method}, \code{folds}, and \code{repeats} are
ignored.}

\item{summaryFunction}{A custom summary function for model evaluation. Default is \code{NULL}.}

\item{use_default_tuning}{Logical. Tuning only runs when resamples are supplied and
\code{tuning_strategy} is not \code{"none"}. If \code{TRUE} and
\code{tune_params} is \code{NULL}, default grids are used; if
\code{tune_params} is provided, those values override/extend defaults. When
\code{FALSE} and no custom parameters are given, models are fitted once with
default settings. If no resamples are available or \code{tuning_strategy =
"none"}, tuning requests are ignored with a warning. Default is \code{FALSE}.}

\item{tuning_strategy}{A string specifying the tuning strategy. Must be one of
\code{"grid"}, \code{"bayes"}, or \code{"none"}. Default is \code{"grid"}.
If custom \code{tune_params} are provided while \code{tuning_strategy = "none"},
they will be ignored with a warning.}

\item{tuning_iterations}{Number of iterations for Bayesian tuning. Ignored when
\code{tuning_strategy} is not \code{"bayes"}. Validation of this argument only
occurs for the Bayesian strategy. Default is \code{10}.}

\item{tuning_complexity}{Character string specifying a tuning complexity preset
that controls grid density and parameter range width. One of:
\describe{
  \item{\code{"quick"}}{Minimal tuning (2 levels/param, ~32 combinations for 5 params).
    Best for: prototyping, debugging, time-constrained scenarios.}
  \item{\code{"balanced"}}{Standard tuning (3 levels/param, ~243 combinations).
    Best for: most production use cases. This is the default.}
  \item{\code{"thorough"}}{Comprehensive tuning (5 levels/param, ~3,125 combinations).
    Best for: final model selection, publications.}
  \item{\code{"exhaustive"}}{Maximum coverage (7 levels/param, ~16,807 combinations).
    Best for: research, competitions. Consider Bayesian tuning instead.}
}
See \code{\link{print_tuning_presets}} for detailed comparison.
Ignored if \code{grid_levels} is explicitly set.}

\item{grid_levels}{Integer specifying the number of levels per parameter for
grid search. Higher values create denser grids but increase computation time
exponentially (grid size = levels^n_params). Typical values:
\itemize{
  \item 2: Very fast, minimal coverage
  \item 3: Balanced (default via \code{tuning_complexity = "balanced"})
  \item 5: Thorough coverage
  \item 7+: Exhaustive (consider Bayesian tuning instead)
}
If \code{NULL} (default), determined by \code{tuning_complexity}.}

\item{early_stopping}{Logical indicating whether to use early stopping in Bayesian tuning methods (if supported). Default is \code{FALSE}.}

\item{adaptive}{Logical indicating whether to use adaptive/racing methods for tuning. Default is \code{FALSE}.}

\item{learning_curve}{Logical. If \code{TRUE}, generate learning curves (performance vs. training size). Default is \code{FALSE}.}

\item{seed}{An integer value specifying the random seed for reproducibility.
fastml also configures parallel backends for deterministic RNG streams when
possible; some external engines (e.g., h2o, spark, keras) may still be
nondeterministic and will emit a warning.}

\item{verbose}{Logical; if TRUE, prints progress messages during the training
and evaluation process.}

\item{eval_times}{Optional numeric vector of evaluation horizons for survival
models. When \code{NULL}, defaults to the median and 75th percentile of the
observed follow-up times (rounded to the dataset's time unit).}

\item{survival_metric_convention}{Character string specifying which survival
metric conventions to follow. \code{"fastml"} (default) uses fastml's internal
defaults for evaluation horizons and t_max. \code{"tidymodels"} uses
\code{eval_times} as the explicit evaluation grid and applies yardstick-style
Brier/IBS normalization; when \code{eval_times} is \code{NULL}, time-dependent Brier
metrics are omitted.}

\item{bootstrap_ci}{Logical indicating whether bootstrap confidence intervals
should be computed for performance metrics. Applies to all task types.}

\item{bootstrap_samples}{Integer giving the number of bootstrap resamples to
use when \code{bootstrap_ci = TRUE}. Defaults to 500.}

\item{bootstrap_seed}{Optional seed passed to the bootstrap procedure used to
estimate confidence intervals. When omitted, defaults to \code{seed} for
reproducible intervals; set to \code{NULL} to allow random bootstrap draws.}

\item{at_risk_threshold}{Numeric value between 0 and 1 used for survival
metrics to determine the last follow-up time (\eqn{t_{max}}). The maximum
time is set to the largest observed time where at least this proportion of
subjects remain at risk.}

\item{audit_mode}{Logical; if \code{TRUE}, enables runtime auditing of custom
preprocessing hooks and records potentially unsafe behaviour (such as global
environment access or file I/O) while flagging the run as potentially
unsafe.}

\item{multiclass_auc}{For multiclass ROC AUC, the averaging method to use:
\code{"macro"} (default, tidymodels) or \code{"macro_weighted"}. Macro weights each
class equally, while macro_weighted weights by class prevalence and can
change model rankings on imbalanced data.}

\item{store_fold_models}{Logical. If \code{TRUE}, stores the models trained
on each cross-validation fold (memory intensive). This enables
\code{\link{explain_stability}} to compute feature importance across folds
and assess explanation stability. Default is \code{FALSE}.}
}
\value{
An object of class \code{fastml} containing the best model, performance metrics, and other information.
}
\description{
Trains and evaluates multiple classification, regression, or survival models, automatically detecting the task from the target variable.
}
\details{
Fast Machine Learning Function

Trains and evaluates multiple classification, regression, or survival models. The function
automatically detects the task based on the target variable and can perform advanced
hyperparameter tuning using various tuning strategies.


Model selection is based exclusively on resampling metrics (cross-validation
or nested CV). The holdout split is reserved for final performance
estimation and is never used to choose the best model, mirroring
\code{tidymodels::last_fit()} semantics.

For multiclass ROC AUC, fastml defaults to macro averaging (tidymodels).
Macro treats each class equally, while macro_weighted weights by class
prevalence and can change model rankings on imbalanced data. Keep the same
setting when comparing runs.

\strong{Tuning: Speed vs Robustness Trade-offs}

Hyperparameter tuning involves a fundamental trade-off between computational
cost and the likelihood of finding optimal hyperparameters. fastml provides
presets via \code{tuning_complexity} to make this trade-off explicit:

\tabular{lllll}{
  \strong{Level} \tab \strong{Grid Size*} \tab \strong{Time} \tab \strong{Quality} \tab \strong{Use Case} \cr
  quick \tab ~32 \tab ~1x \tab Low \tab Prototyping, debugging \cr
  balanced \tab ~243 \tab ~10x \tab Medium \tab Most production use \cr
  thorough \tab ~3,125 \tab ~100x \tab High \tab Final models, papers \cr
  exhaustive \tab ~16,807 \tab ~1000x \tab Very High \tab Research, competitions \cr
}
*Grid size shown for 5 tunable parameters (levels^5)

\strong{Recommendations:}
\itemize{
  \item Start with \code{tuning_complexity = "quick"} during development
  \item Use \code{"balanced"} (default) for most production pipelines
  \item Switch to \code{"thorough"} for final model selection
  \item Consider \code{tuning_strategy = "bayes"} instead of exhaustive grid search
  \item Enable \code{adaptive = TRUE} for early stopping of poor configurations
}

Use \code{\link{print_tuning_presets}} to see all presets and
\code{\link{estimate_tuning_time}} to estimate runtime before starting.
}
\section{Factor Level Warning}{

For binary classification, the interpretation of metrics like sensitivity, specificity,
and ROC AUC depends on which factor level is treated as the "positive" class (the event
of interest). The \code{event_class} parameter controls this:
\itemize{
  \item \code{"first"} (default): The first factor level is treated as positive
  \item \code{"second"}: The second factor level is treated as positive
}

\strong{Important:} Recipe preprocessing steps like \code{step_other()} or
\code{step_unknown()} can modify factor levels, potentially changing which level
is "first" or "second". Always verify factor levels after preprocessing.

To ensure consistent behavior, explicitly set factor levels before calling fastml:
\preformatted{
# Ensure "positive" is the second level (event_class = "second")
data$outcome <- factor(data$outcome, levels = c("negative", "positive"))

# Or ensure "positive" is the first level (event_class = "first")
data$outcome <- factor(data$outcome, levels = c("positive", "negative"))
}
}

\examples{
\donttest{
# Example 1: Using the iris dataset for binary classification (excluding 'setosa')
data(iris)
iris <- iris[iris$Species != "setosa", ]  # Binary classification
iris$Species <- factor(iris$Species)

# Define a custom tuning grid for the ranger engine
tune <- list(
  rand_forest = list(
    ranger = list(mtry = c(1, 3))
  )
)

# Train models with custom tuning
model <- fastml(
  data = iris,
  label = "Species",
  algorithms = "rand_forest",
  tune_params = tune,
  use_default_tuning = TRUE
)

# View model summary
summary(model)


  }

}
