#' Transformer Model for Time Series Forecasting
#'
#' @description
#' Transformer model for time series forecasting
#'
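#' @param df Input data frame. It must contain a `Date` column (parsed with the format `%d-%m-%Y`) and the numeric column named by `study_variable`.
#' @param study_variable Name of the column in `df` that holds the primary variable of interest (e.g. the closing price).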
#' @param sequence_size Sequence size
#' @param head_size Attention head size
#' @param num_heads Number of attention heads
#' @param ff_dim Size of feed-forward network
#' @param num_transformer_blocks Number of transformer blocks
#' @param mlp_units Units for MLP layers
#' @param mlp_dropout Dropout rate for MLP
#' @param dropout Dropout rate for transformer
#' @param epochs Number of epochs
#' @param batch_size Batch size
#' @param patience Early stopping patience
#'
#' @returns
#' A list containing the following results:
#'
#' - `PREDICTIONS`: The predicted values generated by the model.
#' - `RMSE`: Root Mean Squared Error, measuring the average magnitude of the prediction error.
#' - `MAPE`: Mean Absolute Percentage Error, representing the prediction accuracy as a percentage.
#' - `MAE`: Mean Absolute Error, showing the average absolute difference between actual and predicted values.
#' - `MSE`: Mean Squared Error, quantifying the average squared difference between actual and predicted values.
#' - `sMAPE`: Symmetric Mean Absolute Percentage Error, a variant of MAPE considering both over- and under-predictions.
#' - `RRMSE`: Relative Root Mean Squared Error, RMSE scaled by the mean of the actual values.
#' - `Quantile_Loss`: The quantile loss metric for probabilistic forecasting.
#' - `Loss_plot`: A ggplot object showing the loss curve over iterations or epochs.
#' - `Actual_vs_Predicted`: A ggplot object visualizing the comparison between actual and predicted values.
#'
#' @details
#' This function creates and trains a Transformer-based model for time series
#' forecasting using the Keras library. It allows customization of key architectural
#' parameters such as sequence size, attention head size, number of attention heads,
#' feed-forward network dimensions, number of Transformer blocks, and MLP
#' (multi-layer perceptron) configurations including units and dropout rates.
#'
#' Before running this function, users should install Python on their system and create a virtual conda environment.
#' The Python modules 'tensorflow', 'keras' and 'pandas' must be installed for this package to work. Users who are
#' unfamiliar with these steps can use the `install_r_dependencies()` function provided by this package.
#'
#' The function begins by generating training sequences from the input data (`df`)
#' based on the specified `sequence_size`. Sliding windows of input sequences are
#' created as `x`, while the subsequent values in the series are used as targets (`y`).
#'
#' The model architecture includes an input layer, followed by one or more Transformer
#' encoder blocks, a global average pooling layer for feature aggregation, and MLP
#' layers for further processing. The final output layer is designed for the forecasting task.
#'
#' The model is compiled using the Adam optimizer and the mean squared error (MSE)
#' loss function. Training is performed with the specified number of `epochs`,
#' `batch_size`, and early stopping configured through the `patience` parameter.
#' During training, 20% of the data is used for validation, and the best model weights
#' are restored when validation performance stops improving.
#'
#' The package requires a dataset with two columns: Date (formatted as dates) and the Close price (numerical).
#' After loading the data and formatting it appropriately, the TRANSFORMER function
#' trains a Transformer-based model to predict future closing prices. It outputs
#' essential performance metrics like RMSE, MAPE, and sMAPE, along with visualizations
#' such as training loss trends and an actual vs. predicted plot. These features make
#' it an invaluable tool for understanding and forecasting stock market trends effectively.
#'
#' @importFrom reticulate import py_available py_module_available
#' @export
#'
#' @examples
#' # Load sample data
#' data(S_P_500_Close)
#' df <- S_P_500_Close
#'
#' # Run TRANSFORMER (will use mock results if Python is unavailable)
#' result <- TRANSFORMER(df = df,
#'   study_variable = "Price",
#'   sequence_size = 10,
#'   head_size = 128,
#'   num_heads = 8,
#'   ff_dim = 256,
#'   num_transformer_blocks = 4,
#'   mlp_units = c(128),
#'   mlp_dropout = 0.3,
#'   dropout = 0.2,
#'   epochs = 2,
#'   batch_size = 32,
#'   patience = 15
#' )
#'
#' # Display results
#' result$PREDICTIONS
#' result$RMSE
#' result$MAE
#' result$MAPE
#' result$sMAPE
#' result$Quantile_Loss
#' # Plots are NULL if Python is unavailable
#' if (!is.null(result$Loss_plot)) result$Loss_plot
#' if (!is.null(result$Actual_vs_Predicted)) result$Actual_vs_Predicted
#'
TRANSFORMER <- function(df, study_variable, sequence_size = 10,
                        head_size = 512, num_heads = 4, ff_dim = 4,
                        num_transformer_blocks = 4, mlp_units = c(128),
                        mlp_dropout = 0.4, dropout = 0.25,
                        epochs = 300, batch_size = 64, patience = 10) {
  # Trains a Transformer-style encoder on sliding windows of one numeric
  # column of `df` and reports hold-out metrics and diagnostic plots.
  #
  # The first 90% of the rows are used for training (20% of those windows are
  # held back by Keras for validation); the last 10% form the test set.
  # Returns a named list: PREDICTIONS, RMSE, MAPE, MAE, sMAPE, Quantile_Loss,
  # Loss_plot, Actual_vs_Predicted, MSE, RRMSE. When the Python stack is not
  # detected a warning is raised and placeholder ("mock") results are returned.

  # Pre-declare the column names referenced inside ggplot2::aes() below (the
  # usual way to avoid "no visible binding" notes from R CMD check).
  epoch <- loss <- val_loss <- NULL

  # Check if Python and required modules are available.
  # NOTE(review): reticulate::py_available() appears to default to
  # `initialize = FALSE`, in which case this is TRUE only when Python has
  # already been initialised in the session -- confirm this is intended.
  python_available <- reticulate::py_available() &&
    reticulate::py_module_available("tensorflow") &&
    reticulate::py_module_available("keras")

  if (!python_available) {
    warning("Python, TensorFlow, or Keras not available. Run install_r_dependencies() to set up. Returning mock results for demonstration.")
    # MSE and RRMSE are appended last so the positions of the pre-existing
    # elements do not change.
    return(list(
      PREDICTIONS = df[[study_variable]],
      RMSE = 0,
      MAE = 0,
      MAPE = 0,
      sMAPE = 0,
      Quantile_Loss = 0,
      Loss_plot = NULL,
      Actual_vs_Predicted = NULL,
      MSE = 0,
      RRMSE = 0
    ))
  }

  # ---- Input validation (real training path only) ----
  if (!is.data.frame(df)) {
    stop("`df` must be a data frame.", call. = FALSE)
  }
  series <- df[[study_variable]]
  if (is.null(series) || !is.numeric(series)) {
    stop("`study_variable` must identify a numeric column of `df`.",
         call. = FALSE)
  }
  series <- as.numeric(series)

  sequence_size <- as.integer(sequence_size)
  if (length(sequence_size) != 1L || is.na(sequence_size) ||
      sequence_size < 1L) {
    stop("`sequence_size` must be a single positive integer.", call. = FALSE)
  }

  # Convert Date column to Date type (skipped when the column is absent; the
  # column is not used by the model itself).
  if ("Date" %in% names(df)) {
    df$Date <- as.Date(df$Date, format = "%d-%m-%Y")
  }

  # ---- Train / test split: first 90% of observations vs. the remainder ----
  n_obs <- length(series)
  split_index <- floor(0.9 * n_obs)
  spots_train <- series[seq_len(split_index)]
  spots_test <- series[split_index + seq_len(n_obs - split_index)]

  # Each split must yield at least one (window, target) pair.
  if (length(spots_train) <= sequence_size ||
      length(spots_test) <= sequence_size) {
    stop(
      "Not enough observations: both the training split (first 90% of rows) ",
      "and the test split (last 10% of rows) need more than `sequence_size` (",
      sequence_size, ") values.",
      call. = FALSE
    )
  }

  # Build sliding windows. Returns list(x, y) where x has dim
  # (n_windows, seq_size, 1) with x[i, j, 1] == obs[i + j - 1], and
  # y[i] == obs[i + seq_size] is the value that follows window i.
  to_sequences <- function(seq_size, obs) {
    n_windows <- length(obs) - seq_size
    # idx[i, j] is the position in `obs` of step j of window i. Because the
    # index matrix is flattened column-major, it matches the order in which
    # array() fills its result, so row i really is window i. (Filling the
    # array from window-by-window data would scramble the rows.)
    idx <- outer(seq_len(n_windows), seq_len(seq_size) - 1L, "+")
    x <- array(obs[as.vector(idx)], dim = c(n_windows, seq_size, 1L))
    y <- obs[seq_size + seq_len(n_windows)]
    list(x, y)
  }

  train_data <- to_sequences(sequence_size, spots_train)
  x_train <- train_data[[1]]
  y_train <- train_data[[2]]

  test_data <- to_sequences(sequence_size, spots_test)
  x_test <- test_data[[1]]
  y_test <- test_data[[2]]

  # One encoder block: layer-norm -> attention -> dropout -> residual add,
  # then layer-norm -> 1x1 conv (ff_dim filters, relu) -> dropout -> 1x1 conv
  # back to the input's last dimension -> residual add.
  # NOTE: `head_size` and `num_heads` are accepted but not used here, because
  # the layer_attention() call below is given neither of them.
  transformer_encoder <- function(inputs, head_size, num_heads, ff_dim,
                                  dropout = 0) {
    x <- keras::layer_layer_normalization(epsilon = 1e-6)(inputs)
    x <- keras::layer_attention(use_scale = TRUE)(list(x, x))
    x <- keras::layer_dropout(rate = dropout)(x)
    res <- x + inputs

    x <- keras::layer_layer_normalization(epsilon = 1e-6)(res)
    x <- keras::layer_conv_1d(filters = ff_dim, kernel_size = 1,
                              activation = "relu")(x)
    x <- keras::layer_dropout(rate = dropout)(x)

    # Project back to the channel count of the block input so that the
    # residual addition below is shape-compatible.
    shape <- keras::k_int_shape(inputs)
    filters <- shape[[length(shape)]]
    x <- keras::layer_conv_1d(filters = filters, kernel_size = 1)(x)

    x + res
  }

  # Stack the encoder blocks, pool over the sequence axis, apply the MLP head
  # and finish with a single linear output unit.
  build_model <- function(input_shape, head_size, num_heads, ff_dim,
                          num_transformer_blocks, mlp_units, mlp_dropout,
                          dropout) {
    inputs <- keras::layer_input(shape = input_shape)
    x <- inputs
    # seq_len() so that zero blocks means "no encoder" instead of looping
    # over c(1, 0).
    for (i in seq_len(num_transformer_blocks)) {
      x <- transformer_encoder(x, head_size, num_heads, ff_dim, dropout)
    }

    x <- keras::layer_global_average_pooling_1d()(x)
    for (units in mlp_units) {
      x <- keras::layer_dense(units = units, activation = "relu")(x)
      x <- keras::layer_dropout(rate = mlp_dropout)(x)
    }
    outputs <- keras::layer_dense(units = 1)(x)

    keras::keras_model(inputs, outputs)
  }

  model <- build_model(
    input_shape = dim(x_train)[-1],
    head_size = head_size,
    num_heads = num_heads,
    ff_dim = ff_dim,
    num_transformer_blocks = num_transformer_blocks,
    mlp_units = mlp_units,
    mlp_dropout = mlp_dropout,
    dropout = dropout
  )

  model$compile(
    loss = "mean_squared_error",
    optimizer = keras::optimizer_adam(learning_rate = 1e-4)
  )

  # Train the model with early stopping; the best weights are restored.
  callbacks <- list(
    keras::callback_early_stopping(patience = patience,
                                   restore_best_weights = TRUE)
  )

  history <- model$fit(
    x = as.array(x_train),
    y = as.array(y_train),
    validation_split = 0.2,
    epochs = as.integer(epochs),
    batch_size = as.integer(batch_size),
    callbacks = callbacks
  )

  # float32 tensors are handed to Keras; the R-side `y_test` stays in double
  # precision so the metrics below are not affected by a float32 round trip.
  x_test_tensor <- tensorflow::tf$cast(x_test, dtype = "float32")
  y_test_tensor <- tensorflow::tf$cast(array(y_test), dtype = "float32")

  # Evaluate the model (called for its progress/loss output; verbose = 1).
  model$evaluate(x_test_tensor, y_test_tensor, verbose = 1)

  # Predict the values. `pred` is returned to the caller unchanged; a plain
  # numeric copy is used for the metric arithmetic and plotting.
  pred <- model$predict(x_test_tensor)
  pred_vec <- as.numeric(pred)
  err <- y_test - pred_vec

  score_mse <- mean(err^2)
  score_rmse <- sqrt(score_mse)
  score_mape <- mean(abs(err / y_test)) * 100
  score_mae <- mean(abs(err))

  # sMAPE
  smape <- function(y_true, y_pred) {
    100 / length(y_true) *
      sum(2 * abs(y_pred - y_true) / (abs(y_pred) + abs(y_true)))
  }
  score_smape <- smape(y_test, pred_vec)

  # RRMSE: RMSE relative to the mean of the actual values, in percent.
  score_rrmse <- score_rmse / mean(y_test) * 100

  # Quantile (pinball) loss at quantile q.
  quantile_loss <- function(q, y, f) {
    e <- y - f
    mean(pmax(q * e, (q - 1) * e))
  }
  score_quantile_loss <- quantile_loss(0.5, y_test, pred_vec)

  # Plot actual vs predicted
  plot_actual_vs_predicted <- ggplot2::ggplot() +
    ggplot2::geom_line(
      ggplot2::aes(x = seq_along(y_test), y = y_test, color = "Actual"),
      linewidth = 1
    ) +
    ggplot2::geom_line(
      ggplot2::aes(x = seq_along(pred_vec), y = pred_vec,
                   color = "Predicted"),
      linetype = "dashed", linewidth = 1
    ) +
    ggplot2::scale_color_manual(
      values = c("Actual" = "blue", "Predicted" = "red")
    ) +
    ggplot2::labs(x = "Time", y = study_variable, color = NULL) +
    ggplot2::theme_minimal() +
    ggplot2::theme(
      text = ggplot2::element_text(size = 14, face = "bold"),
      legend.position = "top"
    )

  # Convert the training history to a data frame (one row per epoch run).
  history_df <- data.frame(
    epoch = seq_along(history$history$loss),
    loss = history$history$loss,
    val_loss = history$history$val_loss
  )

  # Plot training and validation loss
  plot_loss <- ggplot2::ggplot(history_df) +
    ggplot2::geom_line(
      ggplot2::aes(x = epoch, y = loss, color = "Training Loss"),
      linewidth = 1
    ) +
    ggplot2::geom_line(
      ggplot2::aes(x = epoch, y = val_loss, color = "Validation Loss"),
      linewidth = 1, linetype = "dashed"
    ) +
    ggplot2::scale_color_manual(
      values = c("Training Loss" = "green", "Validation Loss" = "blue")
    ) +
    ggplot2::labs(x = "Epochs", y = "Loss", color = NULL) +
    ggplot2::theme_minimal() +
    ggplot2::theme(
      text = ggplot2::element_text(size = 14, face = "bold"),
      legend.position = "top"
    )

  # MSE and RRMSE are appended last so the positions of the pre-existing
  # elements do not change.
  list(
    PREDICTIONS = pred,
    RMSE = score_rmse,
    MAPE = score_mape,
    MAE = score_mae,
    sMAPE = score_smape,
    Quantile_Loss = score_quantile_loss,
    Loss_plot = plot_loss,
    Actual_vs_Predicted = plot_actual_vs_predicted,
    MSE = score_mse,
    RRMSE = score_rrmse
  )
}
