maicplus coverage - 88.36%

Files
Source

#' Unanchored MAIC for binary and time-to-event endpoint
#'
#' This is a wrapper function to provide adjusted effect estimates and relevant statistics in unanchored case (i.e.
#' there is no common comparator arm in the internal and external trial).
#'
#' @param weights_object an object returned by \code{estimate_weights}
#' @param ipd a data frame that meet format requirements in 'Details', individual patient data (IPD) of internal trial
#' @param pseudo_ipd a data frame, pseudo IPD from digitized KM curve of external trial (for time-to-event endpoint) or
#'   from contingency table (for binary endpoint)
#' @param trt_ipd  a string, name of the interested investigation arm in internal trial \code{ipd} (real IPD)
#' @param trt_agd a string, name of the interested investigation arm in external trial \code{pseudo_ipd} (pseudo IPD)
#' @param trt_var_ipd a string, column name in \code{ipd} that contains the treatment assignment
#' @param trt_var_agd a string, column name in \code{pseudo_ipd} that contains the treatment assignment
#' @param normalize_weights logical, default is \code{FALSE}. If \code{TRUE},
#'   \code{scaled_weights} (normalized weights) in \code{weights_object$data} will be used.
#' @param endpoint_type a string, one out of the following "binary", "tte" (time to event)
#' @param eff_measure a string, "RD" (risk difference), "OR" (odds ratio), "RR" (relative risk) for a binary endpoint;
#'   "HR" for a time-to-event endpoint. If left unspecified (or \code{NULL}), "OR" is used for binary case, otherwise
#'   "HR" is used.
#' @param boot_ci_type a string, one of `c("norm","basic", "stud", "perc", "bca")` to select the type of bootstrap
#'   confidence interval. See [boot::boot.ci] for more details.
#' @param endpoint_name a string, name of time to event endpoint, to be shown in the last line of title
#' @param time_scale a string, time unit of median survival time, taking a value of 'years', 'months', 'weeks' or
#'   'days'. NOTE: it is assumed that values in TIME column of \code{ipd} and \code{pseudo_ipd} is in the unit of days
#' @param km_conf_type a string, pass to \code{conf.type} of \code{survfit}
#' @param binary_robust_cov_type a string to pass to argument `type` of [sandwich::vcovHC], see possible options in the
#'   documentation of that function. Default is `"HC3"`
#'
#' @details For time-to-event analysis, it is required that input \code{ipd} and \code{pseudo_ipd} to have the following
#'   columns. This function is not sensitive to upper or lower case of letters in column names.
#' \itemize{
#'   \item USUBJID - character, unique subject ID
#'   \item ARM - character or factor, treatment indicator, column name does not have to be 'ARM'. User specify in
#'   \code{trt_var_ipd} and \code{trt_var_agd}
#'   \item EVENT - numeric, 1 for event (e.g. death), 0 for censored
#'   \item TIME - numeric column, observation time of the \code{EVENT}; unit in days
#' }
#'
#' @importFrom survival survfit Surv coxph
#' @importFrom lmtest coeftest coefci
#' @importFrom sandwich vcovHC
#' @importFrom boot boot boot.ci
#' @return A list, contains 'descriptive' and 'inferential'
#' @example inst/examples/maic_unanchored_ex.R
#' @example inst/examples/maic_unanchored_binary_ex.R
#' @export

maic_unanchored <- function(weights_object,
                            ipd,
                            pseudo_ipd,
                            trt_ipd,
                            trt_agd,
                            trt_var_ipd = "ARM",
                            trt_var_agd = "ARM",
                            normalize_weights = FALSE,
                            endpoint_type = "tte",
                            endpoint_name = "Time to Event Endpoint",
                            eff_measure = c("HR", "OR", "RR", "RD"),
                            boot_ci_type = c("norm", "basic", "stud", "perc", "bca"),
                            # time to event specific args
                            time_scale = "months",
                            km_conf_type = "log-log",
                            # binary specific args
                            binary_robust_cov_type = "HC3") {
  # ==> Initial Setup ------------------------------------------
  # ~~~ Create the hull for the output from this function
  res <- list(
    descriptive = list(),
    inferential = list()
  )

  # placeholders for the adjusted (res_AB) and unadjusted (res_AB_unadj) effect estimates,
  # filled in by the endpoint specific worker function
  res_AB_unadj <- res_AB <- list(
    est = NA,
    se = NA,
    ci_l = NA,
    ci_u = NA,
    pval = NA
  )

  # ~~~ Initial colname process and precheck on effect measure
  # column names are upper-cased so that all checks below are case-insensitive
  names(ipd) <- toupper(names(ipd))
  names(pseudo_ipd) <- toupper(names(pseudo_ipd))
  trt_var_ipd <- toupper(trt_var_ipd)
  trt_var_agd <- toupper(trt_var_agd)

  # validate endpoint_type before it is used to look up the default effect measure
  endpoint_type <- match.arg(endpoint_type, c("binary", "tte"))
  # an untouched eff_measure (vector default) is treated like NULL: pick the default per endpoint type
  if (length(eff_measure) > 1) eff_measure <- NULL
  if (is.null(eff_measure)) eff_measure <- list(binary = "OR", tte = "HR")[[endpoint_type]]

  # ~~~ Setup ARM column and make related pre-checks
  if (!trt_var_ipd %in% names(ipd)) stop("cannot find arm indicator column trt_var_ipd in ipd")
  if (!trt_var_agd %in% names(pseudo_ipd)) stop("cannot find arm indicator column trt_var_agd in pseudo_ipd")
  if (trt_var_ipd != "ARM") ipd$ARM <- ipd[[trt_var_ipd]]
  if (trt_var_agd != "ARM") pseudo_ipd$ARM <- pseudo_ipd[[trt_var_agd]]
  ipd$ARM <- as.character(ipd$ARM) # just to avoid potential error when merging
  pseudo_ipd$ARM <- as.character(pseudo_ipd$ARM) # just to avoid potential error when merging
  if (!trt_ipd %in% ipd$ARM) stop("trt_ipd does not exist in ipd$ARM")
  if (!trt_agd %in% pseudo_ipd$ARM) stop("trt_agd does not exist in pseudo_ipd$ARM")

  # ~~~ More pre-checks
  if (!inherits(weights_object, "maicplus_estimate_weights")) {
    stop("weights_object should be an object returned by estimate_weights")
  }
  if (any(duplicated(ipd$USUBJID))) {
    warning(
      "check your ipd, it has duplicated usubjid, this indicates, ",
      "it might contain multiple endpoints for each subject"
    )
  }
  if (!all(ipd$USUBJID %in% weights_object$data$USUBJID)) {
    # report only the ids that are really missing from the weights data
    stop(
      "These pts in ipd cannot be found in weights_object ",
      toString(setdiff(ipd$USUBJID, weights_object$data$USUBJID))
    )
  }
  time_scale <- match.arg(arg = time_scale, choices = c("days", "weeks", "months", "years"))
  if (endpoint_type == "binary") { # for binary effect measure

    if (any(!c("USUBJID", "RESPONSE") %in% names(ipd))) stop("ipd should have 'USUBJID', 'RESPONSE' columns at minimum")
    # RESPONSE is also selected from pseudo_ipd further below, so fail early with a clear message
    if (!"RESPONSE" %in% names(pseudo_ipd)) stop("pseudo_ipd should have 'RESPONSE' column at minimum")
    eff_measure <- match.arg(eff_measure, choices = c("OR", "RD", "RR"), several.ok = FALSE)
    binary_robust_cov_type <- match.arg(
      binary_robust_cov_type,
      choices = c("HC3", "const", "HC", "HC0", "HC1", "HC2", "HC4", "HC4m", "HC5")
    )
  } else { # for time to event effect measure

    if (!all(c("USUBJID", "TIME", "EVENT", trt_var_ipd) %in% names(ipd))) {
      stop("ipd needs to include at least USUBJID, TIME, EVENT, ", trt_var_ipd)
    }
    if (!all(c("TIME", "EVENT", trt_var_agd) %in% names(pseudo_ipd))) {
      stop("pseudo_ipd needs to include at least TIME, EVENT, ", trt_var_agd)
    }
    eff_measure <- match.arg(eff_measure, choices = c("HR"), several.ok = FALSE)
  }
  boot_ci_type <- match.arg(boot_ci_type)

  # ==> IPD and AgD data preparation ------------------------------------------
  # : subset ipd, retain only ipd from interested trts
  # drop = FALSE keeps a data frame even when a single row (patient) matches
  ipd <- ipd[ipd$ARM == trt_ipd, , drop = FALSE]
  pseudo_ipd <- pseudo_ipd[pseudo_ipd$ARM == trt_agd, , drop = FALSE]

  # : assign weights to real and pseudo ipd
  weight_col <- if (normalize_weights) "scaled_weights" else "weights"
  ipd$weights <- weights_object$data[[weight_col]][match(ipd$USUBJID, weights_object$data$USUBJID)]
  pseudo_ipd$weights <- 1 # pseudo IPD is never re-weighted

  # : necessary formatting for pseudo ipd
  if (!"USUBJID" %in% names(pseudo_ipd)) pseudo_ipd$USUBJID <- paste0("ID", seq_len(nrow(pseudo_ipd)))
  if ("RESPONSE" %in% names(pseudo_ipd) && is.logical(pseudo_ipd$RESPONSE)) {
    pseudo_ipd$RESPONSE <- as.numeric(pseudo_ipd$RESPONSE)
  }

  # : give warning when individual pts in IPD has no weights
  no_weight <- is.na(ipd$weights)
  if (any(no_weight)) {
    # collect the ids before the rows are removed, otherwise the warning lists nothing
    dropped_ids <- ipd$USUBJID[no_weight]
    ipd <- ipd[!no_weight, , drop = FALSE]
    warning(
      paste(
        "these usubjid in ipd have no weight in weights_object, and hence excluded from analysis:",
        paste(dropped_ids, collapse = ",")
      )
    )
    if (nrow(ipd) == 0) stop("there is no pts with weight in IPD!!")
  }

  # : retain necessary columns
  if (endpoint_type == "tte") {
    retain_cols <- c("USUBJID", "ARM", "TIME", "EVENT", "weights")
  } else {
    retain_cols <- c("USUBJID", "ARM", "RESPONSE", "weights")
  }
  ipd <- ipd[, retain_cols, drop = FALSE]
  pseudo_ipd <- pseudo_ipd[, retain_cols, drop = FALSE]

  # : merge real and pseudo ipds
  dat <- rbind(ipd, pseudo_ipd)
  # external arm is the reference level, so effects are reported as trt_ipd vs trt_agd
  dat$ARM <- factor(dat$ARM, levels = c(trt_agd, trt_ipd))

  # ==> Inferential output ------------------------------------------
  # endpoint_type was validated by match.arg() above, so only "tte" and "binary" can reach this point
  result <- if (endpoint_type == "tte") {
    maic_unanchored_tte(
      res, res_AB, res_AB_unadj, dat, ipd, pseudo_ipd, km_conf_type, time_scale,
      weights_object, endpoint_name, normalize_weights, boot_ci_type, trt_ipd, trt_agd
    )
  } else {
    maic_unanchored_binary(
      res, res_AB, res_AB_unadj, dat, ipd, pseudo_ipd, binary_robust_cov_type,
      weights_object, endpoint_name, normalize_weights, eff_measure, boot_ci_type, trt_ipd, trt_agd
    )
  }

  # output
  result
}

# MAIC inference functions for TTE outcome type ------------

# Internal worker of maic_unanchored() for a time-to-event endpoint.
#
# res                    output skeleton (list with 'descriptive' and 'inferential') to be filled in
# res_AB, res_AB_unadj   placeholder lists (est, se, ci_l, ci_u, pval) for adjusted / unadjusted estimates
# dat                    stacked IPD + pseudo IPD with columns USUBJID, ARM (factor, trt_agd as reference),
#                        TIME, EVENT and weights
# ipd, pseudo_ipd        the two data frames that were stacked into dat (used for bootstrapping)
# km_conf_type           passed to conf.type of survfit
# time_scale             passed on to medSurv_makeup
# weights_object         weights object; its boot, boot_seed, boot_strata, data and rows_with_missing
#                        elements are used when bootstrap samples are available
# endpoint_name          currently not used in this function
# normalize_weights      logical, whether bootstrap weights are divided by their mean
# boot_ci_type           type of bootstrap confidence interval, passed to boot.ci
# trt_ipd, trt_agd       arm names, used to set the factor levels of ARM in bootstrap samples
#
# Returns res with descriptive (KM data, median survival summary) and inferential (fits, HR summary) parts.
maic_unanchored_tte <- function(res,
                                res_AB,
                                res_AB_unadj,
                                dat,
                                ipd,
                                pseudo_ipd,
                                km_conf_type,
                                time_scale,
                                weights_object,
                                endpoint_name,
                                normalize_weights,
                                boot_ci_type,
                                trt_ipd,
                                trt_agd) {
  # ~~~ Descriptive table before and after matching
  # : derive km w and w/o weights
  kmobj_dat <- survfit(Surv(TIME, EVENT) ~ ARM, dat, conf.type = km_conf_type)
  kmobj_dat_adj <- survfit(Surv(TIME, EVENT) ~ ARM, dat, weights = dat$weights, conf.type = km_conf_type)
  res$descriptive[["survfit_before"]] <- survfit_makeup(kmobj_dat)
  res$descriptive[["survfit_after"]] <- survfit_makeup(kmobj_dat_adj)
  # : derive median survival time
  medSurv_dat <- medSurv_makeup(kmobj_dat, legend = "Before matching", time_scale = time_scale)
  medSurv_dat_adj <- medSurv_makeup(kmobj_dat_adj, legend = "After matching", time_scale = time_scale)
  medSurv_out <- rbind(medSurv_dat, medSurv_dat_adj)
  # levels(dat$ARM) is c(trt_agd, trt_ipd), hence the external arm is labelled "B" and the internal arm "A"
  medSurv_out <- cbind(trt_ind = c("B", "A")[match(medSurv_out$treatment, levels(dat$ARM))], medSurv_out)

  res$descriptive[["summary"]] <- medSurv_out

  # ~~~ Analysis table (Cox model) before and after matching
  # : fit PH Cox regression model
  coxobj_dat <- coxph(Surv(TIME, EVENT) ~ ARM, dat)
  coxobj_dat_adj <- coxph(Surv(TIME, EVENT) ~ ARM, dat, weights = weights, robust = TRUE)
  cox_sum <- summary(coxobj_dat)
  cox_sum_adj <- summary(coxobj_dat_adj)

  # : derive adjusted estimate for ipd exp arm vs agd exp arm
  # with robust = TRUE the coefficient table has an extra "robust se" column, so the positions used
  # here (4 = robust se, 6 = p value) differ from the unadjusted model below (3 = se, 5 = p value)
  res_AB$est <- cox_sum_adj$conf.int[1]
  mu <- cox_sum_adj$coefficients[1]
  sig <- cox_sum_adj$coefficients[4]
  res_AB$se <- sqrt((exp(sig^2) - 1) * exp(2 * mu + sig^2)) # log normal parametrization
  res_AB$ci_l <- cox_sum_adj$conf.int[3]
  res_AB$ci_u <- cox_sum_adj$conf.int[4]
  res_AB$pval <- cox_sum_adj$coefficients[6]

  # : derive unadjusted estimate
  res_AB_unadj$est <- cox_sum$conf.int[1]
  mu <- cox_sum$coefficients[1]
  sig <- cox_sum$coefficients[3]
  res_AB_unadj$se <- sqrt((exp(sig^2) - 1) * exp(2 * mu + sig^2)) # log normal parametrization
  res_AB_unadj$ci_l <- cox_sum$conf.int[3]
  res_AB_unadj$ci_u <- cox_sum$conf.int[4]
  res_AB_unadj$pval <- cox_sum$coefficients[5]

  # : get bootstrapped estimates if applicable
  if (!is.null(weights_object$boot)) {
    keep_rows <- setdiff(seq_len(nrow(weights_object$data)), weights_object$rows_with_missing)
    boot_ipd_id <- weights_object$data[keep_rows, "USUBJID", drop = FALSE]

    boot_ipd <- merge(boot_ipd_id, ipd, by = "USUBJID", all.x = TRUE)
    if (nrow(boot_ipd) != nrow(boot_ipd_id)) stop("ipd has multiple observations for some patients")
    # merge() sorts by USUBJID; restore the row order of the weights data, because the bootstrap
    # indices and weights stored in weights_object$boot are applied by row position
    boot_ipd <- boot_ipd[match(boot_ipd_id$USUBJID, boot_ipd$USUBJID), ]

    # statistic for boot(): log HR and its variance from a weighted Cox model on one bootstrap sample
    stat_fun <- function(data, index, w_obj, pseudo_ipd, normalize) {
      boot_ipd <- data[index, ]
      r <- dynGet("r", ifnotfound = NA) # Get bootstrap iteration
      if (!is.na(r)) {
        # the resampled rows must be the ones the stored bootstrap weights were estimated on
        if (!all(index == w_obj$boot[, 1, r])) stop("Bootstrap and weight indices don't match")
        boot_ipd$weights <- w_obj$boot[, 2, r]
        if (normalize) boot_ipd$weights <- boot_ipd$weights / mean(boot_ipd$weights, na.rm = TRUE)
      }
      boot_dat <- rbind(boot_ipd, pseudo_ipd)
      boot_dat$ARM <- factor(boot_dat$ARM, levels = c(trt_agd, trt_ipd))
      boot_coxobj_dat_adj <- coxph(Surv(TIME, EVENT) ~ ARM, boot_dat, weights = weights)
      c(est = coef(boot_coxobj_dat_adj)[1], var = vcov(boot_coxobj_dat_adj)[1, 1])
    }

    # Revert seed to how it was for weight bootstrap sampling
    old_seed <- globalenv()$.Random.seed
    on.exit(suspendInterrupts(set_random_seed(old_seed)), add = TRUE)
    set_random_seed(weights_object$boot_seed)
    R <- dim(weights_object$boot)[3]

    boot_res <- boot(
      boot_ipd,
      stat_fun,
      R = R,
      w_obj = weights_object,
      pseudo_ipd = pseudo_ipd,
      normalize = normalize_weights,
      strata = weights_object$boot_strata
    )
    boot_ci <- boot.ci(boot_res, type = boot_ci_type, w_obj = weights_object, pseudo_ipd = pseudo_ipd)

    # position of lower / upper limit and name of the element in the boot.ci output per interval type
    l_u_index <- switch(boot_ci_type,
      "norm" = list(2, 3, "normal"),
      "basic" = list(4, 5, "basic"),
      "stud" = list(4, 5, "student"),
      "perc" = list(4, 5, "percent"),
      "bca" = list(4, 5, "bca")
    )

    # bootstrap statistic is on the log scale, report on the HR scale
    transform_estimate <- exp
    boot_res_AB <- list(
      est = as.vector(transform_estimate(boot_res$t0[1])),
      se = NA,
      ci_l = transform_estimate(boot_ci[[l_u_index[[3]]]][l_u_index[[1]]]),
      ci_u = transform_estimate(boot_ci[[l_u_index[[3]]]][l_u_index[[2]]]),
      pval = NA
    )
  } else {
    boot_res_AB <- NULL
    boot_res <- NULL
  }

  # : report all raw fitted obj
  res$inferential[["fit"]] <- list(
    km_before = kmobj_dat,
    km_after = kmobj_dat_adj,
    model_before = coxobj_dat,
    model_after = coxobj_dat_adj,
    res_AB = res_AB,
    res_AB_unadj = res_AB_unadj,
    boot_res = boot_res,
    boot_res_AB = boot_res_AB
  )

  # : compile HR result
  res$inferential[["summary"]] <- data.frame(
    case = c("AB", "adjusted_AB"),
    HR = c(res_AB_unadj$est, res_AB$est),
    LCL = c(res_AB_unadj$ci_l, res_AB$ci_l),
    UCL = c(res_AB_unadj$ci_u, res_AB$ci_u),
    pval = c(res_AB_unadj$pval, res_AB$pval)
  )

  # output
  res
}

# MAIC inference functions for Binary outcome type ------------

# Internal worker of maic_unanchored() for a binary endpoint.
#
# res                      output skeleton (list with 'descriptive' and 'inferential') to be filled in
# res_AB, res_AB_unadj     placeholder lists (est, se, ci_l, ci_u, pval) for adjusted / unadjusted estimates
# dat                      stacked IPD + pseudo IPD with columns USUBJID, ARM (factor, trt_agd as reference),
#                          RESPONSE and weights
# ipd, pseudo_ipd          the two data frames that were stacked into dat (used for bootstrapping)
# binary_robust_cov_type   passed to argument type of sandwich::vcovHC
# weights_object           weights object; its boot, boot_seed, boot_strata, data and rows_with_missing
#                          elements are used when bootstrap samples are available
# endpoint_name            currently not used in this function
# normalize_weights        logical, whether bootstrap weights are divided by their mean
# eff_measure              "OR", "RR" or "RD"; selects the glm link and the back-transformation
# boot_ci_type             type of bootstrap confidence interval, passed to boot.ci
# trt_ipd, trt_agd         arm names, used to set the factor levels of ARM in bootstrap samples
#
# Returns res with a descriptive summary and the inferential part (fits and effect estimate summary).
maic_unanchored_binary <- function(res,
                                   res_AB,
                                   res_AB_unadj,
                                   dat,
                                   ipd,
                                   pseudo_ipd,
                                   binary_robust_cov_type,
                                   weights_object,
                                   endpoint_name,
                                   normalize_weights,
                                   eff_measure,
                                   boot_ci_type,
                                   trt_ipd,
                                   trt_agd) {
  # ~~~ Analysis table
  # : set up proper link
  glm_link <- switch(eff_measure,
    "RD" = "identity",
    "RR" = "log",
    "OR" = "logit"
  )
  # back-transformation used for the bootstrap results: percentage scale for RD, ratio scale otherwise
  transform_estimate <- switch(eff_measure,
    "RD" = function(x) x * 100,
    "RR" = exp,
    "OR" = exp
  )

  # : fit glm for binary outcome and robust estimate with weights
  binobj_dat <- glm(RESPONSE ~ ARM, dat, family = binomial(link = glm_link))
  binobj_dat_adj <- suppressWarnings(glm(RESPONSE ~ ARM, dat, weights = weights, family = binomial(link = glm_link)))

  bin_robust_cov <- sandwich::vcovHC(binobj_dat_adj, type = binary_robust_cov_type)
  bin_robust_coef <- lmtest::coeftest(binobj_dat_adj, vcov. = bin_robust_cov)
  bin_robust_ci <- lmtest::coefci(binobj_dat_adj, vcov. = bin_robust_cov)

  # : make general summary
  glmDesc_dat <- glm_makeup(binobj_dat, legend = "Before matching", weighted = FALSE)
  glmDesc_dat_adj <- glm_makeup(binobj_dat_adj, legend = "After matching", weighted = TRUE)
  glmDesc <- rbind(glmDesc_dat, glmDesc_dat_adj)
  # levels(dat$ARM) is c(trt_agd, trt_ipd), hence the external arm is labelled "B" and the internal arm "A"
  glmDesc <- cbind(trt_ind = c("B", "A")[match(glmDesc$treatment, levels(dat$ARM))], glmDesc)
  rownames(glmDesc) <- NULL
  res$descriptive[["summary"]] <- glmDesc

  # : derive adjusted estimate
  # row 2 of the coefficient tables is the ARM effect (row 1 is the intercept)
  res_AB$est <- bin_robust_coef[2, "Estimate"]
  res_AB$se <- bin_robust_coef[2, "Std. Error"]
  res_AB$ci_l <- bin_robust_ci[2, "2.5 %"]
  res_AB$ci_u <- bin_robust_ci[2, "97.5 %"]
  res_AB$pval <- bin_robust_coef[2, "Pr(>|z|)"]

  # : derive unadjusted estimate
  binobj_dat_summary <- summary(binobj_dat)
  binobj_dat_ci <- confint.default(binobj_dat)
  res_AB_unadj$est <- binobj_dat_summary$coefficients[2, "Estimate"]
  res_AB_unadj$se <- binobj_dat_summary$coefficients[2, "Std. Error"]
  res_AB_unadj$ci_l <- binobj_dat_ci[2, "2.5 %"]
  res_AB_unadj$ci_u <- binobj_dat_ci[2, "97.5 %"]
  res_AB_unadj$pval <- binobj_dat_summary$coefficients[2, "Pr(>|z|)"]

  # : transform
  if (eff_measure %in% c("RR", "OR")) {
    res_AB <- transform_ratio(res_AB)
    res_AB_unadj <- transform_ratio(res_AB_unadj)
  } else if (eff_measure == "RD") {
    res_AB <- transform_absolute(res_AB)
    res_AB_unadj <- transform_absolute(res_AB_unadj)
  }

  # : get bootstrapped estimates if applicable
  if (!is.null(weights_object$boot)) {
    keep_rows <- setdiff(seq_len(nrow(weights_object$data)), weights_object$rows_with_missing)
    boot_ipd_id <- weights_object$data[keep_rows, "USUBJID", drop = FALSE]

    boot_ipd <- merge(boot_ipd_id, ipd, by = "USUBJID", all.x = TRUE)
    if (nrow(boot_ipd) != nrow(boot_ipd_id)) stop("ipd has multiple observations for some patients")
    # merge() sorts by USUBJID; restore the row order of the weights data, because the bootstrap
    # indices and weights stored in weights_object$boot are applied by row position
    boot_ipd <- boot_ipd[match(boot_ipd_id$USUBJID, boot_ipd$USUBJID), ]

    # statistic for boot(): ARM coefficient and its variance from a weighted glm on one bootstrap sample
    stat_fun <- function(data, index, w_obj, pseudo_ipd, normalize) {
      boot_ipd <- data[index, ]
      r <- dynGet("r", ifnotfound = NA) # Get bootstrap iteration
      if (!is.na(r)) {
        # the resampled rows must be the ones the stored bootstrap weights were estimated on
        if (!all(index == w_obj$boot[, 1, r])) stop("Bootstrap and weight indices don't match")
        boot_ipd$weights <- w_obj$boot[, 2, r]
        if (normalize) boot_ipd$weights <- boot_ipd$weights / mean(boot_ipd$weights, na.rm = TRUE)
      }
      boot_dat <- rbind(boot_ipd, pseudo_ipd)
      boot_dat$ARM <- factor(boot_dat$ARM, levels = c(trt_agd, trt_ipd))
      boot_binobj_dat_adj <- suppressWarnings(
        glm(RESPONSE ~ ARM, boot_dat, weights = weights, family = binomial(link = glm_link))
      )
      c(est = coef(boot_binobj_dat_adj)[2], var = vcov(boot_binobj_dat_adj)[2, 2])
    }

    # Revert seed to how it was for weight bootstrap sampling
    old_seed <- globalenv()$.Random.seed
    on.exit(suspendInterrupts(set_random_seed(old_seed)), add = TRUE)
    set_random_seed(weights_object$boot_seed)
    R <- dim(weights_object$boot)[3]
    boot_res <- boot(
      boot_ipd,
      stat_fun,
      R = R,
      w_obj = weights_object,
      pseudo_ipd = pseudo_ipd,
      normalize = normalize_weights,
      strata = weights_object$boot_strata
    )
    boot_ci <- boot.ci(boot_res, type = boot_ci_type, w_obj = weights_object, pseudo_ipd = pseudo_ipd)

    # position of lower / upper limit and name of the element in the boot.ci output per interval type
    l_u_index <- switch(boot_ci_type,
      "norm" = list(2, 3, "normal"),
      "basic" = list(4, 5, "basic"),
      "stud" = list(4, 5, "student"),
      "perc" = list(4, 5, "percent"),
      "bca" = list(4, 5, "bca")
    )

    boot_res_AB <- list(
      est = as.vector(transform_estimate(boot_res$t0[1])),
      se = NA,
      ci_l = transform_estimate(boot_ci[[l_u_index[[3]]]][l_u_index[[1]]]),
      ci_u = transform_estimate(boot_ci[[l_u_index[[3]]]][l_u_index[[2]]]),
      pval = NA
    )
  } else {
    boot_res_AB <- NULL
    boot_res <- NULL
  }

  # : report all raw fitted obj
  res$inferential[["fit"]] <- list(
    model_before = binobj_dat,
    model_after = binobj_dat_adj,
    res_AB = res_AB,
    res_AB_unadj = res_AB_unadj,
    boot_res = boot_res,
    boot_res_AB = boot_res_AB
  )

  # : compile binary effect estimate result
  res$inferential[["summary"]] <- data.frame(
    case = c("AB", "adjusted_AB"),
    EST = c(res_AB_unadj$est, res_AB$est),
    LCL = c(res_AB_unadj$ci_l, res_AB$ci_l),
    UCL = c(res_AB_unadj$ci_u, res_AB$ci_u),
    pval = c(res_AB_unadj$pval, res_AB$pval)
  )
  # the estimate column is named after the effect measure ("OR", "RR" or "RD")
  names(res$inferential[["summary"]])[2] <- eff_measure

  # : output
  res
}

#' Kaplan Meier (KM) plot function for anchored and unanchored cases
#'
#' It is a wrapper function of \code{basic_kmplot}. The argument setting is similar to \code{maic_anchored} and
#' \code{maic_unanchored}, and it is used in those two functions.
#'
#' @param weights_object an object returned by \code{estimate_weights}
#' @param tte_ipd a data frame of individual patient data (IPD) of internal trial, contain at least `"USUBJID"`,
#'   `"EVENT"`, `"TIME"` columns and a column indicating treatment assignment
#' @param tte_pseudo_ipd a data frame of pseudo IPD by digitized KM curves of external trial (for time-to-event
#'   endpoint), contain at least `"EVENT"`, `"TIME"`
#' @param trt_ipd  a string, name of the interested investigation arm in internal trial \code{tte_ipd} (real IPD)
#' @param trt_agd a string, name of the interested investigation arm in external trial \code{tte_pseudo_ipd} (pseudo
#'   IPD)
#' @param trt_common a string, name of the common comparator in internal and external trial, by default is NULL,
#'   indicating unanchored case
#' @param trt_var_ipd a string, column name in \code{tte_ipd} that contains the treatment assignment
#' @param trt_var_agd a string, column name in \code{tte_pseudo_ipd} that contains the treatment assignment
#' @param normalize_weights logical, default is \code{FALSE}. If \code{TRUE},
#'   \code{scaled_weights} (normalized weights) in \code{weights_object$data} will be used.
#' @param km_conf_type a string, pass to \code{conf.type} of \code{survfit}
#' @param km_layout a string, only applicable for anchored case (\code{trt_common} is not \code{NULL}), indicates the
#'   desired layout of output KM curves, one of "all" (default), "by_trial" or "by_arm".
#' @param ... other arguments in \code{basic_kmplot}
#'
#' @return In unanchored case, a KM plot with risk set table. In anchored case, depending on \code{km_layout},
#' \itemize{
#'   \item if "by_trial", 2 by 1 plot, first all KM curves (incl. weighted) in IPD trial, and then KM curves in AgD
#'   trial, with risk set table.
#'   \item if "by_arm", 2 by 1 plot, first KM curves of \code{trt_agd} and  \code{trt_ipd} (with and without weights),
#'    and then KM curves of \code{trt_common} in AgD trial and IPD trial (with and without weights). Risk set table is
#'     appended.
#'   \item if "all", 2 by 2 plot, all plots in "by_trial" and "by_arm" without risk set table appended.
#' }
#' @example inst/examples/kmplot_unanchored_ex.R
#' @example inst/examples/kmplot_anchored_ex.R
#' @export

kmplot <- function(weights_object,
                   tte_ipd,
                   tte_pseudo_ipd,
                   trt_ipd,
                   trt_agd,
                   trt_common = NULL,
                   normalize_weights = FALSE,
                   trt_var_ipd = "ARM",
                   trt_var_agd = "ARM",
                   km_conf_type = "log-log",
                   km_layout = c("all", "by_trial", "by_arm"),
                   ...) {
  # column names are upper-cased so that the checks below are case-insensitive
  names(tte_ipd) <- toupper(names(tte_ipd))
  names(tte_pseudo_ipd) <- toupper(names(tte_pseudo_ipd))
  trt_var_ipd <- toupper(trt_var_ipd)
  trt_var_agd <- toupper(trt_var_agd)

  # pre check
  if (!inherits(weights_object, "maicplus_estimate_weights")) {
    stop("weights_object should be an object returned by estimate_weights")
  }
  if (!all(c("USUBJID", "TIME", "EVENT", trt_var_ipd) %in% names(tte_ipd))) {
    stop("tte_ipd needs to include at least USUBJID, TIME, EVENT, ", trt_var_ipd)
  }
  if (!all(c("TIME", "EVENT", trt_var_agd) %in% names(tte_pseudo_ipd))) {
    stop("tte_pseudo_ipd needs to include at least TIME, EVENT, ", trt_var_agd)
  }
  km_layout <- match.arg(km_layout, choices = c("all", "by_trial", "by_arm"), several.ok = FALSE)

  # preparing data
  is_anchored <- !is.null(trt_common)
  tte_ipd <- tte_ipd[tte_ipd[[trt_var_ipd]] %in% c(trt_ipd, trt_common), , drop = FALSE]
  tte_pseudo_ipd <- tte_pseudo_ipd[tte_pseudo_ipd[[trt_var_agd]] %in% c(trt_agd, trt_common), , drop = FALSE]

  # look up the weight of every IPD row by its USUBJID (one value per row of tte_ipd, NA if not found)
  weight_col <- if (normalize_weights) "scaled_weights" else "weights"
  tte_ipd$weights <- weights_object$data[[weight_col]][match(tte_ipd$USUBJID, weights_object$data$USUBJID)]
  tte_pseudo_ipd$weights <- 1

  # fit a KM curve of `data` by `trt_var`, optionally restricted to arm `arm` and optionally weighted
  # by the `weights` column of `data`
  fit_km <- function(data, trt_var, arm = NULL, weighted = FALSE) {
    if (!is.null(arm)) data <- data[data[[trt_var]] %in% arm, , drop = FALSE]
    km_formula <- as.formula(paste("Surv(TIME, EVENT) ~", trt_var))
    if (weighted) {
      survfit(km_formula, data = data, conf.type = km_conf_type, weights = weights)
    } else {
      survfit(km_formula, data = data, conf.type = km_conf_type)
    }
  }

  # stack the per-curve data returned by survfit_makeup and fix the curve order as given
  stack_kmdat <- function(...) {
    kmdat <- do.call(rbind, c(...))
    kmdat$treatment <- factor(kmdat$treatment, levels = unique(kmdat$treatment))
    kmdat
  }

  # generate plot
  if (!is_anchored) {
    ## unanchored case
    kmobj_B <- fit_km(tte_pseudo_ipd, trt_var_agd)
    kmobj_A <- fit_km(tte_ipd, trt_var_ipd)
    kmobj_A_adj <- fit_km(tte_ipd, trt_var_ipd, weighted = TRUE)

    kmdat <- stack_kmdat(
      survfit_makeup(kmobj_B, trt_agd),
      survfit_makeup(kmobj_A, trt_ipd),
      survfit_makeup(kmobj_A_adj, paste(trt_ipd, "(weighted)"))
    )

    basic_kmplot(kmdat,
      show_risk_set = TRUE,
      main_title = "Kaplan-Meier Curves",
      subplot_heights = NULL,
      suppress_plot_layout = FALSE,
      ...
    )
  } else {
    # anchored case
    # - agd trial km data
    kmobj_C_S1 <- fit_km(tte_pseudo_ipd, trt_var_agd, arm = trt_common)
    kmobj_B_S1 <- fit_km(tte_pseudo_ipd, trt_var_agd, arm = trt_agd)
    # - ipd trial km data
    kmobj_C_S2 <- fit_km(tte_ipd, trt_var_ipd, arm = trt_common)
    kmobj_A_S2 <- fit_km(tte_ipd, trt_var_ipd, arm = trt_ipd)
    # - ipd trial km data with weights
    kmobj_Cadj_S2 <- fit_km(tte_ipd, trt_var_ipd, arm = trt_common, weighted = TRUE)
    kmobj_Aadj_S2 <- fit_km(tte_ipd, trt_var_ipd, arm = trt_ipd, weighted = TRUE)

    # - plotdat for layout by trial
    kmdat_s2 <- stack_kmdat(
      survfit_makeup(kmobj_C_S2, trt_common),
      survfit_makeup(kmobj_A_S2, trt_ipd),
      survfit_makeup(kmobj_Aadj_S2, paste(trt_ipd, "(weighted)")),
      survfit_makeup(kmobj_Cadj_S2, paste(trt_common, "(weighted)"))
    )
    kmdat_s1 <- stack_kmdat(
      survfit_makeup(kmobj_C_S1, trt_common),
      survfit_makeup(kmobj_B_S1, trt_agd)
    )
    # - plotdat for layout by arm
    kmdat_a2 <- stack_kmdat(
      survfit_makeup(kmobj_B_S1, trt_agd),
      survfit_makeup(kmobj_A_S2, trt_ipd),
      survfit_makeup(kmobj_Aadj_S2, paste(trt_ipd, "(weighted)"))
    )
    kmdat_a1 <- stack_kmdat(
      survfit_makeup(kmobj_C_S1, paste(trt_common, "(AgD)")),
      survfit_makeup(kmobj_C_S2, paste(trt_common, "(IPD)")),
      survfit_makeup(kmobj_Cadj_S2, paste(trt_common, "(IPD,weighted)"))
    )

    # - titles shared by the layouts
    title_s2 <- paste0("Kaplan-Meier Curves \n(", trt_ipd, " vs ", trt_common, ") in the IPD trial")
    title_s1 <- paste0("Kaplan-Meier Curves \n(", trt_agd, " vs ", trt_common, ") in the AgD trial")
    title_a2 <- paste0("Kaplan-Meier Curves \n(", trt_ipd, " vs ", trt_agd, ")")
    title_a1 <- paste0("Kaplan-Meier Curves of Common Comparator \n", trt_common, "(IPD vs AgD Trial)")

    # make plot depending on the layout
    if (km_layout == "by_trial") {
      # 1 by 2 plot, each plot is per trial
      subplot_heights <- c(7, 0.7 + 2 * 0.7, 0.8)
      layout_mat <- matrix(1:4, ncol = 2)
      layout(layout_mat, heights = subplot_heights)

      basic_kmplot(kmdat_s2, main_title = title_s2, suppress_plot_layout = TRUE, ...)
      basic_kmplot(kmdat_s1, main_title = title_s1, suppress_plot_layout = TRUE, ...)
    } else if (km_layout == "by_arm") {
      # 1 by 2 plot, by 1 is for investigational arm, the other is for common comparator
      subplot_heights <- c(7, 0.7 + 2 * 0.7, 0.8)
      layout_mat <- matrix(1:4, ncol = 2)
      layout(layout_mat, heights = subplot_heights)

      basic_kmplot(kmdat_a2, main_title = title_a2, suppress_plot_layout = TRUE, ...)
      basic_kmplot(kmdat_a1, main_title = title_a1, suppress_plot_layout = TRUE, ...)
    } else {
      # 2 by 2 plot, combine by trial and by arm
      layout_mat <- matrix(1:4, ncol = 2, byrow = TRUE)
      layout(layout_mat)

      basic_kmplot(kmdat_s2, main_title = title_s2, show_risk_set = FALSE, suppress_plot_layout = TRUE, ...)
      basic_kmplot(kmdat_s1, main_title = title_s1, show_risk_set = FALSE, suppress_plot_layout = TRUE, ...)
      basic_kmplot(kmdat_a2, main_title = title_a2, show_risk_set = FALSE, suppress_plot_layout = TRUE, ...)
      basic_kmplot(kmdat_a1, main_title = title_a1, show_risk_set = FALSE, suppress_plot_layout = TRUE, ...)
    }
  }
  invisible(NULL)
}


#' Basic Kaplan Meier (KM) plot function
#'
#' This function can generate a basic KM plot with or without risk set table appended at the bottom. In a single plot,
#' it can include up to 4 KM curves. This depends on number of levels in 'treatment' column in the input data.frame
#' \code{kmdat}
#'
#' @param kmdat a `data.frame`, must consist `treatment`, `time` (unit in days), `n.risk`, `censor`, `surv`, similar to
#'   an output from \code{maicplus:::survfit_makeup}
#' @param endpoint_name a string, name of time to event endpoint, to be show in the last line of title
#' @param time_scale a string, time unit of median survival time, taking a value of 'years', 'months', 'weeks' or 'days'
#' @param time_grid a numeric vector in the unit of \code{time_scale}, risk set table and x axis of the km plot will be
#'   defined based on this time grid
#' @param show_risk_set logical, show risk set table or not, TRUE by default
#' @param main_title a string, main title of the KM plot
#' @param subplot_heights a numeric vector, heights argument to \code{graphics::layout()}, NULL by default which means
#'   the default setting of this function is used
#' @param suppress_plot_layout logical, suppress the layout setting in this function so that user can specify layout
#'   outside of the function, FALSE by default
#' @param use_colors a character vector of length up to 4, colors to the KM curves, it will be passed to `col` of
#'   \code{lines()}
#' @param use_line_types a numeric vector of length up to 4, line type to the KM curves, it will be passed to `lty` of
#'   \code{lines()}
#' @param use_pch_cex a scalar between 0 and 1, point size to indicate censored individuals on the KM curves, it will be
#'   passed to `cex` of \code{points()}
#' @param use_pch_alpha a scalar between 0 and 255, degree of color transparency of points to indicate censored
#'   individuals on the KM curves, it will be passed to `alpha` of \code{rgb()} (with \code{maxColorValue = 255})
#'
#' @example inst/examples/basic_kmplot_ex.R
#'
#' @return a KM plot with or without risk set table appended at the bottom, with up to 4 KM curves
#' @export

basic_kmplot <- function(kmdat,
                         endpoint_name = "Time to Event Endpoint",
                         time_scale = NULL,
                         time_grid = NULL,
                         show_risk_set = TRUE,
                         main_title = "Kaplan-Meier Curves",
                         subplot_heights = NULL,
                         suppress_plot_layout = FALSE,
                         use_colors = NULL,
                         use_line_types = NULL,
                         use_pch_cex = 0.65,
                         use_pch_alpha = 100) {
  # graphical parameters modified below are restored when the function exits
  original_par <- par("bty", "tcl", "mgp", "cex.lab", "cex.axis", "cex.main", "mar")
  on.exit(par(original_par), add = TRUE)

  # precheck
  if (!length(subplot_heights) %in% c(0, (1 + show_risk_set))) {
    stop("length of subplot_heights should be ", (1 + show_risk_set))
  }
  if (!is.factor(kmdat$treatment)) {
    stop("kmdat$treatment needs to be a factor, its levels will be used in legend and title, first level is comparator")
  }
  if (nlevels(kmdat$treatment) > 4) stop("kmdat$treatment cannot have more than 4 levels")

  trt_levels <- levels(kmdat$treatment)
  trt_idx <- seq_along(trt_levels) # seq_along() instead of 1:n so an empty factor yields no iterations
  # rows of kmdat split by treatment level, in level order (computed once, used by curves and risk table)
  km_by_trt <- lapply(trt_idx, function(ii) kmdat[as.numeric(kmdat$treatment) == ii, , drop = FALSE])

  # set up x axis (time); when no grid is supplied it is derived from the longest
  # observed time (converted via get_time_as) with 7% head room
  if (is.null(time_grid)) {
    max_t <- max(kmdat$time)
    t_range <- c(0, get_time_as(max_t, time_scale) * 1.07)
    time_grid <- pretty(t_range)
  } else {
    t_range <- c(0, max(time_grid))
  }

  # plot layout: one panel for the curves plus, optionally, one for the risk set table
  if (!suppress_plot_layout) {
    nr_subplot <- (1 + show_risk_set)
    if (is.null(subplot_heights)) subplot_heights <- c(7, 0.7 + length(trt_levels) * 0.7, 0.8)
    layout_mat <- matrix(1:nr_subplot, ncol = 1)
    layout(layout_mat, heights = subplot_heights)
  }

  # plot cosmetic setup (line widths are fixed, line types and colors can be overridden)
  use_lty <- if (is.null(use_line_types)) c(1, 1, 2, 2) else use_line_types
  use_lwd <- c(1.5, 1.5, 1.2, 1.2)
  use_col <- if (is.null(use_colors)) c("#5450E4", "#00857C", "#6ECEB2", "#7B68EE") else use_colors
  # semi-transparent version of the curve colors, used for the censoring marks
  col_rgb <- col2rgb(use_col)
  use_col2 <- rgb(col_rgb[1, ], col_rgb[2, ], col_rgb[3, ], alpha = use_pch_alpha, maxColorValue = 255)

  ## : first subplot: KM curve -------------------
  # base plot
  par(bty = "n", tcl = -0.15, mgp = c(1.8, 0.4, 0), cex.lab = 0.85, cex.axis = 0.8, cex.main = 0.9, mar = c(3, 4, 5, 1))
  plot(0, 0,
    type = "n", xlab = paste0("Time in ", time_scale), ylab = "Survival Probability",
    ylim = c(0, 1), xlim = t_range, yaxt = "n", xaxt = "n",
    main = paste0(main_title, "\nEndpoint:", endpoint_name)
  )
  axis(2, las = 1)
  axis(1, at = time_grid) # time_grid is always defined at this point

  # add km line and censoring marks per treatment
  for (ii in trt_idx) {
    km_trt <- km_by_trt[[ii]]
    lines(
      y = km_trt$surv,
      x = get_time_as(km_trt$time, time_scale),
      col = use_col[ii],
      lty = use_lty[ii],
      lwd = use_lwd[ii],
      type = "s"
    )
    is_censored <- (km_trt$censor != 0) # cannot just ==1, anticipating weighted case
    points(
      y = km_trt$surv[is_censored],
      x = get_time_as(km_trt$time[is_censored], time_scale),
      col = use_col2[ii],
      pch = 3,
      cex = use_pch_cex
    )
  }

  # legend goes on the KM panel, so it must be drawn before the risk set panel is started;
  # with a risk set table the labels carry the treatment index (T1, T2, ...) used in that table
  legend_labels <- if (show_risk_set) paste0("(T", trt_idx, ") ", trt_levels) else trt_levels
  legend("topright",
    bty = "n",
    cex = 0.8,
    lty = use_lty[trt_idx],
    lwd = use_lwd[trt_idx],
    col = use_col[trt_idx],
    legend = legend_labels
  )

  ## : second subplot: risk set table (if applicable) -------------------
  if (show_risk_set) {
    par(bty = "n", tcl = -0.15, mgp = c(1.8, 0.4, 0), mar = c(1, 4, 0, 1))
    plot(0, 0,
      type = "n", xlab = "", ylab = "", main = NULL,
      ylim = c(length(trt_levels) + 1.2, -0.5),
      xlim = t_range,
      yaxt = "n", xaxt = "n"
    )
    axis(2,
      at = trt_idx, labels = paste0("T", trt_idx),
      line = NA, lty = "blank", las = 1
    )

    # table header, drawn once (it does not depend on the treatment)
    text(0, 0, labels = "Number at risk", pos = 4, cex = 0.8, offset = -0.8)

    for (ii in trt_idx) {
      km_trt <- km_by_trt[[ii]]
      trt_time <- get_time_as(km_trt$time, time_scale)
      # label per grid point: n.risk at the first recorded time after the grid point;
      # past the last recorded time it is the minimal n.risk if that is 0, otherwise "n/a"
      nr_labels <- vapply(time_grid, function(kk) {
        later <- which(trt_time > kk)
        if (length(later) > 0) {
          idx <- later[1]
        } else if (min(km_trt$n.risk) == 0) {
          idx <- which.min(km_trt$n.risk)[1]
        } else {
          return("n/a")
        }
        as.character(round(km_trt$n.risk[idx], 1))
      }, character(1))
      text(
        y = rep(ii, length(time_grid)),
        x = time_grid,
        labels = nr_labels,
        col = use_col[ii],
        cex = 0.75
      )
    }

    # footnote, drawn once below the last treatment row
    text(0, length(trt_levels) + 1,
      pos = 4, cex = 0.7, offset = -0.8, col = "gray30",
      labels = "Note: Number at risk for adjusted/weighted treament arm is the sum of individual weight at risk."
    )
  }
  invisible(NULL)
}


#' Diagnosis plot of proportional hazard assumption for anchored and unanchored
#'
#' @param weights_object an object returned by \code{estimate_weight}
#' @param tte_ipd a data frame of individual patient data (IPD) of internal trial, contain at least "USUBJID", "EVENT",
#'   "TIME" columns and a column indicating treatment assignment
#' @param tte_pseudo_ipd a data frame of pseudo IPD by digitized KM curves of external trial (for time-to-event
#'   endpoint), contain at least "EVENT", "TIME"
#' @param trt_ipd  a string, name of the interested investigation arm in internal trial \code{tte_ipd} (real IPD)
#' @param trt_agd a string, name of the interested investigation arm in external trial
#'   \code{tte_pseudo_ipd} (pseudo IPD)
#' @param trt_common a string, name of the common comparator in internal and external trial, by default is NULL,
#'   indicating unanchored case
#' @param trt_var_ipd a string, column name in \code{tte_ipd} that contains the treatment assignment
#' @param trt_var_agd a string, column name in \code{tte_pseudo_ipd} that contains the treatment assignment
#' @param endpoint_name a string, name of time to event endpoint, to be show in the last line of title
#' @param time_scale a string, time unit of median survival time, taking a value of 'years', 'months', 'weeks' or 'days'
#' @param zph_transform a string, pass to \code{survival::cox.zph}, default is "log"
#' @param zph_log_hazard a logical, if TRUE (default), y axis of the time dependent hazard function is log-hazard,
#'   otherwise, hazard.
#'
#' @return a 3 by 2 plot, include log-cumulative hazard plot, time dependent hazard function and unscaled Schoenfeld
#'   residual plot, before and after matching
#'
#' @example inst/examples/ph_diagplot_unanchored_ex.R
#' @example inst/examples/ph_diagplot_anchored_ex.R
#' @export
ph_diagplot <- function(weights_object,
                        tte_ipd,
                        tte_pseudo_ipd,
                        trt_ipd,
                        trt_agd,
                        trt_common = NULL,
                        trt_var_ipd = "ARM",
                        trt_var_agd = "ARM",
                        endpoint_name = "Time to Event Endpoint",
                        time_scale,
                        zph_transform = "log",
                        zph_log_hazard = TRUE) {
  # column names are handled case-insensitively
  names(tte_ipd) <- toupper(names(tte_ipd))
  names(tte_pseudo_ipd) <- toupper(names(tte_pseudo_ipd))
  trt_var_ipd <- toupper(trt_var_ipd)
  trt_var_agd <- toupper(trt_var_agd)

  # pre check
  if (!inherits(weights_object, "maicplus_estimate_weights")) {
    stop("weights_object should be an object returned by estimate_weights")
  }
  if (!all(c("USUBJID", "TIME", "EVENT", trt_var_ipd) %in% names(tte_ipd))) {
    stop("tte_ipd needs to include at least USUBJID, TIME, EVENT, ", trt_var_ipd)
  }
  if (!all(c("TIME", "EVENT", trt_var_agd) %in% names(tte_pseudo_ipd))) {
    stop("tte_pseudo_ipd needs to include at least TIME, EVENT, ", trt_var_agd)
  }

  # preparing analysis data
  is_anchored <- !is.null(trt_common)
  # drop = FALSE keeps a data.frame even when a single row is left
  tte_ipd <- tte_ipd[tte_ipd[[trt_var_ipd]] %in% c(trt_ipd, trt_common), , drop = FALSE]
  tte_pseudo_ipd <- tte_pseudo_ipd[tte_pseudo_ipd[[trt_var_agd]] %in% c(trt_agd, trt_common), , drop = FALSE]
  # look up, for every IPD row, the row of the weights table with the same USUBJID
  ipd_row_in_weights <- match(tte_ipd$USUBJID, weights_object$data$USUBJID)
  tte_ipd$weights <- weights_object$data$weights[ipd_row_in_weights]
  tte_pseudo_ipd$weights <- 1
  tte_ipd$TIME2 <- get_time_as(tte_ipd$TIME, as = time_scale) # for cox.zph
  tte_pseudo_ipd$TIME2 <- get_time_as(tte_pseudo_ipd$TIME, as = time_scale) # for cox.zph
  if (!"USUBJID" %in% names(tte_pseudo_ipd)) tte_pseudo_ipd$USUBJID <- paste0("ID", seq_len(nrow(tte_pseudo_ipd)))
  if (trt_var_ipd != "ARM") tte_ipd$ARM <- tte_ipd[[trt_var_ipd]]
  if (trt_var_agd != "ARM") tte_pseudo_ipd$ARM <- tte_pseudo_ipd[[trt_var_agd]]

  # prepare plot data: unanchored case pools IPD and pseudo IPD, anchored case uses the IPD only
  retain_cols <- c("USUBJID", "TIME", "TIME2", "EVENT", "ARM", "weights")
  tte_dat <- tte_ipd[, retain_cols, drop = FALSE]
  if (!is_anchored) {
    tte_dat <- rbind(tte_dat, tte_pseudo_ipd[, retain_cols, drop = FALSE])
  }

  # unweighted ("before matching") and weighted ("after matching") fits;
  # the TIME2 based Cox models are only used for cox.zph so that its time axis is in `time_scale`
  kmobj <- survival::survfit(Surv(TIME, EVENT) ~ ARM, tte_dat, conf.type = "log-log")
  kmobj_adj <- survival::survfit(Surv(TIME, EVENT) ~ ARM, tte_dat, conf.type = "log-log", weights = weights)
  coxobj <- survival::coxph(Surv(TIME, EVENT) ~ ARM, data = tte_dat)
  coxobj2 <- survival::coxph(Surv(TIME2, EVENT) ~ ARM, data = tte_dat)
  zphobj <- survival::cox.zph(coxobj2, transform = zph_transform, global = FALSE)
  coxobj_adj <- survival::coxph(Surv(TIME, EVENT) ~ ARM, data = tte_dat, weights = weights)
  coxobj_adj2 <- survival::coxph(Surv(TIME2, EVENT) ~ ARM, data = tte_dat, weights = weights)
  zphobj_adj <- survival::cox.zph(coxobj_adj2, transform = zph_transform, global = FALSE)

  # local helper: time dependent hazard plot of one cox.zph object, with the PH test p-value in the legend
  plot_zph <- function(zph, subtitle) {
    plot(zph,
      main = paste0(
        "Time-dependent Hazard function (scaled Schoenfeld residual)\n",
        "Endpoint:", endpoint_name, "\n", subtitle
      ),
      resid = FALSE, se = TRUE, df = 4, nsmo = 40,
      ylab = if (zph_log_hazard) "Log Hazard" else "Hazard",
      xlab = paste("Time in", time_scale),
      lty = 1:2, lwd = 2, pch = 16, cex = 0.8,
      col = rgb(0, 0, 128, alpha = 120, maxColorValue = 255),
      hr = (!zph_log_hazard), yaxt = "n"
    )
    axis(2, las = 1)
    pv <- as.data.frame(zph$table)$p
    pv <- ifelse(round(pv, 4) < 0.0001, "<0.0001", format(round(pv, 4), nsmall = 4))
    legend("bottomright",
      cex = 0.75, bty = "n", text.col = "dodgerblue3",
      legend = c(paste0("p-value: ", pv), paste0("time-transform: ", zph_transform)),
      title = "PH test (survival::cox.zph)"
    )
  }

  # making the plot: 3 rows (diagnostic type) by 2 columns (before / after matching)
  original_par <- par(mfrow = c(3, 2), cex.lab = 0.85, cex.axis = 0.8, cex.main = 0.9)
  on.exit(par(original_par), add = TRUE)

  # log-cum-hazard plot
  ph_diagplot_lch(kmobj,
    time_scale = time_scale,
    log_time = TRUE,
    endpoint_name = endpoint_name,
    subtitle = "(Before Matching)"
  )
  ph_diagplot_lch(kmobj_adj,
    time_scale = time_scale,
    log_time = TRUE,
    endpoint_name = endpoint_name,
    subtitle = "(After Matching)"
  )

  # time dependent hazard plot
  plot_zph(zphobj, "(Before Matching)")
  plot_zph(zphobj_adj, "(After Matching)")

  # unscaled schoenfeld residual
  ph_diagplot_schoenfeld(coxobj,
    time_scale = time_scale,
    log_time = FALSE,
    endpoint_name = endpoint_name,
    subtitle = "(Before Matching)"
  )
  ph_diagplot_schoenfeld(coxobj_adj,
    time_scale = time_scale,
    log_time = FALSE,
    endpoint_name = endpoint_name,
    subtitle = "(After Matching)"
  )
}

#' PH Diagnosis Plot of Log Cumulative Hazard Rate versus time or log-time
#'
#' This plot is also known as log negative log survival rate.
#'
#' a diagnosis plot for proportional hazard assumption, versus log-time (default) or time
#'
#' @param km_fit returned object from \code{survival::survfit}
#' @param time_scale a character string, 'years', 'months', 'weeks' or 'days', time unit of median survival time
#' @param log_time logical, TRUE (default) or FALSE
#' @param endpoint_name a character string, name of the endpoint
#' @param subtitle a character string, subtitle of the plot
#' @param exclude_censor logical, if TRUE (default) censored data points are excluded from the plot
#' @examples
#' library(survival)
#' data(adtte_sat)
#' data(pseudo_ipd_sat)
#' combined_data <- rbind(adtte_sat[, c("TIME", "EVENT", "ARM")], pseudo_ipd_sat)
#' kmobj <- survfit(Surv(TIME, EVENT) ~ ARM, combined_data, conf.type = "log-log")
#' ph_diagplot_lch(kmobj,
#'   time_scale = "month", log_time = TRUE,
#'   endpoint_name = "OS", subtitle = "(Before Matching)"
#' )
#' @return a plot of log cumulative hazard rate
#' @export

ph_diagplot_lch <- function(km_fit,
                            time_scale,
                            log_time = TRUE,
                            endpoint_name = "",
                            subtitle = "",
                            exclude_censor = TRUE) {
  time_scale <- match.arg(arg = time_scale, choices = c("days", "weeks", "months", "years"))

  # list of per-treatment data (element names are used as treatment labels in the legend)
  clldat <- survfit_makeup(km_fit)

  if (exclude_censor) {
    # keep only the rows whose `censor` value is 0
    clldat <- lapply(clldat, function(xxt) xxt[xxt$censor == 0, , drop = FALSE])
  }

  # x coordinate used throughout: time in `time_scale`, optionally on the log scale
  to_plot_x <- function(times) {
    x <- get_time_as(times, time_scale)
    if (log_time) log(x) else x
  }

  # axis limits over all treatments; finite = TRUE ignores -Inf/NA produced by log(0) or missing values,
  # which would otherwise make plot() fail with non-finite axis limits
  pooled <- do.call(rbind, clldat)
  t_range <- range(to_plot_x(pooled$time), finite = TRUE)
  y_range <- range(log(pooled$cumhaz), finite = TRUE)

  original_par <- par(mar = c(4, 4, 4, 1), bty = "n", tcl = -0.15, mgp = c(1.5, 0.3, 0))
  on.exit(par(original_par), add = TRUE)
  plot(0, 0,
    type = "n", xlab = paste0(if (log_time) "Log-" else "", "Time in ", time_scale),
    ylab = "Log-Cumulative Hazard Rate",
    ylim = y_range, xlim = t_range, yaxt = "n",
    main = paste0(
      # title follows the x axis scale actually used
      "Log Cumulative Hazard versus ", if (log_time) "Log " else "", "Time\nEndpoint: ", endpoint_name,
      ifelse(subtitle == "", "", "\n"), subtitle
    )
  )
  axis(2, las = 1)

  trts <- names(clldat)
  cols <- c("dodgerblue3", "firebrick3")
  pchs <- c(1, 4)
  for (i in seq_along(clldat)) {
    use_x <- to_plot_x(clldat[[i]]$time)
    use_y <- log(clldat[[i]]$cumhaz)

    lines(y = use_y, x = use_x, col = cols[i], type = "s")
    points(y = use_y, x = use_x, col = cols[i], pch = pchs[i], cex = 0.7)
  }
  # all curves are drawn as solid lines, so the legend uses lty = 1 for every entry
  legend("bottomright",
    bty = "n", lty = 1, cex = 0.8,
    col = cols, pch = pchs, legend = paste0("Treatment: ", trts)
  )
}


#' PH Diagnosis Plot of Schoenfeld residuals for a Cox model fit
#'
#' @param coxobj object returned from \code{\link[survival]{coxph}}
#' @param time_scale a character string, 'years', 'months', 'weeks' or 'days', time unit of median survival time
#' @param log_time logical, TRUE (default) or FALSE
#' @param endpoint_name a character string, name of the endpoint
#' @param subtitle a character string, subtitle of the plot
#' @examples
#' library(survival)
#' data(adtte_sat)
#' data(pseudo_ipd_sat)
#' combined_data <- rbind(adtte_sat[, c("TIME", "EVENT", "ARM")], pseudo_ipd_sat)
#' unweighted_cox <- coxph(Surv(TIME, EVENT == 1) ~ ARM, data = combined_data)
#' ph_diagplot_schoenfeld(unweighted_cox,
#'   time_scale = "month", log_time = TRUE,
#'   endpoint_name = "OS", subtitle = "(Before Matching)"
#' )
#' @return a plot of Schoenfeld residuals
#' @export

ph_diagplot_schoenfeld <- function(coxobj,
                                   time_scale = "months",
                                   log_time = TRUE,
                                   endpoint_name = "",
                                   subtitle = "") {
  # pre-check
  time_scale <- match.arg(arg = time_scale, choices = c("days", "weeks", "months", "years"))

  # prepare data: the names of the residual vector are parsed as times for the x axis
  schresid <- residuals(coxobj, type = "schoenfeld")
  plot_x <- get_time_as(as.numeric(names(schresid)), time_scale)
  if (log_time) plot_x <- log(plot_x)

  # loess fit with a pointwise 95% band (t quantile on the loess degrees of freedom);
  # `se.fit` is spelled out instead of relying on partial matching of `$se`
  fit0 <- predict(loess(schresid ~ plot_x), se = TRUE)
  half_width <- qt(0.975, fit0$df) * fit0$se.fit
  uppband <- fit0$fit + half_width
  lowband <- fit0$fit - half_width
  use_yrange <- range(schresid, uppband, lowband)

  # making the plot
  original_par <- par(bty = "n", mar = c(4, 4, 4, 1), tcl = -0.15, mgp = c(1.5, 0.3, 0))
  on.exit(par(original_par), add = TRUE)
  plot(schresid ~ plot_x,
    type = "n",
    yaxt = "n", ylim = use_yrange,
    ylab = "Unscaled Schoenfeld Residual",
    xlab = paste0(if (log_time) "Log-" else "", "Time in ", time_scale),
    main = paste0(
      "Unscaled Schoenfeld Residual\nEndpoint: ", endpoint_name,
      ifelse(subtitle == "", "", "\n"), subtitle
    )
  )
  axis(2, las = 1)
  # smoothed trend, shaded confidence band, zero reference line, then the residuals on top
  lines(fit0$fit ~ plot_x, lty = 2, lwd = 1, col = rgb(0, 0, 128, 150, maxColorValue = 255))
  polygon(
    x = c(plot_x, rev(plot_x)),
    y = c(uppband, rev(lowband)),
    col = rgb(0, 0, 128, 60, maxColorValue = 255),
    border = NA
  )
  abline(h = 0, lty = 1, lwd = 1, col = "deeppink")
  points(schresid ~ plot_x,
    pch = 16,
    cex = 0.85,
    col = rgb(169, 169, 169, 120, maxColorValue = 255)
  )
}

#' Anchored MAIC for binary and time-to-event endpoint
#'
#' This is a wrapper function to provide adjusted effect estimates and relevant statistics in anchored case (i.e. there
#' is a common comparator arm in the internal and external trial).
#'
#' @param weights_object an object returned by \code{estimate_weight}
#' @param ipd a data frame that meet format requirements in 'Details', individual patient data (IPD) of internal trial
#' @param pseudo_ipd a data frame, pseudo IPD from digitized KM curve of external trial (for time-to-event endpoint) or
#'   from contingency table (for binary endpoint)
#' @param trt_ipd  a string, name of the interested investigation arm in internal trial \code{ipd} (internal IPD)
#' @param trt_agd a string, name of the interested investigation arm in external trial \code{pseudo_ipd} (pseudo IPD)
#' @param trt_common a string, name of the common comparator in internal and external trial
#' @param trt_var_ipd a string, column name in \code{ipd} that contains the treatment assignment
#' @param trt_var_agd a string, column name in \code{pseudo_ipd} that contains the treatment assignment
#' @param normalize_weights logical, default is \code{FALSE}. If \code{TRUE},
#'   \code{scaled_weights} (normalized weights) in \code{weights_object$data} will be used.
#' @param endpoint_type a string, one out of the following "binary", "tte" (time to event)
#' @param endpoint_name a string, name of time to event endpoint, to be show in the last line of title
#' @param time_scale a string, time unit of median survival time, taking a value of 'years', 'months', 'weeks' or
#'   'days'. NOTE: it is assumed that values in TIME column of \code{ipd} and \code{pseudo_ipd} is in the unit of days
#' @param km_conf_type a string, pass to \code{conf.type} of \code{survfit}
#' @param eff_measure a string, "RD" (risk difference), "OR" (odds ratio), "RR" (relative risk) for a binary endpoint;
#'   "HR" for a time-to-event endpoint. By default is \code{NULL}, "OR" is used for binary case, otherwise "HR" is used.
#' @param boot_ci_type a string, one of `c("norm","basic", "stud", "perc", "bca")` to select the type of bootstrap
#'   confidence interval. See [boot::boot.ci] for more details.
#' @param binary_robust_cov_type a string to pass to argument `type` of [sandwich::vcovHC], see possible options in the
#'   documentation of that function. Default is `"HC3"`
#'
#' @details It is required that input \code{ipd} and \code{pseudo_ipd} to have the following
#'   columns. This function is not sensitive to upper or lower case of letters in column names.
#' \itemize{
#'   \item USUBJID - character, unique subject ID
#'   \item ARM - character or factor, treatment indicator, column name does not have to be 'ARM'. User specify in
#'    \code{trt_var_ipd} and \code{trt_var_agd}
#'  }
#'  For time-to-event analysis, the follow columns are required:
#'  \itemize{
#'   \item EVENT - numeric, `1` for event (e.g. death), `0` for censored
#'   \item TIME - numeric column, observation time of the \code{EVENT}; unit in days
#' }
#' For binary outcomes:
#' \itemize{
#'   \item RESPONSE - numeric, `1` for event occurred, `0` otherwise
#' }
#'
#' @importFrom survival survfit Surv coxph
#' @importFrom lmtest coeftest coefci
#' @importFrom sandwich vcovHC
#' @importFrom boot boot boot.ci
#' @return A list, contains 'descriptive' and 'inferential'
#' @example inst/examples/maic_anchored_ex.R
#' @example inst/examples/maic_anchored_binary_ex.R
#' @export

maic_anchored <- function(weights_object,
                          ipd,
                          pseudo_ipd,
                          trt_ipd,
                          trt_agd,
                          trt_common,
                          trt_var_ipd = "ARM",
                          trt_var_agd = "ARM",
                          normalize_weights = FALSE,
                          endpoint_type = "tte",
                          endpoint_name = "Time to Event Endpoint",
                          eff_measure = c("HR", "OR", "RR", "RD"),
                          boot_ci_type = c("norm", "basic", "stud", "perc", "bca"),
                          # time to event specific args
                          time_scale = "months",
                          km_conf_type = "log-log",
                          # binary specific args
                          binary_robust_cov_type = "HC3") {
  # ==> Initial Setup ------------------------------------------
  # ~~~ Create the hull for the output from this function
  res <- list(
    descriptive = list(),
    inferential = list()
  )

  # ~~~ Initial colname process and precheck on effect measure
  names(ipd) <- toupper(names(ipd))
  names(pseudo_ipd) <- toupper(names(pseudo_ipd))
  trt_var_ipd <- toupper(trt_var_ipd)
  trt_var_agd <- toupper(trt_var_agd)
  # an untouched default (length > 1) means "pick the default measure of the endpoint type"
  if (length(eff_measure) > 1) eff_measure <- NULL
  if (is.null(eff_measure)) eff_measure <- list(binary = "OR", tte = "HR")[[endpoint_type]]

  # ~~~ Setup ARM column and make related pre-checks
  if (!trt_var_ipd %in% names(ipd)) stop("cannot find arm indicator column trt_var_ipd in ipd")
  if (!trt_var_agd %in% names(pseudo_ipd)) stop("cannot find arm indicator column trt_var_agd in pseudo_ipd")
  if (trt_var_ipd != "ARM") ipd$ARM <- ipd[[trt_var_ipd]]
  if (trt_var_agd != "ARM") pseudo_ipd$ARM <- pseudo_ipd[[trt_var_agd]]
  ipd$ARM <- as.character(ipd$ARM) # just to avoid potential error when merging
  pseudo_ipd$ARM <- as.character(pseudo_ipd$ARM) # just to avoid potential error when merging

  # ~~~ More pre-checks
  if (!trt_ipd %in% ipd$ARM) stop("trt_ipd does not exist in ipd$ARM")
  if (!trt_agd %in% pseudo_ipd$ARM) stop("trt_agd does not exist in pseudo_ipd$ARM")
  if (!trt_common %in% ipd$ARM) stop("trt_common does not exist in ipd$ARM")
  if (!trt_common %in% pseudo_ipd$ARM) stop("trt_common does not exist in pseudo_ipd$ARM")
  ipd_arms <- unique(ipd$ARM)
  pseudo_ipd_arms <- unique(pseudo_ipd$ARM)
  if (length(ipd_arms) < 2) {
    stop("In anchored case, there should be at least two arms in ipd, but you have: ", toString(ipd_arms))
  }
  if (length(pseudo_ipd_arms) < 2) {
    stop("In anchored case, there should be at least two arms in pseudo_ipd, but you have: ", toString(pseudo_ipd_arms))
  }
  endpoint_type <- match.arg(endpoint_type, c("binary", "tte"))
  if (!inherits(weights_object, "maicplus_estimate_weights")) {
    stop("weights_object should be an object returned by estimate_weights")
  }
  if (any(duplicated(ipd$USUBJID))) {
    warning(
      "check your ipd, it has duplicated usubjid, this indicates, ",
      "it might contain multiple endpoints for each subject"
    )
  }
  if (!all(ipd$USUBJID %in% weights_object$data$USUBJID)) {
    # report only the ids that are really missing from the weights table (weights_object$data)
    stop(
      "These patients in ipd cannot be found in weights_object ",
      toString(setdiff(ipd$USUBJID, weights_object$data$USUBJID))
    )
  }
  time_scale <- match.arg(arg = time_scale, choices = c("days", "weeks", "months", "years"))
  if (endpoint_type == "binary") { # for binary effect measure

    if (any(!c("USUBJID", "RESPONSE") %in% names(ipd))) stop("ipd should have 'USUBJID', 'RESPONSE' columns at minimum")
    eff_measure <- match.arg(eff_measure, choices = c("OR", "RD", "RR"), several.ok = FALSE)
  } else if (endpoint_type == "tte") { # for time to event effect measure

    if (!all(c("USUBJID", "TIME", "EVENT", trt_var_ipd) %in% names(ipd))) {
      stop("ipd needs to include at least USUBJID, TIME, EVENT, ", toString(trt_var_ipd))
    }
    if (!all(c("TIME", "EVENT", trt_var_agd) %in% names(pseudo_ipd))) {
      stop("pseudo_ipd needs to include at least TIME, EVENT, ", toString(trt_var_agd))
    }
    eff_measure <- match.arg(eff_measure, choices = c("HR"), several.ok = FALSE)
  }
  boot_ci_type <- match.arg(boot_ci_type)

  # ==> IPD and AgD data preparation ------------------------------------------
  # : subset ipd, retain only ipd from interested trts
  #   (drop = FALSE keeps a data.frame even when a single row is left)
  ipd <- ipd[ipd$ARM %in% c(trt_ipd, trt_common), , drop = FALSE]
  pseudo_ipd <- pseudo_ipd[pseudo_ipd$ARM %in% c(trt_agd, trt_common), , drop = FALSE]

  # : assign weights to real and pseudo ipd
  weights_col <- if (normalize_weights) "scaled_weights" else "weights"
  ipd$weights <- weights_object$data[[weights_col]][match(ipd$USUBJID, weights_object$data$USUBJID)]
  pseudo_ipd$weights <- 1
  if (!"USUBJID" %in% names(pseudo_ipd)) pseudo_ipd$USUBJID <- paste0("ID", seq_len(nrow(pseudo_ipd)))

  # : give warning when individual pts in IPD has no weights
  has_no_weight <- is.na(ipd$weights)
  if (any(has_no_weight)) {
    # collect the ids before the rows are removed, otherwise the warning lists nothing
    excluded_ids <- ipd$USUBJID[has_no_weight]
    ipd <- ipd[!has_no_weight, , drop = FALSE]
    warning(
      "These USUBJID in IPD have no weight in weights_object and hence excluded from analysis: ",
      toString(excluded_ids)
    )
    if (nrow(ipd) == 0) stop("There are no patients with valid weights in IPD!")
  }

  # : retain necessary columns
  outcome_cols <- if (endpoint_type == "tte") c("TIME", "EVENT") else "RESPONSE"
  retain_cols <- c("USUBJID", "ARM", outcome_cols, "weights")

  ipd <- ipd[, retain_cols, drop = FALSE]
  pseudo_ipd <- pseudo_ipd[, retain_cols, drop = FALSE]

  # : merge real and pseudo ipds, only used if apply contrast method,
  #   since contrast method is not implemented in v0.1, this R obj is not used
  #   just a placeholder
  dat <- rbind(ipd, pseudo_ipd)

  # : setup ARM column as a factor,
  # * these line cannot be move prior to "dat <- rbind(ipd, pseudo_ipd)"
  ipd$ARM <- factor(ipd$ARM, levels = c(trt_common, trt_ipd))
  pseudo_ipd$ARM <- factor(pseudo_ipd$ARM, levels = c(trt_common, trt_agd))
  dat$ARM <- factor(dat$ARM, levels = c(trt_common, trt_agd, trt_ipd))

  # ==> Inferential output ------------------------------------------
  result <- if (endpoint_type == "tte") {
    maic_anchored_tte(
      res,
      res_BC = NULL,
      dat,
      ipd,
      pseudo_ipd,
      km_conf_type,
      time_scale,
      weights_object,
      endpoint_name,
      normalize_weights,
      trt_ipd,
      trt_agd,
      boot_ci_type
    )
  } else if (endpoint_type == "binary") {
    maic_anchored_binary(
      res,
      res_BC = NULL,
      dat,
      ipd,
      pseudo_ipd,
      binary_robust_cov_type,
      weights_object,
      endpoint_name,
      normalize_weights,
      eff_measure,
      trt_ipd,
      trt_agd,
      boot_ci_type
    )
  } else {
    stop("Endpoint type ", endpoint_type, " currently unsupported.")
  }

  result
}


# MAIC inference functions for TTE outcome type ------------
# Anchored MAIC inference for a time-to-event endpoint.
#
# Fits Kaplan-Meier curves and Cox models to the internal IPD (unweighted and weighted) and to the
# external pseudo IPD, combines the two relative effects through `bucher()` and, when bootstrap
# weights are available in `weights_object$boot`, derives bootstrap confidence intervals.
#
# Arguments:
#   res               list to be filled; elements of `res$descriptive` and `res$inferential` are set here
#   res_BC            optional list with `est` and `se` for the external comparison; when NULL it is
#                     taken from the Cox model fitted to `pseudo_ipd`. Its `ci_l`, `ci_u` and `pval`
#                     are always taken from that Cox model.
#   dat               data whose `levels(dat$ARM)` are used to label arms as "C", "B", "A"
#   ipd               internal IPD; columns TIME, EVENT, ARM, weights and USUBJID are read
#   pseudo_ipd        external pseudo IPD; columns TIME, EVENT and ARM are read
#   km_conf_type      passed as `conf.type` to `survfit()`
#   time_scale        passed to `medSurv_makeup()`
#   weights_object    object carrying `data`, `rows_with_missing`, `boot`, `boot_seed`, `boot_strata`
#   normalize_weights logical, whether bootstrap weights are normalized within each arm
#   boot_ci_type      type of bootstrap confidence interval, passed to `boot.ci()`
#   endpoint_name, trt_ipd, trt_agd are accepted for interface compatibility and not used in this body.
#
# Returns `res` with `descriptive` (KM summaries) and `inferential` (fits and HR summary) filled in.
maic_anchored_tte <- function(res,
                              res_BC = NULL,
                              dat,
                              ipd,
                              pseudo_ipd,
                              km_conf_type,
                              time_scale,
                              weights_object,
                              endpoint_name,
                              normalize_weights,
                              trt_ipd,
                              trt_agd,
                              boot_ci_type) {
  # Analysis table (Cox model) before and after matching, incl Median Survival Time
  # derive km w and w/o weights
  kmobj_ipd <- survfit(Surv(TIME, EVENT) ~ ARM, ipd, conf.type = km_conf_type)
  kmobj_ipd_adj <- survfit(Surv(TIME, EVENT) ~ ARM, ipd, weights = ipd$weights, conf.type = km_conf_type)
  kmobj_agd <- survfit(Surv(TIME, EVENT) ~ ARM, pseudo_ipd, conf.type = km_conf_type)

  res$descriptive[["survfit_ipd_before"]] <- survfit_makeup(kmobj_ipd)
  res$descriptive[["survfit_ipd_after"]] <- survfit_makeup(kmobj_ipd_adj)
  res$descriptive[["survfit_pseudo"]] <- survfit_makeup(kmobj_agd)
  # derive median survival time
  medSurv_ipd <- medSurv_makeup(kmobj_ipd, legend = "IPD, before matching", time_scale = time_scale)
  medSurv_ipd_adj <- medSurv_makeup(kmobj_ipd_adj, legend = "IPD, after matching", time_scale = time_scale)
  medSurv_agd <- medSurv_makeup(kmobj_agd, legend = "AgD, external", time_scale = time_scale)
  medSurv_out <- rbind(medSurv_ipd, medSurv_ipd_adj, medSurv_agd)
  # insert the percentage of events right after the first six columns
  medSurv_out <- cbind(medSurv_out[, 1:6],
    `events%` = medSurv_out$events * 100 / medSurv_out$n.max,
    medSurv_out[7:ncol(medSurv_out)]
  )
  # NOTE(review): assumes levels(dat$ARM) are ordered common comparator, external arm, internal arm
  medSurv_out <- cbind(trt_ind = c("C", "B", "A")[match(medSurv_out$treatment, levels(dat$ARM))], medSurv_out)

  res$descriptive[["summary"]] <- medSurv_out

  # ~~~ Analysis table (Cox model) before and after matching
  # fit PH Cox regression model
  coxobj_ipd <- coxph(Surv(TIME, EVENT) ~ ARM, ipd) # robust = TRUE or not makes a diff
  coxobj_ipd_adj <- coxph(Surv(TIME, EVENT) ~ ARM, ipd, weights = weights, robust = TRUE)
  coxobj_agd <- coxph(Surv(TIME, EVENT) ~ ARM, pseudo_ipd)

  # summaries are computed once and reused for the estimates and the final table
  coxobj_ipd_summary <- summary(coxobj_ipd)
  coxobj_ipd_adj_summary <- summary(coxobj_ipd_adj)
  coxobj_agd_summary <- summary(coxobj_agd)

  # derive ipd exp arm vs agd exp arm via bucher
  res_AC_unadj <- as.list(coxobj_ipd_summary$coef)[c(1, 3)] # est, se
  res_AC <- as.list(coxobj_ipd_adj_summary$coef)[c(1, 4)] # est, robust se
  if (is.null(res_BC)) res_BC <- as.list(coxobj_agd_summary$coef)[c(1, 3)] # est, se

  names(res_AC_unadj) <- names(res_AC) <- names(res_BC) <- c("est", "se")

  res_AC_unadj$ci_l <- coxobj_ipd_summary$conf.int[3]
  res_AC_unadj$ci_u <- coxobj_ipd_summary$conf.int[4]
  res_AC_unadj$pval <- as.vector(coxobj_ipd_summary$waldtest[3])

  res_AC$ci_l <- coxobj_ipd_adj_summary$conf.int[3]
  res_AC$ci_u <- coxobj_ipd_adj_summary$conf.int[4]
  res_AC$pval <- as.vector(coxobj_ipd_adj_summary$waldtest[3])

  res_BC$ci_l <- coxobj_agd_summary$conf.int[3]
  res_BC$ci_u <- coxobj_agd_summary$conf.int[4]
  res_BC$pval <- as.vector(coxobj_agd_summary$waldtest[3])

  res_AB <- bucher(res_AC, res_BC, conf_lv = 0.95)
  res_AB_unadj <- bucher(res_AC_unadj, res_BC, conf_lv = 0.95)

  # : get bootstrapped estimates if applicable
  if (!is.null(weights_object$boot)) {
    keep_rows <- setdiff(seq_len(nrow(weights_object$data)), weights_object$rows_with_missing)
    boot_ipd_id <- weights_object$data[keep_rows, "USUBJID", drop = FALSE]

    boot_ipd <- merge(boot_ipd_id, ipd, by = "USUBJID", all.x = TRUE)
    if (nrow(boot_ipd) != nrow(boot_ipd_id)) stop("ipd has multiple observations for some patients")
    # merge() sorts by USUBJID; restore the row order of boot_ipd_id, because the bootstrap row ids
    # stored in weights_object$boot index rows in that order. For every id (in the wanted order) look
    # up its position in the merged data.
    boot_ipd <- boot_ipd[match(boot_ipd_id$USUBJID, boot_ipd$USUBJID), ]

    stat_fun <- function(data, index, w_obj, normalize) {
      r <- dynGet("r", ifnotfound = NA) # Get bootstrap iteration
      if (!is.na(r)) {
        if (!all(index == w_obj$boot[, 1, r])) stop("Bootstrap and weight indices don't match")
        boot_ipd <- data[w_obj$boot[, 1, r], ]
        boot_ipd$weights <- w_obj$boot[, 2, r]

        if (normalize) {
          boot_ipd$weights <- ave(
            boot_ipd$weights,
            boot_ipd$ARM,
            FUN = function(w) w / mean(w, na.rm = TRUE)
          )
        }
      }
      # when no iteration number is found, `boot_ipd` resolves to the full data of the enclosing function
      boot_coxobj_dat_adj <- coxph(Surv(TIME, EVENT) ~ ARM, boot_ipd, weights = boot_ipd$weights, robust = TRUE)
      boot_res_AC <- list(est = coef(boot_coxobj_dat_adj)[1], se = sqrt(vcov(boot_coxobj_dat_adj)[1, 1]))
      # temp method to source in variance of BC in AgD via monte carlo, may be removed in future
      res_BC_mc <- res_BC
      res_BC_mc$est <- rnorm(1, mean = res_BC$est, sd = res_BC$se)
      boot_res_AB <- bucher(boot_res_AC, res_BC_mc, conf_lv = 0.95)
      c(
        est_AB = boot_res_AB$est,
        var_AB = boot_res_AB$se^2,
        se_AB = boot_res_AB$se,
        est_AC = boot_res_AC$est,
        se_AC = boot_res_AC$se,
        var_AC = boot_res_AC$se^2
      )
    }

    # Revert seed to how it was for weight bootstrap sampling
    old_seed <- globalenv()$.Random.seed
    on.exit(suspendInterrupts(set_random_seed(old_seed)), add = TRUE)
    set_random_seed(weights_object$boot_seed)

    R <- dim(weights_object$boot)[3]
    boot_res <- boot(
      boot_ipd,
      stat_fun,
      R = R,
      w_obj = weights_object,
      normalize = normalize_weights,
      strata = weights_object$boot_strata
    )
    boot_ci <- boot.ci(boot_res, type = boot_ci_type, w_obj = weights_object, normalize = normalize_weights)

    # positions of the lower/upper limits and the element name inside the boot.ci result
    l_u_index <- switch(boot_ci_type,
      "norm" = list(2, 3, "normal"),
      "basic" = list(4, 5, "basic"),
      "stud" = list(4, 5, "student"),
      "perc" = list(4, 5, "percent"),
      "bca" = list(4, 5, "bca")
    )

    # boot results for A v B, method 1 (maybe retired in future version)
    boot_res_AB <- list(
      est = as.vector(exp(boot_res$t0[1])),
      se = NA,
      ci_l = exp(boot_ci[[l_u_index[[3]]]][l_u_index[[1]]]),
      ci_u = exp(boot_ci[[l_u_index[[3]]]][l_u_index[[2]]]),
      pval = NA
    )

    # boot results for A v C
    boot_ci_ac <- boot.ci(boot_res, type = boot_ci_type, w_obj = weights_object, index = c(4, 6))
    boot_res_AC <- list(
      est = as.vector(exp(boot_res$t0[4])),
      se = NA,
      ci_l = exp(boot_ci_ac[[l_u_index[[3]]]][l_u_index[[1]]]),
      ci_u = exp(boot_ci_ac[[l_u_index[[3]]]][l_u_index[[2]]]),
      pval = NA
    )

    # boot results for A v B, method 2
    boot_res_AC2 <- list(
      est = as.vector(boot_res$t0[4]),
      se = NA,
      ci_l = boot_ci_ac[[l_u_index[[3]]]][l_u_index[[1]]],
      ci_u = boot_ci_ac[[l_u_index[[3]]]][l_u_index[[2]]],
      pval = NA
    )
    boot_res_AC2$se <- find_SE_from_CI(boot_res_AC2$ci_l, boot_res_AC2$ci_u, 0.95, log = FALSE)
    boot_res_AB2 <- bucher(boot_res_AC2, res_BC, conf_lv = 0.95)
    boot_res_AB2 <- list(
      est = exp(boot_res_AB2$est),
      se = NA,
      ci_l = exp(boot_res_AB2$ci_l),
      ci_u = exp(boot_res_AB2$ci_u),
      pval = NA
    )
  } else {
    boot_res <- NULL
    boot_res_AB <- NULL
    boot_res_AB2 <- NULL
    boot_res_AC <- NULL
  }

  # transform
  res_AB$est <- exp(res_AB$est)
  res_AB$ci_l <- exp(res_AB$ci_l)
  res_AB$ci_u <- exp(res_AB$ci_u)
  res_AB_unadj$est <- exp(res_AB_unadj$est)
  res_AB_unadj$ci_l <- exp(res_AB_unadj$ci_l)
  res_AB_unadj$ci_u <- exp(res_AB_unadj$ci_u)

  res_AC$est <- exp(res_AC$est)
  res_AC_unadj$est <- exp(res_AC_unadj$est)
  res_BC$est <- exp(res_BC$est)

  # : report all raw fitted obj
  res$inferential[["fit"]] <- list(
    km_before_ipd = kmobj_ipd,
    km_after_ipd = kmobj_ipd_adj,
    km_agd = kmobj_agd,
    model_before_ipd = coxobj_ipd,
    model_after_ipd = coxobj_ipd_adj,
    model_agd = coxobj_agd,
    res_AC = res_AC,
    res_AC_unadj = res_AC_unadj,
    res_BC = res_BC,
    res_AB = res_AB,
    res_AB_unadj = res_AB_unadj,
    boot_res = boot_res,
    boot_res_AC = boot_res_AC,
    boot_res_AB_mc = boot_res_AB,
    boot_res_AB = boot_res_AB2
  )

  # : compile HR result
  res$inferential[["summary"]] <- data.frame(
    case = c("AC", "adjusted_AC", "BC", "AB", "adjusted_AB"),
    HR = c(
      coxobj_ipd_summary$conf.int[1],
      coxobj_ipd_adj_summary$conf.int[1],
      coxobj_agd_summary$conf.int[1],
      res_AB_unadj$est, res_AB$est
    ),
    LCL = c(
      coxobj_ipd_summary$conf.int[3],
      coxobj_ipd_adj_summary$conf.int[3],
      coxobj_agd_summary$conf.int[3],
      res_AB_unadj$ci_l, res_AB$ci_l
    ),
    UCL = c(
      coxobj_ipd_summary$conf.int[4],
      coxobj_ipd_adj_summary$conf.int[4],
      coxobj_agd_summary$conf.int[4],
      res_AB_unadj$ci_u, res_AB$ci_u
    ),
    pval = c(
      coxobj_ipd_summary$waldtest[3],
      coxobj_ipd_adj_summary$waldtest[3],
      coxobj_agd_summary$waldtest[3],
      res_AB_unadj$pval, res_AB$pval
    )
  )

  # output
  res
}

# MAIC inference functions for Binary outcome type ------------
# Anchored MAIC inference for a binary endpoint.
#
# Fits binomial GLMs to the internal IPD (unweighted and weighted, the latter with a sandwich
# covariance) and to the external pseudo IPD, combines the relative effects through `bucher()` and,
# when bootstrap weights are available in `weights_object$boot`, derives bootstrap confidence
# intervals.
#
# Arguments:
#   res                    list to be filled; elements of `res$descriptive` and `res$inferential` are set
#   res_BC                 optional list (est, se, ci_l, ci_u, pval) for the external comparison; when
#                          NULL it is derived from the GLM fitted to `pseudo_ipd`
#   dat                    data whose `levels(dat$ARM)` are used to label arms as "C", "B", "A"
#   ipd                    internal IPD; columns RESPONSE, ARM, weights and USUBJID are read
#   pseudo_ipd             external pseudo IPD; columns RESPONSE and ARM are read
#   binary_robust_cov_type passed as `type` to `sandwich::vcovHC()`
#   weights_object         object carrying `data`, `rows_with_missing`, `boot`, `boot_seed`, `boot_strata`
#   normalize_weights      logical, whether bootstrap weights are normalized within each arm
#   eff_measure            "RD", "RR" or "OR"; selects the GLM link and the back-transformation
#   boot_ci_type           type of bootstrap confidence interval, passed to `boot.ci()`
#   endpoint_name, trt_ipd, trt_agd are accepted for interface compatibility and not used in this body.
#
# Returns `res` with `descriptive$summary` and `inferential` (fits and effect summary) filled in.
maic_anchored_binary <- function(res,
                                 res_BC = NULL,
                                 dat,
                                 ipd,
                                 pseudo_ipd,
                                 binary_robust_cov_type,
                                 weights_object,
                                 endpoint_name,
                                 normalize_weights,
                                 eff_measure,
                                 trt_ipd,
                                 trt_agd,
                                 boot_ci_type) {
  # ~~~ Analysis table
  # : set up proper link
  glm_link <- switch(eff_measure,
    "RD" = "identity",
    "RR" = "log",
    "OR" = "logit"
  )
  res_template <- list(
    est = NA,
    se = NA,
    ci_l = NA,
    ci_u = NA,
    pval = NA
  )

  # : fit glm for binary outcome and robust estimate with weights
  binobj_ipd <- glm(RESPONSE ~ ARM, ipd, family = binomial(link = glm_link))
  binobj_ipd_adj <- suppressWarnings(glm(RESPONSE ~ ARM, ipd, weights = weights, family = binomial(link = glm_link)))
  binobj_agd <- glm(RESPONSE ~ ARM, pseudo_ipd, family = binomial(link = glm_link))

  bin_robust_cov <- sandwich::vcovHC(binobj_ipd_adj, type = binary_robust_cov_type)
  bin_robust_coef <- lmtest::coeftest(binobj_ipd_adj, vcov. = bin_robust_cov)
  bin_robust_ci <- lmtest::coefci(binobj_ipd_adj, vcov. = bin_robust_cov)

  # : make general summary
  glmDesc_ipd <- glm_makeup(binobj_ipd, legend = "IPD, before matching", weighted = FALSE)
  glmDesc_ipd_adj <- glm_makeup(binobj_ipd_adj, legend = "IPD, after matching", weighted = TRUE)
  glmDesc_agd <- glm_makeup(binobj_agd, legend = "AgD, external", weighted = FALSE)
  glmDesc <- rbind(glmDesc_ipd, glmDesc_ipd_adj, glmDesc_agd)
  # NOTE(review): assumes levels(dat$ARM) are ordered common comparator, external arm, internal arm
  glmDesc <- cbind(trt_ind = c("C", "B", "A")[match(glmDesc$treatment, levels(dat$ARM))], glmDesc)
  rownames(glmDesc) <- NULL
  res$descriptive[["summary"]] <- glmDesc

  # derive ipd exp arm vs agd exp arm via bucher
  res_AC <- res_template
  res_AC$est <- bin_robust_coef[2, "Estimate"]
  res_AC$se <- bin_robust_coef[2, "Std. Error"]
  res_AC$ci_l <- bin_robust_ci[2, "2.5 %"]
  res_AC$ci_u <- bin_robust_ci[2, "97.5 %"]
  res_AC$pval <- bin_robust_coef[2, "Pr(>|z|)"]

  # unadjusted AC
  coef_ipd <- summary(binobj_ipd)$coefficients
  ci_ipd <- confint.default(binobj_ipd)
  res_AC_unadj <- res_template
  res_AC_unadj$est <- coef_ipd[2, "Estimate"]
  res_AC_unadj$se <- coef_ipd[2, "Std. Error"]
  res_AC_unadj$ci_l <- ci_ipd[2, "2.5 %"]
  res_AC_unadj$ci_u <- ci_ipd[2, "97.5 %"]
  res_AC_unadj$pval <- coef_ipd[2, "Pr(>|z|)"]

  # BC
  if (is.null(res_BC)) {
    coef_agd <- summary(binobj_agd)$coefficients
    ci_agd <- confint.default(binobj_agd)
    res_BC <- res_template
    res_BC$est <- coef_agd[2, "Estimate"]
    res_BC$se <- coef_agd[2, "Std. Error"]
    res_BC$ci_l <- ci_agd[2, "2.5 %"]
    res_BC$ci_u <- ci_agd[2, "97.5 %"]
    res_BC$pval <- coef_agd[2, "Pr(>|z|)"]
  }

  # derive AB
  res_AB <- bucher(res_AC, res_BC, conf_lv = 0.95)
  res_AB_unadj <- bucher(res_AC_unadj, res_BC, conf_lv = 0.95)

  # : get bootstrapped estimates if applicable
  if (!is.null(weights_object$boot)) {
    keep_rows <- setdiff(seq_len(nrow(weights_object$data)), weights_object$rows_with_missing)
    boot_ipd_id <- weights_object$data[keep_rows, "USUBJID", drop = FALSE]

    boot_ipd <- merge(boot_ipd_id, ipd, by = "USUBJID", all.x = TRUE)
    if (nrow(boot_ipd) != nrow(boot_ipd_id)) stop("ipd has multiple observations for some patients")
    # merge() sorts by USUBJID; restore the row order of boot_ipd_id, because the bootstrap row ids
    # stored in weights_object$boot index rows in that order. For every id (in the wanted order) look
    # up its position in the merged data.
    boot_ipd <- boot_ipd[match(boot_ipd_id$USUBJID, boot_ipd$USUBJID), ]

    stat_fun <- function(data, index, w_obj, eff_measure, normalize) {
      r <- dynGet("r", ifnotfound = NA) # Get bootstrap iteration
      if (!is.na(r)) {
        if (!all(index == w_obj$boot[, 1, r])) stop("Bootstrap and weight indices don't match")
        boot_ipd <- data[w_obj$boot[, 1, r], ]
        boot_ipd$weights <- w_obj$boot[, 2, r]

        if (normalize) {
          boot_ipd$weights <- ave(
            boot_ipd$weights,
            boot_ipd$ARM,
            FUN = function(w) w / mean(w, na.rm = TRUE)
          )
        }
      }

      # when no iteration number is found, `boot_ipd` resolves to the full data of the enclosing function
      boot_binobj_dat_adj <- suppressWarnings(
        glm(RESPONSE ~ ARM, boot_ipd, weights = boot_ipd$weights, family = binomial(link = glm_link))
      )
      boot_AC_est <- coef(boot_binobj_dat_adj)[2]
      boot_AC_var <- vcov(boot_binobj_dat_adj)[2, 2]

      boot_res_AC <- list(est = boot_AC_est, se = sqrt(boot_AC_var))
      # temp method to source in variance of BC in AgD via monte carlo, may be removed in future
      res_BC_mc <- res_BC
      res_BC_mc$est <- rnorm(1, mean = res_BC$est, sd = res_BC$se)

      boot_res_AB <- bucher(boot_res_AC, res_BC_mc, conf_lv = 0.95)

      c(
        est_AB = boot_res_AB$est,
        var_AB = boot_res_AB$se^2,
        se_AB = boot_res_AB$se,
        est_AC = boot_res_AC$est,
        se_AC = boot_res_AC$se,
        var_AC = boot_res_AC$se^2
      )
    }

    # Revert seed to how it was for weight bootstrap sampling
    old_seed <- globalenv()$.Random.seed
    on.exit(suspendInterrupts(set_random_seed(old_seed)), add = TRUE)
    set_random_seed(weights_object$boot_seed)

    R <- dim(weights_object$boot)[3]
    boot_res <- boot(
      boot_ipd,
      stat_fun,
      R = R,
      w_obj = weights_object,
      eff_measure = eff_measure,
      normalize = normalize_weights,
      strata = weights_object$boot_strata
    )
    boot_ci <- boot.ci(boot_res, type = boot_ci_type, w_obj = weights_object)

    # positions of the lower/upper limits and the element name inside the boot.ci result
    l_u_index <- switch(boot_ci_type,
      "norm" = list(2, 3, "normal"),
      "basic" = list(4, 5, "basic"),
      "stud" = list(4, 5, "student"),
      "perc" = list(4, 5, "percent"),
      "bca" = list(4, 5, "bca")
    )

    # back-transformation from the model scale to the reporting scale
    transform_estimate <- switch(eff_measure,
      "RD" = function(x) x * 100,
      "RR" = exp,
      "OR" = exp
    )

    # boot results for A v B, method 1 (maybe retired in future version)
    boot_res_AB <- list(
      est = as.vector(transform_estimate(boot_res$t0[1])),
      se = NA,
      ci_l = transform_estimate(boot_ci[[l_u_index[[3]]]][l_u_index[[1]]]),
      ci_u = transform_estimate(boot_ci[[l_u_index[[3]]]][l_u_index[[2]]]),
      pval = NA
    )

    # boot results for A v C
    boot_ci_ac <- boot.ci(boot_res, type = boot_ci_type, w_obj = weights_object, index = c(4, 6))
    boot_res_AC <- list(
      est = as.vector(transform_estimate(boot_res$t0[4])),
      se = NA,
      ci_l = transform_estimate(boot_ci_ac[[l_u_index[[3]]]][l_u_index[[1]]]),
      ci_u = transform_estimate(boot_ci_ac[[l_u_index[[3]]]][l_u_index[[2]]]),
      pval = NA
    )

    # boot results for A v B, method 2
    boot_res_AC2 <- list(
      est = as.vector(boot_res$t0[4]),
      se = NA,
      ci_l = boot_ci_ac[[l_u_index[[3]]]][l_u_index[[1]]],
      ci_u = boot_ci_ac[[l_u_index[[3]]]][l_u_index[[2]]],
      pval = NA
    )
    boot_res_AC2$se <- find_SE_from_CI(boot_res_AC2$ci_l, boot_res_AC2$ci_u, 0.95, log = FALSE)
    boot_res_AB2 <- bucher(boot_res_AC2, res_BC, conf_lv = 0.95)
    boot_res_AB2 <- list(
      est = transform_estimate(boot_res_AB2$est),
      se = NA,
      ci_l = transform_estimate(boot_res_AB2$ci_l),
      ci_u = transform_estimate(boot_res_AB2$ci_u),
      pval = NA
    )
  } else {
    boot_res_AC <- NULL
    boot_res_AB <- NULL
    boot_res_AB2 <- NULL
    boot_res <- NULL
  }

  # transform effect measures
  if (eff_measure %in% c("RR", "OR")) {
    res_AB <- transform_ratio(res_AB)
    res_AB_unadj <- transform_ratio(res_AB_unadj)
    res_AC <- transform_ratio(res_AC)
    res_AC_unadj <- transform_ratio(res_AC_unadj)
    res_BC <- transform_ratio(res_BC)
  } else if (eff_measure == "RD") {
    res_AB <- transform_absolute(res_AB)
    res_AB_unadj <- transform_absolute(res_AB_unadj)
    res_AC <- transform_absolute(res_AC)
    res_AC_unadj <- transform_absolute(res_AC_unadj)
    res_BC <- transform_absolute(res_BC)
  }


  # report all raw fitted obj
  res$inferential[["fit"]] <- list(
    model_before_ipd = binobj_ipd,
    model_after_ipd = binobj_ipd_adj,
    model_agd = binobj_agd,
    res_AC = res_AC,
    res_AC_unadj = res_AC_unadj,
    res_BC = res_BC,
    res_AB = res_AB,
    res_AB_unadj = res_AB_unadj,
    boot_res = boot_res,
    boot_res_AC = boot_res_AC,
    boot_res_AB_mc = boot_res_AB,
    boot_res_AB = boot_res_AB2
  )

  # compile binary effect estimate result
  res$inferential[["summary"]] <- data.frame(
    case = c("AC", "adjusted_AC", "BC", "AB", "adjusted_AB"),
    EST = c(
      res_AC_unadj$est,
      res_AC$est,
      res_BC$est,
      res_AB_unadj$est,
      res_AB$est
    ),
    LCL = c(
      res_AC_unadj$ci_l,
      res_AC$ci_l,
      res_BC$ci_l,
      res_AB_unadj$ci_l,
      res_AB$ci_l
    ),
    UCL = c(
      res_AC_unadj$ci_u,
      res_AC$ci_u,
      res_BC$ci_u,
      res_AB_unadj$ci_u,
      res_AB$ci_u
    ),
    pval = c(
      res_AC_unadj$pval,
      res_AC$pval,
      res_BC$pval,
      res_AB_unadj$pval,
      res_AB$pval
    )
  )
  # the estimate column is named after the requested effect measure
  names(res$inferential[["summary"]])[2] <- eff_measure

  # output
  res
}

# Functions for matching step: estimation of individual weights

# functions to be exported ---------------------------------------

#' Derive individual weights in the matching step of MAIC
#'
#' Assuming data is properly processed, this function takes individual patient data (IPD) with centered covariates
#' (effect modifiers and/or prognostic variables) as input, and generates weights for each individual in IPD trial to
#' match the covariates in aggregate data.
#'
#' @param data a numeric matrix, centered covariates of IPD, no missing value in any cell is allowed
#' @param centered_colnames a character or numeric vector (column indicators) of centered covariates
#' @param start_val a scalar, the starting value for all coefficients of the propensity score regression
#' @param method a string, name of the optimization algorithm (see 'method' argument of \code{base::optim()}) The
#'   default is `"BFGS"`, other options are `"Nelder-Mead"`, `"CG"`, `"L-BFGS-B"`, `"SANN"`, and `"Brent"`
#' @param n_boot_iteration an integer, number of bootstrap iterations. By default is NULL which means bootstrapping
#'   procedure will not be triggered, and hence the element `"boot"` of output list object will be NULL.
#' @param set_seed_boot a scalar, the random seed for conducting the bootstrapping, only relevant if
#'   \code{n_boot_iteration} is not NULL. By default, use seed 1234
#' @param boot_strata a character vector of column names in \code{data} that defines the strata for bootstrapping.
#'   This ensures that samples are drawn proportionally from each defined stratum. If \code{NULL},
#'   no stratification during bootstrapping process. By default, it is "ARM"
#' @param ... Additional `control` parameters passed to [stats::optim].
#'
#' @return a list with the following elements,
#' \describe{
#'   \item{data}{a data.frame, includes the input \code{data} with appended column 'weights' and 'scaled_weights'.
#'   Scaled weights has a summation to be the number of rows in \code{data} that has no missing value in any of the
#'   effect modifiers}
#'   \item{centered_colnames}{column names of centered effect modifiers in \code{data}}
#'   \item{nr_missing}{number of rows in \code{data} that has at least 1 missing value in specified centered effect
#'   modifiers}
#'   \item{ess}{effective sample size, square of sum divided by sum of squares}
#'   \item{opt}{R object returned by \code{base::optim()}, for assess convergence and other details}
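#'   \item{boot_strata}{'strata' from a boot::boot object, or NULL when no bootstrapping is requested}
#'   \item{boot_seed}{'seed' from a boot::boot object, i.e. the random number generator state recorded for the
#'   bootstrap sampling, or NULL when no bootstrapping is requested}
#'   \item{boot}{a n by 2 by k array or NULL, where n equals to the number of rows in \code{data} without missing
#'      value in the centered effect modifiers, and k equals \code{n_boot_iteration}. The 2 columns in the second
#'      dimension include a column of numeric indexes of the rows (among those complete rows) that are selected at a
#'      bootstrapping iteration and a column of weights. \code{boot} is NULL when argument \code{n_boot_iteration}
#'      is set as NULL
#'   }
#'   \item{rows_with_missing}{row indexes of \code{data} that have at least 1 missing value in the specified centered
#'   effect modifiers}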
#' }
#' @importFrom boot boot
#' @examples
#' data(centered_ipd_sat)
#' centered_colnames <- grep("_CENTERED", colnames(centered_ipd_sat), value = TRUE)
#' weighted_data <- estimate_weights(data = centered_ipd_sat, centered_colnames = centered_colnames)
#' \donttest{
#' # To later estimate bootstrap confidence intervals, we calculate the weights
#' # for the bootstrap samples:
#' weighted_data_boot <- estimate_weights(
#'   data = centered_ipd_sat, centered_colnames = centered_colnames, n_boot_iteration = 100
#' )
#' }
#' @export

estimate_weights <- function(data,
                             centered_colnames = NULL,
                             start_val = 0,
                             method = "BFGS",
                             n_boot_iteration = NULL,
                             set_seed_boot = 1234,
                             boot_strata = "ARM",
                             ...) {
  # pre check
  ch1 <- is.data.frame(data)
  if (!ch1) {
    stop("'data' is not a data.frame")
  }

  # NOTE(review): a NULL 'centered_colnames' (the default) ends in the final `else` below and is
  # rejected, so callers must always supply the columns explicitly
  ch2 <- (!is.null(centered_colnames))
  if (ch2 && is.numeric(centered_colnames)) {
    ch2b <- any(centered_colnames < 1 | centered_colnames > ncol(data))
    if (ch2b) {
      stop("specified centered_colnames are out of bound")
    }
  } else if (ch2 && is.character(centered_colnames)) {
    ch2b <- !all(centered_colnames %in% names(data))
    if (ch2b) {
      stop("1 or more specified centered_colnames are not found in 'data'")
    }
  } else {
    stop("'centered_colnames' should be either a numeric or character vector")
  }

  # every selected column has to be numeric; vapply guarantees a logical vector for any input
  ch3 <- vapply(centered_colnames, function(ii) {
    !is.numeric(data[[ii]])
  }, logical(1))
  if (any(ch3)) {
    stop(paste0(
      "following columns of 'data' are not numeric for the calculation:",
      paste(which(ch3), collapse = ",")
    ))
  }

  if (!is.null(boot_strata)) {
    ch4 <- boot_strata %in% names(data)
    if (!all(ch4)) {
      stop("Some variables in boot_strata are not in data: ", toString(boot_strata[!ch4]))
    }
  }

  # prepare data for optimization
  # NOTE(review): unreachable at present, NULL is rejected by the validation above
  if (is.null(centered_colnames)) centered_colnames <- seq_len(ncol(data))
  EM <- data[, centered_colnames, drop = FALSE]
  # rows with a missing value in any centered covariate are excluded from the optimisation
  ind <- apply(EM, 1, function(xx) any(is.na(xx)))
  nr_missing <- sum(ind)
  rows_with_missing <- which(ind)
  EM <- as.matrix(EM[!ind, , drop = FALSE])

  # estimate weights
  opt1 <- optimise_weights(matrix = EM, par = rep(start_val, ncol(EM)), method = method, ...)
  alpha <- opt1$alpha
  wt <- opt1$wt
  # rescale so that the weights sum to the number of rows used
  wt_rs <- (wt / sum(wt)) * nrow(EM)

  # bootstrapping
  outboot <- if (is.null(n_boot_iteration)) {
    boot_seed <- NULL
    boot_strata_out <- NULL
    NULL
  } else {
    # Make sure to leave '.Random.seed' as-is on exit
    old_seed <- globalenv()$.Random.seed
    on.exit(suspendInterrupts(set_random_seed(old_seed)), add = TRUE)
    set.seed(set_seed_boot)

    if (!is.null(boot_strata)) {
      use_strata <- interaction(data[!ind, boot_strata])
    } else {
      use_strata <- rep(1, nrow(EM))
    }
    # drop = FALSE keeps the resample a matrix even when there is a single centered covariate;
    # the fitted alpha is used as starting value for every resample
    boot_statistic <- function(d, w) {
      optimise_weights(d[w, , drop = FALSE], par = alpha, method = method, ...)$wt[, 1]
    }
    boot_out <- boot::boot(EM, statistic = boot_statistic, R = n_boot_iteration, strata = use_strata)

    # slice [, 1, k] holds the sampled row ids and [, 2, k] the weights of bootstrap iteration k
    boot_array <- array(dim = list(nrow(EM), 2, n_boot_iteration))
    dimnames(boot_array) <- list(sampled_patient = NULL, c("rowid", "weight"), bootstrap_iteration = NULL)
    boot_array[, 1, ] <- t(boot::boot.array(boot_out, TRUE))
    boot_array[, 2, ] <- t(boot_out$t)
    boot_seed <- boot_out$seed
    boot_strata_out <- boot_out$strata
    boot_array
  }

  # append weights to data
  data$weights <- NA
  data$weights[!ind] <- wt

  data$scaled_weights <- NA
  data$scaled_weights[!ind] <- wt_rs

  if (is.numeric(centered_colnames)) centered_colnames <- names(data)[centered_colnames]

  # Output
  outdata <- list(
    data = data,
    centered_colnames = centered_colnames,
    nr_missing = nr_missing,
    ess = sum(wt)^2 / sum(wt^2),
    opt = opt1$opt,
    boot = outboot,
    boot_seed = boot_seed,
    boot_strata = boot_strata_out,
    rows_with_missing = rows_with_missing
  )

  class(outdata) <- c("maicplus_estimate_weights", "list")
  outdata
}


#' Estimate weights using `optim`
#'
#' Minimises `sum(exp(matrix %*% alpha))` over `alpha` and returns the resulting weights
#' `exp(matrix %*% alpha)`. A warning is raised when `optim` reports non-convergence.
#'
#' @param matrix Matrix of data to be used for estimating weights
#' @param par Vector of starting values for the parameters with length equal to the number of columns in `matrix`.
#'   Must be numeric and finite.
#' @param method Method parameter passed to [stats::optim]
#' @param maxit Maximum number of iterations, passed to [stats::optim] through `control`
#' @param trace Tracing level, passed to [stats::optim] through `control`
#' @param ... Additional `control` parameters passed to [stats::optim]
#'
#' @return List containing the `optim` result `opt`, estimated `alpha` values and `wt` weights for all rows of
#'   matrix
#' @noRd
optimise_weights <- function(matrix,
                             par = rep(0, ncol(matrix)),
                             method = "BFGS",
                             maxit = 300,
                             trace = 0,
                             ...) {
  # starting values have to be numeric AND finite; NA/NaN/Inf would only fail later inside optim()
  par_is_valid <- is.numeric(par) && all(is.finite(par))
  if (!all(par_is_valid, length(par) == ncol(matrix))) {
    stop("par must be a numeric vector with finite values of length equal to the number of columns in 'matrix'")
  }
  opt1 <- optim(
    par = par,
    fn = function(alpha, X) sum(exp(X %*% alpha)),
    # analytic gradient of the objective: each row of X scaled by its weight, summed per column
    gr = function(alpha, X) colSums(sweep(X, 1, exp(X %*% alpha), "*")),
    X = matrix,
    method = method,
    control = list(maxit = maxit, trace = trace, ...)
  )
  if (opt1$convergence != 0) {
    warning(
      "optim() did not converge. ",
      opt1$message,
      "\nSee ?optim for more information on convergence code: ", opt1$convergence
    )
  }
  list(
    opt = opt1,
    alpha = opt1$par,
    wt = exp(matrix %*% opt1$par)
  )
}

#' Calculate Statistics for Weight Plot Legend
#'
#' Calculates ESS reduction and median weights which is used to create legend for weights plot
#'
#' @param weighted_data object returned after calculating weights using [estimate_weights]
#'
#' @return list of ESS, ESS reduction, median value of scaled and unscaled weights, and missing count
#' @examples
#' data("weighted_sat")
#' calculate_weights_legend(weighted_sat)
#' @export
#' @keywords internal

calculate_weights_legend <- function(weighted_data) {
  # only objects carrying the estimate_weights() class are accepted
  has_expected_class <- inherits(weighted_data, "maicplus_estimate_weights")
  if (!has_expected_class) {
    stop("weighted_data must be class `maicplus_estimate_weights` generated by estimate_weights()")
  }

  raw_weights <- weighted_data$data$weights

  # rows without a weight are counted separately and left out of the sample size
  n_missing <- sum(is.na(raw_weights))
  n_weighted <- length(raw_weights) - n_missing

  # percentage by which the effective sample size falls short of the weighted sample size
  pct_reduction <- 100 * (1 - (weighted_data$ess / n_weighted))

  # medians are taken over the non-missing weights only
  median_unscaled <- median(na.omit(raw_weights))
  median_scaled <- median(na.omit(weighted_data$data$scaled_weights))

  list(
    ess = round(weighted_data$ess, 2),
    ess_reduction = round(pct_reduction, 2),
    wt_median = round(median_unscaled, 4),
    wt_scaled_median = round(median_scaled, 4),
    nr_na = n_missing
  )
}

#' Plot MAIC weights in a histogram with key statistics in legend
#'
#' Generates a base R histogram of weights. Default is to plot either unscaled or scaled weights and not both.
#'
#' @param weighted_data object returned after calculating weights using [estimate_weights]
#' @param bin_col a string, color for the bins of histogram
#' @param vline_col a string, color for the vertical line in the histogram
#' @param main_title title of the plot
#' @param scaled_weights an indicator for using scaled weights instead of regular weights
#'
#' @return a plot of unscaled or scaled weights
#' @examples
#' plot_weights_base(weighted_sat,
#'   bin_col = "#6ECEB2",
#'   vline_col = "#688CE8",
#'   main_title = c("Scaled Individual Weights", "Unscaled Individual Weights"),
#'   scaled_weights = TRUE
#' )
#' @export

plot_weights_base <- function(weighted_data,
                              bin_col, vline_col, main_title,
                              scaled_weights) {
  # summary statistics (ESS, reduction, medians, missing count) shown in the legend
  weights_stat <- calculate_weights_legend(weighted_data)

  if (scaled_weights) {
    wt <- weighted_data$data$scaled_weights
    median_wt <- weights_stat$wt_scaled_median
  } else {
    wt <- weighted_data$data$weights
    median_wt <- weights_stat$wt_median
  }

  # prepare legend
  plot_legend <- c(
    paste0("Median = ", median_wt),
    paste0("ESS = ", weights_stat$ess),
    # full element name, no reliance on partial matching of `$`
    paste0("Reduction% = ", weights_stat$ess_reduction)
  )
  # only the median entry gets a (dashed) line symbol in the legend
  plot_lty <- c(2, NA, NA)

  if (weights_stat$nr_na > 0) {
    plot_legend <- c(plot_legend, paste0("#Missing Weights = ", weights_stat$nr_na))
    plot_lty <- c(plot_lty, NA)
  }

  # plot
  original_par <- par(mgp = c(2.3, 0.5, 0), cex.axis = 0.9, cex.lab = 0.95, bty = "n")
  on.exit(par(original_par))
  hist(wt, border = "white", col = bin_col, main = main_title, breaks = 20, yaxt = "n", xlab = "")
  axis(2, las = 1)
  # weights are NA for rows with missing covariates; without na.rm the median would be NA and
  # the reference line would silently not be drawn
  abline(v = median(wt, na.rm = TRUE), lty = 2, col = vline_col, lwd = 2)
  legend("topright", bty = "n", lty = plot_lty, cex = 0.8, legend = plot_legend)
}

#' Plot MAIC weights in a histogram with key statistics in legend using `ggplot2`
#'
#' Generates a `ggplot` histogram of weights. Default is to plot both unscaled and scaled weights on a same graph.
#'
#' @param weighted_data object returned after calculating weights using [estimate_weights]
#' @param bin_col a string, color for the bins of histogram
#' @param vline_col a string, color for the vertical line in the histogram
#' @param main_title Name of scaled weights plot and unscaled weights plot, respectively.
#' @param bins number of bin parameter to use
#'
#' @return a plot of unscaled and scaled weights
#' @examples
#' if (requireNamespace("ggplot2")) {
#'   plot_weights_ggplot(weighted_sat,
#'     bin_col = "#6ECEB2",
#'     vline_col = "#688CE8",
#'     main_title = c("Scaled Individual Weights", "Unscaled Individual Weights"),
#'     bins = 50
#'   )
#' }
#' @export

plot_weights_ggplot <- function(weighted_data, bin_col, vline_col,
                                main_title,
                                bins) {
  # check if ggplot2 package is installed
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    stop("ggplot2 package is needed to run this function")
  }

  weights_stat <- calculate_weights_legend(weighted_data)

  # prepare dataset to use in ggplot
  # column order follows 'main_title': first the scaled weights, then the unscaled weights, so that
  # each facet title is attached to the matching set of weights
  wt_data0 <- weighted_data$data[, c("scaled_weights", "weights")]
  colnames(wt_data0) <- main_title
  wt_data <- stack(wt_data0)
  wt_data$median <- ifelse(wt_data$ind == main_title[1],
    weights_stat$wt_scaled_median, weights_stat$wt_median
  )

  # create legend data
  # one label per facet, in the same (scaled, unscaled) order as 'main_title'
  lab <- with(weights_stat, {
    lab <- c(paste0("Median = ", wt_scaled_median), paste0("Median = ", wt_median))
    lab <- paste0(lab, "\nESS = ", ess, "\nReduction% = ", ess_reduction)
    if (nr_na > 0) lab <- paste0(lab, "\n#Missing Weights = ", nr_na)
    lab
  })
  legend_data <- data.frame(ind = main_title, lab = lab)

  values <- median <- NULL # dummy assignment for undefined variable check
  hist_plot <- ggplot2::ggplot(wt_data) +
    ggplot2::geom_histogram(ggplot2::aes(x = values), bins = bins, color = bin_col, fill = bin_col) +
    ggplot2::geom_vline(ggplot2::aes(xintercept = median),
      color = vline_col,
      linetype = "dashed"
    ) +
    ggplot2::theme_bw() +
    ggplot2::facet_wrap(~ind, ncol = 1) +
    # legend text is pinned to the top right corner of each facet
    ggplot2::geom_text(
      data = legend_data,
      ggplot2::aes(label = lab), x = Inf, y = Inf, hjust = 1, vjust = 1, size = 3
    ) +
    ggplot2::theme(
      axis.title = ggplot2::element_text(size = 12),
      axis.text = ggplot2::element_text(size = 12)
    ) +
    ggplot2::ylab("Frequency") +
    ggplot2::xlab("Weight")

  hist_plot
}


#' Plot method for Estimate Weights objects
#'
#' The plot function displays individuals weights with key summary in top right legend that includes
#' median weight, effective sample size (ESS), and reduction percentage (what percent ESS is reduced from the
#' original sample size). There are two options of plotting: base R plot and `ggplot`. The default
#' for base R plot is to plot unscaled and scaled separately. The default
#' for `ggplot` is to plot unscaled and scaled weights on a same plot.
#'
#' @param x object from [estimate_weights]
#' @param ggplot indicator to print base weights plot or `ggplot` weights plot
#' @param bin_col a string, color for the bins of histogram
#' @param vline_col a string, color for the vertical line in the histogram
#' @param main_title title of the plot. For ggplot, name of scaled weights plot and unscaled weights plot, respectively.
#' @param scaled_weights (base plot only) an indicator for using scaled weights instead of regular weights
#' @param bins (`ggplot` only) number of bin parameter to use
#'
#' @examples
#' plot(weighted_sat)
#'
#' if (requireNamespace("ggplot2")) {
#'   plot(weighted_sat, ggplot = TRUE)
#' }
#' @describeIn estimate_weights Plot method for estimate_weights objects
#' @export

plot.maicplus_estimate_weights <- function(x, ggplot = FALSE,
                                           bin_col = "#6ECEB2", vline_col = "#688CE8",
                                           main_title = NULL,
                                           scaled_weights = TRUE,
                                           bins = 50, ...) {
  # Dispatch to the ggplot or the base-graphics helper. When the caller leaves
  # `main_title` as NULL, a default title suited to the chosen helper is used.
  if (ggplot) {
    if (is.null(main_title)) {
      # two titles: scaled weights first, unscaled weights second
      main_title <- c("Scaled Individual Weights", "Unscaled Individual Weights")
    }
    plot_weights_ggplot(x, bin_col, vline_col, main_title, bins)
  } else {
    if (is.null(main_title)) {
      # a single title, chosen by `scaled_weights`
      main_title <- ifelse(scaled_weights, "Scaled Individual Weights", "Unscaled Individual Weights")
    }
    plot_weights_base(x, bin_col, vline_col, main_title, scaled_weights)
  }
}


#' Check to see if weights are optimized correctly
#'
#' This function checks to see if the optimization is done properly
#' by checking the covariate averages before and after adjustment.
#' In case of ties when calculating median,
#' we return the mean of the two numbers. For more details,
#' see `ties` parameter in [matrixStats::weightedMedian].
#'
#' @param weighted_data object returned after calculating weights using \code{\link{estimate_weights}}
#' @param processed_agd a data frame, object returned after using \code{\link{process_agd}} or
#' aggregated data following the same naming convention
#'
#' @examples
#' data(weighted_sat)
#' data(agd)
#' check_weights(weighted_sat, process_agd(agd))
#' @importFrom matrixStats weightedMedian
#'
#' @return data.frame of weighted and unweighted covariate averages of the IPD,
#' average of aggregate data, and sum of inner products of covariate \eqn{x_i} and the weights (\eqn{exp(x_i\beta)})
#' @export

check_weights <- function(weighted_data, processed_agd) {
  # Compare covariate summaries of the IPD, before and after weighting, with the
  # aggregate data (AgD) they were matched to.
  #   weighted_data: list holding `data` (IPD with a `weights` column and the
  #     centered columns) and `centered_colnames` (names of the centered columns).
  #   processed_agd: AgD with columns named <COVARIATE>_<MEAN|MEDIAN|SD|PROP>;
  #     presumably a single row, since values are pulled out with unlist().
  # Returns a data frame of class "maicplus_check_weights"; explanatory notes are
  # stored in its "footer" attribute.
  ipd_with_weights <- weighted_data$data
  match_cov <- weighted_data$centered_colnames

  # if algorithm is correct, all centered columns should have a weighted summation to a very small number around zero
  num_check <- ipd_with_weights$weights %*% as.matrix(ipd_with_weights[, match_cov, drop = FALSE])
  num_check <- round(num_check, 4)

  # for reporting
  outdata <- data.frame(
    covariate = gsub("_CENTERED$", "", match_cov),
    match_stat = NA,
    internal_trial = NA,
    internal_trial_after_weighted = NA,
    external_trial = NA,
    sum_centered_IPD_with_weights = as.vector(num_check)
  )
  attr(outdata, "footer") <- list()

  # Classify each centered column by the statistic it was matched on.
  # A covariate counts as mean-matched only when the AgD holds a column named
  # exactly <covariate>_MEAN. Substring matching would mislabel e.g. SMOKER as
  # "Mean" whenever a column such as EX_SMOKER_YEARS_MEAN exists.
  is_median <- grepl("_MEDIAN$", outdata$covariate)
  is_sd <- grepl("_SQUARED$", outdata$covariate)
  has_agd_mean <- paste0(outdata$covariate, "_MEAN") %in% names(processed_agd)
  outdata$match_stat <- ifelse(is_median, "Median",
    ifelse(is_sd, "SD",
      ifelse(has_agd_mean, "Mean", "Prop")
    )
  )
  # strip the statistic marker only where it is a suffix
  outdata$covariate <- sub("_(MEDIAN|SQUARED)$", "", outdata$covariate)

  # fill in corresponding agd data
  agd_cols <- paste(outdata$covariate, toupper(outdata$match_stat), sep = "_")
  outdata$external_trial <- unlist(processed_agd[agd_cols])

  # fill in stat from unweighted and weighted IPD
  for (ii in seq_len(nrow(outdata))) {
    covname <- outdata$covariate[ii]
    stat <- outdata$match_stat[ii]
    ipd_values <- ipd_with_weights[[covname]]

    if (stat %in% c("Mean", "Prop")) {
      outdata$internal_trial[ii] <- mean(ipd_values, na.rm = TRUE)
      outdata$internal_trial_after_weighted[ii] <- weighted.mean(
        ipd_values,
        w = ipd_with_weights$weights, na.rm = TRUE
      )
    } else if (stat == "Median") {
      outdata$internal_trial[ii] <- quantile(ipd_values,
        probs = 0.5,
        na.rm = TRUE,
        type = 2,
        names = FALSE
      ) # SAS default
      outdata$internal_trial_after_weighted[ii] <- weightedMedian(
        x = ipd_values,
        w = ipd_with_weights$weights,
        interpolate = FALSE,
        ties = "mean",
        na.rm = TRUE
      )
      # no IPD equals to reported AgD median
      msg_ind <- !any(ipd_values == outdata$external_trial[ii], na.rm = TRUE)
      if (msg_ind) {
        msg_txt <- paste0(
          "For covariate ", covname, ", it was matched to AgD median, but there is no IPD identical to AgD median, ",
          "hence median after weighted will not equal to AgD median exactly."
        )
        attr(outdata, "footer") <- c(attr(outdata, "footer"), msg_txt)
      }
    } else if (stat == "SD") {
      outdata$internal_trial[ii] <- sd(ipd_values, na.rm = TRUE)
      # weighted SD is computed around the AgD mean: sqrt(E_w[x^2] - mean_agd^2)
      wm_squared <- weighted.mean(ipd_values^2, w = ipd_with_weights$weights, na.rm = TRUE)
      ms_agd <- processed_agd[[paste0(covname, "_MEAN")]]^2
      outdata$internal_trial_after_weighted[ii] <- sqrt(wm_squared - ms_agd)
    }
  }

  # output
  class(outdata) <- c("maicplus_check_weights", "data.frame")
  outdata
}


#' Print method for Check Weights objects
#'
#' @param x object from [check_weights]
#' @param mean_digits number of digits for rounding mean columns in the output
#' @param prop_digits number of digits for rounding proportion columns in the output
#' @param sd_digits number of digits for rounding standard deviation columns in the output
#' @param digits minimal number of significant digits, see [print.default].
#' @param ... further arguments to [print.data.frame]
#' @describeIn check_weights Print method for check_weights objects
#' @export

print.maicplus_check_weights <- function(x,
                                         mean_digits = 2,
                                         prop_digits = 2,
                                         sd_digits = 3,
                                         digits = getOption("digits"), ...) {
  # Print the check table with per-statistic rounding, followed by any
  # footnotes stored in the "footer" attribute. Returns `x` invisibly.
  original <- x # kept unrounded so the caller gets back what was passed in

  # digits to use for each row, chosen by the statistic the row was matched on
  round_digits <- c("Mean" = mean_digits, "Prop" = prop_digits, "SD" = sd_digits)[x$match_stat]
  # statistics without a dedicated setting (e.g. "Median") fall back to `digits`
  round_digits[is.na(round_digits)] <- digits

  x$external_trial <- round(x$external_trial, round_digits)
  x$internal_trial <- round(x$internal_trial, round_digits)
  x$internal_trial_after_weighted <- round(x$internal_trial_after_weighted, round_digits)

  print.data.frame(x, ...)
  footer <- unlist(attr(x, "footer"))
  if (length(footer) > 0) {
    cat("\n")
    # one numbered footnote per line; each line is newline-terminated so that
    # several notes (and the following prompt) do not run together
    cat(paste0("[", seq_along(footer), "] ", footer, "\n"), sep = "")
  }
  invisible(original)
}

#' Note on Effective Sample Size Reduction
#'
#' @param width Number of characters to break string into new lines (`\n`).
#'
#' @return A character string
#' @keywords internal
ess_footnote_text <- function(width = 0.9 * getOption("width")) {
  # Fixed footnote text. strwrap() re-flows it to `width`, so the line breaks
  # inside the literal below are treated as ordinary whitespace.
  note <- "An ESS reduction up to ~60% is not unexpected based on the 2021 survey of NICE's technology appraisals
(https://onlinelibrary.wiley.com/doi/full/10.1002/jrsm.1511), whereas a reduction of >75% is less common
and it may be considered suboptimal."
  wrapped_lines <- strwrap(note, width = width)
  # join the wrapped lines back into a single string
  paste(wrapped_lines, collapse = "\n")
}

# Functions for pre-processing data before conduct MAIC

# Functions to be exported ---------------------------------------

#' Pre-process aggregate data
#'
#' This function checks the format of the aggregate data.
#' Data is required to have three columns: STUDY, ARM, and N.
#' Column names that do not have legal suffixes (MEAN, MEDIAN, SD, COUNT, or PROP) are dropped.
#' If a variable is a count variable, it is converted to proportions by dividing the sample size (N).
#' Note, when the count is specified, proportion is always calculated based on the count, that is,
#' specified proportion will be ignored if applicable.
#' If the aggregated data comes from multiple sources (i.e. different analysis population) and
#' sample size differs for each variable, one option is to specify proportion directly instead of count by using suffix
#' _PROP.
#'
#' @param raw_agd raw aggregate data should contain STUDY, ARM, and N. Variable names should be followed
#' by legal suffixes (i.e. MEAN, MEDIAN, SD, COUNT, or PROP).
#'
#' @examples
#' data(agd)
#' agd <- process_agd(agd)
#'
#' @return pre-processed aggregate level data
#' @export

process_agd <- function(raw_agd) {
  # Validate and normalise aggregate data (AgD):
  #   1. upper-case all column names (and accept TREATMENT as an alias of ARM),
  #   2. keep STUDY, ARM, N plus columns ending in a legal suffix,
  #   3. pool arm-level rows via complete_agd() when no "total" row is present,
  #   4. convert *_COUNT columns into *_PROP (count / N),
  #   5. return only the row(s) where ARM is "total" (case-insensitive).
  raw_agd <- as.data.frame(raw_agd)
  # make all column names to be capital letters to avoid different style
  names(raw_agd) <- toupper(names(raw_agd))

  # define column name patterns
  must_exist <- c("STUDY", "ARM", "N")
  legal_suffix <- c("MEAN", "MEDIAN", "SD", "COUNT", "PROP")
  # a usable column ends in "_<SUFFIX>"; anchoring at the end of the name means
  # a name with a trailing underscore such as "AGE_MEAN_" is not accepted
  suffix_pattern <- paste0("_(", paste(legal_suffix, collapse = "|"), ")$")

  # swap "TREATMENT" column to "ARM", if applicable
  if ("TREATMENT" %in% names(raw_agd) && (!"ARM" %in% names(raw_agd))) {
    raw_agd$ARM <- raw_agd$TREATMENT
    raw_agd <- raw_agd[, names(raw_agd) != "TREATMENT", drop = FALSE]
    warning("'TREATMENT' column is renamed as 'ARM'")
  }

  # check: must exist
  if (!all(must_exist %in% names(raw_agd))) {
    stop("At least 1 of the must-exists columns (STUDY, ARM, N) cannot be found in raw_agd!")
  }

  # check: legal suffix (robust to multiple "_" in the column name)
  other_colnames <- setdiff(names(raw_agd), must_exist)
  use_cols <- other_colnames[grepl(suffix_pattern, other_colnames)]
  use_agd <- raw_agd[, c(must_exist, use_cols), drop = FALSE]
  if (!all(other_colnames %in% use_cols)) {
    warning(paste0(
      "following columns are ignored since it does not follow the naming conventions:",
      paste(setdiff(other_colnames, use_cols), collapse = ",")
    ))
  }

  # If the aggregate data is divided by different arms, calculate pooled arm statistics using
  # complete_agd function; complete statistics is specified by ARM=="Total"
  if (!"total" %in% tolower(use_agd$ARM)) {
    use_agd <- complete_agd(use_agd)
  }

  # calculate percentage columns
  count_idx <- which(grepl("_COUNT$", names(use_agd)))
  if (length(count_idx) > 0) {
    for (i in count_idx) {
      tmp_prop <- use_agd[[i]] / use_agd$N
      # in case some count are not specified, but proportion are specified, copy over those proportions
      # this also means, in case count is specified, proportion is ignored even it is specified
      prop_name_i <- gsub("_COUNT$", "_PROP", names(use_agd)[i])
      if (prop_name_i %in% names(use_agd)) {
        tmp_prop[is.na(tmp_prop)] <- use_agd[is.na(tmp_prop), prop_name_i]
        # mark the user-supplied proportion column for removal; the converted
        # count column takes over its name below
        names(use_agd)[names(use_agd) == prop_name_i] <- paste0(prop_name_i, "_redundant")
      }
      use_agd[[i]] <- tmp_prop
    }
    names(use_agd) <- gsub("_COUNT$", "_PROP", names(use_agd))
  }
  use_agd <- use_agd[, !grepl("_redundant$", names(use_agd)), drop = FALSE]

  # output
  use_agd[tolower(use_agd$ARM) == "total", , drop = FALSE]
}


#' Create dummy variables from categorical variables in an individual patient data (ipd)
#'
#' This is a convenient function to convert categorical variables into dummy binary variables.
#' This would be especially useful if the variable has more than two factors.
#' Note that the original variable is kept after a variable is dummized.
#'
#' @param raw_ipd ipd data that contains variable to dummize
#' @param dummize_cols vector of column names to binarize
#' @param dummize_ref_level vector of reference level of the variables to binarize
#'
#' @examples
#' data(adsl_twt)
#' dummize_ipd(adsl_twt, dummize_cols = c("SEX"), dummize_ref_level = c("Male"))
#'
#' @return ipd with dummized columns
#' @export

dummize_ipd <- function(raw_ipd, dummize_cols, dummize_ref_level) {
  # For every column in `dummize_cols`, append one 0/1 column per
  # non-reference level, named toupper("<column>_<level>"). The original
  # column is kept. `dummize_ref_level[i]` is the reference for `dummize_cols[i]`.
  for (i in seq_along(dummize_cols)) {
    col_name <- dummize_cols[i]
    ref_level <- dummize_ref_level[i]

    yy <- raw_ipd[[col_name]]
    yy_levels <- na.omit(unique(yy))
    yy <- factor(as.character(yy), levels = c(ref_level, setdiff(yy_levels, ref_level)))
    other_levels <- levels(yy)[-1]

    # a variable that only contains its reference level has nothing to encode;
    # skip it instead of failing while naming an empty data frame
    if (length(other_levels) == 0L) next

    new_yy <- vapply(other_levels, function(j) {
      as.numeric(yy == j)
    }, numeric(length(yy)))
    # force an (n rows x n levels) layout; vapply returns a plain vector when
    # the data has a single row
    new_yy <- matrix(new_yy, nrow = length(yy), ncol = length(other_levels))
    new_yy <- as.data.frame(new_yy)
    names(new_yy) <- toupper(paste(col_name, other_levels, sep = "_"))
    raw_ipd <- cbind(raw_ipd, new_yy)
  }
  raw_ipd
}


#' Center individual patient data (IPD) variables using aggregate data averages
#'
#' This function subtracts IPD variables (prognostic variables and/or effect modifiers)
#' by the aggregate data averages. This centering is needed in order to calculate weights.
#' IPD and aggregate data variable names should match.
#'
#' @param ipd IPD variable names should match the aggregate data names without the suffix.
#' This would involve either changing the aggregate data name or the ipd name.
#' For instance, if we binarize SEX variable with MALE as a reference using [dummize_ipd],
#' function names the new variable as SEX_MALE.
#' In this case, SEX_MALE should also be available in the aggregate data.
#' @param agd pre-processed aggregate data which contain STUDY, ARM, and N. Variable names
#' should be followed by legal suffixes (i.e. MEAN, MEDIAN, SD, or PROP). Note that COUNT
#' suffix is no longer accepted.
#' @examples
#' data(adsl_sat)
#' data(agd)
#' agd <- process_agd(agd)
#' ipd_centered <- center_ipd(ipd = adsl_sat, agd = agd)
#' @return centered ipd using aggregate level data averages
#' @export

center_ipd <- function(ipd, agd) {
  # Add centered versions of the IPD covariates, one per non-missing AgD
  # statistic:
  #   <X>_MEAN / <X>_PROP -> <X>_CENTERED         = x - value
  #   <X>_MEDIAN          -> <X>_MEDIAN_CENTERED  = (x > value) - 0.5
  #   <X>_SD              -> <X>_SQUARED_CENTERED = x^2 - (sd^2 + mean^2)
  # Rows of `agd` are processed in order, so later rows overwrite columns
  # created by earlier ones. Returns `ipd` with the extra columns.

  # regularized column name patterns
  must_exist <- c("STUDY", "ARM", "N")
  legal_suffix <- c("MEAN", "MEDIAN", "SD", "PROP")
  suffix_pat <- paste(paste0("_", legal_suffix, "$"), collapse = "|")

  for (i in seq_len(nrow(agd))) { # study i
    use_agd <- agd[i, !names(agd) %in% must_exist, drop = FALSE]
    param_id <- gsub(suffix_pat, "", names(use_agd))

    for (j in seq_len(ncol(use_agd))) { # effect modifier j
      if (is.na(use_agd[[j]])) next

      agd_name <- names(use_agd)[j]
      # columns without a legal suffix are not matching statistics
      if (!grepl(suffix_pat, agd_name)) next

      ipd_param <- param_id[j]
      if (!ipd_param %in% names(ipd)) {
        stop(
          "cannot center on '", agd_name, "': column '", ipd_param, "' is not found in ipd",
          call. = FALSE
        )
      }
      agd_value <- use_agd[[j]]

      if (grepl("_MEAN$|_PROP$", agd_name)) {
        ipd[[paste0(ipd_param, "_", "CENTERED")]] <- ipd[[ipd_param]] - agd_value
      } else if (grepl("_MEDIAN$", agd_name)) {
        # indicator of being above the AgD median, shifted so its target is 0
        ipd[[paste0(ipd_param, "_MEDIAN_", "CENTERED")]] <- (ipd[[ipd_param]] > agd_value) - 0.5
      } else if (grepl("_SD$", agd_name)) {
        mean_name <- paste0(ipd_param, "_MEAN")
        if (!mean_name %in% names(use_agd)) {
          stop(
            "cannot center on '", agd_name, "': the matching '", mean_name, "' is not found in agd",
            call. = FALSE
          )
        }
        # second moment implied by the AgD: E[x^2] = sd^2 + mean^2
        tmp_aim <- agd_value^2 + (use_agd[[mean_name]]^2)
        ipd[[paste0(ipd_param, "_SQUARED_", "CENTERED")]] <- ipd[[ipd_param]]^2 - tmp_aim
      }
    } # end of j
  } # end of i

  # output
  ipd
}


#' Calculate pooled arm statistics in Aggregated Data (AgD) based on arm-specific statistics
#'
#' This is a convenient function to pool arm statistics. This function is called
#' within process_agd and when the ARM is not equal to "Total". Note pooled
#' median can't be calculated and it is only an approximation.
#'
#' @param use_agd aggregated data that is processed within process_agd
#' @noRd
#' @return Complete N, count, mean, sd, and median for the pooled arm

complete_agd <- function(use_agd) {
  # Append a pooled "total" row built from the arm-level rows:
  #   N and *_COUNT : sums over arms
  #   *_MEAN        : N-weighted average of the arm means
  #   *_SD          : sqrt(sum(sd^2 * (n - 1)) / (N - 1))
  #   *_MEDIAN      : plain average of the arm medians (approximation only)
  use_agd <- as.data.frame(use_agd)
  # a factor ARM cannot take the new "total" label (the assignment would give
  # NA with a warning), so hold the arm labels as character
  if (is.factor(use_agd$ARM)) use_agd$ARM <- as.character(use_agd$ARM)
  use_agd <- use_agd[tolower(use_agd$ARM) != "total", , drop = FALSE]

  if (nrow(use_agd) < 2) stop("error in call complete_agd: need to have at least 2 rows that ARM!='total' ")

  total_row <- nrow(use_agd) + 1
  use_agd[total_row, ] <- NA
  use_agd$STUDY[total_row] <- use_agd$STUDY[1]
  use_agd$ARM[total_row] <- "total"

  # complete N and count
  total_n <- sum(use_agd[["N"]], na.rm = TRUE)
  use_agd[["N"]][total_row] <- total_n
  arm_n <- use_agd[["N"]][-total_row]
  for (i in grep("_COUNT$", names(use_agd), value = TRUE)) {
    use_agd[[i]][total_row] <- sum(use_agd[[i]][-total_row], na.rm = TRUE)
  }

  # complete MEAN
  for (i in grep("_MEAN$", names(use_agd), value = TRUE)) {
    use_agd[[i]][total_row] <- sum(use_agd[[i]][-total_row] * arm_n) / total_n
  }

  # complete SD
  for (i in grep("_SD$", names(use_agd), value = TRUE)) {
    use_agd[[i]][total_row] <- sqrt(sum(use_agd[[i]][-total_row]^2 * (arm_n - 1)) / (total_n - 1))
  }

  # complete MEDIAN, approximately!!
  for (i in grep("_MEDIAN$", names(use_agd), value = TRUE)) {
    use_agd[[i]][total_row] <- mean(use_agd[[i]][-total_row])
  }

  # output
  rownames(use_agd) <- NULL
  use_agd
}


#' helper function: transform TTE ADaM data to suitable input for survival R package
#'
#' @param dd data frame, ADTTE read via `haven::read_sas`
#' @param time_scale a character string, 'years', 'months', 'weeks' or 'days', time unit of median survival time
#' @param trt values to include in treatment column
#'
#' @return a data frame that can be used as input to `survival::Surv`
#' @keywords internal

ext_tte_transfer <- function(dd, time_scale = "months", trt = NULL) {
  # Derive the `status` and `time` columns (and optionally `treatment`) from
  # whichever of CENSOR / EVENT / TIME / AVAL the input carries.
  time_scale <- match.arg(time_scale, choices = c("years", "months", "weeks", "days"))
  # factor applied to AVAL below, as returned by get_time_conversion()
  time_units <- get_time_conversion(time_scale)

  # only `status`, `AVAL`, `time` and `treatment` are added below, so the
  # presence of the input columns can be looked up once
  input_cols <- names(dd)

  if ("CENSOR" %in% input_cols) {
    # drop records with unknown censoring; status is 1 where CENSOR is 0
    has_censor <- !is.na(dd$CENSOR)
    dd <- dd[has_censor, ]
    dd$status <- as.numeric(dd$CENSOR == 0)
  }

  if ("EVENT" %in% input_cols) {
    # EVENT, when present, replaces any status derived from CENSOR
    dd$status <- as.numeric(as.character(dd$EVENT))
  }

  if ("TIME" %in% input_cols) {
    dd$AVAL <- as.numeric(as.character(dd$TIME))
  }

  dd$time <- dd$AVAL * time_units
  if (!is.null(trt)) {
    dd$treatment <- trt
  }

  as.data.frame(dd)
}

#' Kaplan-Meier (KM) plot function for anchored and unanchored cases using ggplot
#'
#' This is a wrapper function of \code{basic_kmplot2}.
#' The argument setting is similar to \code{maic_anchored} and \code{maic_unanchored},
#' and it is used in those two functions.
#'
#' @param weights_object an object returned by \code{estimate_weight}
#' @param tte_ipd a data frame of individual patient data (IPD) of internal trial, contain at least `"USUBJID"`,
#'   `"EVENT"`, `"TIME"` columns and a column indicating treatment assignment
#' @param tte_pseudo_ipd a data frame of pseudo IPD by digitized KM curves of external trial (for time-to-event
#'   endpoint), contain at least `"EVENT"`, `"TIME"`
#' @param trt_ipd  a string, name of the interested investigation arm in internal trial \code{tte_ipd} (real IPD)
#' @param trt_agd a string, name of the interested investigation arm in external trial \code{tte_pseudo_ipd} (pseudo IPD)
#' @param trt_common a string, name of the common comparator in internal and external trial, by default is NULL,
#'   indicating unanchored case
#' @param trt_var_ipd a string, column name in \code{tte_ipd} that contains the treatment assignment
#' @param trt_var_agd a string, column name in \code{tte_pseudo_ipd} that contains the treatment assignment
#' @param normalize_weights logical, default is \code{FALSE}. If \code{TRUE},
#'   \code{scaled_weights} (normalized weights) in \code{weights_object$data} will be used.
#' @param km_conf_type a string, pass to \code{conf.type} of \code{survfit}
#' @param km_layout a string, only applicable for unanchored case (\code{trt_common = NULL}), indicated the
#'   desired layout of output KM curve.
#' @param time_scale a string, time unit of median survival time, taking a value of 'years', 'months',
#'   'weeks' or 'days'
#' @param ... other arguments in \code{basic_kmplot2}
#'
#' @return In unanchored case, a KM plot with risk set table. In anchored case, depending on \code{km_layout},
#' \itemize{
#'   \item if "by_trial", 2 by 1 plot, first all KM curves (incl. weighted) in IPD trial, and then KM curves in AgD
#'   trial, with risk set table.
#'   \item if "by_arm", 2 by 1 plot, first KM curves of \code{trt_agd} and  \code{trt_ipd} (with and without weights),
#'    and then KM curves of \code{trt_common} in AgD trial and IPD trial (with and without weights). Risk set table is
#'     appended.
#'   \item if "all", 2 by 2 plot, all plots in "by_trial" and "by_arm" without risk set table appended.
#' }
#' @example inst/examples/kmplot2_unanchored_ex.R
#' @example inst/examples/kmplot2_anchored_ex.R
#' @export

kmplot2 <- function(weights_object,
                    tte_ipd,
                    tte_pseudo_ipd,
                    trt_ipd,
                    trt_agd,
                    trt_common = NULL,
                    normalize_weights = FALSE,
                    trt_var_ipd = "ARM",
                    trt_var_agd = "ARM",
                    km_conf_type = "log-log",
                    km_layout = c("all", "by_trial", "by_arm"),
                    time_scale,
                    ...) {
  # Build unweighted and weighted Kaplan-Meier fits for the IPD and pseudo-IPD
  # arms and hand them to basic_kmplot2(). With `trt_common = NULL` (unanchored)
  # a single plot is returned; otherwise the panels selected by `km_layout` are
  # arranged with survminer::arrange_ggsurvplots().
  if (!requireNamespace("survminer", quietly = TRUE)) stop("survminer package is required for this function")

  names(tte_ipd) <- toupper(names(tte_ipd))
  names(tte_pseudo_ipd) <- toupper(names(tte_pseudo_ipd))
  trt_var_ipd <- toupper(trt_var_ipd)
  trt_var_agd <- toupper(trt_var_agd)

  # pre check
  if (!inherits(weights_object, "maicplus_estimate_weights")) {
    stop("weights_object should be an object returned by estimate_weights")
  }
  if (!all(c("USUBJID", "TIME", "EVENT", trt_var_ipd) %in% names(tte_ipd))) {
    stop(paste("tte_ipd needs to include at least USUBJID, TIME, EVENT,", trt_var_ipd))
  }
  if (!all(c("TIME", "EVENT", trt_var_agd) %in% names(tte_pseudo_ipd))) {
    stop(paste("tte_pseudo_ipd needs to include at least TIME, EVENT,", trt_var_agd))
  }
  km_layout <- match.arg(km_layout, choices = c("all", "by_trial", "by_arm"), several.ok = FALSE)

  # preparing data
  is_anchored <- !is.null(trt_common)
  tte_ipd <- tte_ipd[tte_ipd[[trt_var_ipd]] %in% c(trt_ipd, trt_common), , drop = FALSE]
  tte_pseudo_ipd <- tte_pseudo_ipd[tte_pseudo_ipd[[trt_var_agd]] %in% c(trt_agd, trt_common), , drop = FALSE]

  # Attach each IPD record's weight by subject id. The lookup must run from the
  # (already arm-filtered) tte_ipd rows into the weights table: one index per
  # tte_ipd row, independent of row order or of extra subjects in the weights.
  weight_col <- if (normalize_weights) "scaled_weights" else "weights"
  weight_row <- match(tte_ipd$USUBJID, weights_object$data$USUBJID)
  tte_ipd$weights <- weights_object$data[[weight_col]][weight_row]
  tte_pseudo_ipd$weights <- 1

  tte_ipd$TIME <- get_time_as(tte_ipd$TIME, as = time_scale)
  tte_pseudo_ipd$TIME <- get_time_as(tte_pseudo_ipd$TIME, as = time_scale)

  # KM fit of one data set, optionally using its `weights` column
  my_survfit <- function(data, weighted = FALSE) {
    if (weighted) {
      survfit(Surv(TIME, EVENT) ~ 1, data = data, conf.type = km_conf_type, weights = data$weights)
    } else {
      survfit(Surv(TIME, EVENT) ~ 1, data = data, conf.type = km_conf_type)
    }
  }

  if (!is_anchored) {
    kmlist <- list(
      kmobj_B = my_survfit(data = tte_pseudo_ipd),
      kmobj_A = my_survfit(data = tte_ipd),
      kmobj_A_adj = my_survfit(data = tte_ipd, weighted = TRUE)
    )
    kmlist_name <- c(trt_agd, trt_ipd, paste0(trt_ipd, " (weighted)"))
    basic_kmplot2(kmlist, kmlist_name, ...)
  } else {
    # rows of one arm in the internal (IPD) / external (pseudo IPD) trial
    ipd_arm <- function(arm) tte_ipd[tte_ipd[[trt_var_ipd]] == arm, ]
    agd_arm <- function(arm) tte_pseudo_ipd[tte_pseudo_ipd[[trt_var_agd]] == arm, ]

    # positions in this list are referred to by index when assembling panels
    all_km <- list(
      kmobj_A = my_survfit(data = ipd_arm(trt_ipd)),
      kmobj_B = my_survfit(data = agd_arm(trt_agd)),
      kmobj_A_adj = my_survfit(data = ipd_arm(trt_ipd), weighted = TRUE),
      kmobj_C = my_survfit(data = ipd_arm(trt_common)),
      kmobj_C_adj = my_survfit(data = ipd_arm(trt_common), weighted = TRUE),
      kmobj_C_agd = my_survfit(data = agd_arm(trt_common))
    )

    kmlist_combined <- list()
    if (km_layout %in% c("by_trial", "all")) {
      kmlist_1_2 <- list(
        setNames(
          all_km[c(4, 1, 3, 5)],
          c(trt_common, trt_ipd, paste0(trt_ipd, " (weighted)"), paste0(trt_common, " (weighted)"))
        ),
        setNames(all_km[c(6, 2)], c(trt_common, trt_agd))
      )
      names(kmlist_1_2) <- c(
        paste0("Kaplan-Meier Curves \n(", trt_ipd, " vs ", trt_common, ") in the IPD trial"),
        paste0("Kaplan-Meier Curves \n(", trt_agd, " vs ", trt_common, ") in the AgD trial")
      )
      kmlist_combined <- c(kmlist_combined, kmlist_1_2)
    }
    if (km_layout %in% c("by_arm", "all")) {
      kmlist_3_4 <- list(
        setNames(all_km[c(2, 1, 3)], c(trt_agd, trt_ipd, paste0(trt_ipd, " (weighted)"))),
        setNames(all_km[c(6, 4, 5)], paste(trt_common, c("(AgD)", "(IPD)", "(IPD,weighted)")))
      )
      names(kmlist_3_4) <- c(
        paste0("Kaplan-Meier Curves \n(", trt_ipd, " vs ", trt_agd, ")"),
        paste0("Kaplan-Meier Curves of Common Comparator \n", trt_common, "(IPD vs AgD Trial)")
      )
      kmlist_combined <- c(kmlist_combined, kmlist_3_4)
    }
    if (km_layout == "all") {
      # reorder the four panels for the 2 x 2 arrangement
      kmlist_combined <- kmlist_combined[c(1, 3, 2, 4)]
    }

    splots <- mapply(
      FUN = basic_kmplot2,
      kmlist = kmlist_combined,
      kmlist_name = lapply(kmlist_combined, names),
      main_title = names(kmlist_combined),
      MoreArgs = list(...),
      SIMPLIFY = FALSE
    )
    survminer::arrange_ggsurvplots(splots, nrow = 1 + (km_layout == "all"))
  }
}

#' Basic Kaplan Meier (KM) plot function using ggplot
#'
#' This function generates a basic KM plot using ggplot.
#'
#' @param kmlist a list of \code{survfit} object
#' @param kmlist_name a vector indicating the treatment names of each \code{survfit} object
#' @param endpoint_name a string, name of time to event endpoint, to be shown in the
#'   last line of title
#' @param show_risk_set logical, show risk set table or not, TRUE by default
#' @param main_title a string, main title of the KM plot
#' @param break_x_by bin parameter for \code{survminer}
#' @param censor indicator to include censor information
#' @param xlab label name for x-axis of the plot
#' @param xlim x limit for the x-axis of the plot
#' @param use_colors a character vector of length up to 4, colors to the KM curves,
#'   it will be passed to 'col' of \code{lines()}
#' @param use_line_types a numeric vector of length up to 4, line type to the KM curves,
#'   it will be passed to \code{lty} of \code{lines()}
#' @example inst/examples/basic_kmplot2_ex.R
#' @returns A Kaplan-Meier plot object created with `survminer::ggsurvplot()`.
#' @export

basic_kmplot2 <- function(kmlist,
                          kmlist_name,
                          endpoint_name = "Time to Event Endpoint",
                          show_risk_set = TRUE,
                          main_title = "Kaplan-Meier Curves",
                          break_x_by = NULL,
                          censor = TRUE,
                          xlab = "Time",
                          xlim = NULL,
                          use_colors = NULL,
                          use_line_types = NULL) {
  # Draw the fits in `kmlist` on one survminer::ggsurvplot(), labelled with
  # `kmlist_name`; line types and colours fall back to built-in defaults.
  if (!requireNamespace("survminer", quietly = TRUE)) stop("survminer package is required for this function")

  # defaults cover up to four curves
  if (is.null(use_line_types)) use_line_types <- c(1, 1, 2, 2)
  if (is.null(use_colors)) use_colors <- c("#5450E4", "#00857C", "#6ECEB2", "#7B68EE")

  # the endpoint name is shown on the last line of the title
  plot_title <- paste0(main_title, "\nEndpoint: ", endpoint_name)

  # Produce the Kaplan-Meier plot
  km_plot <- survminer::ggsurvplot(
    kmlist,
    combine = TRUE,
    conf.int = FALSE,
    # curves
    linetype = use_line_types,
    palette = use_colors,
    size = 0.2,
    censor = censor,
    censor.size = 2,
    # axes and titles
    xlab = xlab,
    ylab = endpoint_name,
    xlim = xlim,
    break.x.by = break_x_by,
    title = plot_title,
    # legend
    legend.title = "Treatment",
    legend = c(0.85, 0.82),
    legend.labs = kmlist_name,
    # risk table
    risk.table = show_risk_set,
    risk.table.y.text.col = TRUE,
    risk.table.y.text = FALSE,
    tables.theme = survminer::theme_cleantable(),
    # theme
    ggtheme = ggplot2::theme_classic(base_size = 10),
    fontsize = 3
  )
  km_plot
}

#' Bucher method for combining treatment effects
#'
#' Given two treatment effects of A vs. C and B vs. C
#' derive the treatment effects of A vs. B using the Bucher method.
#' Two-sided confidence interval and Z-test p-value are also calculated.
#' Treatment effects and standard errors should be in log scale
#' for hazard ratio, odds ratio, and risk ratio.
#' Treatment effects and standard errors should be in natural scale
#' for risk difference and mean difference.
#'
#' @param trt a list of two scalars for the study with the
#' experimental arm. `'est'` is the point estimate and `'se'`
#' is the standard error of the treatment effect.
#' For time-to-event data, `'est'` and `'se'` should be point estimate and
#' standard error of the log hazard ratio.
#' For binary data, `'est'` and `'se'` should be point estimate and
#' standard error of the log odds ratio, log risk ratio, or risk
#' difference.
#' For continuous data,  `'est'` and `'se'` should be point estimate and
#' standard error of the mean difference.
#' @param com same as \code{trt}, but for the study with the
#' control arm
#' @param conf_lv a numerical scalar, prescribe confidence level to derive
#' two-sided confidence interval for the treatment effect
#'
#' @return a list with 5 elements,
#' \describe{
#'   \item{est}{a scalar, point estimate of the treatment effect}
#'   \item{se}{a scalar, standard error of the treatment effect}
#'   \item{ci_l}{a scalar, lower confidence limit of a two-sided CI
#'   with prescribed nominal level by \code{conf_lv}}
#'   \item{ci_u}{a scalar, upper confidence limit of a two-sided CI
#'   with prescribed nominal level by \code{conf_lv}}
#'   \item{pval}{p-value of Z-test, with null hypothesis that
#'   \code{est} is zero}
#' }
#' @export
#' @examples
#' trt <- list(est = log(1.1), se = 0.2)
#' com <- list(est = log(1.3), se = 0.18)
#' result <- bucher(trt, com, conf_lv = 0.9)
#' print(result, ci_digits = 3, pval_digits = 3)
bucher <- function(trt, com, conf_lv = 0.95) {
  # Indirect comparison of A vs. B from A vs. C (`trt`) and B vs. C (`com`):
  # the estimate is the difference of the two effects and the variances add.
  # Returns a "maicplus_bucher" list with est, se, ci_l, ci_u and pval.
  if (!isTRUE(is.finite(trt$est))) stop("trt$est is not valid: ", trt$est)
  if (!isTRUE(is.finite(trt$se))) stop("trt$se is not valid: ", trt$se)
  if (!isTRUE(is.finite(com$est))) stop("com$est is not valid: ", com$est)
  if (!isTRUE(is.finite(com$se))) stop("com$se is not valid: ", com$se)
  if (conf_lv < 0 || 1 < conf_lv) stop("conf_lv must be in (0, 1): ", conf_lv)

  est <- trt$est - com$est
  se <- sqrt(trt$se^2 + com$se^2)

  # two-sided normal confidence interval
  z_crit <- stats::qnorm(0.5 + conf_lv / 2)
  ci_l <- est - z_crit * se
  ci_u <- est + z_crit * se

  # Two-sided Z-test p-value taken from the lower tail at -|est|. This equals
  # 2 * (1 - pnorm(est)) for positive estimates, but does not cancel to exactly
  # zero when the estimate is many standard errors away from the null.
  pval <- 2 * stats::pnorm(-abs(est), 0, se)

  outdata <- list(
    est = est,
    se = se,
    ci_l = ci_l,
    ci_u = ci_u,
    pval = pval
  )

  class(outdata) <- c("maicplus_bucher", "list")
  outdata
}


#' Calculate standard error from the reported confidence interval.
#'
#' Comparator studies often only report confidence interval of the
#' treatment effects. This function calculates standard error of the
#' treatment effect given the reported confidence interval.
#' For relative treatment effect (i.e. hazard ratio, odds ratio, and
#' risk ratio), the function would log the confidence interval.
#' For risk difference and mean difference,
#' we do not log the confidence interval.
#' The option to log the confidence interval is controlled
#' by `'log'` parameter.
#'
#' @param CI_lower Reported lower percentile value of the
#' treatment effect
#' @param CI_upper Reported upper percentile value of the
#' treatment effect
#' @param CI_perc Percentage of confidence interval reported
#' @param log Whether the confidence interval should be logged.
#' For relative treatment effect, log should be applied because
#' estimated log treatment effect is approximately normally distributed.
#' @return Standard error of log relative treatment effect if `'log'`
#' is true and standard error of the treatment effect if `'log'`
#' is false
#' @examples
#' find_SE_from_CI(CI_lower = 0.55, CI_upper = 0.90, CI_perc = 0.95)
#' @export

find_SE_from_CI <- function(CI_lower = NULL, CI_upper = NULL,
                            CI_perc = 0.95, log = TRUE) {
  # Back out the standard error from a reported two-sided normal confidence
  # interval: SE = (upper - lower) / (2 * z), computed on the log scale when
  # `log` is TRUE. `CI_lower` / `CI_upper` may be vectors of equal length;
  # `log` must be a single TRUE or FALSE.
  if (CI_perc > 1 || CI_perc < 0) {
    stop("CI_perc has to be between 0 and 1")
  }

  if (is.null(CI_lower) || is.null(CI_upper)) {
    stop("Both CI_lower and CI_upper need to be specified")
  }

  if (!is.numeric(CI_lower) || !is.numeric(CI_upper)) {
    stop("Both CI_lower and CI_upper need to be specified as numeric values")
  }

  alpha <- 1 - CI_perc
  z_crit <- stats::qnorm(1 - alpha / 2)

  # A plain if/else keeps every element of vector inputs; ifelse() on the
  # scalar flag would return only the first one. The argument `log` shadows
  # the function name, hence the explicit base::log().
  if (log) {
    se <- (base::log(CI_upper) - base::log(CI_lower)) / (2 * z_crit)
  } else {
    se <- (CI_upper - CI_lower) / (2 * z_crit)
  }
  se
}

#' Print method for `maicplus_bucher` object
#'
#' Formats the object with `reformat()` (p-value always shown) and prints the
#' resulting named character vector.
#'
#' @param x `maicplus_bucher` object
#' @param ci_digits an integer, number of decimal places used for the point
#' estimate and the confidence limits
#' @param pval_digits an integer, number of decimal places used for the
#' Z-test p-value
#' @param exponentiate whether the treatment effect and confidence
#' interval should be exponentiated before display. This applies to relative
#' treatment effects. Default is set to false.
#' @param ... not used
#' @describeIn bucher Print method for `maicplus_bucher` objects
#' @export

print.maicplus_bucher <- function(x, ci_digits = 2, pval_digits = 3,
                                  exponentiate = FALSE, ...) {
  formatted <- reformat(
    x,
    ci_digits = ci_digits,
    pval_digits = pval_digits,
    show_pval = TRUE,
    exponentiate = exponentiate
  )
  print(formatted)
}

#' Reformat `maicplus_bucher` alike object
#'
#' Builds a display string of the form `"est[ci_l; ci_u]"` from the `est`,
#' `ci_l` and `ci_u` elements of `x`, optionally followed by a formatted
#' p-value taken from `x$pval`. A p-value that rounds to zero is shown as
#' "less than the smallest displayable value" (e.g. `"<0.001"`), always in
#' fixed notation.
#'
#' @param x a list, structured like a `maicplus_bucher` object
#' @param ci_digits an integer, number of decimal places for point
#' estimate and derived confidence limits
#' @param pval_digits an integer, number of decimal places to display
#' Z-test p-value
#' @param show_pval a logical value, default is TRUE. If FALSE, p-value will not
#' be output as the second element of the character vector
#' @param exponentiate whether the treatment effect and confidence
#' interval should be exponentiated. This applies to relative
#' treatment effects. Default is set to false.
#' @return a named character vector: element `"result"`, plus `"pvalue"` when
#' `show_pval` is TRUE
#' @keywords internal

reformat <- function(x, ci_digits = 2, pval_digits = 3,
                     show_pval = TRUE, exponentiate = FALSE) {
  # `exponentiate` is a single flag: pick the transformation once instead of
  # running a scalar ifelse() on every value
  transform_this <- if (isTRUE(exponentiate)) exp else identity

  format_value <- function(value) {
    format(round(transform_this(value), ci_digits), nsmall = ci_digits)
  }

  res <- paste0(
    format_value(x$est), "[", format_value(x$ci_l), "; ", format_value(x$ci_u), "]"
  )

  disp_pval <- round(x$pval, pval_digits)
  # fixed notation keeps the threshold readable for any pval_digits
  # (plain number-to-string conversion yields e.g. "1e-04")
  pval_floor <- formatC(10^(-pval_digits), format = "f", digits = pval_digits)
  disp_pval <-
    ifelse(disp_pval == 0,
      paste0("<", pval_floor),
      format(disp_pval, nsmall = pval_digits)
    )

  if (show_pval) {
    output <- c(res, disp_pval)
    names(output) <- c("result", "pvalue")
  } else {
    output <- res
    names(output) <- "result"
  }

  output
}

# Restore the RNG back to a previous state using the global .Random.seed.
# `old_seed` is a value previously read from `.Random.seed` in the global
# environment, or NULL when no seed existed at that time. NULL removes the
# global `.Random.seed`; any other value is written back to it.
set_random_seed <- function(old_seed) {
  if (is.null(old_seed)) {
    # Only remove the seed when there is one: rm() warns about a missing
    # object, which happens when the RNG has not been used yet.
    if (exists(".Random.seed", envir = globalenv(), inherits = FALSE)) {
      rm(".Random.seed", envir = globalenv(), inherits = FALSE)
    }
  } else {
    assign(".Random.seed", value = old_seed, envir = globalenv(), inherits = FALSE)
  }
}

# Replace the `data` element of a weighted object with its i-th stored
# bootstrap replicate. Slice `i` of the 3-d array `weighted_data$boot` supplies
# row indices (column 1) and weights (column 2); `scaled_weights` is set to the
# replicate weights divided by their sum.
construct_boot_data <- function(weighted_data, i = 1) {
  boot_array <- weighted_data$boot
  if (is.null(boot_array)) {
    stop("Must contain bootstrap results from estimate_weights()")
  }

  i <- as.integer(i)
  R <- dim(boot_array)[3]
  if (i < 1 || i > R) {
    stop("i must be integer between 1 and ", R)
  }

  replicate_i <- boot_array[, , i]
  row_index <- replicate_i[, 1]
  replicate_weights <- replicate_i[, 2]

  weighted_data$data <- weighted_data$data[row_index, ]
  weighted_data$data$weights <- replicate_weights
  weighted_data$data$scaled_weights <- replicate_weights / sum(replicate_weights)
  weighted_data
}


# Back-transform a log-scale result list to the ratio scale: `est`, `ci_l` and
# `ci_u` are exponentiated, and `se` is replaced by the standard deviation of a
# log-normal variable with log-mean `est` and log-sd `se`. All other elements
# of `object` are returned unchanged.
transform_ratio <- function(object) {
  log_mean <- object$est
  log_var <- object$se^2

  out <- object
  out[["est"]] <- exp(log_mean)
  # log normal parameterization for SE
  out[["se"]] <- sqrt((exp(log_var) - 1) * exp(2 * log_mean + log_var))
  out[["ci_l"]] <- exp(object$ci_l)
  out[["ci_u"]] <- exp(object$ci_u)
  out
}

# Rescale an absolute-scale result list by a factor of 100 (e.g. proportion to
# percentage): `est`, `se`, `ci_l` and `ci_u` are multiplied by 100, all other
# elements of `object` are returned unchanged.
transform_absolute <- function(object) {
  out <- object
  for (field in c("est", "se", "ci_l", "ci_u")) {
    # exact = FALSE mirrors the partial-matching lookup done by `$`
    out[[field]] <- object[[field, exact = FALSE]] * 100
  }
  out
}

#' Helper function to retrieve median survival time from a `survival::survfit` object
#'
#' Extract and display median survival time with confidence interval
#'
#' The function takes the `table` component of `summary(km_fit)`, converts its
#' time-valued columns (column 5 onwards) to `time_scale` with [get_time_as()],
#' and prepends the treatment name (row name with the `"ARM="` prefix removed)
#' and the `legend` label. One row is returned per stratum of `km_fit`.
#'
#' @param km_fit returned object from \code{survival::survfit}, fitted with a strata term
#'   so that `summary(km_fit)$table` has one row per arm
#' @param legend a character string, name used in 'type' column in returned data frame
#' @param time_scale a character string, 'years', 'months', 'weeks' or 'days', time unit of median survival time
#'
#' @examples
#' data(adtte_sat)
#' data(pseudo_ipd_sat)
#' library(survival)
#' combined_data <- rbind(adtte_sat[, c("TIME", "EVENT", "ARM")], pseudo_ipd_sat)
#' kmobj <- survfit(Surv(TIME, EVENT) ~ ARM, combined_data, conf.type = "log-log")
#'
#' # Derive median survival time
#' medSurv <- medSurv_makeup(kmobj, legend = "before matching", time_scale = "days")
#' medSurv
#' @return a data frame with a index column 'type', median survival time and confidence interval
#' @export

medSurv_makeup <- function(km_fit, legend = "before matching", time_scale) {
  time_scale <- match.arg(time_scale, choices = c("years", "months", "weeks", "days"))

  # km_fit is the returned object from survival::survfit
  km_table <- summary(km_fit)$table
  # NOTE(review): columns 5 onwards are presumably the restricted mean, its SE,
  # the median and its confidence limits -- confirm against summary.survfit
  time_cols <- 5:ncol(km_table)
  km_table[, time_cols] <- get_time_as(km_table[, time_cols], time_scale)

  arm_info <- data.frame(
    treatment = gsub("ARM=", "", rownames(km_table)),
    # label every stratum row, however many arms the fit contains
    type = rep(legend, length.out = nrow(km_table))
  )

  out <- cbind(arm_info, km_table)
  rownames(out) <- NULL

  out
}


#' Helper function to select set of variables used for Kaplan-Meier plot
#'
#' Collects `time`, `n.risk`, `n.event`, `n.censor`, `surv`, `lower`, `upper`
#' and `cumhaz` from `km_fit` into a data frame and splits it by treatment.
#' When `km_fit` has a `strata` element, the treatment of each row is the part
#' of the stratum name after the `"="` sign (e.g. `"ARM=A"` becomes `"A"`);
#' any number of strata is supported. Without strata, all rows are labelled
#' with `single_trt_name`.
#'
#' @param km_fit returned object from \code{survival::survfit}
#' @param single_trt_name name of treatment if no strata are specified in `km_fit`
#'
#' @examples
#' library(survival)
#' data(adtte_sat)
#' data(pseudo_ipd_sat)
#' combined_data <- rbind(adtte_sat[, c("TIME", "EVENT", "ARM")], pseudo_ipd_sat)
#' kmobj <- survfit(Surv(TIME, EVENT) ~ ARM, combined_data, conf.type = "log-log")
#' survfit_makeup(kmobj)
#' @return a list of data frames of variables from [survival::survfit()]. Data frame is divided by treatment.
#' @export

survfit_makeup <- function(km_fit, single_trt_name = "treatment") {
  if ("strata" %in% names(km_fit)) {
    # km_fit$strata holds the number of rows per stratum, named "<variable>=<level>";
    # repeat each level name once per row of its stratum
    strata_levels <- vapply(
      strsplit(names(km_fit$strata), "="),
      function(parts) parts[[2]],
      character(1)
    )
    use_trt <- rep(strata_levels, times = km_fit$strata)
  } else {
    # in case km_fit is only for single arm
    use_trt <- rep(single_trt_name, length(km_fit$time))
  }

  kmdat <- data.frame(
    time = km_fit$time,
    treatment = use_trt,
    n.risk = km_fit$n.risk,
    n.event = km_fit$n.event,
    censor = km_fit$n.censor,
    surv = km_fit$surv,
    lower = km_fit$lower,
    upper = km_fit$upper,
    cumhaz = km_fit$cumhaz
  )
  split(kmdat, f = kmdat$treatment)
}

#' Create pseudo IPD given aggregated binary data
#'
#' Expands arm-level responder / non-responder counts into one row per pseudo
#' subject. Within each arm, responders are listed before non-responders.
#'
#' @param binary_agd a data.frame that take different formats depending on \code{format}.
#'   For "stacked": a data frame with columns 'ARM', 'RESPONSE' (logical, or 'Yes'/'No') and 'COUNT'
#'   (column names are not case sensitive), with one row per arm and response level.
#'   For "unstacked": a 1x2 or 2x2 data frame or matrix of counts with one row per arm (row names are
#'   used as arm names) and columns named TRUE/FALSE or Yes/No
#' @param format a string, "stacked" or "unstacked". Defaults to "stacked"
#'
#' @return a data.frame of pseudo binary IPD, with columns USUBJID, ARM, RESPONSE
#' @example inst/examples/get_pseudo_ipd_binary_ex.R
#' @export

get_pseudo_ipd_binary <- function(binary_agd, format = c("stacked", "unstacked")) {
  # resolve the default choice vector to a single string before comparing it
  format <- match.arg(format)

  if (format == "stacked") {
    # pre check
    if (!is.data.frame(binary_agd)) {
      stop("stacked binary_agd should be data.frame with columns 'ARM', 'RESPONSE', 'COUNT'")
    }
    names(binary_agd) <- toupper(names(binary_agd))
    if (!all(c("ARM", "RESPONSE", "COUNT") %in% names(binary_agd))) {
      stop("stacked binary_agd should be data.frame with columns 'ARM', 'Response', 'Count'")
    }
    if (!is.logical(binary_agd$RESPONSE) && !all(toupper(binary_agd$RESPONSE) %in% c("YES", "NO"))) {
      stop("'RESPONSE' column in stacked binary_agd should be either logical vector or character vector of 'Yes'/'No'")
    }
    if (nrow(binary_agd) %% 2 != 0) {
      stop("nrow(binary_agd) is not even number, you may miss to provide 1 level of binary response to certain arm")
    }

    # pre process: recode 'Yes'/'No' to logical
    if (!is.logical(binary_agd$RESPONSE)) {
      binary_agd$RESPONSE <- toupper(binary_agd$RESPONSE) == "YES"
    }
    use_binary_agd <- binary_agd
  } else {
    # pre check
    if (!(is.data.frame(binary_agd) || is.matrix(binary_agd))) {
      stop("unstacked binary_agd should be either a 1x2 or 2x2 data frame or matrix")
    }
    if (ncol(binary_agd) != 2 || !nrow(binary_agd) %in% c(1, 2)) {
      stop("unstacked binary_agd should be either a 1x2 or 2x2 data frame or matrix")
    }
    bin_res <- toupper(colnames(binary_agd))
    sorted_res <- sort(bin_res)
    if (!(identical(sorted_res, c("FALSE", "TRUE")) || identical(sorted_res, c("NO", "YES")))) {
      stop("column names of unstacked binary_agd should be either TRUE/FALSE or Yes/No")
    }

    # pre process: reshape the arm-by-response table to long format.
    # as.matrix() covers data frame and matrix input alike; as.vector() then
    # unrolls it column by column, so arm names cycle fastest and every
    # response level is repeated once per arm.
    counts <- as.matrix(binary_agd)
    trt_names <- rownames(binary_agd)
    if (is.null(trt_names)) trt_names <- as.character(seq_len(nrow(counts)))
    use_binary_agd <- data.frame(
      ARM = rep(trt_names, times = ncol(counts)),
      COUNT = as.vector(counts),
      RESPONSE = rep(bin_res %in% c("YES", "TRUE"), each = nrow(counts))
    )
  }

  # create pseudo binary IPD
  use_binary_agd$ARM <- factor(use_binary_agd$ARM, levels = unique(use_binary_agd$ARM))
  arm_levels <- levels(use_binary_agd$ARM)
  n_per_arm <- as.vector(tapply(use_binary_agd$COUNT, use_binary_agd$ARM, sum))
  # sum responder counts by arm instead of relying on row position, so the
  # result does not depend on how the rows of the input are ordered
  n_yes_per_arm <- as.vector(
    tapply(use_binary_agd$COUNT * use_binary_agd$RESPONSE, use_binary_agd$ARM, sum)
  )

  arm_col <- rep(arm_levels, times = n_per_arm)
  response_col <- unlist(
    lapply(seq_along(arm_levels), function(ii) {
      c(rep(TRUE, n_yes_per_arm[ii]), rep(FALSE, n_per_arm[ii] - n_yes_per_arm[ii]))
    })
  )

  # output
  data.frame(
    USUBJID = paste0("pseudo_binary_subj_", seq_along(arm_col)),
    ARM = arm_col,
    RESPONSE = response_col
  )
}


#' Helper function to summarize outputs from glm fit
#'
#' Tabulates, per level of `ARM` in the data stored on the fitted object
#' (`binobj$data`), the number of subjects, the number of events and the event
#' percentage. With `weighted = TRUE` the `weights` column is summed instead of
#' counting rows, and events are summed as `weights * RESPONSE`.
#'
#' @param binobj returned object from \code{stats::glm}
#' @param legend label to indicate the binary fit
#' @param weighted logical flag indicating whether weights have been applied in the glm fit
#' @returns A `data.frame` containing a summary of the number of events and subjects in a logistic
#' regression model.
#' @examples
#' data(adrs_sat)
#' pseudo_adrs <- get_pseudo_ipd_binary(
#'   binary_agd = data.frame(
#'     ARM = rep("B", 2),
#'     RESPONSE = c("YES", "NO"),
#'     COUNT = c(280, 120)
#'   ),
#'   format = "stacked"
#' )
#' pseudo_adrs$RESPONSE <- as.numeric(pseudo_adrs$RESPONSE)
#' combined_data <- rbind(adrs_sat[, c("USUBJID", "ARM", "RESPONSE")], pseudo_adrs)
#' combined_data$ARM <- as.factor(combined_data$ARM)
#' binobj_dat <- stats::glm(RESPONSE ~ ARM, combined_data, family = binomial("logit"))
#' glm_makeup(binobj_dat)
#' @export
glm_makeup <- function(binobj, legend = "before matching", weighted = FALSE) {
  fit_data <- binobj$data
  arm_factor <- fit_data$ARM

  if (weighted) {
    # weighted totals: sum of weights, and weights carried by responders
    n_subjects <- tapply(fit_data$weights, arm_factor, sum)
    n_events <- tapply(fit_data$weights * fit_data$RESPONSE, arm_factor, sum)
  } else {
    # unweighted totals: row count and number of responders
    n_subjects <- tapply(fit_data$USUBJID, arm_factor, length)
    n_events <- tapply(fit_data$RESPONSE, arm_factor, sum)
  }

  data.frame(
    treatment = levels(arm_factor),
    type = legend,
    n = n_subjects,
    events = n_events,
    events_pct = n_events * 100 / n_subjects
  )
}

# Create an environment for settings.
# It is the mutable store shared by the time-conversion helpers in this file:
# set_time_conversion() writes `time_conversion` and `default_time_scale` into
# it, and get_time_conversion() / get_time_as() read them back.
settings_env <- new.env()

#' Get and Set Time Conversion Factors
#'
#' `set_time_conversion()` validates the supplied factors and stores them,
#' together with the default time scale, in an internal settings environment.
#' Each factor must be a single finite, non-zero number.
#'
#' @param default The default time scale, commonly whichever has factor = 1.
#'   A single string, one of "days", "weeks", "months", "years"
#' @param days Factor to divide data time units to get time in days
#' @param weeks Factor to divide data time units to get time in weeks
#' @param months Factor to divide data time units to get time in months
#' @param years Factor to divide data time units to get time in years
#'
#' @return No value returned. Conversion factors are stored internally and used within functions.
#' @export
#' @rdname time_conversion
#'
#' @examples
#' # The default time scale is days:
#' set_time_conversion(default = "days", days = 1, weeks = 7, months = 365.25 / 12, years = 365.25)
#'
#' # Set the default time scale to years
#' set_time_conversion(
#'   default = "years",
#'   days = 1 / 365.25,
#'   weeks = 1 / 52.17857,
#'   months = 1 / 12,
#'   years = 1
#' )
#'
set_time_conversion <- function(default = "days", days = 1, weeks = 7, months = 365.25 / 12, years = 365.25) {
  # the length check keeps a vector-valued `default` from reaching `if` with a
  # condition of length > 1
  if (length(default) != 1L || !default %in% c("days", "weeks", "months", "years")) {
    stop("default must be one of \"days\", \"weeks\", \"months\", \"years\"")
  }
  # c() would flatten vector-valued (or drop NULL) factors and mangle the
  # names that later lookups rely on, so insist on one value per scale
  if (any(lengths(list(days, weeks, months, years)) != 1L)) {
    stop("Conversion factors must each be a single value")
  }
  factors <- c(days = days, weeks = weeks, months = months, years = years)
  check_factors <- vapply(factors, function(x) isFALSE(!is.finite(x) || x == 0), logical(1L))
  if (!all(check_factors)) {
    stop(
      "Conversion factors must be finite non-zero numerical values: ",
      paste0(names(factors)[!check_factors], " = ", factors[!check_factors], collapse = ", ")
    )
  }
  settings_env$time_conversion <- factors
  settings_env$default_time_scale <- default
}


#' @param factor Time factor to get. One or more of "days", "weeks", "months",
#'   "years"; by default all four are returned.
#' @rdname time_conversion
#' @export
#'
#' @examples
#' # Get time scale factors:
#' get_time_conversion("years")
#' get_time_conversion("weeks")
get_time_conversion <- function(factor = c("days", "weeks", "months", "years")) {
  factor <- match.arg(factor, several.ok = TRUE)
  # inherits = FALSE: look only inside settings_env itself. Otherwise an
  # unrelated `time_conversion` object in an enclosing environment would make
  # the factors look initialised and the lookup below would return NULL.
  if (!exists("time_conversion", envir = settings_env, inherits = FALSE)) {
    warning("No time conversion factors previously set. Setting defaults.")
    set_time_conversion()
  }
  settings_env$time_conversion[factor]
}


#' Convert Time Values Using Scaling Factors
#'
#' Divides `times` by the stored conversion factor of the requested time scale.
#' When `as` is `NULL`, the default time scale held in the internal settings is
#' used.
#'
#' @param times Numeric time values
#' @param as A time scale to convert to. One of "days", "weeks", "months", "years"
#'
#' @return Returns a numeric vector calculated from `times / get_time_conversion(factor = as)`
#' @export
#' @examples
#' get_time_as(50, as = "months")
get_time_as <- function(times, as = NULL) {
  if (is.null(as)) {
    as <- settings_env$default_time_scale
  }
  if (!is.numeric(times)) {
    stop("times arguments must be numeric")
  }
  target_scale <- match.arg(as, c("days", "weeks", "months", "years"))
  divisor <- get_time_conversion(target_scale)
  times / divisor
}

1		#' Unanchored MAIC for binary and time-to-event endpoint
2		#'
3		#' This is a wrapper function to provide adjusted effect estimates and relevant statistics in unanchored case (i.e.
4		#' there is no common comparator arm in the internal and external trial).
5		#'
6		#' @param weights_object an object returned by \code{estimate_weight}
7		#' @param ipd a data frame that meet format requirements in 'Details', individual patient data (IPD) of internal trial
8		#' @param pseudo_ipd a data frame, pseudo IPD from digitized KM curve of external trial (for time-to-event endpoint) or
9		#' from contingency table (for binary endpoint)
10		#' @param trt_ipd a string, name of the investigational arm of interest in the internal trial \code{ipd} (real IPD)
11		#' @param trt_agd a string, name of the interested investigation arm in external trial \code{pseudo_ipd} (pseudo IPD)
12		#' @param trt_var_ipd a string, column name in \code{ipd} that contains the treatment assignment
13		#' @param trt_var_agd a string, column name in \code{pseudo_ipd} that contains the treatment assignment
14		#' @param normalize_weights logical, default is \code{FALSE}. If \code{TRUE},
15		#' \code{scaled_weights} (normalized weights) in \code{weights_object$data} will be used.
16		#' @param endpoint_type a string, one out of the following "binary", "tte" (time to event)
17		#' @param eff_measure a string, "RD" (risk difference), "OR" (odds ratio), "RR" (relative risk) for a binary endpoint;
18		#' "HR" for a time-to-event endpoint. By default is \code{NULL}, "OR" is used for binary case, otherwise "HR" is used.
19		#' @param boot_ci_type a string, one of `c("norm","basic", "stud", "perc", "bca")` to select the type of bootstrap
20		#' confidence interval. See [boot::boot.ci] for more details.
21		#' @param endpoint_name a string, name of time to event endpoint, to be shown in the last line of title
22		#' @param time_scale a string, time unit of median survival time, taking a value of 'years', 'months', 'weeks' or
23		#' 'days'. NOTE: it is assumed that values in TIME column of \code{ipd} and \code{pseudo_ipd} is in the unit of days
24		#' @param km_conf_type a string, pass to \code{conf.type} of \code{survfit}
25		#' @param binary_robust_cov_type a string to pass to argument `type` of [sandwich::vcovHC], see possible options in the
26		#' documentation of that function. Default is `"HC3"`
27		#'
28		#' @details For time-to-event analysis, it is required that input \code{ipd} and \code{pseudo_ipd} to have the following
29		#' columns. This function is not sensitive to upper or lower case of letters in column names.
30		#' \itemize{
31		#' \item USUBJID - character, unique subject ID
32		#' \item ARM - character or factor, treatment indicator, column name does not have to be 'ARM'. User specify in
33		#' \code{trt_var_ipd} and \code{trt_var_agd}
34		#' \item EVENT - numeric, 1 for censored/death, 0 for otherwise
35		#' \item TIME - numeric column, observation time of the \code{EVENT}; unit in days
36		#' }
37		#'
38		#' @importFrom survival survfit Surv coxph
39		#' @importFrom lmtest coeftest coefci
40		#' @importFrom sandwich vcovHC
41		#' @importFrom boot boot boot.ci
42		#' @return A list, contains 'descriptive' and 'inferential'
43		#' @example inst/examples/maic_unanchored_ex.R
44		#' @example inst/examples/maic_unanchored_binary_ex.R
45		#' @export
46
47		maic_unanchored <- function(weights_object,
48		ipd,
49		pseudo_ipd,
50		trt_ipd,
51		trt_agd,
52		trt_var_ipd = "ARM",
53		trt_var_agd = "ARM",
54		normalize_weights = FALSE,
55		endpoint_type = "tte",
56		endpoint_name = "Time to Event Endpoint",
57		eff_measure = c("HR", "OR", "RR", "RD"),
58		boot_ci_type = c("norm", "basic", "stud", "perc", "bca"),
59		# time to event specific args
60		time_scale = "months",
61		km_conf_type = "log-log",
62		# binary specific args
63		binary_robust_cov_type = "HC3") {
64		# ==> Initial Setup ------------------------------------------
65		# ~~~ Create the hull for the output from this function
66	6x	res <- list(
67	6x	descriptive = list(),
68	6x	inferential = list()
69		)
70
71	6x	res_AB_unadj <- res_AB <- list(
72	6x	est = NA,
73	6x	se = NA,
74	6x	ci_l = NA,
75	6x	ci_u = NA,
76	6x	pval = NA
77		)
78
79		# ~~~ Initial colname process and precheck on effect measure
80	6x	names(ipd) <- toupper(names(ipd))
81	6x	names(pseudo_ipd) <- toupper(names(pseudo_ipd))
82	6x	trt_var_ipd <- toupper(trt_var_ipd)
83	6x	trt_var_agd <- toupper(trt_var_agd)
84
85	!	if (length(eff_measure) > 1) eff_measure <- NULL
86	!	if (is.null(eff_measure)) eff_measure <- list(binary = "OR", tte = "HR")[[endpoint_type]]
87
88		# ~~~ Setup ARM column and make related pre-checks
89	!	if (!trt_var_ipd %in% names(ipd)) stop("cannot find arm indicator column trt_var_ipd in ipd")
90	!	if (!trt_var_agd %in% names(pseudo_ipd)) stop("cannot find arm indicator column trt_var_agd in pseudo_ipd")
91	!	if (trt_var_ipd != "ARM") ipd$ARM <- ipd[[trt_var_ipd]]
92	!	if (trt_var_agd != "ARM") pseudo_ipd$ARM <- pseudo_ipd[[trt_var_agd]]
93	6x	ipd$ARM <- as.character(ipd$ARM) # just to avoid potential error when merging
94	6x	pseudo_ipd$ARM <- as.character(pseudo_ipd$ARM) # just to avoid potential error when merging
95	!	if (!trt_ipd %in% ipd$ARM) stop("trt_ipd does not exist in ipd$ARM")
96	!	if (!trt_agd %in% pseudo_ipd$ARM) stop("trt_agd does not exist in pseudo_ipd$ARM")
97
98		# ~~~ More pre-checks
99	6x	endpoint_type <- match.arg(endpoint_type, c("binary", "tte"))
100	6x	if (!"maicplus_estimate_weights" %in% class(weights_object)) {
101	!	stop("weights_object should be an object returned by estimate_weights")
102		}
103	6x	if (any(duplicated(ipd$USUBJID))) {
104	!	warning(
105	!	"check your ipd, it has duplicated usubjid, this indicates, ",
106	!	"it might contain multiple endpoints for each subject"
107		)
108		}
109	6x	if (!all(ipd$USUBJID %in% weights_object$data$USUBJID)) {
110	!	stop(
111	!	"These pts in ipd cannot be found in weights_object ",
112	!	toString(setdiff(ipd$USUBJID, weights_object$USUBJID))
113		)
114		}
115	6x	time_scale <- match.arg(arg = time_scale, choices = c("days", "weeks", "months", "years"))
116	6x	if (endpoint_type == "binary") { # for binary effect measure
117
118	!	if (any(!c("USUBJID", "RESPONSE") %in% names(ipd))) stop("ipd should have 'USUBJID', 'RESPONSE' columns at minimum")
119	4x	eff_measure <- match.arg(eff_measure, choices = c("OR", "RD", "RR"), several.ok = FALSE)
120	4x	binary_robust_cov_type <- match.arg(
121	4x	binary_robust_cov_type,
122	4x	choices = c("HC3", "const", "HC", "HC0", "HC1", "HC2", "HC4", "HC4m", "HC5")
123		)
124	2x	} else if (endpoint_type == "tte") { # for time to event effect measure
125
126	2x	if (!all(c("USUBJID", "TIME", "EVENT", trt_var_ipd) %in% names(ipd))) {
127	!	stop("ipd needs to include at least USUBJID, TIME, EVENT, ", trt_var_ipd)
128		}
129	2x	if (!all(c("TIME", "EVENT", trt_var_agd) %in% names(pseudo_ipd))) {
130	!	stop("pseudo_ipd needs to include at least TIME, EVENT, ", trt_var_agd)
131		}
132	2x	eff_measure <- match.arg(eff_measure, choices = c("HR"), several.ok = FALSE)
133		}
134	6x	boot_ci_type <- match.arg(boot_ci_type)
135
136		# ==> IPD and AgD data preparation ------------------------------------------
137		# : subset ipd, retain only ipd from interested trts
138	6x	ipd <- ipd[ipd$ARM == trt_ipd, , drop = TRUE]
139	6x	pseudo_ipd <- pseudo_ipd[pseudo_ipd$ARM == trt_agd, , drop = TRUE]
140
141		# : assign weights to real and pseudo ipd
142	6x	if (normalize_weights) {
143	!	ipd$weights <- weights_object$data$scaled_weights[match(ipd$USUBJID, weights_object$data$USUBJID)]
144		} else {
145	6x	ipd$weights <- weights_object$data$weights[match(ipd$USUBJID, weights_object$data$USUBJID)]
146		}
147	6x	pseudo_ipd$weights <- 1
148
149		# : necessary formatting for pseudo ipd
150	2x	if (!"USUBJID" %in% names(pseudo_ipd)) pseudo_ipd$USUBJID <- paste0("ID", seq_len(nrow(pseudo_ipd)))
151	6x	if ("RESPONSE" %in% names(pseudo_ipd) && is.logical(pseudo_ipd$RESPONSE)) {
152	4x	pseudo_ipd$RESPONSE <- as.numeric(pseudo_ipd$RESPONSE)
153		}
154
155		# : give warning when individual pts in IPD has no weights
156	6x	if (any(is.na(ipd$weights))) {
157	!	ipd <- ipd[!is.na(ipd$weights), , drop = FALSE]
158	!	warning(
159	!	paste(
160	!	"these usubjid in ipd have no weight in weights_object, and hence excluded from analysis:",
161	!	paste(ipd$USUBJID[is.na(ipd$weights)], collapse = ",")
162		)
163		)
164	!	if (nrow(ipd) == 0) stop("there is no pts with weight in IPD!!")
165		}
166
167		# : retain necessary columns
168	6x	if (endpoint_type == "tte") {
169	2x	retain_cols <- c("USUBJID", "ARM", "TIME", "EVENT", "weights")
170		} else {
171	4x	retain_cols <- c("USUBJID", "ARM", "RESPONSE", "weights")
172		}
173	6x	ipd <- ipd[, retain_cols, drop = FALSE]
174	6x	pseudo_ipd <- pseudo_ipd[, retain_cols, drop = FALSE]
175
176		# : merge real and pseudo ipds
177	6x	dat <- rbind(ipd, pseudo_ipd)
178	6x	dat$ARM <- factor(dat$ARM, levels = c(trt_agd, trt_ipd))
179
180		# ==> Inferential output ------------------------------------------
181
182	6x	result <- if (endpoint_type == "tte") {
183	2x	maic_unanchored_tte(
184	2x	res, res_AB, res_AB_unadj, dat, ipd, pseudo_ipd, km_conf_type, time_scale,
185	2x	weights_object, endpoint_name, normalize_weights, boot_ci_type, trt_ipd, trt_agd
186		)
187	6x	} else if (endpoint_type == "binary") {
188	4x	maic_unanchored_binary(
189	4x	res, res_AB, res_AB_unadj, dat, ipd, pseudo_ipd, binary_robust_cov_type,
190	4x	weights_object, endpoint_name, normalize_weights, eff_measure, boot_ci_type, trt_ipd, trt_agd
191		)
192		} else {
193	!	stop("Endpoint type ", endpoint_type, " currently unsupported.")
194		}
195
196		# output
197	6x	result
198		}
199
200		# MAIC inference functions for TTE outcome type ------------
201
202		maic_unanchored_tte <- function(res,
203		res_AB,
204		res_AB_unadj,
205		dat,
206		ipd,
207		pseudo_ipd,
208		km_conf_type,
209		time_scale,
210		weights_object,
211		endpoint_name,
212		normalize_weights,
213		boot_ci_type,
214		trt_ipd,
215		trt_agd) {
216		# ~~~ Descriptive table before and after matching
217		# : derive km w and w/o weights
218	2x	kmobj_dat <- survfit(Surv(TIME, EVENT) ~ ARM, dat, conf.type = km_conf_type)
219	2x	kmobj_dat_adj <- survfit(Surv(TIME, EVENT) ~ ARM, dat, weights = dat$weights, conf.type = km_conf_type)
220	2x	res$descriptive[["survfit_before"]] <- survfit_makeup(kmobj_dat)
221	2x	res$descriptive[["survfit_after"]] <- survfit_makeup(kmobj_dat_adj)
222		# : derive median survival time
223	2x	medSurv_dat <- medSurv_makeup(kmobj_dat, legend = "Before matching", time_scale = time_scale)
224	2x	medSurv_dat_adj <- medSurv_makeup(kmobj_dat_adj, legend = "After matching", time_scale = time_scale)
225	2x	medSurv_out <- rbind(medSurv_dat, medSurv_dat_adj)
226	2x	medSurv_out <- cbind(trt_ind = c("B", "A")[match(medSurv_out$treatment, levels(dat$ARM))], medSurv_out)
227
228	2x	res$descriptive[["summary"]] <- medSurv_out
229
230		# ~~~ Analysis table (Cox model) before and after matching
231		# : fit PH Cox regression model
232	2x	coxobj_dat <- coxph(Surv(TIME, EVENT) ~ ARM, dat)
233	2x	coxobj_dat_adj <- coxph(Surv(TIME, EVENT) ~ ARM, dat, weights = weights, robust = TRUE)
234
235		# : derive adjusted estimate for ipd exp arm vs agd exp arm
236	2x	res_AB$est <- summary(coxobj_dat_adj)$conf.int[1]
237	2x	mu <- summary(coxobj_dat_adj)$coef[1]
238	2x	sig <- summary(coxobj_dat_adj)$coef[4]
239	2x	res_AB$se <- sqrt((exp(sig^2) - 1) * exp(2 * mu + sig^2)) # log normal parametrization
240	2x	res_AB$ci_l <- summary(coxobj_dat_adj)$conf.int[3]
241	2x	res_AB$ci_u <- summary(coxobj_dat_adj)$conf.int[4]
242	2x	res_AB$pval <- summary(coxobj_dat_adj)$coef[6]
243
244		# : derive unadjusted estimate
245	2x	res_AB_unadj$est <- summary(coxobj_dat)$conf.int[1]
246	2x	mu <- summary(coxobj_dat)$coef[1]
247	2x	sig <- summary(coxobj_dat)$coef[3]
248	2x	res_AB_unadj$se <- sqrt((exp(sig^2) - 1) * exp(2 * mu + sig^2)) # log normal parametrization
249	2x	res_AB_unadj$ci_l <- summary(coxobj_dat)$conf.int[3]
250	2x	res_AB_unadj$ci_u <- summary(coxobj_dat)$conf.int[4]
251	2x	res_AB_unadj$pval <- summary(coxobj_dat)$coef[5]
252
253		# : get bootstrapped estimates if applicable
254	2x	if (!is.null(weights_object$boot)) {
255	1x	keep_rows <- setdiff(seq_len(nrow(weights_object$data)), weights_object$rows_with_missing)
256	1x	boot_ipd_id <- weights_object$data[keep_rows, "USUBJID", drop = FALSE]
257
258	1x	boot_ipd <- merge(boot_ipd_id, ipd, by = "USUBJID", all.x = TRUE)
259	!	if (nrow(boot_ipd) != nrow(boot_ipd_id)) stop("ipd has multiple observations for some patients")
260	1x	boot_ipd <- boot_ipd[match(boot_ipd$USUBJID, boot_ipd_id$USUBJID), ]
261
262	1x	stat_fun <- function(data, index, w_obj, pseudo_ipd, normalize) {
263	501x	boot_ipd <- data[index, ]
264	501x	r <- dynGet("r", ifnotfound = NA) # Get bootstrap iteration
265	501x	if (!is.na(r)) {
266	!	if (!all(index == w_obj$boot[, 1, r])) stop("Bootstrap and weight indices don't match")
267	500x	boot_ipd$weights <- w_obj$boot[, 2, r]
268	!	if (normalize) boot_ipd$weights <- boot_ipd$weights / mean(boot_ipd$weights, na.rm = TRUE)
269		}
270	501x	boot_dat <- rbind(boot_ipd, pseudo_ipd)
271	501x	boot_dat$ARM <- factor(boot_dat$ARM, levels = c(trt_agd, trt_ipd))
272	501x	boot_coxobj_dat_adj <- coxph(Surv(TIME, EVENT) ~ ARM, boot_dat, weights = weights)
273	501x	c(est = coef(boot_coxobj_dat_adj)[1], var = vcov(boot_coxobj_dat_adj)[1, 1])
274		}
275
276		# Revert seed to how it was for weight bootstrap sampling
277	1x	old_seed <- globalenv()$.Random.seed
278	1x	on.exit(suspendInterrupts(set_random_seed(old_seed)))
279	1x	set_random_seed(weights_object$boot_seed)
280	1x	R <- dim(weights_object$boot)[3]
281
282	1x	boot_res <- boot(
283	1x	boot_ipd,
284	1x	stat_fun,
285	1x	R = R,
286	1x	w_obj = weights_object,
287	1x	pseudo_ipd = pseudo_ipd,
288	1x	normalize = normalize_weights,
289	1x	strata = weights_object$boot_strata
290		)
291	1x	boot_ci <- boot.ci(boot_res, type = boot_ci_type, w_obj = weights_object, pseudo_ipd = pseudo_ipd)
292
293	1x	l_u_index <- switch(boot_ci_type,
294	1x	"norm" = list(2, 3, "normal"),
295	1x	"basic" = list(4, 5, "basic"),
296	1x	"stud" = list(4, 5, "student"),
297	1x	"perc" = list(4, 5, "percent"),
298	1x	"bca" = list(4, 5, "bca")
299		)
300
301	1x	transform_estimate <- exp
302	1x	boot_res_AB <- list(
303	1x	est = as.vector(transform_estimate(boot_res$t0[1])),
304	1x	se = NA,
305	1x	ci_l = transform_estimate(boot_ci[[l_u_index[[3]]]][l_u_index[[1]]]),
306	1x	ci_u = transform_estimate(boot_ci[[l_u_index[[3]]]][l_u_index[[2]]]),
307	1x	pval = NA
308		)
309		} else {
310	1x	boot_res_AB <- NULL
311	1x	boot_res <- NULL
312		}
313
314		# : report all raw fitted obj
315	2x	res$inferential[["fit"]] <- list(
316	2x	km_before = kmobj_dat,
317	2x	km_after = kmobj_dat_adj,
318	2x	model_before = coxobj_dat,
319	2x	model_after = coxobj_dat_adj,
320	2x	res_AB = res_AB,
321	2x	res_AB_unadj = res_AB_unadj,
322	2x	boot_res = boot_res,
323	2x	boot_res_AB = boot_res_AB
324		)
325
326		# : compile HR result
327	2x	res$inferential[["summary"]] <- data.frame(
328	2x	case = c("AB", "adjusted_AB"),
329	2x	HR = c(res_AB_unadj$est, res_AB$est),
330	2x	LCL = c(res_AB_unadj$ci_l, res_AB$ci_l),
331	2x	UCL = c(res_AB_unadj$ci_u, res_AB$ci_u),
332	2x	pval = c(res_AB_unadj$pval, res_AB$pval)
333		)
334
335		# output
336	2x	res
337		}
338
339		# MAIC inference functions for Binary outcome type ------------
340
341		maic_unanchored_binary <- function(res,
342		res_AB,
343		res_AB_unadj,
344		dat,
345		ipd,
346		pseudo_ipd,
347		binary_robust_cov_type,
348		weights_object,
349		endpoint_name,
350		normalize_weights,
351		eff_measure,
352		boot_ci_type,
353		trt_ipd,
354		trt_agd) {
355		# ~~~ Analysis table
356		# : set up proper link
357	4x	glm_link <- switch(eff_measure,
358	4x	"RD" = "identity",
359	4x	"RR" = "log",
360	4x	"OR" = "logit"
361		)
362	4x	transform_estimate <- switch(eff_measure,
363	4x	"RD" = function(x) x * 100,
364	4x	"RR" = exp,
365	4x	"OR" = exp
366		)
367
368		# : fit glm for binary outcome and robust estimate with weights
369	4x	binobj_dat <- glm(RESPONSE ~ ARM, dat, family = binomial(link = glm_link))
370	4x	binobj_dat_adj <- suppressWarnings(glm(RESPONSE ~ ARM, dat, weights = weights, family = binomial(link = glm_link)))
371
372	4x	bin_robust_cov <- sandwich::vcovHC(binobj_dat_adj, type = binary_robust_cov_type)
373	4x	bin_robust_coef <- lmtest::coeftest(binobj_dat_adj, vcov. = bin_robust_cov)
374	4x	bin_robust_ci <- lmtest::coefci(binobj_dat_adj, vcov. = bin_robust_cov)
375
376		# : make general summary
377	4x	glmDesc_dat <- glm_makeup(binobj_dat, legend = "Before matching", weighted = FALSE)
378	4x	glmDesc_dat_adj <- glm_makeup(binobj_dat_adj, legend = "After matching", weighted = TRUE)
379	4x	glmDesc <- rbind(glmDesc_dat, glmDesc_dat_adj)
380	4x	glmDesc <- cbind(trt_ind = c("B", "A")[match(glmDesc$treatment, levels(dat$ARM))], glmDesc)
381	4x	rownames(glmDesc) <- NULL
382	4x	res$descriptive[["summary"]] <- glmDesc
383
384		# : derive adjusted estimate
385	4x	res_AB$est <- bin_robust_coef[2, "Estimate"]
386	4x	res_AB$se <- bin_robust_coef[2, "Std. Error"]
387	4x	res_AB$ci_l <- bin_robust_ci[2, "2.5 %"]
388	4x	res_AB$ci_u <- bin_robust_ci[2, "97.5 %"]
389	4x	res_AB$pval <- bin_robust_coef[2, "Pr(>\|z\|)"]
390
391		# : derive unadjusted estimate
392	4x	binobj_dat_summary <- summary(binobj_dat)
393	4x	res_AB_unadj$est <- binobj_dat_summary$coefficients[2, "Estimate"]
394	4x	res_AB_unadj$se <- binobj_dat_summary$coefficients[2, "Std. Error"]
395	4x	res_AB_unadj$ci_l <- confint.default(binobj_dat)[2, "2.5 %"]
396	4x	res_AB_unadj$ci_u <- confint.default(binobj_dat)[2, "97.5 %"]
397	4x	res_AB_unadj$pval <- binobj_dat_summary$coefficients[2, "Pr(>\|z\|)"]
398
399		# : transform
400	4x	if (eff_measure %in% c("RR", "OR")) {
401	3x	res_AB <- transform_ratio(res_AB)
402	3x	res_AB_unadj <- transform_ratio(res_AB_unadj)
403	1x	} else if (eff_measure == "RD") {
404	1x	res_AB <- transform_absolute(res_AB)
405	1x	res_AB_unadj <- transform_absolute(res_AB_unadj)
406		}
407
408		# : get bootstrapped estimates if applicable
409	4x	if (!is.null(weights_object$boot)) {
410	1x	keep_rows <- setdiff(seq_len(nrow(weights_object$data)), weights_object$rows_with_missing)
411	1x	boot_ipd_id <- weights_object$data[keep_rows, "USUBJID", drop = FALSE]
412
413	1x	boot_ipd <- merge(boot_ipd_id, ipd, by = "USUBJID", all.x = TRUE)
414	!	if (nrow(boot_ipd) != nrow(boot_ipd_id)) stop("ipd has multiple observations for some patients")
415	1x	boot_ipd <- boot_ipd[match(boot_ipd$USUBJID, boot_ipd_id$USUBJID), ]
416
417	1x	stat_fun <- function(data, index, w_obj, pseudo_ipd, normalize) {
418	21x	boot_ipd <- data[index, ]
419	21x	r <- dynGet("r", ifnotfound = NA) # Get bootstrap iteration
420	21x	if (!is.na(r)) {
421	!	if (!all(index == w_obj$boot[, 1, r])) stop("Bootstrap and weight indices don't match")
422	20x	boot_ipd$weights <- w_obj$boot[, 2, r]
423	!	if (normalize) boot_ipd$weights <- boot_ipd$weights / mean(boot_ipd$weights, na.rm = TRUE)
424		}
425	21x	boot_dat <- rbind(boot_ipd, pseudo_ipd)
426	21x	boot_dat$ARM <- factor(boot_dat$ARM, levels = c(trt_agd, trt_ipd))
427	21x	boot_binobj_dat_adj <- suppressWarnings(
428	21x	glm(RESPONSE ~ ARM, boot_dat, weights = weights, family = binomial(link = glm_link))
429		)
430	21x	c(est = coef(boot_binobj_dat_adj)[2], var = vcov(boot_binobj_dat_adj)[2, 2])
431		}
432
433		# Revert seed to how it was for weight bootstrap sampling
434	1x	old_seed <- globalenv()$.Random.seed
435	1x	on.exit(suspendInterrupts(set_random_seed(old_seed)))
436	1x	set_random_seed(weights_object$boot_seed)
437	1x	R <- dim(weights_object$boot)[3]
438	1x	boot_res <- boot(
439	1x	boot_ipd,
440	1x	stat_fun,
441	1x	R = R,
442	1x	w_obj = weights_object,
443	1x	pseudo_ipd = pseudo_ipd,
444	1x	normalize = normalize_weights,
445	1x	strata = weights_object$boot_strata
446		)
447	1x	boot_ci <- boot.ci(boot_res, type = boot_ci_type, w_obj = weights_object, pseudo_ipd = pseudo_ipd)
448
449	1x	l_u_index <- switch(boot_ci_type,
450	1x	"norm" = list(2, 3, "normal"),
451	1x	"basic" = list(4, 5, "basic"),
452	1x	"stud" = list(4, 5, "student"),
453	1x	"perc" = list(4, 5, "percent"),
454	1x	"bca" = list(4, 5, "bca")
455		)
456
457	1x	boot_res_AB <- list(
458	1x	est = as.vector(transform_estimate(boot_res$t0[1])),
459	1x	se = NA,
460	1x	ci_l = transform_estimate(boot_ci[[l_u_index[[3]]]][l_u_index[[1]]]),
461	1x	ci_u = transform_estimate(boot_ci[[l_u_index[[3]]]][l_u_index[[2]]]),
462	1x	pval = NA
463		)
464		} else {
465	3x	boot_res_AB <- NULL
466	3x	boot_res <- NULL
467		}
468
469		# : report all raw fitted obj
470	4x	res$inferential[["fit"]] <- list(
471	4x	model_before = binobj_dat,
472	4x	model_after = binobj_dat_adj,
473	4x	res_AB = res_AB,
474	4x	res_AB_unadj = res_AB_unadj,
475	4x	boot_res = boot_res,
476	4x	boot_res_AB = boot_res_AB
477		)
478
479		# : compile binary effect estimate result
480	4x	res$inferential[["summary"]] <- data.frame(
481	4x	case = c("AB", "adjusted_AB"),
482	4x	EST = c(
483	4x	res_AB_unadj$est,
484	4x	res_AB$est
485		),
486	4x	LCL = c(
487	4x	res_AB_unadj$ci_l,
488	4x	res_AB$ci_l
489		),
490	4x	UCL = c(
491	4x	res_AB_unadj$ci_u,
492	4x	res_AB$ci_u
493		),
494	4x	pval = c(
495	4x	res_AB_unadj$pval,
496	4x	res_AB$pval
497		)
498		)
499	4x	names(res$inferential[["summary"]])[2] <- eff_measure
500
501		# : output
502	4x	res
503		}

1		# Functions for pre-processing data before conduct MAIC
2
3		# Functions to be exported ---------------------------------------
4
5		#' Pre-process aggregate data
6		#'
7		#' This function checks the format of the aggregate data.
8		#' Data is required to have three columns: STUDY, ARM, and N.
9		#' Column names that do not have legal suffixes (MEAN, MEDIAN, SD, COUNT, or PROP) are dropped.
10		#' If a variable is a count variable, it is converted to proportions by dividing the sample size (N).
11		#' Note, when the count is specified, proportion is always calculated based on the count, that is,
12		#' specified proportion will be ignored if applicable.
13		#' If the aggregated data comes from multiple sources (i.e. different analysis population) and
14		#' sample size differs for each variable, one option is to specify proportion directly instead of count by using suffix
15		#' _PROP.
16		#'
17		#' @param raw_agd raw aggregate data should contain STUDY, ARM, and N. Variable names should be followed
18		#' by legal suffixes (i.e. MEAN, MEDIAN, SD, COUNT, or PROP).
19		#'
20		#' @examples
21		#' data(agd)
22		#' agd <- process_agd(agd)
23		#'
24		#' @return pre-processed aggregate level data
25		#' @export
26
process_agd <- function(raw_agd) {
  # Validate the raw aggregate data, keep only conventionally named columns,
  # convert counts to proportions and return the pooled ("total") row(s).
  raw_agd <- as.data.frame(raw_agd)
  # upper-case all column names so that matching does not depend on naming style
  names(raw_agd) <- toupper(names(raw_agd))

  # define column name patterns
  must_exist <- c("STUDY", "ARM", "N")
  legal_suffix <- c("MEAN", "MEDIAN", "SD", "COUNT", "PROP")

  # swap "TREATMENT" column to "ARM", if applicable
  if ("TREATMENT" %in% names(raw_agd) && (!"ARM" %in% names(raw_agd))) {
    raw_agd$ARM <- raw_agd$TREATMENT
    # drop = FALSE keeps a data frame even if only one column is left
    raw_agd <- raw_agd[, names(raw_agd) != "TREATMENT", drop = FALSE]
    warning("'TREATMENT' column is renamed as 'ARM'")
  }

  # check: must exist
  if (!all(must_exist %in% names(raw_agd))) {
    stop("At least 1 of the must-exists columns (STUDY, ARM, N) cannot be found in raw_agd!")
  }

  # check: legal suffix (the token after the last "_" must be a legal suffix)
  other_colnames <- setdiff(names(raw_agd), must_exist)
  has_underscore <- grepl("_", other_colnames, fixed = TRUE)
  # vapply() guarantees a character vector even when there are no extra columns
  last_token <- vapply(
    strsplit(other_colnames, split = "_"),
    function(parts) if (length(parts) == 0) "" else parts[length(parts)],
    character(1)
  )
  has_legal_suffix <- last_token %in% legal_suffix

  use_cols <- other_colnames[has_underscore & has_legal_suffix]
  use_agd <- raw_agd[, c(must_exist, use_cols), drop = FALSE]
  if (!all(other_colnames %in% use_cols)) {
    warning(paste0(
      "following columns are ignored since it does not follow the naming conventions:",
      paste(setdiff(other_colnames, use_cols), collapse = ",")
    ))
  }

  # If the aggregate data is divided by different arms, calculate pooled arm statistics using
  # complete_agd function; complete statistics is specified by ARM=="Total"
  if (!"total" %in% tolower(use_agd$ARM)) {
    use_agd <- complete_agd(use_agd)
  }

  # calculate percentage columns
  is_count <- grepl("_COUNT$", names(use_agd))
  if (any(is_count)) {
    for (i in which(is_count)) {
      tmp_prop <- use_agd[[i]] / use_agd$N
      # in case some counts are not specified but proportions are, copy over those proportions;
      # this also means that when a count is specified, the matching proportion is ignored
      prop_name_i <- gsub("_COUNT$", "_PROP", names(use_agd)[i])
      if (prop_name_i %in% names(use_agd)) {
        tmp_prop[is.na(tmp_prop)] <- use_agd[is.na(tmp_prop), prop_name_i]
        # mark the original proportion column for removal below
        names(use_agd)[names(use_agd) == prop_name_i] <- paste0(prop_name_i, "_redundant")
      }
      use_agd[[i]] <- tmp_prop
    }
    names(use_agd) <- gsub("_COUNT$", "_PROP", names(use_agd))
  }
  use_agd <- use_agd[, !grepl("_redundant$", names(use_agd)), drop = FALSE]

  # output: only the pooled row(s)
  use_agd[tolower(use_agd$ARM) == "total", , drop = FALSE]
}
93
94
95		#' Create dummy variables from categorical variables in an individual patient data (ipd)
96		#'
97		#' This is a convenient function to convert categorical variables into dummy binary variables.
98		#' This would be especially useful if the variable has more than two factors.
99		#' Note that the original variable is kept after a variable is dummized.
100		#'
101		#' @param raw_ipd ipd data that contains variable to dummize
102		#' @param dummize_cols vector of column names to binarize
103		#' @param dummize_ref_level vector of reference level of the variables to binarize
104		#'
105		#' @examples
106		#' data(adsl_twt)
107		#' dummize_ipd(adsl_twt, dummize_cols = c("SEX"), dummize_ref_level = c("Male"))
108		#'
109		#' @return ipd with dummized columns
110		#' @export
111
dummize_ipd <- function(raw_ipd, dummize_cols, dummize_ref_level) {
  # For each requested column, append one 0/1 indicator column per
  # non-reference level. The original column is kept.
  for (i in seq_along(dummize_cols)) {
    col_name <- dummize_cols[i]
    ref_level <- dummize_ref_level[i]

    values <- as.character(raw_ipd[[col_name]])
    # non-reference levels in order of first appearance; NA is not a level
    other_levels <- setdiff(unique(values[!is.na(values)]), ref_level)

    # nothing to add when the variable only takes the reference level
    if (length(other_levels) == 0) next

    # lapply() always yields one full-length column per level, whatever the
    # number of rows or levels (sapply() changed shape with both)
    dummies <- lapply(other_levels, function(lvl) as.numeric(values == lvl))
    new_cols <- as.data.frame(dummies)
    # set names afterwards so they are not altered by make.names()
    names(new_cols) <- toupper(paste(col_name, other_levels, sep = "_"))
    raw_ipd <- cbind(raw_ipd, new_cols)
  }
  raw_ipd
}
126
127
128		#' Center individual patient data (IPD) variables using aggregate data averages
129		#'
130		#' This function subtracts IPD variables (prognostic variables and/or effect modifiers)
131		#' by the aggregate data averages. This centering is needed in order to calculate weights.
132		#' IPD and aggregate data variable names should match.
133		#'
134		#' @param ipd IPD variable names should match the aggregate data names without the suffix.
135		#' This would involve either changing the aggregate data name or the ipd name.
136		#' For instance, if we binarize SEX variable with MALE as a reference using [dummize_ipd],
137		#' function names the new variable as SEX_MALE.
138		#' In this case, SEX_MALE should also be available in the aggregate data.
139		#' @param agd pre-processed aggregate data which contain STUDY, ARM, and N. Variable names
140		#' should be followed by legal suffixes (i.e. MEAN, MEDIAN, SD, or PROP). Note that COUNT
141		#' suffix is no longer accepted.
142		#' @examples
143		#' data(adsl_sat)
144		#' data(agd)
145		#' agd <- process_agd(agd)
146		#' ipd_centered <- center_ipd(ipd = adsl_sat, agd = agd)
147		#' @return centered ipd using aggregate level data averages
148		#' @export
149
center_ipd <- function(ipd, agd) {
  # Add "*_CENTERED" columns to ipd by subtracting the aggregate targets found
  # in agd. Columns of agd other than STUDY/ARM/N are interpreted by suffix.
  must_exist <- c("STUDY", "ARM", "N")
  legal_suffix <- c("MEAN", "MEDIAN", "SD", "PROP")
  suffix_pat <- paste(paste0("_", legal_suffix, "$"), collapse = "|")

  # each agd row is processed in turn; centered columns written for an earlier
  # row are overwritten by a later row that provides the same statistic
  for (i in seq_len(nrow(agd))) {
    use_agd <- agd[i, !names(agd) %in% must_exist, drop = FALSE]
    param_id <- gsub(suffix_pat, "", names(use_agd))

    for (j in seq_len(ncol(use_agd))) {
      target <- use_agd[[j]]
      # statistic not reported for this variable: nothing to center on
      if (is.na(target)) next

      agd_name <- names(use_agd)[j]
      ipd_param <- param_id[j]

      is_mean_or_prop <- grepl("_MEAN$|_PROP$", agd_name)
      is_median <- grepl("_MEDIAN$", agd_name)
      is_sd <- grepl("_SD$", agd_name)
      # columns without a recognised suffix are ignored
      if (!(is_mean_or_prop || is_median || is_sd)) next

      # fail with a message naming the variable instead of the opaque
      # "replacement has 0 rows" error raised by the assignment below
      if (!ipd_param %in% names(ipd)) {
        stop(
          "'", ipd_param, "' (required by aggregate column '", agd_name,
          "') cannot be found in ipd",
          call. = FALSE
        )
      }

      if (is_mean_or_prop) {
        ipd[[paste0(ipd_param, "_CENTERED")]] <- ipd[[ipd_param]] - target
      } else if (is_median) {
        # indicator of being above the reported median, centered at 0.5
        ipd[[paste0(ipd_param, "_MEDIAN_CENTERED")]] <- (ipd[[ipd_param]] > target) - 0.5
      } else {
        # match the second moment: E[X^2] = SD^2 + MEAN^2
        mean_name <- paste0(ipd_param, "_MEAN")
        if (!mean_name %in% names(use_agd)) {
          stop(
            "'", mean_name, "' is required in agd to center on '", agd_name, "'",
            call. = FALSE
          )
        }
        tmp_aim <- target^2 + (use_agd[[mean_name]]^2)
        ipd[[paste0(ipd_param, "_SQUARED_CENTERED")]] <- ipd[[ipd_param]]^2 - tmp_aim
      }
    } # end of j
  } # end of i

  # output
  ipd
}
182
183
184		#' Calculate pooled arm statistics in Aggregated Data (AgD) based on arm-specific statistics
185		#'
186		#' This is a convenient function to pool arm statistics. This function is called
187		#' within process_agd and when the ARM is not equal to "Total". Note pooled
188		#' median can't be calculated and it is only an approximation.
189		#'
190		#' @param use_agd aggregated data that is processed within process_agd
191		#' @noRd
192		#' @return Complete N, count, mean, sd, and median for the pooled arm
193
complete_agd <- function(use_agd) {
  # Append a pooled "total" row computed from the arm-specific rows.
  arms <- as.data.frame(use_agd)
  arms <- arms[tolower(arms[["ARM"]]) != "total", , drop = FALSE]

  if (nrow(arms) < 2) stop("error in call complete_agd: need to have at least 2 rows that ARM!='total' ")

  n_arms <- nrow(arms)
  arm_rows <- seq_len(n_arms)
  total_row <- n_arms + 1

  # new row starts as all-NA, then gets its identifiers
  arms[total_row, ] <- NA
  arms$STUDY[total_row] <- arms$STUDY[1]
  arms$ARM[total_row] <- "total"

  # pooled sample size
  arm_n <- arms[["N"]][arm_rows]
  total_n <- sum(arm_n, na.rm = TRUE)
  arms[["N"]][total_row] <- total_n

  # one pooling rule per column suffix
  poolers <- list(
    "_COUNT$" = function(v) sum(v, na.rm = TRUE),
    # sample-size weighted mean
    "_MEAN$" = function(v) sum(v * arm_n) / total_n,
    # NOTE(review): this pools within-arm variances only; differences between
    # arm means do not enter the formula -- confirm this is intended
    "_SD$" = function(v) sqrt(sum(v^2 * (arm_n - 1)) / (total_n - 1)),
    # a pooled median cannot be derived; the mean of medians is an approximation
    "_MEDIAN$" = function(v) mean(v)
  )
  for (pattern in names(poolers)) {
    for (col in grep(pattern, names(arms), value = TRUE)) {
      arms[[col]][total_row] <- poolers[[pattern]](arms[[col]][arm_rows])
    }
  }

  # output
  rownames(arms) <- NULL
  arms
}
233
234
235		#' helper function: transform TTE ADaM data to suitable input for survival R package
236		#'
237		#' @param dd data frame, ADTTE read via `haven::read_sas`
238		#' @param time_scale a character string, 'years', 'months', 'weeks' or 'days', time unit of median survival time
239		#' @param trt values to include in treatment column
240		#'
241		#' @return a data frame that can be used as input to `survival::Surv`
242		#' @keywords internal
243
ext_tte_transfer <- function(dd, time_scale = "months", trt = NULL) {
  # Derive `status`, `AVAL`, `time` (and optionally `treatment`) columns from
  # whichever of CENSOR / EVENT / TIME are present in dd.
  time_scale <- match.arg(time_scale, choices = c("years", "months", "weeks", "days"))
  time_units <- get_time_conversion(time_scale)

  available <- names(dd)

  if ("CENSOR" %in% available) {
    # rows with unknown censoring status are dropped
    has_censor <- !is.na(dd$CENSOR)
    dd <- dd[has_censor, ]
    dd$status <- as.numeric(dd$CENSOR == 0)
  }

  # EVENT, when present, takes precedence over the CENSOR-derived status
  if ("EVENT" %in% available) {
    dd$status <- as.numeric(as.character(dd$EVENT))
  }

  if ("TIME" %in% available) {
    dd$AVAL <- as.numeric(as.character(dd$TIME))
  }

  dd$time <- dd$AVAL * time_units
  if (!is.null(trt)) {
    dd$treatment <- trt
  }

  as.data.frame(dd)
}

1		#' Kaplan-Meier (KM) plot function for anchored and unanchored cases using ggplot
2		#'
3		#' This is a wrapper function of \code{basic_kmplot2}.
4		#' The argument setting is similar to \code{maic_anchored} and \code{maic_unanchored},
5		#' and it is used in those two functions.
6		#'
7		#' @param weights_object an object returned by \code{estimate_weight}
8		#' @param tte_ipd a data frame of individual patient data (IPD) of internal trial, contain at least `"USUBJID"`,
9		#' `"EVENT"`, `"TIME"` columns and a column indicating treatment assignment
10		#' @param tte_pseudo_ipd a data frame of pseudo IPD by digitized KM curves of external trial (for time-to-event
11		#' endpoint), contain at least `"EVENT"`, `"TIME"`
12		#' @param trt_ipd a string, name of the interested investigation arm in internal trial \code{dat_igd} (real IPD)
13		#' @param trt_agd a string, name of the interested investigation arm in external trial \code{dat_pseudo} (pseudo IPD)
14		#' @param trt_common a string, name of the common comparator in internal and external trial, by default is NULL,
15		#' indicating unanchored case
16		#' @param trt_var_ipd a string, column name in \code{tte_ipd} that contains the treatment assignment
17		#' @param trt_var_agd a string, column name in \code{tte_pseudo_ipd} that contains the treatment assignment
18		#' @param normalize_weights logical, default is \code{FALSE}. If \code{TRUE},
19		#' \code{scaled_weights} (normalized weights) in \code{weights_object$data} will be used.
20		#' @param km_conf_type a string, pass to \code{conf.type} of \code{survfit}
21		#' @param km_layout a string, only applicable for unanchored case (\code{trt_common = NULL}), indicated the
22		#' desired layout of output KM curve.
23		#' @param time_scale a string, time unit of median survival time, taking a value of 'years', 'months',
24		#' 'weeks' or 'days'
25		#' @param ... other arguments in \code{basic_kmplot2}
26		#'
27		#' @return In unanchored case, a KM plot with risk set table. In anchored case, depending on \code{km_layout},
28		#' \itemize{
29		#' \item if "by_trial", 2 by 1 plot, first all KM curves (incl. weighted) in IPD trial, and then KM curves in AgD
30		#' trial, with risk set table.
31		#' \item if "by_arm", 2 by 1 plot, first KM curves of \code{trt_agd} and \code{trt_ipd} (with and without weights),
32		#' and then KM curves of \code{trt_common} in AgD trial and IPD trial (with and without weights). Risk set table is
33		#' appended.
34		#' \item if "all", 2 by 2 plot, all plots in "by_trial" and "by_arm" without risk set table appended.
35		#' }
36		#' @example inst/examples/kmplot2_unanchored_ex.R
37		#' @example inst/examples/kmplot2_anchored_ex.R
38		#' @export
39
kmplot2 <- function(weights_object,
                    tte_ipd,
                    tte_pseudo_ipd,
                    trt_ipd,
                    trt_agd,
                    trt_common = NULL,
                    normalize_weights = FALSE,
                    trt_var_ipd = "ARM",
                    trt_var_agd = "ARM",
                    km_conf_type = "log-log",
                    km_layout = c("all", "by_trial", "by_arm"),
                    time_scale,
                    ...) {
  # Fit unweighted and weighted Kaplan-Meier curves for the requested arms and
  # hand them to basic_kmplot2(): one plot in the unanchored case, several
  # panels arranged by survminer in the anchored case.
  if (!requireNamespace("survminer", quietly = TRUE)) stop("survminer package is required for this function")

  names(tte_ipd) <- toupper(names(tte_ipd))
  names(tte_pseudo_ipd) <- toupper(names(tte_pseudo_ipd))
  trt_var_ipd <- toupper(trt_var_ipd)
  trt_var_agd <- toupper(trt_var_agd)

  # pre check
  if (!inherits(weights_object, "maicplus_estimate_weights")) {
    stop("weights_object should be an object returned by estimate_weights")
  }
  if (!all(c("USUBJID", "TIME", "EVENT", trt_var_ipd) %in% names(tte_ipd))) {
    stop(paste("tte_ipd needs to include at least USUBJID, TIME, EVENT,", trt_var_ipd))
  }
  if (!all(c("TIME", "EVENT", trt_var_agd) %in% names(tte_pseudo_ipd))) {
    stop(paste("tte_pseudo_ipd needs to include at least TIME, EVENT,", trt_var_agd))
  }
  km_layout <- match.arg(km_layout, choices = c("all", "by_trial", "by_arm"), several.ok = FALSE)

  # preparing data
  is_anchored <- !is.null(trt_common)
  tte_ipd <- tte_ipd[tte_ipd[[trt_var_ipd]] %in% c(trt_ipd, trt_common), , drop = FALSE]
  tte_pseudo_ipd <- tte_pseudo_ipd[tte_pseudo_ipd[[trt_var_agd]] %in% c(trt_agd, trt_common), , drop = FALSE]

  # Look up each IPD subject in the weights table (not the other way round), so
  # every row receives its own weight whatever the row order or row count of
  # the two data sets. Subjects absent from the weights table get NA.
  weight_col <- if (normalize_weights) "scaled_weights" else "weights"
  weight_row <- match(tte_ipd$USUBJID, weights_object$data$USUBJID)
  tte_ipd$weights <- weights_object$data[[weight_col]][weight_row]
  tte_pseudo_ipd$weights <- 1

  tte_ipd$TIME <- get_time_as(tte_ipd$TIME, as = time_scale)
  tte_pseudo_ipd$TIME <- get_time_as(tte_pseudo_ipd$TIME, as = time_scale)

  # single-arm KM fit, optionally using the `weights` column of `data`
  my_survfit <- function(data, weighted = FALSE) {
    if (weighted) {
      survfit(Surv(TIME, EVENT) ~ 1, data = data, conf.type = km_conf_type, weights = data$weights)
    } else {
      survfit(Surv(TIME, EVENT) ~ 1, data = data, conf.type = km_conf_type)
    }
  }

  if (!is_anchored) {
    kmlist <- list(
      kmobj_B = my_survfit(data = tte_pseudo_ipd),
      kmobj_A = my_survfit(data = tte_ipd),
      kmobj_A_adj = my_survfit(data = tte_ipd, weighted = TRUE)
    )
    kmlist_name <- c(trt_agd, trt_ipd, paste0(trt_ipd, " (weighted)"))
    basic_kmplot2(kmlist, kmlist_name, ...)
  } else {
    ipd_trt <- tte_ipd[tte_ipd[[trt_var_ipd]] == trt_ipd, ]
    ipd_common <- tte_ipd[tte_ipd[[trt_var_ipd]] == trt_common, ]
    agd_trt <- tte_pseudo_ipd[tte_pseudo_ipd[[trt_var_agd]] == trt_agd, ]
    agd_common <- tte_pseudo_ipd[tte_pseudo_ipd[[trt_var_agd]] == trt_common, ]

    # positions in this list are referenced by index below; keep the order
    all_km <- list(
      kmobj_A = my_survfit(data = ipd_trt),
      kmobj_B = my_survfit(data = agd_trt),
      kmobj_A_adj = my_survfit(data = ipd_trt, weighted = TRUE),
      kmobj_C = my_survfit(data = ipd_common),
      kmobj_C_adj = my_survfit(data = ipd_common, weighted = TRUE),
      kmobj_C_agd = my_survfit(data = agd_common)
    )

    kmlist_combined <- list()
    if (km_layout %in% c("by_trial", "all")) {
      kmlist_1_2 <- list(
        setNames(
          all_km[c(4, 1, 3, 5)],
          c(trt_common, trt_ipd, paste0(trt_ipd, " (weighted)"), paste0(trt_common, " (weighted)"))
        ),
        setNames(all_km[c(6, 2)], c(trt_common, trt_agd))
      )
      names(kmlist_1_2) <- c(
        paste0("Kaplan-Meier Curves \n(", trt_ipd, " vs ", trt_common, ") in the IPD trial"),
        paste0("Kaplan-Meier Curves \n(", trt_agd, " vs ", trt_common, ") in the AgD trial")
      )
      kmlist_combined <- c(kmlist_combined, kmlist_1_2)
    }
    if (km_layout %in% c("by_arm", "all")) {
      kmlist_3_4 <- list(
        setNames(all_km[c(2, 1, 3)], c(trt_agd, trt_ipd, paste0(trt_ipd, " (weighted)"))),
        setNames(all_km[c(6, 4, 5)], paste(trt_common, c("(AgD)", "(IPD)", "(IPD,weighted)")))
      )
      names(kmlist_3_4) <- c(
        paste0("Kaplan-Meier Curves \n(", trt_ipd, " vs ", trt_agd, ")"),
        paste0("Kaplan-Meier Curves of Common Comparator \n", trt_common, "(IPD vs AgD Trial)")
      )
      kmlist_combined <- c(kmlist_combined, kmlist_3_4)
    }
    if (km_layout == "all") {
      # interleave the four panels: by_trial 1, by_arm 1, by_trial 2, by_arm 2
      kmlist_combined <- kmlist_combined[c(1, 3, 2, 4)]
    }

    # one plot per panel; the list names become the panel titles
    splots <- mapply(
      FUN = basic_kmplot2,
      kmlist = kmlist_combined,
      kmlist_name = lapply(kmlist_combined, names),
      main_title = names(kmlist_combined),
      MoreArgs = list(...),
      SIMPLIFY = FALSE
    )
    survminer::arrange_ggsurvplots(splots, nrow = 1 + (km_layout == "all"))
  }
}
152
153		#' Basic Kaplan Meier (KM) plot function using ggplot
154		#'
155		#' This function generates a basic KM plot using ggplot.
156		#'
157		#' @param kmlist a list of \code{survfit} object
158		#' @param kmlist_name a vector indicating the treatment names of each \code{survfit} object
159		#' @param endpoint_name a string, name of time to event endpoint, to be show in the
160		#' last line of title
161		#' @param show_risk_set logical, show risk set table or not, TRUE by default
162		#' @param main_title a string, main title of the KM plot
163		#' @param break_x_by bin parameter for \code{survminer}
164		#' @param censor indicator to include censor information
165		#' @param xlab label name for x-axis of the plot
166		#' @param xlim x limit for the x-axis of the plot
167		#' @param use_colors a character vector of length up to 4, colors to the KM curves,
168		#' it will be passed to 'col' of \code{lines()}
169		#' @param use_line_types a numeric vector of length up to 4, line type to the KM curves,
170		#' it will be passed to \code{lty} of \code{lines()}
171		#' @example inst/examples/basic_kmplot2_ex.R
172		#' @returns A Kaplan-Meier plot object created with `survminer::ggsurvplot()`.
173		#' @export
174
basic_kmplot2 <- function(kmlist,
                          kmlist_name,
                          endpoint_name = "Time to Event Endpoint",
                          show_risk_set = TRUE,
                          main_title = "Kaplan-Meier Curves",
                          break_x_by = NULL,
                          censor = TRUE,
                          xlab = "Time",
                          xlim = NULL,
                          use_colors = NULL,
                          use_line_types = NULL) {
  # Draw the supplied survfit objects on one survminer plot.
  if (!requireNamespace("survminer", quietly = TRUE)) stop("survminer package is required for this function")

  # fall back to the default line types / palette when none are supplied
  if (is.null(use_line_types)) use_line_types <- c(1, 1, 2, 2)
  if (is.null(use_colors)) use_colors <- c("#5450E4", "#00857C", "#6ECEB2", "#7B68EE")

  plot_title <- paste0(main_title, "\nEndpoint: ", endpoint_name)

  survminer::ggsurvplot(
    kmlist,
    combine = TRUE,
    conf.int = FALSE,
    # curves
    palette = use_colors,
    linetype = use_line_types,
    size = 0.2,
    censor = censor,
    censor.size = 2,
    # titles, axes and legend
    title = plot_title,
    xlab = xlab,
    ylab = endpoint_name,
    xlim = xlim,
    break.x.by = break_x_by,
    legend = c(0.85, 0.82),
    legend.title = "Treatment",
    legend.labs = kmlist_name,
    # risk table
    risk.table = show_risk_set,
    risk.table.y.text = FALSE,
    risk.table.y.text.col = TRUE,
    tables.theme = survminer::theme_cleantable(),
    # theme
    ggtheme = ggplot2::theme_classic(base_size = 10),
    fontsize = 3
  )
}

1		#' Bucher method for combining treatment effects
2		#'
3		#' Given two treatment effects of A vs. C and B vs. C
4		#' derive the treatment effects of A vs. B using the Bucher method.
5		#' Two-sided confidence interval and Z-test p-value are also calculated.
6		#' Treatment effects and standard errors should be in log scale
7		#' for hazard ratio, odds ratio, and risk ratio.
8		#' Treatment effects and standard errors should be in natural scale
9		#' for risk difference and mean difference.
10		#'
11		#' @param trt a list of two scalars for the study with the
12		#' experimental arm. `'est'` is the point estimate and `'se'`
13		#' is the standard error of the treatment effect.
14		#' For time-to-event data, `'est'` and `'se'` should be point estimate and
15		#' standard error of the log hazard ratio.
16		#' For binary data, `'est'` and `'se'` should be point estimate and
17		#' standard error of the log odds ratio, log risk ratio, or risk
18		#' difference.
19		#' For continuous data, `'est'` and `'se'` should be point estimate and
20		#' standard error of the mean difference.
21		#' @param com same as \code{trt}, but for the study with the
22		#' control arm
23		#' @param conf_lv a numerical scalar, prescribe confidence level to derive
24		#' two-sided confidence interval for the treatment effect
25		#'
26		#' @return a list with 5 elements,
27		#' \describe{
28		#' \item{est}{a scalar, point estimate of the treatment effect}
29		#' \item{se}{a scalar, standard error of the treatment effect}
30		#' \item{ci_l}{a scalar, lower confidence limit of a two-sided CI
31		#' with prescribed nominal level by \code{conf_lv}}
32		#' \item{ci_u}{a scalar, upper confidence limit of a two-sided CI
33		#' with prescribed nominal level by \code{conf_lv}}
34		#' \item{pval}{p-value of Z-test, with null hypothesis that
35		#' \code{est} is zero}
36		#' }
37		#' @export
38		#' @examples
39		#' trt <- list(est = log(1.1), se = 0.2)
40		#' com <- list(est = log(1.3), se = 0.18)
41		#' result <- bucher(trt, com, conf_lv = 0.9)
42		#' print(result, ci_digits = 3, pval_digits = 3)
bucher <- function(trt, com, conf_lv = 0.95) {
  # Indirect comparison: effect(trt) - effect(com), with the variances added.
  if (!isTRUE(is.finite(trt$est))) stop("trt$est is not valid: ", trt$est)
  if (!isTRUE(is.finite(trt$se))) stop("trt$se is not valid: ", trt$se)
  if (!isTRUE(is.finite(com$est))) stop("com$est is not valid: ", com$est)
  if (!isTRUE(is.finite(com$se))) stop("com$se is not valid: ", com$se)
  # reject NA / non-scalar / non-numeric levels here, with the same message,
  # instead of failing inside the comparison
  conf_lv_ok <- is.numeric(conf_lv) && length(conf_lv) == 1 && !is.na(conf_lv) &&
    conf_lv >= 0 && conf_lv <= 1
  if (!conf_lv_ok) stop("conf_lv must be in (0, 1): ", conf_lv)

  est <- trt$est - com$est
  se <- sqrt(trt$se^2 + com$se^2)

  z_crit <- stats::qnorm(0.5 + conf_lv / 2)
  ci_l <- est - z_crit * se
  ci_u <- est + z_crit * se

  # Two-sided Z-test p-value taken from the lower tail on both sides of zero;
  # `1 - pnorm(est)` rounds to exactly 0 for large positive effects.
  # min() keeps the degenerate est == 0, se == 0 case at 1.
  pval <- min(1, 2 * stats::pnorm(-abs(est), mean = 0, sd = se))

  outdata <- list(
    est = est,
    se = se,
    ci_l = ci_l,
    ci_u = ci_u,
    pval = pval
  )

  class(outdata) <- c("maicplus_bucher", "list")
  outdata
}
71
72
73		#' Calculate standard error from the reported confidence interval.
74		#'
75		#' Comparator studies often only report confidence interval of the
76		#' treatment effects. This function calculates standard error of the
77		#' treatment effect given the reported confidence interval.
78		#' For relative treatment effect (i.e. hazard ratio, odds ratio, and
79		#' risk ratio), the function would log the confidence interval.
80		#' For risk difference and mean difference,
81		#' we do not log the confidence interval.
82		#' The option to log the confidence interval is controlled
83		#' by `'log'` parameter.
84		#'
85		#' @param CI_lower Reported lower percentile value of the
86		#' treatment effect
87		#' @param CI_upper Reported upper percentile value of the
88		#' treatment effect
89		#' @param CI_perc Percentage of confidence interval reported
90		#' @param log Whether the confidence interval should be logged.
91		#' For relative treatment effect, log should be applied because
92		#' estimated log treatment effect is approximately normally distributed.
93		#' @return Standard error of log relative treatment effect if `'log'`
94		#' is true and standard error of the treatment effect if `'log'`
95		#' is false
96		#' @examples
97		#' find_SE_from_CI(CI_lower = 0.55, CI_upper = 0.90, CI_perc = 0.95)
98		#' @export
99
find_SE_from_CI <- function(CI_lower = NULL, CI_upper = NULL,
                            CI_perc = 0.95, log = TRUE) {
  # Back out the standard error from a reported two-sided normal-theory CI.
  if (CI_perc > 1 || CI_perc < 0) {
    stop("CI_perc has to be between 0 and 1")
  }

  if (is.null(CI_lower) || is.null(CI_upper)) {
    stop("Both CI_lower and CI_upper need to be specified")
  }

  if (!is.numeric(CI_lower) || !is.numeric(CI_upper)) {
    stop("Both CI_lower and CI_upper need to be numeric")
  }

  alpha <- 1 - CI_perc
  # the CI spans 2 * z * SE on the scale where the estimate is normal
  z_span <- 2 * stats::qnorm(1 - alpha / 2)

  # plain if/else rather than ifelse(): `log` is a scalar switch, and ifelse()
  # would cut vector-valued limits down to their first element
  if (log) {
    # `log` the argument is a logical; the call still resolves to base::log()
    (log(CI_upper) - log(CI_lower)) / z_span
  } else {
    (CI_upper - CI_lower) / z_span
  }
}
121
122		#' Print method for `maicplus_bucher` object
123		#'
124		#' @param x `maicplus_bucher` object
125		#' @param ci_digits an integer, number of decimal places for point
126		#' estimate and derived confidence limits
127		#' @param pval_digits an integer, number of decimal places to display
128		#' Z-test p-value
129		#' @param exponentiate whether the treatment effect and confidence
130		#' interval should be exponentiated. This applies to relative
131		#' treatment effects. Default is set to false.
132		#' @param ... not used
133		#' @describeIn bucher Print method for `maicplus_bucher` objects
134		#' @export
135
print.maicplus_bucher <- function(x, ci_digits = 2, pval_digits = 3,
                                  exponentiate = FALSE, ...) {
  # Format the estimate, CI and p-value with reformat() and print the result.
  formatted <- reformat(
    x,
    ci_digits = ci_digits,
    pval_digits = pval_digits,
    show_pval = TRUE,
    exponentiate = exponentiate
  )
  print(formatted)
}
143
144		#' Reformat `maicplus_bucher` alike object
145		#'
146		#' @param x a list, structured like a `maicplus_bucher` object
147		#' @param ci_digits an integer, number of decimal places for point
148		#' estimate and derived confidence limits
149		#' @param pval_digits an integer, number of decimal places to display
150		#' Z-test p-value
151		#' @param show_pval a logical value, default is TRUE. If FALSE, p-value will not
152		#' be output as the second element of the character vector
153		#' @param exponentiate whether the treatment effect and confidence
154		#' interval should be exponentiated. This applies to relative
155		#' treatment effects. Default is set to false.
156		#' @keywords internal
157
reformat <- function(x, ci_digits = 2, pval_digits = 3,
                     show_pval = TRUE, exponentiate = FALSE) {
  # Build "est[lower; upper]" and, if requested, a display p-value.

  # `exponentiate` is a scalar switch, so use if/else rather than ifelse()
  transform_this <- function(value) {
    if (exponentiate) exp(value) else value
  }
  # round, then pad to a fixed number of decimals
  fmt <- function(value) {
    format(round(transform_this(value), ci_digits), nsmall = ci_digits)
  }

  est_txt <- fmt(x$est)
  lower_txt <- fmt(x$ci_l)
  upper_txt <- fmt(x$ci_u)
  res <- paste0(est_txt, "[", lower_txt, "; ", upper_txt, "]")

  # p-values that round to zero are shown as "<" the smallest displayable value
  disp_pval <- round(x$pval, pval_digits)
  disp_pval <- ifelse(
    disp_pval == 0,
    paste0("<", 1 / (10^pval_digits)),
    format(disp_pval, nsmall = pval_digits)
  )

  if (show_pval) {
    output <- c(res, disp_pval)
    names(output) <- c("result", "pvalue")
  } else {
    output <- res
    names(output) <- "result"
  }

  output
}

# Restore the RNG back to a previous state using the global .Random.seed.
# `old_seed` is a previously captured value of `.Random.seed`, or NULL when no
# seed existed at capture time (in which case the global seed is removed).
set_random_seed <- function(old_seed) {
  if (is.null(old_seed)) {
    # Only remove the seed if one exists: rm() raises a warning for a missing
    # object, which happens when no random numbers were drawn in between.
    if (exists(".Random.seed", envir = globalenv(), inherits = FALSE)) {
      rm(".Random.seed", envir = globalenv(), inherits = FALSE)
    }
  } else {
    assign(".Random.seed", value = old_seed, envir = globalenv(), inherits = FALSE)
  }
}
9
construct_boot_data <- function(weighted_data, i = 1) {
  # Rebuilds `weighted_data$data` for the i-th bootstrap replicate stored in
  # the 3-d array `weighted_data$boot` (third dimension indexes replicates).
  boot_array <- weighted_data$boot
  if (is.null(boot_array)) stop("Must contain bootstrap results from estimate_weights()")

  i <- as.integer(i)
  n_replicates <- dim(boot_array)[3]
  out_of_range <- i < 1 || i > n_replicates
  if (out_of_range) stop("i must be integer between 1 and ", n_replicates)

  # Within a replicate, column 1 is used as the row index into the data and
  # column 2 as the weights for those rows.
  replicate_i <- boot_array[, , i]
  row_index <- replicate_i[, 1]
  boot_weights <- replicate_i[, 2]

  weighted_data$data <- weighted_data$data[row_index, ]
  weighted_data$data$weights <- boot_weights
  weighted_data$data$scaled_weights <- boot_weights / sum(boot_weights)
  weighted_data
}
22
23
transform_ratio <- function(object) {
  # Back-transforms a log-scale estimate, its standard error and confidence
  # limits to the ratio scale; all other elements are passed through.
  log_est <- object$est
  log_se <- object$se

  ratio <- object
  ratio$est <- exp(log_est)
  # log normal parameterization for SE
  ratio$se <- sqrt((exp(log_se^2) - 1) * exp(2 * log_est + log_se^2))
  ratio$ci_l <- exp(object$ci_l)
  ratio$ci_u <- exp(object$ci_u)
  ratio
}
33
transform_absolute <- function(object) {
  # Multiplies the estimate, standard error and confidence limits by 100;
  # all other elements are passed through.
  pct <- 100
  scaled <- object
  scaled$est <- object$est * pct
  scaled$se <- object$se * pct
  scaled$ci_l <- object$ci_l * pct
  scaled$ci_u <- object$ci_u * pct
  scaled
}

1		#' Helper function to retrieve median survival time from a `survival::survfit` object
2		#'
3		#' Extract and display median survival time with confidence interval
4		#'
5		#' @param km_fit returned object from \code{survival::survfit}
6		#' @param legend a character string, name used in 'type' column in returned data frame
7		#' @param time_scale a character string, 'years', 'months', 'weeks' or 'days', time unit of median survival time
8		#'
9		#' @examples
10		#' data(adtte_sat)
11		#' data(pseudo_ipd_sat)
12		#' library(survival)
13		#' combined_data <- rbind(adtte_sat[, c("TIME", "EVENT", "ARM")], pseudo_ipd_sat)
14		#' kmobj <- survfit(Surv(TIME, EVENT) ~ ARM, combined_data, conf.type = "log-log")
15		#'
16		#' # Derive median survival time
17		#' medSurv <- medSurv_makeup(kmobj, legend = "before matching", time_scale = "day")
18		#' medSurv
19		#' @return a data frame with a index column 'type', median survival time and confidence interval
20		#' @export
21
medSurv_makeup <- function(km_fit, legend = "before matching", time_scale) {
  time_scale <- match.arg(time_scale, choices = c("years", "months", "weeks", "days"))

  # summary()$table holds one row per stratum of the survfit object.
  surv_table <- summary(km_fit)$table

  # Columns 5 onwards are rescaled to the requested time unit.
  # NOTE(review): presumably these are the time-valued columns (restricted
  # mean, median and confidence limits) -- confirm against the survival
  # package's summary table layout.
  time_cols <- 5:ncol(surv_table)
  surv_table[, time_cols] <- get_time_as(surv_table[, time_cols], time_scale)

  # Size the label column by the number of strata rather than assuming
  # exactly two arms.
  label_cols <- data.frame(
    treatment = gsub("ARM=", "", rownames(surv_table)),
    type = rep(legend, length.out = nrow(surv_table))
  )

  out <- cbind(label_cols, surv_table)
  rownames(out) <- NULL

  out
}
39
40
41		#' Helper function to select set of variables used for Kaplan-Meier plot
42		#'
43		#' @param km_fit returned object from \code{survival::survfit}
44		#' @param single_trt_name name of treatment if no strata are specified in `km_fit`
45		#'
46		#' @examples
47		#' library(survival)
48		#' data(adtte_sat)
49		#' data(pseudo_ipd_sat)
50		#' combined_data <- rbind(adtte_sat[, c("TIME", "EVENT", "ARM")], pseudo_ipd_sat)
51		#' kmobj <- survfit(Surv(TIME, EVENT) ~ ARM, combined_data, conf.type = "log-log")
52		#' survfit_makeup(kmobj)
53		#' @return a list of data frames of variables from [survival::survfit()]. Data frame is divided by treatment.
54		#' @export
55
survfit_makeup <- function(km_fit, single_trt_name = "treatment") {
  # Collects the per-time-point components of a survfit-like object into a
  # data frame and splits it into one data frame per treatment.
  is_single <- !("strata" %in% names(km_fit))

  if (is_single) {
    # in case km_fit is only for single arm
    use_trt <- rep(single_trt_name, length(km_fit$time))
  } else {
    # `strata` gives the number of time points per stratum, in order. Index
    # every stratum present instead of assuming exactly two arms.
    use_trt <- rep(seq_along(km_fit$strata), times = km_fit$strata)
  }

  kmdat <- data.frame(
    time = km_fit$time,
    treatment = use_trt,
    n.risk = km_fit$n.risk,
    n.event = km_fit$n.event,
    censor = km_fit$n.censor,
    surv = km_fit$surv,
    lower = km_fit$lower,
    upper = km_fit$upper,
    cumhaz = km_fit$cumhaz
  )

  if (!is_single) {
    # Strata are named like "ARM=<label>"; keep the part after the "=".
    arm_labels <- vapply(
      strsplit(names(km_fit$strata), "="),
      function(parts) parts[[2]],
      character(1)
    )
    kmdat$treatment <- arm_labels[kmdat$treatment]
  }

  split(kmdat, f = kmdat$treatment)
}

1		#' Create pseudo IPD given aggregated binary data
2		#'
3		#' @param binary_agd a data.frame that take different formats depending on \code{format}
4		#' @param format a string, "stacked" or "unstacked"
5		#'
6		#' @return a data.frame of pseudo binary IPD, with columns USUBJID, ARM, RESPONSE
7		#' @example inst/examples/get_pseudo_ipd_binary_ex.R
8		#' @export
9
get_pseudo_ipd_binary <- function(binary_agd, format = c("stacked", "unstacked")) {
  # Resolve `format` to a single string so the documented default works and an
  # unknown value fails with a clear message.
  format <- match.arg(format)

  if (format == "stacked") {
    # pre check
    if (!is.data.frame(binary_agd)) {
      stop("stacked binary_agd should be data.frame with columns 'ARM', 'RESPONSE', 'COUNT'")
    }
    names(binary_agd) <- toupper(names(binary_agd))
    if (!all(c("ARM", "RESPONSE", "COUNT") %in% names(binary_agd))) {
      stop("stacked binary_agd should be data.frame with columns 'ARM', 'Response', 'Count'")
    }
    if (!is.logical(binary_agd$RESPONSE) && !all(toupper(binary_agd$RESPONSE) %in% c("YES", "NO"))) {
      stop("'RESPONSE' column in stacked binary_agd should be either logical vector or character vector of 'Yes'/'No'")
    }
    if (nrow(binary_agd) %% 2 != 0) {
      stop("nrow(binary_agd) is not even number, you may miss to provide 1 level of binary response to certain arm")
    }

    # pre process: 'Yes'/'No' labels become a logical response
    if (!is.logical(binary_agd$RESPONSE)) {
      binary_agd$RESPONSE <- toupper(binary_agd$RESPONSE) == "YES"
    }
    use_binary_agd <- binary_agd
  } else {
    # pre check
    if (!(is.data.frame(binary_agd) || is.matrix(binary_agd))) {
      stop("unstacked binary_agd should be either a 1x2 or 2x2 data frame or matrix")
    }
    if (ncol(binary_agd) != 2 || !nrow(binary_agd) %in% c(1, 2)) {
      stop("unstacked binary_agd should be either a 1x2 or 2x2 data frame or matrix")
    }
    bin_res <- sort(toupper(colnames(binary_agd)))
    if (!(identical(bin_res, c("FALSE", "TRUE")) || identical(bin_res, c("NO", "YES")))) {
      stop("column names of unstacked binary_agd should be either TRUE/FALSE or Yes/No")
    }

    # pre process: one row per arm, one column per response level. Going
    # through a matrix supports both data frame and matrix input.
    counts <- as.matrix(binary_agd)
    is_yes <- toupper(colnames(counts)) %in% c("YES", "TRUE")
    trt_names <- rownames(counts)
    if (is.null(trt_names)) trt_names <- as.character(seq_len(nrow(counts)))

    # t() makes the flattened counts run arm by arm, so each arm label stays
    # paired with that arm's own two counts.
    use_binary_agd <- data.frame(
      ARM = rep(trt_names, each = 2),
      COUNT = as.vector(t(counts)),
      RESPONSE = rep(is_yes, times = nrow(counts))
    )
  }

  # create pseudo binary IPD
  use_binary_agd$ARM <- factor(use_binary_agd$ARM, levels = unique(use_binary_agd$ARM))
  arm_levels <- levels(use_binary_agd$ARM)
  n_per_arm <- as.vector(tapply(use_binary_agd$COUNT, use_binary_agd$ARM, sum))
  # Responders are summed per arm, so the result does not depend on the order
  # in which the rows were supplied.
  n_yes_per_arm <- as.vector(
    tapply(use_binary_agd$COUNT * use_binary_agd$RESPONSE, use_binary_agd$ARM, sum)
  )

  # Within each arm, responders are listed first, then non-responders.
  response <- unlist(
    lapply(seq_along(arm_levels), function(ii) {
      rep(c(TRUE, FALSE), times = c(n_yes_per_arm[ii], n_per_arm[ii] - n_yes_per_arm[ii]))
    })
  )
  arm <- rep(arm_levels, times = n_per_arm)

  # output
  data.frame(
    USUBJID = paste0("pseudo_binary_subj_", seq_along(arm)),
    ARM = arm,
    RESPONSE = response
  )
}
87
88
89		#' Helper function to summarize outputs from glm fit
90		#'
91		#' @param binobj returned object from \code{stats::glm}
92		#' @param legend label to indicate the binary fit
93		#' @param weighted logical flag indicating whether weights have been applied in the glm fit
94		#' @returns A `data.frame` containing a summary of the number of events and subjects in a logistic
95		#' regression model.
96		#' @examples
97		#' data(adrs_sat)
98		#' pseudo_adrs <- get_pseudo_ipd_binary(
99		#' binary_agd = data.frame(
100		#' ARM = rep("B", 2),
101		#' RESPONSE = c("YES", "NO"),
102		#' COUNT = c(280, 120)
103		#' ),
104		#' format = "stacked"
105		#' )
106		#' pseudo_adrs$RESPONSE <- as.numeric(pseudo_adrs$RESPONSE)
107		#' combined_data <- rbind(adrs_sat[, c("USUBJID", "ARM", "RESPONSE")], pseudo_adrs)
108		#' combined_data$ARM <- as.factor(combined_data$ARM)
109		#' binobj_dat <- stats::glm(RESPONSE ~ ARM, combined_data, family = binomial("logit"))
110		#' glm_makeup(binobj_dat)
111		#' @export
glm_makeup <- function(binobj, legend = "before matching", weighted = FALSE) {
  # Per-arm counts are taken from the data stored on the fitted object.
  fit_data <- binobj$data
  arm_factor <- fit_data$ARM

  if (weighted) {
    # Weighted fit: "n" and "events" are sums of weights.
    n <- tapply(fit_data$weights, arm_factor, sum)
    n_event <- tapply(fit_data$weights * fit_data$RESPONSE, arm_factor, sum)
  } else {
    # Unweighted fit: plain subject and event counts.
    n <- tapply(fit_data$USUBJID, arm_factor, length)
    n_event <- tapply(fit_data$RESPONSE, arm_factor, sum)
  }

  data.frame(
    treatment = levels(arm_factor),
    type = legend,
    n = n,
    events = n_event,
    events_pct = n_event * 100 / n
  )
}

# Environment holding mutable package settings: the time conversion factors
# and the default time scale are stored here by set_time_conversion().
settings_env <- new.env()
3
4		#' Get and Set Time Conversion Factors
5		#'
6		#' @param default The default time scale, commonly whichever has factor = 1
7		#' @param days Factor to divide data time units to get time in days
8		#' @param weeks Factor to divide data time units to get time in weeks
9		#' @param months Factor to divide data time units to get time in months
10		#' @param years Factor to divide data time units to get time in years
11		#'
12		#' @return No value returned. Conversion factors are stored internally and used within functions.
13		#' @export
14		#' @rdname time_conversion
15		#'
16		#' @examples
17		#' # The default time scale is days:
18		#' set_time_conversion(default = "days", days = 1, weeks = 7, months = 365.25 / 12, years = 365.25)
19		#'
20		#' # Set the default time scale to years
21		#' set_time_conversion(
22		#' default = "years",
23		#' days = 1 / 365.25,
24		#' weeks = 1 / 52.17857,
25		#' months = 1 / 12,
26		#' years = 1
27		#' )
28		#'
set_time_conversion <- function(default = "days", days = 1, weeks = 7, months = 365.25 / 12, years = 365.25) {
  # Validates the requested default scale and the four conversion factors,
  # then stores them in `settings_env` for get_time_conversion() and
  # get_time_as() to read.
  valid_scales <- c("days", "weeks", "months", "years")
  if (!(is.character(default) && length(default) == 1L && default %in% valid_scales)) {
    stop("default must be one of \"days\", \"weeks\", \"months\", \"years\"")
  }

  # Keep the factors in a list while validating, so that one non-numeric or
  # non-scalar argument cannot coerce or rename the others.
  factors <- list(days = days, weeks = weeks, months = months, years = years)
  check_factors <- vapply(
    factors,
    function(x) is.numeric(x) && length(x) == 1L && is.finite(x) && x != 0,
    logical(1L)
  )
  if (!all(check_factors)) {
    stop(
      "Conversion factors must be finite non-zero numerical values: ",
      paste0(names(factors)[!check_factors], " = ", factors[!check_factors], collapse = ", ")
    )
  }

  # Stored as a named numeric vector so factors can be looked up by scale name.
  settings_env$time_conversion <- unlist(factors)
  settings_env$default_time_scale <- default
}
44
45
46		#' @param factor Time factor to get.
47		#' @rdname time_conversion
48		#' @export
49		#'
50		#' @examples
51		#' # Get time scale factors:
52		#' get_time_conversion("years")
53		#' get_time_conversion("weeks")
get_time_conversion <- function(factor = c("days", "weeks", "months", "years")) {
  # Returns the stored conversion factor(s) for the requested scale(s) as a
  # named numeric vector; several scales may be requested at once.
  factor <- match.arg(factor, several.ok = TRUE)

  # Look only inside `settings_env`. With inheritance enabled, an unrelated
  # object called `time_conversion` in an enclosing environment would be
  # mistaken for stored settings and the defaults would never be initialised.
  if (!exists("time_conversion", envir = settings_env, inherits = FALSE)) {
    warning("No time conversion factors previously set. Setting defaults.")
    set_time_conversion()
  }
  settings_env$time_conversion[factor]
}
62
63
64		#' Convert Time Values Using Scaling Factors
65		#'
66		#' @param times Numeric time values
67		#' @param as A time scale to convert to. One of "days", "weeks", "months", "years"
68		#'
69		#' @return Returns a numeric vector calculated from `times / get_time_conversion(factor = as)`
70		#' @export
71		#' @examples
72		#' get_time_as(50, as = "months")
get_time_as <- function(times, as = NULL) {
  # Without an explicit target scale, fall back to the stored default scale.
  if (is.null(as)) {
    as <- settings_env$default_time_scale
  }
  if (!is.numeric(times)) {
    stop("times arguments must be numeric")
  }
  target_scale <- match.arg(as, c("days", "weeks", "months", "years"))
  # Stored factors are divisors: data time units per unit of the target scale.
  divisor <- get_time_conversion(target_scale)
  times / divisor
}

# Load hook: initialise the time conversion settings with their defaults.
# `libname` and `pkgname` belong to the hook signature and are not used.
.onLoad <- function(libname, pkgname) {
  set_time_conversion()
}