02. Simulation of clinical trial data

To apply the models, simulated clinical trial data were used. The following code simulates recurrent event data for relapses in RRMS and SPMS patients. These data are then combined with baseline characteristics for RRMS and SPMS patients. The simulated data are saved as a CSV file for further use.

library(dplyr)
library(purrr)
library(tibble)
library(reda)
library(tidyr)
library(gtsummary)
library(writexl)
library(kableExtra)

# Fix the random number generator state so that every run of the script
# reproduces the same simulated data
set.seed(seed = 123)

# Simulate recurrent event data for a cohort of `n` individuals.
#
# For each individual, the number of candidate events is a normal draw
# (mean max_event / 2, sd max_event / 4) that is rounded and clamped to
# [1, max_event]. That many candidate event times are drawn independently from
# an exponential distribution with mean `mean_follow_up_time`. Times at or
# beyond `max_follow_up_time` are discarded and one final row at
# `max_follow_up_time` with status 0 is appended.
#
# Arguments:
#   n                    Number of individuals; ids run from 1 to n.
#   max_event            Upper bound on the candidate events per individual.
#   mean_follow_up_time  Mean of the exponential distribution of event times.
#   max_follow_up_time   Time of the final status-0 row; later events are
#                        dropped.
#   mstype, outcome      Stored unchanged in the `mstype` / `outcome` columns.
#
# Returns a tibble with columns id, status (1 for event rows, 0 for the final
# row of each individual), time, mstype and outcome, sorted by id and time.
# For n = 0 a zero-row tibble with the same columns is returned.
simulate_recurrent_event_data <- function(n, max_event, mean_follow_up_time, max_follow_up_time, mstype, outcome) {

  simulate_individual_data <- function(id) {

    # Number of candidate events, clamped to [1, max_event]
    n_candidates <- pmax(pmin(round(rnorm(1, max_event / 2, max_event / 4)), max_event), 1)

    # Candidate event times; only those before the end of follow-up are kept
    event_times <- rexp(n = n_candidates, rate = 1 / mean_follow_up_time)
    event_times <- event_times[event_times < max_follow_up_time]

    # The last row marks the end of follow-up and carries status 0
    all_times <- c(event_times, max_follow_up_time)

    tibble(
      id = id,
      status = c(rep(1, length(event_times)), 0),
      time = all_times,
      mstype = mstype,
      outcome = outcome
    )
  }

  # 1:n would give c(1, 0) for n = 0 and simulate two phantom individuals;
  # return an empty, correctly shaped tibble instead
  if (n == 0) {
    return(tibble(
      id = integer(),
      status = numeric(),
      time = numeric(),
      mstype = mstype[0],
      outcome = outcome[0]
    ))
  }

  # Simulate every individual, stack the results and order events in time
  map_dfr(seq_len(n), simulate_individual_data) %>%
    arrange(id, time)
}

# Generate the datasets for relapses in RRMS and SPMS patients.
# These parameters have been changed in order to find differences between
# groups, just for illustrative purposes.

# Cohort sizes, defined once so the id offset below cannot drift from them
n_rrms <- 791
n_spms <- 522

rrms_relapse_data <- simulate_recurrent_event_data(
  n = n_rrms,
  max_event = 10,
  mean_follow_up_time = 8,
  max_follow_up_time = 10,
  mstype = "RRMS",
  outcome = "RELAPSE"
)
spms_relapse_data <- simulate_recurrent_event_data(
  n = n_spms,
  max_event = 4,
  mean_follow_up_time = 5,
  max_follow_up_time = 10,
  mstype = "SPMS",
  outcome = "RELAPSE"
)

# Shift SPMS ids past the RRMS ids so that ids stay unique after binding
spms_relapse_data$id <- spms_relapse_data$id + n_rrms

# Bind datasets
df <- bind_rows(rrms_relapse_data,
                spms_relapse_data)

# Add baseline characteristics

# Generate simulated baseline characteristics for `n` RRMS patients.
#
# Returns a tibble with one row per patient and the columns:
#   id                    1..n
#   Age                   normal, mean 36.5, sd 9.1
#   Sex                   factor Male/Female, P(Female) = 0.70
#   Race                  factor "No white"/"White", P(White) = 0.88
#   Time_since_diagnosis  log-normal, median 2
#   EDSS_overall          log-normal, median 2
generate_rrms_data <- function(n) {
  tibble(
    # seq_len() keeps n = 0 valid (1:0 would yield c(1, 0))
    id = seq_len(n),
    Age = rnorm(n, mean = 36.5, sd = 9.1),
    # Explicit levels: without them factor() fails with "invalid 'labels'"
    # whenever the draw happens to contain only one distinct value
    Sex = factor(rbinom(n, 1, prob = 0.70),
                 levels = c(0, 1), labels = c("Male", "Female")),
    Race = factor(rbinom(n, 1, prob = 0.88),
                  levels = c(0, 1), labels = c("No white", "White")),
    Time_since_diagnosis = rlnorm(n, meanlog = log(2), sdlog = log(5/2)),
    EDSS_overall = rlnorm(n, meanlog = log(2), sdlog = log(3.5/1.5))
  )
}

# Generate simulated baseline characteristics for `n` SPMS patients.
#
# Returns a tibble with one row per patient and the columns:
#   id                    1..n
#   Age                   normal, mean 49.4, sd 8.1
#   Sex                   factor Male/Female, P(Female) = 0.63
#   Race                  factor "No white"/"White", P(White) = 0.96
#   Time_since_diagnosis  log-normal, median 14.5
#   EDSS_overall          log-normal, median 6
generate_spms_data <- function(n) {
  tibble(
    # seq_len() keeps n = 0 valid (1:0 would yield c(1, 0))
    id = seq_len(n),
    Age = rnorm(n, mean = 49.4, sd = 8.1),
    # Explicit levels: without them factor() fails with "invalid 'labels'"
    # whenever the draw happens to contain only one distinct value
    Sex = factor(rbinom(n, 1, prob = 0.63),
                 levels = c(0, 1), labels = c("Male", "Female")),
    Race = factor(rbinom(n, 1, prob = 0.96),
                  levels = c(0, 1), labels = c("No white", "White")),
    Time_since_diagnosis = rlnorm(n, meanlog = log(14.5), sdlog = log(22/7.8)), # approximate log-normal
    EDSS_overall = rlnorm(n, meanlog = log(6), sdlog = log(6.5/4.5)) # approximate log-normal
  )
}

# Simulate the baseline characteristics of both cohorts
rrms_data <- generate_rrms_data(n = 791)
spms_data <- generate_spms_data(n = 522)

# SPMS ids continue after the 791 RRMS ids, matching the event data
spms_data[["id"]] <- spms_data[["id"]] + 791

# Attach the baseline characteristics to every event row of the same patient
df_revents <- rrms_data %>%
  bind_rows(spms_data) %>%
  inner_join(df, by = "id")

# Preview the first ten rows as a styled table
df_revents[1:10, ] %>%
  kbl() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

id	Age	Sex	Race	Time_since_diagnosis	EDSS_overall	status	time	mstype	outcome
1	41.32254	Male	White	6.099722	0.3727128	1	0.2526189	RRMS	RELAPSE
1	41.32254	Male	White	6.099722	0.3727128	1	0.4496878	RRMS	RELAPSE
1	41.32254	Male	White	6.099722	0.3727128	1	2.5320097	RRMS	RELAPSE
1	41.32254	Male	White	6.099722	0.3727128	0	10.0000000	RRMS	RELAPSE
2	31.44477	Female	White	3.371547	6.0269468	1	0.2332276	RRMS	RELAPSE
2	31.44477	Female	White	3.371547	6.0269468	1	1.1621344	RRMS	RELAPSE
2	31.44477	Female	White	3.371547	6.0269468	1	1.5062723	RRMS	RELAPSE
2	31.44477	Female	White	3.371547	6.0269468	1	2.2481090	RRMS	RELAPSE
2	31.44477	Female	White	3.371547	6.0269468	1	2.8411302	RRMS	RELAPSE
2	31.44477	Female	White	3.371547	6.0269468	1	3.0169426	RRMS	RELAPSE

It is worth noting that data in this layout are not directly suitable for fitting recurrent event models (see the database layout section). However, the raw data can be used for the first steps of data exploration, as shown in the following section: descriptive analyses.

Quentin Pilard