Code

I post the code I wrote in various projects here for reference. If there is any problem or advice, feel free to contact me at my email (jing.zhang17@case.edu).

Contents

Statistical Analyses in Clinical Trials


Data

Synthetic data (Download) is used to illustrate results. Some data may not be meaningful.

There are 159 observations, 64 of which are in Group0, the control group, and 95 of which are in Group1, the experiment group.

The 20 variables are as follows.

Variable name Notes
id identity number
group in Group0 or Group1
age continuous variable
agegroup discrete variable
bmi body mass index, continuous variable
sex discrete variable
ecog Eastern Cooperative Oncology Group performance, discrete variable
stage cancer stage, discrete variable
class cancer classification, discrete variable
week treatment duration in weeks, continuous variable
cycle number of treatment cycles
dose dose intensity %
totaldose total cumulative dose mg
pfsyear progression-free survival in years
pfscensor progression-free censor with 0 equaling to censored and 1 equaling to event
response types of status which include complete response, partial response, stable disease and progressive disease
doryear duration of response for patients with complete or partial response
dorcensor duration of response censor with 0 equaling to censored and 1 equaling to event
adverse1 grade of adverse event 1, such as nausea, with 0 equaling to not experiencing adverse event 1
adverse2 grade of adverse event 1, such as nausea, with 0 equaling to not experiencing adverse event 1

Read data

dt<-read.csv("https://jingzhang1.github.io/assets/clinical%20trial%20project/example_data.csv")
head(dt)
##       id group age agegroup  bmi sex ecog stage class week cycle dose
## 1 688265     0  61        1 20.6   1    0     2     1 87.7     2 84.8
## 2 691261     0  44        1 21.2   1    1     2     1 93.2     2 80.6
## 3 754859     0  71        2 20.2   1    0     2     2 78.4     2 74.1
## 4 687411     0  56        1 26.7   1    2     2     1 68.0     2 52.7
## 5 689438     0  73        2 22.0   1    2     2     1 90.3     2 89.9
## 6 670733     0  60        1 20.7   2    0     2     1 67.8     2 86.9
##   totaldose pfsyear pfscensor response doryear dorcensor adverse1 adverse2
## 1      1696     4.6         1        3      NA        NA        1        0
## 2      1612     4.6         1        3      NA        NA        1        0
## 3      1482     2.8         1        3      NA        NA        1        2
## 4      1054     4.7         1        3      NA        NA        2        2
## 5      1798     1.5         0        3      NA        NA        0        3
## 6      1738     5.2         1        3      NA        NA        2        1

Study Summaries


Treatment exposure and dose intensity


library("tableone")
listVars<-c("week","cycle","dose","totaldose")
table1 <- CreateContTable(vars = listVars, data = dt,strata = "group", test = F)
###mean sd
table2 <- print(table1,quote = F,varLabels = T,printToggle=F, noSpaces = TRUE)
table2<-as.data.frame(table2)
###median range
table3 <- print(table1,nonnormal =listVars,minMax=T,quote = F,varLabels = T,printToggle=F,  noSpaces = TRUE)
table3<-as.data.frame(table3)

###week proportion
#cutoff:52,78,104,156
dt$week52<-ifelse(dt$week>52,1,0)
dt$week78<-ifelse(dt$week>78,1,0)
dt$week104<-ifelse(dt$week>104,1,0)
dt$week156<-ifelse(dt$week>156,1,0)
listVars<-c("week52","week78","week104","week156")
table4 <- CreateCatTable(vars = listVars, data = dt,strata = "group", test = F,includeNA=T)
table5<-print(table4,showAllLevels = F,quote = F,varLabels = T,printToggle=F, noSpaces = TRUE)
table5<-as.data.frame(table5)

tablesum<-rbind(table2,table3[-1,],table5[-1,])

# Write to RTF files
library(dplyr)
tablesum <- tablesum %>% 
  mutate(Characteristic = rownames(.)) %>% 
  select(Characteristic, everything()) %>% 
  mutate_all(trimws)
library(rtf)
rtffile <- RTF("Treatment Exposure and Dose Intensity.rtf") 
addHeader(rtffile, "Treatment Exposure and Dose Intensity")
#addParagraph(rtffile, "Ouput for table:\n")
addTable(rtffile, tablesum, NA.string="")
done(rtffile)

Characteristics of the patients at baseline


library("tableone")
listVars <- c("age","agegroup","bmi","sex","ecog","stage","class")
factorVars<-c("agegroup","sex","ecog","stage","class")
table1 <- CreateTableOne(vars = listVars, data = dt, factorVars = factorVars,strata = "group",test=F)
###mean sd
table2 <- print(table1, showAllLevels = TRUE,quote = F,varLabels = T,printToggle=F, noSpaces = TRUE)
table2<-as.data.frame(table2)
###median range
table3 <- print(table1, showAllLevels = TRUE,quote = F,varLabels = T,nonnormal=c("age","bmi"), minMax=T, printToggle=F, noSpaces = TRUE)
table3<-as.data.frame(table3)

# Write to RTF files
library(dplyr)
table3 <- table3 %>% 
  mutate(Characteristic = rownames(.)) %>% 
  select(Characteristic, everything()) %>% 
  mutate_all(trimws)
library(rtf)
rtffile <- RTF("Characteristics of the patients at baseline.rtf") 
addHeader(rtffile, "Characteristics of the patients at baseline")
addTable(rtffile, table3, NA.string="")
done(rtffile)

Efficacy Analyses


Survival analysis for the primary endpoint such as progression-free survival


library("survival")
library("survminer")
fit <- survfit(Surv(pfsyear, pfscensor) ~ group, data = dt) 
km<-ggsurvplot(fit,                     # survfit object with calculated statistics.
           censor=T,
           pval = F,                    # show p-value of log-rank test.
           conf.int = F,                # show confidence intervals for point estimaes of survival curves.
           surv.median.line = "hv",     # add the median survival pointer.
           xlim=c(0,7),
           xlab = "Time in years",      # customize X axis label.
           break.time.by = 1,           # break X axis in time intervals by 200.
           risk.table = "absolute",     # absolute number and percentage at risk.
           risk.table.y.text.col = T,   # colour risk table text annotations.
           risk.table.y.text = T,       # show bars instead of names in text annotations in legend of risk table.
           ncensor.plot = TRUE,         # plot the number of censored subjects at time t
           legend.labs =c("Group0", "Group1"), 
           palette =c("#E7B800", "#2E9FDF"),   
           ggtheme =theme_bw()+theme(plot.background = element_blank(),
                                     panel.grid.major = element_blank(),
                                     panel.grid.minor = element_blank()) 
)

###stratified cox model HR
model<-coxph(Surv(pfsyear,pfscensor)~group+strata(stage,class),data=dt)
a<-summary(model)  
hrci<-paste(round(a$conf.int[1],2)," (95%CI, ",round(a$conf.int[3],2),"-",round(a$conf.int[4],2),")",sep="")

###stratified log rank
survdiff(Surv(pfsyear, pfscensor) ~ group + strata(stage,class), data=dt, rho = 0 )#0.475 
p<-"P=0.475"

###Summary of survival
summary(fit)
summary(fit)$table
text0<-paste(paste(" "," "," "," ",sep  = "\n"),"Group0","Group1",sep  = "\n")
text1<-paste(paste(" ","No. of Events/","No. of Patients"," ",sep  = "\n"),"43/64","69/95",sep  = "\n")
text2<-paste(paste("Median", "Progression-free", "Survival (95% CI)","(year)",sep  = "\n"),"4.2 (3.5-4.6)","3.0 (2.7-3.9)",sep  = "\n")
text3<-paste(paste("4-year Rate of", "Progression-free Survival", "(95% CI)","(%)",sep  = "\n"),"54.5 (41.5-71.5)","27.9 (19.1-40.9)",sep  = "\n")
cat(text1)  
cat(text2)         
cat(text3)  
km$plot<-km$plot+annotate("text", x=1, y=0.1,label=paste("Stratified hazardratio for progression",hrci,p,sep  = "\n"))+
  annotate("text", x=3.5, y=0.9,label=text0)+               
  annotate("text", x=4.2, y=0.9,label=text1)+
  annotate("text", x=5.2, y=0.9,label=text2)+
  annotate("text", x=6.5, y=0.9,label=text3)

km1<-list()
km1[[1]]<-km
km2<-arrange_ggsurvplots(km1, print = F,ncol = 1, nrow = 1,
                         surv.plot.height=0.7,
                         risk.table.height = 0.15,
                         ncensor.plot.height =0.15)
ggsave(filename="Kaplan-Meier plot.jpeg", km2, width=26, height=23, dpi=500, units="cm")

Forest-plot analyses of progression-free survival in key subgroups


library("survival")
sex1<-coxph(Surv(pfsyear,pfscensor)~group,subset={sex==1},data=dt) 
sex2<-coxph(Surv(pfsyear,pfscensor)~group,subset={sex==2},data=dt) 

ageg1<-coxph(Surv(pfsyear,pfscensor)~group,subset={agegroup==1},data=dt)
ageg2<-coxph(Surv(pfsyear,pfscensor)~group,subset={agegroup==2},data=dt)

ecog0<-coxph(Surv(pfsyear,pfscensor)~group,subset={ecog==0},data=dt)
ecog1<-coxph(Surv(pfsyear,pfscensor)~group,subset={ecog==1},data=dt)
ecog2<-coxph(Surv(pfsyear,pfscensor)~group,subset={ecog==2},data=dt)


coef<-c(0.0366,0.672,-0.0353,0.463,0.715,0.141,-0.393)  ##coefficient in cox (not HR)
se<-c(0.2334,0.435,0.2492,0.374,0.332,0.354,0.398)  ##se(coef) in cox 

datacoef<-data.frame(group=c("Sex","  Male","  Female","Age","  <65yr","  ≥60yr","Ecog Score","  0","  1","  2"),
                   coef=c(NA,0.0366,0.672,NA,-0.0353,0.463,NA,0.715,0.141,-0.393),
                   se=c(NA,0.2334,0.435,NA,0.2492,0.374,NA,0.332,0.354,0.398),
                   subgroup=c("Sex","Sex","Sex","Age","Age","Age","Ecog Score","Ecog Score","Ecog Score","Ecog Score"))

###no.death/no.total
addmargins(xtabs(~sex+pfscensor+group,data=dt))
addmargins(xtabs(~agegroup+pfscensor+group,data=dt))
addmargins(xtabs(~ecog+pfscensor+group,data=dt))

grp0<-c("","33/50","10/14", "","32/43","11/21", "","17/21","14/27","12/16")
grp1<-c("","54/68","15/27", "","45/64","24/31", "","24/35","25/32","20/28")

datacoef<-cbind(datacoef,grp0,grp1)

library(meta)
metaba<-metagen(coef,se,data=datacoef,sm="HR",studlab=group,
                label.e="group=0",label.c="group=1",
                byvar=subgroup,print.byvar = TRUE)
plot<-forest(metaba,
       leftcols=c("group","grp0","grp1"),
       leftlabs=c("Subgroup", "Group 0", "Group 1"),
       just.addcols.left=c("left","center","center"),
       rightcols=c("effect.ci"),
       col.square="blue",
       squaresize=0.7,
       xlim=c(0.3, 5),
       label.right="Group=0 better", col.label.right="black",
       label.left="Group=1 better", col.label.left="black",
       ff.lr="bold",
       weight.study="same",
       ref=1,print.I2=FALSE,print.tau2=FALSE,
       comb.fixed=FALSE,print.Q.subgroup=FALSE,comb.random=FALSE)

Summaries of response rate, duration of response and disease progression


library("tableone")
table1 <- CreateCatTable(vars = "response", data = dt, strata = "group",test=F)
table2 <- print(table1, showAllLevels = TRUE,quote = F,varLabels = T,format = "f")
table2<-as.data.frame(table2)
table2<-table2[,-1]
rownames(table2)<-c("Total","CR","PR","SD","PD")
colnames(table2)<-c("Group0","Group1")
table2$Group0<-as.numeric(as.character(table2$Group0))
table2$Group1<-as.numeric(as.character(table2$Group1))
table2["OR",1]<-table2["CR",1]+table2["PR",1]
table2["OR",2]<-table2["CR",2]+table2["PR",2]

table2[,c(3:8)]<-NA
for (i in 1:5){
  for (j in 1:2){
    table2[i+1,j+2]<-round(table2[i+1,j]/table2[1,j]*100,1) 
    table2[i+1,j+4]<-round(binom.test(table2[i+1,j], table2[1,j])$conf.int[1]*100,1) #Clopper and Pearson 
    table2[i+1,j+6]<-round(binom.test(table2[i+1,j], table2[1,j])$conf.int[2]*100,1)
  }
}

table2[,c(9:10)]<-NA
for (i in 1:5){
  for (j in 1:2){
    table2[i+1,j+8]<-paste(table2[i+1,j]," (",table2[i+1,j+2]," [",table2[i+1,j+4],"-",table2[i+1,j+6],"] )",sep="") 
  }
}
table2[1,9]<-table2[1,1]
table2[1,10]<-table2[1,2]
table2[,c(1:8)]<-NULL

###DOR,CENSOR
library("survival")
fitdor <- survfit(Surv(doryear, dorcensor) ~ group, data = dt) 
dorsum<-summary(fitdor)$table

for (i in 1:2){
  table2["DOR",i]<-paste(dorsum[i,"median"]," (",dorsum[i,"0.95LCL"],"-",dorsum[i,"0.95UCL"],")",sep="")
  table2["CENSOR",i]<-paste(dorsum[i,"records"]-dorsum[i,"events"]," (",
                            round((dorsum[i,"records"]-dorsum[i,"events"])/dorsum[i,"records"]*100,1),")",sep="")
}

table2<-table2[c("Total","OR","CR","PR","SD","PD","DOR","CENSOR"),]

dt$response1<-ifelse(dt$response<3,1,ifelse(dt$response>2,2,NA))
dt$strata<-interaction(dt$stage,dt$class)
mytable <- xtabs(~group+response1+strata, data=dt)
a<-mantelhaen.test(mytable)
a$p.value

addmargins(table(dt$response1,dt$group),1)
prop.table(table(dt$response1,dt$group),2)
ord<-0.141-0.568
ordse<-sqrt(0.141*(1-0.141)/64+0.568*(1-0.568)/95)
ordlow<-round(ord-1.96*ordse,3)
ordup<-round(ord+1.96*ordse,3)

table2[2,3]<-paste(ord*100," (",ordlow*100,"-",ordup*100,")",sep="")
table2[2,4]<-ifelse(a$p.value<0.001,"<0.001",round(a$p.value,3))
colnames(table2)<-c("Group0","Group1","Difference (95%CI)","P value")

# Write to RTF files
library(dplyr)
table2 <- table2 %>% 
  mutate(Characteristic = rownames(.)) %>% 
  select(Characteristic, everything()) %>% 
  mutate_all(trimws)
library(rtf)
rtffile <- RTF("Summaries of response rate, duration of response and disease progression.rtf") 
addHeader(rtffile, "Summaries of response rate, duration of response and disease progression")
addTable(rtffile, table2,NA.string="")
done(rtffile)

Kaplan-Meier analysis of duration of response


library("survival")
library("survminer")
fitdor <- survfit(Surv(doryear, dorcensor) ~ group, data = dt)  
dor<-ggsurvplot(fitdor,                     # survfit object with calculated statistics.
               censor=T,
               pval = F,                    # show p-value of log-rank test.
               conf.int = F,                # show confidence intervals for point estimaes of survival curves.
               surv.median.line = "hv",     # add the median survival pointer.
               xlim=c(0,4),
               xlab = "Time in years",      # customize X axis label.
               ylab="Proportion with Ongoing Response (%)",
               break.time.by = 0.5,         # break X axis in time intervals by 200.
               risk.table = "absolute",     # absolute number and percentage at risk.
               ncensor.plot = F,            # plot the number of censored subjects at time t
               legend.labs =c("Group0", "Group1"),   
               palette =c("#E7B800", "#2E9FDF"),
               ggtheme =theme_bw()+theme(plot.background = element_blank(),
                                         panel.grid.major = element_blank(),
                                         panel.grid.minor = element_blank()) 
)
###unstratified cox model HR
model<-coxph(Surv(doryear,dorcensor)~group,data=dt)
a<-summary(model)  
hrci<-paste(round(a$conf.int[1],2)," (95%CI, ", round(a$conf.int[3],2), "-", round(a$conf.int[4],2),")", sep="")

dorsum<-summary(fitdor)$table
text0<-paste(paste(" "," ",sep  = "\n"),"Group0","Group1",sep  = "\n")
text1<-paste(paste("No. of Events/","No. of Patients",sep  = "\n"),"6/9","40/54",sep  = "\n")
text2<-paste(paste("Median DOR", "(95% CI)(year)",sep  = "\n"),table2["DOR",1],table2["DOR",2],sep  = "\n")
cat(text1)  
cat(text2)      

dor$plot<-dor$plot+annotate("text", x=3, y=1,label=paste("Unstratified hazard ratio, ",hrci,sep  = ""))+
  annotate("text", x=2.5, y=0.85,label=text0)+               
  annotate("text", x=3.0, y=0.85,label=text1)+
  annotate("text", x=3.5, y=0.85,label=text2)
#km
dor1<-list()
dor1[[1]]<-dor
dor2<-arrange_ggsurvplots(dor1, print = F,ncol = 1, nrow = 1,
                         surv.plot.height=0.7,
                         risk.table.height = 0.2)
ggsave(filename="Duration of response plot.jpeg",
       dor2,width=26,height=23,dpi=500,units="cm")

Safety Analyses


Summaries of key adverse events


library("tableone")
###adverse event any grade, redefine a variable
dt$adverse1l<-ifelse(dt$adverse1>0,1,ifelse(dt$adverse1==0,0,NA))
dt$adverse2l<-ifelse(dt$adverse2>0,1,ifelse(dt$adverse2==0,0,NA))
listVars<-c("adverse1l","adverse2l")
table1 <- CreateCatTable(vars = listVars, data = dt, strata = "group",test=F)
table2 <- print(table1, showAllLevels = F,quote = F,varLabels = T)
table2<-as.data.frame(table2)
colnames(table2)<-c("Group0_Anygrade","Group1_Anygrade")
rownames(table2)<-c("Total","Adverse1","Adverse2")

###adverse event grade>3,4, redefine a variable
dt$adverse1h<-ifelse(dt$adverse1>2,1,ifelse(dt$adverse1<3,0,NA))
dt$adverse2h<-ifelse(dt$adverse2>2,1,ifelse(dt$adverse2<3,0,NA))
listVars<-c("adverse1h","adverse2h")
table3 <- CreateCatTable(vars = listVars, data = dt, strata = "group",test=F)
table4 <- print(table3, showAllLevels = F,quote = F,varLabels = T)
table4<-as.data.frame(table4)
colnames(table4)<-c("Group0_grade>3,4","Group1_grade>3,4")
rownames(table4)<-c("Total","Adverse1","Adverse2")

table5<-cbind(table2,table4)
table5<-table5[,c("Group0_Anygrade","Group0_grade>3,4","Group1_Anygrade","Group1_grade>3,4")]

# Write to RTF files
library(dplyr)
table5 <- table5 %>% 
  mutate(Characteristic = rownames(.)) %>% 
  select(Characteristic, everything()) %>% 
  mutate_all(trimws)
library(rtf)
rtffile <- RTF("Summaries of key adverse events .rtf") 
addHeader(rtffile, "Summaries of key adverse events ")
addTable(rtffile, table5, NA.string="")
done(rtffile)