8.Survival Analysis: Non-parametric, Semi-parametric, Parametric#

Hide code cell source
// Comments and Indentation by GPT-4o
// Load the dataset based on the operating system
if c(os) == "MacOSX" {
    use wk7output/student_pressure, clear  
}
else {
    use wk7output\student_pressure, clear 
} 

// Set the seed for reproducibility
set seed 340600

// Generate a withdrawal variable with a 10% probability
g withdraw = rbinomial(1, .1)

// Sort the dataset by student_id and session_date
sort student_id session_date

// List records for student_ids 2 to 5
list if inrange(student_id, 2, 5)

// Generate a withdrawal date variable
g withdraw_dt = session_date if withdraw==1

// Summarize session_date
sum session_date

// Format withdraw_dt as a date
format withdraw_dt %td

// List records for student_ids 2 to 5
list if inrange(student_id, 2, 5)

// Generate an end date as the minimum withdrawal date by student_id
by student_id: egen end = min(withdraw_dt)

// Format end date
format end %td

// List records for student_ids 2 to 5
list if inrange(student_id, 2, 5)

// Tabulate the end date with missing values
tab end, mi

// Summarize session_date
sum session_date 

// List records for student_ids 2 to 5
list if inrange(student_id, 2, 5)

// Replace missing end dates with the maximum date
replace end=r(max) if missing(end)

// Tabulate the end date with missing values
tab end, mi

// Generate a begin date as the minimum session date by student_id
by student_id: egen begin = min(session_date)

// Format begin date
format begin %td

// List records for student_ids 2 to 5
list if inrange(student_id, 2, 5)

// Ensure withdrawal continuity by replacing the next record's withdrawal status
by student_id: replace withdraw = 1 if withdraw[_n-1]==1  

// List records for student_ids 2 to 5
list if inrange(student_id, 2, 5)

/* --- insertion ---*/
// Generate ineligibility indicator
bys student_id: egen inelig=min(withdraw)

// List ineligible students
l if inelig==1

// Drop ineligible students
drop if inelig==1 

// Show the codebook for student_id
codebook student_id 

// Keep selected variables
keep student_id session sbp withdraw end begin 

// Order the variables
order student_id session begin end withdraw sbp 
/* --- continue ---*/

// Set the survival time settings
stset end, fail(withdraw) enter(begin) origin(begin) scale(7.0242308)

// Generate a stressed variable based on systolic blood pressure
g stressed = sbp>140

#delimit ;   
// Plot survival curves
sts graph, 
    fail per(100)
    ylab(,format(%2.0f))
    by(stressed)
    tmax(7)
    xlab(0(1)7)
    xti("Week")
    ti("Stata I dropout rate, %")
    /*
    legend(
        order(1 2)
        lab(1 "SBP>140mmHg")
        lab(2 "SBP<140mmHg")
        ring(0)
        pos(11)
    )
    */
    risktable
    note("Source: From Simulated Data",
        pos(7)
    )
; 
#delimit cr 

// Export the graph based on the operating system
if c(os) == "MacOSX" {
    graph export wk7output/class_attrition_wk8.png, replace  
}
else {
    graph export wk7output\class_attrition_wk8.png, replace 
} 

// Calculate absolute risks
sts list, fail by(stressed) at(2 4 6) saving(km, replace ) 
preserve  
    use km, clear  
    replace failure=failure*100
    
    // Calculate mean failure rates at different time points for non-stressed students
    sum failure if stressed==0 & time==2
    local wk2: di %3.2f r(mean)
    sum failure if stressed==0 & time==4
    local wk4: di %3.2f r(mean)
    sum failure if stressed==0 & time==6
    local wk6: di %3.2f r(mean)
    
    // Calculate mean failure rates at different time points for stressed students
    sum failure if stressed==1 & time==2
    local swk2: di %3.2f r(mean)
    sum failure if stressed==1 & time==4
    local swk4: di %3.2f r(mean)
    sum failure if stressed==1 & time==6 
    local swk6: di %3.2f r(mean)
restore 

// Perform log-rank test for nonparametric hypothesis testing
sts test stressed, logrank 

// Capture the chi-squared value and degrees of freedom
local chi2_value = r(chi2)
local df = r(df)

// Calculate the p-value
local p = chi2tail(`df', `chi2_value')
di `p'

qui {    
    if `p' < 0.01 {
        local p: di "p < 0.01"
    }
    else if inrange(`p',0.01,0.05) {
        local p: di %3.2f `p'
    }
    else {
        local p: di %2.1f `p'
    }
    
    if `p' < .05 {
        noi di "Withdraw rates for Stata I at weeks 2, 4 and 6 were `wk2'%, `wk4'%, `wk6'% for students with SBP<140mmHg "
        noi di "For students with SBP>140mmHg, the rates were `swk2'%, `swk4'%, `swk6'%"
        noi di "There's an association between SBP>140mmHg and withdrawal from Stata I: p=`p'"
    }
    else {
        noi di "Withdraw rates for Stata I at weeks 2, 4 and 6 were `wk2'%, `wk4'%, `wk6'% for students with SBP<140mmHg "
        noi di "For students with SBP>140mmHg, the rates were `swk2'%, `swk4'%, `swk6'%"
        noi di "There's no association between SBP>140mmHg and withdrawal from Stata I: p=`p'"
    }
} 

// List the first record
list in 1

// Check if the data is nested
capture isid student_id
if _rc !=0 {
    di "These data are nested and each student has several records"    
}
else {
    di "There's only a single record per student"
}

// Perform Cox regression analysis
stcox stressed

// List results
ereturn list 
return list

// Store the results matrix
matrix m = r(table)
matrix list m

// Perform linear combination of coefficients
lincom stressed 
return list

qui {  
    local p = r(p)
    local hr : di %3.2f exp(r(estimate))
    local lb : di %3.2f exp(r(lb))
    local ub : di %3.2f exp(r(ub))
    if r(p) < 0.01 {
        local p: di "p < 0.01"
    }
    else if inrange(r(p),0.01,0.05) {
        local p: di %3.2f r(p)
    }
    else {
        local p: di %2.1f r(p)
    }
    if `p' < .05 {
        noi di "There's an association between SBP>140mmHg and withrawal from Stata I: HR=`hr', 95%CI: `lb'-`ub', p=`p'"
    }
    else {
        noi di "There's no association between SBP>140mmHg and withrawal from Stata I: HR=`hr', 95%CI: `lb'-`ub', p=`p'"
    }
} 

// Generate a representation tag for each student
egen representation = tag(student_id)

// List the first 10 records
l in 1/10 

// Keep only unique records for each student
keep if representation==1

// Count the number of unique records
count

// Check if the data is nested
capture isid student_id
if _rc !=0 {
    di "These data are nested and each student has several records"    
}
else {
    di "There's only a single record per student"
}
Hide code cell output
file wk7output\student_pressure.dta not found
r(601);
r(601);
r(601);

8.1 Nonparametric#

GPT-4o annotation:#

This Stata script analyzes student data for dropout rates and survival analysis, including nonparametric hypothesis testing. Here’s a breakdown of each part of the script:

  1. Loading Data Based on OS:

    • The script first checks the operating system (OS) and loads the appropriate dataset:

      if c(os) == "MacOSX" {
          use wk7output/student_pressure, clear  
      }
      else {
          use wk7output\student_pressure, clear 
      } 
      
  2. Setting Seed and Creating Withdrawal Variable:

    • Sets a seed for reproducibility and creates a binary variable withdraw to indicate whether a student withdraws:

      set seed 340600
      g withdraw = rbinomial(1, .1)
      
  3. Sorting and Listing Data:

    • Sorts the data by student_id and session_date and lists a subset of the data:

      sort student_id session_date
      list if inrange(student_id, 2, 5)
      
  4. Generating Withdrawal Date and Summary Statistics:

    • Creates a variable withdraw_dt to record the session date if the student withdraws and provides summary statistics:

      g withdraw_dt = session_date if withdraw==1
      sum session_date
      
  5. Formatting and Listing Data:

    • Formats the withdraw_dt variable and lists the data again:

      format withdraw_dt %td
      list if inrange(student_id, 2, 5)
      
  6. Generating and Formatting End Date:

    • Creates an end variable to record the earliest withdrawal date for each student and formats it:

      by student_id: egen end = min(withdraw_dt)
      format end %td
      list if inrange(student_id, 2, 5)
      
  7. Handling Missing End Dates:

    • Replaces missing end dates with the maximum session date and tabulates the end variable:

      replace end=r(max) if missing(end)
      tab end, mi
      
  8. Generating and Formatting Begin Date:

    • Creates a begin variable for the earliest session date for each student and formats it:

      by student_id: egen begin = min(session_date)
      format begin %td
      list if inrange(student_id, 2, 5)
      
  9. Ensuring Sequential Withdrawals:

    • Ensures that once a student withdraws, they are marked as withdrawn in subsequent records:

      by student_id: replace withdraw = 1 if withdraw[_n-1]==1  
      list if inrange(student_id, 2, 5)
      
  10. Marking and Dropping Ineligible Students:

    • Marks ineligible students (those who withdraw) and drops them from the dataset:

      bys student_id: egen inelig=min(withdraw)
      l if inelig==1
      drop if inelig==1 
      
  11. Preparing for Survival Analysis:

    • Keeps relevant variables and orders them, then sets up the survival data structure:

      codebook student_id 
      keep student_id session sbp withdraw end begin 
      order student_id session begin end withdraw sbp 
      stset end, fail(withdraw) enter(begin) origin(begin) scale(7.0242308)
      
  12. Creating a Stress Indicator:

    • Creates a stressed variable indicating high systolic blood pressure (SBP):

      g stressed = sbp>140
      
  13. Generating Survival Graph:

    • Generates a survival graph with failure percentages, stratified by the stressed variable, and exports the graph as a PNG file:

      #delimit ;   
      sts graph, 
          fail per(100)
          ylab(,format(%2.0f))
          by(stressed)
          tmax(7)
          xlab(0(1)7)
          xti("Week")
          ti("Stata I dropout rate, %")
          /*
          legend(
              order(1 2)
              lab(1 "SBP>140mmHg")
              lab(2 "SBP<140mmHg")
              ring(0)
              pos(11)
          )
          */
          risktable
          note("Source: From Simulated Data",
              pos(7)
          )
      ; 
      #delimit cr 
      graph export wk7output/class_attrition_wk8.png, replace 
      sts test
      
  14. Checking Data Structure:

    • Checks if the data is nested by student and displays a message accordingly:

      list in 1
      capture isid student_id
      if _rc !=0 {
          di "These data are nested and each student has several records"    
      }
      else {
          di "There's only a single record per student"
      }
      
  15. Calculating Absolute Risks:

    • Calculates absolute risks at specific time points for stressed and non-stressed students:

      sts list, ...
      
  16. Performing Hypothesis Testing:

    • Performs a log-rank test for nonparametric hypothesis testing:

      sts test stressed, logrank 
      
  17. Calculating p-value and Displaying Results:

    • Calculates the p-value and displays results based on significance level:

      local p = chi2tail(`df', `chi2_value')
      if `p' < .05 {
          ...
      }
      
  18. Checking Data Structure:

    • Checks if the data is nested by student and displays a message accordingly:

      capture isid student_id
      if _rc !=0 {
          di "These data are nested and each student has several records"    
      }
      else {
          di "There's only a single record per student"
      }
      

This Stata script demonstrates a thorough analysis process for examining dropout rates and survival in a student population, including data preparation, hypothesis testing, and regression analysis.

8.2 Semiparametric#

GPT-4o annotation:#

This Stata script performs a Cox proportional hazards regression, extracts results, and formats the output. Here’s an annotated breakdown of the script:

  1. Cox Proportional Hazards Model:

    • The script fits a Cox model with stressed as the predictor:

      stcox stressed
      
  2. Listing Stored Results:

    • Lists estimation results and other stored results:

      //creturn list
      ereturn list 
      return list
      
  3. Storing Results in a Matrix:

    • Stores the results of the Cox model in a matrix m and lists the matrix:

      matrix m = r(table)
      matrix list m
      
  4. Linear Combination of Estimates:

    • Calculates a linear combination of the stressed coefficient:

      lincom stressed 
      return list
      
  5. Extracting and Formatting Results:

    • Quietly extracts the p-value, hazard ratio (HR), and its 95% confidence interval (CI), formats them, and prints a message based on the significance of the association:

      qui {  
          local p = r(p)
          local hr : di %3.2f exp(r(estimate))
          local lb : di %3.2f exp(r(lb))
          local ub : di %3.2f exp(r(ub))
          
          if r(p) < 0.01 {
              local p: di "p < 0.01"
          }
          else if inrange(r(p),0.01,0.05) {
              local p: di %3.2f r(p)
          }
          else {
              local p: di %2.1f r(p)
          }
          
          if `p' < .05 {
              noi di "There's an association between SBP>140mmHg and withdrawal from Stata I: HR=`hr', 95%CI: `lb'-`ub', p=`p'"
          }
          else {
              noi di "There's no association between SBP>140mmHg and withdrawal from Stata I: HR=`hr', 95%CI: `lb'-`ub', p=`p'"
          }
      } 
      

Explanation of Key Components:#

  1. Fitting the Cox Model:

    • stcox stressed fits a Cox proportional hazards model with the variable stressed (indicating SBP > 140 mmHg) as the predictor.

  2. Listing Results:

    • ereturn list and return list display the estimation results and stored results from the last estimation command.

  3. Extracting Results into a Matrix:

    • matrix m = r(table) stores the results of the Cox model in a matrix named m, which is then listed using matrix list m.

  4. Calculating Linear Combinations:

    • lincom stressed calculates the linear combination of estimates for the stressed variable, with return list listing the results.

  5. Quiet Block for Extracting and Formatting:

    • qui (quietly) suppresses output while extracting and formatting results.

    • The local commands assign results (p-value, HR, lower and upper bounds of the 95% CI) to local macros.

    • The di (display) command formats these values.

    • The script checks the significance of the p-value and prints a message indicating whether there is a statistically significant association between high SBP and withdrawal from the Stata I class.

This script is comprehensive for fitting a Cox model, extracting results, and providing a clear interpretation based on statistical significance.

8.3 Parametric#

GPT-4o annotation:#

This Stata script uses a parametric survival regression model to analyze the relationship between high systolic blood pressure (SBP > 140 mmHg) and student withdrawal from a class. The script then extracts, formats, and interprets the results. Here’s an annotated breakdown:

  1. Fitting a Parametric Survival Regression Model:

    • The script fits a parametric survival regression model with stressed as the predictor:

      streg stressed
      
  2. Linear Combination of Estimates:

    • Calculates a linear combination of the stressed coefficient:

      lincom stressed 
      
  3. Listing Stored Results:

    • Lists the stored results from the linear combination:

      return list
      
  4. Extracting and Formatting Results:

    • Quietly extracts the p-value, hazard ratio (HR), and its 95% confidence interval (CI), formats them, and prints a message based on the significance of the association:

      qui {  
          local p = r(p)
          local hr : di %3.2f exp(r(estimate))
          local lb : di %3.2f exp(r(lb))
          local ub : di %3.2f exp(r(ub))
          
          if r(p) < 0.01 {
              local p: di "p < 0.01"
          }
          else if inrange(r(p),0.01,0.05) {
              local p: di %3.2f r(p)
          }
          else {
              local p: di %2.1f r(p)
          }
          
          if `p' < .05 {
              noi di "There's an association between SBP>140mmHg and withdrawal from Stata I: HR=`hr', 95%CI: `lb'-`ub', p=`p'"
          }
          else {
              noi di "There's no association between SBP>140mmHg and withdrawal from Stata I: HR=`hr', 95%CI: `lb'-`ub', p=`p'"
          }
      } 
      

Explanation of Key Components:#

  1. Fitting the Model:

    • streg stressed fits a parametric survival regression model using the stressed variable (indicating SBP > 140 mmHg) as the predictor.

  2. Calculating Linear Combinations:

    • lincom stressed calculates the linear combination of estimates for the stressed variable, with return list listing the results.

  3. Quiet Block for Extracting and Formatting:

    • qui (quietly) suppresses output while extracting and formatting results.

    • The local commands assign results (p-value, HR, lower and upper bounds of the 95% CI) to local macros.

    • The di (display) command formats these values.

    • The script checks the significance of the p-value and prints a message indicating whether there is a statistically significant association between high SBP and withdrawal from the Stata I class.

Script Flow Summary:#

  1. Model Fitting:

    • Fit the parametric survival model with stressed as the predictor.

  2. Linear Combination and Results Extraction:

    • Compute the linear combination of the predictor, list the results, and store relevant statistics in local macros.

  3. Formatting and Displaying Results:

    • Format the HR and CI values.

    • Determine the significance of the p-value and display an appropriate message indicating the presence or absence of a significant association between high SBP and class withdrawal.

This script effectively combines parametric survival regression with robust result extraction and interpretation, providing clear and informative output.