capture log close
log using lecture02.log, replace text

//This log file contains most of the examples used in Lecture 2 of
//Stata Programming and Data Management, along with additional explanations
//and examples.

//commented out "version 12" because I want to use some Stata 15 features
//version 12 //I'm using Stata 18, but some students may have earlier versions
clear all //clear all data from memory
macro drop _all //clear all macros in memory
set more off   //give output all at once (not one screenful at a time)
set linesize 80 //maximum allowed width for output

//open up transplants.dta
use transplants, clear

//a very brief overview of statistical commands

//chi2
tab dx gender, row
tab dx gender, row chi2

//exact
tab abo gender in 1/100, row
tab abo gender in 1/100, row exact

//t test, 2 sample unpaired by group
ttest age, by(prev)

//t test, 2 sample unpaired - comparing two variables
ttest don_wgt_kg==rec_wgt_kg, unpaired

//t test, 2 sample paired
ttest don_wgt_kg==rec_wgt_kg

//linear regression
regress rec_wgt_kg rec_hgt

regress rec_wgt rec_hgt age gender

//logistic regression
logit rec_work age
logit rec_work age gender 
logistic rec_work age
logistic rec_work age gender 


//review.
sum age   //get summary data for age variable
disp "variable" _col(20) "mean"
disp "age" _col(20) r(mean)  //display mean age obtained from sum command

//what else is available?
return list
disp r(N)   //# records with nonmissing age
disp %4.3f r(mean)  //mean
disp %4.3f r(var)  //variance
disp %4.3f r(sd)  //standard dev. Square root of variance
disp r(min)  //minimum
disp r(max)  //maximum
disp r(sum) //sum of all values for age

sum age if abo==1 //summarize age for a subset of data
return list  //everything is different; we're looking at patients with abo==1
disp r(N)   //Now we see a much smaller number of records
disp %4.3f r(mean)  //mean is different
disp %4.3f r(var)  //variance is different
//etc

//if we do sum, detail we get more returned data
return list
disp r(skewness)  //skewness
disp r(p50)  //median
disp r(p25)  //first quartile
disp r(p75)  //third quartile

//another example of sum, detail, from the slides
sum bmi, detail
disp "median: " _col(12) r(p50)
disp "IQR: " _col(12) r(p25) "-" r(p75)
disp "kurtosis: " _col(12) r(kurtosis)

//more examples of r-class commands. 
count if age < 45   //counts # patients with age < 45
disp r(N) " young patients"

tab race abo, chi2   //2-way table of ethnicity and ABO blood type
//with chi-square test
disp "p-value: " r(p) //display p value of chi2 test

sum age
assert inrange(r(mean), 45, 55) //assert that mean age is between 40 and 50.
//so if mean age is 47, then we keep going. If mean age is 39, then there will
//be an error and the script will stop.

tab race abo, chi2
assert r(p) < 0.05 //check that there is a statistically significant 
//relationship between race and abo, per chi2 test

sum age
disp "Mean: " r(mean) " years"  //display mean age, in a sentence

sum age
disp "Mean: " %4.0f r(mean) " years" //dispay, formatted. 
//output: Mean: 50 years 
//the "0" of "%4.0f" means "display zero digits after the decimal point"
//the "4" doesn't mean much - it just has to be any number higher than "0"
//so %4.2f or %6.2f or %9.2f would display "50.40 years"

//ereturn
regress bmi gender age
ereturn list

//log likelihood
disp e(ll)

regress //"replay" the regression; show results again without re-computing

//show the betas (regression coefficients)
disp _b[gender]
disp _b[age]
disp _b[_cons]

//show several betas together
disp "estimated BMI: " %3.2f _b[_cons] " + " %3.2f _b[age] ///
    "*age (+ " _b[gender] " if male)" 

//NOTE: for logistic regression, the beta is the *LOG of the odds ratio.
//So to see the odds ratio you'd say "disp exp(_b[gender])"
//same thing for Cox (exponentiate to obtain hazard ratio), etc.

//more examples of the exp() function to exponentiate
disp exp(0)  //returns 1
disp exp(1)  //returns 0

//show the regression results again
regress

//standard error matrix
disp _se[gender]
disp _se[age]

//t statistic
disp _b[gender]/_se[gender]
disp _b[age]/_se[age]

//Absolute value of t-statistic:
disp abs(_b[gender]/_se[gender]) //function "abs" returns absolute value

//a few more examples with "abs"
disp abs(5)
disp abs(-4)
disp abs(0)

//residual degrees of freedom from regression
//(necessary for obtaining p value of t statistic)
disp e(df_r)

regress
//p value
//returns p value of t statistic abs(beta/se) with e(df_r) degrees of freedom
//see "help ttail" for details
di 2*ttail(e(df_r), abs(_b[gender]/_se[gender]))

//NOTE: for logistic regression or Cox regression, instead of using a 
//t statistic you use a Z statistic (normal distribution)
//which would be this:
//di 2*(1-normal(abs(_b[gender]/_se[gender])))

regress
//lower and upper bound of confidence interval
disp _b[gender] + _se[gender]*invttail(e(df_r), 0.975)
disp _b[gender] + _se[gender]*invttail(e(df_r), 0.025)

//NOTE: for logistic, Cox, etc. you would again use the normal distribution,
//not the t distribution, to obtain confidence intervals. 
//And again you would exponentiate to obtain OR/HR etc
//example: disp exp(_b[age]+invnormal(0.025)*_se[age]) //lower bound
//example: disp exp(_b[age]+invnormal(0.975)*_se[age]) //upper bound

//Stata 15.1: an easier way to do confidence intervals and p values
if c(version) >= 15.09 {
    regress bmi age
    lincom age
    disp r(p)
    disp r(lb) " - " r(ub)

    logit rec_work age prev
    lincom prev
    disp r(p)
    //the coefficient is the log odds ratio, 
    //so let's see the 95% CI of the odds ratio
    disp exp(r(lb)) " - " exp(r(ub))
}

//creturn examples. This information isn't as useful as return/ereturn
//but it might come in handy

disp c(current_date)
disp c(os)
disp c(pi)
creturn list

//local macros
//there's also such a thing as a *global* macro 
//but let's worry about those later
local a = 4  //set macro a equal to 4
disp `a'   //display the value of macro a

local a 4 //another way to set the value of a macro
disp `a'

local a 4 + 7  //this means, set a equal to "4 + 7". 
//At this point, Stata doesn't actually compute what 4+7 is

disp `a'*2  //becomes "disp 4 + 7*2" which is the same as 4+14 which is 18

local a = 4 + 7 //the equals means, *COMPUTE* 4+7 and set a equal to the answer
disp `a'*2  //becomes "disp 11*2" which is 22


//another macro example
//we want to display the mean age for males, and the mean age for females,
//on the same line
quietly sum age if gender==0
local age_m = r(mean)  //store mean age for males in macro "age_m"
quietly sum age if gender==1
local age_f = r(mean)  //store mean age for females in macro "age_f"
disp "Mean age: " %3.1f `age_m' " (males) " %3.1f `age_f' " (females)"
//Note the use of formatting code %3.1f. 
//one digit to the right of the decimal point will be shown.

//More macro examples
local my_var age  //my_var is a macro which equals "age"
disp "Here's a summary of `my_var'"  //displays "Here's a summary of age"
sum `my_var' //sumarizes age

local my_command summarize  //my_command is a macro which equals "summarize"
//Stata sees "summarize age, detail"
`my_command' `my_var', detail

//here are some additional macros not in the slides.
//they are a bit more complex but they show how this macro thing works
local mymacro gen
sum `mymacro'der  //Stata sees "sum gender"

local macro1 ma  //macro1 is "ma"
local macro2 age gender //macro2 is "age gender"
sum ``macro1'cro2' 
//first, `macro1' becomes "ma"
//so ``macro1'cro2' becomes `macro2'
//next, `macro2' becomes "age gender"
//so "sum `macro2'" becomes "sum age gender"

//In this case, in_study defines a subpopulation of age<45 and gender==male
//and then we run several commands using this macro
//if we later decide to include females instead, 
//all we have to do is change "gender==0" to "gender==1"
//and everything will run on females instead of males
local in_study if age<45 & gender==0
sum bmi `in_study'
sum peak_pra `in_study'
regress bmi age `in_study'

//here, "confounders" refers to a set of three variables
//you can imagine that we would do several different analyses 
//treating these variables the same.
//if we change the list of confounders, we just have to change the macro
//instead of changing every command
local confounders age gender peak_pra
regress bmi wait_yrs `confounders'


//review: quietly
//run a regression. Don't print any output or error messages
quietly regress bmi age

//a "quietly block": run several commands without printing anything
quietly {
    gen over40 = (age > 40)
    count if over40 == 1
    regress bmi over40
}

drop over40
//now, let's run several commands and only print output from the last one
quietly {
    gen over40 = (age > 40)
    sum bmi if over40 == 1
    noisily di "mean BMI: " %3.1f r(mean)
}

//which command. Show whether a command is built-in, or a stata .do file
which tab
which chelp
which gen
which regress
which help
which graph

//define a program "do_something"
//first, if the program already exists in memory, delete it
//(NOTE: this is superfluous here, since line 9 says "clear all" which deletes
//all programs in memory... but it's a good idea to do "capture program drop"
//before any "program define"
capture program drop do_something

//now, define the program. All it does is display "It doesn't do much."
program define do_something  //create a program called do_something
    disp "It doesn't do much."  //here's what the program does.
end  //that's the end of the program definition

do_something

capture program drop do_something_else
program define do_something_else  //create a program called do_something
    disp "One line is not enough."
    disp "So display two lines."
end  //that's the end of the program definition

do_something_else

//now for a program that actually does something useful.
//NOTE: this would be more efficient with loops, but those come later

capture program drop table1
program define table1
    disp "Variable" _col(20) "mean (SD)" _col(40) "range"
    quietly sum age
    disp "age" _col(20) %3.2f r(mean) " (" %3.2f r(sd) ")" ///
            _col(40) %3.2f r(min) "-" %3.2f r(max)
    quietly sum wait_yrs
    disp "wait_yrs" _col(20) %3.2f r(mean) " (" %3.2f r(sd) ")" ///
            _col(40) %3.2f r(min) "-" %3.2f r(max)
    quietly sum bmi
    disp "bmi" _col(20) %3.2f r(mean) " (" %3.2f r(sd) ")" ///
            _col(40) %3.2f r(min) "-" %3.2f r(max)
end

use transplants, clear
table1

log close _all

//exit command. Within a .do file, it doesn't close Stata. It just stops 
//running the .do file
exit