R
The Basics
# Clear the working space
rm(list = ls())
# Load the packages.
library(tprstats)
tprstats::setup()
Hypothesis Testing
Normal Distribution
# Calculate 97.5th quantile of the standard normal distribution
# Find Za/2 for a=xxxxx
qnorm(.975)
qnorm(1-a/2)
# Find the probability of obtaining a value less than 1.9.
pnorm(1.9)
T Distribution
qt(1-a/2,v) # Critical t-value
#Calculate 97.5th quantile of the t distribution with 12 d.f.
qt(.975,12)
Quantile
# View the 75th quantile of Pittsburgh May temperatures
quantile(Pgh_May_Temps$Temperature,.75)
#Calculate the 10th, 50th, and 90th quantiles
quantile(Pgh_May_Temps$Temperature,probs=c(.10,.5,.90))
Basics
#hist(file$data)
hist(Pgh_Daily_Temp_May_1995_2018$Temperature)
hist(xn,breaks = 20, col="red",main="Normal Distribution") # breaks, col, and main set the bins, color, and title
mean(Pgh_Daily_Temp_May_1995_2018$Temperature)
sd(Pgh_Daily_Temp_May_1995_2018$Temperature)
Reject the null hypothesis
- The hypothesized null value is not in the confidence interval.
- The calculated t-statistic is greater than the critical t value.
- The p-value is less than α.
Confidence Interval Interpretation: 100(1-α) percent of samples should have a sample mean within the above bounds.
Interpreting a p-value: If the population coefficient of Sqft is zero, the probability we obtain a coefficient as large or larger in absolute value than 148.39 is 2e-16.
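As a quick sketch with made-up numbers (not from a course dataset), the three rejection rules above always agree; the t-statistic, the p-value, and the confidence interval are three views of the same comparison:
# Illustrative check that the three decision rules agree (hypothetical numbers)
xbar=16.2; mu0=15; s=4.1; n=30              # hypothetical sample summary statistics
tstat=(xbar-mu0)/(s/sqrt(n))                # calculated t-statistic
tcrit=qt(.975,n-1)                          # critical t-value for alpha=.05
pval=2*pt(-abs(tstat),n-1)                  # two-sided p-value
CI=xbar+c(-1,1)*tcrit*s/sqrt(n)             # 95% confidence interval for the mean
c(abs(tstat)>tcrit, pval<.05, mu0<CI[1] | mu0>CI[2])  # all TRUE or all FALSE together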
# hypothesis test for population mean
t.test(Dollar_Bank$TVL,mu=15)
# hypothesis test for population variance
varTest(Dice_Data$X, alternative = "two.sided", conf.level = 0.95, sigma.squared = 2.917)
2*pt(t,df)
# Always use the negative t-value in the above R command, even if your
# calculated t-statistic is positive.
2*pt(-1.205,29)
2*pt(1.205,29,lower=FALSE)
# Draw a sample of size 730 from the uniform distribution on the interval from 0 to 10
xu=runif(730,min=0,max=10)
# Draw a sample of size 650 from the normal distribution with mean 2 and std dev 1
xn=rnorm(650,mean=2,sd=1)
xn=rnorm(1000,2,3)
# Draw a sample of size 1000 from a t distribution with location 2, scale 1.5, df=10
xt=2+1.5*rt(1000,10)
# Exponential Distribution with parameter 1.5
xe=rexp(1000,1.5)
Quality Control
Control Chart
Acceptance Interval
controlChart(chartdata, mu, sig, n, alpha)
controlChart(Theta_data$XBAR,50,.3,25,.05)
Control Chart for Binary Outcome
Acceptance Interval
controlChartBinary(chartdata, expected_defect_rate, n, alpha)
controlChartBinary(Defects_data$d_rate,.06,85,.05)
Out of Control (Per Six Sigma), with a sketch for checking these rules after the list:
- One or more data points fall outside the control limits
- Seven consecutive data points are increasing or decreasing
- Eight consecutive data points are on one side of the average
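A minimal sketch of how these three rules could be checked on a series of sample means; checkControl is a made-up helper, and it assumes (as in the controlChart example above) that mu and sig are the process mean and standard deviation and n is the subgroup size:
# Sketch: flag the three out-of-control signals for a vector of sample means
checkControl=function(chartdata,mu,sig,n,alpha=.05){
z=qnorm(1-alpha/2)
outside=which(chartdata<mu-z*sig/sqrt(n) | chartdata>mu+z*sig/sqrt(n)) # rule 1: points outside the control limits
d=sign(diff(chartdata))
runs7=which(sapply(1:(length(d)-5),function(i) all(d[i:(i+5)]==1)|all(d[i:(i+5)]==-1))) # rule 2: 7 consecutive increasing or decreasing
side=sign(chartdata-mu)
runs8=which(sapply(1:(length(side)-7),function(i) all(side[i:(i+7)]==1)|all(side[i:(i+7)]==-1))) # rule 3: 8 consecutive on one side of center
list(outside_limits=outside,run_of_7_starts=runs7,run_of_8_starts=runs8)
}
checkControl(Theta_data$XBAR,50,.3,25,.05) # same inputs as the controlChart example above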
Testing Differences in Population Means
Simple Regression
sapply(Diamonds_17,mean) # Mean of each variable
sapply(Diamonds_17,sd) # Standard deviation of each variable
summary(Diamonds_17) # Min, max, median, etc of each variable
summaryStats(Diamonds_17) # Includes standard deviations
# regression and summary: variable name = lm(dependent~independent, data=file)
OurReg=lm(Price~Carat,data=Diamonds_17)
summaryH(OurReg) # H -> heteroskedasticity-robust standard errors
summaryHAC(OurReg) # HAC standard errors for time series
# plot
plot(Price~Carat,main="Price vs Carat",xlab="Carat",ylab = "Price",data=Diamonds_17)
CIcoefH(OurReg) # confidence intervals for the coefficients
abline(OurReg,col="red")
# *Two ways to generate plot
plot(Marst_Condos$Sqft,Marst_Condos$Price)
plot(Price~Sqft,data = Marst_Condos)
# plot actual fitted
plotActualFitted(OurReg,Diamonds_17,"Price") # Name of regression, dataset name, and name of dependent variable in quotes
# fit to the regression line
yhat=fitted(OurReg)
cbind(Diamonds_17$Price,yhat) #view side by side and compare
# residual
resid=residuals(OurReg) #calculate residual
plot(resid)
lines(resid) # connecting the points
abline(h=0)
# prediction with upper and lower bound
predict(OurReg, data.frame(Carat=.75),level = .95, interval="predict")
predict(OurReg, data.frame(Carat=c(.62,.74,.81)),level = .95, interval="predict")
# prediction
PredDat=data.frame(Carat=1.5,Clarity=6,Cut=2)
predict(DiamReg,PredDat,level = .95,interval = "predict")
MyPred=data.frame(predict(DiamReg,PredDat,level = .95,interval = "predict"))
# Put actual, fitted, and residual values into a data frame
FitDat=data.frame(Diamonds_17$Price,yhat,resid)
# Write to a CSV file (opens in Excel)
Pred_and_PI=predict(OurReg, data.frame(Carat=c(.62,.74,.81)),level = .95, interval="predict")
write.csv(Pred_and_PI, file = "Our_Predictions.csv")
Simulation
# Example 1
set.seed(10)
x1=runif(5000,10000,20000) # 5000 draws, range 10000 to 20000
x2=runif(5000,10000,20000)
x3=runif(5000,10000,20000)
Amount=x1+x2+x3
hist_CI(Amount)
hist_CI(Amount, alpha=.05) # smaller alpha -> wider interval, less precision
# what is the probability of getting more than $50,000
Gt50K=ifelse(Amount>50000,1,0)
mean(Gt50K)
# Example 2a: draw 3
xbar=0
set.seed(33)
x=runif(30,0,3) # Sample 1
xbar[1]=mean(x) # Mean of Sample 1
x=runif(30,0,3) # Sample 2
xbar[2]=mean(x) # Mean of Sample 2
x=runif(30,0,3) # Sample 3
xbar[3]=mean(x) # Mean of sample 3
# View the three means
xbar
xbar[1:3]
# Example 2b: draw 1000
for (i in 1:1000){
x=runif(30,0,3)
xbar[i]=mean(x)
}
# fit/select a distribution
select_distribution(xbar)
fit_distribution(xbar,"weibull")
#==============================================================================
# Hungry Dawg Example Model 1: Estimates provided by CEO
#==============================================================================
mu =250
N=18533
EmplContrib=125
Claim=0
CC=0
TCC=0
for (t in 2:13) {
N[t]=N[t-1]*1.02
mu[t]=mu[t-1]*1.01
Claim[t]=mu[t]
EmplContrib[t]=125
CC[t]=N[t]*(Claim[t]-EmplContrib[t])
}
TCC=sum(CC[2:13]) #sum
#==============================================================================
# Hungry Dawg Example Model 2: Estimate a distribution and a regression
#==============================================================================
select_distribution(HungryDawg_32months$EmplGro) #obtain a uniform distribution [-0.029, 0.083]
CGro=1+.01
EGroMin = 1-.03
EGroMax = 1+.07
SDC = 3
SDECresid= 3.10
Coef=coefficients(EmplReg)
set.seed(33)
for (r in 1:1000) {
CC=0
Claim=0
for (t in 2:13) {
N[t]=N[t-1]*runif(1,EGroMin,EGroMax)
mu[t]=mu[t-1]*1.01
Claim[t]=rnorm(1,mu[t],SDC)
EmplContrib[t]=Coef[1]+Coef[2]*Claim[t]+rnorm(1,0,SDECresid)
CC[t]=N[t]*(Claim[t]-EmplContrib[t])
}
TCC[r]=sum(CC[2:13])/1e6
} # close the replication loop over r
#==============================================================================
# Hungry Dawg Example Model 3: incorporate coefficient uncertainty
#==============================================================================
set.seed(33)
for (r in 1:1000) {
CC=0
Claim=0
coefD=coefDrawHAC(EmplReg)
# simulate a draw of the coefficients from our regression
# time series data, use coefDrawHAC()
for (t in 2:13) {
N[t]=N[t-1]*runif(1,EGroMin,EGroMax)
mu[t]=mu[t-1]*1.01
Claim[t]=rnorm(1,mu[t],SDC)
EmplContrib[t]=coefD[1]+coefD[2]*Claim[t]+rnorm(1,0,SDECresid)
CC[t]=N[t]*(Claim[t]-EmplContrib[t])
}
TCC[r]=sum(CC[2:13])/1e6
}
# remove missing values
TEMP=na.omit(Fargo_Feb_Temp$TEMP_FARGO)
Module 1
Linear Regression
Least Squares Estimation
Subset
# Select subsets of the data
WatDat34=subset(Water_Conservation_Data,GROUP==3|GROUP==4)
WatUse4_gt30k=subset(Water_Conservation_Data,SUMMER_07>30 & GROUP==4)
WatUse3_4_gt30k=subset(Water_Conservation_Data,SUMMER_07>30 & (GROUP==3|GROUP==4))
Marst_Condos=subset(Boston_Condos_577,Boston_Condos_577$Marst==1)
Test difference in population means using regression
WatReg34=lm(SUMMER_07~TREAT3,data=WatDat34)
summaryH(WatReg34)
Sample Size and Power
# Calculate power
effect_size=2; stdev=5
pwr.t.test(n =50,d =effect_size/stdev,sig.level =.05,power =NULL,type = c("one.sample"))
# Calculate sample size
effect_size=2; stdev=5
pwr.t.test(n =NULL,d =effect_size/stdev,sig.level =.05,power =.85,type = c("one.sample"))
Power
To determine sample size:
- Effect size to be detected
- Power
- Standard deviation
- Significance level
What is this?
- The probability that a statistical test will detect differences when they truly exist
Why is this important?
- You want to have the statistical "muscle" to be able to detect differences between the groups you are studying, or making sure you do not "miss" finding differences
Test Difference in Means
AB_t2n(percent_B = .5, mean_diff = 2., sd_A = 3,
sd_B = 3, sig_level = .05, power = .85, alternative = 'two_sided')
AB_t2n(N = 325, percent_B = 200/325, mean_diff = 5, sd_A = 15.25,
sd_B = 15.25, sig_level = .05, alternative = 'two_sided')
Find the observation numbers of the largest positive residual and the largest negative residual.
which(resid==max(resid))
which(resid==min(resid))
Module 2
Multiple Regression
summaryStats(Diamonds_212) # Mean, median, max, min, stdev
DiamReg=lm(Price~Carat+Clarity,data = Diamonds_212)
Observation=seq(1,212) # integers from 1 to 212
plot(Observation,Diamonds_212$Price,ylab="Prices", main = "Actual (black) and Fitted (red) Prices",col="white")
lines(Observation,Diamonds_212$Price)
lines(Observation,yhat,col="red") # compare
plot(Observation,resid,main = "Residuals",col="white")
lines(Observation,resid)
abline(h=0,col="red") # residual
ActFitResid=data.frame(Diamonds_212$Price,yhat,resid) # data frame
write.csv(ActFitResid,"ActFitResid.csv") # write csv file
“Big Three” for Regression:
- Statistical Significance
- Algebraic Sign
- Magnitude
Interpretation: A one carat increase in weight holding clarity constant would increase price by $11,922.
Adjusted R^2: imposes a penalty for adding a variable
Bias: Verbal and writing skills are positively correlated. When we leave out GMATVERBAL, the GMATWRITING variable attempts to do double duty. To obtain unbiased coefficient estimates, we need to include other variables that belong in the model, even if we are not particularly interested in the coefficients of those variables.
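A small simulated illustration of this point (made-up data and coefficients, not the GMAT file): when a correlated variable that belongs in the model is omitted, the included variable's coefficient absorbs part of its effect.
# Simulated omitted-variable bias (hypothetical data)
set.seed(1)
verbal=rnorm(500)                          # stand-in for GMATVERBAL
writing=.7*verbal+rnorm(500,0,.7)          # writing skill positively correlated with verbal skill
y=2+1*writing+1*verbal+rnorm(500)          # true coefficient of writing is 1
coefficients(lm(y~writing+verbal))["writing"]  # close to 1 when verbal is included
coefficients(lm(y~writing))["writing"]         # well above 1 when verbal is omitted (biased)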
"abline" vs. "lines":
lines(x,y) by default connects the x and y coordinates.
abline(a=NULL,b=NULL,v=NULL,h=NULL) draws a straight line.
abline(a=15,b=1) draws a line with intercept = 15 and slope = 1.
abline(v=12) draws a vertical line at x=12. In the example below, the vertical line is drawn at the mean of the speed variable.
abline(h=10) draws a horizontal line at y=10. In the example below, the horizontal line is drawn at the mean of the dist variable.
lines(cars$speed,cars$dist) draws lines by connecting the data points from the left (like a time series); we used this for drawing control charts. Beyond the scope of this course, lines() can also draw more abstract curves: for example, if you smooth the trend of dist with respect to speed with lowess(), lines() can draw the smoothed curve (of course, the speed variable is not time, so that example is only for illustration).
plot(cars)
reg <- lm(cars$dist~cars$speed)
abline(coef(reg),col='blue')
abline(a=15,b=1,col='red')
abline(v=mean(cars$speed),col='black')
abline(h=mean(cars$dist),col='black')
lines(cars$speed,cars$dist)
lines(lowess(cars$speed,cars$dist),col='green') # we don't use this in class
Multiple Regression: Hypothesis Testing
# Wald test
coefTestH(DiamReg,c("Carat=13000")) # Test the null hypothesis population coefficient of carat = 13,000
The test statistics have (n-K-1) degrees of freedom, where K is the number of right-hand-side variables.
Multiple Regression: Prediction
newcarat=rbind(.8,1.3,2.5) # puts the data into a single column with three rows
newclarity=rbind(7,6,2)
predict(DiamReg, data.frame(Carat=newcarat,Clarity=newclarity), interval="predict")
# 90% CI
predict(DiamReg, data.frame(Carat=newcarat,Clarity=newclarity), level=.90, interval="predict")
# another way: predict from the data in excel
predict(DiamReg,Diamond_Prediction_Data, interval="predict")
Calculate the correlation
with(Ward14_Homes,cor(Liveable_sqft,Totalrm))
Which observation has the largest residual
which(resid==max(resid))
Wine Example
WineReg=lm(log(PRICE)~AGE+SUMTEMP+WRAIN+HRAIN,data = Wine_Quality_Data)
summaryHAC(WineReg)
Log_Pred_and_PI=predict(WineReg,Wine_Quality_Data_for_Prediction,level=.95, interval="predict")
Vintage=Wine_Quality_Data_for_Prediction$YEAR
# The matplot command is used when there are multiple variables to plot on the vertical axis
matplot(Vintage, Log_Pred_and_PI,lty=1, type = "l",lwd=2,
main="Predicted Log Prices and 95% Prediction Interval",
ylab="Prediction and PI")
abline(h=0,lty=2)
Pred_and_PI=exp(Log_Pred_and_PI)
matplot(Vintage, Pred_and_PI,lty=1, type = "l",lwd=2, # lty: vector of line types; lwd: widths
main="Predicted Prices and 95% Prediction Interval",
ylab="Prediction and PI")
abline(h=1,lty=2)
Visualization and 3D
par(mfrow=c(2,2)) # This command will put four plots in two rows and two columns
with(Diamond_E_100,hist(Carat,col="light blue"))
with(Diamond_E_100,hist(Clarity,col="light blue"))
with(Diamond_E_100,hist(Cut, col="light blue"))
with(Diamond_E_100,hist(Price,breaks=18, col="gold"))
par(mfrow=c(1,1)) # This command restores one-plot-at-a-time format
DiamCut2=subset(Diamond_E_100,Diamond_E_100$Cut==2)
go3DPlot("Carat","Clarity","Price",30,30,data = DiamCut2)
go3DPlot("Carat","Clarity","Price",30,11,data = DiamCut2)
Module 3
Heteroskedasticity
Heteroskedasticity is the term that we use for the situation in which variances of the error terms are not the same across all observations.
When heteroskedasticity is present: Coefficient estimates are unbiased. Standard formula for calculating standard errors of coefficients is biased.
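A simulated sketch of both points (made-up data; here the robust standard errors come from the sandwich and lmtest packages, which play the same role as summaryH in tprstats):
# Heteroskedasticity sketch: unbiased coefficients, misleading usual standard errors
library(sandwich); library(lmtest)
set.seed(2)
x=runif(400,0,10)
y=1+2*x+rnorm(400,0,.5*x)                   # error standard deviation grows with x
HetReg=lm(y~x)
coefficients(HetReg)                        # slope estimate is still close to the true value 2
summary(HetReg)$coefficients[,"Std. Error"] # usual standard errors (formula is biased here)
coeftest(HetReg,vcov.=vcovHC(HetReg))       # heteroskedasticity-robust standard errors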
Indicator Variables
Indicator Variables = Dummy Variables = Fixed Effects
WatRegAll_07=lm(SUMMER_07~TREAT1+TREAT2+TREAT3,data=Water_Conservation_Data)
summaryH(WatRegAll_07)
# Wald Test - test null hypothesis that all population treatment effects are the same.
coefTestH(WatRegAll_07,c("TREAT1=TREAT2","TREAT2=TREAT3"))
Interpretation: Intercept: control group (reference category). Slope: difference between the test group and the control group.
Indicator Variables and Continuous Variables
ParcelReg=lm(DELTIM~PARCELS+RB+RC,data = Parcel_Delivery)
summaryH(ParcelReg)
ParcelRegFac=lm(DELTIM~PARCELS+factor(ROUTE_NAME),data=Parcel_Delivery)
summaryH(ParcelRegFac)
ParcelRegFac=lm(DELTIM~PARCELS+factor(ROUTE_NUMBER),data=Parcel_Delivery)
summaryH(ParcelRegFac)
# change the reference category using the "levels" feature
ParcelRegFac=lm(DELTIM~PARCELS+factor(ROUTE_NAME,levels=c("RC", "RA","RB")),
data=Parcel_Delivery)
summaryH(ParcelRegFac)
ParcelRegFac2=lm(DELTIM~PARCELS+factor(ROUTE_NUMBER,levels=c(2,1,3)),
data=Parcel_Delivery)
summaryH(ParcelRegFac2)
Interpretation: Intercept: estimated fixed cost. Slope: difference in fixed time between routes A and B/C. RA: reference category = the "excluded" category.
Scaling of Variables
DiamReg=lm(Price~Carat+Clarity,data = Diamonds_212)
summaryH(DiamReg)
plot(Price~Carat,data = Diamonds_67,pch=20,cex.main=1,main="Linear Equation Does Not Fit Well")
# pch - dots shape; cex.main - title size
abline(LinDiam,col="red",lw=2)
Interpretation:
- Our regression tells us that an increase in clarity by one unit would increase price by $1,301 holding carat weight constant.
- Our regression tells us that a one carat increase in weight holding clarity constant would increase price by $11,922.
scaledCoefficients(DiamReg)
Scale Free:
- Standardized Coefficient: What standard deviation change in y occurs if xj increases by one standard deviation?
  - eg. A one standard deviation increase in clarity increases predicted price by .276 standard deviations of price.
- Elasticity: By what percentage does y increase if xj is increased by 1%?
  - eg. A 1% increase in carat yields a predicted 1.68% increase in price.
Cannot compare coefficients directly, because:
- The magnitude of a coefficient depends on the units in which we measure the variable.
- The impact of a right-hand-side variable on the left-hand-side variable depends on the amount of variation in that right-hand-side variable.
Elasticity
elasticities(DiamReg)
Nonlinear Relationships (Functional Form)
Functional Form: the way in which we transform the data
Quadratic Transformation:
QuadMPG=lm(MPG~Speed+I(Speed^2),data = MPGData)
summaryH(QuadMPG)
QuadFit=fitted(QuadMPG)
par(pty="m")
plot(MPG~Speed,data=MPGData,
main="Quadratic Fits Nicely",pch=16)
lines(MPGData$Speed,QuadFit,col="blue",lw=2)
Interpretation:
- If current speed is 20, a one mph increase in speed will increase fuel efficiency by .59 mpg
- If current speed is 65, a one mph increase in speed will reduce fuel efficiency by .30 mpg
# Calculate the slope
MPGCoefs=coefficients(QuadMPG)
MPGData$Slope=MPGCoefs[2]+2*MPGCoefs[3]*MPGData$Speed
plot(Slope~Speed,data=MPGData,pch=20)
abline(h=0,lty=3)
Logarithmic Transformation: When both the independent and dependent variables are in log form, the coefficient of the independent variable is an elasticity, eg. the predicted price of a diamond increases by 1.65% for each 1% increase in carat. The log-log form is often preferred because it has one fewer coefficient, the constant-elasticity property has much intuitive appeal, and the log function is monotone.
LogDiam67=lm(log(Price)~log(Carat),data=Diamonds_67)
summaryH(LogDiam67)
# prediction
predict(LogDiam67, data.frame(Carat=2),interval="predict",level=.95)
exp(predict(LogDiam67, data.frame(Carat=2),interval="predict",level=.95))
Log_yhat=fitted(LogDiam67)
yhat=exp(fitted(LogDiam67))
# Compare
par(pty="m") # rectangular plot
par(pty="s") # squared plot
plot(Price~Carat,data=Diamonds_67,pch=16,cex.main=.75,cex.lab=.8,cex.axis=.8,
main="Actual and Fitted Prices from Quadratic (Blue) and Log-Log Model (Red)",
xlab="Carat",ylab="Actual and Fitted")
lines(QuadFit~Carat,data=Diamonds_67,col="blue")
lines(yhat~Carat,data=Diamonds_67,col="red")
Module 4
Semi-log Models
If |b1| < .15:
- b1 is the fraction by which y changes when x1 increases by one unit
- 100*b1 is the percent by which y changes when x1 increases by one unit
If |b1| > .15:
- (e^b1 - 1) is the fraction by which y changes when x1 increases by one unit
- 100*(e^b1 - 1) is the percent by which y changes when x1 increases by one unit
If x2 enters in log form:
- b2 is the percent by which y increases when x2 increases by one percent
(A worked check of the |b1| rule follows this list.)
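A quick worked check with arbitrary coefficient values: for a small b1 the approximation 100*b1 is essentially the same as the exact 100*(e^b1 - 1), but for a larger b1 they diverge, which is the reason for the .15 rule of thumb.
# Approximate vs. exact semi-log interpretation (arbitrary coefficient values)
b1=.05; c(approx=100*b1, exact=100*(exp(b1)-1))   # 5.0 vs 5.1 -- nearly identical
b1=.40; c(approx=100*b1, exact=100*(exp(b1)-1))   # 40.0 vs 49.2 -- use the exact formula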
Stepwise Regression
SSE - sum of squared errors
- We run 10 regressions, trying each variable one at a time. We keep the variable that gives the smallest SSE.
- We then run 9 regressions, keeping the first variable we selected and including each of the remaining 9 variables one at a time. As our second variable, we choose the one that, paired with our first variable, gives the smallest SSE.
- ...
- Keep going until there is no remaining variable that reduces the SSE by more than a specified amount. (stopping rule)
Akaike Information Criterion (AIC): AIC = 2(1+k) + n*log(SSE/n), where k denotes the number of variables in the regression and n the number of observations. (A quick numeric check of this formula follows the list below.)
- Our stepwise procedure chooses the variables that minimize the AIC.
- We keep going until there is no additional variable that reduces AIC.
- Tradeoff: Adding a variable reduces SSE but increases k. (Stop adding variables when the reduction of SSE from the additional variable is less than the “cost” of increasing k.)
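As a quick check of the formula above (using the built-in cars data, not course data), the value it produces should match the AIC that step() works with via extractAIC():
# Check the AIC formula against extractAIC() on built-in data
reg=lm(dist~speed,data=cars)
n=nrow(cars); k=1                          # one right-hand-side variable
SSE=sum(residuals(reg)^2)
2*(1+k)+n*log(SSE/n)                       # AIC from the formula in the notes
extractAIC(reg)[2]                         # AIC used by step(); should match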
Stepwise Regression (a decision aid, not a decision maker)
Step_Hosp = step(Reg_Hosp,test='F')
summaryH(Step_Hosp)
Several variables are highly correlated
with(Hospital_Satisfaction,cor(ACCOMODATIONS,ROOMRESTROOM))
# We obtain .668
Sometimes we may want to include variable(s) that R excluded
Step_Hosp2=step(Reg_Hosp,scope=list(lower=OVERALL~ACCOMODATIONS+ROOMRESTROOM, upper=Reg_Hosp), test='F')
Multicollinearity: When two or more right-hand-side variables are sufficiently highly correlated with each other that the data do not yield precise estimates of the coefficients of those variables, we have multicollinearity. (difficult to isolate the effect of individual variables, and difficult to decide which variables to keep and which to remove)
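One optional diagnostic beyond the pairwise correlations shown above is the variance inflation factor from the car package (the same package used for avPlots); this assumes Reg_Hosp is the full hospital-satisfaction regression referenced earlier:
# Optional multicollinearity check (assumes Reg_Hosp exists as above)
library(car)
vif(Reg_Hosp)   # VIFs well above roughly 5-10 are usually taken as a sign of multicollinearity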
Ramsey Test
Ramsey Test - evaluates the function we have chosen Null hypothesis is that the functional form you have chosen is correct.
- the two-fitted-terms version adds ŷ^2 and ŷ^3 and tests the joint null hypothesis that the coefficients of both fitted terms are zero
- low p-values suggest linear model fits poorly
- high p-values suggest the log-log model fits well
- is a decision aid, do not want to be too slavish
- Rejection by the Ramsey test suggests that you should do some investigation of alternative functional forms.
- With time series data, the Ramsey test can often be uninformative
The Ramsey test tests the functional form of the regression. Suppose we ran a regression like y = β0 + β1x1 + β2x2 + ε. The null hypothesis is that this functional form is correct. Let ŷ = b0 + b1x1 + b2x2 be the fitted value for each observation. If the functional form is correct, then the augmented regression y = β0 + β1x1 + β2x2 + β3ŷ^2 + ε should not have a statistically significant β3. The one-fitted-term version adds just ŷ^2 to the original regression; the two-fitted-terms version adds ŷ^2 and ŷ^3. Technical note: because the two-fitted-terms test includes ŷ^2 and ŷ^3, it tests the joint null hypothesis that the coefficients of both terms are 0.
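A hand-rolled version of the two-fitted-terms test, as a sketch of the idea (built-in cars data so it runs on its own; here the joint test is an ordinary F-test via anova rather than the robust test that ramseyTest presumably uses):
# Manual two-fitted-terms Ramsey RESET on built-in data
reg=lm(dist~speed,data=cars)
yhat=fitted(reg)
aug=lm(dist~speed+I(yhat^2)+I(yhat^3),data=cars)   # add squared and cubed fitted values
anova(reg,aug)                                     # joint test that both added coefficients are zero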
ramseyTest(LogDiam67)
Leverage Plots - help us to determine which transformations we might consider
Leverage Plot shows the effect of each right-hand-side variable on y after removing the effects of all other x variables.
avPlots(LinDiam212,pch=16,id=FALSE,col.lines="red")
Models with Indicator Variables and Continuous Variables: Tire Price Application
Find the Outliers
which(resid==min(resid))
Ztires=subset(Tire_Pricing,Tire_Pricing$Rating=="Z")
predict(TireReg, data.frame(Diameter=15,Ratio=65,Terrain=0,Weight=25,
Tread=225,WW=0,OWL=0, H=0,V=0,Z=0,Volume=300),interval="predict",level=.75)
Sensitivity Analysis: remove a variable from the regression model and re-estimate. Then redo the prediction.
Module 5
Models with Product of Variables: Interactions
summary(Water_Conservation_Data) # find median water use in 2006
# Create an indicator variable equal to 1 if household used more than 46 and 0 otherwise
High=ifelse(Water_Conservation_Data$WATER_2006>=46,1,0)
WatDatHighTreat3=subset(Water_Conservation_Data,High==1 & GROUP==3)
mean(WatDatHighTreat3$SUMMER_07)# calculate mean
sd(WatDatHighTreat3$SUMMER_07) # calculate sd
rm(WatDatHighTreat3,WatDatHighControl, WatDatLowTreat3,WatDatLowControl) # remove
WatDat34=subset(Water_Conservation_Data,GROUP==3|GROUP==4) # select treatment 3 and control groups
High=ifelse(WatDat34$WATER_2006>=46,1,0)
RegH_T3=lm(SUMMER_07~High+TREAT3+High*TREAT3,data=WatDat34)
summaryH(RegH_T3)
# Based on High
High=ifelse(Water_Conservation_Data$WATER_2006>=46,1,0)
RegH_All=lm(SUMMER_07~High+TREAT1+TREAT2+TREAT3+High*TREAT1+High*TREAT2+High*TREAT3,
data=Water_Conservation_Data)
summaryH(RegH_All)
# Based on Low
Low=ifelse(Water_Conservation_Data$WATER_2006<46,1,0)
RegL_All=lm(SUMMER_07~Low+TREAT1+TREAT2+TREAT3+Low*TREAT1+Low*TREAT2+Low*TREAT3,
data=Water_Conservation_Data)
summaryH(RegL_All)
# The hypothesis that the difference between high and low users is the same for TREAT1 and TREAT2
coefTestH(RegH_All,c("High:TREAT1=High:TREAT2"))
# Model with only main effects: no interaction; assumes the difference between
# high and low users is the same across the four groups
RegMain=lm(SUMMER_07~High+TREAT1+TREAT2+TREAT3, data=Water_Conservation_Data)
summaryH(RegMain)
Interaction of the variables: product of two variables
b0 (Intercept): mean for "low" users in the control group
b1 (High): difference between "high" and "low" users in the control group
b2 (TREAT3): treatment effect for "low" users
b2+b3: treatment effect for "high" users
b3 (High:TREAT3): difference in treatment effects between "high" and "low" users, the "difference-in-differences"
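As a sketch, the four group means and the treatment effects implied by the interaction model can be read off the estimated coefficients (this assumes RegH_T3 from above; the coefficient order is intercept, High, TREAT3, High:TREAT3):
# Group means implied by the interaction model (assumes RegH_T3 estimated above)
b=coefficients(RegH_T3)
b[1]                    # low users, control group
b[1]+b[2]               # high users, control group
b[1]+b[3]               # low users, treatment 3
b[1]+b[2]+b[3]+b[4]     # high users, treatment 3
b[3]                    # treatment effect for low users
b[3]+b[4]               # treatment effect for high users
b[4]                    # difference-in-differences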
Analysis of Variance (ANOVA): an analysis in which all variables are indicator variables / Estimation procedures (such as the "variation" among and between groups) used to analyze the differences among group means in a sample.
F = between-groups variance / within-groups variance. The larger the ratio, the more likely it is that the groups have different means (reject H0).
Treat2: effect of treatment 2 on "low" users. High:Treat2: difference in the treatment 2 effect between "high" and "low" users. Treat1: effect of treatment 1 on "low" users. High:Treat1: difference in the treatment 1 effect between "high" and "low" users.
Terminology: Factors and Factor Levels. Factor: a category of indicator variables, eg. Experimental Treatment. Factor Levels: subcategories within a category, eg. Control, Treatment 1, Treatment 2, and Treatment 3.
y=ifelse(condition,x1,x2)
# If the condition is true, y is set equal to x1.
# If the condition is false, y is set equal to x2.
Modeling Choice
1) Linear Probability Model
Binary Choice Model: a model with a (0,1) dependent variable. Linear Probability Model: ordinary regression used with a (0,1) left-hand-side variable.
- Pro: coefficients are typically easy to interpret
- eg. An increase in income of $10,000 increases probability of recycling by 1.1 percentage points holding other variables constant.
- Con: predicted probabilities from a Linear Probability Model can sometimes be negative or greater than one
Percentage Change vs. Percentage Point Change
- percentage change: 100*(.30-.25)/.25
- percentage point change: 100*(.30-.25)
2) Logit Model (Logistic Distribution)
Probability of y_i=1 → π_i; probability of y_i=0 → 1 - π_i
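The logit model sets π_i = exp(x_i'b)/(1+exp(x_i'b)); a quick sketch on built-in data (mtcars, not the recycling file) showing that the fitted probabilities are just this transformation of the linear index:
# The logit link: fitted probabilities are exp(xb)/(1+exp(xb)) of the linear index
logit_am=glm(am~hp+wt,family="binomial",data=mtcars)
xb=predict(logit_am)                                  # linear index x'b
head(cbind(exp(xb)/(1+exp(xb)),fitted(logit_am)))     # two identical columns of probabilities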
Margins Command: the average marginal effect
RecylLogit = glm(Recycler~Income+Midwest+Northeast+West+Them+Me+Green+
factor(Recylaw)+factor(Deposlaw),family="binomial",data=Bottle_Recycling_Data)
summary(RecylLogit)
margins(RecylLogit) # marginal effects - interpretation
RecylPredLogit=fitted(RecylLogit)
hist(RecylPredLogit,breaks=100,col="light blue", main="Predicted Probabilities from Logit Model")
Do not include "keeping all other variables constant" when interpreting a nonlinear model. For logit models (or any nonlinear function), it is important to be more specific about how the other variables are being held. The impact of a marginal change in x1 on y may differ when x1=0.5 versus x1=3.0, and it may also depend on the values of other variables x3, x4, etc. So we need to say at which values of the x-variables we are evaluating a given variable's impact on y. The margins() command reports the average marginal effect: the marginal effect is computed at each observation's x-values and then averaged over the sample.
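A sketch of what the average marginal effect is for a continuous variable in a logit model, computed by hand and compared with margins() (built-in mtcars data so it runs on its own):
# Average marginal effect of hp in a logit, by hand vs. margins()
library(margins)
logit_am=glm(am~hp+wt,family="binomial",data=mtcars)
p=fitted(logit_am)                                # predicted probabilities
mean(p*(1-p)*coefficients(logit_am)["hp"])        # marginal effect at each observation, averaged
margins(logit_am)                                 # the hp entry should agree (up to numerical error)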
Interpret: An increase in discount of 1 percentage point increases the predicted probability of purchase by 0.6638 percentage points.
logit classification table
logitClassificationTable(RecylLogit,"Recycler",Bottle_Recycling_Data,p_cutoff=NULL)
• Name of logit model (RecylLogit)
• Name of left-hand-side variable in quotes ("Recycler")
• Name of file containing the data (Bottle_Recycling_Data)
• p_cutoff: if none is provided, p_cutoff is set equal to the average value of the dependent variable (the default), i.e., the fraction of ones in our data
As we increase the p_cutoff:
• The sales team will be asked to try to sell to fewer prospects.
• The sales team's success rate is likely to increase.
• The total number of sales is likely to fall.
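A sketch of how such a table can be built by hand from the fitted probabilities (RecylLogit and Bottle_Recycling_Data from above are assumed; logitClassificationTable adds the formatting and rates on top of this):
# Hand-built classification table using the default cutoff
p_cutoff=mean(Bottle_Recycling_Data$Recycler)         # fraction of ones in the data
pred_class=ifelse(fitted(RecylLogit)>p_cutoff,1,0)    # classify each observation
table(Actual=Bottle_Recycling_Data$Recycler,Predicted=pred_class)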
Interpretation:
- We see above that an increase in income of $10,000 increases the probability of recycling by 1.34 percentage points.
- We see that the strongest recycling law is predicted to increase the probability of recycling by 15.05 percentage points.
3) Probit Model (Normal Distribution)
RecylProbit = glm(Recycler~Income+Midwest+Northeast+West+Them+Me+Green
+factor(Recylaw)+factor(Deposlaw),family=binomial(link="probit"),data=Bottle_Recycling_Data)
summary(RecylProbit)
margins(RecylProbit)
Modeling Choice: Application to Residential Solar Energy Systems
Step 1: Try Linear Probability Model
# Solar sales linear probability model
LPMsolar=lm(Sale~AnnualSave+ElecBill+AnnualCost+DrivesHybrid
+NearbySolar+HomeValue,data=Solar_Data)
summaryH(LPMsolar)
#Test hypothesis that coefficients of AnnualSave and ElecBill are both equal to zero
coefTestH(LPMsolar,c("AnnualSave=0","ElecBill=0"))
# Solar LPM without ElecBill
LPMsolar=lm(Sale~AnnualSave+AnnualCost+DrivesHybrid
+NearbySolar+HomeValue,data=Solar_Data)
summaryH(LPMsolar)
# Calculate fitted values and do a histogram
LPMfit=fitted(LPMsolar)
hist(LPMfit,breaks=20,col="light blue",main="Histogram of Fitted Values
from Linear Probability Model")
Step 2: Try Logit Model
# Now estimate solar sales model using logit
logitsolar = glm(Sale~AnnualSave+ElecBill+AnnualCost+DrivesHybrid+NearbySolar+HomeValue,
family="binomial", data=Solar_Data)
summary(logitsolar)
# Calculate average marginal effects
margins(logitsolar)
# Calculate fitted values and plot a histogram
logitfit=fitted(logitsolar)
hist(logitfit,breaks=100,col="light blue")
# View classification table with default cutoff probability
# Enter name of logit equation, name of dependent variable in the data, data file.
logitClassificationTable(logitsolar,"Sale",Solar_Data,p_cutoff=NULL)
# With p_cutoff set to .10
logitClassificationTable(logitsolar,"Sale",Solar_Data,.10)
# With p_cutoff set to .15
logitClassificationTable(logitsolar,"Sale",Solar_Data,.15)
Prediction 1:
# We can do prediction for homes not in our sample. For example, we can
# predict purchase probabilities for three homes with the following characteristics
AnnualSave=c(2.5,1.9,2.3)
AnnualCost=c(1.8,1.5,1.7)
DrivesHybrid=c(1,0,0)
NearbySolar=c(0,1,0)
HomeValue=c(550,675,495)
# Put above into a data frame named newx
newx=data.frame(AnnualSave,AnnualCost,DrivesHybrid,NearbySolar,HomeValue)
# Calculated predicted probability of purchase
predict(logitsolar, newdata = newx, type = "response")
Prediction 2: Spreadsheet
# Predict values from the Solar_New_Prospects data and name result PredNew
# Notice that we are putting "PredNew" into the Solar_New_Prospects spreadsheet
Solar_New_Prospects$PredNew = predict(logitsolar, newdata = Solar_New_Prospects, type = "response")
#View a histogram
hist(Solar_New_Prospects$PredNew,breaks=20,col="light blue")
# Rank new prospects in order of predicted probability of purchase
# Notice that we are putting "RankNew" into the Solar_New_Prospects spreadsheet
Solar_New_Prospects$RankNew=rank(Solar_New_Prospects$PredNew)
# Plot predicted probability of purchase versus rank
plot(Solar_New_Prospects$RankNew,Solar_New_Prospects$PredNew,main="Predicted Probability of Sale by Rank")
# Write a file named "Ranked_New_Prospects" with predicted probabilities and rankings included
write.csv(Solar_New_Prospects, file = "Ranked_New_Prospects.csv") # saved to the working directory
confint(model, level = .99) # Additional Tip for CI
Choosing Functional Form
Boston Condo Example
# Estimate Linear Regression
Reg_Condos=lm(PRICE~BATH+BECST+BED+BHILL+CDIST+FLOORS+MARST+PARK+SQFT,
data=Boston_Condos_572)
summaryH(Reg_Condos)
# Test functional form.
# Low p-values suggest this may not be a satisfactory functional form
ramseyTest(Reg_Condos)
# Histograms
par(pty="m")
hist(Boston_Condos_572$PRICE,col="light blue")
hist(log(Boston_Condos_572$PRICE),col="light blue")
hist(Boston_Condos_572$SQFT,col="light blue")
# View added variable plots for the two continuous variables
avPlots(Reg_Condos,id=FALSE, "SQFT")
avPlots(Reg_Condos,id=FALSE, "CDIST")
# Investigate functional form
# Include CDIST and SQFT in both linear and log form
Log_Reg_Condos=lm(log(PRICE)~BATH+BED+CDIST+log(CDIST)+FLOORS+
PARK+SQFT+log(SQFT)+BECST+BHILL+MARST,
data=Boston_Condos_572)
summaryH(Log_Reg_Condos)
# Put Stepwise procedure to work
Log_Step_Condos = step(Log_Reg_Condos,test='F')
summaryH(Log_Step_Condos)
# Stepwise dropped BATH variable
# Require that BATH be retained and re-run stepwise procedure
Log_Step_Condos2 = step(Log_Reg_Condos,
scope=list(lower=log(PRICE) ~ BATH,upper=Log_Reg_Condos),
test='F')
summaryH(Log_Step_Condos2)
# Ramsey test
ramseyTest(Log_Step_Condos2)
# Let's test the null hypothesis that the coefficient of log(SQFT) equals 1
coefTestH(Log_Step_Condos2,"log(SQFT)=1")
# Test the hypothesis that the coefficient of BED equals the coefficient of BATH
coefTestH(Log_Step_Condos2,c("BED=BATH"))
# Let's investigate the shape of the function of CDIST
# Calculate the combined effect of terms involving CDIST
CdistTerms=with(Boston_Condos_572,(-.030138*CDIST+.118956*log(CDIST)))
# Express as difference from smallest value of CdistTerms
CdistTerms=CdistTerms-min(CdistTerms)
# Exponentiate
ExpCdistTerms=exp(CdistTerms)
# Plot showing effect of CDIST
plot(ExpCdistTerms~CDIST,pch=16,data = Boston_Condos_572,
main="Variation in Price with CDIST Holding Other Variables Constant")
Writing and Numerically Optimizing Functions Using R
Example 1: Optimizing Simple Quadratic Function
# Write an R function to calculate the following: y=6+4*x-x^2
# I chose name Func_y
# Note inside the function we use x[1].
Func_y=function(x){
y=6+4*x[1]-x[1]^2
return(y)}
# View the value of y when x=3
Func_y(3)
# Limit the range of search to values of x in the interval 1 to 5. Don't change nrow or ncol.
bounds=matrix(c(1,5),nrow = 1,ncol = 2)
# Solve for x to maximize y. The name of our function, Func_y, is the first entry in genoud.
# I chose the name Opt_y for the output.
Opt_y = genoud(Func_y, nvars=1,max=TRUE,Domains=bounds,boundary.enforcement = 2,
data.type.int = FALSE,max.generations = 300)
# View the value of x that maximizes y
Opt_y$par
# View the value of y at the maximum
Opt_y$value
Example 2: Automobile Fuel Efficiency and Speed
# Rename for simplicity to MPGData
MPGData=Fuel_Efficiency_vs_Speed
# Estimate Quadratic Model of MPG vs Speed
QuadMPG=lm(MPG~Speed+I(Speed^2),data = MPGData)
summaryH(QuadMPG)
# Here is an example to remind us how to predict MPG given Speed.
# Let's predict MPG when Speed equals 30 miles per hour
predict(QuadMPG,data.frame(Speed=30))
# Step 1: Fit A Distribution to The Residuals of The Regression #
# Retrieve residuals and fit a distribution to them
resid=residuals(QuadMPG)
select_distribution(resid)
#We obtain: rnorm(n,0,.503)
# Step 2: Simulate 1,000 Draws of MPG for a Given Speed #
# Write a function to simulate MPG including residuals
# Let's use a sample size 1000
# Optimization is done below using this function.
# Setting a seed is essential for the optimization.
SimMPG=function(x){
n=1000
set.seed(33)
MPGPred=predict(QuadMPG,data.frame(Speed=x[1]))
MPGSim=MPGPred+rnorm(n,0,.503)
return(MPGSim)}
# To illustrate the use of the function, simulate MPG when Speed=30.
MPGSim30=SimMPG(30)
# View a histogram of the 1000 simulated values with 90% confidence interval
hist_CI(MPGSim30)
# Step 3: Calculate The Mean of The 1,000 Simulated Values #
# Write a function to define expected MPG
# Notice that this function uses function SimMPG
SimMeanMPG=function(x){
ExpectedMPG=mean(SimMPG(x[1]))
return(ExpectedMPG)}
# To illustrate use of the function, view simulated expected MPG when Speed=30.
SimMeanMPG(30)
# Step 4: Find The Speed That Maximizes MPG #
# Next we use R to find the most fuel-efficient speed.
# Limit the range of search to values of Speed from 20 to 70
bounds=matrix(c(20,70),nrow = 1,ncol = 2)
# Enter SimMeanMPG as first entry in genoud command below
# I chose the name Opt_MPG for the output of the function.
Opt_MPG = genoud(SimMeanMPG, nvars=1,max=TRUE,Domains=bounds,boundary.enforcement = 2,
data.type.int = FALSE,max.generations = 300)
# Step 5: Obtain Results for Analysis of Fuel Efficient Speed #
# View optimal speed that maximizes fuel efficiency
Opt_MPG$par
# View expected MPG at optimal speed
Opt_MPG$value
# Let's find the 90% confidence interval for MPG at the optimal speed
SimSampleAtOptimum=SimMPG(Opt_MPG$par)
#Use the following to display the results:
hist_CI(SimSampleAtOptimum)
# For some applications, you may wish to require integer values of the decision variable.
bounds=matrix(c(20,70),nrow = 1,ncol = 2)
# Set data.type.int=TRUE as shown below. No other change
Opt_MPG = genoud(SimMeanMPG, nvars=1,max=TRUE,Domains=bounds,boundary.enforcement = 2,
data.type.int = TRUE,max.generations = 300)
# View the optimal speed to maximize fuel efficiency. Note the integer value.
Opt_MPG$par
# View the value of expected MPG at optimal speed
Opt_MPG$value
Module 6
Time Series
Time Series Data:
- error terms that are correlated over time
- Heteroskedasticity
- estimating a model using period-to-period changes
Residuals: amplitude appears to vary over time, so there may also be heteroskedasticity.
summaryHAC()
- Heteroscedasticity and Autocorrelation Correction
Newey-West correction
If short-term forecasting → a portion of the error term in a given period may carry over to the next period / improve our forecasts by taking account of such correlation of errors over time
cons -> coffee bean consumption per capita
rpcinc -> per capita real (inflation adjusted) income
rpcofe -> real price of coffee
rpcarb -> real price of carbonated beverages (substitute good)
# Plot coffee consumption per capita
par(pty="m")
plot(Coffee_Data$year,Coffee_Data$cons,type="l",col="blue",
main="Per Capita Consumption of Coffee Beans",
xlab="year",ylab="Pounds of Coffee",ylim=c(6,18)) # set the limits of the y axis
lines(Coffee_Data$year,Coffee_Data$cons,col="blue",lw=2)
# Plot Prices and Per Capita Income
# Note: lw sets line width, pch=NA suppresses plot symbol
plot(Coffee_Data$year,Coffee_Data$rpcofe,col="blue",pch=NA,
main="Variables Affecting Coffee Demand in Our Model",
xlab="year",ylab="Real Prices and Income",ylim=c(0,240));
lines(Coffee_Data$year,Coffee_Data$rpcofe,col="blue",lw=2)
lines(Coffee_Data$year,Coffee_Data$rpcinc,col="dark green",lw=2)
lines(Coffee_Data$year,Coffee_Data$rpcarb,col="red",lw=2)
# The text locator command lets you put text on your graph
text(locator(1),"rpcofe")
text(locator(1),"rpcarb")
text(locator(1),"rpcinc")
# Estimate and view coffee regression.
CoffeeReg=lm(cons~rpcofe+rpcinc+rpcarb,data=Coffee_Data)
summaryHAC(CoffeeReg)
# Calculate and plot residuals from coffee regression
cofresid=residuals(CoffeeReg)
plot(Coffee_Data$year,cofresid,col="black",
main="Residuals by Year",
xlab="year",ylab="Residuals",ylim=c(-1.2,1.2))
lines(Coffee_Data$year,cofresid,col="black",lw=2)
abline(h=0,lw=1,col="blue")
# Calculate elasticities for our coffee model
scaledCoefficients(CoffeeReg)
# Conduct Ramsey test
ramseyTest(CoffeeReg)
# View leverage plots using default setting
par(pty="m")
avPlots(CoffeeReg,id=FALSE,pch=20)
Modeling Correlated Errors: e.g., an increase in price may result in a temporary increase in consumption out of inventories, habit formation may cause some delay in adjusting consumption, or the effect of a shock may persist in subsequent periods until it is fully corrected.
- Each period, a fraction of the error from the previous period is carried forward, and a new random error is added (see the simulation sketch after this list).
- u_t = ρ1*u_(t-1) + ε_t → AR(1), "first-order serial correlation" / first-order autocorrelation / first-order autoregressive model
- u_(t-1) is the error term in our regression last period
- ρ1 is the fraction of that error carried forward to period t
- ε_t is a new random event in period t
- y_t = β0 + β1*x_1t + … + βK*x_Kt + u_t
- u_t = ρ1*u_(t-1) + ρ2*u_(t-2) + ε_t (2nd order)
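A small simulation of the AR(1) error process above with a made-up ρ1 = .7, showing how a fraction of each period's error carries into the next period:
# Simulate AR(1) errors: u_t = rho1*u_(t-1) + e_t
set.seed(33)
rho1=.7; Tn=200
eps=rnorm(Tn)
u=numeric(Tn)
for (t in 2:Tn) u[t]=rho1*u[t-1]+eps[t]
plot(u,type="l",main="Simulated AR(1) Errors (rho1 = .7)")
cor(u[-1],u[-Tn])                   # first-order autocorrelation, roughly .7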
Estimating Models: summaryArima()
-> Summary for Regression with Autoregressive Errors
MyReg=with(MyData, Arima(y, xreg=cbind(x1,x2,x3), order=c(1,0,0), include.constant = TRUE))
summaryArima(MyReg)
# 1st Order
cof_arima=with(Coffee_Data, Arima(cons, xreg=cbind(rpcofe,rpcinc,rpcarb), order=c(1,0,0), include.constant = TRUE))
summaryArima(cof_arima)
# 2nd Order
cof_arima2=with(Coffee_Data, Arima(cons, xreg=cbind(rpcofe,rpcinc,rpcarb), order=c(2,0,0), include.constant = TRUE))
summaryArima(cof_arima2)
Forecast (the model with correlated errors is better most of the time)
MyForecast=forecast(cof_arima,xreg = cbind(83.04195,39.20158,67.11273))
summary(MyForecast)
proceed cautiously if...
- AR(1) coefficient (ρ1) is close to one
- a lagged dependent variable with coefficient close to one
- → In such circumstances, it may be appropriate to estimate the model with all variables expressed in changes rather than levels.
Forecasting
In a production setting, you might know some right-hand-side variables and your managerial decisions might determine other variables. 1) Forecast future values of the right-hand-side variables, or 2) Use values from several periods in the past as predictors.
# Let's select observations on housing starts from 1990 onward for some visual displays
Houst90=subset(HousDat$HOUST,Year>=1990)
# The date will then appear automatically on the horizontal axis when we do our plot
tsHoust90=ts(Houst90,frequency = 12, start = 1990)
# Plot Housing Starts
plot(tsHoust90,col="blue",lw=2,ylab="Housing Starts",main="Monthly Housing Starts")
#Histogram of Housing Starts
hist(Houst90,breaks=30,col="light blue",xlab="Housing Starts", main="Histogram of Housing Starts")
# Regression of starts on starts one month earlier
HousRegLag=lm(Houst90~lag(Houst90,1))
summaryHAC(HousRegLag)
# calculate fitted values
yhat=fitted(HousRegLag)
# For plotting, create time series version of yhat
tsyhat90=ts(yhat,frequency = 12,start = 1990)
# Plot actual housing starts and starts predicted by HousRegLag
plot(tsHoust90,col="blue",lw=2,ylab="Housing Starts",main="Monthly Housing Starts Actual (blue) and Predicted (red)")
lines(tsyhat90,col="red",lw=1)
# Choose data from before the financial crisis, 1961 through 2006
HousDat1=subset(HousDat,Year<=2006)
Hreg1=lm(HOUST_PCH~lag(AWHMFG,6)+lag(PERMIT,6)+
lag(TREASSPREAD,6)+lag(INDPRO,6)+
lag(HOUST_PCH,6),data=HousDat1)
summaryHAC(Hreg1)
# View standardized coefficients
scaledCoefficients(Hreg1)
# Use all data from 1961 to March 2019
# Estimate model to predict HOUST_PCH
HregAll=lm(HOUST_PCH~lag(AWHMFG,6)+lag(PERMIT,6)+
lag(TREASSPREAD,6)+lag(INDPRO,6)+
lag(HOUST_PCH,6),data=HousDat)
summaryHAC(HregAll)
# In an application you would need to create the counterpart of HousPredDat
Pred_and_PI=predict(HregAll,HousPredDat,level=.95, interval="predict")
Levels vs. changes:
- The value of housing starts at a given point in time is the level of housing starts (April housing starts).
- The difference between two points in time is the change in starts (the change from March to April). It is often more appropriate to measure the rate of change, either:
- by percentage: asymmetric in direction. If percentage changes are not too big, percentages are sometimes used because they are easier for an audience to understand.
  - eg. We will use the percentage change in starts in month t relative to the same month a year earlier. This is very useful for seasonal data because we benchmark against the same time a year earlier.
- by (natural) logarithms: symmetric in direction, since log(2.2)-log(2) = -(log(2)-log(2.2)).
(A numeric illustration of the asymmetry follows this list.)
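A quick numeric illustration of the asymmetry point, using the arbitrary values 2 and 2.2:
# Percentage changes are asymmetric in direction; log changes are not
100*(2.2-2)/2          #  10.00 percent going up from 2 to 2.2
100*(2-2.2)/2.2        #  -9.09 percent going back down from 2.2 to 2
log(2.2)-log(2)        #  0.0953 going up
log(2)-log(2.2)        # -0.0953 going back down: same magnitude, opposite sign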
• INDPRO: Industrial Production Index, Percent Change from Year Ago, Seasonally Adjusted
• PERMIT: New Private Housing Units Authorized by Building Permits, Percent Change from Year Ago, Seasonally Adjusted Annual Rate
• AWHMFG: Average Weekly Hours of Production and Nonsupervisory Employees: Manufacturing, Percent Change from Year Ago, Seasonally Adjusted
• Treasury Spread (TREASSPREAD): 10yr yield minus Federal Funds Rate, annualized
Suppose we have a monthly series in levels named X. We wish to calculate the percentage change relative to 12 months earlier.
X_PCH=100*((X-lag(X,12))/lag(X,12))
y_(t+6) = b0 + b1*x_t
# Create HousDat2, which has data from 1961 through 2015
HousDat2=subset(HousDat,Year<=2015)
Hreg2=lm(HOUST_PCH~lag(AWHMFG,6)+lag(PERMIT,6)+
lag(TREASSPREAD,6)+lag(INDPRO,6)+
lag(HOUST_PCH,6),data=HousDat2)
summaryHAC(Hreg2)
# Prepare Variables for Plotting for years 2015-2019
HPred2=predict(Hreg2, HousDat,level = .95, interval="predict")
HPred2=data.frame(HPred2)
HPred2=subset(HPred2,Year>=2015 & Year<=2019)
tsHPred2=ts(HPred2,frequency = 12,start=2015,end=2019)
Houst2=subset(HousDat$HOUST_PCH,Year>=2015 & Year<=2019)
tsHoust2=ts(Houst2,frequency = 12,start = 2015, end=2019)
# Plot actual and predicted with prediction interval for 2015-2019
ts.plot(tsHPred2,tsHoust2, col=c("red","blue","blue","black"),
ylab="Housing Starts",
main="Actual (black) and Predicted (red) Starts with 95% Pred. Interval")
# Prepare Variables for Plotting for years 2006 to 2016
HPred1=predict(Hreg1, HousDat,level = .95, interval="predict")
HPred1=data.frame(HPred1)
HPred1=subset(HPred1,Year>=2006 & Year<=2015)
tsHPred1=ts(HPred1,frequency = 12,start=2006)
Houst1=subset(HousDat$HOUST_PCH,Year>=2006 & Year<=2015)
tsHoust1=ts(Houst1,frequency = 12,start = 2006)
# Plot actual and predicted with prediction interval for 2006-2016
ts.plot(tsHPred1,tsHoust1, col=c("red","blue","blue","black"),
ylab="Housing Starts",
main="Actual (black) and Predicted (red) Starts with 95% Prediction Interval")
All variables except TREASSPREAD are expressed as percent changes, so the coefficients of all variables except TREASSPREAD are elasticities. The coefficients in our model capture a combination of demand and supply effects; this is a "reduced form" model.
Estimation, Simulation, and Optimization
• Estimation: Obtaining estimates of parameters required for our decision.
• Simulation: Capturing the effects of uncertainty.
• Optimization: Choosing the optimal policy.
Step 1: Fit A Distribution to The Residuals of The Regression
# Estimate Regression
RegKidsafe=lm(log(Sales)~log(Price)+log(Compprice)+log(Advertising)
+Age+Education+Income+City+factor(Shelveloc),data=Kidsafe)
summaryH(RegKidsafe)
# Obtain residuals
resid=residuals(RegKidsafe)
# Select a distribution to fit residuals
# We find normal with mean=0 and standard deviation .169
select_distribution(resid)
# Standard deviation from select_distribution command
sd=.169
Step 2: Simulate 1,000 Draws of Profit for a Given Advertising
# Function to draw a sample of n observations of Profit
# Note two places where advertising, x[1], appears in the function.
FuncProfitSample = function(x){
n = 1000
set.seed(33)
Pred_Log_Sales=predict(RegKidsafe, data.frame(Price=90,Compprice=80,
Advertising=x[1],Age=40,Education=12,Income=50,
City=1,Shelveloc="Good"))
Sim_Log_Sales=Pred_Log_Sales+rnorm(n,0,sd)
Sales=exp(Sim_Log_Sales)
Profit=10*Sales-x[1]
return(Profit)}
Step 3: Calculate The Mean of The 1,000 Simulated Values
# We wish to calculate expected profit.
FuncMeanProfit = function(x){
Profit=FuncProfitSample(x) # Draw profit sample of n for given x
ExpectedProfit=mean(Profit) # Mean of sample of profit
return(ExpectedProfit)}
Step 4: Find The Advertising That Maximizes Profit
# Solve for the optimum
bounds=matrix(c(500,1200),nrow = 1,ncol = 2) # advertising range from $500 to $1200
OptAdv = genoud(FuncMeanProfit, nvars=1,max=TRUE,Domains=bounds,boundary.enforcement = 2,
data.type.int = FALSE,max.generations = 300)
Step 5: Obtain Results for Analysis of Optimal Advertising
# View optimal advertising and expected profit at the optimum
OptAdv$par
OptAdv$value
# Simulate profit sample at optimum
SimSampleAtOptimum=FuncProfitSample(OptAdv$par)
# Calculate simulated 90% confidence interval of profit.
hist_CI(SimSampleAtOptimum)
# Predicted carseats sold and 95% CI
# We calculate the predicted log values and then exponentiate.
Pred_Log_Sales=predict(RegKidsafe, data.frame(Price=90,Compprice=80,
Advertising=OptAdv$par,Age=40,Education=12,Income=50,
City=1,Shelveloc="Good"),interval="predict")
exp(Pred_Log_Sales)
Regression Analysis with Non-Experimental Data
- amount of advertising that the company allocates to each store is determined in part by sales in each store
- some store managers provide improved shelf location for products that sell well.
- reverse causality
- the regression model would tend to overstate the effect of advertising on sales
Addressing Reverse Causality
- Investigate how the advertising budget of each store is actually determined.
- If the advertising budget is determined in part by sales, then consider doing experimental variation of the advertising budgets of stores.
- If experimental variation is not feasible, find the algorithm that is used to allocate advertising to stores. This algorithm can potentially be used to separate out the causal effect of advertising on sales from the effect of sales on the advertising budget.