R
The Basics
# Clear the working space
rm(list = ls())
# Load the packages.
library(tprstats)
tprstats::setup()
Hypothesis Testing
Normal Distribution
# Calculate 97.5th quantile of the standard normal distribution
# Find Za/2 for a=xxxxx
qnorm(.975)
qnorm(1-a/2)
# Find the probability of obtaining a value less than 1.9.
pnorm(1.9)
T Distribution
qt(1-a/2,v) # Critical t-value
#Calculate 97.5th quantile of the t distribution with 12 d.f.
qt(.975,12)
Quantile
# View the 75th quantile of Pittsburgh May temperatures
quantile(Pgh_May_Temps$Temperature,.75)
#Calculate the 10th, 50th, and 90th quantiles
quantile(Pgh_May_Temps$Temperature,probs=c(.10,.5,.90))
Basics
#hist(file$data)
hist(Pgh_Daily_Temp_May_1995_2018$Temperature)
hist(xn,breaks = 20, col="red",main="Normal Distribution") # breaks, col, and main set the bins, color, and title
mean(Pgh_Daily_Temp_May_1995_2018$Temperature)
sd(Pgh_Daily_Temp_May_1995_2018$Temperature)
Reject the null hypothesis
- The hypothesized null value is not in the confidence interval.
- The calculated t-statistic is greater than the critical t value.
- The p-value is less than α.
Confidence Interval Interpretation: 100(1-α) percent of samples should have a sample mean within the above bounds.
Interpreting a p-value: If the population coefficient of Sqft is zero, the probability we obtain a coefficient as large or larger in absolute value than 148.39 is 2e-16.
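As a quick sketch with made-up numbers (not from a course dataset), the three rejection rules above always agree; the t-statistic, the p-value, and the confidence interval are three views of the same comparison:
# Illustrative check that the three decision rules agree (hypothetical numbers)
xbar=16.2; mu0=15; s=4.1; n=30              # hypothetical sample summary statistics
tstat=(xbar-mu0)/(s/sqrt(n))                # calculated t-statistic
tcrit=qt(.975,n-1)                          # critical t-value for alpha=.05
pval=2*pt(-abs(tstat),n-1)                  # two-sided p-value
CI=xbar+c(-1,1)*tcrit*s/sqrt(n)             # 95% confidence interval for the mean
c(abs(tstat)>tcrit, pval<.05, mu0<CI[1] | mu0>CI[2])  # all TRUE or all FALSE together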
# hypothesis test for population mean
t.test(Dollar_Bank$TVL,mu=15)
# hypothesis test for population variance
varTest(Dice_Data$X, alternative = "two.sided", conf.level = 0.95, sigma.squared = 2.917)
2*pt(t,df)
# Always use the negative t-value in the above R command, even if your
# calculated t-statistic is positive.
2*pt(-1.205,29)
2*pt(1.205,29,lower=FALSE)
# Draw a sample of size 730 from the uniform distribution on the interval from 0 to 10
xu=runif(730,min=0,max=10)
# Draw a sample of size 650 from the normal distribution with mean 2 and std dev 1
xn=rnorm(650,mean=2,sd=1)
xn=rnorm(1000,2,3)
# Draw a sample of size 1000 from a t distribution with location 2, scale 1.5, df=10
xt=2+1.5*rt(1000,10)
# Exponential Distribution with parameter 1.5
xe=rexp(1000,1.5)
Quality Control
Control Chart
Acceptance Interval
controlChart(chartdata, mu, sig, n, alpha)
controlChart(Theta_data$XBAR,50,.3,25,.05)
Control Chart for Binary Outcome
Acceptance Interval
controlChartBinary(chartdata, expected_defect_rate, n, alpha)
controlChartBinary(Defects_data$d_rate,.06,85,.05)
Out of Control (Per Six Sigma), with a sketch for checking these rules after the list:
- One or more data points fall outside the control limits
- Seven consecutive data points are increasing or decreasing
- Eight consecutive data points are on one side of the average
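A minimal sketch of how these three rules could be checked on a series of sample means; checkControl is a made-up helper, and it assumes (as in the controlChart example above) that mu and sig are the process mean and standard deviation and n is the subgroup size:
# Sketch: flag the three out-of-control signals for a vector of sample means
checkControl=function(chartdata,mu,sig,n,alpha=.05){
z=qnorm(1-alpha/2)
outside=which(chartdata<mu-z*sig/sqrt(n) | chartdata>mu+z*sig/sqrt(n)) # rule 1: points outside the control limits
d=sign(diff(chartdata))
runs7=which(sapply(1:(length(d)-5),function(i) all(d[i:(i+5)]==1)|all(d[i:(i+5)]==-1))) # rule 2: 7 consecutive increasing or decreasing
side=sign(chartdata-mu)
runs8=which(sapply(1:(length(side)-7),function(i) all(side[i:(i+7)]==1)|all(side[i:(i+7)]==-1))) # rule 3: 8 consecutive on one side of center
list(outside_limits=outside,run_of_7_starts=runs7,run_of_8_starts=runs8)
}
checkControl(Theta_data$XBAR,50,.3,25,.05) # same inputs as the controlChart example above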
Testing Differences in Population Means
Simple Regression
sapply(Diamonds_17,mean) # Mean of each variable
sapply(Diamonds_17,sd) # Standard deviation of each variable
summary(Diamonds_17) # Min, max, median, etc of each variable
summaryStats(Diamonds_17) # Includes standard deviations
# regression and summary: variable name = lm(dependent~independent, data=file)
OurReg=lm(Price~Carat,data=Diamonds_17)
summaryH(OurReg) # H -> heteroskedasticity-robust standard errors
summaryHAC(OurReg) # HAC standard errors for time series
# plot
plot(Price~Carat,main="Price vs Carat",xlab="Carat",ylab = "Price",data=Diamonds_17)
CIcoefH(OurReg) # confidence intervals for the coefficients
abline(OurReg,col="red")
# *Two ways to generate plot
plot(Marst_Condos$Sqft,Marst_Condos$Price)
plot(Price~Sqft,data = Marst_Condos)
# plot actual fitted
plotActualFitted(OurReg,Diamonds_17,"Price") # Name of regression, dataset name, and name of dependent variable in quotes
# fit to the regression line
yhat=fitted(OurReg)
cbind(Diamonds_17$Price,yhat) #view side by side and compare
# residual
resid=residuals(OurReg) #calculate residual
plot(resid)
lines(resid) # connecting the points
abline(h=0)
# prediction with upper and lower bound
predict(OurReg, data.frame(Carat=.75),level = .95, interval="predict")
predict(OurReg, data.frame(Carat=c(.62,.74,.81)),level = .95, interval="predict")
# prediction
PredDat=data.frame(Carat=1.5,Clarity=6,Cut=2)
predict(DiamReg,PredDat,level = .95,interval = "predict")
MyPred=data.frame(predict(DiamReg,PredDat,level = .95,interval = "predict"))
# Put actual, fitted, and residual values into a data frame
FitDat=data.frame(Diamonds_17$Price,yhat,resid)
# Write to a CSV file (opens in Excel)
Pred_and_PI=predict(OurReg, data.frame(Carat=c(.62,.74,.81)),level = .95, interval="predict")
write.csv(Pred_and_PI, file = "Our_Predictions.csv")
Simulation
# Example 1
set.seed(10)
x1=runif(5000,10000,20000) # 5000 draws, range 10000 to 20000
x2=runif(5000,10000,20000)
x3=runif(5000,10000,20000)
Amount=x1+x2+x3
hist_CI(Amount)
hist_CI(Amount, alpha=.05) # smaller alpha -> wider interval, less precision
# what is the probability of getting more than $50,000
Gt50K=ifelse(Amount>50000,1,0)
mean(Gt50K)
# Example 2a: draw 3
xbar=0
set.seed(33)
x=runif(30,0,3) # Sample 1
xbar[1]=mean(x) # Mean of Sample 1
x=runif(30,0,3) # Sample 2
xbar[2]=mean(x) # Mean of Sample 2
x=runif(30,0,3) # Sample 3
xbar[3]=mean(x) # Mean of sample 3
# View the three means
xbar
xbar[1:3]
# Example 2b: draw 1000
for (i in 1:1000){
x=runif(30,0,3)
xbar[i]=mean(x)
}
# fit/select a distribution
select_distribution(xbar)
fit_distribution(xbar,"weibull")
#==============================================================================
# Hungry Dawg Example Model 1: Estimates provided by CEO
#==============================================================================
mu =250
N=18533
EmplContrib=125
Claim=0
CC=0
TCC=0
for (t in 2:13) {
N[t]=N[t-1]*1.02
mu[t]=mu[t-1]*1.01
Claim[t]=mu[t]
EmplContrib[t]=125
CC[t]=N[t]*(Claim[t]-EmplContrib[t])
}
TCC=sum(CC[2:13]) #sum
#==============================================================================
# Hungry Dawg Example Model 2: Estimate a distribution and a regression
#==============================================================================
select_distribution(HungryDawg_32months$EmplGro) #obtain a uniform distribution [-0.029, 0.083]
CGro=1+.01
EGroMin = 1-.03
EGroMax = 1+.07
SDC = 3
SDECresid= 3.10
Coef=coefficients(EmplReg)
set.seed(33)
for (r in 1:1000) {
CC=0
Claim=0
for (t in 2:13) {
N[t]=N[t-1]*runif(1,EGroMin,EGroMax)
mu[t]=mu[t-1]*1.01
Claim[t]=rnorm(1,mu[t],SDC)
EmplContrib[t]=Coef[1]+Coef[2]*Claim[t]+rnorm(1,0,SDECresid)
CC[t]=N[t]*(Claim[t]-EmplContrib[t])
}
TCC[r]=sum(CC[2:13])/1e6
} # close the replication loop over r
#==============================================================================
# Hungry Dawg Example Model 3: incorporate coefficient uncertainty
#==============================================================================
set.seed(33)
for (r in 1:1000) {
CC=0
Claim=0
coefD=coefDrawHAC(EmplReg)
# simulate a draw of the coefficients from our regression
# time series data, use coefDrawHAC()
for (t in 2:13) {
N[t]=N[t-1]*runif(1,EGroMin,EGroMax)
mu[t]=mu[t-1]*1.01
Claim[t]=rnorm(1,mu[t],SDC)
EmplContrib[t]=coefD[1]+coefD[2]*Claim[t]+rnorm(1,0,SDECresid)
CC[t]=N[t]*(Claim[t]-EmplContrib[t])
}
TCC[r]=sum(CC[2:13])/1e6
}
# remove missing values
TEMP=na.omit(Fargo_Feb_Temp$TEMP_FARGO)
Module 1
Linear Regression
Least Squares Estimation
Subset
# Select subsets of the data
WatDat34=subset(Water_Conservation_Data,GROUP==3|GROUP==4)
WatUse4_gt30k=subset(Water_Conservation_Data,SUMMER_07>30 & GROUP==4)
WatUse3_4_gt30k=subset(Water_Conservation_Data,SUMMER_07>30 & (GROUP==3|GROUP==4))
Marst_Condos=subset(Boston_Condos_577,Boston_Condos_577$Marst==1)
Test difference in population means using regression
WatReg34=lm(SUMMER_07~TREAT3,data=WatDat34)
summaryH(WatReg34)
Sample Size and Power
# Calculate power
effect_size=2; stdev=5
pwr.t.test(n =50,d =effect_size/stdev,sig.level =.05,power =NULL,type = c("one.sample"))
# Calculate sample size
effect_size=2; stdev=5
pwr.t.test(n =NULL,d =effect_size/stdev,sig.level =.05,power =.85,type = c("one.sample"))
Power
To determine sample size:
- Effect size to be detected
- Power
- Standard deviation
- Significance level
What is this?
- The probability that a statistical test will detect differences when they truly exist
Why is this important?
- You want to have the statistical "muscle" to be able to detect differences between the groups you are studying, or making sure you do not "miss" finding differences
Test Difference in Means
AB_t2n(percent_B = .5, mean_diff = 2., sd_A = 3,
sd_B = 3, sig_level = .05, power = .85, alternative = 'two_sided')
AB_t2n(N = 325, percent_B = 200/325, mean_diff = 5, sd_A = 15.25,
sd_B = 15.25, sig_level = .05, alternative = 'two_sided')
Find the observation numbers of the largest positive residual and the largest negative residual.
which(resid==max(resid))
which(resid==min(resid))
Module 2
Multiple Regression
summaryStats(Diamonds_212) # Mean, median, max, min, stdev
DiamReg=lm(Price~Carat+Clarity,data = Diamonds_212)
Observation=seq(1,212) # integers from 1 to 212
plot(Observation,Diamonds_212$Price,ylab="Prices", main = "Actual (black) and Fitted (red) Prices",col="white")
lines(Observation,Diamonds_212$Price)
lines(Observation,yhat,col="red") # compare
plot(Observation,resid,main = "Residuals",col="white")
lines(Observation,resid)
abline(h=0,col="red") # residual
ActFitResid=data.frame(Diamonds_212$Price,yhat,resid) # data frame
write.csv(ActFitResid,"ActFitResid.csv") # write csv file
“Big Three” for Regression:
- Statistical Significance
- Algebraic Sign
- Magnitude
Interpretation: A one carat increase in weight holding clarity constant would increase price by $11,922.
Adjusted R^2: imposes a penalty for adding a variable
Bias: Verbal and writing skills are positively correlated. When we leave out GMATVERBAL, the GMATWRITING variable attempts to do double duty. To obtain unbiased coefficient estimates, we need to include other variables that belong in the model, even if we are not particularly interested in the coefficients of those variables.
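A small simulated illustration of this point (made-up data and coefficients, not the GMAT file): when a correlated variable that belongs in the model is omitted, the included variable's coefficient absorbs part of its effect.
# Simulated omitted-variable bias (hypothetical data)
set.seed(1)
verbal=rnorm(500)                          # stand-in for GMATVERBAL
writing=.7*verbal+rnorm(500,0,.7)          # writing skill positively correlated with verbal skill
y=2+1*writing+1*verbal+rnorm(500)          # true coefficient of writing is 1
coefficients(lm(y~writing+verbal))["writing"]  # close to 1 when verbal is included
coefficients(lm(y~writing))["writing"]         # well above 1 when verbal is omitted (biased)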
"abline" vs. "lines":
lines(x,y) by default connects the x and y coordinates.
abline(a=NULL,b=NULL,v=NULL,h=NULL) draws a straight line.
abline(a=15,b=1) draws a line with intercept = 15 and slope = 1.
abline(v=12) draws a vertical line at x=12. In the example below, the vertical line is drawn at the mean of the speed variable.
abline(h=10) draws a horizontal line at y=10. In the example below, the horizontal line is drawn at the mean of the dist variable.
lines(cars$speed,cars$dist) draws lines by connecting the data points from the left (like a time series); we used this for drawing control charts. Beyond the scope of this course, lines() can also draw more abstract curves: for example, if you smooth the trend of dist with respect to speed with lowess(), lines() can draw the smoothed curve (of course, the speed variable is not time, so that example is only for illustration).
plot(cars)
reg <- lm(cars$dist~cars$speed)
abline(coef(reg),col='blue')
abline(a=15,b=1,col='red')
abline(v=mean(cars$speed),col='black')
abline(h=mean(cars$dist),col='black')
lines(cars$speed,cars$dist)
lines(lowess(cars$speed,cars$dist),col='green') # we don't use this in class
Multiple Regression: Hypothesis Testing
# Wald test
coefTestH(DiamReg,c("Carat=13000")) # Test the null hypothesis population coefficient of carat = 13,000
The test statistics have (n-K-1) degrees of freedom, where K is the number of right-hand-side variables.
Multiple Regression: Prediction
newcarat=rbind(.8,1.3,2.5) # puts the data into a single column with three rows
newclarity=rbind(7,6,2)
predict(DiamReg, data.frame(Carat=newcarat,Clarity=newclarity), interval="predict")
# 90% CI
predict(DiamReg, data.frame(Carat=newcarat,Clarity=newclarity), level=.90, interval="predict")
# another way: predict from the data in excel
predict(DiamReg,Diamond_Prediction_Data, interval="predict")
Calculate the correlation
with(Ward14_Homes,cor(Liveable_sqft,Totalrm))
Which observation has the largest residual
which(resid==max(resid))
Wine Example
WineReg=lm(log(PRICE)~AGE+SUMTEMP+WRAIN+HRAIN,data = Wine_Quality_Data)
summaryHAC(WineReg)
Log_Pred_and_PI=predict(WineReg,Wine_Quality_Data_for_Prediction,level=.95, interval="predict")
Vintage=Wine_Quality_Data_for_Prediction$YEAR
# The matplot command is used when there are multiple variables to plot on the vertical axis
matplot(Vintage, Log_Pred_and_PI,lty=1, type = "l",lwd=2,
main="Predicted Log Prices and 95% Prediction Interval",
ylab="Prediction and PI")
abline(h=0,lty=2)
Pred_and_PI=exp(Log_Pred_and_PI)
matplot(Vintage, Pred_and_PI,lty=1, type = "l",lwd=2, # lty: vector of line types; lwd: widths
main="Predicted Prices and 95% Prediction Interval",
ylab="Prediction and PI")
abline(h=1,lty=2)
Visualization and 3D
par(mfrow=c(2,2)) # This command will put four plots in two rows and two columns
with(Diamond_E_100,hist(Carat,col="light blue"))
with(Diamond_E_100,hist(Clarity,col="light blue"))
with(Diamond_E_100,hist(Cut, col="light blue"))
with(Diamond_E_100,hist(Price,breaks=18, col="gold"))
par(mfrow=c(1,1)) # This command restores one-plot-at-a-time format
DiamCut2=subset(Diamond_E_100,Diamond_E_100$Cut==2)
go3DPlot("Carat","Clarity","Price",30,30,data = DiamCut2)
go3DPlot("Carat","Clarity","Price",30,11,data = DiamCut2)
Module 3
Heteroskedasticity
Heteroskedasticity is the term that we use for the situation in which variances of the error terms are not the same across all observations.
When heteroskedasticity is present: Coefficient estimates are unbiased. Standard formula for calculating standard errors of coefficients is biased.
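A simulated sketch of both points (made-up data; here the robust standard errors come from the sandwich and lmtest packages, which play the same role as summaryH in tprstats):
# Heteroskedasticity sketch: unbiased coefficients, misleading usual standard errors
library(sandwich); library(lmtest)
set.seed(2)
x=runif(400,0,10)
y=1+2*x+rnorm(400,0,.5*x)                   # error standard deviation grows with x
HetReg=lm(y~x)
coefficients(HetReg)                        # slope estimate is still close to the true value 2
summary(HetReg)$coefficients[,"Std. Error"] # usual standard errors (formula is biased here)
coeftest(HetReg,vcov.=vcovHC(HetReg))       # heteroskedasticity-robust standard errors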
Indicator Variables
Indicator Variables = Dummy Variables = Fixed Effects
WatRegAll_07=lm(SUMMER_07~TREAT1+TREAT2+TREAT3,data=Water_Conservation_Data)
summaryH(WatRegAll_07)
# Wald Test - test null hypothesis that all population treatment effects are the same.
coefTestH(WatRegAll_07,c("TREAT1=TREAT2","TREAT2=TREAT3"))
Interpretation: Intercept: control group (reference category). Slope: difference between the test group and the control group.
Indicator Variables and Continuous Variables
ParcelReg=lm(DELTIM~PARCELS+RB+RC,data = Parcel_Delivery)
summaryH(ParcelReg)
ParcelRegFac=lm(DELTIM~PARCELS+factor(ROUTE_NAME),data=Parcel_Delivery)
summaryH(ParcelRegFac)
ParcelRegFac=lm(DELTIM~PARCELS+factor(ROUTE_NUMBER),data=Parcel_Delivery)
summaryH(ParcelRegFac)
# change the reference category using the "levels" feature
ParcelRegFac=lm(DELTIM~PARCELS+factor(ROUTE_NAME,levels=c("RC", "RA","RB")),
data=Parcel_Delivery)
summaryH(ParcelRegFac)
ParcelRegFac2=lm(DELTIM~PARCELS+factor(ROUTE_NUMBER,levels=c(2,1,3)),
data=Parcel_Delivery)
summaryH(ParcelRegFac2)
Interpretation: Intercept: estimated fixed cost. Slope: difference in fixed time between routes A and B/C. RA: reference category = the "excluded" category.
Scaling of Variables
DiamReg=lm(Price~Carat+Clarity,data = Diamonds_212)
summaryH(DiamReg)
plot(Price~Carat,data = Diamonds_67,pch=20,cex.main=1,main="Linear Equation Does Not Fit Well")
# pch - dots shape; cex.main - title size
abline(LinDiam,col="red",lw=2)
Interpretation:
- Our regression tells us that an increase in clarity by one unit would increase price by $1,301 holding carat weight constant.
- Our regression tells us that a one carat increase in weight holding clarity constant would increase price by $11,922.
scaledCoefficients(DiamReg)
Scale Free:
- Standardized Coefficient: What standard deviation change in y occurs if xj increases by one standard deviation?
  - eg. A one standard deviation increase in clarity increases predicted price by .276 standard deviations of price.
- Elasticity: By what percentage does y increase if xj is increased by 1%?
  - eg. A 1% increase in carat yields a predicted 1.68% increase in price.
Cannot compare coefficients directly, because:
- The magnitude of a coefficient depends on the units in which we measure the variable.
- The impact of a right-hand-side variable on the left-hand-side variable depends on the amount of variation in that right-hand-side variable.
Elasticity
elasticities(DiamReg)
Nonlinear Relationships (Functional Form)
Functional Form: the way in which we transform the data
Quadratic Transformation:
QuadMPG=lm(MPG~Speed+I(Speed^2),data = MPGData)
summaryH(QuadMPG)
QuadFit=fitted(QuadMPG)
par(pty="m")
plot(MPG~Speed,data=MPGData,
main="Quadratic Fits Nicely",pch=16)
lines(MPGData$Speed,QuadFit,col="blue",lw=2)
Interpretation:
- If current speed is 20, a one mph increase in speed will increase fuel efficiency by .59 mpg
- If current speed is 65, a one mph increase in speed will reduce fuel efficiency by .30 mpg
# Calculate the slope
MPGCoefs=coefficients(QuadMPG)
MPGData$Slope=MPGCoefs[2]+2*MPGCoefs[3]*MPGData$Speed
plot(Slope~Speed,data=MPGData,pch=20)
abline(h=0,lty=3)
Logarithmic Transformation: When both the independent and dependent variables are in log form, the coefficient of the independent variable is an elasticity, eg. the predicted price of a diamond increases by 1.65% for each 1% increase in carat. The log-log form is often preferred because it has one fewer coefficient, the constant-elasticity property has much intuitive appeal, and the log function is monotone.
LogDiam67=lm(log(Price)~log(Carat),data=Diamonds_67)
summaryH(LogDiam67)
# prediction
predict(LogDiam67, data.frame(Carat=2),interval="predict",level=.95)
exp(predict(LogDiam67, data.frame(Carat=2),interval="predict",level=.95))
Log_yhat=fitted(LogDiam67)
yhat=exp(fitted(LogDiam67))
# Compare
par(pty="m") # rectangular plot
par(pty="s") # squared plot
plot(Price~Carat,data=Diamonds_67,pch=16,cex.main=.75,cex.lab=.8,cex.axis=.8,
main="Actual and Fitted Prices from Quadratic (Blue) and Log-Log Model (Red)",
xlab="Carat",ylab="Actual and Fitted")
lines(QuadFit~Carat,data=Diamonds_67,col="blue")
lines(yhat~Carat,data=Diamonds_67,col="red")
Module 4
Semi-log Models
If |b1| < .15:
- b1 is the fraction by which y changes when x1 increases by one unit
- 100*b1 is the percent by which y changes when x1 increases by one unit
If |b1| > .15:
- (e^b1 - 1) is the fraction by which y changes when x1 increases by one unit
- 100*(e^b1 - 1) is the percent by which y changes when x1 increases by one unit
If x2 enters in log form:
- b2 is the percent by which y increases when x2 increases by one percent
(A worked check of the |b1| rule follows this list.)
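A quick worked check with arbitrary coefficient values: for a small b1 the approximation 100*b1 is essentially the same as the exact 100*(e^b1 - 1), but for a larger b1 they diverge, which is the reason for the .15 rule of thumb.
# Approximate vs. exact semi-log interpretation (arbitrary coefficient values)
b1=.05; c(approx=100*b1, exact=100*(exp(b1)-1))   # 5.0 vs 5.1 -- nearly identical
b1=.40; c(approx=100*b1, exact=100*(exp(b1)-1))   # 40.0 vs 49.2 -- use the exact formula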
Stepwise Regression
SSE - sum of squared errors
- We run 10 regressions, trying each variable one at a time. We keep the variable that gives the smallest SSE.
- We then run 9 regressions, keeping the first variable we selected and including each of the remaining 9 variables one at a time. As our second variable, we choose the one that, paired with our first variable, gives the smallest SSE.
- ...
- Keep going until there is no remaining variable that reduces the SSE by more than a specified amount. (stopping rule)
Akaike Information Criterion (AIC): AIC = 2(1+k) + n*log(SSE/n), where k denotes the number of variables in the regression and n the number of observations. (A quick numeric check of this formula follows the list below.)
- Our stepwise procedure chooses the variables that minimize the AIC.
- We keep going until there is no additional variable that reduces AIC.
- Tradeoff: Adding a variable reduces SSE but increases k. (Stop adding variables when the reduction of SSE from the additional variable is less than the “cost” of increasing k.)
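As a quick check of the formula above (using the built-in cars data, not course data), the value it produces should match the AIC that step() works with via extractAIC():
# Check the AIC formula against extractAIC() on built-in data
reg=lm(dist~speed,data=cars)
n=nrow(cars); k=1                          # one right-hand-side variable
SSE=sum(residuals(reg)^2)
2*(1+k)+n*log(SSE/n)                       # AIC from the formula in the notes
extractAIC(reg)[2]                         # AIC used by step(); should match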
Stepwise Regression (a decision aid, not a decision maker)
Step_Hosp = step(Reg_Hosp,test='F')
summaryH(Step_Hosp)
Several variables are highly correlated
with(Hospital_Satisfaction,cor(ACCOMODATIONS,ROOMRESTROOM))
# We obtain .668
Sometimes we may want to include variable(s) that R excluded
Step_Hosp2=step(Reg_Hosp,scope=list(lower=OVERALL~ACCOMODATIONS+ROOMRESTROOM, upper=Reg_Hosp), test='F')
Multicollinearity: When two or more right-hand-side variables are sufficiently highly correlated with each other that the data do not yield precise estimates of the coefficients of those variables, we have multicollinearity. (difficult to isolate the effect of individual variables, and difficult to decide which variables to keep and which to remove)
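One optional diagnostic beyond the pairwise correlations shown above is the variance inflation factor from the car package (the same package used for avPlots); this assumes Reg_Hosp is the full hospital-satisfaction regression referenced earlier:
# Optional multicollinearity check (assumes Reg_Hosp exists as above)
library(car)
vif(Reg_Hosp)   # VIFs well above roughly 5-10 are usually taken as a sign of multicollinearity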
Ramsey Test
Ramsey Test - evaluates the function we have chosen Null hypothesis is that the functional form you have chosen is correct.
- the two-fitted-terms version adds ŷ^2 and ŷ^3 and tests the joint null hypothesis that the coefficients of both fitted terms are zero
- low p-values suggest linear model fits poorly
- high p-values suggest the log-log model fits well
- is a decision aid, do not want to be too slavish
- Rejection by the Ramsey test suggests that you should do some investigation of alternative functional forms.
- With time series data, the Ramsey test can often be uninformative
The Ramsey test tests the functional form of the regression. Suppose we ran a regression like y = β0 + β1x1 + β2x2 + ε. The null hypothesis is that this functional form is correct. Let ŷ = b0 + b1x1 + b2x2 be the fitted value for each observation. If the functional form is correct, then the augmented regression y = β0 + β1x1 + β2x2 + β3ŷ^2 + ε should not have a statistically significant β3. The one-fitted-term version adds just ŷ^2 to the original regression; the two-fitted-terms version adds ŷ^2 and ŷ^3. Technical note: because the two-fitted-terms test includes ŷ^2 and ŷ^3, it tests the joint null hypothesis that the coefficients of both terms are 0.
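A hand-rolled version of the two-fitted-terms test, as a sketch of the idea (built-in cars data so it runs on its own; here the joint test is an ordinary F-test via anova rather than the robust test that ramseyTest presumably uses):
# Manual two-fitted-terms Ramsey RESET on built-in data
reg=lm(dist~speed,data=cars)
yhat=fitted(reg)
aug=lm(dist~speed+I(yhat^2)+I(yhat^3),data=cars)   # add squared and cubed fitted values
anova(reg,aug)                                     # joint test that both added coefficients are zero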
ramseyTest(LogDiam67)
Leverage Plots - help us to determine which transformations we might consider
Leverage Plot shows the effect of each right-hand-side variable on y after removing the effects of all other x variables.
avPlots(LinDiam212,pch=16,id=FALSE,col.lines="red")
Models with Indicator Variables and Continuous Variables: Tire Price Application
Find the Outliers
which(resid==min(resid))
Ztires=subset(Tire_Pricing,Tire_Pricing$Rating=="Z")
predict(TireReg, data.frame(Diameter=15,Ratio=65,Terrain=0,Weight=25,
Tread=225,WW=0,OWL=0, H=0,V=0,Z=0,Volume=300),interval="predict",level=.75)
Sensitivity Analysis: remove a variable from the regression model and re-estimate. Then redo the prediction.
Module 5
Models with Product of Variables: Interactions
summary(Water_Conservation_Data) # find median water use in 2006
# Create an indicator variable equal to 1 if household used more than 46 and 0 otherwise
High=ifelse(Water_Conservation_Data$WATER_2006>=46,1,0)
WatDatHighTreat3=subset(Water_Conservation_Data,High==1 & GROUP==3)
mean(WatDatHighTreat3$SUMMER_07)# calculate mean
sd(WatDatHighTreat3$SUMMER_07) # calculate sd
rm(WatDatHighTreat3,WatDatHighControl, WatDatLowTreat3,WatDatLowControl) # remove
WatDat34=subset(Water_Conservation_Data,GROUP==3|GROUP==4) # select treatment 3 and control groups
High=ifelse(WatDat34$WATER_2006>=46,1,0)
RegH_T3=lm(SUMMER_07~High+TREAT3+High*TREAT3,data=WatDat34)
summaryH(RegH_T3)
# Based on High
High=ifelse(Water_Conservation_Data$WATER_2006>=46,1,0)
RegH_All=lm(SUMMER_07~High+TREAT1+TREAT2+TREAT3+High*TREAT1+High*TREAT2+High*TREAT3,
data=Water_Conservation_Data)
summaryH(RegH_All)
# Based on Low
Low=ifelse(Water_Conservation_Data$WATER_2006<46,1,0)
RegL_All=lm(SUMMER_07~Low+TREAT1+TREAT2+TREAT3+Low*TREAT1+Low*TREAT2+Low*TREAT3,
data=Water_Conservation_Data)
summaryH(RegL_All)
# The hypothesis that the difference between high and low users is the same for TREAT1 and TREAT2
coefTestH(RegH_All,c("High:TREAT1=High:TREAT2"))
# Model with only main effects: no interaction; assumes the difference between
# high and low users is the same across the four groups
RegMain=lm(SUMMER_07~High+TREAT1+TREAT2+TREAT3, data=Water_Conservation_Data)
summaryH(RegMain)
Interaction of the variables: product of two variables
b0 (Intercept): mean for "low" users in the control group
b1 (High): difference between "high" and "low" users in the control group
b2 (TREAT3): treatment effect for "low" users
b2+b3: treatment effect for "high" users
b3 (High:TREAT3): difference in treatment effects between "high" and "low" users, the "difference-in-differences"
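As a sketch, the four group means and the treatment effects implied by the interaction model can be read off the estimated coefficients (this assumes RegH_T3 from above; the coefficient order is intercept, High, TREAT3, High:TREAT3):
# Group means implied by the interaction model (assumes RegH_T3 estimated above)
b=coefficients(RegH_T3)
b[1]                    # low users, control group
b[1]+b[2]               # high users, control group
b[1]+b[3]               # low users, treatment 3
b[1]+b[2]+b[3]+b[4]     # high users, treatment 3
b[3]                    # treatment effect for low users
b[3]+b[4]               # treatment effect for high users
b[4]                    # difference-in-differences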
Analysis of Variance (ANOVA): an analysis in which all variables are indicator variables / Estimation procedures (such as the "variation" among and between groups) used to analyze the differences among group means in a sample.
F = between-groups variance / within-groups variance. The larger the ratio, the more likely it is that the groups have different means (reject H0).
Treat2: effect of treatment 2 on "low" users. High:Treat2: difference in the treatment 2 effect between "high" and "low" users. Treat1: effect of treatment 1 on "low" users. High:Treat1: difference in the treatment 1 effect between "high" and "low" users.
Terminology: Factors and Factor Levels. Factor: a category of indicator variables, eg. Experimental Treatment. Factor Levels: subcategories within a category, eg. Control, Treatment 1, Treatment 2, and Treatment 3.
y=ifelse(condition,x1,x2)
# If the condition is true, y is set equal to x1.
# If the condition is false, y is set equal to x2.
Modeling Choice
1) Linear Probability Model
Binary Choice Model: a model with a (0,1) dependent variable. Linear Probability Model: ordinary regression used with a (0,1) left-hand-side variable.
- Pro: coefficients are typically easy to interpret
- eg. An increase in income of $10,000 increases probability of recycling by 1.1 percentage points holding other variables constant.
- Con: predicted probabilities from a Linear Probability Model can sometimes be negative or greater than one
Percentage Change vs. Percentage Point Change
- percentage change: 100*(.30-.25)/.25
- percentage point change: 100*(.30-.25)
2) Logit Model (Logistic Distribution)
Probability of y_i=1 → π_i; probability of y_i=0 → 1 - π_i
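The logit model sets π_i = exp(x_i'b)/(1+exp(x_i'b)); a quick sketch on built-in data (mtcars, not the recycling file) showing that the fitted probabilities are just this transformation of the linear index:
# The logit link: fitted probabilities are exp(xb)/(1+exp(xb)) of the linear index
logit_am=glm(am~hp+wt,family="binomial",data=mtcars)
xb=predict(logit_am)                                  # linear index x'b
head(cbind(exp(xb)/(1+exp(xb)),fitted(logit_am)))     # two identical columns of probabilities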
Margins Command: the average marginal effect
RecylLogit = glm(Recycler~Income+Midwest+Northeast+West+Them+Me+Green+
factor(Recylaw)+factor(Deposlaw),family="binomial",data=Bottle_Recycling_Data)
summary(RecylLogit)
margins(RecylLogit) # marginal effects - interpretation
RecylPredLogit=fitted(RecylLogit)
hist(RecylPredLogit,breaks=100,col="light blue", main="Predicted Probabilities from Logit Model")
Do not include "keeping all other variables constant" when interpreting a nonlinear model. For logit models (or any nonlinear function), it is important to be more specific about how the other variables are being held. The impact of a marginal change in x1 on y may differ when x1=0.5 versus x1=3.0, and it may also depend on the values of other variables x3, x4, etc. So we need to say at which values of the x-variables we are evaluating a given variable's impact on y. The margins() command reports the average marginal effect: the marginal effect is computed at each observation's x-values and then averaged over the sample.
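A sketch of what the average marginal effect is for a continuous variable in a logit model, computed by hand and compared with margins() (built-in mtcars data so it runs on its own):
# Average marginal effect of hp in a logit, by hand vs. margins()
library(margins)
logit_am=glm(am~hp+wt,family="binomial",data=mtcars)
p=fitted(logit_am)                                # predicted probabilities
mean(p*(1-p)*coefficients(logit_am)["hp"])        # marginal effect at each observation, averaged
margins(logit_am)                                 # the hp entry should agree (up to numerical error)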
Interpret: An increase in discount of 1 percentage point increases the predicted probability of purchase by 0.6638 percentage points.
logit classification table
logitClassificationTable(RecylLogit,"Recycler",Bottle_Recycling_Data,p_cutoff=NULL)
• Name of logit model (RecylLogit)
• Name of left-hand-side variable in quotes ("Recycler")
• Name of file containing the data (Bottle_Recycling_Data)
• p_cutoff: if none is provided, p_cutoff is set equal to the average value of the dependent variable (the default), i.e., the fraction of ones in our data
As we increase the p_cutoff:
• The sales team will be asked to try to sell to fewer prospects.
• The sales team's success rate is likely to increase.
• The total number of sales is likely to fall.
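A sketch of how such a table can be built by hand from the fitted probabilities (RecylLogit and Bottle_Recycling_Data from above are assumed; logitClassificationTable adds the formatting and rates on top of this):
# Hand-built classification table using the default cutoff
p_cutoff=mean(Bottle_Recycling_Data$Recycler)         # fraction of ones in the data
pred_class=ifelse(fitted(RecylLogit)>p_cutoff,1,0)    # classify each observation
table(Actual=Bottle_Recycling_Data$Recycler,Predicted=pred_class)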
Interpretation:
- We see above that an increase in income of $10,000 increases the probability of recycling by 1.34 percentage points.
- We see that the strongest recycling law is predicted to increase the probability of recycling by 15.05 percentage points.
3) Probit Model (Normal Distribution)
RecylProbit = glm(Recycler~Income+Midwest+Northeast+West+Them+Me+Green
+factor(Recylaw)+factor(Deposlaw),family=binomial(link="probit"),data=Bottle_Recycling_Data)
summary(RecylProbit)
margins(RecylProbit)
Modeling Choice: Application to Residential Solar Energy Systems
Step 1: Try Linear Probability Model
# Solar sales linear probability model
LPMsolar=lm(Sale~AnnualSave+ElecBill+AnnualCost+DrivesHybrid
+NearbySolar+HomeValue,data=Solar_Data)
summaryH(LPMsolar)
#Test hypothesis that coefficients of AnnualSave and ElecBill are both equal to zero
coefTestH(LPMsolar,c("AnnualSave=0","ElecBill=0"))
# Solar LPM without ElecBill
LPMsolar=lm(Sale~AnnualSave+AnnualCost+DrivesHybrid
+NearbySolar+HomeValue,data=Solar_Data)
summaryH(LPMsolar)
# Calculate fitted values and do a histogram
LPMfit=fitted(LPMsolar)
hist(LPMfit,breaks=20,col="light blue",main="Histogram of Fitted Values
from Linear Probability Model")
Step 2: Try Logit Model
# Now estimate solar sales model using logit
logitsolar = glm(Sale~AnnualSave+ElecBill+AnnualCost+DrivesHybrid+NearbySolar+HomeValue,
family="binomial", data=Solar_Data)
summary(logitsolar)
# Calculate average marginal effects
margins(logitsolar)
# Calculate fitted values and plot a histogram
logitfit=fitted(logitsolar)
hist(logitfit,breaks=100,col="light blue")
# View classification table with default cutoff probability
# Enter name of logit equation, name of dependent variable in the data, data file.
logitClassificationTable(logitsolar,"Sale",Solar_Data,p_cutoff=NULL)
# With p_cutoff set to .10
logitClassificationTable(logitsolar,"Sale",Solar_Data,.10)
# With p_cutoff set to .15
logitClassificationTable(logitsolar,"Sale",Solar_Data,.15)
Prediction 1:
# We can do prediction for homes not in our sample. For example, we can
# predict purchase probabilities for three homes with the following characteristics
AnnualSave=c(2.5,1.9,2.3)
AnnualCost=c(1.8,1.5,1.7)
DrivesHybrid=c(1,0,0)
NearbySolar=c(0,1,0)
HomeValue=c(550,675,495)
# Put above into a data frame named newx
newx=data.frame(AnnualSave,AnnualCost,DrivesHybrid,NearbySolar,HomeValue)
# Calculated predicted probability of purchase
predict(logitsolar, newdata = newx, type = "response")
Prediction 2: Spreadsheet
# Predict values from the Solar_New_Prospects data and name result PredNew
# Notice that we are putting "PredNew" into the Solar_New_Prospects spreadsheet
Solar_New_Prospects$PredNew = predict(logitsolar, newdata = Solar_New_Prospects, type = "response")
#View a histogram
hist(Solar_New_Prospects$PredNew,breaks=20,col="light blue")
# Rank new prospects in order of predicted probability of purchase
# Notice that we are putting "RankNew" into the Solar_New_Prospects spreadsheet
Solar_New_Prospects$RankNew=rank(Solar_New_Prospects$PredNew)
# Plot predicted probability of purchase versus rank
plot(Solar_New_Prospects$RankNew,Solar_New_Prospects$PredNew,main="Predicted Probability of Sale by Rank")
# Write a file named "Ranked_New_Prospects" with predicted probabilities and rankings included
write.csv(Solar_New_Prospects, file = "Ranked_New_Prospects.csv") # saved to the working directory
confint(model, level = .99) # Additional Tip for CI
Choosing Functional Form
Boston Condo Example
# Estimate Linear Regression
Reg_Condos=lm(PRICE~BATH+BECST+BED+BHILL+CDIST+FLOORS+MARST+PARK+SQFT,
data=Boston_Condos_572)
summaryH(Reg_Condos)
# Test functional form.
# Low p-values suggest this may not be a satisfactory functional form
ramseyTest(Reg_Condos)
# Histograms
par(pty="m")
hist(Boston_Condos_572$PRICE,col="light blue")
hist(log(Boston_Condos_572$PRICE),col="light blue")
hist(Boston_Condos_572$SQFT,col="light blue")
# View added variable plots for the two continuous variables
avPlots(Reg_Condos,id=FALSE, "SQFT")
avPlots(Reg_Condos,id=FALSE, "CDIST")
# Investigate functional form
# Include CDIST and SQFT in both linear and log form
Log_Reg_Condos=lm(log(PRICE)~BATH+BED+CDIST+log(CDIST)+FLOORS+
PARK+SQFT+log(SQFT)+BECST+BHILL+MARST,
data=Boston_Condos_572)
summaryH(Log_Reg_Condos)
# Put Stepwise procedure to work
Log_Step_Condos = step(Log_Reg_Condos,test='F')
summaryH(Log_Step_Condos)
# Stepwise dropped BATH variable
# Require that BATH be retained and re-run stepwise procedure
Log_Step_Condos2 = step(Log_Reg_Condos,
scope=list(lower=log(PRICE) ~ BATH,upper=Log_Reg_Condos),
test='F')
summaryH(Log_Step_Condos2)
# Ramsey test
ramseyTest(Log_Step_Condos2)
# Let's test the null hypothesis that the coefficient of log(SQFT) equals 1
coefTestH(Log_Step_Condos2,"log(SQFT)=1")
# Test the hypothesis that the coefficient of BED equals the coefficient of BATH
coefTestH(Log_Step_Condos2,c("BED=BATH"))
# Let's investigate the shape of the function of CDIST
# Calculate the combined effect of terms involving CDIST
CdistTerms=with(Boston_Condos_572,(-.030138*CDIST+.118956*log(CDIST)))
# Express as difference from smallest value of CdistTerms
CdistTerms=CdistTerms-min(CdistTerms)
# Exponentiate
ExpCdistTerms=exp(CdistTerms)
# Plot showing effect of CDIST
plot(ExpCdistTerms~CDIST,pch=16,data = Boston_Condos_572,
main="Variation in Price with CDIST Holding Other Variables Constant")
Writing and Numerically Optimizing Functions Using R
Example 1: Optimizing Simple Quadratic Function
# Write an R function to calculate the following: y=6+4*x-x^2
# I chose name Func_y
# Note inside the function we use x[1].
Func_y=function(x){
y=6+4*x[1]-x[1]^2
return(y)}
# View the value of y when x=3
Func_y(3)
# Limit the range of search to values of x in the interval 1 to 5. Don't change nrow or ncol.
bounds=matrix(c(1,5),nrow = 1,ncol = 2)
# Solve for x to maximize y. The name of our function, Func_y, is the first entry in genoud.
# I chose the name Opt_y for the output.
Opt_y = genoud(Func_y, nvars=1,max=TRUE,Domains=bounds,boundary.enforcement = 2,
data.type.int = FALSE,max.generations = 300)
# View the value of x that maximizes y
Opt_y$par
# View the value of y at the maximum
Opt_y$value
Example 2: Automobile Fuel Efficiency and Speed
# Rename for simplicity to MPGData
MPGData=Fuel_Efficiency_vs_Speed
# Estimate Quadratic Model of MPG vs Speed
QuadMPG=lm(MPG~Speed+I(Speed^2),data = MPGData)
summaryH(QuadMPG)
# Here is an example to remind us how to predict MPG given Speed.
# Let's predict MPG when Speed equals 30 miles per hour
predict(QuadMPG,data.frame(Speed=30))
# Step 1: Fit A Distribution to The Residuals of The Regression #
# Retrieve residuals and fit a distribution to them
resid=residuals(QuadMPG)
select_distribution(resid)
#We obtain: rnorm(n,0,.503)
# Step 2: Simulate 1,000 Draws of MPG for a Given Speed #
# Write a function to simulate MPG including residuals
# Let's use a sample size 1000
# Optimization is done below using this function.
# Setting a seed is essential for the optimization.
SimMPG=function(x){
n=1000
set.seed(33)
MPGPred=predict(QuadMPG,data.frame(Speed=x[1]))
MPGSim=MPGPred+rnorm(n,0,.503)
return(MPGSim)}
# To illustrate the use of the function, simulate MPG when Speed=30.
MPGSim30=SimMPG(30)
# View a histogram of the 1000 simulated values with 90% confidence interval
hist_CI(MPGSim30)
# Step 3: Calculate The Mean of The 1,000 Simulated Values #
# Write a function to define expected MPG
# Notice that this function uses function SimMPG
SimMeanMPG=function(x){
ExpectedMPG=mean(SimMPG(x[1]))
return(ExpectedMPG)}
# To illustrate use of the function, view simulated expected MPG when Speed=30.
SimMeanMPG(30)
# Step 4: Find The Speed That Maximizes MPG #
# Next we use R to find the most fuel-efficient speed.
# Limit the range of search to values of Speed from 20 to 70
bounds=matrix(c(20,70),nrow = 1,ncol = 2)
# Enter SimMeanMPG as first entry in genoud command below
# I chose the name Opt_MPG for the output of the function.
Opt_MPG = genoud(SimMeanMPG, nvars=1,max=TRUE,Domains=bounds,boundary.enforcement = 2,
data.type.int = FALSE,max.generations = 300)
# Step 5: Obtain Results for Analysis of Fuel Efficient Speed #
# View optimal speed that maximizes fuel efficiency
Opt_MPG$par
# View expected MPG at optimal speed
Opt_MPG$value
# Let's find the 90% confidence interval for MPG at the optimal speed
SimSampleAtOptimum=SimMPG(Opt_MPG$par)
#Use the following to display the results:
hist_CI(SimSampleAtOptimum)
# For some applications, you may wish to require integer values of the decision variable.
bounds=matrix(c(20,70),nrow = 1,ncol = 2)
# Set data.type.int=TRUE as shown below. No other change
Opt_MPG = genoud(SimMeanMPG, nvars=1,max=TRUE,Domains=bounds,boundary.enforcement = 2,
data.type.int = TRUE,max.generations = 300)
# View the optimal speed to maximize fuel efficiency. Note the integer value.
Opt_MPG$par
# View the value of expected MPG at optimal speed
Opt_MPG$value
Module 6
Time Series
Time Series Data:
- error terms that are correlated over time
- Heteroskedasticity
- estimating a model using period-to-period changes
Residuals: amplitude appears to vary over time, so there may also be heteroskedasticity.
summaryHAC()
- Heteroscedasticity and Autocorrelation Correction
Newey-West correction
If short-term forecasting → a portion of the error term in a given period may carry over to the next period / improve our forecasts by taking account of such correlation of errors over time
cons -> coffee bean consumption per capita
rpcinc -> per capita real (inflation adjusted) income
rpcofe -> real price of coffee
rpcarb -> real price of carbonated beverages (substitute good)
# Plot coffee consumption per capita
par(pty="m")
plot(Coffee_Data$year,Coffee_Data$cons,type="l",col="blue",
main="Per Capita Consumption of Coffee Beans",
xlab="year",ylab="Pounds of Coffee",ylim=c(6,18)) # set the limits of the y axis
lines(Coffee_Data$year,Coffee_Data$cons,col="blue",lw=2)
# Plot Prices and Per Capita Income
# Note: lw sets line width, pch=NA suppresses plot symbol
plot(Coffee_Data$year,Coffee_Data$rpcofe,col="blue",pch=NA,
main="Variables Affecting Coffee Demand in Our Model",
xlab="year",ylab="Real Prices and Income",ylim=c(0,240));
lines(Coffee_Data$year,Coffee_Data$rpcofe,col="blue",lw=2)
lines(Coffee_Data$year,Coffee_Data$rpcinc,col="dark green",lw=2)
lines(Coffee_Data$year,Coffee_Data$rpcarb,col="red",lw=2)
# The text locator command lets you put text on your graph
text(locator(1),"rpcofe")
text(locator(1),"rpcarb")
text(locator(1),"rpcinc")
# Estimate and view coffee regression.
CoffeeReg=lm(cons~rpcofe+rpcinc+rpcarb,data=Coffee_Data)
summaryHAC(CoffeeReg)
# Calculate and plot residuals from coffee regression
cofresid=residuals(CoffeeReg)
plot(Coffee_Data$year,cofresid,col="black",
main="Residuals by Year",
xlab="year",ylab="Residuals",ylim=c(-1.2,1.2))
lines(Coffee_Data$year,cofresid,col="black",lw=2)
abline(h=0,lw=1,col="blue")
# Calculate elasticities for our coffee model
scaledCoefficients(CoffeeReg)
# Conduct Ramsey test
ramseyTest(CoffeeReg)
# View leverage plots using default setting
par(pty="m")
avPlots(CoffeeReg,id=FALSE,pch=20)
Modeling Correlated Errors: e.g., an increase in price may result in a temporary increase in consumption out of inventories, habit formation may cause some delay in adjusting consumption, or the effect of a shock may persist in subsequent periods until it is fully corrected.
- Each period, a fraction of the error from the previous period is carried forward, and a new random error is added (see the simulation sketch after this list).
- u_t = ρ1*u_(t-1) + ε_t → AR(1), "first-order serial correlation" / first-order autocorrelation / first-order autoregressive model
- u_(t-1) is the error term in our regression last period
- ρ1 is the fraction of that error carried forward to period t
- ε_t is a new random event in period t
- y_t = β0 + β1*x_1t + … + βK*x_Kt + u_t
- u_t = ρ1*u_(t-1) + ρ2*u_(t-2) + ε_t (2nd order)
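A small simulation of the AR(1) error process above with a made-up ρ1 = .7, showing how a fraction of each period's error carries into the next period:
# Simulate AR(1) errors: u_t = rho1*u_(t-1) + e_t
set.seed(33)
rho1=.7; Tn=200
eps=rnorm(Tn)
u=numeric(Tn)
for (t in 2:Tn) u[t]=rho1*u[t-1]+eps[t]
plot(u,type="l",main="Simulated AR(1) Errors (rho1 = .7)")
cor(u[-1],u[-Tn])                   # first-order autocorrelation, roughly .7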
Estimating Models: summaryArima()
-> Summary for Regression with Autoregressive Errors
MyReg=with(MyData, Arima(y, xreg=cbind(x1,x2,x3), order=c(1,0,0), include.constant = TRUE))
summaryArima(MyReg)
# 1st Order
cof_arima=with(Coffee_Data, Arima(cons, xreg=cbind(rpcofe,rpcinc,rpcarb), order=c(1,0,0), include.constant = TRUE))
summaryArima(cof_arima)
# 2nd Order
cof_arima2=with(Coffee_Data, Arima(cons, xreg=cbind(rpcofe,rpcinc,rpcarb), order=c(2,0,0), include.constant = TRUE))
summaryArima(cof_arima2)
Forecast (the model with correlated errors is better most of the time)
MyForecast=forecast(cof_arima,xreg = cbind(83.04195,39.20158,67.11273))
summary(MyForecast)
proceed cautiously if...
- AR(1) coefficient (ρ1) is close to one
- a lagged dependent variable with coefficient close to one
- → In such circumstances, it may be appropriate to estimate the model with all variables expressed in changes rather than levels.
Forecasting
In a production setting, you might know some right-hand-side variables and your managerial decisions might determine other variables. 1) Forecast future values of the right-hand-side variables, or 2) Use values from several periods in the past as predictors.
# Let's select observations on housing starts from 1990 onward for some visual displays
Houst90=subset(HousDat$HOUST,Year>=1990)
# The date will then appear automatically on the horizontal axis when we do our plot
tsHoust90=ts(Houst90,frequency = 12, start = 1990)
# Plot Housing Starts
plot(tsHoust90,col="blue",lw=2,ylab="Housing Starts",main="Monthly Housing Starts")
#Histogram of Housing Starts
hist(Houst90,breaks=30,col="light blue",xlab="Housing Starts", main="Histogram of Housing Starts")
# Regression of starts on starts one month earlier
HousRegLag=lm(Houst90~lag(Houst90,1))
summaryHAC(HousRegLag)
# calculate fitted values
yhat=fitted(HousRegLag)
# For plotting, create time series version of yhat
tsyhat90=ts(yhat,frequency = 12,start = 1990)
# Plot actual housing starts and starts predicted by HousRegLag
plot(tsHoust90,col="blue",lw=2,ylab="Housing Starts",main="Monthly Housing Starts Actual (blue) and Predicted (red)")
lines(tsyhat90,col="red",lw=1)
# Choose data from before the financial crisis, 1961 through 2006
HousDat1=subset(HousDat,Year<=2006)
Hreg1=lm(HOUST_PCH~lag(AWHMFG,6)+lag(PERMIT,6)+
lag(TREASSPREAD,6)+lag(INDPRO,6)+
lag(HOUST_PCH,6),data=HousDat1)
summaryHAC(Hreg1)
# View standardized coefficients
scaledCoefficients(Hreg1)
# Use all data from 1961 to March 2019
# Estimate model to predict HOUST_PCH
HregAll=lm(HOUST_PCH~lag(AWHMFG,6)+lag(PERMIT,6)+
lag(TREASSPREAD,6)+lag(INDPRO,6)+
lag(HOUST_PCH,6),data=HousDat)
summaryHAC(HregAll)
# In an application you would need to create the counterpart of HousPredDat
Pred_and_PI=predict(HregAll,HousPredDat,level=.95, interval="predict")
Levels vs. changes:
- The value of housing starts at a given point in time is the level of housing starts (April housing starts).
- The difference between two points in time is the change in starts (the change from March to April). It is often more appropriate to measure the rate of change, either:
- by percentage: asymmetric in direction. If percentage changes are not too big, percentages are sometimes used because they are easier for an audience to understand.
  - eg. We will use the percentage change in starts in month t relative to the same month a year earlier. This is very useful for seasonal data because we benchmark against the same time a year earlier.
- by (natural) logarithms: symmetric in direction, since log(2.2)-log(2) = -(log(2)-log(2.2)).
(A numeric illustration of the asymmetry follows this list.)
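A quick numeric illustration of the asymmetry point, using the arbitrary values 2 and 2.2:
# Percentage changes are asymmetric in direction; log changes are not
100*(2.2-2)/2          #  10.00 percent going up from 2 to 2.2
100*(2-2.2)/2.2        #  -9.09 percent going back down from 2.2 to 2
log(2.2)-log(2)        #  0.0953 going up
log(2)-log(2.2)        # -0.0953 going back down: same magnitude, opposite sign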
• INDPRO: Industrial Production Index, Percent Change from Year Ago, Seasonally Adjusted
• PERMIT: New Private Housing Units Authorized by Building Permits, Percent Change from Year Ago, Seasonally Adjusted Annual Rate
• AWHMFG: Average Weekly Hours of Production and Nonsupervisory Employees: Manufacturing, Percent Change from Year Ago, Seasonally Adjusted
• Treasury Spread (TREASSPREAD): 10yr yield minus Federal Funds Rate, annualized
Suppose we have a monthly series in levels named X. We wish to calculate the percentage change relative to 12 months earlier.
X_PCH=100*((X-lag(X,12))/lag(X,12))
y_(t+6) = b0 + b1*x_t
# Create HousDat2, which has data from 1961 through 2015
HousDat2=subset(HousDat,Year<=2015)
Hreg2=lm(HOUST_PCH~lag(AWHMFG,6)+lag(PERMIT,6)+
lag(TREASSPREAD,6)+lag(INDPRO,6)+
lag(HOUST_PCH,6),data=HousDat2)
summaryHAC(Hreg2)
# Prepare Variables for Plotting for years 2015-2019
HPred2=predict(Hreg2, HousDat,level = .95, interval="predict")
HPred2=data.frame(HPred2)
HPred2=subset(HPred2,Year>=2015 & Year<=2019)
tsHPred2=ts(HPred2,frequency = 12,start=2015,end=2019)
Houst2=subset(HousDat$HOUST_PCH,Year>=2015 & Year<=2019)
tsHoust2=ts(Houst2,frequency = 12,start = 2015, end=2019)
# Plot actual and predicted with prediction interval for 2015-2019
ts.plot(tsHPred2,tsHoust2, col=c("red","blue","blue","black"),
ylab="Housing Starts",
main="Actual (black) and Predicted (red) Starts with 95% Pred. Interval")
# Prepare Variables for Plotting for years 2006 to 2016
HPred1=predict(Hreg1, HousDat,level = .95, interval="predict")
HPred1=data.frame(HPred1)
HPred1=subset(HPred1,Year>=2006 & Year<=2015)
tsHPred1=ts(HPred1,frequency = 12,start=2006)
Houst1=subset(HousDat$HOUST_PCH,Year>=2006 & Year<=2015)
tsHoust1=ts(Houst1,frequency = 12,start = 2006)
# Plot actual and predicted with prediction interval for 2006-2016
ts.plot(tsHPred1,tsHoust1, col=c("red","blue","blue","black"),
ylab="Housing Starts",
main="Actual (black) and Predicted (red) Starts with 95% Prediction Interval")
All variables except TREASSPREAD are expressed as percent changes, so the coefficients of all variables except TREASSPREAD are elasticities. The coefficients in our model capture a combination of demand and supply effects; this is a "reduced form" model.
Estimation, Simulation, and Optimization
• Estimation: Obtaining estimates of parameters required for our decision.
• Simulation: Capturing the effects of uncertainty.
• Optimization: Choosing the optimal policy.
Step 1: Fit A Distribution to The Residuals of The Regression
# Estimate Regression
RegKidsafe=lm(log(Sales)~log(Price)+log(Compprice)+log(Advertising)
+Age+Education+Income+City+factor(Shelveloc),data=Kidsafe)
summaryH(RegKidsafe)
# Obtain residuals
resid=residuals(RegKidsafe)
# Select a distribution to fit residuals
# We find normal with mean=0 and standard deviation .169
select_distribution(resid)
# Standard deviation from select_distribution command
sd=.169
Step 2: Simulate 1,000 Draws of Profit for a Given Advertising
# Function to draw a sample of n observations of Profit
# Note two places where advertising, x[1], appears in the function.
FuncProfitSample = function(x){
n = 1000
set.seed(33)
Pred_Log_Sales=predict(RegKidsafe, data.frame(Price=90,Compprice=80,
Advertising=x[1],Age=40,Education=12,Income=50,
City=1,Shelveloc="Good"))
Sim_Log_Sales=Pred_Log_Sales+rnorm(n,0,sd)
Sales=exp(Sim_Log_Sales)
Profit=10*Sales-x[1]
return(Profit)}
Step 3: Calculate The Mean of The 1,000 Simulated Values
# We wish to calculate expected profit.
FuncMeanProfit = function(x){
Profit=FuncProfitSample(x) # Draw profit sample of n for given x
ExpectedProfit=mean(Profit) # Mean of sample of profit
return(ExpectedProfit)}
Step 4: Find The Advertising That Maximizes Profit
# Solve for the optimum
bounds=matrix(c(500,1200),nrow = 1,ncol = 2) # advertising range from $500 to $1200
OptAdv = genoud(FuncMeanProfit, nvars=1,max=TRUE,Domains=bounds,boundary.enforcement = 2,
data.type.int = FALSE,max.generations = 300)
Step 5: Obtain Results for Analysis of Optimal Advertising
# View optimal advertising and expected profit at the optimum
OptAdv$par
OptAdv$value
# Simulate profit sample at optimum
SimSampleAtOptimum=FuncProfitSample(OptAdv$par)
# Calculate simulated 90% confidence interval of profit.
hist_CI(SimSampleAtOptimum)
# Predicted carseats sold and 95% CI
# We calculate the predicted log values and then exponentiate.
Pred_Log_Sales=predict(RegKidsafe, data.frame(Price=90,Compprice=80,
Advertising=OptAdv$par,Age=40,Education=12,Income=50,
City=1,Shelveloc="Good"),interval="predict")
exp(Pred_Log_Sales)
Regression Analysis with Non-Experimental Data
- amount of advertising that the company allocates to each store is determined in part by sales in each store
- some store managers provide improved shelf location for products that sell well.
- reverse causality
- the regression model would tend to overstate the effect of advertising on sales
Addressing Reverse Causality
- Investigate how the advertising budget of each store is actually determined.
- If the advertising budget is determined in part by sales, then consider doing experimental variation of the advertising budgets of stores.
- If experimental variation is not feasible, find the algorithm that is used to allocate advertising to stores. This algorithm can potentially be used to separate out the causal effect of advertising on sales from the effect of sales on the advertising budget.