#This script contains a variety of functions for estimating total cost models (using iterated SUR) and benchmarking firms in a given sample #Currently, no custom classes are included in this script. Model results are contained in lists, which can prove somewhat unwieldy. #At some point in the future, it would be advisable to design a class for the model returned by hetsur as well as default summary and print methods #Any number of factors of production may be used with all of these functions #This function meanscales selected variables. #Forecast values for certain companies may be included in the input dataset but excluded from the calculation of the means #by using the incfc and fcvar options. #The function returns a list containing two dataframes, #the mean-scaled data and the list of means used to scale the data. #The verbose option is provided for debugging purposes, changing it from its default to T will cause #the function to print the class of the each variable being mean scaled, its name, and its mean, #as the function loops over the variables specified by the user. meanscale <- function(fulldata, varlist, verbose = F, incfc = F, fcvar = NULL) { #Create data frame in which to store the means of the variables. meanframe <- data.frame( varname = varlist, varmean = rep(0, length(varlist)), stringsAsFactors = F ) #Initialize vector of means tempmeans <- NULL #Create temporary datasets. This distinction is only meaningful if the incfc and fcvar options are specified. newdata <- fulldata meandata <- fulldata #Check if forecast values have been included in the input data frame, and if so, drop them from the dataset used to calculate the sample means. if (incfc == T) { fcvarmark <- which(names(fulldata) == fcvar) tempdrop <- which(fulldata[, fcvarmark] == 1) meandata <- meandata[-tempdrop, ] rm(tempdrop) } #Loop over variables to be mean-scaled for (i in varlist) { #Get location of current variable in the data frame varmark <- which(names(meandata) == i) if (verbose == T) { print(class(meandata[, varmark])) } #Calculate the mean of the current variable, excluding missing values from this calculation. tempmean <- mean(meandata[, varmark], na.rm = T) if (verbose == T) { print(c(i, tempmean)) } #Append the mean of the current variable to the vector of means tempmeans <- c(tempmeans, tempmean) #Scale all observations data for the current variable (not necessarily just the data used to calculate the mean) by the sample mean. newdata[, varmark] <- fulldata[, varmark] / tempmean #Remove temporary variables from memory before next cycle of loop. rm(tempmean, varmark) } #Fill the column of the data frame of variable means with the vector constructed during the above loop. meanframe$varmean <- tempmeans #Remove temporary objects from memory. rm(tempmeans, meandata) #Create list of outputs. outputmean <- list(newdata, meanframe) #Return outputs. return(outputmean) } #This function logs selected variables #The verbose option is provided for debugging purposes. Changing it from its default to T will #cause the function to print the variable name and class as it loops over the variables specified by the user. logvars <- function(fulldata, varlist, verbose = F) { #Create local data frame for new data. newdata <- fulldata varinfo <- data.frame( varname = varlist, check0 = rep(0, length(varlist)), stringsAsFactors = F ) #Loop over variables to be logged for (i in 1:length(varlist)) { tempvar <- varlist[i] #Get location of current variable in input data frame varmark <- which(names(fulldata) == tempvar) #check if there are any zero values in the current variable check0 <- ifelse(length(which(fulldata[, varmark] == 0)) > 0, 1, 0) #print information on current variable if verbose was specified as true if (verbose == T) { print(c(tempvar, class(fulldata[, varmark]), check0)) } #Replace current variable in the local data frame with the natural logarithm of that variable in the input data frame. #If that variable has any zero values, add 1 to all values before logging. #NB: DO NEED TO ADD 1!! DOES IT FOR YOU! if (check0 == 0) { newdata[, varmark] <- log(fulldata[, varmark]) } else{ newdata[, varmark] <- log(1 + fulldata[, varmark]) } #Remove temporary variables before next cycle of loop. varinfo$varname[i] <- tempvar varinfo$check0[i] <- check0 rm(varmark, check0, tempvar) } #Return output data frame. outputlogs <- list(newdata, varinfo) return(outputlogs) } #hetsur is a regression routine that has been almost entirely translated directly from gauss to R #The structure of the code follows the original gauss code found in SUR3HUP.src extremely closely #Annotations are included to help the user understand the code, but not to explain why the routine is carried out in this way #It is likely that the program could be improved somewhat by taking advantage of features of R not available in Gauss #The arguments of this program are as follows: # 1) lhs_vars: a vector containing the string names of the the dependent variables, in order (cost should always go first) # 2) rhs_vars: a vector containing the string names of each set of independent variables # (order of price variables in share equations should match order of factors in lhs_vars). # The sets should be ordered by equation (as defined by the order of lhs_vars) and variables should be repeated as necessary. # 3) num_var: the number of independent variables in each equation (also ordered according to lhs_vars) # 4) restrict.R: the left hand side of the matrix defining restrictions on the coefficients. # Columns in restrict.R correspond to the similarly indexed variable in rhs_vars. # For Total Cost Models, this is used to force coefficients in different equations to # be equal to one another following shephard's lemma. # 5) restrict.z: the right hand side of the matrix defining restrictions on the coefficients. # 6) ac: whether an autocorrelation correction should be performed prior to estimation and if so, what kind. # "none" results in no autocorrelation correction. # "ar1" results in an autocorrelation correction using the same rho for all companies # "psar1" results in an autocorrelation correction using different rhos for each company # It should be noted that the Gauss code and this R code do not use the same estimation method for a # single autocorrelation coefficient and the panel-specific autocorrelation coefficients # 7) het: whether a heteroskedasticity correction should be applied prior to estimation (T = yes, F = no) # 8) lrdv: Determines which divisor is used to compute the covariance matrix of the error # If 0, T-(K/M) is used as divisor, where T is thenumber of observations, K is the number of all # right-hand side variables in the systems,and M is the total number of equations. # Hence,(K/M) is the average number of coefficientsper equation. # If 1, T is used as divisor. Users are encouraged to use this, since it provides good asymptotic properties for the estimator. #lriter: maximum number of iterations of the routine without convergence before returning estimates #lrtol: the threshold for determining convergence #verbose: prints progress of determinant of sigma matrix #The object it returns contains a list of the following: # 1) Dependent.Variables: a vector the strings of the dependent variables in the model # 2) Models: a list of data frames containing the estimation results (coefficients, standard errors, t statistics, and p values) # 3) Covariance.Matrix: the covariance matrix of the estimated parameters # 4) Correlation.Matrix: the correlation matrix of the estimated parameters # 5) Sigma.Matrix: the matrix of equation variances/covariances # 6) Model.Statistics: a list of various model statistics # 7) Transformed.Data: the data after the application of the heteroskedasticity and autocorrelation corrections # 8) Estimation.Parameters: a list of parameters used in estimating the model # (important for replication later during out of sample reestimation) # 9) The following parameters are recorded: restrict.R, restrict.z, het, ac, lrdv, lriter, lrtol # 10) Original.Data: the data before the application of the heteroskedasticity and autocorrelation corrections hetsur <- function(lhs_vars, rhs_vars, num_var, data, idvar, tvar, restrict.R, restrict.z, het = T, ac = c("none", "ar1", "psar1"), lrdv = 1, lriter = 100, lrtol = .0001, verbose = T) { #add new variables, as necessary to the full data frame. #AV: function add interaction terms alldata <- addvars(data = data, rhs_vars = rhs_vars) #check which columns in the data are numeric and reduce the data set to only those #NEW 072016 numtest <- sapply(alldata, is.numeric) nummark <- which(numtest == T) data <- alldata[, nummark] #convert data to matrix dta <- as.matrix(data) #set values of objects involved in applying the restrictions during estimation R <- restrict.R z <- restrict.z #find the locations (column indexes) of the dependent and independent variables in the full data frame. lhslocs <- match(lhs_vars, names(data)) rhslocs <- match(rhs_vars, names(data)) #find the locations (column indexes) of the id and time variables in the full data frame idvarloc <- match(idvar, names(data)) tvarloc <- match(tvar, names(data)) #order the full data by id, then time dta <- dta[order(dta[, idvarloc], dta[, tvarloc]), ] #create a table of how many observations appear for each id number, convert it to a dataframe, #and rename its columns appropriately idtlist <- table(data[, idvarloc]) idtframe <- as.data.frame(idtlist) names(idtframe) <- c(idvar, "apps") #apps for appearnces class(idtframe[, 2]) <- "numeric" #record how many firms are in the dataset N_firms <- dim(idtframe)[1] #calculate the values of the matrix x`x and x`y. #AV: %*% is matrix multiplication operator xx <- t(dta[, rhslocs]) %*% dta[, rhslocs] xy <- t(dta[, rhslocs]) %*% dta[, lhslocs] #create empty matrices for old x data, new x data, and lagged x data dtax <- matrix(0, dim(dta)[1], dim(xx)[2]) dtax1 <- matrix(0, dim(dta)[1], dim(xx)[2]) dtaxL <- matrix(0, dim(dta)[1], dim(xx)[2]) #create empty matrices for old y data, new y data, and lagged y data dtay <- matrix(0, dim(dta)[1], length(num_var)) dtay1 <- matrix(0, dim(dta)[1], length(num_var)) dtayL <- matrix(0, dim(dta)[1], length(num_var)) #initialize hetloop (this variable tracks progress in the calculation and application of heteroskedasticity #and/or autocorrelation corrections) hetloop <- 1 #enter loop for calculation and application of heteroskedasticity and/or autocorrelation corrections while (hetloop <= 2) { #check whether loop has been completed once and therefore heteroskedsticity weights #and/or autocorrelation coefficients are available #if so, enter a block of code which applies the corrections if (hetloop == 2) { #initialize variables marking the columns of the x data currently in use lb <- 1 ub <- 0 #initialize the variable marking the column of the y data currently in use i <- 1 #loop over columns of y data, stop when column index exceeds the number of equations #AV:g is defined below since this part is not run during the first iteration of the loop. while (i <= g) { #properly increment the upper bound of the columns currently in use for the x data (incremented by the number of variables in the equation at hand) ub <- num_var[i] + ub #initialize a variable tracking the firm currently in use k <- 1 #initialize variables marking the rows of data corresponding to the firm currently in use lbk <- 1 ubk <- 0 while (k <= N_firms) { #properly increment the upper bound of the rows currently in use #(incremented by the number of observations for the current firm in the data) ubk <- ubk + idtframe$apps[k] #divide each column of the rows of x data currently selected by the corresponding heteroskedasticity #weights for the equation/firm in question dtax[lbk:ubk, lb:ub] <- sweep(dta[lbk:ubk, rhslocs[lb:ub]], 1, wts[lbk:ubk, i], `/`) #create lagged x data with a row of zeroes in the initial year dtaxL[lbk:ubk, lb:ub] <- rbind(rep(0, ub - lb + 1), dtax[lbk:(ubk - 1), lb:ub]) #perform autocorrelation correction for the first year in which the firm appears in the data set dtax1[lbk, lb:ub] <- sqrt(1 - rhoi[lbk, i] ^ 2) * dtax[lbk, lb:ub] #check whether there is more than one year after the first year for the current firm #and then use the appropriate function to carry out the autocorrelation correction in other years. #this distinction is necessary because sweep() does not work if only one row of data is given to it if (lbk + 1 < ubk) { dtax1[(lbk + 1):ubk, lb:ub] <- dtax[(lbk + 1):ubk, lb:ub] - sweep(dtaxL[(lbk + 1):ubk, lb:ub], 1, rhoi[(lbk + 1):ubk, i], `*`) } if (lbk + 1 == ubk) { dtax1[(lbk + 1):ubk, lb:ub] <- dtax[(lbk + 1):ubk, lb:ub] - dtaxL[(lbk + 1):ubk, lb:ub] * rhoi[(lbk + 1):ubk, i] } #dtay[,i] <- dta[,lhslocs[i]]/wts[,i] CHANGED AS BELOW #divide each the rows of y data currently selected by the corresponding heteroskedasticity weights #for the equation/firm in question dtay[lbk:ubk, i] <- dta[lbk:ubk, lhslocs[i]] / wts[lbk:ubk, i] #create lagged y data with a zero for the first year in which the firm appears in the data set dtayL[lbk:ubk, i] <- c(0, dtay[lbk:(ubk - 1), i]) #perform the autocorrelation correction for the first year in which the firm appears in the data set dtay1[lbk, i] <- sqrt(1 - rhoi[lbk, i] ^ 2) * dtay[lbk, i] #apply the autocorrelation correction in the remaining years dtay1[(lbk + 1):ubk, i] <- dtay[(lbk + 1):ubk, i] - (rhoi[(lbk + 1):ubk, i] * dtayL[(lbk + 1):ubk, i]) #increment the variable tracking the firm currently in use k <- k + 1 #increment the variable tracking the lower bound of the rows currently in use (incremented to the upper bound of the rows just corrected plus 1) lbk <- ubk + 1 } #increment the variable tracking the lower bound of the columns of x data currently in use (incremented to the upper bound of the columns just corrected plus 1) lb <- ub + 1 #increment the variable tracking the equation in use i <- i + 1 } #DELIF EXPRESSIONS OMITTED???? #calculate new x`x and x`y matrices using corrected data xx <- t(dtax1) %*% dtax1 xy <- t(dtax1) %*% dtay1 #exit section of code that corrects data, began with if(hetloop==2) } #since hetloop always starts off at 1, this will be the first section of code actually executed when the program runs #get the number of equations g <- length(num_var) #initialize the matrix of coefficients b <- matrix(0, length(rhs_vars), 1) #initialize variables marking the columns of the x data currently in use lb <- 1 ub <- 0 #initialize the variable marking the column of the y data currently in use i <- 1 #loop over columns of y data, stop when column index exceeds the number of equations while (i <= g) { #properly increment the upper bound of the columns currently in use for the x data (incremented by the number of variables in the equation at hand) ub <- num_var[i] + ub #get inverse of x`x matrix ixxtemp <- solve(xx[lb:ub, lb:ub]) #calculate coefficients for the equation currently in use b[lb:ub] <- ixxtemp %*% xy[lb:ub, i] #increment the lower bound of the columns of x data currently in use lb <- ub + 1 #increment the column of y data currently in use i <- i + 1 } #check whether heteroskedasticity and autocorrelation corrections have been performed and, #if so, apply restrictions to estimated coefficients if (hetloop == 2) { c <- matrix(0, length(rhs_vars), length(rhs_vars)) lb <- 1 ub <- 0 i <- 1 while (i <= g) { ub <- num_var[i] + ub c[lb:ub, lb:ub] <- xx[lb:ub, lb:ub] lb <- ub + 1 i <- i + 1 } c <- solve(c) invRcR <- solve(R %*% c %*% t(R)) b <- b - (c %*% t(R)) %*% invRcR %*% (R %*% b - z) } #check whether heteroskedasticity and autocorrelation corrections have been performed and, #if not, calculate the groupwise heteroskedasticity weights and autocorrelation coefficient(s) to be used if (hetloop == 1) { #initialize matrix of residuals eols <- matrix(0, dim(dta)[1], g) #initialize matrix of lagged residuals eols1 <- matrix(0, dim(dta)[1], g) #initialize matrix of heteroskedasticity weights (nobs x neqs) gh_wt <- matrix(0, dim(dta)[1], g) #initialize matrix of heteroskedasticity weights (n_firms x neqs) gh_wt0 <- matrix(0, N_firms, g) #initialize matrix of firm durbin watson components ddwf <- matrix(0, N_firms, g) #initialize vector of durbin watson statistics ddw <- matrix(0, g, 1) #initialize matrices of autocorrelation coefficients rhoi <- matrix(0, dim(dta)[1], g) rho2a <- matrix(0, 1, g) rho1 <- matrix(0, dim(dta)[1], g) rho2 <- matrix(0, dim(dta)[1], g) #initialize a variable marking the equation (column of y data) currently in use i <- 1 #initialize variables marking the columns of the x data currently in use lb <- 1 ub <- 0 #loop until index exceeds number of columns/equations it is indexing while (i <= g) { #properly increment upper bound of columns of x data currently in use ub <- num_var[i] + ub #initialize variable marking the firm currently in use k <- 1 #initialize variables marking the rows corresponding to the firm currently in use #LBK AND UBK ARE NOT USED(?) lbk <- 1 ubk <- 0 lbkk <- 1 ubkk <- 0 while (k <= N_firms) { #properly increment upper bound of rows currently in use ubk <- ubk + idtframe$apps[k] - 1 ubkk <- ubkk + idtframe$apps[k] #generate residuals for the current dependent variable and company eols[lbkk:ubkk, i] <- dta[lbkk:ubkk, lhslocs[i]] - dta[lbkk:ubkk, rhslocs[lb:ub]] %*% b[lb:ub] #TWO STEPS NECESSARY HERE? #calculate company specific variance and assign its square root to the relevant components of the group heteroskedasticity weight matrices gh_wt0[k, i] <- sqrt(mean(eols[lbkk:ubkk, i] ^ 2) - mean(eols[lbkk:ubkk, i]) ^ 2) gh_wt[lbkk:ubkk, i] <- gh_wt0[k, i] #generate lagged residuals with a 0 in the initial position eols1[lbkk:ubkk, i] <- c(0, eols[lbkk:(ubkk - 1), i]) ##IS DW STAT INCORRECT??? why not sum((eols[(lbkk+1):ubkk,i]-eols1[lbkk:(ubkk-1),i])^2)-eols[lbkk:ubkk,i]^2 #calculate company specific durbin watson component ddwf[k, i] <- sum((eols[lbkk:ubkk, i] - eols1[lbkk:ubkk, i]) ^ 2) - sum(eols[lbkk, i] ^ 2) #calculate company specific autocorrelation coefficient (corresponds to scorr in panelAR) rho1[lbkk:ubkk, i] <- sum(eols[lbkk:ubkk, i] * eols1[lbkk:ubkk, i]) / sum(eols[lbkk:ubkk, i] ^ 2) #increment index tracking the company currently in use k <- k + 1 #increment lower bounds of rows that correspond to the company currently in use lbk <- ubk + 1 lbkk <- ubkk + 1 #calculate full sample common autocorrelation coefficient, #this gets redone repeatedly until it is done properly at the very end of the loop ddw[i] <- sum(ddwf[, i]) rho2a[, i] <- 1 - .5 * (ddw[i] / sum(eols[, i] ^ 2)) rho2[, i] <- rho2a[, i] } #increment lower bound of columns of x data currently in use lb <- ub + 1 #increment equation/dependent variable/column of y data currently in use i <- i + 1 } #check input parameter het and set wts according to the result #if het is false, all observations will just be divided by one during the next loop cycle, changing nothing #if het is true, all observations will be divided by the appropriate weight if (het == T) { wts <- gh_wt } if (het == F) { wts <- matrix(1, dim(gh_wt)[1], dim(gh_wt)[2]) } #check input parameter ac and set weight according to the result #if ac is "none", the the autocorrelation coefficient will be set to 0, #which will result in no changes when the correction is applied in the enxt cycle of the loop if (ac == "psar1") { rhoi <- rho1 } if (ac == "ar1") { rhoi <- rho2 } if (ac == "none") { rhoi <- matrix(0, dim(dta)[1], g) } } #increment the variable tracking progress in applying the autocorrelation and heteroskedasticity corrections hetloop <- hetloop + 1 } #create an object containing the transformed dataset hdataset <- cbind(dtay1, dtax1) #initialize a variable tracking the current iteration iter <- 0 #get the sigma matrix for the current parameter values sig <- LRsseh( lhs_vars = lhs_vars, rhs_vars = rhs_vars, num_var = num_var, b = b, data = hdataset, g = g ) #calculate and display the log of the determinant of the sigma matrix (divided by the number of observations) if (lrdv == 1) { sig <- sig / dim(hdataset)[1] } lnsig_o <- log(det(sig)) if (verbose == T) { print(paste("ITERATION ", iter, sep = "")) print(paste("LOG OF DETERMINANT OF SIGMA = ", lnsig_o, sep = "")) } #store the absolute value of the log of the determinant of sigma pcd <- abs(lnsig_o) #copy lriter into a new variable that denotes the maximum number of iterations maxiter <- lriter #initialize a loop that will end if either: #1. the number of iterations exceeds the maximum specified by the user; or #2. the difference in the absolute value of the log of the determinant of the sigma matrix #falls below the tolerance specified by the user while (iter <= maxiter & pcd >= lrtol) { #get the estimation results for the next iteration of the SUR regression isig <- solve(sig) mk <- sum(num_var) xzx <- matrix(0, mk, mk) xzy <- matrix(0, mk, 1) i <- 1 lbi <- 1 ubi <- 0 while (i <= length(num_var)) { ubi <- num_var[i] + ubi j <- 1 lbj <- 1 ubj <- 0 while (j <= length(num_var)) { ubj <- num_var[j] + ubj xzx[lbi:ubi, lbj:ubj] <- isig[i, j] * xx[lbi:ubi, lbj:ubj] xzy[lbi:ubi] <- xzy[lbi:ubi] + isig[i, j] * xy[lbi:ubi, j] j <- j + 1 lbj <- ubj + 1 } i <- i + 1 lbi <- ubi + 1 } c <- solve(xzx) b <- c %*% xzy invRcR <- solve(R %*% c %*% t(R)) b <- b - (c %*% t(R)) %*% invRcR %*% (R %*% b - z) c <- c - (c %*% t(R)) %*% invRcR %*% (R %*% c) #CHANGED AS BELOW #se<-sqrt(abs(diag(c))) se <- sqrt(diag(c)) t_temp <- b / se t1 <- t_temp t2 <- matrix(0, length(t1), 1) df <- matrix(0, g, 1) lb <- 1 ub <- 0 i <- 1 while (i <= g) { ub <- num_var[i] + ub df[i] <- dim(hdataset)[1] - num_var[i] - dim(R)[1] for (j in lb:ub) { t2[j] <- 2 * pt(abs(t1[j]), df[i], lower.tail = F) } lb <- ub + 1 i <- i + 1 } #get sigma matrix for updated parameters and divide by proper divisor sig <- LRsseh( lhs_vars = lhs_vars, rhs_vars = rhs_vars, num_var = num_var, b = b, data = hdataset, g = g ) if (lrdv == 1) { sig <- sig / dim(hdataset)[1] } #calculate the log of the determinant of sigma and the percentage change in it lnsig_n <- log(det(sig)) pcd <- abs((lnsig_n - lnsig_o) / lnsig_o) * 100 #increment the iteration number iter <- iter + 1 #if verbose is true, display the iteration number and the current log of the determinant of the sigma matrix if (verbose == T) { print(paste("ITERATION ", iter, sep = "")) print(paste("LOG OF DETERMINANT OF SIGMA = ", lnsig_n, sep = "")) } #replace the old log of the determinant of sigma with the new one lnsig_o <- lnsig_n } #initialize a variety of variables for use in calculating model statistics model #many of these are not used in the R code yy <- 0 sse <- 0 sumy <- 0 ybar <- 0 dw <- 0 ef <- matrix(0, 1, g) #readisk<-0 tdwf <- matrix(0, N_firms, g) e2 <- matrix(0, dim(dtax1), g) ef2 <- matrix(0, 1, g) e3 <- matrix(0, dim(dta)[1], g) #tdw3<-matrix(0,g,1) y <- matrix(0, dim(dta)[1], g) e <- matrix(0, dim(dtax1), g) e1 <- e3 tsumy <- matrix(0, g, 1) tdw <- matrix(0, g, 1) #initialize the variable tracking the current equation i <- 1 #initialize variables marking the columns of x data that correspond to the current equation lb <- 1 ub <- 0 #loop until the variable indexing equations exceeds the number of equations while (i <= g) { #properly increment the upper bound of the columns of x data that correspond to the current equation ub <- num_var[i] + ub #assign dependent variable original values to the current column of y y[, i] <- dta[, lhslocs[i]] #assign residuals from transformed data for the current equation to the current column of e e[, i] <- dtay1[, i] - dtax1[, lb:ub] %*% b[lb:ub] #assign residuals from the original data for the current equation to the current column of e3 e3[, i] <- dta[, lhslocs[i]] - dta[, rhslocs[lb:ub]] %*% b[lb:ub] #record the sum of the current y variable tsumy[i] <- tsumy[i] + sum(y[, i]) #initialize variable tracking the firm currently in use k <- 1 #initialize variables marking the rows that correspond to the firm currently in use lbk <- 1 ubk <- 0 lbkk <- 1 ubkk <- 0 #loop until the variable indexing firms exceeds the number of firms while (k <= N_firms) { #properly increment the upper bound of the rows that correspond to firm currently in use ubk <- ubk + idtframe$apps[k] - 1 ubkk <- ubkk + idtframe$apps[k] #USES KK INSTEAD OF K AS IN ORIGINAL #properly assign lagged residuals with 0 in the initial position e2[lbkk:ubkk, i] <- c(0, e[lbkk:(ubkk - 1), i]) #calculate the component of the durbin watson statistic for the current firm tdwf[k, i] <- sum((e[lbkk:ubkk, i] - e2[lbkk:ubkk]) ^ 2) - e[lbk, i] ^ 2 #calculate the sum of the current durbin watson components and store it, this will be redone repeatedly until the very end of the loop, when it will be calculated properly tdw[i] <- sum(tdwf[, i]) #increment the variable marking the firm currently in use k <- k + 1 #increment the variabes marking the lower bound of the rows currently in use, lbk LOOKS VERY WRONG(?) lbk <- ubk + 1 lbkk <- ubkk + 1 } #increment the equation currently in use i <- i + 1 #properly increment the lower bound of the columns of x data currently in use lb <- ub + 1 } #calculate sum square errors based on residuals produced using transformed data sse <- sse + diag(t(e) %*% e) #calculate sum square errors based on residuals produced using original data sse3 <- diag(t(e3) %*% e3) #calculate final sigma matrix sig <- t(e3) %*% e3 / dim(hdataset)[1] #calculate y`y matrix yy <- yy + t(y) %*% y sumy <- sumy + tsumy #store components of durbin watson dw <- dw + tdw #store means of y ybar <- sumy / dim(hdataset)[1] #finish calculation of durbin watson coefficients dw <- dw / sse #initialize the variable marking the equation currently in use i <- 1 #initialize variables marking the columns of x data currently in use lb <- 1 ub <- 0 #initialize vectors for various model statistics sst <- matrix(0, g, 1) rsq <- matrix(0, g, 1) rbsq <- matrix(0, g, 1) #loop until the variable indexing the equations exceeds the number of equations while (i <= g) { #properly increment the variable marking the upper bound of the columns of x data currently in use ub <- num_var[i] + ub #calculate model statistics sst[i] <- yy[i, i] - dim(hdataset)[1] * ybar[i] ^ 2 rsq[i] <- 1 - sse3[i] / sst[i] rbsq[i] <- 1 - ((dim(hdataset)[1] - 1) / df[i]) * (1 - rsq[i]) #properly increment the variable marking the lower bound of the columns off x data currently in use lb <- ub + 1 #increment the variable tracking the equation currently in use i <- i + 1 } #calculate the correlation matrix of the estimates s <- diag(1 / sqrt(diag(c))) cm <- s %*% c %*% t(s) #store the number of observations nobs <- dim(hdataset)[1] #initialize the object containing the models to be returned modelsout <- list(NULL) #initialize the variable tracking the equation currently in use i <- 1 #initialize the variables marking the right hand side variables currently in use lb <- 1 ub <- 0 #loop until the variable indexing the equations exceeds the number of equations while (i <= g) { #properly increment the upper bound of the right hand side variables currently in use ub <- num_var[i] + ub #create a table containing the estimation results tempmodel <- cbind(b, se, t1, t2)[lb:ub, ] #properly name the rows of the table row.names(tempmodel) <- rhs_vars[lb:ub] tempmodel <- as.data.frame(tempmodel) #properly name columns of the table names(tempmodel) <- c("Coefficient", "Standard.Error", "T.Statistic", "P.Value") #store the current table in the list of model results to be returned modelsout[[i]] <- tempmodel #increment the variable tracking the equation currently in use i <- i + 1 #properly increment the lower bound of the right hand side variables currently in use lb <- ub + 1 } #create new objects containing the covariance and correlation matrices covmatout <- c cormatout <- cm #create new object containing the sigma matrix sigmaout <- sig #create list of objects containing model statistics and properly name its components modelstatsout <- list(nobs, sst, sse3, df, rsq, rbsq, sig, dw, N_firms) names(modelstatsout) <- c( "Observations", "Total.Sum.of.Squares", "Residual.Sum.of.Squares", "Degrees.of.Freedom", "R.Squared", "R.Bar.Squared", "Sigma", "Durbin.Watson.Statistic", "Number.of.Firms" ) #create list of parameters used during estimation and name them appropriately estparsout <- list(restrict.R, restrict.z, het, ac, lrdv, lriter, lrtol) names(estparsout) <- c("rmat", "zmat", "het", "ac", "lrdv", "lriter", "lrtol") #get original data set origdata <- data #compile full list of objects to be output and name them appropriately fulloutput <- list( lhs_vars, modelsout, covmatout, cormatout, sigmaout, modelstatsout, hdataset, estparsout, origdata ) names(fulloutput) <- c( "Dependent.Variables", "Models", "Covariance.Matrix", "Correlation.Matrix", "Sigma.Matrix", "Model.Statistics", "Transformed.Data", "Estimation.Parameters", "Original.Data" ) #return estimation results return(fulloutput) } #This function gets sum of squared errors for groups in each equation and in system #The arguments are as follows: # 1) lhs_vars: the dependent variables in the system of equations # 2) rhs_vars: the vector containing each equation's independent variables, # concatenated in the order of the corresponding dependent variables # 3) num_var: the number of independent variables in each equation # 4) b: the current parameter estimates in use # 5) data: the dataset to be used for the regression # 6) g: the number of equations in the system. #This function returns a gxg matrix called sig, which is the sigma matrix for the system of equations. LRsseh <- function(lhs_vars, rhs_vars, num_var, b, data, g) { #get number of observations #THIS IS PROBABLY UNNEEDED, COPIED DIRECT FROM GAUSS CODE nobs <- dim(data)[1] #initialize matrices of dependent variables and errors e <- matrix(0, dim(data)[1], g) y <- matrix(0, dim(data)[1], g) #intialize loop variables i <- 1 lb <- 1 + g ub <- g #initialize results variables #WHY IS yy HERE? NOT USED IN ANY CALCULATIONS OR RETURNED sig <- 0 yy <- 0 #begin loop over equations #AV: i indexes in the equation while (i <= g) { #set proper upper bound for data matrix columns for the current equation's X data ub <- num_var[i] + ub #calculate errors for current equation e[, i] <- data[, i] - data[, lb:ub] %*% b[(lb - g):(ub - g)] #set y for current equation #WHY IS THIS HERE? NOT USED IN ANYTHING RETURNED BY THE PROGRAM y[, i] <- data[, i] #increment equation number i <- i + 1 #set proper lower bound for the data matrix columns for the next equation's X data lb <- ub + 1 } #calculate the sigma matrix sig <- t(e) %*% e + sig #calculate the covariance matrix for the dependent variables #WHY IS THIS DONE? NOT RETURNED yy <- t(y) %*% y + yy #return sigma matrix return(sig) } #This function produces an additional explicit share equation based on the initial results. #AV: This is the share equation that was left out of the initial estimation because of colinearity (shares add up to 1) #Its arguments are as follows: # 1) hetest: the object returned by the subroutine hetsur, containing the results of a seemingly unrelated regression # with heteroskedasticity and/or autocorrelation # 2) newsharename: the name of the dependent variable for which a new equation should be produced # 3) trendvar: the name, if any, of the trend variable interacted with the input price variables in the main equation #This function returns an augmented version of the hetsur result #It adds a new dependent variable to the list of dependent variables, #a new model results frame to the list of models, and the original dataset to the full hetsur results object #If necessary, it also adds a column containing the omitted share to the original dataset #Different methods are used for 2 factor models and 3+ factor models due to the way single column objects are handled #It augments the Dependent.Variables and Models components of the list returned by hetsur est_share_new <- function(hetest, newsharename, trendvar = "trend") { #get the names of the dependent variables for which equations have already been estimated tempdepvars <- hetest$Dependent.Variables #get the original data origdata <- hetest$Original.Data #locate the current dependent variables within the original dataset tempdepvarlocs <- match(tempdepvars, names(origdata)) #get the number of equations already estimated numeqs <- length(tempdepvars) if (numeqs == 2) { #create a duplicate of the result for the first share equation tempneweq <- hetest$Models[[2]] #check whether data for the second share already exists in the dataset, and, if not, create a column for #the second share and name it appropriately if ((newsharename %in% names(origdata)) == F) { origdata$tempnew <- 1 - origdata[, tempdepvarlocs[2]] names(origdata)[dim(origdata)[2]] <- newsharename } #calculate the intercept of the second share equation and recalculate the parameter's t-statistic and p-value appropriately tempneweq[1, 1] <- 1 - tempneweq[1, 1] tempneweq[1, 3] <- tempneweq[1, 1] / tempneweq[1, 2] tempneweq[1, 4] <- 2 * pt(abs(tempneweq[1, 3]), hetest$Model.Statistics$Degrees.of.Freedom[2], lower.tail = F) #calculate the coefficients for non-intercept coefficients of the second share equation as well as their t-statistics tempneweq[2:dim(tempneweq)[1], c(1, 3)] <- tempneweq[2:dim(tempneweq)[1], c(1, 3)] * -1 #initialize a new object for the augmented hetsur results to be returned hetestaug <- hetest hetestaug$Dependent.Variables[3] <- newsharename hetestaug$Models[[3]] <- tempneweq } #the following block of code generalizes the methods found in gauss files called EST_SLM3, EST_SLM4, and EST_SLM5. if (numeqs > 2) { #check whether data for the new share already exists in the dataset, and, if not, create a column for the additional share and name it appropriately if ((newsharename %in% names(origdata)) == F) { #get the locations of the share variables in the original dataset for which equations already exist currentshares <- origdata[, tempdepvarlocs[-1]] #sum the share variables for every observation currentsharetots <- apply(origdata, MARGIN = 1, FUN = sum) #store the additional share in a new column and name it appropriately origdata$tempnew <- 1 - currentsharetots names(origdata)[dim(origdata)[2]] <- newsharename } #create a placeholder for the new equation tempneweq <- hetest$Models[[2]] #create a matrix in which to store the coefficients of the already estimated share equations and name its rows appropriately sharemodcoefs <- matrix(0, dim(tempneweq)[1], numeqs) row.names(sharemodcoefs) <- row.names(tempneweq) #recreate the num_var variable from the original hetsur call and fill its first element appropriately tempnumvar <- rep(0, numeqs) tempnumvar[1] <- dim(hetest$Models[[1]])[1] #fill the matrix of share equation coefficients and the tempnumvar variable for (i in 2:length(hetest$Models)) { sharemodcoefs[, (i - 1)] <- hetest$Models[[i]][, 1] tempnumvar[i] <- dim(hetest$Models[[i]])[1] } #store the covariance matrix for the existing model covmat <- hetest$Covariance.Matrix #sum the coefficients for each variable on the right hand side of the share equations sharemodcoeftots <- apply(sharemodcoefs, MARGIN = 1, FUN = sum) #calculate the value of the intercept in the new share equation tempneweq[1, 1] <- 1 - sharemodcoeftots[1] #calculate the value of the remaining coefficients in the new share equation tempneweq[-1, 1] <- -1 * sharemodcoeftots[-1] #calculate the locations of each of the constant terms in the covariance matrix constindx <- cumsum(tempnumvar)[-numeqs] + 1 #create a temporary variable in which to store the variance of the intercept term in the new equation tempconstvar <- 0 #loop twice over the locations of the constants in the covariance matrix (once for rows, once for columns) for (i in 1:length(constindx)) { for (j in 1:length(constindx)) { #store the current row and column tempr <- constindx[i] tempc <- constindx[j] #add the covariance matrix value in the cell corrseponding to the current row and column to the temporary variable for the variance of the intercept term in the new share equation tempconstvar <- tempconstvar + covmat[tempr, tempc] } } #calculate the standard error of the intercept term in the new equation tempneweq[1, 2] <- sqrt(tempconstvar) #calculate the upper and lower bounds denoting the locations of non-intercept right hand side variables for the share equations in the covariance matrix varindx1 <- cumsum(tempnumvar)[-numeqs] + 2 varindx2 <- cumsum(tempnumvar)[-1] #DELETE THIS? testrhs <- c( row.names(hetest$Models[[1]]), row.names(hetest$Models[[2]]), row.names(hetest$Models[[3]]) ) #create a temporary variable in which to store the variances of the non-intercept terms of the new share equation tempvarvar <- rep(0, tempnumvar[2] - 1) #loop twice over the number of equations (once for rows, once for columns) for (i in 1:length(varindx1)) { for (j in 1:length(varindx1)) { #create the current row and column ranges temprs <- c(varindx1[i]:varindx2[i]) tempcs <- c(varindx1[j]:varindx2[j]) #add the diagonal of the block of the covariance corresponding to the current row and column ranges to the temporary variable for the variance of the non-intercept terms in the new share equation tempvarvar <- tempvarvar + diag(covmat[temprs, tempcs]) } } #calculate the standard errors of the non-intercept terms in the new equation tempneweq[(2:tempnumvar[2]), 2] <- sqrt(tempvarvar) #calculate the t-statistics for all terms in the new equation tempneweq[, 3] <- tempneweq[, 1] / tempneweq[, 2] #calculate the p-values for all terms in the new equation for (i in 1:tempnumvar[2]) { tempneweq[i, 4] <- 2 * pt(abs(tempneweq[i, 3]), hetest$Model.Statistics$Degrees.of.Freedom[2], lower.tail = F) } #create the augmented version of hetest to be modified and returned hetestaug <- hetest #add the new dependent variable to the Dependent Variables component of the object to be returned hetestaug$Dependent.Variables[numeqs + 1] <- newsharename #check if the trend variable was included in the share equations and, if so, remove it from the final share equation if (trendvar %in% row.names(tempneweq)) { trendrow <- match(trendvar, row.names(tempneweq)) tempneweq <- tempneweq[-trendrow, ] } #add the new share equation to the Models component of the object to be returned hetestaug$Models[[numeqs + 1]] <- tempneweq } #add the modified original data to the object to be returned and name it appropriately. hetestaug[[length(hetestaug)]] <- origdata #return the output return(hetestaug) } #This function provides predicted values for all estimated variables and adds them to the Original.Data #component of the hetest object returned by est_share_new() #The arguments are as follow: # 1) hetest: the object returned by hetsur(), after having been augmented by est_share_new() # 2) newdata: if a dataset other than hetest$Original.Data should be used for prediction, provide it here #if newdata is not null, then the Original.Data component of hetest will be replaced with the contents of newdata prehetsur <- function(hetest, newdata = NULL) { #check whether alternative data has been supplied #if so, replace the original data component of the supplied hetest object with the contents of new data and initialize an object called predata to be used in calculating the predicted values of dependent variables #if not, simply initialize an object called predata to be used in calculating the predicted values of dependent variables if (length(newdata) == 0) { predata <- hetest$Original.Data } if (length(newdata) > 0) { predata <- newdata hetest$Original.Data <- predata } #loop over models for (i in 1:length(hetest$Models)) { #get the string name of the dependent variable for the equation currently under consideration tempdepvar <- hetest$Dependent.Variables[i] #initialize a vector for the predicted values temppred <- rep(0, dim(predata)[1]) #get the string names of the independent variables for the equations currently under consideration temprhs <- row.names(hetest$Models[[i]]) #get the locations of the columns containing the independent variables in the original data set temprhslocs <- match(temprhs, names(hetest$Original.Data)) #create a matrix of the independent variables in the same order that the coefficients appear tempxmat <- as.matrix(hetest$Original.Data[, temprhslocs]) #get the coefficients for the current model tempbeta <- hetest$Models[[i]][, 1] #calculate the predicted values for the current dependent variables temppred <- tempxmat %*% tempbeta #store the predicted values of the current dependent variable in a new column of the original dataset and name it appropriately hetest$Original.Data$tempnew <- temppred names(hetest$Original.Data)[dim(hetest$Original.Data)[2]] <- paste(tempdepvar, "hat", sep = "_") } #return the augmented hetest object return(hetest) } #This function calculates several measures of goodness of fit. #Its only argument is hetest, which should be the object returned by hetest(), #after being augmented by est_share_new() and prehetsur(), in that order #It returns of list the following four components: # 1) Uncentered.System.R.Square # 2) Centered.System.R.Square # 3) LR.Test.Statistic # 4) P-Value systemR2 <- function(hetest) { templhs <- hetest$Dependent.Variables[-length(hetest$Dependent.Variables)] templhsalocs <- match(templhs, names(hetest$Original.Data)) templhsplocs <- match(paste(templhs, "hat", sep = "_"), names(hetest$Original.Data)) templhsa <- hetest$Original.Data[, templhsalocs] templhsp <- hetest$Original.Data[, templhsplocs] tempdf <- dim(hetest$Models[[1]])[1] - 1 res <- matrix(0, dim(hetest$Original.Data)[1], length(templhs)) res2 <- matrix(0, dim(hetest$Original.Data)[1], length(templhs)) dev <- matrix(0, dim(hetest$Original.Data)[1], length(templhs)) for (i in 1:length(templhs)) { res[, i] <- templhsa[, i] - templhsp[, i] dev[, i] <- templhsa[, i] - mean(templhsa[, i]) res2[, i] <- res[, i] - mean(res[, i]) } ucr2 <- 1 - det(t(res) %*% res) / det(t(dev) %*% dev) cr2 <- 1 - det(t(res2) %*% res2) / det(t(dev) %*% dev) lrstat <- -1 * dim(hetest$Original.Data)[1] * log(1 - cr2) pvalue <- pchisq(q = lrstat, df = tempdf, lower.tail = F) gofoutput <- list(ucr2, cr2, lrstat, pvalue) names(gofoutput) <- c( "Uncentered.System.R.Square", "Centered.System.R.Square", "LR.Test.Statistic", "P-Value" ) return(gofoutput) } surcostRmat <- function(lhs_vars, rhs_vars, num_var, factors, constantvar = "intercept", verbose = T) { #create the vectors of share and price variable names sharevars <- paste("s", factors, sep = "") pricevars <- paste("w", factors, sep = "") #check that the order of shares matches exactly their order in the vector of the dependent variables for (i in 1:length(sharevars)) { if (sharevars[i] != lhs_vars[i + 1]) { print("BAD ORDER OF SHARES") return("BAD ORDER OF SHARES") } } #initialize a list of the variables in each equation eqvarlist <- list(NULL) #fill the list of variables in each equation lb <- 1 ub <- 0 i <- 1 while (i <= length(lhs_vars)) { ub <- ub + num_var[i] eqvarlist[[i]] <- rhs_vars[lb:ub] lb <- ub + 1 i <- i + 1 } outputR <- matrix(0, length(rhs_vars) - num_var[1], length(rhs_vars)) outputz <- matrix(0, length(rhs_vars) - num_var[1], 1) #get the right hand side variables of the cost equation ceqrhs <- rhs_vars[1:num_var[1]] #get the position within rhs_vars where each equation's (not counting the first equation) variables begin shcolinit <- cumsum(num_var)[-length(num_var)] + 1 #initialize a variable tracking the current row of the R matrix rowR <- 1 #loop over factors for (i in 1:length(factors)) { #store the current factor, price, share equation, and column index tempfact <- factors[i] tempprice <- pricevars[i] tempsheq <- eqvarlist[[i + 1]] tempcolinit <- shcolinit[i] #loop over the variables in the current share equation for (j in 1:length(tempsheq)) { #get the current variable temprhsvar <- tempsheq[j] #if verbose is true, print the current variable for debugging purposes if (verbose == T) { print(temprhsvar) } #check whether the current variable is the intercept term, the price for the current cost share, or another variable and fill in the current row of the R matrix appropriately if (temprhsvar == constantvar) { tempcloc <- which(ceqrhs == tempprice) outputR[rowR, tempcloc] <- -1 outputR[rowR, tempcolinit + j - 1] <- 1 } if (temprhsvar != constantvar) { if (temprhsvar == tempprice) { #get the location of the current price variable squared term in the cost equation tempcloc <- which(ceqrhs == paste(temprhsvar, "*", temprhsvar, "/2", sep = "")) outputR[rowR, tempcloc] <- -1 outputR[rowR, tempcolinit + j - 1] <- 1 } if (temprhsvar != tempprice) { #create the two possible ways of writing the interaction in the cost equation tempceqnames <- c( paste(temprhsvar, "*", tempprice, sep = ""), paste(tempprice, "*", temprhsvar, sep = "") ) #determine which way was used tempceqnamecheck <- which(tempceqnames %in% ceqrhs) #get the location of the current variable's interaction term in the cost equation tempcloc <- which(ceqrhs == tempceqnames[tempceqnamecheck]) outputR[rowR, tempcloc] <- -1 outputR[rowR, tempcolinit + j - 1] <- 1 } } #increment the variable tracking the row of the R matrix currently in use rowR <- rowR + 1 } } #construct an output list and return the results fulloutput <- list(outputR, outputz) return(fulloutput) } #this function adds interactive terms to a data frame addvars <- function(data, rhs_vars) { #get a unique listing of the independent variable terms redvars <- unique(rhs_vars) #get the old column names oldvars <- names(data) #check which independent variable terms are already in the data set and drop them to create a list of variables to be created oldvarlocs <- which(redvars %in% oldvars) newvars <- redvars[-oldvarlocs] if (length(newvars) > 0) { #loop over the variables to be added for (i in 1:length(newvars)) { #create the new variable and name it appropriately data$tempnew <- with(data, eval(parse(text = newvars[i]))) names(data)[dim(data)[2]] <- newvars[i] } } #return the augmented data set return(data) }