This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
GFluTrends<-read_excel("/Users/jennyyang/Downloads/case_study_1_fluwatch.xlsx",sheet="Google Flu Trends", skip = 1)
fluWatch<-read_excel("/Users/jennyyang/Downloads/case_study_1_fluwatch.xlsx", sheet="FluWatch-BC", skip = 2)
tim<-timeSequence(from = "2003-09-28", to = "2015-08-09", by = "week")
tim1<-timeSequence(from = "2003-09-07", to = "2015-08-23", by = "week")
GFT<- timeSeries(GFluTrends[,"British Columbia"], charvec = tim)
fluTest<- timeSeries(fluWatch[,"FluTest"], charvec = tim1)
spliting data into training set and testing set:
trainingsetGFT <- window(GFT,start="2003-09-28",end="2014-08-10")
trainingsetfluTest <- window(fluTest,start="2003-09-28", end="2014-08-10")
testingsetGFT <- window(GFT, start="2014-08-17", end="2015-08-09")
testingsetgfluTest <- window(fluTest, start="2014-08-17", end="2015-08-09")
plots for training set:
plot(trainingsetGFT,type = "b", pch = 19)
grid ()
plot(trainingsetfluTest,type = "b", pch = 19)
grid()
plots for testing set:
plot(testingsetGFT,type = "b", pch = 19)
grid()
plot(testingsetgfluTest,type = "b", pch = 19)
grid()
PreWhiten.ar <- function(x, ar = NULL){
if(is.null(ar)) print("autoregressive coefficients are empty")
prewhdata = numeric(0)
prewhdata = filter(x, c(1, -ar),method = c("convo"),sides=1)
prewhdata[!is.na(prewhdata)]
}
PreWhiten.arma <- function(x, ar=NULL,ma=0){
if(is.null(ar) && is.null(ma)) print("both ar and ma coefficients are empty")
prewhdata=numeric(0)
mod = as(modelCoef(new("ArmaModel",ar=ar, ma=ma)),"list")
eps = numeric(length(x))
prewhdata=xarmaFilter(mod, x=x, eps = eps, whiten = TRUE)
prewhdata[!is.na(prewhdata)]
}
mod.ar <- auto.arima(trainingsetGFT, max.p = 52, max.q = 0, stationary = TRUE) #, allowmean = FALSE)
temp = PreWhiten.arma(trainingsetGFT, ar = mod.ar$coef[1:3], ma = 0)
temp1 = PreWhiten.ar(trainingsetGFT, ar = mod.ar$coef[1:3])
par(mfrow = c(2,1), cex =0.5)
plot(mod.ar)
plot(c(temp1)-temp[-(1:3)], col = 1,
ylab = "Difference", ylim = c(-0.01,0.01), pch = 15,
main = "Difference of prewhitened time series btw 2 filters")
grid()
c(BoxCox.lambda(GFT),BoxCox.lambda(fluTest))
## [1] 0.09164784 0.19117497
LBTest <- function(res, nPQ = 0, m = 24, ifPlot = FALSE){
stopifnot(nPQ >= 0, m >= 1, m > nPQ)
n <- length(res)
lags <- 1:m
df <- (nPQ + 1):m
ra <- (acf(res, lag.max = m, plot = FALSE)$acf)[-1]
QQ <- n*(n+2)*cumsum((ra^2)/(n-(1:m)))[df]
pval <- 1-pchisq(QQ, df)
QQ <- round(QQ,2)
mat <- matrix(c(df,QQ,pval), ncol=3)
dimnames(mat) <- list(rep("",length(QQ)),c("m", "Qm", "pvalue"))
if(ifPlot){
plot(x = mat[,1], y=mat[,3],
ylim=c(0,1), pch =15, col =4,
ylab="pvalue", xlab = "m",
main = "Ljung-Box portmanteau test")
abline(h=0.05, col=2)
grid()
}else {
mat
}
}
model_arma<-auto.arima(trainingsetGFT, max.p = 52, max.q = 52, stationary = TRUE)
a = model_arma$arma[1]
b = model_arma$arma[2]
coef(model_arma)
## ar1 ar2 ar3 ar4 ma1
## -0.2508187 1.2035315 0.1925643 -0.4037494 1.5177007
## ma2 ma3 intercept
## 0.4790093 -0.0689342 1793.0878754
plot(model_arma)
Inverse AR and MA roots are all inside of the unit cicle therefore the fitted ARMA model is casual and invertible.
npq = sum(model_arma$arma[c(1,2)])
LBTest(model_arma$residuals, nPQ = npq, m = 52, ifPlot=TRUE)
mod <- model_arma
nAR <- mod$arma[1]
nMA = mod$arma[2]
if(nMA!=0){
xf = PreWhiten.arma(trainingsetGFT, ar = mod$coef[1:nAR],
ma = mod$coef[(1:nMA)+nAR])[-(1:nAR)]
yf = PreWhiten.arma(trainingsetfluTest, ar = mod$coef[1:nAR],
ma=mod$coef[(1:nMA)+nAR])[-(1:nAR)]
}else{
xf = PreWhiten.arma(trainingsetGFT, ar = mod$coef[1:nAR],
ma = 0)[-(1:nAR)]
yf = PreWhiten.arma(trainingsetfluTest, ar = mod$coef[1:nAR],
ma=0)[-(1:nAR)]
}
par(cex=0.75,bg="gray95")
ccf(c(xf), c(yf), lwd=1, ylab="Cross-correlation functions",
main="CCF of prewhitened GFT and flu test")
abline(v=0, col="gold", lwd=2, lty="dashed")
text(-1, 0.2, "-1", col=2)
dat<- cbind(fluTest,GFT, lag(GFT))[-c(1:5),]
colnames(dat)<-c("fluTest", "GFT", "GFT1")
data<- timeSeries(dat, charvec = tim)
data.train = window(data, start = "2003-10-12", end = "2014-08-10")
data.test = window(data, start = "2014-08-17", end = "2015-07-26")
mod.tfn = auto.arima(data.train[,1], xreg = data.train[,-1], stationary = TRUE)
coef(mod.tfn)
## ar1 ma1 intercept GFT GFT1
## 0.82928265 0.36672219 -95.07126539 0.07782297 0.05883542
m = 26
lags = 1:m
df <- (1+1+1):m
n = length(mod.tfn$res)
rccf = ccf(mod$residuals,mod.tfn$residuals, plot = FALSE, lag.max = m)$acf[-(1:m)]
Qm = n* (n + 2) * cumsum((rccf^2)/(n - (0:m)))[df]
pv <- 1 - pchisq(Qm, df)
a = cbind(df, Qm,pv)
par(mfrow = c(1,2))
LBTest(mod.tfn$res, nPQ = 6, ifPlot = TRUE)
plot(x = a[,1],y = a[,3],
ylim = c(0,1), pch = 15, col =4,
ylab = "pvalue", xlab = "m",
main = "Cross-correlation check")
abline(h =0.05, col =2)
grid()
ARMA Model:
model_full <- auto.arima(trainingsetfluTest, max.p = 52, max.q = 52, stationary = TRUE)
coef(model_full)
## ar1 ma1 ma2 ma3 intercept
## 0.82001760 0.55143909 0.17256528 0.07728373 148.40014178
plot(model_full)
Netrual Network:
mod_nnflutest = forecast::nnetar(trainingsetfluTest);mod_nnflutest
## Series: trainingsetfluTest
## Model: NNAR(8,4)
## Call: forecast::nnetar(y = trainingsetfluTest)
##
## Average of 20 networks, each of which is
## a 8-4-1 network with 41 weights
## options were - linear output units
##
## sigma^2 estimated as 2751
mod_nnx = forecast::nnetar(data.train[,1], xreg = data.train[,-1]);mod_nnx
## Series: data.train[, 1]
## Model: NNAR(8,6)
## Call: forecast::nnetar(y = data.train[, 1], xreg = data.train[, -1])
##
## Average of 20 networks, each of which is
## a 10-6-1 network with 73 weights
## options were - linear output units
##
## sigma^2 estimated as 1342
Forecast Accuracy Evaluation:
e=round(accuracy(model_full),2)
f=round(accuracy(mod.tfn),2)
g=round(accuracy(mod_nnflutest),2)
l=round(accuracy((mod_nnx)),2)
which.min(c(abs(as.data.frame(e)$ME),abs(as.data.frame(f)$ME),abs(as.data.frame(g)$ME),abs(as.data.frame(l)$ME)))
## [1] 1
Estamating in short run:
model_testing <- auto.arima(testingsetgfluTest, max.p = 52, max.q = 52, stationary = TRUE)
model_tfntesting <- auto.arima(data.test[,1], xreg = data.test[,-1], stationary = TRUE)
mod_nntesting1 <- forecast::nnetar(testingsetgfluTest, h=1);mod_nntesting1
## Series: testingsetgfluTest
## Model: NNAR(3,2)
## Call: forecast::nnetar(y = testingsetgfluTest, h = 1)
##
## Average of 20 networks, each of which is
## a 3-2-1 network with 11 weights
## options were - linear output units
##
## sigma^2 estimated as 7344
mod_nnxtesting <- forecast::nnetar(data.test[,1], xreg = data.test[,-1]);mod_nnxtesting
## Series: data.test[, 1]
## Model: NNAR(3,3)
## Call: forecast::nnetar(y = data.test[, 1], xreg = data.test[, -1])
##
## Average of 20 networks, each of which is
## a 5-3-1 network with 22 weights
## options were - linear output units
##
## sigma^2 estimated as 482.4
bb=round(accuracy(model_testing),2)
bc=round(accuracy(model_tfntesting),2)
bd=round(accuracy(mod_nntesting1),2)
be=round(accuracy(mod_nnxtesting),2)
which.min(c(abs(as.data.frame(bb)$ME),abs(as.data.frame(bc)$ME),abs(as.data.frame(bd)$ME),abs(as.data.frame(be)$ME)))
## [1] 4
Estamating for long run:
model_testing <- auto.arima(testingsetgfluTest, max.p = 52, max.q = 52, stationary = TRUE)
model_tfntesting <- auto.arima(data.test[,1], xreg = data.test[,-1], stationary = TRUE)
mod_nntesting50 <- forecast::nnetar(testingsetgfluTest, h=50);mod_nntesting50
## Series: testingsetgfluTest
## Model: NNAR(3,2)
## Call: forecast::nnetar(y = testingsetgfluTest, h = 50)
##
## Average of 20 networks, each of which is
## a 3-2-1 network with 11 weights
## options were - linear output units
##
## sigma^2 estimated as 6882
mod_nnxtesting <- forecast::nnetar(data.test[,1], xreg = data.test[,-1]);mod_nnxtesting
## Series: data.test[, 1]
## Model: NNAR(3,3)
## Call: forecast::nnetar(y = data.test[, 1], xreg = data.test[, -1])
##
## Average of 20 networks, each of which is
## a 5-3-1 network with 22 weights
## options were - linear output units
##
## sigma^2 estimated as 515.3
cc=round(accuracy(model_testing),2)
cd=round(accuracy(model_tfntesting),2)
ce=round(accuracy(mod_nntesting50),2)
cf=round(accuracy(mod_nnxtesting),2)
which.min(c(abs(as.data.frame(cc)$ME),abs(as.data.frame(cd)$ME),abs(as.data.frame(ce)$ME),abs(as.data.frame(cf)$ME)))
## [1] 4