## code/libraries
source("lift.R")
library(h2o)

## data: tabloid separated into train and test
trainDf = read.csv("Tabloid_train.csv")
testDf = read.csv("Tabloid_test.csv")

print(names(trainDf))

## plot
if(0) {
p=ncol(trainDf)-1
par(mfrow=c(p,2))
for(i in 1:p) {
   plot(trainDf[[i]])
   plot(log(trainDf[[i]]+1))
}
}

##make y=purchase a factor and call it y
trainDf$purchase = as.factor(trainDf$purchase)
testDf$purchase = as.factor(testDf$purchase)
names(trainDf)[1]="y"
names(testDf)[1]="y"

### setup storage for results
phatL = list() #store the test phat for the different methods here

### fit logit
lgfit = glm(y~.,trainDf,family=binomial)
print(summary(lgfit))
phat = predict(lgfit,testDf,type="response")
phatL$logit = matrix(phat,ncol=1) #logit phat

## how is logit
par(mfrow=c(1,1))
temp = lift.plot(phatL$logit,testDf$y)
# dev.copy2pdf(file="logit-lift.pdf",width=12,height=10)

## get setup to run nn in h2o
h2oServer <- h2o.init(max_mem_size="10g", nthreads=-1)
train_h2o = as.h2o(trainDf, destination_frame = "tabloid_train")
test_h2o = as.h2o(testDf, destination_frame = "tabloid_test")


##run single layer h2o

if(file.exists(file.path("./files", "mTabD10"))) {
  mTabD10 = h2o.loadModel(path = file.path("./files", "mTabD10"))
} else {
  mTabD10 = h2o.deeplearning(
          x=2:5, y=1,
          training_frame=train_h2o,
          hidden=10,
          epochs=1000,
          l1 = 1e-2,
          model_id = "mTabD10"
          )
}

## look at fit from single layer

phat = predict(mTabD10, test_h2o)
phatL$mTabD10 = as.matrix( phat[,3] )

#plot, compare to logit
par(mfrow=c(1,2))
plot(phatL$logit, phatL$mTabD10)
abline(0,1,col="blue")
lift.many.plot(phatL, testDf$y)
legend("topleft",legend=names(phatL),col=1:2,lty=rep(1,2),bty="n")
# dev.copy2pdf(file="logit-model1.pdf",height=5,width=12)

## if you like it, save it
#h2o.saveModel(mTabD10, path="files",force=TRUE)


### fit h2o deep
fp = file.path("./files", "mTabD10.10")
if(file.exists(fp)) {
  mTabD10.10 = h2o.loadModel(fp)
} else {
  mTabD10.10 = h2o.deeplearning(
                  x=2:5, y=1,
                  training_frame=train_h2o,
                  hidden=c(10,10),
                  epochs=500,
                  activation="Rectifier",
                  l1=1e-3,
                  model_id = "mTabD10.10"
                  )
}

phat = predict(mTabD10.10, test_h2o)
phatL$mTabD10.10 = as.matrix( phat[,3] )

pairs(phatL)
# dev.copy2pdf(file="pairs-logt-nn-dnn10-10.pdf",height=10,width=12)

par(mfrow=c(1,1))
lift.many.plot(phatL, testDf$y)
legend("topleft",legend=names(phatL),col=1:3,lty=rep(1,3))
# dev.copy2pdf(file="lift-logt-nn-dnn10-10.pdf",height=10,width=12)


## if you like it, save it
#h2o.saveModel(mTabD10.10, path="files",force=TRUE)


### fit h2o really deep
fp = file.path("./files", "mTabD100.100")
if(file.exists(fp)) {
  mTabD100.100 = h2o.loadModel(fp)
} else {
  mTabD100.100 = h2o.deeplearning(
                  x=2:5, y=1,
                  training_frame=train_h2o,
                  hidden=c(100,100),
                  epochs=500,
                  activation="Tanh",
                  l1=1e-3,
                  l2=1e-2,
                  model_id = "mTabD100.100"
                  )
}

phat = predict(mTabD100.100, test_h2o)
phatL$mTabD100.100 = as.matrix( phat[,3] )

pairs(phatL)
# dev.copy2pdf(file="pairs-logt-nn-dnn100.100.pdf",height=10,width=12)

par(mfrow=c(1,1))
lift.many.plot(phatL, testDf$y)
legend("topleft",legend=names(phatL),col=1:4,lty=rep(1,4))
# dev.copy2pdf(file="lift-logt-nn-dnn100.100.pdf",height=10,width=12)


## if you like it, save it
#h2o.saveModel(mTabD100.100, path="files",force=TRUE)


par(mfrow=c(2,2))
yrg = range(unlist(phatL))
boxplot(phatL$logit~testDf$y,ylim=yrg)
title(main=names(phatL)[1],cex.main=1.5)
abline(h=.02,col="red",lty=2)
boxplot(phatL$mTabD10~testDf$y,ylim=yrg)
title(main=names(phatL)[2],cex.main=1.5)
abline(h=.02,col="red",lty=2)
boxplot(phatL$mTabD10.10~testDf$y,ylim=yrg)
title(main=names(phatL)[3],cex.main=1.5)
abline(h=.02,col="red",lty=2)
boxplot(phatL$mTabD100.100~testDf$y,ylim=yrg)
title(main=names(phatL)[4],cex.main=1.5)
abline(h=.02,col="red",lty=2)
# dev.copy2pdf(file="box-all4.pdf",height=10,width=12)


## deviance loss
source("devloss.R")
loss(testDf$y,phatL$logit)
loss(testDf$y,phatL$mTabD10)
loss(testDf$y,phatL$mTabD10.10)
loss(testDf$y,phatL$mTabD100.100)

## real gain
gain=function(phat,y) {
   n = length(y)
   dotarget = ifelse(phat>.02,1,0)
   if(sum(dotarget) ==  n) {
      return(40*sum(y==1)-.8*n)
   }
   if(sum(dotarget) ==  0) {
      return(0)
   }
   tbl = table(dotarget,y)
   return(tbl[2,2]*40-.8*sum(tbl[2,]))
}
nmod = length(phatL)
for(i in 1:nmod) {
   cat("on model: ",names(phatL)[i],"\n")
   print(gain(phatL[[i]],testDf$y))
}


