##################################################
### import
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

##################################################
### read in data
# Predictor tables for the training and test splits, fetched over HTTP.
trainB = pd.read_csv(
    "http://www.rob-mcculloch.org/data/smsTrainB.csv"
)
testB = pd.read_csv(
    "http://www.rob-mcculloch.org/data/smsTestB.csv"
)

# Label files: each is read as a DataFrame and then reduced to the named
# column, so trainyB / testyB are column selections rather than full tables.
trainyB = pd.read_csv(
    "http://www.rob-mcculloch.org/data/smsTrainyB.csv"
).loc[:, 'smsTrainyB']
testyB = pd.read_csv(
    "http://www.rob-mcculloch.org/data/smsTestyB.csv"
).loc[:, 'smsTestyB']

##################################################
### check data against what we had in R
#> dim(smsTrain)
#[1] 4169 1139
#> table(smsTrain[,1])
#
#  No  Yes 
#4164    5

# check
# Every result below is printed explicitly: a bare expression is only echoed
# in an interactive session and is silently discarded when this file is run
# as a script.

# counts for first column
print(trainB.iloc[:, 0].value_counts())

# test and train % spam
# (assumes the labels are coded 0/1 with 1 = spam -- TODO confirm)
print("test spam fraction: ", np.sum(testyB) / len(testyB))
print("train spam fraction:", np.sum(trainyB) / len(trainyB))

# age and y: joint counts of the label (rows) against the 'age' column
print(pd.crosstab(trainyB, trainB['age']))

# Share of each 'age' value within each label class (every row sums to 1).
# This replaces the hand-copied ratios 5/(5+3600) and 12/(12+552), which
# appear to have been read off the count table row by row; computing them
# from the data keeps the figures correct if the input files change.
print(pd.crosstab(trainyB, trainB['age'], normalize='index'))

##################################################
### fit NB model

# Fit a multinomial naive Bayes classifier (library default settings) on the
# training table, then predict a label for every row of the test table.
model = MultinomialNB()
model.fit(trainB, trainyB)
yhat = model.predict(testB)

# Out-of-sample evaluation.  The results are printed explicitly because a
# bare expression is only echoed in an interactive session; when the file is
# run as a script the values would otherwise be computed and thrown away.
# In the confusion matrix, rows are the true labels and columns the
# predicted labels.
print(confusion_matrix(testyB, yhat))
print("accuracy:", accuracy_score(testyB, yhat))

