####################
###: basic imports
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
##sklearn model selection
from sklearn.model_selection import train_test_split
## scale the x variables when there is more than one
from sklearn.preprocessing import StandardScaler
####################
###: pytorch
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn
def myrmse(y, yhat):
    """Return the root-mean-squared error between y and yhat, rounded to 3 digits."""
    # sqrt of the mean squared residual (same as sqrt(mean_squared_error))
    resid = y - yhat
    mse = np.sum(resid**2) / len(y)
    return np.round(np.sqrt(mse), 3)
####################
###: read in data
# Used-cars data: price, mileage, year.
cd = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/susedcars.csv")
cds = cd[['price','mileage','year']]
cds = cds.astype('float64')  # cd read the data in as integer
cds['price'] = cds['price']/1000.0      # price in $1000s
cds['mileage'] = cds['mileage']/1000.0  # mileage in 1000s of miles
print(cds.head())
X = cds[['mileage','year']].to_numpy()  # mileage and year columns as a numpy array
print("*** type of X is",type(X))
print(X.shape)  # number of rows and columns
print(X[0:4,:])  # first 4 rows
y = cds['price'].values  # price as a numpy vector
print(f'length of y is {len(y)}')
print(y[:4])  # implicit start at 0
# NOTE(review): the next line is pasted console output, not code; commented
# out so the file is valid Python.
# price mileage year 0 43.995 36.858 2008.0 1 44.995 46.883 2012.0 2 25.999 108.759 2007.0 3 33.880 35.187 2007.0 4 34.895 48.153 2007.0 *** type of X is <class 'numpy.ndarray'> (1000, 2) [[ 36.858 2008. ] [ 46.883 2012. ] [ 108.759 2007. ] [ 35.187 2007. ]] length of y is 1000 [43.995 44.995 25.999 33.88 ]
####################
###: train/test split
myseed = 88  # Nylander
# 70/30 split; fixed random_state so the split is reproducible
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=myseed, test_size=.3)
print(f'train sample size is {ytrain.shape[0]}\n')
print(f'test sample size is {ytest.shape[0]}\n')
# NOTE(review): pasted console output, commented out so the file parses:
# train sample size is 700 test sample size is 300
##################################################
###: scale both x and y, in principle you don't have to scale y
sc = StandardScaler()
xtr = sc.fit_transform(Xtrain)  # fit scaler on the training x only
xte = sc.transform(Xtest)       # reuse the training statistics on test x
scy = StandardScaler()
# StandardScaler wants 2-D input, hence the reshape(-1,1)
ytr = scy.fit_transform(ytrain.reshape(-1,1))
yte = scy.transform(ytest.reshape(-1,1))
print(xtr.shape)
print(ytr.shape)
print(xte.shape)
print(yte.shape)
# NOTE(review): pasted console output, commented out so the file parses:
# (700, 2) (700, 1) (300, 2) (300, 1)
##################################################
###: move to Tensors
# torch defaults to float32, so convert the numpy arrays before wrapping
xtr = xtr.astype('float32')
xxtr = torch.from_numpy(xtr)
ytr = ytr.astype('float32')
yytr = torch.from_numpy(ytr)
xte = xte.astype('float32')
xxte = torch.from_numpy(xte)
yte = yte.astype('float32')
yyte = torch.from_numpy(yte)
# quick look at scaled mileage vs scaled price
plt.scatter(xxtr[:,0],yytr)
plt.xlabel('scaled mileage'); plt.ylabel('scaled price')
print(xxtr.shape)
print(xxtr.dtype)
print(yytr.shape)
print(yytr.dtype)
print(yyte.dtype)
# NOTE(review): pasted console output, commented out so the file parses:
# torch.Size([700, 2]) torch.float32 torch.Size([700, 1]) torch.float32 torch.float32
##################################################
###: set seed(s)
theseed = 14  # Dave Keon
# seed every RNG the script touches so results are reproducible
for seeder in (torch.manual_seed, np.random.seed, random.seed):
    seeder(theseed)
## if gpu
#torch.cuda.manual_seed_all(theseed)
#torch.backends.cudnn.deterministic = True
#torch.backends.cudnn.benchmark = False
##################################################
###: Dataset
class DF(Dataset):
    """Minimal torch Dataset wrapping paired feature/target arrays.

    Parameters:
        x, y: indexable containers of equal length (features and targets).
        transform, target_transform: optional callables applied to each
            x / y sample on retrieval.
    """
    def __init__(self, x, y, transform=None, target_transform=None):
        self.x = x
        self.y = y
        self.transform = transform
        self.target_transform = target_transform
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        xi, yi = self.x[idx], self.y[idx]
        # bug fix: transforms were stored but never applied
        if self.transform is not None:
            xi = self.transform(xi)
        if self.target_transform is not None:
            yi = self.target_transform(yi)
        return xi, yi
# wrap the training tensors in our Dataset
tDF = DF(xxtr, yytr)
##################################################
###: DataLoader
# shuffled mini-batches of 50 training rows per epoch
tdl = DataLoader(dataset=tDF, batch_size=50, shuffle=True)
##################################################
###: model
class SLNN(nn.Module):
    """Single-hidden-layer feed-forward net: 2 inputs -> nunits (ReLU) -> 1 output."""
    def __init__(self, nunits=5):
        super().__init__()
        layers = [
            nn.Linear(2, nunits),
            nn.ReLU(),
            nn.Linear(nunits, 1),
        ]
        self.SSM = nn.Sequential(*layers)
    def forward(self, x):
        # prediction straight from the sequential stack
        return self.SSM(x)
nunits = 50  # hidden-layer width
model = SLNN(nunits)
## see model
print(model)
# NOTE(review): pasted console output (the model repr), commented out so the
# file parses:
# SLNN(
#   (SSM): Sequential(
#     (0): Linear(in_features=2, out_features=50, bias=True)
#     (1): ReLU()
#     (2): Linear(in_features=50, out_features=1, bias=True)
#   )
# )
##################################################
###: do it
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one epoch of optimization over dataloader's mini-batches.

    dataloader.dataset must expose .x and .y holding the full
    (already tensorized) training data.
    Returns the loss over the entire training set after the epoch.
    """
    # batch index and dataset size were unused; plain unpacking suffices
    for xb, yb in dataloader:
        # Compute prediction and loss
        pred = model(xb)
        loss = loss_fn(pred, yb)
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # report full-training-set loss; no_grad avoids building a useless graph
    with torch.no_grad():
        return loss_fn(model(dataloader.dataset.x), dataloader.dataset.y).item()
learning_rate = .05
l2par = .0  # weight_decay = L2 penalty strength; 0 disables it
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=l2par)
epochs = 5000
lv = np.zeros((epochs,))  # per-epoch full-training-set loss
printskip = 1000          # report every printskip epochs
for t in range(epochs):
    lv[t] = train_loop(tdl, model, loss_fn, optimizer)
    if ((t+1) % printskip == 0):
        print(f"Epoch {t+1} -------------------------------")
        print(f'train loss: {lv[t]}\n')
print("Done!")
# NOTE(review): pasted console output, commented out so the file parses:
# Epoch 1000 ------------------------------- train loss: 0.08587513864040375 Epoch 2000 ------------------------------- train loss: 0.0790461078286171 Epoch 3000 ------------------------------- train loss: 0.07889129966497421 Epoch 4000 ------------------------------- train loss: 0.07782591879367828 Epoch 5000 ------------------------------- train loss: 0.07735740393400192 Done!
###: plot loss over epochs
plt.plot(np.sqrt(lv))  # sqrt of MSE loss, i.e. RMSE per epoch
plt.xlabel('epoch')
# NOTE(review): pasted console output (matplotlib return value), commented
# out so the file parses:
# Text(0.5, 0, 'epoch')
###: out of sample
# predict on the scaled test x, then undo the y scaling to get price in $1000s
ypredN = model(xxte).detach().numpy()
ypredN = ypredN.astype('float64')
ypredN = scy.inverse_transform(ypredN)  # back to the original price scale
ypredN = ypredN.flatten()
print(type(ypredN))
print(ypredN.shape)
print(ypredN.dtype)
plt.scatter(ypredN, ytest, s=10)
plt.plot(ytest, ytest, c='red')  # 45-degree reference line
plt.xlabel('nn prediction'); plt.ylabel('test y')
print(f'neural net rmse: {myrmse(ypredN,ytest)}')
# NOTE(review): pasted console output, commented out so the file parses:
# <class 'numpy.ndarray'> (300,) float64 neural net rmse: 5.077