import torch
import numpy as np
from torch.utils.data import TensorDataset
from sklearn.datasets import load_boston

## Load the Boston housing data into numpy arrays
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed
# in 1.2 -- confirm the pinned scikit-learn version still ships it.
data_bunch = load_boston()
# feature matrix (one column per feature)
x = data_bunch['data']
# drop the second-to-last column (the 'B' feature): keep everything before
# it plus the final column, glued back together along the feature axis
cut_x = np.hstack((x[..., :-2], x[..., -1:]))
x = cut_x
# regression targets
y = data_bunch['target']


# standardize features and targets: subtract the mean, divide by the
# standard deviation
# NOTE: the statistics are taken over every sample, i.e. before the
# train/test split performed later in this file.
# features: one mean/std per column; keepdims keeps the reduced axis so the
# statistics broadcast against the rows of x
x_mean, x_std = (stat(x, axis=0, keepdims=True) for stat in (np.mean, np.std))
x = (x - x_mean) / x_std
# targets: a single scalar mean/std
y_mean, y_std = np.mean(y), np.std(y)
y = (y - y_mean) / y_std

# hold out a random fraction of the samples for testing
test_percentage = 0.2
length_data = y.shape[0]
# int() truncates, so the test set never exceeds the requested fraction
length_test_data = int(length_data * test_percentage)
full_indices = np.arange(length_data)
# fixed seed so the split is reproducible (this seeds numpy's global RNG)
np.random.seed(0)
# draw the test indices without replacement so no sample is picked twice
test_indices = np.random.choice(
    full_indices, size=length_test_data, replace=False
)
# every index that was not drawn for testing is used for training
train_indices = np.setdiff1d(full_indices, test_indices)
# materialize both splits as float32 tensors
train_x = torch.tensor(x[train_indices, ...], dtype=torch.float32)
train_y = torch.tensor(y[train_indices, ...], dtype=torch.float32)
test_x = torch.tensor(x[test_indices, ...], dtype=torch.float32)
test_y = torch.tensor(y[test_indices, ...], dtype=torch.float32)

# wrap each split in a TensorDataset; the targets are viewed as a single
# column so every sample pairs a feature row with a length-1 target
train_data = TensorDataset(train_x, train_y.view(-1, 1))
test_data = TensorDataset(test_x, test_y.view(-1, 1))