"""Build train/test ``TensorDataset`` objects from ``winequality-red.csv``.

Steps performed at import time:

1. Read the CSV (comma-delimited, first row is the header) into an array.
2. Use the last column as the target ``y`` and the rest as features ``x``.
3. Log-transform the feature columns listed in ``log_indices``.
4. Standardise ``y`` and every column of ``x`` to zero mean / unit std,
   using statistics computed over the full data set.
5. Hold out 20% of the rows (chosen with NumPy's global RNG seeded to 0)
   as the test set and wrap both splits as ``TensorDataset`` objects.
"""
import torch
from torch.utils.data import TensorDataset
import pandas as pd
import numpy as np

from data_frameworks import CSVData

data_name = 'winequality-red.csv'
wine_data = np.array(pd.read_csv(data_name, delimiter=',', header=0))

# Features are every column but the last; the last column is the target.
# Both are views into ``wine_data`` at this point.
x = wine_data[:, :-1]
y = wine_data[:, -1]

# Log transform to account for log normal distribution of these columns.
# ``x`` is a view, so the assignment writes through to ``wine_data`` too.
log_indices = [0, 1, 3, 4, 5, 6, 9, 10]
x[:, log_indices] = np.log(x[:, log_indices])

# Normalize the target (scalar statistics).
y_mean = y.mean()
y_std = y.std()
y = (y - y_mean) / y_std

# Normalize the features column-wise; keepdims=True keeps the statistics
# as (1, n_features) rows so they broadcast against ``x``.
x_mean = x.mean(axis=0, keepdims=True)
x_std = x.std(axis=0, keepdims=True)
x = (x - x_mean) / x_std

# Randomly split for training and testing.
length_data = len(y)
test_percentage = 0.2
length_test_data = int(length_data * test_percentage)
full_indices = np.arange(length_data)

# Seed the global NumPy RNG so the split is reproducible across runs.
np.random.seed(0)
test_indices = np.random.choice(
    full_indices,
    size=length_test_data,
    replace=False,
)
# Every index not drawn for the test set goes to training (sorted order).
train_indices = np.setdiff1d(full_indices, test_indices)

train_x = torch.tensor(x[train_indices], dtype=torch.float32)
train_y = torch.tensor(y[train_indices], dtype=torch.float32)
test_x = torch.tensor(x[test_indices], dtype=torch.float32)
test_y = torch.tensor(y[test_indices], dtype=torch.float32)

# Create datasets; targets are reshaped to a column, shape (n_rows, 1).
train_data = TensorDataset(train_x, train_y.reshape(-1, 1))
test_data = TensorDataset(test_x, test_y.reshape(-1, 1))