import torch
from torch.utils.data import TensorDataset
import pandas as pd
import numpy as np
from data_frameworks import CSVData

# CSV file to load; its first row is used as the header (header=0 below).
# NOTE(review): the UCI distribution of this file is ';'-separated -- confirm
# that the local copy really is comma-separated.
data_name = 'winequality-red.csv'

# np.array(...) gives a writable copy of the table, which the in-place log
# transform below relies on.
wine_data = np.array(pd.read_csv(data_name, sep=',', header=0))

# The last column becomes y, every other column is a feature.  Both are views
# into wine_data, so the in-place transform of x also changes wine_data.
y = wine_data[:, -1]
x = wine_data[:, :-1]

# Feature columns treated as log-normally distributed: replace them by their
# natural logarithm, all selected columns at once.
log_indices = [0, 1, 3, 4, 5, 6, 9, 10]
x[:, log_indices] = np.log(x[:, log_indices])

# Standardise features and target to zero mean and unit standard deviation.
# NOTE(review): the statistics are taken over all rows, i.e. including the
# rows that are later held out for testing -- confirm this is intended.

# Features: one mean/std per column, kept 2-D (keepdims) so they broadcast
# against x row by row.
x_mean = x.mean(axis=0, keepdims=True)
x_std = x.std(axis=0, keepdims=True)
x = (x - x_mean) / x_std

# Target: scalar mean/std.  The statistics stay available at module level.
y_mean = y.mean()
y_std = y.std()
y = (y - y_mean) / y_std

# Random train/test split with a fixed seed so the partition is reproducible.
test_percentage = 0.2
length_data = len(y)
# int() truncates, so the test set never exceeds the requested fraction.
length_test_data = int(test_percentage * length_data)
full_indices = np.arange(length_data)

# Seed NumPy's global generator, then draw the test rows without replacement.
np.random.seed(0)
test_indices = np.random.choice(
    full_indices, size=length_test_data, replace=False)
# Every row that was not drawn for testing is used for training.
train_indices = np.setdiff1d(full_indices, test_indices)

# Convert each subset to float32 tensors.
train_x = torch.tensor(x[train_indices], dtype=torch.float32)
train_y = torch.tensor(y[train_indices], dtype=torch.float32)
test_x = torch.tensor(x[test_indices], dtype=torch.float32)
test_y = torch.tensor(y[test_indices], dtype=torch.float32)

# Wrap the tensors in datasets; the 1-D targets get a trailing axis so each
# sample's target has shape (1,).
train_data = TensorDataset(train_x, train_y.unsqueeze(1))
test_data = TensorDataset(test_x, test_y.unsqueeze(1))