Commit dda1e409 authored by Jörg Martin

Added linear dataset

parent c0852b66
import torch
import sys
from torch.utils.data import TensorDataset, random_split

# parameters of the simulated linear dataset
total_number_of_datapoints = 10000
input_range = [-1, 1]
slope = 1.0
intercept = 0.0
x_noise_strength = 0.1
y_noise_strength = 0.1

def get_normalization(*args):
    """
    Returns the means and standard deviations (as tuples) of the tensors
    in *args.
    """
    normalization_collection = []
    for t in args:
        t_mean = torch.mean(t, dim=0, keepdim=True)
        t_std = torch.std(t, dim=0, keepdim=True)
        normalization_collection.append((t_mean, t_std))
    return tuple(normalization_collection)

def load_data(seed=0, splitting_part=0.8, normalize=True,
        return_ground_truth=False):
    """
    Loads one-dimensional data.
    :param seed: Seed for drawing and splitting the data.
    :param splitting_part: Which fraction of the data to use as training
    data. Defaults to 0.8.
    :param normalize: Whether to normalize the data, defaults to True.
    :param return_ground_truth: Boolean. If True, the noise-free ground truth
    will also be returned. Defaults to False.
    :returns: linear_trainset, linear_testset if return_ground_truth is False,
    else linear_trainset, linear_testset, (true_x, true_y)
    """
    random_generator = torch.Generator().manual_seed(seed)
    # draw different seeds for noise and splitting
    seeds = torch.randint(0, sys.maxsize, (4,), generator=random_generator)
    # create new generators from tensor seeds
    create_generator = lambda tensor_seed:\
            torch.Generator().manual_seed(tensor_seed.item())
    # sample inputs uniformly from input_range
    true_x = input_range[0] + (input_range[1] - input_range[0])\
            * torch.rand((total_number_of_datapoints, 1),
                    generator=create_generator(seeds[0]))
    true_y = slope * true_x + intercept
    # add Gaussian noise to inputs and outputs
    noisy_x = true_x + x_noise_strength * \
            torch.randn((total_number_of_datapoints, 1),
                    generator=create_generator(seeds[1]))
    noisy_y = true_y + y_noise_strength * \
            torch.randn((total_number_of_datapoints, 1),
                    generator=create_generator(seeds[2]))
    if normalize:
        # normalize the ground truth with the statistics of the noisy data
        normalization_x, normalization_y = get_normalization(noisy_x, noisy_y)
        noisy_x = (noisy_x - normalization_x[0]) / normalization_x[1]
        true_x = (true_x - normalization_x[0]) / normalization_x[1]
        noisy_y = (noisy_y - normalization_y[0]) / normalization_y[1]
        true_y = (true_y - normalization_y[0]) / normalization_y[1]
    linear_dataset = TensorDataset(noisy_x, noisy_y)
    # randomly split into training and test set
    dataset_len = len(linear_dataset)
    train_len = int(dataset_len * splitting_part)
    test_len = dataset_len - train_len
    linear_trainset, linear_testset = random_split(linear_dataset,
            lengths=[train_len, test_len],
            generator=create_generator(seeds[3]))
    if not return_ground_truth:
        return linear_trainset, linear_testset
    else:
        return linear_trainset, linear_testset, (true_x, true_y)
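For orientation, a minimal usage sketch of the new loader; this is illustrative and not part of the commit, and it assumes load_data is called from within (or imported from) this module:

# Illustrative usage of the linear dataset loader defined above;
# the import path of this module is an assumption.
from torch.utils.data import DataLoader

linear_trainset, linear_testset, (true_x, true_y) = load_data(
        seed=0, return_ground_truth=True)
train_loader = DataLoader(linear_trainset, batch_size=100, shuffle=True)
noisy_x, noisy_y = next(iter(train_loader))
print(noisy_x.shape, noisy_y.shape)  # torch.Size([100, 1]) torch.Size([100, 1])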
@@ -4,24 +4,24 @@ from torch.utils.data import random_split

 def load_data(seed=0, splitting_part=0.8, normalize=True):
     """
-    Loads the naval propulsion dataset
+    Loads the power plant dataset
     :param seed: Seed for splitting and shuffling the data.
     Defaults to 0.
     :param splitting_part: Which fraction of the data to use as training
     data. Defaults to 0.8.
     :normalize: Whether to normalize the data, defaults to True.
-    :returns: naval_trainset, naval_testset
+    :returns: power_trainset, power_testset
     """
-    naval_dataset = CSVData('~/SharedData/AI/datasets/combined_cycle_power_plant/Folds5x2_pp_single_sheet.csv',
+    power_dataset = CSVData('~/SharedData/AI/datasets/combined_cycle_power_plant/Folds5x2_pp_single_sheet.csv',
             class_name="PE",
             shuffle_seed=seed,
             normalize=normalize,
             delimiter=",")
-    dataset_len = len(naval_dataset)
+    dataset_len = len(power_dataset)
     train_len = int(dataset_len*splitting_part)
     test_len = dataset_len - train_len
-    naval_trainset, naval_testset = random_split(naval_dataset,
+    power_trainset, power_testset = random_split(power_dataset,
             lengths=[train_len, test_len],
             generator=torch.Generator().manual_seed(seed))
-    return naval_trainset, naval_testset
+    return power_trainset, power_testset
{
    "long_dataname": "linear",
    "short_dataname": "linear",
    "lr": 1e-3,
    "batch_size": 100,
    "test_batch_size": 800,
    "number_of_epochs": 100,
    "unscaled_reg": 10,
    "report_point": 5,
    "p": 0.1,
    "lr_update": 20,
    "std_y_update_points": [1,40],
    "eiv_prediction_number_of_draws": [100,5],
    "eiv_prediction_number_of_batches": 10,
    "init_std_y_list": [0.5],
    "gamma": 0.5,
    "hidden_layers": [1024, 1024, 1024, 1024],
    "fixed_std_x": 0.1,
    "seed_range": [0,10],
    "gpu_number": 1
}
{
    "long_dataname": "linear",
    "short_dataname": "linear",
    "lr": 1e-3,
    "batch_size": 100,
    "test_batch_size": 800,
    "number_of_epochs": 100,
    "unscaled_reg": 10,
    "report_point": 5,
    "p": 0.1,
    "lr_update": 20,
    "std_y_update_points": [1,40],
    "noneiv_prediction_number_of_draws": 100,
    "noneiv_prediction_number_of_batches": 10,
    "init_std_y_list": [0.5],
    "gamma": 0.5,
    "hidden_layers": [1024, 1024, 1024, 1024],
    "seed_range": [0,10],
    "gpu_number": 1
}
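The two configurations above differ mainly in their prediction keys: the first carries eiv_prediction_* settings and a fixed_std_x for the error-in-variables model, the second uses noneiv_prediction_* and omits fixed_std_x. A minimal sketch of how such a file could be read, assuming the training scripts consume plain JSON (the filename and the snippet are illustrative, not the repository's actual loading code):

# Illustrative only; the repository's scripts may read these files differently.
import json

with open('linear.json', 'r') as conf_file:  # filename assumed
    parameters = json.load(conf_file)

print(parameters['hidden_layers'])    # [1024, 1024, 1024, 1024]
print(parameters['init_std_y_list'])  # [0.5]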
"""
Compute metrics for datasets for which there is not necessarily a ground truth.
Results will be stored in the results folder
"""
import importlib
import os
import argparse
@@ -15,7 +19,7 @@ from EIVGeneral.coverage_metrics import epistemic_coverage, normalized_std

 # read in data via --data option
 parser = argparse.ArgumentParser()
-parser.add_argument("--data", help="Loads data", default='california')
+parser.add_argument("--data", help="Loads data", default='linear')
 parser.add_argument("--no-autoindent", help="",
         action="store_true") # to avoid conflicts in IPython
 args = parser.parse_args()
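Given the importlib import at the top of the file, the --data value is presumably used to resolve the matching data module dynamically. A hedged sketch of that pattern (the package path 'EIVData' is an assumption, not confirmed by this diff):

# Sketch of the dynamic-import pattern suggested by the imports above;
# the package path for the data modules is an assumption.
import importlib

data = 'linear'  # value of the --data option
load_data = importlib.import_module(f'EIVData.{data}').load_data
train_data, test_data = load_data(seed=0)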
@@ -73,6 +77,7 @@ def collect_metrics(x,y, seed=0,
     of Gal et al. is followed where, in the evaluation of the
     log-posterior-predictive, each dimension is treated independently and then
     averaged. If False (default), a multivariate distribution is used.
+    :param device: The device to use.
     :param scale_outputs: Boolean, scale the outputs for the RMSE, the bias and
     the log-dens to make them comparable with the literature.
     :returns: Dictionaries noneiv_metrics, eiv_metrics
@@ -167,7 +172,6 @@ def collect_metrics(x,y, seed=0,
     y = y.view((-1,1))
     assert y.shape == eiv_mean.shape
     res = y-eiv_mean
-    scale = train_data.dataset.std_labels.to(device)
     if scale_outputs:
         scale = train_data.dataset.std_labels.to(device)
         scaled_res = res * scale.view((1,-1))
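The removed line was redundant: scale is only needed, and is recomputed, inside the scale_outputs branch. A self-contained sketch of the resulting logic, with stand-in values and an else branch assumed from context:

# Self-contained sketch of the corrected scaling logic; names and values
# are stand-ins for those in the hunk above, and the else branch is assumed.
import torch

res = torch.randn(100, 1)         # stand-in residuals y - eiv_mean
std_labels = torch.tensor([2.5])  # stand-in for train_data.dataset.std_labels
scale_outputs = True

if scale_outputs:
    scale = std_labels  # previously this was also computed unconditionally
    scaled_res = res * scale.view((1, -1))
else:
    scaled_res = res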