# linear.py

import torch
import sys
from torch.utils.data import TensorDataset, random_split

# Configuration of the synthetic linear dataset produced by load_data.

# Number of (x, y) samples drawn before the train/test split.
total_number_of_datapoints = 2000
# Lower and upper bound of the interval the unnoisy inputs are drawn from.
input_range = [-1, 1]
# Ground truth relation: true_y = slope * true_x + intercept.
slope = 1.0
intercept = 0.0
# Factors multiplying the standard normal noise added to x and y.
x_noise_strength = 0.05
y_noise_strength = 0.1

def get_normalization(*args):
    """
    Computes normalization statistics for each of the given tensors.

    :param args: Tensors to compute statistics for. Each one is reduced along
    dimension 0, which is kept with size one in the results.
    :returns: A tuple holding one (mean, standard deviation) pair per tensor
    in args, in the same order. Empty if no tensors are given.
    """
    return tuple(
            (torch.mean(tensor, dim=0, keepdim=True),
             torch.std(tensor, dim=0, keepdim=True))
            for tensor in args)

def load_data(seed=0, splitting_part=0.8, normalize=True,
        return_ground_truth=False):
    """
    Generates a noisy one-dimensional linear dataset and splits it randomly
    into a training and a test set.

    Unnoisy inputs are drawn uniformly from input_range and mapped to
    slope * x + intercept. Independent standard normal noise, scaled by
    x_noise_strength and y_noise_strength respectively, is then added to
    inputs and outputs. Only the noisy values end up in the returned sets.

    :param seed: Seed for drawing and splitting the data.
    :param splitting_part: Which fraction of the data to use as training
    data. Defaults to 0.8.
    :param normalize: Whether to normalize the data, defaults to True. The
    statistics are computed on the complete noisy data (before splitting)
    and are applied to the ground truth as well.
    :param return_ground_truth: Boolean. If True, the unnoisy ground truth will
    also be returned. Defaults to False.
    :returns: linear_trainset, linear_testset if return_ground_truth is False,
    else linear_trainset, linear_testset, (true_x, true_y)
    """
    def generator_from(tensor_seed):
        # Fresh generator seeded with the integer stored in tensor_seed.
        return torch.Generator().manual_seed(tensor_seed.item())

    sample_shape = (total_number_of_datapoints, 1)

    # Derive one independent sub-seed per random step from the user seed, so
    # that inputs, both noise terms and the split do not share a generator.
    master_generator = torch.Generator().manual_seed(seed)
    input_seed, x_noise_seed, y_noise_seed, split_seed = torch.randint(
            0, sys.maxsize, (4,), generator=master_generator)

    # Unnoisy ground truth: uniform inputs and their exact linear response.
    lower_bound = input_range[0]
    upper_bound = input_range[1]
    uniform_draw = torch.rand(sample_shape,
            generator=generator_from(input_seed))
    true_x = lower_bound + (upper_bound - lower_bound) * uniform_draw
    true_y = slope * true_x + intercept

    # Observed data: ground truth disturbed by Gaussian noise in x and y.
    x_noise = torch.randn(sample_shape,
            generator=generator_from(x_noise_seed))
    noisy_x = true_x + x_noise_strength * x_noise
    y_noise = torch.randn(sample_shape,
            generator=generator_from(y_noise_seed))
    noisy_y = true_y + y_noise_strength * y_noise

    if normalize:
        # Statistics come from the noisy data; the ground truth is rescaled
        # with the same values to stay comparable.
        (x_mean, x_std), (y_mean, y_std) = get_normalization(noisy_x, noisy_y)
        noisy_x = (noisy_x - x_mean) / x_std
        true_x = (true_x - x_mean) / x_std
        noisy_y = (noisy_y - y_mean) / y_std
        true_y = (true_y - y_mean) / y_std

    linear_dataset = TensorDataset(noisy_x, noisy_y)
    number_of_samples = len(linear_dataset)
    # The training size is truncated to an integer; the test set gets the rest.
    number_of_train_samples = int(number_of_samples * splitting_part)
    number_of_test_samples = number_of_samples - number_of_train_samples
    linear_trainset, linear_testset = random_split(
            linear_dataset,
            lengths=[number_of_train_samples, number_of_test_samples],
            generator=generator_from(split_seed))

    if return_ground_truth:
        return linear_trainset, linear_testset, (true_x, true_y)
    return linear_trainset, linear_testset