repeated_sampling.py

"""
Contains the class `repeated_sampling` that can be used to generate
datasets for repeated sampling from datasets with a ground truth.
"""
import sys

import torch
from torch.utils.data import TensorDataset

from EIVGeneral.manipulate_tensors import add_noise

class repeated_sampling():
    """
    A class for repeated sampling from datasets with a known ground truth and
    known input and output noise. The class `dataclass` should contain a
    `load_data` routine that returns a ground truth and two positive floats
    `x_noise_strength` and `y_noise_strength` that will be used as the standard
    deviation of input and output noise.
    :param dataclass: A module that contains a routine `load_data`, which
    accepts the keyword `return_ground_truth` and returns the noisy and true
    train and test datasets, and two positive floats `x_noise_strength` and
    `y_noise_strength`.
    :param fixed_seed: Integer. The seed to load the unnoisy ground truth,
    defaults to 0.
    """
    def __init__(self, dataclass, fixed_seed=0):
        self.dataclass = dataclass
        self.fixed_seed = fixed_seed
        self.x_noise_strength = dataclass.x_noise_strength
        self.y_noise_strength = dataclass.y_noise_strength

    def __call__(self,seed=0, splitting_part=0.8, normalize=True,
            return_ground_truth=False):
        _, _, true_trainset, true_testset\
                = self.dataclass.load_data(
                        seed=self.fixed_seed, splitting_part=splitting_part,
                        normalize=False,
                        return_ground_truth=True)
        full_noisy_x = torch.concat((true_trainset.tensors[2],
                true_testset.tensors[2]), dim=0)
        full_noisy_y = torch.concat((true_trainset.tensors[3],
                true_testset.tensors[3]), dim=0)
        true_train_x, true_train_y = true_trainset.tensors[:2]
        true_test_x, true_test_y = true_testset.tensors[:2]
        random_generator = torch.Generator().manual_seed(seed)
        # draw different seeds for noise and splitting
        seeds = [int(t) for t in torch.randint(0,sys.maxsize,(2,),\
                generator=random_generator)]
        (noisy_train_x, noisy_train_y), (true_train_x, true_train_y) =\
                add_noise((true_train_x, true_train_y),
                        (self.x_noise_strength, self.y_noise_strength), seeds,
                        normalize=normalize,
                        normalization_list=[full_noisy_x, full_noisy_y])
        (noisy_test_x, noisy_test_y), (true_test_x, true_test_y) =\
                add_noise((true_test_x, true_test_y),
                        (self.x_noise_strength, self.y_noise_strength), seeds,
                        normalize=normalize,
                        normalization_list=[full_noisy_x, full_noisy_y])
        trainset = TensorDataset(noisy_train_x, noisy_train_y)
        testset = TensorDataset(noisy_test_x, noisy_test_y)
        true_trainset = TensorDataset(true_train_x, true_train_y,
                noisy_train_x, noisy_train_y)
        true_testset = TensorDataset(true_test_x, true_test_y,
                noisy_test_x, noisy_test_y)
        if not return_ground_truth:
            return trainset, testset
        else:
            return trainset, testset, true_trainset, true_testset