Why not do something like this:
import numpy as np
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    def __init__(self, content, max_length=None):
        # The DataLoader will only ever draw indices in [0, len(self)), so you may
        # want to shuffle `content` here so that smaller datasets are not simply
        # subsets of the larger ones.
        self.content = content
        self.max_length = max_length or float("inf")

    def __len__(self):
        return min(self.max_length, len(self.content))

    def __getitem__(self, idx):
        return self.content[idx]
train = MyDataset(np.arange(1000), max_length=100)
loader = DataLoader(train, batch_size=25, shuffle=True)

for entry in loader:
    print(entry)
Output:
tensor([ 8, 33, 50, 40, 85, 32, 96, 58, 64, 78, 35, 97, 29, 73, 36, 68, 82, 62,
89, 66, 48, 79, 99, 93, 6], dtype=torch.int32)
tensor([52, 3, 26, 28, 77, 95, 24, 10, 7, 20, 16, 1, 25, 92, 56, 91, 30, 14,
65, 51, 74, 98, 46, 61, 81], dtype=torch.int32)
tensor([22, 5, 0, 31, 70, 45, 72, 42, 69, 12, 17, 41, 23, 54, 88, 60, 80, 90,
2, 47, 27, 67, 13, 49, 18], dtype=torch.int32)
tensor([34, 71, 19, 39, 37, 55, 21, 43, 57, 86, 59, 83, 11, 38, 87, 9, 94, 84,
63, 76, 53, 15, 75, 4, 44], dtype=torch.int32)
I'd further leave the validation set fixed. Split the validation data off and then only vary the size of the training data for an apples-to-apples comparison.
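A minimal sketch of that idea, assuming the full data sits in a NumPy array called data (the split point and the sizes below are just placeholders):

rng = np.random.default_rng(42)
indices = rng.permutation(len(data))

val_indices = indices[:200]    # fixed validation set
train_indices = indices[200:]  # pool to draw training subsets from

val = MyDataset(data[val_indices])
for size in (100, 500, 1000):
    train = MyDataset(data[train_indices], max_length=size)
    loader = DataLoader(train, batch_size=25, shuffle=True)
    # ... train on loader, evaluate on the fixed val set ...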
Edit:

For stratification, you can use train_test_split from sklearn.model_selection. A strategy for defining such a dataset could be to take the complete training data (X), the labels (y) and the desired size as constructor arguments.
from sklearn.model_selection import train_test_split

class LimitedDataset(Dataset):
    def __init__(self, X, y, size, random_state=42):
        # Keep a stratified subsample of the requested size; discard the rest.
        X_selected, _, y_selected, _ = train_test_split(
            X, y, stratify=y, train_size=size, random_state=random_state
        )
        self.X = X_selected
        self.y = y_selected

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
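For illustration, it could then be used like this (X, y and the sizes are placeholders for your own data):

for size in (100, 500, 1000):
    train = LimitedDataset(X, y, size=size)
    loader = DataLoader(train, batch_size=25, shuffle=True)
    # ... train on loader, evaluate on the fixed validation set ...

Since random_state is fixed, each call selects a reproducible stratified subset.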