datasets
Loading datasets
This module preprocesses the datasets (where necessary) and returns them as
tf.data.Dataset objects.
The main interface is the function load_dataset(datasetName: str). It loads a
tf.data.Dataset from disk; if none is found, it creates one and saves it to
disk. This means it can also be called in advance to ensure that the dataset
is stored on disk and can be loaded quickly during training.
Use it like this:
kaggle = load_dataset("kaggle")
Valid datasetName values are: "cifar10", "cifar100", "kaggle", "kaggle_2",
"kaggle_10", "kaggle_20", "kaggle_50", "kaggle_100". The key "kaggle_x"
returns the Kaggle dataset clustered into x classes. If the dataset still has
to be constructed, a k-means clustering is run in the backend, which can take
a while.
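For example, the 10-class clustered variant is requested like this; the first
call runs the clustering and caches the result on disk, later calls load it
directly:
kaggle_10 = load_dataset("kaggle_10")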
Shuffling
To shuffle a dataset (e.g. to construct a training set from random samples),
call shuffle:
kaggle = load_dataset("kaggle")
kaggle_shuffled = shuffle(kaggle)
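Because the shuffle uses the global seed with reshuffle_each_iteration=False,
the shuffled order is deterministic, so disjoint subsets can be taken from it.
A minimal sketch (the subset sizes here are arbitrary, not prescribed by the
module):
train = kaggle_shuffled.take(10000)
test = kaggle_shuffled.skip(10000).take(5000)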
Pre-loading for later use
There is a convenience function load_all_datasets, which loads all datasets as
tf.data.Dataset objects and saves them to disk, so that they can be loaded
quickly in the future.
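For example, run it once ahead of training; this also triggers the k-means
clustering for the clustered Kaggle variants that are not yet on disk:
load_all_datasets()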
Seeding
Use set_seed to set a global seed, used by all functions that involve
randomness:
set_seed(1234)
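Since the global seed is read when clustering and shuffling are performed, set
it before the first call that builds or shuffles a dataset, e.g.:
set_seed(1234)
kaggle_10 = load_dataset("kaggle_10")
kaggle_10_shuffled = shuffle(kaggle_10)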
"""
.. include:: ../docs/datasets.md
"""

from os import environ

# Tensorflow C++ backend logging verbosity
environ["TF_CPP_MIN_LOG_LEVEL"] = "2"  # NOQA

from os.path import dirname, isdir, join
from os import makedirs
import shutil
from typing import Tuple

import numpy as np
import sklearn.cluster
import tensorflow as tf
from tensorflow.data import Dataset  # pyright: ignore
from tensorflow.keras.utils import to_categorical  # pyright: ignore
from tensorflow.python.framework import random_seed
from typing import List
from numpy.typing import NDArray

dataDir = join(dirname(__file__), "../data")
global_seed: int = 1234


def set_seed(new_seed: int):
    """
    Set the global seed that will be used for all functions that include
    randomness.
    """
    global global_seed
    global_seed = new_seed
    np.random.seed(global_seed)
    random_seed.set_seed(global_seed)


def _dataset_from_split(
        x_train, y_train, x_test, y_test) -> Dataset:
    """
    Using the provided split dataset, create a Dataset.
    """
    features: NDArray = np.append(x_train, x_test, axis=0).astype(np.float64)
    labels: NDArray = np.append(y_train, y_test, axis=0).astype(np.int32)
    labels = to_categorical(labels)
    return Dataset.from_tensor_slices((features, labels))


def _prepare_cifar100() -> Dataset:
    train, test = tf.keras.datasets.cifar100.load_data()
    return _dataset_from_split(train[0], train[1], test[0], test[1])


def _prepare_cifar10() -> Dataset:
    train, test = tf.keras.datasets.cifar10.load_data()
    return _dataset_from_split(train[0], train[1], test[0], test[1])


def _read_kaggle_data() -> Tuple[NDArray, NDArray]:
    """
    Read the Kaggle dataset features and labels from disk into Numpy arrays.
    """
    print("Reading Kaggle from raw file.")
    rawDataFile: str = join(dataDir, "kaggle", "raw_data")
    data: NDArray = np.loadtxt(rawDataFile, dtype=int, delimiter=',')
    labels: NDArray = data[:, 0]
    features: NDArray = data[:, 1:]
    # 0-based index
    assert np.min(labels) >= 0
    labels = labels - np.min(labels)
    labels = to_categorical(labels, dtype='int64')
    return features, labels


def shuffle(dataset: Dataset) -> Dataset:
    """
    Shuffle the dataset deterministically, using the global seed.
    """
    datasetSize = dataset.cardinality().numpy()
    if datasetSize <= 0:  # tf uses constants < 0 to indicate unknown cardinality
        print("Warning: Getting dataset size from loading it to memory via numpy iterator, potentially slow.")
        datasetSize = len(list(dataset.as_numpy_iterator()))
        assert datasetSize > 0, "Dataset is empty"

    return dataset.shuffle(datasetSize, seed=global_seed, reshuffle_each_iteration=False)


def _prepare_kaggle() -> Dataset:
    """
    Create Kaggle as Dataset from Numpy arrays.
    """
    features, labels = _read_kaggle_data()
    return Dataset.from_tensor_slices((features, labels))


def _prepare_clustered_kaggle(numberOfClusters: int):
    """
    Load the Kaggle data and cluster it.
    """
    print(f"Clustering Kaggle with {numberOfClusters} classes.")
    kmeans = sklearn.cluster.KMeans(n_clusters=numberOfClusters, random_state=global_seed)
    features, _ = _read_kaggle_data()
    kaggleSize = 197324
    labels: NDArray = kmeans.fit_predict(features).reshape(kaggleSize, 1)
    labels = to_categorical(labels, dtype='int64')
    return Dataset.from_tensor_slices((features, labels))


def load_attack(datasetName: str, verbose=True) -> Dataset:
    """
    Load a previously saved attack dataset from data/attack/<datasetName>/dataset.
    """
    datasetDir: str = join(dataDir, "attack", datasetName, "dataset")
    if verbose:
        print(f"Loading dataset \"{datasetName}\" from disk.")
    return tf.data.experimental.load(datasetDir)


def load_shadow(datasetName: str, verbose=True) -> Dataset:
    """
    Load a previously saved shadow dataset from data/shadow/<datasetName>/dataset.
    """
    datasetDir: str = join(dataDir, "shadow", datasetName, "dataset")
    if verbose:
        print(f"Loading dataset \"{datasetName}\" from disk.")
    return tf.data.experimental.load(datasetDir)


def load_numpy_array(filename: str):
    """
    Load a Numpy array from data/numpy/<filename>.
    """
    numpyFile: str = join(dataDir, "numpy", filename)
    return np.load(numpyFile, allow_pickle=True)


def save_numpy_array(filename: str, array):
    """
    Save a Numpy array to data/numpy/<filename>, creating the directory if needed.
    """
    numpyDir: str = join(dataDir, "numpy")
    if not isdir(numpyDir):
        makedirs(numpyDir)
    numpyFile: str = join(dataDir, "numpy", filename)
    np.save(numpyFile, array)


def save_target(dataset: Dataset, datasetName: str):
    """
    Save a target dataset to data/target/<datasetName>/dataset.
    """
    datasetDir: str = join(dataDir, "target", datasetName, "dataset")
    tf.data.experimental.save(dataset, datasetDir)


def load_target(datasetName: str, verbose=True) -> Dataset:
    """
    Load a previously saved target dataset from data/target/<datasetName>/dataset.
    """
    datasetDir: str = join(dataDir, "target", datasetName, "dataset")
    if verbose:
        print(f"Loading dataset \"{datasetName}\" from disk.")
    return tf.data.experimental.load(datasetDir)


def save_attack(dataset: Dataset, datasetName: str):
    """
    Save an attack dataset to data/attack/<datasetName>/dataset.
    """
    datasetDir: str = join(dataDir, "attack", datasetName, "dataset")
    tf.data.experimental.save(dataset, datasetDir)


def save_shadow(dataset: Dataset, datasetName: str):
    """
    Save a shadow dataset to data/shadow/<datasetName>/dataset.
    """
    datasetDir: str = join(dataDir, "shadow", datasetName, "dataset")
    tf.data.experimental.save(dataset, datasetDir)


def delete_shadow(datasetName: str):
    """
    Remove a saved shadow dataset from disk.
    """
    datasetDir: str = join(dataDir, "shadow", datasetName, "dataset")
    shutil.rmtree(datasetDir)


def load_dataset(datasetName: str) -> Dataset:
    """
    Load a dataset.

    Valid `datasetName` values are: "cifar10", "cifar100", "kaggle", "kaggle_2",
    "kaggle_10", "kaggle_20", "kaggle_50", "kaggle_100".
    """
    datasetDir: str = join(dataDir, datasetName, "dataset")
    if isdir(datasetDir):
        print(f"Loading {datasetName} from disk.")
        return tf.data.experimental.load(datasetDir)

    print(f"Loading {datasetName}.")

    if datasetName == "cifar10":
        dataset = _prepare_cifar10()
    elif datasetName == "cifar100":
        dataset = _prepare_cifar100()
    elif datasetName == "kaggle":
        dataset = _prepare_kaggle()
    elif datasetName == "kaggle_2":
        dataset = _prepare_clustered_kaggle(2)
    elif datasetName == "kaggle_10":
        dataset = _prepare_clustered_kaggle(10)
    elif datasetName == "kaggle_20":
        dataset = _prepare_clustered_kaggle(20)
    elif datasetName == "kaggle_50":
        dataset = _prepare_clustered_kaggle(50)
    elif datasetName == "kaggle_100":
        # "kaggle_100" is documented as valid above; this branch was missing
        # from the dispatch.
        dataset = _prepare_clustered_kaggle(100)
    else:
        raise ValueError(f"{datasetName} is not a known dataset.")

    print(f"Saving {datasetName} to disk.")
    tf.data.experimental.save(dataset, datasetDir)
    return dataset


def split_dataset(dataset: Dataset, numSubsets: int) -> List[Dataset]:
    """
    Split the dataset into numSubsets disjoint subsets via Dataset.shard.
    """
    datasets = []
    for i in range(numSubsets):
        datasets.append(dataset.shard(numSubsets, i))
    return datasets


def load_all_datasets():
    """
    Load all datasets, creating and saving any that are not yet on disk.
    """
    load_dataset("cifar10")
    load_dataset("cifar100")
    load_dataset("kaggle")
    load_dataset("kaggle_2")
    load_dataset("kaggle_10")
    load_dataset("kaggle_20")
    load_dataset("kaggle_50")


if __name__ == "__main__":
    import argparse
    import configuration as con

    parser = argparse.ArgumentParser(description='Make sure the needed dataset is downloaded.')
    parser.add_argument('--config', help='Relative path to config file.')
    config = con.from_cli_options(vars(parser.parse_args()))
    set_seed(config["seed"])

    dataName = config["targetDataset"]["name"]
    print(f"Loading {dataName}, doing clustering if necessary.")
    # Implicitly performs clustering for kaggle datasets
    load_dataset(dataName)
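The target, shadow, and attack save/load helpers persist datasets under
data/target, data/shadow, and data/attack respectively. A minimal sketch of
how they might be combined with split_dataset; the name "kaggle_shadow_0" is
only an illustrative key, not one the module defines:
kaggle = shuffle(load_dataset("kaggle"))
shards = split_dataset(kaggle, 5)
save_shadow(shards[0], "kaggle_shadow_0")
reloaded = load_shadow("kaggle_shadow_0")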