attack_data
Constructing training data for attack models
Membership status is one-hot encoded with the following scheme:
- `[1, 0]` or `0`: the record was part of the training data ("in")
- `[0, 1]` or `1`: the record was not part of the training data ("out")
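For instance, `to_categorical` (used below in `from_target_data`) produces exactly this encoding from the integer labels assigned in `_prepare_subset`:

```python
import numpy as np
from tensorflow.keras.utils import to_categorical

# 0 = member ("in"), 1 = non-member ("out")
print(to_categorical(np.array([0, 1])))
# [[1. 0.]
#  [0. 1.]]
```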
1""" 2.. include:: ../docs/attack_data.md 3""" 4 5from os import environ 6from math import floor 7from typing import List, Dict, Tuple 8 9# Tensorflow C++ backend logging verbosity 10environ["TF_CPP_MIN_LOG_LEVEL"] = "2" # NOQA 11 12import target_models as tm 13import datasets as ds 14 15import numpy as np 16import tensorflow as tf 17from tensorflow.data import Dataset # pyright: ignore 18from tensorflow.python.framework import random_seed 19from tensorflow.keras import Sequential # pyright: ignore 20from tensorflow.keras.utils import to_categorical # pyright: ignore 21 22global_seed: int = 1234 23 24 25def set_seed(new_seed: int): 26 """ 27 Set the global seed that will be used for all functions that include 28 randomness. 29 """ 30 global global_seed 31 global_seed = new_seed 32 np.random.seed(global_seed) 33 random_seed.set_seed(global_seed) 34 35 36def _prepare_subset(superset: Dataset, size: int, 37 model: Sequential, inTraining: bool): 38 # TODO: hardcoded batch size 39 batchSize: int = 1 40 subset = superset.take(size).batch(batchSize) 41 predictions = model.predict(subset, batch_size=batchSize) 42 if inTraining: 43 labels = np.zeros(size) 44 else: 45 labels = np.ones(size) 46 return predictions, labels 47 48 49def _get_filter_fn(label: int): 50 51 wantedLabel = np.int64(label) 52 def _filter_fn(_, y): return tf.math.equal(wantedLabel, tf.math.argmax(y)) 53 return _filter_fn 54 55 56def from_target_data(targetTrainData: Dataset, targetTestData: Dataset, 57 targetModel: Sequential, label: int) -> Dataset: 58 """ 59 Use the training and test data to construct the attack dataset. 60 Splitting is done according to this scheme (best viewed in the source code): 61 62 Training data Testing data 63 64 ┌───────┬───────┐ ┌───────┬───────┐ 65 │ │ │ │ │ │ 66 │ B │ C │ │ A │ D │ 67 └────┬──┴─────┬─┘ └───┬───┴────┬──┘ 68 │ │ │ │ 69 │ ┌───┼───────────────┘ │ 70 B │ A │ └──────────────────┐ │ 71 │ │ │C │D 72 │ │ │ │ 73 ▼ ▼ ▼ ▼ 74 75 ┌───────────────┐ ┌───────────────┐ 76 │ │ │ │ 77 │ A+B │ │ C+D │ 78 └───────────────┘ └───────────────┘ 79 Attack train data Attack test data 80 81 """ 82 # TODO assertions about disjoint sets, and equal set sizes 83 targetTrainData = targetTrainData.filter(_get_filter_fn(label)) 84 targetTestData = targetTestData.filter(_get_filter_fn(label)) 85 86 # There are only limited data points per class, thus we use as many as we 87 # can get and split them 80/20 for training 88 dataSizePerSet: int = min(len(list(targetTrainData.as_numpy_iterator())), len( 89 list(targetTestData.as_numpy_iterator()))) 90 91 splitFactor = 0.8 92 attackTestSize = int((1 - splitFactor) * dataSizePerSet) 93 attackTrainSize = int(splitFactor * dataSizePerSet) 94 95 print(f"Train dataset size (train) for label {label}: {attackTrainSize}") 96 97 halfAttackTrainSize: int = int(attackTrainSize / 2) 98 halfAttackTestSize: int = int(attackTestSize / 2) 99 100 APredictions, ALabels = _prepare_subset(targetTestData, halfAttackTrainSize, targetModel, False) 101 BPredictions, BLabels = _prepare_subset(targetTrainData, halfAttackTrainSize, targetModel, True) 102 CPredictions, CLabels = _prepare_subset( 103 targetTrainData.skip(halfAttackTrainSize), halfAttackTestSize, targetModel, True) 104 DPredictions, DLabels = _prepare_subset( 105 targetTestData.skip(halfAttackTrainSize), halfAttackTestSize, targetModel, False) 106 107 featuresTrain = np.append(APredictions, BPredictions, axis=0) 108 featuresTest = np.append(CPredictions, DPredictions, axis=0) 109 labelsTrain = to_categorical(np.append(ALabels, BLabels, 
axis=0)) 110 labelsTest = to_categorical(np.append(CLabels, DLabels, axis=0)) 111 112 attackTrainData = Dataset.from_tensor_slices((featuresTrain, labelsTrain)) 113 attackTestData = Dataset.from_tensor_slices((featuresTest, labelsTest)) 114 115 return attackTrainData, attackTestData 116 117 118def load_attack_data(config: Dict) -> List[Tuple[ds.Dataset, ds.Dataset]]: 119 verbose = config["verbose"] 120 numClasses = config["targetModel"]["classes"] 121 numDatasets = numClasses 122 attackDatasets = [] 123 for i in range(numDatasets): 124 testData = ds.load_attack(_get_attack_data_name(config, i, test=True), verbose=verbose) 125 trainData = ds.load_attack(_get_attack_data_name(config, i, test=False), verbose=verbose) 126 attackDatasets.append((testData, trainData)) 127 return attackDatasets 128 129 130def _get_attack_data_name(config: Dict, i, test=False): 131 numModels: int = config["shadowModels"]["number"] 132 numClasses = config["targetModel"]["classes"] 133 shadowMethod = config["shadowDataset"]["method"] 134 split: float = config["shadowModels"]["split"] 135 if config["attackDataset"]["balance"]: 136 balanced = "balanced" 137 else: 138 balanced = "unbalanced" 139 name = tm.get_model_name(config) + "_" + shadowMethod + f"_split_{split}_with_{numModels}_models_{i+1}_of_{numClasses}_{balanced}" 140 if test: 141 return name + "_test" 142 else: 143 return name + "_train" 144 145 146def save(config: Dict, datasets: List[ds.Dataset]): 147 numClasses = config["targetModel"]["classes"] 148 assert numClasses == len( 149 datasets), "List should contain 1 dataset per class" 150 for index, (trainData, testData) in enumerate(datasets): 151 if config["verbose"]: 152 print(f"Saving attack dataset #{index+1}/{numClasses}") 153 ds.save_attack(trainData, _get_attack_data_name(config, index, test=False)) 154 ds.save_attack(testData, _get_attack_data_name(config, index, test=True)) 155 156 157def shuffle(dataset: Dataset, bufferSize=10000) -> Dataset: 158 # TODO: hard coded buffer size (needs to be greater than dataset size for 159 # real random sampling.) 160 return dataset.shuffle(bufferSize, seed=global_seed, reshuffle_each_iteration=False) 161 162 163def _balance_attack_data(dataset: ds.Dataset) -> ds.Dataset: 164 in_data = dataset.filter(lambda _, y: tf.math.equal(tf.argmax(y), 1)) 165 out_data = dataset.filter(lambda _, y: tf.math.equal(tf.argmax(y), 0)) 166 167 in_points = len(list(in_data)) 168 out_points = len(list(out_data)) 169 170 if in_points > out_points: 171 in_data = shuffle(in_data).skip(in_points - out_points) 172 elif in_points < out_points: 173 out_data = shuffle(out_data).skip(out_points - in_points) 174 else: 175 return dataset 176 177 return in_data.concatenate(out_data) 178 179 180def balance_attack_data(datasets: List[ds.Dataset]) -> List[ds.Dataset]: 181 """ 182 Make sure that input datasets have equal number of in/out datapoints. 
183 """ 184 size = len(datasets) 185 for index, dataset in enumerate(datasets): 186 print(f"Balancing dataset {index+1} of {size}.") 187 datasets[index] = _balance_attack_data(dataset) 188 return datasets 189 190 191def split_dataset(dataset: ds.Dataset, split: float): 192 datasetSize = len(list(dataset)) 193 trainSize = floor(split * datasetSize) 194 testSize = floor((1 - split) * datasetSize) 195 assert trainSize + testSize <= datasetSize 196 197 trainData = dataset.take(trainSize) 198 testData = dataset.skip(trainSize).take(testSize) 199 return trainData, testData 200 201 202def split_attack_data_for_training(datasets: List[ds.Dataset], config: Dict): 203 split = config["attackDataset"]["split"] 204 splitDatasets = [] 205 for index, dataset in enumerate(datasets): 206 if config["verbose"]: 207 print(f"Splitting dataset {index+1} of {len(datasets)}") 208 splitDatasets.append(split_dataset(dataset, split)) 209 return splitDatasets 210 211 212def get_attack_data(config: Dict, 213 shadowModels: List[tm.Sequential], 214 shadowDatasets: List[Tuple[ds.Dataset, ds.Dataset]]) -> List[Tuple[ds.Dataset, ds.Dataset]]: 215 """ 216 This function predicts and then labels the provided datasets on their 217 respective shadow model, thus creating the labeled data needed for the 218 attack model. 219 220 It returns a list of tuples that contains (trainingData, testingData) per class. 221 """ 222 try: 223 print("Loading attack data.") 224 return load_attack_data(config) 225 except BaseException: 226 print("Didn't work, reconstructing it.") 227 attackDatasets = from_shadow_models(config, shadowModels, shadowDatasets) 228 balanceAttackData = config["attackDataset"]["balance"] 229 if balanceAttackData: 230 print("Balancing attack data to contain equal amounts in/out records.") 231 attackDatasets = balance_attack_data(attackDatasets) 232 print("Splitting attack data for training.") 233 attackDatasets = split_attack_data_for_training(attackDatasets, config) 234 print("Saving attack data to disk.") 235 save(config, attackDatasets) 236 return attackDatasets 237 238 239def from_shadow_models(config: Dict, shadowModels: 240 List[tm.Sequential], shadowDatasets: 241 List[Tuple[ds.Dataset, ds.Dataset]]) -> List[ds.Dataset]: 242 """ 243 Predicts the shadow data on the shadow models themselves and labels it with 244 "in" and "out", for the attack model to train on. 
245 """ 246 numModels: int = config["shadowModels"]["number"] 247 numClasses = config["targetModel"]["classes"] 248 attackDatasets = [] 249 250 for i in range(numModels): 251 252 model = shadowModels[i] 253 trainData, testData = shadowDatasets[i] 254 trainDataSize = trainData.cardinality().numpy() 255 testDataSize = testData.cardinality().numpy() 256 257 # Only relevant if split > 0.5 258 assert trainDataSize >= testDataSize 259 trainData = trainData.take(testDataSize) 260 trainDataSize = testDataSize 261 262 # Get predictions 263 trainPreds = model.predict(trainData.batch(100, drop_remainder=False)) 264 testPreds = model.predict(testData.batch(100, drop_remainder=False)) 265 266 # Construct "in"/"out" labels 267 trainLabels = np.tile(np.array([[1, 0]]), (trainDataSize, 1)) 268 testLabels = np.tile(np.array([[0, 1]]), (testDataSize, 1)) 269 270 # Combine them into 1 dataset 271 trainPredsLabels = tf.data.Dataset.from_tensor_slices((trainPreds, trainLabels)) 272 testPredsLabels = tf.data.Dataset.from_tensor_slices((testPreds, testLabels)) 273 274 # Add data records and ground truth class to the dataset 275 trainDataPredsLabels = tf.data.Dataset.zip((trainData, trainPredsLabels)) 276 testDataPredsLabels = tf.data.Dataset.zip((testData, testPredsLabels)) 277 278 # Combine train and test data 279 attackData = trainDataPredsLabels.concatenate(testDataPredsLabels) 280 281 for currentClass in range(numClasses): 282 283 def is_current_class(dataAndClass, predAndLabel): 284 (_, classLabel) = dataAndClass 285 return tf.math.equal(np.int64(currentClass), tf.math.argmax(classLabel)) 286 287 classAttackData = attackData.filter(is_current_class) 288 289 def restructure_data(dataAndClass, predAndLabel): 290 return predAndLabel 291 292 # Drop unused data record and class ground truth 293 classAttackDataFinal = classAttackData.map(restructure_data) 294 295 if i == 0: 296 # First shadow model -> Each class seen the first time 297 attackDatasets.append(classAttackDataFinal) 298 else: 299 # Not first shadow model. Concatenate with appropriate dataset 300 attackDatasets[currentClass] = attackDatasets[currentClass].concatenate(classAttackDataFinal) 301 302 return attackDatasets 303 304 305if __name__ == "__main__": 306 import argparse 307 import configuration as con 308 import shadow_models as sm 309 310 parser = argparse.ArgumentParser(description='Save one shadow dataset per model and train the models.') 311 parser.add_argument('--config', help='Relative path to config file.',) 312 config = con.from_cli_options(vars(parser.parse_args())) 313 set_seed(config["seed"]) 314 315 shadowModels, shadowDatasets = sm.load_shadow_models_and_datasets(config) 316 attackDatasets = get_attack_data(config, shadowModels, shadowDatasets)
`def set_seed(new_seed: int)`
Set the global seed that will be used for all functions that include randomness.
`def from_target_data(targetTrainData: Dataset, targetTestData: Dataset, targetModel: Sequential, label: int) -> Tuple[Dataset, Dataset]`
Use the training and test data to construct the attack dataset. Splitting is done according to this scheme:

```
         Training data           Testing data

       ┌───────┬───────┐       ┌───────┬───────┐
       │       │       │       │       │       │
       │   B   │   C   │       │   A   │   D   │
       └────┬──┴─────┬─┘       └───┬───┴────┬──┘
            │        │             │        │
            │    ┌───┼─────────────┘        │
          B │  A │   └──────────────────┐   │
            │    │                      │C  │D
            │    │                      │   │
            ▼    ▼                      ▼   ▼

       ┌───────────────┐       ┌───────────────┐
       │               │       │               │
       │      A+B      │       │      C+D      │
       └───────────────┘       └───────────────┘
       Attack train data       Attack test data
```
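As a worked example, assume 1000 in-class records in both the target's training and test data. A minimal sketch of the resulting sizes:

```python
# Hypothetical: 1000 usable records per set for this label
dataSizePerSet = 1000
attackTrainSize = int(0.8 * dataSizePerSet)       # 800 -> 400 from A ("out"), 400 from B ("in")
attackTestSize = int((1 - 0.8) * dataSizePerSet)  # 199, not 200: (1 - 0.8) is
                                                  # slightly below 0.2 in floats
```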
`def load_attack_data(config: Dict) -> List[Tuple[ds.Dataset, ds.Dataset]]`

Load previously saved attack datasets from disk, one (trainData, testData) tuple per class.
`def save(config: Dict, datasets: List[Tuple[ds.Dataset, ds.Dataset]])`

Save one attack dataset pair per class to disk, using `_get_attack_data_name` for the file names.
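For reference, `_get_attack_data_name` (in the source above) derives the on-disk names from the config. A sketch with hypothetical values:

```python
# Hypothetical: tm.get_model_name(config) == "target", method "noisy",
# split 0.8, 5 shadow models, class index i == 0 of 10 classes, balanced
name = "target" + "_" + "noisy" + f"_split_{0.8}_with_{5}_models_{0+1}_of_{10}_balanced"
print(name + "_train")
# target_noisy_split_0.8_with_5_models_1_of_10_balanced_train
```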
`def balance_attack_data(datasets: List[ds.Dataset]) -> List[ds.Dataset]`
Make sure that each input dataset contains an equal number of "in" and "out" data points; surplus records are dropped at random.
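A minimal toy check of the balancing behaviour, assuming the module is imported as `attack_data` (the private helper `_balance_attack_data` does the actual work):

```python
import numpy as np
import tensorflow as tf
import attack_data

# Three "in" records ([1, 0]) and one "out" record ([0, 1])
labels = np.array([[1, 0]] * 3 + [[0, 1]], dtype=np.int64)
features = np.arange(4, dtype=np.float32)
toy = tf.data.Dataset.from_tensor_slices((features, labels))

balanced = attack_data._balance_attack_data(toy)
print(len(list(balanced)))  # 2: two surplus "in" records were dropped
```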
`def split_dataset(dataset: ds.Dataset, split: float)`

Split a single dataset into a training and a testing part according to `split`.
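For example, with a 10-element dataset and `split=0.8`:

```python
import tensorflow as tf
import attack_data

toy = tf.data.Dataset.range(10)
train, test = attack_data.split_dataset(toy, 0.8)
print(len(list(train)), len(list(test)))
# 8 1 -- floor((1 - 0.8) * 10) loses one element to float rounding,
# which is why the function asserts "<=" rather than "=="
```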
`def split_attack_data_for_training(datasets: List[ds.Dataset], config: Dict)`

Apply `split_dataset` to every per-class attack dataset, using the split factor from `config["attackDataset"]["split"]`.
`def get_attack_data(config: Dict, shadowModels: List[tm.Sequential], shadowDatasets: List[Tuple[ds.Dataset, ds.Dataset]]) -> List[Tuple[ds.Dataset, ds.Dataset]]`
Predicts the provided datasets on their respective shadow models and labels the results, creating the labeled data needed to train the attack models. Previously saved attack data is loaded from disk when available; otherwise it is reconstructed, optionally balanced, split, and saved.

Returns a list of (trainingData, testingData) tuples, one per class.
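Typical driver usage mirrors the `__main__` block at the bottom of the module (the config path is hypothetical):

```python
import configuration as con
import shadow_models as sm
import attack_data

config = con.from_cli_options({"config": "config.yml"})  # hypothetical path
attack_data.set_seed(config["seed"])

shadowModels, shadowDatasets = sm.load_shadow_models_and_datasets(config)
attackDatasets = attack_data.get_attack_data(config, shadowModels, shadowDatasets)
```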
`def from_shadow_models(config: Dict, shadowModels: List[tm.Sequential], shadowDatasets: List[Tuple[ds.Dataset, ds.Dataset]]) -> List[ds.Dataset]`
Predicts the shadow data on the shadow models themselves and labels the predictions "in" or "out", producing the data the attack models train on.
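A sketch of what one record in a resulting per-class dataset looks like, assuming a hypothetical 10-class setup: the feature is a shadow model's prediction vector, the label is the one-hot membership encoding from the top of this page.

```python
# attackDatasets as returned by from_shadow_models
for prediction, membership in attackDatasets[0].take(1):
    print(prediction.shape)    # (10,) -- softmax output of a shadow model
    print(membership.numpy())  # [1 0] ("in") or [0 1] ("out")
```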