shadow_data
This module generates shadow data for training the shadow models. Each generation method assumes a different degree of knowledge about the original dataset, corresponding to different attacker capabilities.
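Which generation method is used, and with which hyperparameters, is driven by the experiment configuration. The sketch below shows the configuration keys as they are read by `get_shadow_data` and `get_shadow_data_name` further down; the concrete values are placeholders for illustration, not recommended defaults.

```python
# Illustrative configuration excerpt; only the keys mirror what
# get_shadow_data() and get_shadow_data_name() actually read, the values are placeholders.
config = {
    "seed": 1234,
    "verbose": True,
    "targetDataset": {"name": "kaggle"},
    "shadowModels": {"number": 8, "split": 2},
    "shadowDataset": {
        "method": "noisy",  # one of: "noisy", "hill_climbing", "original", "statistic"
        "size": 10000,      # records per shadow model
        "noisy": {"hyperparameters": {"fraction": 0.1}},
    },
}
```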
1""" 2.. include:: ../docs/shadow_data.md 3""" 4 5# TODO: Everything (?) in here is Kaggle specific 6 7from os import environ 8from typing import Tuple 9from numpy.typing import NDArray 10from typing import Optional, Dict, List 11import datasets as ds 12import random 13import numpy as np 14 15# Tensorflow C++ backend logging verbosity 16environ["TF_CPP_MIN_LOG_LEVEL"] = "2" # NOQA 17 18import tensorflow as tf 19from tensorflow.keras.utils import to_categorical # pyright: ignore 20from tensorflow.python.framework import random_seed 21from tensorflow.data import Dataset # pyright: ignore 22from tensorflow.keras import Sequential # pyright: ignore 23 24global_seed: int = 1234 25globalRandomGen = np.random.default_rng(global_seed) 26 27 28def set_seed(new_seed: int): 29 """ 30 Set the global seed that will be used for all functions that include 31 randomness. 32 """ 33 global global_seed 34 global_seed = new_seed 35 np.random.seed(global_seed) 36 random.seed(global_seed) 37 random_seed.set_seed(global_seed) 38 39 40def generate_shadow_data_sampling(original_data: Dataset) -> Dataset: 41 """ 42 Generate synthetic data for the shadow models by randomly sampling data 43 points from the original data set. 44 """ 45 sample_dataset: Dataset = tf.data.Dataset.sample_from_datasets( 46 [original_data], seed=global_seed, stop_on_empty_dataset=True) 47 return sample_dataset 48 49 50def split_shadow_data(config: Dict, shadowData: ds.Dataset) -> List[ds.Dataset]: 51 print("Splitting shadow data into subsets.") 52 numSubsets = config["shadowModels"]["number"] 53 return ds.split_dataset(shadowData, numSubsets) 54 55 56def load_shadow_data(config: Dict): 57 dataName = get_shadow_data_name(config) 58 return ds.load_shadow(dataName, verbose=config["verbose"]) 59 60 61def get_shadow_data_name(config: Dict): 62 shadowConfig = config["shadowDataset"] 63 method = shadowConfig["method"] 64 targetDataName = config["targetDataset"]["name"] 65 dataSize = shadowConfig["size"] 66 hyperpars = shadowConfig[method]["hyperparameters"] 67 if method == "noisy": 68 dataName = f'{method}_fraction_{hyperpars["fraction"]}_size_{dataSize}_target_{targetDataName}' 69 elif method == "hill_climbing": 70 dataName = \ 71 f'{method}_' + \ 72 f'{targetDataName}_' + \ 73 f'kmax_{hyperpars["k_max"]}_' + \ 74 f'kmin_{hyperpars["k_min"]}_' + \ 75 f'confmin_{hyperpars["conf_min"]}_' + \ 76 f'rejmax_{hyperpars["rej_max"]}_' + \ 77 f'itermax_{hyperpars["iter_max"]}_' + \ 78 f'size_{dataSize}' 79 elif method == "original": 80 dataName = \ 81 f'{method}_' + \ 82 f'{targetDataName}_' + \ 83 f'original_data_' + \ 84 f'size_{dataSize}' 85 elif method == "statistic": 86 dataName = \ 87 f'{method}_' + \ 88 f'{targetDataName}_' + \ 89 f'statistic_' + \ 90 f'size_{dataSize}' 91 else: 92 raise ValueError(f"{method} is not a valid shadow data method.") 93 return dataName 94 95 96def get_shadow_data(config: Dict) -> ds.Dataset: 97 verbose = config["verbose"] 98 shadowConfig = config["shadowDataset"] 99 method = shadowConfig["method"] 100 sizePerModel = shadowConfig["size"] 101 split = config["shadowModels"]["split"] 102 numModels = config["shadowModels"]["number"] 103 dataSize = int(np.ceil(sizePerModel * (numModels/split))) 104 hyperpars = shadowConfig[method]["hyperparameters"] 105 dataName = get_shadow_data_name(config) 106 107 try: 108 print("Loading shadow data from disk.") 109 shadowData = load_shadow_data(config) 110 except BaseException: 111 print("Loading failed, generating shadow data.") 112 113 targetDataset = get_target_model_rest_data(config) 
114 targetModel = tm.load_model(tm.get_model_name(config), verbose=config["verbose"]) 115 116 if method == "noisy": 117 shadowData = generate_shadow_data_noisy(targetDataset, dataSize, **hyperpars) 118 elif method == "hill_climbing": 119 shadowData = hill_climbing(targetModel, dataSize, **hyperpars) 120 elif method == "original": 121 shadowData = generate_shadow_data_original(targetDataset, dataSize) 122 elif method == "statistic": 123 shadowData = generate_shadow_data_statistic(config) 124 else: 125 raise ValueError(f"{method} is not a valid shadow data method.") 126 127 if verbose: 128 print(f"Saving shadow data {dataName} to disk.") 129 try: 130 ds.save_shadow(shadowData, dataName) 131 except BaseException: 132 print(f"Failed to save shadow data {dataName} to disk.") 133 ds.delete_shadow(dataName) 134 raise 135 136 return shadowData 137 138 139def _make_data_record_noisy(features, label, fraction): 140 # TODO: numFeatures is hardcoded 141 numFeatures = 600 142 k = int(numFeatures * fraction) 143 return _randomize_features(features, k=k).reshape(numFeatures), label 144 145 146def _make_dataset_noisy(original_data: Dataset, fraction: float) -> Dataset: 147 """ 148 Returns new dataset, where each element has a fraction of its features 149 flipped. 150 """ 151 return original_data.map( 152 lambda x, y: 153 tf.numpy_function(func=_make_data_record_noisy, inp=(x, y, fraction), Tout=[tf.int64, tf.int64]) 154 ) 155 156def generate_shadow_data_original(targetDataset, outputSize) -> Dataset: 157 158 inputSize = targetDataset.cardinality().numpy() 159 160 shadowData = ds.shuffle(targetDataset) 161 162 if inputSize >= outputSize: 163 return shadowData.take(outputSize) 164 165 166 numSets = int(np.floor(outputSize / inputSize)) 167 for _ in range(numSets - 1): 168 newSet = ds.shuffle(targetDataset) 169 shadowData = shadowData.concatenate(newSet) 170 171 # How many records to add after collecting numSets sets 172 offset = outputSize % inputSize 173 offsetSet = ds.shuffle(targetDataset).take(offset) 174 175 return shadowData.concatenate(offsetSet) 176 177def generate_shadow_data_noisy(original_data: Dataset, outputSize: int, fraction: float = 0.1) -> Dataset: 178 """ 179 Generate synthetic data for the shadow models by using a noisy version of 180 the original data. 181 Returns only the noisy data, no the oririnal data. 182 183 Arguments: 184 fraction: percentage of labels that will be flipped per data record to 185 make it "noisy" 186 """ 187 inputSize = original_data.cardinality().numpy() 188 # Since outputSize % inputSize not always 0, we have to fill the gap with a subset 189 # of the full input data. To avoid bias, shuffle the input data. 
190 noisySet = _make_dataset_noisy(ds.shuffle(original_data), fraction) 191 192 if inputSize >= outputSize: 193 return noisySet.take(outputSize) 194 195 numNoisyVersions = int(np.floor(outputSize / inputSize)) 196 # How many records to add after collecting numNoisyVersions sets 197 offset = outputSize % inputSize 198 199 for _ in range(numNoisyVersions - 1): 200 newNoisySet = _make_dataset_noisy(ds.shuffle(original_data), fraction) 201 noisySet = noisySet.concatenate(newNoisySet) 202 203 offsetSet = _make_dataset_noisy( 204 ds.shuffle(original_data), fraction).take(offset) 205 return noisySet.concatenate(offsetSet) 206 207def _get_filter_fn(label: int): 208 209 wantedLabel = np.int64(label) 210 def _filter_fn(_, y): return tf.math.equal(wantedLabel, tf.math.argmax(y)) 211 return _filter_fn 212 213def _compute_kaggle_marginals(config): 214 dataName = config["targetDataset"]["name"] 215 originalData = ds.load_dataset(dataName) 216 numFeatures = iter(originalData).get_next()[0].numpy().shape[0] 217 numLabels = iter(originalData).get_next()[1].numpy().shape[0] 218 marginalProbabilities = np.zeros((100,600)) 219 # For each class, count binary feature values to get marginal 220 for _class in range(numLabels): 221 if config["verbose"]: 222 print(f"Computing marginal probability for class {_class}/{numLabels}") 223 filteredData = originalData.filter(_get_filter_fn(_class)) 224 initialCount = np.array([0]*numFeatures) 225 countedFeatures = filteredData.reduce(initialCount, lambda oldCount, dataPoint: oldCount + dataPoint[0]).numpy() 226 sampleSize = filteredData.cardinality() 227 if sampleSize <= 0 and config["verbose"]: # tf uses constants < 0 to indicate unknown cardinality 228 sampleSize = len(list(filteredData.as_numpy_iterator())) 229 marginalProbability = countedFeatures / sampleSize 230 marginalProbabilities[_class] = marginalProbability 231 return marginalProbabilities 232 233def generate_shadow_data_statistic(config: Dict) -> Dataset: 234 """ 235 Generate synthetic data for the shadow models by using the marginal 236 distribution of features in the original dataset. 
237 """ 238 # TODO: Kaggle specific 239 size = config["shadowDataset"]["size"] 240 try: 241 marginalProbabilities = ds.load_numpy_array("kaggle_marginals.npy") 242 except: 243 marginalProbabilities = _compute_kaggle_marginals(config) 244 ds.save_numpy_array("kaggle_marginals.npy",marginalProbabilities) 245 246 # Generate new records 247 numClasses = marginalProbabilities.shape[0] 248 numFeatures = marginalProbabilities.shape[1] 249 250 features: NDArray = np.zeros((size,numFeatures)).astype(np.int32) 251 labels: NDArray = np.zeros((size,numClasses)).astype(np.int32) 252 253 recordsPerClass = int(size/numClasses) 254 255 for _class in range(numClasses): 256 if config["verbose"]: 257 print(f"Generating records for class {_class}") 258 index_start = _class * recordsPerClass 259 index_end = (_class + 1) * recordsPerClass 260 # for index in range(index_start, index_end): 261 # labels[index] = to_categorical(_class, num_classes = numClasses) 262 labels[index_start:index_end] = np.tile(to_categorical(_class, num_classes = 263 numClasses),recordsPerClass).reshape(recordsPerClass,numClasses) 264 gen = np.random.default_rng(seed=global_seed) 265 marginalProbability = marginalProbabilities[_class] 266 267 for feature in range(numFeatures): 268 probability = marginalProbability[feature] 269 # sample one feature for all records in this class at once 270 sampledFeature = gen.choice([0,1], p=[1-probability,probability],size = recordsPerClass) 271 features[index_start:index_end,feature] = sampledFeature 272 273 shadowData = Dataset.from_tensor_slices((features, labels)) 274 shadowData = shadowData.shuffle(size, seed=global_seed, reshuffle_each_iteration=False) 275 return shadowData 276 277 278 279 280def _generate_labels(classes: int, size: int) -> NDArray: 281 """ 282 Generate a numpy array of size `size`, where the values are integers between 283 0 and `classes` - 1, distributed as evenly as possible. 284 285 This array will be used to generate a synthetic array of features for each 286 array element. 
287 """ 288 289 records_per_class: int = int(size / classes) 290 extra_records: int = size % classes 291 292 labels: NDArray = np.zeros((size, 1)) 293 index: int = 0 294 295 for x in range(classes): 296 if x < extra_records: 297 records_for_this_class = records_per_class + 1 298 else: 299 records_for_this_class = records_per_class 300 for y in range(records_for_this_class): 301 labels[index + y, 0] = x 302 index = index + records_for_this_class 303 304 return labels 305 306 307def _randomize_features(data: NDArray, k: int, 308 numFeatures: int = 600) -> NDArray: 309 310 featuresToFlip = random.sample(range(numFeatures), k) 311 312 data = data.reshape((1, numFeatures)) 313 314 data[0][featuresToFlip] ^= 1 315 316 return data 317 318 319def _get_random_record(numFeatures: int, 320 randomGenerator=globalRandomGen) -> NDArray: 321 322 x = randomGenerator.integers(0, high=1, endpoint=True, size=numFeatures) 323 324 return x.reshape((1, numFeatures)) 325 326 327def _randomize_features_batched( 328 data: NDArray, k: int, batchSize: int, numFeatures: int = 600) -> NDArray: 329 330 outputdata = np.repeat(data.reshape((numFeatures, 1)), batchSize, axis=1).transpose() 331 332 import numpy.testing as tt 333 tt.assert_equal(outputdata[0], data.reshape(numFeatures)) 334 335 # Flip features of the first record 336 featuresToFlip = random.sample(range(numFeatures), k) 337 outputdata[0, featuresToFlip] ^= 1 338 339 # Flip all further records based on the previous one 340 for i in range(1,batchSize): 341 featuresToFlip = random.sample(range(numFeatures), k) 342 outputdata[i,:] = outputdata[i-1, :] 343 outputdata[i, featuresToFlip] ^= 1 344 345 return outputdata 346 347 348def _rebatch(x, k, batchSize, targetModel) -> Tuple[NDArray, NDArray, int]: 349 xs = _randomize_features_batched(x, k, batchSize) 350 ys = targetModel.predict(xs, batch_size=batchSize, verbose=0) 351 return xs, ys, 0 352 353 354def _generate_synthetic_record_batched(label: int, 355 targetModel: Sequential, 356 k_max: int = 200, 357 k_min: int = 5, 358 conf_min: float = 0.05, 359 rej_max: int = 20, 360 iter_max: int = 200, 361 batchSize: int = 1) -> Tuple[int, Optional[NDArray]]: 362 """ 363 Synthesize a data record, using Algorithm 1 from Shokri et als 364 paper "Membership Inference Attacks against Machine Learning Models". 365 """ 366 assert label < 100 and label >= 0 367 368 # Initalization 369 batchIndex: int = 0 370 numFeatures: int = 600 371 kWasUpdated = False 372 k = k_max 373 y_c_star = 0 374 j = 0 375 x = _get_random_record(numFeatures) 376 haveSampled = False 377 378 if batchSize == 1: 379 xs = x.reshape((1, 600)) 380 ys = targetModel.predict(xs, batch_size=batchSize, verbose=0) 381 else: 382 xs, ys, batchIndex = _rebatch(x, k, batchSize, targetModel) 383 384 # Controls number of iterations 385 for i in range(iter_max): 386 387 x = xs[batchIndex] 388 y = ys[batchIndex] 389 y_c = y[label] 390 predictedClass = np.argmax(y, axis=0) 391 392 if y_c >= y_c_star: 393 if y_c > conf_min and predictedClass == label: 394 # print(f"Now sampling! 
{batchIndex},{y_c},{y_c_star}") 395 haveSampled = True 396 if y_c > globalRandomGen.random(): 397 return i, x.reshape((1, numFeatures)) 398 399 xs, ys, batchIndex = _rebatch(x, k, batchSize, targetModel) 400 y_c_star = y_c 401 j = 0 402 continue 403 else: 404 j = j + 1 405 if j > rej_max and (k != k_min) and haveSampled: 406 k = int(max(k_min, np.ceil(k / 2))) 407 j = 0 408 kWasUpdated = True 409 410 batchExhausted = (batchIndex == batchSize - 1) 411 412 if batchExhausted or kWasUpdated: 413 xs, ys, batchIndex = _rebatch(x, k, batchSize, targetModel) 414 kWasUpdated = False 415 else: 416 batchIndex += 1 417 418 # if (i % 20) == 0: 419 # print(f"{i}/{iter_max}, y_c/y_c*: {y_c:.1%}/{y_c_star:.1%}, pred/class: {predictedClass}/{label}") 420 421 return iter_max, None 422 423 424def _generate_synthetic_record(label: int, 425 targetModel: Sequential, 426 k_max: int = 200, 427 k_min: int = 5, 428 conf_min: float = 0.05, 429 rej_max: int = 20, 430 iter_max: int = 200, 431 batchSize: int = 1) -> Tuple[int,Optional[NDArray]]: 432 """ 433 Synthesize a data record, using Algorithm 1 from Shokri et als 434 paper "Membership Inference Attacks against Machine Learning Models". 435 """ 436 assert label < 100 and label >= 0 437 438 # Initalization 439 numFeatures: int = 600 440 k = k_max 441 y_c_star = 0 442 j = 0 443 x = _get_random_record(numFeatures) 444 445 # Controls number of iterations 446 for i in range(iter_max): 447 448 y = targetModel.predict(x, batch_size=1, verbose=0) 449 y_c = y[0][label] 450 predictedClass = np.argmax(y, axis=1)[0] 451 452 if y_c >= y_c_star: 453 if y_c > conf_min and predictedClass == label: 454 # print("Now sampling!") 455 if y_c > globalRandomGen.random(): 456 return i,x 457 458 y_c_star = y_c 459 j = 0 460 else: 461 j = j + 1 462 if j > rej_max and (k != k_min): 463 k = int(max(k_min, np.ceil(k / 2))) 464 j = 0 465 466 x = _randomize_features(x, k) # pyright: ignore 467 468 # if (i % 20) == 0: 469 # print( 470 # f"{i}/{iter_max}, y_c/y_c*: {y_c:.1%}/{y_c_star:.1%}, pred/class: {predictedClass}/{label}") 471 472 return iter_max,None 473 474 475def hill_climbing(targetModel: Sequential, numRecords: int, 476 **hyperpars) -> Dataset: 477 """ 478 Generate synthetic data for the shadow models by querying the target model 479 for randomly sampled records, in order to find those that are classified 480 with high confidence. 481 482 `numRecords`: size of generated dataset 483 `hyperpars` has the following keys (taken from the paper: 484 k_max,k_min,rej_max,conf_min,iter_max) 485 """ 486 487 # Generate an array of labels, determining which class to synthesize for 488 # TODO: initializing and then changing `features` array might not be most 489 # efficient solution 490 491 numClasses: int = 100 492 labels: NDArray = _generate_labels(numClasses, numRecords) 493 494 numFeatures: int = 600 495 features: NDArray = np.zeros((numRecords, numFeatures)) 496 overallNumQueries = [] 497 498 for index, label in enumerate(labels): 499 label = int(label[0]) 500 queries, new_record = _generate_synthetic_record(label, targetModel, **hyperpars) 501 overallNumQueries.append(queries) 502 while new_record is None: 503 queries, new_record = _generate_synthetic_record(label, targetModel, **hyperpars) 504 overallNumQueries.append(queries) 505 print(f"Generating synthetic records: {index}/{numRecords}, {index/numRecords*100:.2f}% done. 
On average {sum(overallNumQueries)/(index+1)} +- {np.std(overallNumQueries)} queries.") 506 features[index] = new_record.reshape((1, numFeatures)) 507 508 features = features.reshape((numRecords, numFeatures)) 509 labels = labels.reshape((numRecords, 1)) 510 return Dataset.from_tensor_slices((features, labels)) 511 512def get_target_model_rest_data(config:Dict) -> Dataset: 513 modelName = tm.get_model_name(config) 514 restDataName = modelName + "_rest_data" 515 return ds.load_target(restDataName) 516 517if __name__ == "__main__": 518 import argparse 519 import configuration as con 520 import datasets as ds 521 import target_models as tm 522 523 parser = argparse.ArgumentParser(description='Generate all the necessary shadow data and save it to disk.') 524 parser.add_argument('--config', help='Relative path to config file.',) 525 config = con.from_cli_options(vars(parser.parse_args())) 526 set_seed(config["seed"]) 527 528 shadowData = get_shadow_data(config)
def set_seed(new_seed: int):
    """
    Set the global seed that will be used for all functions that include
    randomness.
    """
    global global_seed
    global_seed = new_seed
    np.random.seed(global_seed)
    random.seed(global_seed)
    random_seed.set_seed(global_seed)
Set the global seed that will be used for all functions that include randomness.
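A minimal usage sketch, assuming the module is importable as `shadow_data` (the seed value is arbitrary):

```python
import shadow_data as sd

sd.set_seed(42)  # seeds numpy, random and TensorFlow before any generation is run
```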
def generate_shadow_data_sampling(original_data: Dataset) -> Dataset:
    """
    Generate synthetic data for the shadow models by randomly sampling data
    points from the original data set.
    """
    sample_dataset: Dataset = tf.data.Dataset.sample_from_datasets(
        [original_data], seed=global_seed, stop_on_empty_dataset=True)
    return sample_dataset
Generate synthetic data for the shadow models by randomly sampling data points from the original data set.
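A minimal usage sketch, assuming `original_data` is an already loaded `tf.data.Dataset` of (features, label) pairs:

```python
# Hypothetical usage; original_data must be a tf.data.Dataset of (features, label) pairs.
shadow = generate_shadow_data_sampling(original_data)
for features, label in shadow.take(3):
    print(features.shape, label.shape)
```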
def get_shadow_data_name(config: Dict):
    shadowConfig = config["shadowDataset"]
    method = shadowConfig["method"]
    targetDataName = config["targetDataset"]["name"]
    dataSize = shadowConfig["size"]
    hyperpars = shadowConfig[method]["hyperparameters"]
    if method == "noisy":
        dataName = f'{method}_fraction_{hyperpars["fraction"]}_size_{dataSize}_target_{targetDataName}'
    elif method == "hill_climbing":
        dataName = \
            f'{method}_' + \
            f'{targetDataName}_' + \
            f'kmax_{hyperpars["k_max"]}_' + \
            f'kmin_{hyperpars["k_min"]}_' + \
            f'confmin_{hyperpars["conf_min"]}_' + \
            f'rejmax_{hyperpars["rej_max"]}_' + \
            f'itermax_{hyperpars["iter_max"]}_' + \
            f'size_{dataSize}'
    elif method == "original":
        dataName = \
            f'{method}_' + \
            f'{targetDataName}_' + \
            f'original_data_' + \
            f'size_{dataSize}'
    elif method == "statistic":
        dataName = \
            f'{method}_' + \
            f'{targetDataName}_' + \
            f'statistic_' + \
            f'size_{dataSize}'
    else:
        raise ValueError(f"{method} is not a valid shadow data method.")
    return dataName
def get_shadow_data(config: Dict) -> ds.Dataset:
    verbose = config["verbose"]
    shadowConfig = config["shadowDataset"]
    method = shadowConfig["method"]
    sizePerModel = shadowConfig["size"]
    split = config["shadowModels"]["split"]
    numModels = config["shadowModels"]["number"]
    dataSize = int(np.ceil(sizePerModel * (numModels/split)))
    hyperpars = shadowConfig[method]["hyperparameters"]
    dataName = get_shadow_data_name(config)

    try:
        print("Loading shadow data from disk.")
        shadowData = load_shadow_data(config)
    except BaseException:
        print("Loading failed, generating shadow data.")

        targetDataset = get_target_model_rest_data(config)
        targetModel = tm.load_model(tm.get_model_name(config), verbose=config["verbose"])

        if method == "noisy":
            shadowData = generate_shadow_data_noisy(targetDataset, dataSize, **hyperpars)
        elif method == "hill_climbing":
            shadowData = hill_climbing(targetModel, dataSize, **hyperpars)
        elif method == "original":
            shadowData = generate_shadow_data_original(targetDataset, dataSize)
        elif method == "statistic":
            shadowData = generate_shadow_data_statistic(config)
        else:
            raise ValueError(f"{method} is not a valid shadow data method.")

        if verbose:
            print(f"Saving shadow data {dataName} to disk.")
        try:
            ds.save_shadow(shadowData, dataName)
        except BaseException:
            print(f"Failed to save shadow data {dataName} to disk.")
            ds.delete_shadow(dataName)
            raise

    return shadowData
def generate_shadow_data_original(targetDataset, outputSize) -> Dataset:

    inputSize = targetDataset.cardinality().numpy()

    shadowData = ds.shuffle(targetDataset)

    if inputSize >= outputSize:
        return shadowData.take(outputSize)

    numSets = int(np.floor(outputSize / inputSize))
    for _ in range(numSets - 1):
        newSet = ds.shuffle(targetDataset)
        shadowData = shadowData.concatenate(newSet)

    # How many records to add after collecting numSets sets
    offset = outputSize % inputSize
    offsetSet = ds.shuffle(targetDataset).take(offset)

    return shadowData.concatenate(offsetSet)
def generate_shadow_data_noisy(original_data: Dataset, outputSize: int, fraction: float = 0.1) -> Dataset:
    """
    Generate synthetic data for the shadow models by using a noisy version of
    the original data.
    Returns only the noisy data, not the original data.

    Arguments:
        fraction: fraction of features that will be flipped per data record to
        make it "noisy"
    """
    inputSize = original_data.cardinality().numpy()
    # Since outputSize % inputSize not always 0, we have to fill the gap with a subset
    # of the full input data. To avoid bias, shuffle the input data.
    noisySet = _make_dataset_noisy(ds.shuffle(original_data), fraction)

    if inputSize >= outputSize:
        return noisySet.take(outputSize)

    numNoisyVersions = int(np.floor(outputSize / inputSize))
    # How many records to add after collecting numNoisyVersions sets
    offset = outputSize % inputSize

    for _ in range(numNoisyVersions - 1):
        newNoisySet = _make_dataset_noisy(ds.shuffle(original_data), fraction)
        noisySet = noisySet.concatenate(newNoisySet)

    offsetSet = _make_dataset_noisy(
        ds.shuffle(original_data), fraction).take(offset)
    return noisySet.concatenate(offsetSet)
Generate synthetic data for the shadow models by using a noisy version of the original data. Returns only the noisy data, not the original data.
Arguments: fraction: fraction of features that will be flipped per data record to make it "noisy"
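Per record, the noise amounts to flipping `k = int(fraction * numFeatures)` randomly chosen binary features (60 of the 600 features for the default `fraction=0.1`). A self-contained sketch of that per-record step, independent of the tf.data pipeline used in the source above; the function name is hypothetical:

```python
import numpy as np

def flip_random_features(record: np.ndarray, fraction: float = 0.1) -> np.ndarray:
    """Illustrative stand-in for the per-record noise step; record is a 0/1 integer vector."""
    num_features = record.shape[0]            # 600 in the Kaggle setting
    k = int(num_features * fraction)          # number of features to flip
    idx = np.random.default_rng().choice(num_features, size=k, replace=False)
    noisy = record.copy()
    noisy[idx] ^= 1                           # flip the selected binary features
    return noisy
```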
def generate_shadow_data_statistic(config: Dict) -> Dataset:
    """
    Generate synthetic data for the shadow models by using the marginal
    distribution of features in the original dataset.
    """
    # TODO: Kaggle specific
    size = config["shadowDataset"]["size"]
    try:
        marginalProbabilities = ds.load_numpy_array("kaggle_marginals.npy")
    except:
        marginalProbabilities = _compute_kaggle_marginals(config)
        ds.save_numpy_array("kaggle_marginals.npy", marginalProbabilities)

    # Generate new records
    numClasses = marginalProbabilities.shape[0]
    numFeatures = marginalProbabilities.shape[1]

    features: NDArray = np.zeros((size, numFeatures)).astype(np.int32)
    labels: NDArray = np.zeros((size, numClasses)).astype(np.int32)

    recordsPerClass = int(size/numClasses)

    for _class in range(numClasses):
        if config["verbose"]:
            print(f"Generating records for class {_class}")
        index_start = _class * recordsPerClass
        index_end = (_class + 1) * recordsPerClass
        # for index in range(index_start, index_end):
        #     labels[index] = to_categorical(_class, num_classes = numClasses)
        labels[index_start:index_end] = np.tile(to_categorical(_class, num_classes=numClasses),
                                                recordsPerClass).reshape(recordsPerClass, numClasses)
        gen = np.random.default_rng(seed=global_seed)
        marginalProbability = marginalProbabilities[_class]

        for feature in range(numFeatures):
            probability = marginalProbability[feature]
            # sample one feature for all records in this class at once
            sampledFeature = gen.choice([0, 1], p=[1-probability, probability], size=recordsPerClass)
            features[index_start:index_end, feature] = sampledFeature

    shadowData = Dataset.from_tensor_slices((features, labels))
    shadowData = shadowData.shuffle(size, seed=global_seed, reshuffle_each_iteration=False)
    return shadowData
Generate synthetic data for the shadow models by using the marginal distribution of features in the original dataset.
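In essence, every synthetic record of class c is drawn feature-by-feature from independent Bernoulli distributions whose parameters are the class-conditional marginals estimated from the original data. A compact sketch of that sampling idea (the actual implementation above additionally caches the marginals on disk as `kaggle_marginals.npy`; the function below is illustrative only):

```python
import numpy as np

def sample_from_marginals(marginals: np.ndarray, records_per_class: int, seed: int = 1234):
    """marginals[c, f] = P(feature f == 1 | class c), shape (numClasses, numFeatures)."""
    rng = np.random.default_rng(seed)
    num_classes, num_features = marginals.shape
    # One independent Bernoulli draw per class, record and feature
    features = (rng.random((num_classes, records_per_class, num_features))
                < marginals[:, None, :]).astype(np.int32)
    # One-hot labels, class blocks in the same order as the feature blocks
    labels = np.repeat(np.eye(num_classes, dtype=np.int32), records_per_class, axis=0)
    return features.reshape(-1, num_features), labels
```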
def hill_climbing(targetModel: Sequential, numRecords: int,
                  **hyperpars) -> Dataset:
    """
    Generate synthetic data for the shadow models by querying the target model
    for randomly sampled records, in order to find those that are classified
    with high confidence.

    `numRecords`: size of generated dataset
    `hyperpars` has the following keys (taken from the paper:
    k_max, k_min, rej_max, conf_min, iter_max)
    """

    # Generate an array of labels, determining which class to synthesize for
    # TODO: initializing and then changing `features` array might not be most
    #       efficient solution

    numClasses: int = 100
    labels: NDArray = _generate_labels(numClasses, numRecords)

    numFeatures: int = 600
    features: NDArray = np.zeros((numRecords, numFeatures))
    overallNumQueries = []

    for index, label in enumerate(labels):
        label = int(label[0])
        queries, new_record = _generate_synthetic_record(label, targetModel, **hyperpars)
        overallNumQueries.append(queries)
        while new_record is None:
            queries, new_record = _generate_synthetic_record(label, targetModel, **hyperpars)
            overallNumQueries.append(queries)
        print(f"Generating synthetic records: {index}/{numRecords}, {index/numRecords*100:.2f}% done. On average {sum(overallNumQueries)/(index+1)} +- {np.std(overallNumQueries)} queries.")
        features[index] = new_record.reshape((1, numFeatures))

    features = features.reshape((numRecords, numFeatures))
    labels = labels.reshape((numRecords, 1))
    return Dataset.from_tensor_slices((features, labels))
Generate synthetic data for the shadow models by querying the target model for randomly sampled records, in order to find those that are classified with high confidence.
`numRecords`: size of generated dataset.

`hyperpars` has the following keys (taken from the paper: k_max, k_min, rej_max, conf_min, iter_max).
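A minimal invocation sketch; `target_model` is assumed to be an already trained Keras model of the target, and the hyperparameter values simply restate the defaults of `_generate_synthetic_record`, not tuned recommendations:

```python
# Hypothetical call; target_model must be a trained tf.keras model.
hyperparameters = {
    "k_max": 200,      # features flipped per proposal at the start
    "k_min": 5,        # lower bound on k after repeated rejections
    "conf_min": 0.05,  # minimum confidence before a candidate may be accepted
    "rej_max": 20,     # rejections tolerated before k is halved
    "iter_max": 200,   # per-record query budget
}
shadow_dataset = hill_climbing(target_model, numRecords=1000, **hyperparameters)
```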