shadow_data
This module generates shadow data for training the shadow models. Each generation method assumes a different degree of knowledge about the original dataset, corresponding to different attacker capabilities.
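Which generation method is used, and with which hyperparameters, is driven by the experiment configuration. The sketch below shows the configuration keys as they are read by `get_shadow_data` and `get_shadow_data_name` further down; the concrete values are placeholders for illustration, not recommended defaults.

```python
# Illustrative configuration excerpt; only the keys mirror what
# get_shadow_data() and get_shadow_data_name() actually read, the values are placeholders.
config = {
    "seed": 1234,
    "verbose": True,
    "targetDataset": {"name": "kaggle"},
    "shadowModels": {"number": 8, "split": 2},
    "shadowDataset": {
        "method": "noisy",  # one of: "noisy", "hill_climbing", "original", "statistic"
        "size": 10000,      # records per shadow model
        "noisy": {"hyperparameters": {"fraction": 0.1}},
    },
}
```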
1""" 2.. include:: ../docs/shadow_data.md 3""" 4 5# TODO: Everything (?) in here is Kaggle specific 6 7from os import environ 8from typing import Tuple 9from numpy.typing import NDArray 10from typing import Optional, Dict, List 11import datasets as ds 12import random 13import numpy as np 14 15# Tensorflow C++ backend logging verbosity 16environ["TF_CPP_MIN_LOG_LEVEL"] = "2" # NOQA 17 18import tensorflow as tf 19from tensorflow.keras.utils import to_categorical # pyright: ignore 20from tensorflow.python.framework import random_seed 21from tensorflow.data import Dataset # pyright: ignore 22from tensorflow.keras import Sequential # pyright: ignore 23 24global_seed: int = 1234 25globalRandomGen = np.random.default_rng(global_seed) 26 27 28def set_seed(new_seed: int): 29 """ 30 Set the global seed that will be used for all functions that include 31 randomness. 32 """ 33 global global_seed 34 global_seed = new_seed 35 np.random.seed(global_seed) 36 random.seed(global_seed) 37 random_seed.set_seed(global_seed) 38 39 40def generate_shadow_data_sampling(original_data: Dataset) -> Dataset: 41 """ 42 Generate synthetic data for the shadow models by randomly sampling data 43 points from the original data set. 44 """ 45 sample_dataset: Dataset = tf.data.Dataset.sample_from_datasets( 46 [original_data], seed=global_seed, stop_on_empty_dataset=True) 47 return sample_dataset 48 49 50def split_shadow_data(config: Dict, shadowData: ds.Dataset) -> List[ds.Dataset]: 51 print("Splitting shadow data into subsets.") 52 numSubsets = config["shadowModels"]["number"] 53 return ds.split_dataset(shadowData, numSubsets) 54 55 56def load_shadow_data(config: Dict): 57 dataName = get_shadow_data_name(config) 58 return ds.load_shadow(dataName, verbose=config["verbose"]) 59 60 61def get_shadow_data_name(config: Dict): 62 shadowConfig = config["shadowDataset"] 63 method = shadowConfig["method"] 64 targetDataName = config["targetDataset"]["name"] 65 dataSize = shadowConfig["size"] 66 hyperpars = shadowConfig[method]["hyperparameters"] 67 if method == "noisy": 68 dataName = f'{method}_fraction_{hyperpars["fraction"]}_size_{dataSize}_target_{targetDataName}' 69 elif method == "hill_climbing": 70 dataName = \ 71 f'{method}_' + \ 72 f'{targetDataName}_' + \ 73 f'kmax_{hyperpars["k_max"]}_' + \ 74 f'kmin_{hyperpars["k_min"]}_' + \ 75 f'confmin_{hyperpars["conf_min"]}_' + \ 76 f'rejmax_{hyperpars["rej_max"]}_' + \ 77 f'itermax_{hyperpars["iter_max"]}_' + \ 78 f'size_{dataSize}' 79 elif method == "original": 80 dataName = \ 81 f'{method}_' + \ 82 f'{targetDataName}_' + \ 83 f'original_data_' + \ 84 f'size_{dataSize}' 85 elif method == "statistic": 86 dataName = \ 87 f'{method}_' + \ 88 f'{targetDataName}_' + \ 89 f'statistic_' + \ 90 f'size_{dataSize}' 91 else: 92 raise ValueError(f"{method} is not a valid shadow data method.") 93 return dataName 94 95 96def get_shadow_data(config: Dict) -> ds.Dataset: 97 verbose = config["verbose"] 98 shadowConfig = config["shadowDataset"] 99 method = shadowConfig["method"] 100 sizePerModel = shadowConfig["size"] 101 split = config["shadowModels"]["split"] 102 numModels = config["shadowModels"]["number"] 103 dataSize = int(np.ceil(sizePerModel * (numModels/split))) 104 hyperpars = shadowConfig[method]["hyperparameters"] 105 dataName = get_shadow_data_name(config) 106 107 try: 108 print("Loading shadow data from disk.") 109 shadowData = load_shadow_data(config) 110 except BaseException: 111 print("Loading failed, generating shadow data.") 112 113 targetDataset = get_target_model_rest_data(config) 
114 targetModel = tm.load_model(tm.get_model_name(config), verbose=config["verbose"]) 115 116 if method == "noisy": 117 shadowData = generate_shadow_data_noisy(targetDataset, dataSize, **hyperpars) 118 elif method == "hill_climbing": 119 shadowData = hill_climbing(targetModel, dataSize, **hyperpars) 120 elif method == "original": 121 shadowData = generate_shadow_data_original(targetDataset, dataSize) 122 elif method == "statistic": 123 shadowData = generate_shadow_data_statistic(config) 124 else: 125 raise ValueError(f"{method} is not a valid shadow data method.") 126 127 if verbose: 128 print(f"Saving shadow data {dataName} to disk.") 129 try: 130 ds.save_shadow(shadowData, dataName) 131 except BaseException: 132 print(f"Failed to save shadow data {dataName} to disk.") 133 ds.delete_shadow(dataName) 134 raise 135 136 return shadowData 137 138 139def _make_data_record_noisy(features, label, fraction): 140 # TODO: numFeatures is hardcoded 141 numFeatures = 600 142 k = int(numFeatures * fraction) 143 return _randomize_features(features, k=k).reshape(numFeatures), label 144 145 146def _make_dataset_noisy(original_data: Dataset, fraction: float) -> Dataset: 147 """ 148 Returns new dataset, where each element has a fraction of its features 149 flipped. 150 """ 151 return original_data.map( 152 lambda x, y: 153 tf.numpy_function(func=_make_data_record_noisy, inp=(x, y, fraction), Tout=[tf.int64, tf.int64]) 154 ) 155 156def generate_shadow_data_original(targetDataset, outputSize) -> Dataset: 157 158 inputSize = targetDataset.cardinality().numpy() 159 160 shadowData = ds.shuffle(targetDataset) 161 162 if inputSize >= outputSize: 163 return shadowData.take(outputSize) 164 165 166 numSets = int(np.floor(outputSize / inputSize)) 167 for _ in range(numSets - 1): 168 newSet = ds.shuffle(targetDataset) 169 shadowData = shadowData.concatenate(newSet) 170 171 # How many records to add after collecting numSets sets 172 offset = outputSize % inputSize 173 offsetSet = ds.shuffle(targetDataset).take(offset) 174 175 return shadowData.concatenate(offsetSet) 176 177def generate_shadow_data_noisy(original_data: Dataset, outputSize: int, fraction: float = 0.1) -> Dataset: 178 """ 179 Generate synthetic data for the shadow models by using a noisy version of 180 the original data. 181 Returns only the noisy data, no the oririnal data. 182 183 Arguments: 184 fraction: percentage of labels that will be flipped per data record to 185 make it "noisy" 186 """ 187 inputSize = original_data.cardinality().numpy() 188 # Since outputSize % inputSize not always 0, we have to fill the gap with a subset 189 # of the full input data. To avoid bias, shuffle the input data. 
190 noisySet = _make_dataset_noisy(ds.shuffle(original_data), fraction) 191 192 if inputSize >= outputSize: 193 return noisySet.take(outputSize) 194 195 numNoisyVersions = int(np.floor(outputSize / inputSize)) 196 # How many records to add after collecting numNoisyVersions sets 197 offset = outputSize % inputSize 198 199 for _ in range(numNoisyVersions - 1): 200 newNoisySet = _make_dataset_noisy(ds.shuffle(original_data), fraction) 201 noisySet = noisySet.concatenate(newNoisySet) 202 203 offsetSet = _make_dataset_noisy( 204 ds.shuffle(original_data), fraction).take(offset) 205 return noisySet.concatenate(offsetSet) 206 207def _get_filter_fn(label: int): 208 209 wantedLabel = np.int64(label) 210 def _filter_fn(_, y): return tf.math.equal(wantedLabel, tf.math.argmax(y)) 211 return _filter_fn 212 213def _compute_kaggle_marginals(config): 214 dataName = config["targetDataset"]["name"] 215 originalData = ds.load_dataset(dataName) 216 numFeatures = iter(originalData).get_next()[0].numpy().shape[0] 217 numLabels = iter(originalData).get_next()[1].numpy().shape[0] 218 marginalProbabilities = np.zeros((100,600)) 219 # For each class, count binary feature values to get marginal 220 for _class in range(numLabels): 221 if config["verbose"]: 222 print(f"Computing marginal probability for class {_class}/{numLabels}") 223 filteredData = originalData.filter(_get_filter_fn(_class)) 224 initialCount = np.array([0]*numFeatures) 225 countedFeatures = filteredData.reduce(initialCount, lambda oldCount, dataPoint: oldCount + dataPoint[0]).numpy() 226 sampleSize = filteredData.cardinality() 227 if sampleSize <= 0 and config["verbose"]: # tf uses constants < 0 to indicate unknown cardinality 228 sampleSize = len(list(filteredData.as_numpy_iterator())) 229 marginalProbability = countedFeatures / sampleSize 230 marginalProbabilities[_class] = marginalProbability 231 return marginalProbabilities 232 233def generate_shadow_data_statistic(config: Dict) -> Dataset: 234 """ 235 Generate synthetic data for the shadow models by using the marginal 236 distribution of features in the original dataset. 
237 """ 238 # TODO: Kaggle specific 239 size = config["shadowDataset"]["size"] 240 try: 241 marginalProbabilities = ds.load_numpy_array("kaggle_marginals.npy") 242 except: 243 marginalProbabilities = _compute_kaggle_marginals(config) 244 ds.save_numpy_array("kaggle_marginals.npy",marginalProbabilities) 245 246 # Generate new records 247 numClasses = marginalProbabilities.shape[0] 248 numFeatures = marginalProbabilities.shape[1] 249 250 features: NDArray = np.zeros((size,numFeatures)).astype(np.int32) 251 labels: NDArray = np.zeros((size,numClasses)).astype(np.int32) 252 253 recordsPerClass = int(size/numClasses) 254 255 for _class in range(numClasses): 256 if config["verbose"]: 257 print(f"Generating records for class {_class}") 258 index_start = _class * recordsPerClass 259 index_end = (_class + 1) * recordsPerClass 260 # for index in range(index_start, index_end): 261 # labels[index] = to_categorical(_class, num_classes = numClasses) 262 labels[index_start:index_end] = np.tile(to_categorical(_class, num_classes = 263 numClasses),recordsPerClass).reshape(recordsPerClass,numClasses) 264 gen = np.random.default_rng(seed=global_seed) 265 marginalProbability = marginalProbabilities[_class] 266 267 for feature in range(numFeatures): 268 probability = marginalProbability[feature] 269 # sample one feature for all records in this class at once 270 sampledFeature = gen.choice([0,1], p=[1-probability,probability],size = recordsPerClass) 271 features[index_start:index_end,feature] = sampledFeature 272 273 shadowData = Dataset.from_tensor_slices((features, labels)) 274 shadowData = shadowData.shuffle(size, seed=global_seed, reshuffle_each_iteration=False) 275 return shadowData 276 277 278 279 280def _generate_labels(classes: int, size: int) -> NDArray: 281 """ 282 Generate a numpy array of size `size`, where the values are integers between 283 0 and `classes` - 1, distributed as evenly as possible. 284 285 This array will be used to generate a synthetic array of features for each 286 array element. 
287 """ 288 289 records_per_class: int = int(size / classes) 290 extra_records: int = size % classes 291 292 labels: NDArray = np.zeros((size, 1)) 293 index: int = 0 294 295 for x in range(classes): 296 if x < extra_records: 297 records_for_this_class = records_per_class + 1 298 else: 299 records_for_this_class = records_per_class 300 for y in range(records_for_this_class): 301 labels[index + y, 0] = x 302 index = index + records_for_this_class 303 304 return labels 305 306 307def _randomize_features(data: NDArray, k: int, 308 numFeatures: int = 600) -> NDArray: 309 310 featuresToFlip = random.sample(range(numFeatures), k) 311 312 data = data.reshape((1, numFeatures)) 313 314 data[0][featuresToFlip] ^= 1 315 316 return data 317 318 319def _get_random_record(numFeatures: int, 320 randomGenerator=globalRandomGen) -> NDArray: 321 322 x = randomGenerator.integers(0, high=1, endpoint=True, size=numFeatures) 323 324 return x.reshape((1, numFeatures)) 325 326 327def _randomize_features_batched( 328 data: NDArray, k: int, batchSize: int, numFeatures: int = 600) -> NDArray: 329 330 outputdata = np.repeat(data.reshape((numFeatures, 1)), batchSize, axis=1).transpose() 331 332 import numpy.testing as tt 333 tt.assert_equal(outputdata[0], data.reshape(numFeatures)) 334 335 # Flip features of the first record 336 featuresToFlip = random.sample(range(numFeatures), k) 337 outputdata[0, featuresToFlip] ^= 1 338 339 # Flip all further records based on the previous one 340 for i in range(1,batchSize): 341 featuresToFlip = random.sample(range(numFeatures), k) 342 outputdata[i,:] = outputdata[i-1, :] 343 outputdata[i, featuresToFlip] ^= 1 344 345 return outputdata 346 347 348def _rebatch(x, k, batchSize, targetModel) -> Tuple[NDArray, NDArray, int]: 349 xs = _randomize_features_batched(x, k, batchSize) 350 ys = targetModel.predict(xs, batch_size=batchSize, verbose=0) 351 return xs, ys, 0 352 353 354def _generate_synthetic_record_batched(label: int, 355 targetModel: Sequential, 356 k_max: int = 200, 357 k_min: int = 5, 358 conf_min: float = 0.05, 359 rej_max: int = 20, 360 iter_max: int = 200, 361 batchSize: int = 1) -> Tuple[int, Optional[NDArray]]: 362 """ 363 Synthesize a data record, using Algorithm 1 from Shokri et als 364 paper "Membership Inference Attacks against Machine Learning Models". 365 """ 366 assert label < 100 and label >= 0 367 368 # Initalization 369 batchIndex: int = 0 370 numFeatures: int = 600 371 kWasUpdated = False 372 k = k_max 373 y_c_star = 0 374 j = 0 375 x = _get_random_record(numFeatures) 376 haveSampled = False 377 378 if batchSize == 1: 379 xs = x.reshape((1, 600)) 380 ys = targetModel.predict(xs, batch_size=batchSize, verbose=0) 381 else: 382 xs, ys, batchIndex = _rebatch(x, k, batchSize, targetModel) 383 384 # Controls number of iterations 385 for i in range(iter_max): 386 387 x = xs[batchIndex] 388 y = ys[batchIndex] 389 y_c = y[label] 390 predictedClass = np.argmax(y, axis=0) 391 392 if y_c >= y_c_star: 393 if y_c > conf_min and predictedClass == label: 394 # print(f"Now sampling! 
{batchIndex},{y_c},{y_c_star}") 395 haveSampled = True 396 if y_c > globalRandomGen.random(): 397 return i, x.reshape((1, numFeatures)) 398 399 xs, ys, batchIndex = _rebatch(x, k, batchSize, targetModel) 400 y_c_star = y_c 401 j = 0 402 continue 403 else: 404 j = j + 1 405 if j > rej_max and (k != k_min) and haveSampled: 406 k = int(max(k_min, np.ceil(k / 2))) 407 j = 0 408 kWasUpdated = True 409 410 batchExhausted = (batchIndex == batchSize - 1) 411 412 if batchExhausted or kWasUpdated: 413 xs, ys, batchIndex = _rebatch(x, k, batchSize, targetModel) 414 kWasUpdated = False 415 else: 416 batchIndex += 1 417 418 # if (i % 20) == 0: 419 # print(f"{i}/{iter_max}, y_c/y_c*: {y_c:.1%}/{y_c_star:.1%}, pred/class: {predictedClass}/{label}") 420 421 return iter_max, None 422 423 424def _generate_synthetic_record(label: int, 425 targetModel: Sequential, 426 k_max: int = 200, 427 k_min: int = 5, 428 conf_min: float = 0.05, 429 rej_max: int = 20, 430 iter_max: int = 200, 431 batchSize: int = 1) -> Tuple[int,Optional[NDArray]]: 432 """ 433 Synthesize a data record, using Algorithm 1 from Shokri et als 434 paper "Membership Inference Attacks against Machine Learning Models". 435 """ 436 assert label < 100 and label >= 0 437 438 # Initalization 439 numFeatures: int = 600 440 k = k_max 441 y_c_star = 0 442 j = 0 443 x = _get_random_record(numFeatures) 444 445 # Controls number of iterations 446 for i in range(iter_max): 447 448 y = targetModel.predict(x, batch_size=1, verbose=0) 449 y_c = y[0][label] 450 predictedClass = np.argmax(y, axis=1)[0] 451 452 if y_c >= y_c_star: 453 if y_c > conf_min and predictedClass == label: 454 # print("Now sampling!") 455 if y_c > globalRandomGen.random(): 456 return i,x 457 458 y_c_star = y_c 459 j = 0 460 else: 461 j = j + 1 462 if j > rej_max and (k != k_min): 463 k = int(max(k_min, np.ceil(k / 2))) 464 j = 0 465 466 x = _randomize_features(x, k) # pyright: ignore 467 468 # if (i % 20) == 0: 469 # print( 470 # f"{i}/{iter_max}, y_c/y_c*: {y_c:.1%}/{y_c_star:.1%}, pred/class: {predictedClass}/{label}") 471 472 return iter_max,None 473 474 475def hill_climbing(targetModel: Sequential, numRecords: int, 476 **hyperpars) -> Dataset: 477 """ 478 Generate synthetic data for the shadow models by querying the target model 479 for randomly sampled records, in order to find those that are classified 480 with high confidence. 481 482 `numRecords`: size of generated dataset 483 `hyperpars` has the following keys (taken from the paper: 484 k_max,k_min,rej_max,conf_min,iter_max) 485 """ 486 487 # Generate an array of labels, determining which class to synthesize for 488 # TODO: initializing and then changing `features` array might not be most 489 # efficient solution 490 491 numClasses: int = 100 492 labels: NDArray = _generate_labels(numClasses, numRecords) 493 494 numFeatures: int = 600 495 features: NDArray = np.zeros((numRecords, numFeatures)) 496 overallNumQueries = [] 497 498 for index, label in enumerate(labels): 499 label = int(label[0]) 500 queries, new_record = _generate_synthetic_record(label, targetModel, **hyperpars) 501 overallNumQueries.append(queries) 502 while new_record is None: 503 queries, new_record = _generate_synthetic_record(label, targetModel, **hyperpars) 504 overallNumQueries.append(queries) 505 print(f"Generating synthetic records: {index}/{numRecords}, {index/numRecords*100:.2f}% done. 
On average {sum(overallNumQueries)/(index+1)} +- {np.std(overallNumQueries)} queries.") 506 features[index] = new_record.reshape((1, numFeatures)) 507 508 features = features.reshape((numRecords, numFeatures)) 509 labels = labels.reshape((numRecords, 1)) 510 return Dataset.from_tensor_slices((features, labels)) 511 512def get_target_model_rest_data(config:Dict) -> Dataset: 513 modelName = tm.get_model_name(config) 514 restDataName = modelName + "_rest_data" 515 return ds.load_target(restDataName) 516 517if __name__ == "__main__": 518 import argparse 519 import configuration as con 520 import datasets as ds 521 import target_models as tm 522 523 parser = argparse.ArgumentParser(description='Generate all the necessary shadow data and save it to disk.') 524 parser.add_argument('--config', help='Relative path to config file.',) 525 config = con.from_cli_options(vars(parser.parse_args())) 526 set_seed(config["seed"]) 527 528 shadowData = get_shadow_data(config)
def set_seed(new_seed: int):
    """
    Set the global seed that will be used for all functions that include
    randomness.
    """
    global global_seed
    global_seed = new_seed
    np.random.seed(global_seed)
    random.seed(global_seed)
    random_seed.set_seed(global_seed)
Set the global seed that will be used for all functions that include randomness.
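A minimal usage sketch, assuming the module is importable as `shadow_data` (the seed value is arbitrary):

```python
import shadow_data as sd

sd.set_seed(42)  # seeds numpy, random and TensorFlow before any generation is run
```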
def generate_shadow_data_sampling(original_data: Dataset) -> Dataset:
    """
    Generate synthetic data for the shadow models by randomly sampling data
    points from the original data set.
    """
    sample_dataset: Dataset = tf.data.Dataset.sample_from_datasets(
        [original_data], seed=global_seed, stop_on_empty_dataset=True)
    return sample_dataset
Generate synthetic data for the shadow models by randomly sampling data points from the original data set.
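A minimal usage sketch, assuming `original_data` is an already loaded `tf.data.Dataset` of (features, label) pairs:

```python
# Hypothetical usage; original_data must be a tf.data.Dataset of (features, label) pairs.
shadow = generate_shadow_data_sampling(original_data)
for features, label in shadow.take(3):
    print(features.shape, label.shape)
```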
def get_shadow_data_name(config: Dict):
    shadowConfig = config["shadowDataset"]
    method = shadowConfig["method"]
    targetDataName = config["targetDataset"]["name"]
    dataSize = shadowConfig["size"]
    hyperpars = shadowConfig[method]["hyperparameters"]
    if method == "noisy":
        dataName = f'{method}_fraction_{hyperpars["fraction"]}_size_{dataSize}_target_{targetDataName}'
    elif method == "hill_climbing":
        dataName = \
            f'{method}_' + \
            f'{targetDataName}_' + \
            f'kmax_{hyperpars["k_max"]}_' + \
            f'kmin_{hyperpars["k_min"]}_' + \
            f'confmin_{hyperpars["conf_min"]}_' + \
            f'rejmax_{hyperpars["rej_max"]}_' + \
            f'itermax_{hyperpars["iter_max"]}_' + \
            f'size_{dataSize}'
    elif method == "original":
        dataName = \
            f'{method}_' + \
            f'{targetDataName}_' + \
            f'original_data_' + \
            f'size_{dataSize}'
    elif method == "statistic":
        dataName = \
            f'{method}_' + \
            f'{targetDataName}_' + \
            f'statistic_' + \
            f'size_{dataSize}'
    else:
        raise ValueError(f"{method} is not a valid shadow data method.")
    return dataName
def get_shadow_data(config: Dict) -> ds.Dataset:
    verbose = config["verbose"]
    shadowConfig = config["shadowDataset"]
    method = shadowConfig["method"]
    sizePerModel = shadowConfig["size"]
    split = config["shadowModels"]["split"]
    numModels = config["shadowModels"]["number"]
    dataSize = int(np.ceil(sizePerModel * (numModels/split)))
    hyperpars = shadowConfig[method]["hyperparameters"]
    dataName = get_shadow_data_name(config)

    try:
        print("Loading shadow data from disk.")
        shadowData = load_shadow_data(config)
    except BaseException:
        print("Loading failed, generating shadow data.")

        targetDataset = get_target_model_rest_data(config)
        targetModel = tm.load_model(tm.get_model_name(config), verbose=config["verbose"])

        if method == "noisy":
            shadowData = generate_shadow_data_noisy(targetDataset, dataSize, **hyperpars)
        elif method == "hill_climbing":
            shadowData = hill_climbing(targetModel, dataSize, **hyperpars)
        elif method == "original":
            shadowData = generate_shadow_data_original(targetDataset, dataSize)
        elif method == "statistic":
            shadowData = generate_shadow_data_statistic(config)
        else:
            raise ValueError(f"{method} is not a valid shadow data method.")

        if verbose:
            print(f"Saving shadow data {dataName} to disk.")
        try:
            ds.save_shadow(shadowData, dataName)
        except BaseException:
            print(f"Failed to save shadow data {dataName} to disk.")
            ds.delete_shadow(dataName)
            raise

    return shadowData
def generate_shadow_data_original(targetDataset, outputSize) -> Dataset:

    inputSize = targetDataset.cardinality().numpy()

    shadowData = ds.shuffle(targetDataset)

    if inputSize >= outputSize:
        return shadowData.take(outputSize)

    numSets = int(np.floor(outputSize / inputSize))
    for _ in range(numSets - 1):
        newSet = ds.shuffle(targetDataset)
        shadowData = shadowData.concatenate(newSet)

    # How many records to add after collecting numSets sets
    offset = outputSize % inputSize
    offsetSet = ds.shuffle(targetDataset).take(offset)

    return shadowData.concatenate(offsetSet)
def generate_shadow_data_noisy(original_data: Dataset, outputSize: int, fraction: float = 0.1) -> Dataset:
    """
    Generate synthetic data for the shadow models by using a noisy version of
    the original data.
    Returns only the noisy data, not the original data.

    Arguments:
        fraction: fraction of features that will be flipped per data record to
        make it "noisy"
    """
    inputSize = original_data.cardinality().numpy()
    # Since outputSize % inputSize not always 0, we have to fill the gap with a subset
    # of the full input data. To avoid bias, shuffle the input data.
    noisySet = _make_dataset_noisy(ds.shuffle(original_data), fraction)

    if inputSize >= outputSize:
        return noisySet.take(outputSize)

    numNoisyVersions = int(np.floor(outputSize / inputSize))
    # How many records to add after collecting numNoisyVersions sets
    offset = outputSize % inputSize

    for _ in range(numNoisyVersions - 1):
        newNoisySet = _make_dataset_noisy(ds.shuffle(original_data), fraction)
        noisySet = noisySet.concatenate(newNoisySet)

    offsetSet = _make_dataset_noisy(
        ds.shuffle(original_data), fraction).take(offset)
    return noisySet.concatenate(offsetSet)
Generate synthetic data for the shadow models by using a noisy version of the original data. Returns only the noisy data, not the original data.
Arguments: fraction: fraction of features that will be flipped per data record to make it "noisy"
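Per record, the noise amounts to flipping `k = int(fraction * numFeatures)` randomly chosen binary features (60 of the 600 features for the default `fraction=0.1`). A self-contained sketch of that per-record step, independent of the tf.data pipeline used in the source above; the function name is hypothetical:

```python
import numpy as np

def flip_random_features(record: np.ndarray, fraction: float = 0.1) -> np.ndarray:
    """Illustrative stand-in for the per-record noise step; record is a 0/1 integer vector."""
    num_features = record.shape[0]            # 600 in the Kaggle setting
    k = int(num_features * fraction)          # number of features to flip
    idx = np.random.default_rng().choice(num_features, size=k, replace=False)
    noisy = record.copy()
    noisy[idx] ^= 1                           # flip the selected binary features
    return noisy
```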
def generate_shadow_data_statistic(config: Dict) -> Dataset:
    """
    Generate synthetic data for the shadow models by using the marginal
    distribution of features in the original dataset.
    """
    # TODO: Kaggle specific
    size = config["shadowDataset"]["size"]
    try:
        marginalProbabilities = ds.load_numpy_array("kaggle_marginals.npy")
    except:
        marginalProbabilities = _compute_kaggle_marginals(config)
        ds.save_numpy_array("kaggle_marginals.npy", marginalProbabilities)

    # Generate new records
    numClasses = marginalProbabilities.shape[0]
    numFeatures = marginalProbabilities.shape[1]

    features: NDArray = np.zeros((size, numFeatures)).astype(np.int32)
    labels: NDArray = np.zeros((size, numClasses)).astype(np.int32)

    recordsPerClass = int(size/numClasses)

    for _class in range(numClasses):
        if config["verbose"]:
            print(f"Generating records for class {_class}")
        index_start = _class * recordsPerClass
        index_end = (_class + 1) * recordsPerClass
        # for index in range(index_start, index_end):
        #     labels[index] = to_categorical(_class, num_classes = numClasses)
        labels[index_start:index_end] = np.tile(to_categorical(_class, num_classes=numClasses),
                                                recordsPerClass).reshape(recordsPerClass, numClasses)
        gen = np.random.default_rng(seed=global_seed)
        marginalProbability = marginalProbabilities[_class]

        for feature in range(numFeatures):
            probability = marginalProbability[feature]
            # sample one feature for all records in this class at once
            sampledFeature = gen.choice([0, 1], p=[1-probability, probability], size=recordsPerClass)
            features[index_start:index_end, feature] = sampledFeature

    shadowData = Dataset.from_tensor_slices((features, labels))
    shadowData = shadowData.shuffle(size, seed=global_seed, reshuffle_each_iteration=False)
    return shadowData
Generate synthetic data for the shadow models by using the marginal distribution of features in the original dataset.
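In essence, every synthetic record of class c is drawn feature-by-feature from independent Bernoulli distributions whose parameters are the class-conditional marginals estimated from the original data. A compact sketch of that sampling idea (the actual implementation above additionally caches the marginals on disk as `kaggle_marginals.npy`; the function below is illustrative only):

```python
import numpy as np

def sample_from_marginals(marginals: np.ndarray, records_per_class: int, seed: int = 1234):
    """marginals[c, f] = P(feature f == 1 | class c), shape (numClasses, numFeatures)."""
    rng = np.random.default_rng(seed)
    num_classes, num_features = marginals.shape
    # One independent Bernoulli draw per class, record and feature
    features = (rng.random((num_classes, records_per_class, num_features))
                < marginals[:, None, :]).astype(np.int32)
    # One-hot labels, class blocks in the same order as the feature blocks
    labels = np.repeat(np.eye(num_classes, dtype=np.int32), records_per_class, axis=0)
    return features.reshape(-1, num_features), labels
```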
def hill_climbing(targetModel: Sequential, numRecords: int,
                  **hyperpars) -> Dataset:
    """
    Generate synthetic data for the shadow models by querying the target model
    for randomly sampled records, in order to find those that are classified
    with high confidence.

    `numRecords`: size of generated dataset
    `hyperpars` has the following keys (taken from the paper:
    k_max, k_min, rej_max, conf_min, iter_max)
    """

    # Generate an array of labels, determining which class to synthesize for
    # TODO: initializing and then changing `features` array might not be most
    #       efficient solution

    numClasses: int = 100
    labels: NDArray = _generate_labels(numClasses, numRecords)

    numFeatures: int = 600
    features: NDArray = np.zeros((numRecords, numFeatures))
    overallNumQueries = []

    for index, label in enumerate(labels):
        label = int(label[0])
        queries, new_record = _generate_synthetic_record(label, targetModel, **hyperpars)
        overallNumQueries.append(queries)
        while new_record is None:
            queries, new_record = _generate_synthetic_record(label, targetModel, **hyperpars)
            overallNumQueries.append(queries)
        print(f"Generating synthetic records: {index}/{numRecords}, {index/numRecords*100:.2f}% done. On average {sum(overallNumQueries)/(index+1)} +- {np.std(overallNumQueries)} queries.")
        features[index] = new_record.reshape((1, numFeatures))

    features = features.reshape((numRecords, numFeatures))
    labels = labels.reshape((numRecords, 1))
    return Dataset.from_tensor_slices((features, labels))
Generate synthetic data for the shadow models by querying the target model for randomly sampled records, in order to find those that are classified with high confidence.
`numRecords`: size of generated dataset.

`hyperpars` has the following keys (taken from the paper: k_max, k_min, rej_max, conf_min, iter_max).
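A minimal invocation sketch; `target_model` is assumed to be an already trained Keras model of the target, and the hyperparameter values simply restate the defaults of `_generate_synthetic_record`, not tuned recommendations:

```python
# Hypothetical call; target_model must be a trained tf.keras model.
hyperparameters = {
    "k_max": 200,      # features flipped per proposal at the start
    "k_min": 5,        # lower bound on k after repeated rejections
    "conf_min": 0.05,  # minimum confidence before a candidate may be accepted
    "rej_max": 20,     # rejections tolerated before k is halved
    "iter_max": 200,   # per-record query budget
}
shadow_dataset = hill_climbing(target_model, numRecords=1000, **hyperparameters)
```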