-
Notifications
You must be signed in to change notification settings - Fork 2
/
msrigs_simulation_robustanalysis_ni.py
275 lines (260 loc) · 12.9 KB
/
msrigs_simulation_robustanalysis_ni.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
# Multi-Stage Re-Identification (MSRI) Game Solver (GS) v1.1
# Component: MSRIGS Robust Analysis On Size of Identified Dataset Using Simulated Datasets
# (Data subjects have different strategies)
# Copyright 2018-2021 Zhiyu Wan, HIPLAB, Vanderbilt University
# Compatible with python 3.8.5. Package dependencies: Numpy 1.19.1, Scikit-learn 0.23.2, Pandas 1.1.3, Matplotlib 3.3.1,
# Seaborn 0.11.0, and SciPy 1.5.2
# Update history:
# Added the surname inference algorithm from the paper "Identifying personal genomes by surname inference", Science 2013
# Nov 19, 2018: Added 3 global dictionaries for distance and confidence score computation;
# Nov 19, 2018: Deleted mulist, xlist; Let etmrca function be more efficient by using cache and pre-computing
# Nov 21, 2018: Added a global dictionary for attack simulation computation
# Nov 23, 2018: Utility for each attribute is proportional to entropy
# Nov 27, 2018: Adding generalization strategies for age and state attributes
# Dec 2, 2018: Adding weight vector for utility, and adding missing data to G
# Dec 17, 2018: Individual-wise defense strategy
# Dec 17, 2018: Adding a surname inference dictionary
# Jan 23, 2019: Deleting the dictionary for defense computation
# March 18, 2019: Fixing a bug regarding tuple_x (all changed to tuple_x_selection)
# March 12, 2020: 3)save detailed results!
# 4)add the scenario of data masking! 5)add the scenario that only opt-in or opt-out!
# 6)add the attacker's cost (attack rate * c)! 7)add the defender's cost (L*pa)!
# March 17, 2020: 8)consider the scenario of always_attack
# March 18, 2020: 2)fix the total utility for various m
# March 19, 2020: 1)threshold for theta
# April 16, 2020: To plot privacy and utility. To plot error bar.
# April 19, 2020: clean the code
# April 21, 2020: functions been moved out, changed optimal defense output
# April 21, 2020: 1) output all data points instead of just average values. 2)deleted "defense_mode" and "n_cost"
# July 15, 2020: Modify one scenario, and add three new scenarios.
# July 15, 2020: Rename saved file.
# Oct 15, 2020: Using function version 21.
# Oct 15, 2020: In the inner loop, add missing values and compute entropy (with missing data).
# April 17, 2021: Robust analysis
import numpy as np
import time
import pandas as pd
import msrigs_functions as sf
import os.path
import pickle
# configuration
id_exp = '2056_nI30000'  # ID for the set of experiments ('2056_nI10000' or '2056_nI30000')
n_f: int = 20  # number of firstnames
# n_I (size of the identified dataset, <=90000) is not fixed here: it is swept in the main loop
n_S: int = 1000  # size of the sensitive dataset (<=90000)
n_G: int = 20000  # size of the genetic genealogy dataset # from 4000 on 20180928 # from 10000 on 20200312 (<=90000)
rate_s = 0.6  # rate of sensitive
loss = 150  # defender's loss term (L) when a record is re-identified
cost = 10  # attacker's per-attack cost (c)
total_utility = 100  # total utility available from sharing all attributes
n_iter: int = 20  # number of simulation iterations
theta_p = 0.5  # threshold for theta
method: int = 2
m_g: int = 12  # number of genomic (Y-STR) attributes; default: 16
m_start: int = 2000  # smallest identified-dataset size in the sweep
n_m: int = 20  # number of sweep steps for n_I
step: int = 2000  # sweep step size for n_I
weight = np.concatenate((np.ones(2) * 1, np.ones(m_g)), axis=None)  # per-attribute utility weights
missing_level = 0.3  # fraction of genomic values masked as missing in G
over_confident: int = 0
alter_weight: int = 0  # 0: Based on information entropy. 1: Uniform. 2: Special (the weight of 1st 2 geno features is 10x).
algorithm: int = 0  # 0: greedy algorithm. 1: brute-force algorithm.
pruning: int = 1
participation_rate = 0.05
random_masking_rate = 0.15
save_dic: int = 0
save_S: bool = True  # save S in each iteration (only works for tested_scenario 0)
save_I: bool = True  # save I in each iteration (only works for tested_scenario 0)
# choose a parameter
parameter: int = 2  # 0:Cost of an attack. 1:Number of records in the genetic genealogy dataset.
# 2:Number of records in the identified dataset.
parameter_names = ['cost', 'nG', 'nI']
# choose a scenario
scenario = 3.4  # 0: no protection. 1: no genomic data sharing. 2: random opt-in. 3: random masking. 3.x: custom masking.
# 3.1: k-anonymity. 3.4: test corresponding scenario in the standard setting.
# 4: opt-in game. 5: masking game. 6: no-attack masking game. 7: one-stage masking game.
tested_scenario = 4  # corresponding scenario to be tested
if scenario == 3.4:
    scenario_name = 'ts' + str(tested_scenario)
else:
    scenario_name = 's' + str(scenario)
# create folders
folder_result = 'Results' + id_exp + '/' + parameter_names[parameter] + '_robust/'
if pruning == 1:
    folder_result += 'pruning/'
# Create the whole directory tree in one call. exist_ok=True makes this
# idempotent and removes the check-then-create race of the previous
# component-by-component os.path.exists()/os.mkdir() loop.
os.makedirs(folder_result, exist_ok=True)
pickle_filename = folder_result + 'result_p' + str(parameter) + '_ts' + str(tested_scenario) + '.pickle'
filename = folder_result + 'log_p' + str(parameter) + '_ts' + str(tested_scenario) + '.txt'
mu = 1e-3 * np.array([2.381, 2.081, 1.781, 2.803, 2.298, 3.081, 0.552, 0.893, 1.498, 0.425, 5.762, 1.590,
                      4.769, 6.359, 3.754, 2.180])  # updated mutation rate in 2008
tol = 0.2
Ne = 10000  # effective population size used in TMRCA estimation
inv_Ne = 1.0 / Ne
T_Max = 200
# Columns of I used for matching; presumably YOB, State, Surname (index -2) — verify against sf usage.
I_selection = np.array([2, 3, -2]).astype(int)
if __name__ == '__main__':
    start1 = time.time()  # wall-clock timer for the data-loading phase
    # Raw per-individual attributes accumulated while parsing the pedigree files.
    surname = []
    genome = []
    ages = []
    states = []
    ID = []
    n_r = []  # number of retained (male) records per pedigree file
    # Flat result arrays: one slot per (size step m_n, iteration i, target record j).
    array_optimal_payoff = np.empty(n_m * n_iter * n_S)
    array_optimal_attacker_payoff = np.empty(n_m * n_iter * n_S)
    array_privacy = np.empty(n_m * n_iter * n_S)
    array_utility = np.empty(n_m * n_iter * n_S)
    array_success_rate = np.empty(n_m * n_iter * n_S)
    # Caches shared across ALL iterations (distance and confidence-score memoization).
    dic_dist = {}
    dic_score_solo = {}
    dic_score = {}
    # Load the three pedigree files plus their line-aligned surname/birth-year/state files.
    # NOTE(review): files are opened without context managers; a parse error would leak
    # the handles — consider `with` blocks.
    for i in range(3):
        n = 0
        f = open("data/csv/ped"+str(i+1)+".txt", "r")
        f2 = open("data/csv/surname" + str(i + 1) + ".txt", "r")
        f3 = open("data/csv/birth_year" + str(i + 1) + ".txt", "r")
        f4 = open("data/csv/state" + str(i + 1) + ".txt", "r")
        for line in f.readlines():
            # The four files are parallel: read one record from each per pedigree line.
            line2 = f2.readline()
            line3 = f3.readline()
            line4 = f4.readline()
            sname = int(float(line2.rstrip("\n"))) #
            age = 2020-int(float(line3.rstrip("\n")))  # age relative to year 2020
            state = int(float(line4.rstrip("\n")))
            loci = line.rstrip("\n").split(" ")
            if loci[3] == 'M':  # keep male records only (Y-chromosome surname inference)
                n += 1
                y1 = []
                for j in range(len(loci)):
                    if j == 0:
                        ID.append(int(loci[j]))
                    elif j >= 5 and j % 2 == 0:
                        # every other column from index 6 onward holds a marker value
                        y1.append(int(loci[j]))
                genome.append(y1)
                surname.append(sname)
                ages.append(age)
                states.append(state)
        n_r.append(n)
        f.close()
        f2.close()
        f3.close()
        f4.close()
    elapsed1 = (time.time() - start1)
    f = open(filename, 'w')  # per-record text log
    f.write(folder_result + '\n')
    start2 = time.time()  # timer for the computing phase
    for i in range(n_iter):
        print('iter: ', i)
        for m_n in range(n_m):
            # Sweep the identified-dataset size n_I across n_m steps.
            n_I = step * m_n + m_start
            print('n_I: ', n_I)
            np.random.seed(i)  # reset random number generator for comparison
            World = sf.build_world(ID, genome, surname, ages, states, n_r, n_f, rate_s)
            (S, I, G2) = sf.generate_datasets(World, n_I, n_S, n_G) #G2 has ground truth
            # ID, first name, ages, states, genomic attributes, surname, sensitive
            # save S
            if save_S and tested_scenario == 0:
                if not os.path.exists(folder_result + 'target_data'):
                    os.mkdir(folder_result + 'target_data')
                header = "YOB,State"
                for j in range(m_g):
                    header += ",STR" + str(j + 1)
                np.savetxt(folder_result + 'target_data/i' + str(i) + '_m' + str(m_n) + '.csv', S[:, 2:(2 + m_g + 2)],
                           delimiter=',', fmt='%d', header=header, comments='')
            # save I
            if save_I and tested_scenario == 0:
                if not os.path.exists(folder_result + 'identified_data'):
                    os.mkdir(folder_result + 'identified_data')
                header = "YOB,State,Surname"
                np.savetxt(folder_result + 'identified_data/i' + str(i) + '_m' + str(m_n) + '.csv',
                           I[:, [2, 3, (4 + m_g)]], delimiter=',', fmt='%d', header=header, comments='')
            # Add missing values
            if missing_level > 0:
                np.random.seed(i)
                # Zero out a fixed fraction of the genomic cells of G uniformly at random.
                n_missing = int(n_G * m_g * missing_level)
                missed = np.append(np.zeros(n_missing), np.ones(n_G * m_g - n_missing))
                np.random.shuffle(missed)
                G1 = np.multiply(G2[:, 4:(4 + m_g)], missed.reshape(n_G, m_g))
                G = np.concatenate((G2[:, 0:4], G1, G2[:, -2:]), axis=1)
            else:
                G = G2
            # Compute entropy
            if alter_weight == 0:
                entropy = []
                for j in range(m_g + 2):
                    if j == 0 or j == 1: # entropy in demographic dataset
                        c = I[:, j + 2]
                    else: # entropy in genetic genealogy dataset
                        c = G[:, j + 2]
                    entropy.append(sf.get_entropy(c))
                entropy = np.asarray(entropy)
                weighted_entropy = np.multiply(entropy, weight[0:(m_g + 2)])
            elif alter_weight == 1:
                weighted_entropy = weight[0:(m_g + 2)]
            elif alter_weight == 2:
                weighted_entropy = np.concatenate((np.ones(2), np.ones(2) * 10, np.ones(m_g - 2)), axis=None)
            # load customized strategy
            custom_strategy_folder = "custom_strategy/"
            if scenario == 3.1:
                custom_strategy_folder += "k_anonymity/"
            elif scenario == 3.4: # robust test
                custom_strategy_filename = 'Results' + id_exp + '/Violin/m2/pruning/opt_strategy/s' + str(
                    tested_scenario) + '_i' + str(i) + '.csv'
            if scenario != 3.4:
                custom_strategy_filename = folder_result + custom_strategy_folder + 'i' + str(i) + '.csv'
            if os.path.exists(custom_strategy_filename):
                custom_strategies = np.genfromtxt(custom_strategy_filename, delimiter=',').astype(bool)
            else:
                # Fall back to sharing everything when no pre-computed strategy file exists.
                print(custom_strategy_filename + " does not exist!")
                custom_strategies = np.ones([n_S, 2 + m_g]).astype(bool) # default
            # Per-(i, m_n) caches for surname inference and attack simulation.
            dic_surname = {}
            dic_attack = {}
            for j in range(n_S):
                #print('j: ', j)
                s = S[j, :]  # one target record from the sensitive dataset
                (opt_payoff, opt_attacker_payoff, opt_attack, opt_success_rate, opt_utility, opt_strategy) = \
                    sf.optimal_defense(s, I, G, weighted_entropy, m_g, dic_attack, dic_surname, loss, cost, scenario,
                                       total_utility, theta_p, over_confident, mu, method, tol, dic_dist,
                                       dic_score_solo, dic_score, T_Max, inv_Ne, participation_rate, random_masking_rate,
                                       algorithm, pruning, I_selection, custom_strategies[j, :], 1)
                # Flatten (m_n, i, j) into one index; matches the (n_m, n_iter * n_S)
                # reshape used when averaging below.
                index = m_n * n_S * n_iter + i * n_S + j
                array_optimal_payoff[index] = opt_payoff
                array_optimal_attacker_payoff[index] = opt_attacker_payoff
                array_privacy[index] = 1 - opt_success_rate * opt_attack
                array_utility[index] = opt_utility
                array_success_rate[index] = opt_success_rate
                f.write(
                    '{}-{}-{}: {} {:f} {:d} {}\n'.format(i, m_n, j, np.array(list(map(int, opt_strategy))), opt_payoff,
                                                         opt_attack,
                                                         opt_success_rate))
    # Persist all per-record results for later plotting/analysis.
    dataset = pd.DataFrame({'privacy': array_privacy,
                            'utility': array_utility,
                            'defender_optimal': array_optimal_payoff,
                            'attacker_optimal': array_optimal_attacker_payoff,
                            'success_rate': array_success_rate})
    dataset.to_pickle(pickle_filename)
    elapsed2 = (time.time() - start2)
    # print out the results
    column_names = ['defender_optimal', 'privacy', 'utility', 'success_rate']
    for i in range(4):
        data = np.array(dataset[column_names[i]])
        # One row per n_I step; average over all iterations and target records.
        shaped_dataset = np.reshape(data, (n_m, n_iter * n_S))
        av2_dataset = np.mean(shaped_dataset, axis=1)
        f.write(column_names[i] + ' ' + str(av2_dataset) + '.\n')
    f.write("Time used: " + str(elapsed1) + " seconds (loading) + " + str(elapsed2) + " seconds (computing).\n")
    f.close()
    if save_dic == 1:
        # save dictionaries
        dic_names = ['dist', 'score', 'score_solo', 'attack', 'surname']
        dics = [dic_dist, dic_score, dic_score_solo, dic_attack, dic_surname]
        for i in range(5):
            f1 = open(folder_result + 'dic_p' + str(parameter) + '_' + scenario_name
                      + '_' + dic_names[i] + '.pkl', 'wb')
            pickle.dump(dics[i], f1, protocol=pickle.HIGHEST_PROTOCOL)
            f1.close()