forked from FFmgll/shapiq
-
Notifications
You must be signed in to change notification settings - Fork 0
/
precompute_lm.py
53 lines (41 loc) · 1.56 KB
/
precompute_lm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
"""This Module can be used to precompute model outpts for the language model for the NLPLookupGame"""
import random
import os
import pandas as pd
import tqdm
from games import NLPGame
from approximators.base import powerset
# Sentence length handled by this script: only CSV rows whose "length" column
# equals this value are used, and the evaluated subsets are drawn from the
# player ids 0 .. n-1.
n = 14
# Number of not-yet-computed sentences to process in one run of the script.
N_SAMPLES = 1
def _coalition_key(coalition):
    """Return the storage key of a coalition: ``'s'`` followed by its player ids."""
    # NOTE(review): the ids are concatenated without a separator, so for
    # n > 10 the key is ambiguous (players (1, 2) and (12,) both give "s12").
    # The format is deliberately left unchanged because already written result
    # files, and presumably the code that reads them, depend on it -- confirm
    # before changing.
    return "s" + "".join(str(player) for player in coalition)


def main():
    """Precompute the game values of all coalitions for up to ``N_SAMPLES`` sentences.

    Reads ``games/data/simplified_imdb.csv`` and keeps the rows whose
    ``length`` equals ``n``. Sentences are visited in random order; a sentence
    is skipped when its result file already exists. For every other sentence,
    ``NLPGame.set_call`` is evaluated on each subset yielded by ``powerset``
    and the results are written to
    ``games/data/nlp_values/<n>/<sentence_id>.csv`` with the columns ``set``
    and ``value``.

    The run stops after ``N_SAMPLES`` new files were written or when no
    uncomputed sentence is left, whichever happens first.
    """
    df = pd.read_csv("games/data/simplified_imdb.csv")
    df = df[df["length"] == n]

    save_dir = os.path.join("games/data", "nlp_values", str(n))
    os.makedirs(save_dir, exist_ok=True)

    # Visit every candidate at most once, in random order. Drawing ids with
    # replacement until an uncomputed one turns up never terminates once fewer
    # than N_SAMPLES uncomputed sentences remain, and fails outright when no
    # sentence of the requested length exists.
    candidate_ids = list(df["id"].values)
    random.shuffle(candidate_ids)

    sampled_n = 0
    for sentence_id in candidate_ids:
        if sampled_n >= N_SAMPLES:
            break

        sentence_path = str(sentence_id) + ".csv"
        target_file = os.path.join(save_dir, sentence_path)
        # Checked on every iteration (not once up front) so that files which
        # appear while this script is running are honoured as well.
        if os.path.exists(target_file):
            continue

        sentence = str(df[df["id"] == sentence_id]["text"].values[0])
        game = NLPGame(input_text=sentence)
        players = set(range(0, n))
        calls = []
        print(f"Starting sample {sampled_n + 1} from {N_SAMPLES}.\n"
              f"sentence_id: {sentence_id}\n"
              f"Sentence: {sentence}")

        # The context manager closes the progress bar even if the model raises.
        with tqdm.tqdm(total=2 ** n) as pbar:
            for S in powerset(players, min_size=0, max_size=n):
                value = game.set_call(S)
                calls.append({"set": _coalition_key(S), "value": value})
                pbar.update(1)

        pd.DataFrame(calls).to_csv(target_file, index=False)
        sampled_n += 1

    if sampled_n < N_SAMPLES:
        print(f"Computed {sampled_n} of {N_SAMPLES} requested samples: "
              f"no uncomputed sentence of length {n} is left.")


if __name__ == "__main__":
    main()