2-evaluate.py
import argparse
import yaml
from os import path, makedirs, listdir
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, EvalPrediction, TrainingArguments, set_seed
from utils import sklearn_metrics_full, data_collator_tensordataset, load_data
import json

language = ""
current_model = ""
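# language and current_model are module-level globals: compute_metrics is called
# by the Trainer with a single EvalPrediction argument, so get_metrics has no
# other way to know which language and which checkpoint are being evaluated.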

def get_metrics(y_true, predictions, threshold=0.5):
    """
    Return the metrics for the predictions.

    :param y_true: True labels.
    :param predictions: Predictions.
    :param threshold: Threshold for the predictions. Default: 0.5.
    :return: Dictionary with the metrics.
    """
    global current_model
    global language

    metrics, class_report, conf_matrix = sklearn_metrics_full(
        y_true,
        predictions,
        path.join(args.data_path, language),
        threshold,
        True,
        True,
        args.parents,
    )

    # Save the classification report
    with open(path.join(current_model, "evaluation", "class_report.json"), "w") as class_report_fp:
        json.dump(class_report, class_report_fp, indent=2)

    # Save the confusion matrix
    with open(path.join(current_model, "evaluation", "conf_matrix.json"), "w") as conf_matrix_fp:
        json.dump(conf_matrix, conf_matrix_fp)

    # Save the metrics
    with open(path.join(current_model, "evaluation", "metrics.json"), "w") as metrics_fp:
        json.dump(metrics, metrics_fp, indent=2)

    print(metrics)

    return metrics
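
# Note: sklearn_metrics_full is provided by the local utils module (not shown
# here). From the call above it is assumed to take the gold labels, the raw
# predictions, the data directory for the current language, the decision
# threshold, two boolean flags defined in utils, and the parent-label handling
# mode, and to return a metrics dict, a per-class classification report and a
# confusion matrix.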

def compute_metrics(p: EvalPrediction):
    """
    Compute the metrics for the predictions during the evaluation.

    :param p: EvalPrediction object.
    :return: Dictionary with the metrics.
    """
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    return get_metrics(p.label_ids, preds, args.threshold)
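
# Note: Trainer.predict() below calls compute_metrics once on the whole test
# split, passing an EvalPrediction whose `predictions` field holds the raw model
# outputs (logits; a tuple when the model returns extra tensors, hence the
# unpacking above) and whose `label_ids` field holds the gold labels.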

def start_evaluate():
    """
    Launch the evaluation of the models.
    """
    print(f"Working on device: {args.device}")

    global language
    global current_model
    language = args.lang

    # Load the seeds for the different splits
    if args.seeds != "all":
        seeds = args.seeds.split(",")
    else:
        seeds = [name.split("_")[1] for name in listdir(path.join(args.data_path, args.lang)) if "split" in name]

    for seed in seeds:
        if not path.exists(path.join(args.models_path, args.lang, seed)):
            print(f"Models for seed {seed} not found. Skipping...")
            continue

        # Load the data
        test_set = load_data(args.data_path, args.lang, "test", seed)

        # Get the last checkpoint
        last_checkpoint = max(
            int(f.split("-")[1])
            for f in listdir(path.join(args.models_path, args.lang, seed))
            if f.startswith("checkpoint-") and path.isdir(path.join(args.models_path, args.lang, seed, f))
        )
        last_checkpoint = path.join(args.models_path, args.lang, seed, f"checkpoint-{last_checkpoint}")

        # Create the directory for the evaluation output
        makedirs(path.join(last_checkpoint, "evaluation"), exist_ok=True)
        current_model = last_checkpoint

        # Load model and tokenizer
        print(f"\nEvaluating model: '{last_checkpoint}'...")
        set_seed(int(seed))
        tokenizer = AutoTokenizer.from_pretrained(last_checkpoint)
        model = AutoModelForSequenceClassification.from_pretrained(last_checkpoint, trust_remote_code=args.trust_remote)

        no_cuda = args.device == "cpu"

        # Set up the evaluation
        trainer = Trainer(
            args=TrainingArguments(
                path.join(last_checkpoint, "evaluation"),
                per_device_eval_batch_size=args.batch_size,
                seed=int(seed),
                no_cuda=no_cuda,
                report_to="all",
            ),
            model=model,
            tokenizer=tokenizer,
            data_collator=data_collator_tensordataset,
            compute_metrics=compute_metrics,
        )

        # Evaluate the model
        model.eval()
        trainer.predict(test_set)
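
# Each run writes its evaluation artifacts (metrics.json, class_report.json and
# conf_matrix.json, produced by get_metrics) into <last_checkpoint>/evaluation/.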

if __name__ == "__main__":
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument("--lang", type=str, default="it", help="Language to evaluate the model on.")
    parser.add_argument("--data_path", type=str, default="data/", help="Path to the EuroVoc data.")
    parser.add_argument("--models_path", type=str, default="models/", help="Path of the saved models.")
    parser.add_argument("--seeds", type=str, default="all", help="Seeds to be used to load the data splits, separated by a comma (e.g. 110,221). Use 'all' to use all the data splits.")
    parser.add_argument("--device", type=str, default="cpu", choices=["cpu", "cuda"], help="Device to evaluate on.")
    parser.add_argument("--batch_size", type=int, default=8, help="Batch size for evaluation.")
    parser.add_argument("--threshold", type=float, default=0.5, help="Threshold for the predictions.")
    parser.add_argument("--parents", type=str, default="none", choices=["none", "add", "builtin"], help="How to handle the parents of the labels. Add them 'artificially' with the 'add' option, or use the 'builtin' option if the labels were added during training.")
    parser.add_argument("--trust_remote", action="store_true", default=False, help="Trust the remote code for the model.")

    args = parser.parse_args()

    start_evaluate()
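
# Example invocation (illustrative values; assumes the default data/ and models/
# layout, i.e. models/<lang>/<seed>/checkpoint-<step>/):
#
#   python 2-evaluate.py --lang it --seeds 110,221 --device cuda --batch_size 16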