-
Notifications
You must be signed in to change notification settings - Fork 0
/
adjust_hallucination.py
148 lines (125 loc) · 5.24 KB
/
adjust_hallucination.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import pandas as pd
import json
from glob import glob
from datasets import Dataset, Image
import os
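# each cooccurrence record has 6 meaningful keys (1-6 below); keys 7-11 are used by other subsets and are set to "" in this script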
# common keys:
# 1. question
# 2. image
# 3. id
# 4. task
# cooccurrence has:
# 5. label
# 6. target
# misleading and ocr have:
# 7. keyword
# counterfactual, distraction and natural have:
# 8. answer
# 9. bbox
# 10. natural_question
# 11. natural_answer
image_folder_path = "/Users/yuanlingzhi/Desktop/change_data/data/image-to-text/hallucination/cooccurrence/images"
task_list = ["action", "attribute", "count", "identification", "spatial"]

# Folder that contains one sub-folder per split, each holding "<task>.json" files.
_cooccurrence_root = "/Users/yuanlingzhi/Desktop/change_data/data/image-to-text/hallucination/cooccurrence"
# Folder that receives the merged "cooccurrence_<split>.json" files.
_output_root = "/Users/yuanlingzhi/Desktop/change_data/MMDT_i2t_data/test_data/hallucination"
# Split sub-folders, processed in this order.
_split_names = ("high_cooc", "historical_bias", "low_cooc")


def _normalize_record(data, task):
    """Rewrite one raw cooccurrence record in place and return it.

    Args:
        data: dict parsed from one JSON line; must contain the keys
            "target", "image_path", "idx" and "prompt".
        task: task name stored under the "task" key.

    Returns:
        The same dict, with "image_path"/"idx"/"prompt" replaced by
        "image"/"id"/"question" and the placeholder keys added.

    Raises:
        KeyError: if one of the required keys is missing.
    """
    # The order of the assignments below fixes the key order of the output
    # JSON, so it is kept exactly as in the original script.
    data["task"] = task
    data["target"] = str(data["target"])
    data["image"] = os.path.join(image_folder_path, data.pop("image_path"))
    # rename the key "idx" to "id"
    data["id"] = data.pop("idx")
    data["keyword"] = ""
    # rename "prompt" to "question" and replace the typographic apostrophe
    # U+2019 with a plain ASCII "'"
    data["question"] = data.pop("prompt").replace("\u2019", "'")
    # keys that cooccurrence data does not use are filled with ""
    data["answer"] = ""
    data["bbox"] = ""
    data["natural_question"] = ""
    data["natural_answer"] = ""
    return data


def load_split(json_root_path):
    """Read every "<task>.json" file of one split and normalize its records.

    Args:
        json_root_path: directory containing one JSON-lines file per entry
            of ``task_list``.

    Returns:
        A list of normalized record dicts, grouped by task in ``task_list``
        order and in file order within each task.

    Raises:
        FileNotFoundError: if a task file is missing.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    records = []
    for task in task_list:
        file_path = os.path.join(json_root_path, f"{task}.json")
        # Explicit UTF-8: the prompts may contain non-ASCII characters such
        # as U+2019, so decoding must not depend on the platform locale.
        with open(file_path, "r", encoding="utf-8") as f:
            # each non-blank line is a json object
            for line in f:
                if not line.strip():
                    # tolerate blank / trailing empty lines
                    continue
                records.append(_normalize_record(json.loads(line), task))
    return records


def publish_split(split_name):
    """Merge one split into a single JSON file and push it to the Hub.

    Args:
        split_name: sub-folder name under the cooccurrence root, e.g.
            "high_cooc"; the pushed split is "cooccurrence_<split_name>".
    """
    data_list = load_split(f"{_cooccurrence_root}/{split_name}")
    # save the data to a .json file
    file_path = f"{_output_root}/cooccurrence_{split_name}.json"
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data_list, f, indent=4)
    dataset = Dataset.from_json(file_path)
    # cast the "image" column (file paths) to the Image feature
    dataset = dataset.cast_column("image", Image())
    # push the dataset to huggingface repository
    dataset.push_to_hub(
        "YuanXiaopang/test_mmdt",
        config_name="hallucination",
        split=f"cooccurrence_{split_name}",
    )


def main():
    """Convert and upload every cooccurrence split in turn."""
    for split_name in _split_names:
        publish_split(split_name)


# Guarded so that importing this file does not write files or upload data;
# running it as a script behaves as before.
if __name__ == "__main__":
    main()