-
Notifications
You must be signed in to change notification settings - Fork 0
/
format_dataset.py
60 lines (45 loc) · 1.67 KB
/
format_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from pathlib import Path
from tqdm import tqdm
import json
import datasets
from typing import Dict, Type, List
import argparse
def parse_cla():
    """
    parses command-line arguments

    command-line options:
    -save_path -- path of the jsonl file to write, converted to a Path

    the option is required: without it save_path would be None and the
    script would only fail when the output file is opened, after the
    whole dataset has already been loaded and formatted

    returns the argparse namespace produced by parse_args()
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-save_path",
        type=Path,
        required=True,
        help="path of the jsonl file to write",
    )
    return parser.parse_args()
def dataset_dict(content:str, summary:str) -> Dict:
    """
    builds one prompt/completion record for summarization, wrapping the
    text and its summary in stanford alpaca style section headers

    keyword arguments:
    content -- text which should be summarized
    summary -- summary of the content

    returns a dictionary with the keys "prompt" and "completion"
    """
    return {
        "prompt": f"### Instruction \nWrite a concise summary of the following text \n### Input \n{content}",
        "completion": f"### Output {summary}",
    }
def load_dataset() -> datasets.DatasetDict:
    """
    returns the "webis/tldr-17" dataset loaded through datasets.load_dataset

    no split is requested, so the result is expected to be a mapping of
    split name to dataset rather than a single dataset
    """
    return datasets.load_dataset("webis/tldr-17")
def save_list(tldr_dataset:datasets.DatasetDict) -> List:
    """
    converts the train split into a list of prompt/completion dictionaries

    keyword arguments:
    tldr_dataset -- dataset whose "train" split yields rows with a
                    "content" and a "summary" field

    returns one dataset_dict() result per row, in iteration order
    """
    # built in a single expression so no local shadows the function's
    # own name; tqdm only wraps the iteration with a progress bar
    return [
        dataset_dict(content=row["content"], summary=row["summary"])
        for row in tqdm(tldr_dataset["train"])
    ]
def save_jsonl(save_list:List, save_path:Path):
    """
    writes the dictionaries to a jsonl file, one json object per line

    keyword arguments:
    save_list -- json-serializable dictionaries to write
    save_path -- path of the output file; an existing file is overwritten
                 and missing parent directories are created
    """
    save_path = Path(save_path)
    # create the target folder up front so the write cannot fail on a
    # missing directory after all records have been prepared
    save_path.parent.mkdir(parents=True, exist_ok=True)
    # explicit encoding keeps the file independent of the platform default
    with open(save_path, mode="w", encoding="utf-8") as opened_jsonl:
        for json_dict in save_list:
            opened_jsonl.write(json.dumps(json_dict) + "\n")
def main():
    """
    runs the script: parses the command-line arguments, loads the
    dataset, formats it into prompt/completion dictionaries and writes
    them to the requested jsonl file
    """
    cli_args = parse_cla()
    records = save_list(tldr_dataset=load_dataset())
    save_jsonl(save_list=records, save_path=cli_args.save_path)
# only run the conversion when executed as a script, not when imported
if __name__ == "__main__":
    main()