import torch
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer
from datasets import load_dataset
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler
import wandb

def build_dataset(config, dataset_name="imdb", input_min_text_length=2, input_max_text_length=8):
    # Build the dataset used for training.
    # It is a series of prompts (each with a different, randomly chosen length).
    # We will use it to generate the responses and compute the rewards.
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token
    # Load the IMDB dataset
    ds = load_dataset(dataset_name, split="train")
    ds = ds.rename_columns({"text": "review"})
    # Only keep reviews longer than 200 characters
    ds = ds.filter(lambda x: len(x["review"]) > 200, batched=False)
    input_size = LengthSampler(input_min_text_length, input_max_text_length)

    def tokenize(sample):
        # From each review keep only the first `input_size()` tokens; this is the prompt used to generate the response
        sample["input_ids"] = tokenizer.encode(sample["review"])[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds
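
# Illustrative example (values are made up, not executed): after `build_dataset`,
# each sample looks roughly like
#   {"review": "<full review text>", "input_ids": tensor([...]), "query": "<decoded prompt tokens>"}
# where `query` is simply the decoded version of `input_ids`.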

def collator(data):
    # Batch a list of sample dicts into a dict of lists (one list per key).
    return dict((key, [d[key] for d in data]) for key in data[0])
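
# Illustrative example (not executed): the collator turns a list of samples into a dict of lists, e.g.
#   collator([{"query": "A", "input_ids": t1}, {"query": "B", "input_ids": t2}])
#   -> {"query": ["A", "B"], "input_ids": [t1, t2]}
# This keeps the variable-length query tensors as a plain list instead of padding them into one tensor.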

if __name__ == "__main__":
    config = PPOConfig(
        model_name="lvwerra/gpt2-imdb",
        learning_rate=1.41e-5,
        log_with="wandb",
    )
    wandb.init()

    dataset = build_dataset(config)

    # This is the model we are going to fine-tune with PPO
    model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
    # This is the reference model (frozen), used for the KL divergence penalty
    ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token

    ppo_trainer = PPOTrainer(config, model, ref_model, tokenizer, dataset=dataset, data_collator=collator)

    device = ppo_trainer.accelerator.device
    if ppo_trainer.accelerator.num_processes == 1:
        device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug

    # This is the reward model: a "positive" response (e.g. a positive review) is given a high reward,
    # a "negative" response is given a low reward.
    sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb", device=device)

    # `function_to_apply="none"` returns the raw logits (no softmax); `return_all_scores=True` returns one score per class.
    sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16}

    # Print some examples of sentiment scores produced by the reward model
    text = "this movie was really bad!!"
    print(sentiment_pipe(text, **sent_kwargs))
    text = "this movie was really good!!"
    print(sentiment_pipe(text, **sent_kwargs))  # [{'label': 'NEGATIVE', 'score': -2.335047960281372}, {'label': 'POSITIVE', 'score': 2.557039737701416}]
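
    # Note: the ordering shown above (NEGATIVE first, POSITIVE second) is what the reward
    # extraction in the training loop (`output[1]["score"]`) relies on.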

    output_min_length = 4
    output_max_length = 30
    output_length_sampler = LengthSampler(output_min_length, output_max_length)

    # The configuration used to generate the responses (trajectories)
    response_generation_kwargs = {
        "min_length": -1,
        "top_k": 0.0,
        "top_p": 1.0,
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
    }
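    # With `do_sample=True`, `top_k=0` and `top_p=1.0`, tokens are sampled from the full, untruncated
    # distribution of the policy; `min_length=-1` disables any minimum-length constraint, and
    # `pad_token_id` is set to the EOS token because GPT-2 has no dedicated padding token.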

    for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
        query_tensors = batch["input_ids"]

        #### Phase 1: Get trajectories from the offline policy
        # Here we only generate the responses; the log probabilities are computed internally by the PPOTrainer.
        response_tensors = []
        for query in query_tensors:
            gen_len = output_length_sampler()
            response_generation_kwargs["max_new_tokens"] = gen_len  # Number of tokens to generate (chosen randomly)
            response = ppo_trainer.generate(query, **response_generation_kwargs)  # Returns the (query + response) tokens
            response_tensors.append(response.squeeze()[-gen_len:])  # Keep only the generated response tokens (drop the prompt/query at the beginning)
        batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

        #### Phase 1: Compute the rewards
        # Join the query (prompt) and the response (generated tokens)
        texts = [q + r for q, r in zip(batch["query"], batch["response"])]
        # Compute the reward for each text (query + response).
        # For each text the pipeline returns a list of two dicts, one per class (NEGATIVE and POSITIVE),
        # e.g. [{'label': 'NEGATIVE', 'score': -2.335047960281372}, {'label': 'POSITIVE', 'score': 2.557039737701416}]
        pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
        # The reward for each text is the score (logit) of the POSITIVE class:
        # a list of scalars, one per generated response.
        # This means the reward is assigned to the whole response, not to each token.
        rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]

        #### Phase 1 + Phase 2: calculate the log probabilities and then run the PPO update
        stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
        ppo_trainer.log_stats(stats, batch, rewards)

    model.save_pretrained("gpt2-imdb-pos-v2", push_to_hub=False)
    tokenizer.save_pretrained("gpt2-imdb-pos-v2", push_to_hub=False)
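
    # --- Usage sketch (illustrative, commented out) ---
    # Assuming the saves above succeeded, the tuned policy can be reloaded and sampled from
    # like any causal LM. The names below are only an example:
    #
    #   from transformers import AutoModelForCausalLM
    #   tuned_model = AutoModelForCausalLM.from_pretrained("gpt2-imdb-pos-v2")
    #   tuned_tokenizer = AutoTokenizer.from_pretrained("gpt2-imdb-pos-v2")
    #   prompt = tuned_tokenizer("This movie was", return_tensors="pt")
    #   out = tuned_model.generate(**prompt, max_new_tokens=30, do_sample=True, top_p=1.0)
    #   print(tuned_tokenizer.decode(out[0]))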