import torch
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer
from datasets import load_dataset
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler
import wandb

def build_dataset(config, dataset_name="imdb", input_min_text_length=2, input_max_text_length=8):
    # Build the dataset used for training.
    # It is a series of prompts (each with a different, randomly chosen length).
    # We will use it to generate the responses and compute the rewards.
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token
    # Load the IMDB dataset
    ds = load_dataset(dataset_name, split="train")
    ds = ds.rename_columns({"text": "review"})
    # Only keep reviews longer than 200 characters
    ds = ds.filter(lambda x: len(x["review"]) > 200, batched=False)
    input_size = LengthSampler(input_min_text_length, input_max_text_length)

    def tokenize(sample):
        # From each review keep only the first `input_size()` tokens; this is the prompt used to generate the response
        sample["input_ids"] = tokenizer.encode(sample["review"])[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds
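
# Illustrative example (values are made up, not executed): after `build_dataset`,
# each sample looks roughly like
#   {"review": "<full review text>", "input_ids": tensor([...]), "query": "<decoded prompt tokens>"}
# where `query` is simply the decoded version of `input_ids`.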

def collator(data):
    # Batch a list of sample dicts into a dict of lists (one list per key).
    return dict((key, [d[key] for d in data]) for key in data[0])
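
# Illustrative example (not executed): the collator turns a list of samples into a dict of lists, e.g.
#   collator([{"query": "A", "input_ids": t1}, {"query": "B", "input_ids": t2}])
#   -> {"query": ["A", "B"], "input_ids": [t1, t2]}
# This keeps the variable-length query tensors as a plain list instead of padding them into one tensor.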

if __name__ == "__main__":
    config = PPOConfig(
        model_name="lvwerra/gpt2-imdb",
        learning_rate=1.41e-5,
        log_with="wandb",
    )
    wandb.init()

    dataset = build_dataset(config)

    # This is the model we are going to fine-tune with PPO
    model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
    # This is the reference model (frozen), used for the KL divergence penalty
    ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token

    ppo_trainer = PPOTrainer(config, model, ref_model, tokenizer, dataset=dataset, data_collator=collator)

    device = ppo_trainer.accelerator.device
    if ppo_trainer.accelerator.num_processes == 1:
        device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug

    # This is the reward model: a "positive" response (e.g. a positive review) is given a high reward,
    # a "negative" response is given a low reward.
    sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb", device=device)

    # `function_to_apply="none"` returns the raw logits (no softmax); `return_all_scores=True` returns one score per class.
    sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16}

    # Print some examples of sentiment scores produced by the reward model
    text = "this movie was really bad!!"
    print(sentiment_pipe(text, **sent_kwargs))
    text = "this movie was really good!!"
    print(sentiment_pipe(text, **sent_kwargs))  # [{'label': 'NEGATIVE', 'score': -2.335047960281372}, {'label': 'POSITIVE', 'score': 2.557039737701416}]
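
    # Note: the ordering shown above (NEGATIVE first, POSITIVE second) is what the reward
    # extraction in the training loop (`output[1]["score"]`) relies on.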

    output_min_length = 4
    output_max_length = 30
    output_length_sampler = LengthSampler(output_min_length, output_max_length)

    # The configuration used to generate the responses (trajectories)
    response_generation_kwargs = {
        "min_length": -1,
        "top_k": 0.0,
        "top_p": 1.0,
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
    }
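    # With `do_sample=True`, `top_k=0` and `top_p=1.0`, tokens are sampled from the full, untruncated
    # distribution of the policy; `min_length=-1` disables any minimum-length constraint, and
    # `pad_token_id` is set to the EOS token because GPT-2 has no dedicated padding token.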

    for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
        query_tensors = batch["input_ids"]

        #### Phase 1: Get trajectories from the offline policy
        # Here we only generate the responses; the log probabilities are computed internally by the PPOTrainer.
        response_tensors = []
        for query in query_tensors:
            gen_len = output_length_sampler()
            response_generation_kwargs["max_new_tokens"] = gen_len  # Number of tokens to generate (chosen randomly)
            response = ppo_trainer.generate(query, **response_generation_kwargs)  # Returns the (query + response) tokens
            response_tensors.append(response.squeeze()[-gen_len:])  # Keep only the generated response tokens (drop the prompt/query at the beginning)
        batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

        #### Phase 1: Compute the rewards
        # Join the query (prompt) and the response (generated tokens)
        texts = [q + r for q, r in zip(batch["query"], batch["response"])]
        # Compute the reward for each text (query + response).
        # For each text the pipeline returns a list of two dicts, one per class (NEGATIVE and POSITIVE),
        # e.g. [{'label': 'NEGATIVE', 'score': -2.335047960281372}, {'label': 'POSITIVE', 'score': 2.557039737701416}]
        pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
        # The reward for each text is the score (logit) of the POSITIVE class:
        # a list of scalars, one per generated response.
        # This means the reward is assigned to the whole response, not to each token.
        rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]

        #### Phase 1 + Phase 2: calculate the log probabilities and then run the PPO update
        stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
        ppo_trainer.log_stats(stats, batch, rewards)

    model.save_pretrained("gpt2-imdb-pos-v2", push_to_hub=False)
    tokenizer.save_pretrained("gpt2-imdb-pos-v2", push_to_hub=False)
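
    # --- Usage sketch (illustrative, commented out) ---
    # Assuming the saves above succeeded, the tuned policy can be reloaded and sampled from
    # like any causal LM. The names below are only an example:
    #
    #   from transformers import AutoModelForCausalLM
    #   tuned_model = AutoModelForCausalLM.from_pretrained("gpt2-imdb-pos-v2")
    #   tuned_tokenizer = AutoTokenizer.from_pretrained("gpt2-imdb-pos-v2")
    #   prompt = tuned_tokenizer("This movie was", return_tensors="pt")
    #   out = tuned_model.generate(**prompt, max_new_tokens=30, do_sample=True, top_p=1.0)
    #   print(tuned_tokenizer.decode(out[0]))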