diff --git a/pretrain.py b/pretrain.py
index 9e3f58f..26c15e4 100644
--- a/pretrain.py
+++ b/pretrain.py
@@ -37,6 +37,7 @@ def main():
                          help="The buffer size of instances in memory.")
     parser.add_argument("--dropout", type=float, default=0.1, help="Dropout value.")
     parser.add_argument("--seed", type=int, default=7, help="Random seed.")
+    parser.add_argument("--train_embedding_only", action="store_true")
 
     # Preprocess options.
     tokenizer_opts(parser)
diff --git a/scripts/convert_model_into_blocks.py b/scripts/convert_model_into_blocks.py
index bb65147..7ac1df8 100644
--- a/scripts/convert_model_into_blocks.py
+++ b/scripts/convert_model_into_blocks.py
@@ -28,7 +28,7 @@
 state_dict = collections.OrderedDict()
 filename = f"tencentpretrain_model-0.bin"
 for k, v in input_model.items():
-    state_dict[k] = v
+    state_dict[k] = v.bfloat16()
     index_dict["weight_map"][k] = filename
     param_count += v.numel()
     file_count += v.numel()
diff --git a/tencentpretrain/trainer.py b/tencentpretrain/trainer.py
index 3b768a3..0feff00 100755
--- a/tencentpretrain/trainer.py
+++ b/tencentpretrain/trainer.py
@@ -584,6 +584,16 @@ def worker(local_rank, gpu_ranks, args, model_for_training, model_for_dataloader
 
     # Build optimizer.
     param_optimizer = list(model_for_training.named_parameters())
+    if args.train_embedding_only:
+        optimizer_grouped_parameters = [
+            {"params": [p for n, p in param_optimizer if 'embedding' in n or "output_layer" in n]},
+            #{"params": [p for n, p in param_optimizer if 'embedding' not in n and "output_layer" not in n], "lr": 0}
+        ]
+        #print(optimizer_grouped_parameters)
+        for n, p in list(model_for_training.named_parameters()):
+            if 'embedding' not in n and "output_layer" not in n:
+                p.requires_grad = False
+
     if args.use_lora:
         optimizer_grouped_parameters = [
             {"params": [p for n, p in param_optimizer if 'lora' in n]}
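
For context: the trainer.py hunk is the standard PyTorch recipe for training only the embedding and output (LM head) parameters. It sets requires_grad = False on every parameter whose name contains neither "embedding" nor "output_layer", and hands the optimizer a single group containing only the trainable parameters, so passing the new --train_embedding_only flag to pretrain.py freezes the rest of the network. Below is a minimal standalone sketch of that pattern; the toy module names, sizes, and the AdamW optimizer are illustrative assumptions, not TencentPretrain code.

import torch
import torch.nn as nn

# Toy stand-in for a TencentPretrain-style LM; only the name-matching
# convention ("embedding" / "output_layer") mirrors the patch above.
model = nn.ModuleDict({
    "embedding": nn.Embedding(32000, 512),
    "encoder": nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True),
    "output_layer": nn.Linear(512, 32000),
})

# Freeze everything except embedding and output_layer parameters,
# mirroring the requires_grad loop added in trainer.py.
for name, param in model.named_parameters():
    if "embedding" not in name and "output_layer" not in name:
        param.requires_grad = False

# Build the optimizer over the trainable group only, as the new
# optimizer_grouped_parameters list does (optimizer choice is illustrative).
trainable = [p for n, p in model.named_parameters()
             if "embedding" in n or "output_layer" in n]
optimizer = torch.optim.AdamW([{"params": trainable}], lr=1e-4)

print(sum(p.numel() for p in model.parameters() if p.requires_grad), "trainable /",
      sum(p.numel() for p in model.parameters()), "total parameters")

The convert_model_into_blocks.py change is independent of this: it casts each tensor to bfloat16 (torch.Tensor.bfloat16()) before the shard is written out, which halves the checkpoint size when the source weights are fp32.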