Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

mlflow logging integration with yolox training #1773

Merged
merged 16 commits into from
Jul 11, 2024
65 changes: 65 additions & 0 deletions docs/mlflow_integration.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
## MLFlow Integration
YOLOX now supports MLFlow integration. MLFlow is an open-source platform for managing the end-to-end machine learning lifecycle. It is designed to work with any ML library, algorithm, deployment tool, or language. MLFlow can be used to track experiments, metrics, and parameters, and to log and visualize model artifacts. \
For more information, please refer to: [MLFlow Documentation](https://www.mlflow.org/docs/latest/index.html)

## Follow these steps to start logging your experiments to MLFlow:
### Step-1: Install MLFlow via pip
```bash
pip install mlflow python-dotenv
```

### Step-2: Set up MLFlow Tracking Server
Start or connect to an MLFlow tracking server (for example, Databricks). You can start a local tracking server by running the following command:
```bash
mlflow server --host 127.0.0.1 --port 8080
```
Read more about setting up MLFlow tracking server [here](https://mlflow.org/docs/latest/tracking/server.html#mlflow-tracking-server)

### Step-3: Set up MLFlow Environment Variables
Set the following environment variables in your `.env` file:
```bash
MLFLOW_TRACKING_URI="http://127.0.0.1:8080" # set to your mlflow server URI (matches the server started above)
MLFLOW_EXPERIMENT_NAME="/path/to/experiment" # set to your experiment name
MLFLOW_TAGS={"release.candidate": "DEV1", "release.version": "0.0.0"}
# config related to logging model to mlflow as pyfunc
YOLOX_MLFLOW_LOG_MODEL_ARTIFACTS="True" # whether to log model (best or historical) or not
YOLOX_MLFLOW_LOG_MODEL_PER_n_EPOCHS=30 # try logging model only after every n epochs
YOLOX_MLFLOW_LOG_Nth_EPOCH_MODELS="False" # whether to log step model along with best_model or not
YOLOX_MLFLOW_RUN_NAME="" # give a custom name to your run, otherwise a random name is assigned by mlflow
YOLOX_MLFLOW_FLATTEN_PARAMS="True" # flatten any sub sub params of dict to be logged as simple key value pair


MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING=True # log system metrics such as GPU usage
MLFLOW_NESTED_RUN="False" # whether to run as a nested run of the given run_id
MLFLOW_RUN_ID="" # continue training from a given run_id
```
### Step-4: Provide --logger "mlflow" to the training script
```bash
python tools/train.py -l mlflow -f exps/path/to/exp.py -d 1 -b 8 --fp16 -o -c pre_trained_model/<model>.pth
# note the -l mlflow flag
# one working example is this
python tools/train.py -l mlflow -f exps/example/custom/yolox_s.py -d 1 -b 8 --fp16 -o -c pre_trained_model/yolox_s.pth
```
### Step-5 (optional): start the mlflow ui and track your experiments
If you log runs to a local mlruns directory, run the following command in the directory above it, then access http://127.0.0.1:5000 in your browser.

```bash
mlflow ui --port 5000
```

Im-Himanshu marked this conversation as resolved.
Show resolved Hide resolved
## Optional Databricks Integration

### Step-1: Install Databricks sdk
```bash
pip install databricks-sdk
```

### Step-2: Set up Databricks Environment Variables
Set the following environment variables in your `.env` file:
```bash
MLFLOW_TRACKING_URI="databricks" # set to databricks
MLFLOW_EXPERIMENT_NAME="/Users/<user>/<experiment_name>/"
DATABRICKS_HOST="https://dbc-1234567890123456.cloud.databricks.com" # set to your server URI (no spaces around "=" in .env files)
DATABRICKS_TOKEN="dapixxxxxxxxxxxxx"
```
2 changes: 1 addition & 1 deletion tools/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def make_parser():
"--logger",
type=str,
help="Logger to be used for metrics. \
Implemented loggers include `tensorboard` and `wandb`.",
Implemented loggers include `tensorboard`, `mlflow` and `wandb`.",
default="tensorboard"
)
parser.add_argument(
Expand Down
42 changes: 40 additions & 2 deletions yolox/core/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from yolox.exp import Exp
from yolox.utils import (
MeterBuffer,
MlflowLogger,
ModelEMA,
WandbLogger,
adjust_status,
Expand Down Expand Up @@ -74,7 +75,8 @@ def train(self):
self.before_train()
try:
self.train_in_epoch()
except Exception:
except Exception as e:
logger.error("Exception in training: ", e)
raise
finally:
self.after_train()
Expand Down Expand Up @@ -185,8 +187,11 @@ def before_train(self):
self.exp,
self.evaluator.dataloader.dataset
)
elif self.args.logger == "mlflow":
self.mlflow_logger = MlflowLogger()
self.mlflow_logger.setup(args=self.args, exp=self.exp)
else:
raise ValueError("logger must be either 'tensorboard' or 'wandb'")
raise ValueError("logger must be either 'tensorboard', 'mlflow' or 'wandb'")

logger.info("Training start...")
logger.info("\n{}".format(model))
Expand All @@ -198,6 +203,16 @@ def after_train(self):
if self.rank == 0:
if self.args.logger == "wandb":
self.wandb_logger.finish()
elif self.args.logger == "mlflow":
metadata = {
"epoch": self.epoch + 1,
"input_size": self.input_size,
'start_ckpt': self.args.ckpt,
'exp_file': self.args.exp_file,
"best_ap": float(self.best_ap)
}
self.mlflow_logger.on_train_end(self.args, file_name=self.file_name,
metadata=metadata)

def before_epoch(self):
logger.info("---> start train epoch{}".format(self.epoch + 1))
Expand Down Expand Up @@ -276,6 +291,10 @@ def after_iter(self):
"train/lr": self.meter["lr"].latest
})
self.wandb_logger.log_metrics(metrics, step=self.progress_in_iter)
if self.args.logger == 'mlflow':
logs = {"train/" + k: v.latest for k, v in loss_meter.items()}
logs.update({"train/lr": self.meter["lr"].latest})
self.mlflow_logger.on_log(self.args, self.exp, self.epoch+1, logs)

self.meter.clear_meters()

Expand Down Expand Up @@ -351,13 +370,32 @@ def evaluate_and_save_model(self):
"train/epoch": self.epoch + 1,
})
self.wandb_logger.log_images(predictions)
if self.args.logger == "mlflow":
logs = {
"val/COCOAP50": ap50,
"val/COCOAP50_95": ap50_95,
"val/best_ap": round(self.best_ap, 3),
"train/epoch": self.epoch + 1,
}
self.mlflow_logger.on_log(self.args, self.exp, self.epoch+1, logs)
logger.info("\n" + summary)
synchronize()

self.save_ckpt("last_epoch", update_best_ckpt, ap=ap50_95)
if self.save_history_ckpt:
self.save_ckpt(f"epoch_{self.epoch + 1}", ap=ap50_95)

if self.args.logger == "mlflow":
metadata = {
"epoch": self.epoch + 1,
"input_size": self.input_size,
'start_ckpt': self.args.ckpt,
'exp_file': self.args.exp_file,
"best_ap": float(self.best_ap)
}
self.mlflow_logger.save_checkpoints(self.args, self.exp, self.file_name, self.epoch,
metadata, update_best_ckpt)

def save_ckpt(self, ckpt_name, update_best_ckpt=False, ap=None):
if self.rank == 0:
save_model = self.ema_model.ema if self.use_model_ema else self.model
Expand Down
1 change: 1 addition & 0 deletions yolox/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .logger import WandbLogger, setup_logger
from .lr_scheduler import LRScheduler
from .metric import *
from .mlflow_logger import MlflowLogger
from .model_utils import *
from .setup_env import *
from .visualize import *
Loading
Loading