diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..e69de29 diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..0f72be9 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,8 @@ +# Users referenced in this file will automatically be requested as reviewers for PRs that modify the given paths. +# See https://help.github.com/articles/about-code-owners/ + +# Code +* @andreysher @savchenkoyana + +# Actions +/.github @senysenyseny16 diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..3bdd061 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,25 @@ +name: Lint + +on: + push: + branches: + - main + pull_request: + +jobs: + lint-python-format: + name: Python format + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + with: + python-version: "3.9" + - uses: psf/black@stable + with: + options: "--check --diff" + - uses: isort/isort-action@master + with: + configuration: + --check + --diff diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f86a48a --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +**/__pycache__ +*.pyc +*vscode +**/build +evaluate +**/dist +**/*egg-info +*.lprof +.idea +*.avi diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..7dfe610 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,59 @@ +repos: +- repo: https://github.com/psf/black + rev: 23.3.0 + hooks: + - id: black +- repo: https://github.com/PyCQA/isort + rev: 5.12.0 + hooks: + - id: isort + args: + [ + "--force-single-line-imports", + "--ensure-newline-before-comments", + "--line-length=120", + ] +- repo: https://github.com/asottile/pyupgrade + rev: v3.8.0 + hooks: + - id: pyupgrade +- repo: https://github.com/PyCQA/docformatter + rev: v1.7.3 + hooks: + - id: docformatter + additional_dependencies: [tomli] + args: + [ + "--in-place", + "--config", + "pyproject.toml", + ] +- repo: https://github.com/executablebooks/mdformat + rev: 0.7.16 + hooks: + - id: mdformat + additional_dependencies: + - mdformat-gfm + - mdformat-black +- repo: https://github.com/koalaman/shellcheck-precommit + rev: v0.9.0 + hooks: + - id: shellcheck + exclude: evaluation/culane +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: check-yaml + - id: check-toml + - id: check-json + - id: check-ast + - id: fix-byte-order-marker + - id: end-of-file-fixer + - id: trailing-whitespace + - id: requirements-txt-fixer + - id: check-added-large-files + - id: check-case-conflict + - id: check-merge-conflict + - id: detect-private-key + - id: no-commit-to-branch + args: ["-b=main"] diff --git a/INSTALL.md b/INSTALL.md new file mode 100644 index 0000000..93a957c --- /dev/null +++ b/INSTALL.md @@ -0,0 +1,145 @@ +# Install + +1. Clone the project + + ```Shell + git clone https://github.com/cfzd/Ultra-Fast-Lane-Detection-V2 + cd Ultra-Fast-Lane-Detection-V2 + ``` + +1. Create a conda virtual environment and activate it + + ```Shell + conda create -n lane-det python=3.7 -y + conda activate lane-det + ``` + +1. 
Install dependencies + + ```Shell + # If you don't have PyTorch: + conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia + + pip install -r requirements.txt + + # Install NVIDIA DALI (a very fast data loading library): + pip install --extra-index-url https://developer.download.nvidia.com/compute/redist --upgrade nvidia-dali-cuda110 + + cd my_interp + + sh build.sh + # If this fails, you might need to upgrade your GCC to v7.3.0 + ``` + +1. Data preparation + + #### **4.1 Tusimple dataset** + + Download [CULane](https://xingangpan.github.io/projects/CULane.html), [Tusimple](https://github.com/TuSimple/tusimple-benchmark/issues/3), or [CurveLanes](https://github.com/SoulmateB/CurveLanes) as needed. The directory arrangement of Tusimple should look like (`test_label.json` can be downloaded from [here](https://github.com/TuSimple/tusimple-benchmark/issues/3)): + + ``` + $TUSIMPLE + |──clips + |──label_data_0313.json + |──label_data_0531.json + |──label_data_0601.json + |──test_tasks_0627.json + |──test_label.json + |──readme.md + ``` + + For Tusimple, segmentation annotations are not provided, so we need to generate them from the JSON annotations. + + ```Shell + python scripts/convert_tusimple.py --root /path/to/your/tusimple + + # this will generate segmentations and two list files: train_gt.txt and test.txt + ``` + + #### **4.2 CULane dataset** + + The directory arrangement of CULane should look like: + + ``` + $CULANE + |──driver_100_30frame + |──driver_161_90frame + |──driver_182_30frame + |──driver_193_90frame + |──driver_23_30frame + |──driver_37_30frame + |──laneseg_label_w16 + |──list + ``` + + For CULane, please run: + + ```Shell + python scripts/cache_culane_ponits.py --root /path/to/your/culane + + # this will generate a culane_anno_cache.json file containing all the lane annotations, which can be used to speed up training without reading lane segmentation maps + ``` + + #### **4.3 CurveLanes dataset** + + The directory arrangement of CurveLanes should look like: + + ``` + $CurveLanes + |──test + |──train + |──valid + ``` + + For CurveLanes, please run: + + ```Shell + python scripts/convert_curvelanes.py --root /path/to/your/curvelanes + + python scripts/make_curvelane_as_culane_test.py --root /path/to/your/curvelanes + + # this will also generate a curvelanes_anno_cache_train.json file. Moreover, many .lines.txt files will be generated on the val set to enable CULane-style evaluation. + ``` + +1. Install CULane evaluation tools (only required for testing). + + If you just want to train a model or make a demo, this tool is not necessary and you can skip this step. If you want to get the evaluation results on CULane, you should install this tool. + + This tool requires OpenCV C++. Please follow [this guide](https://docs.opencv.org/master/d7/d9f/tutorial_linux_install.html) to install it. ***When you build OpenCV, remove the Anaconda paths from PATH, or the build will fail.*** + + ```Shell + # First you need to install OpenCV C++. + # After installation, make a soft link of the OpenCV include path. + + ln -s /usr/local/include/opencv4/opencv2 /usr/local/include/opencv2 + ``` + + We provide three compile pipelines to build the CULane evaluation tool. + + Option 1: + + ```Shell + cd evaluation/culane + make + ``` + + Option 2: + + ```Shell + cd evaluation/culane + mkdir build && cd build + cmake .. + make + mv culane_evaluator ../evaluate + ``` + + For Windows users: + + ```Shell + mkdir build-vs2017 + cd build-vs2017 + cmake .. 
-G "Visual Studio 15 2017 Win64" + cmake --build . --config Release + # or, open the "xxx.sln" file by Visual Studio and click build button + move culane_evaluator ../evaluate + ``` diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..b75ce74 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 zequn qin + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..2b12b85 --- /dev/null +++ b/README.md @@ -0,0 +1,219 @@ +# Ultra-Fast-Lane-Detection-V2 + +This README shows how to perform hardware-aware optimization of Ultra-Fast-Lane-Detection-V2 ResNet-18 model on +[TuSimple](https://github.com/TuSimple/tusimple-benchmark/tree/master/doc/lane_detection) dataset. + +## Setup the environment + +The repository code was tested on Python 3.8. + +To get started, install `torch==1.13.1` and `torchvision==0.14.1` compatible with your CUDA using +[the instruction](https://pytorch.org/get-started/previous-versions/#v1131) from the official site. + +The repository is based on two main packages: + +- [ENOT Framework](https://enot-autodl.rtd.enot.ai/en/v3.3.2/) — a flexible tool for Deep Learning developers which automates neural architecture optimization. +- [ENOT Latency Server](https://enot-autodl.rtd.enot.ai/en/latest/latency_server.html) — small open-source package that provides simple API for latency measurement on remote device. + +Follow [the installation guide](https://enot-autodl.rtd.enot.ai/en/v3.3.2/installation_guide.html) to install `enot-autodl==3.3.2`. + +To install `enot-latency-server` simply run: + +```bash +pip install enot-latency-server==1.2.0 +``` + +Install other requirements: + +> **_NOTE:_** You must have the same CUDA version on your system as PyTorch's CUDA version. +> We built `my_interp` using CUDA 11.7. + +```bash +pip install -r requirements.txt +# Install NVIDIA DALI - very fast data loading library: +pip install --extra-index-url https://developer.download.nvidia.com/compute/redist --upgrade nvidia-dali-cuda110 + +cd my_interp +# If the following command fails, you might need to add path to your cuda to PATH: +# PATH=/usr/local/cuda-11/bin:$PATH bash build.sh +bash build.sh +``` + +> **_NOTE:_** All pruning/training procedures are performed on x86-64 computer, +> ONLY latency measurements are performed on a remote target device, +> so you do not need to install `enot-autodl` package on the target device, +> you only need to install `enot-latency-server` package for latency measurements. 
+ +## Prepare dataset + +Download the preprocessed TuSimple dataset from [Google Drive](https://drive.google.com/file/d/16Uk7_uRtue9OLaQCuMEfEbS5rhSxUHC1/view?usp=sharing) and unzip it to the repository root: + +```bash +unzip dataset.zip +``` + +The dataset should have the following structure: + +```text +└── ultra-fast-lane-detector-v2 (repository root) + └──dataset + ├── clips + ├── 0313-1 + ├── 0313-2 + ├── 0530 + ├── 0531 + └── 0601 + ├── label_data_0313.json + ├── label_data_0531.json + ├── label_data_0601.json + ├── test_label.json + ├── test_tasks_0627.json + ├── test.txt + ├── train_gt.txt + └── tusimple_anno_cache.json +``` + +If you want to use a custom dataset path, change the `data_root` parameter in `configs/tusimple_res18.py`. + +To train the baseline model, run: + +```bash +bash commands/baseline/train_baseline.sh +``` + +The result of this command is the `model_best.pth` checkpoint in the `runs/baseline` directory. + +Use this command to verify the baseline accuracy: + +```bash +bash commands/baseline/test_baseline.sh +``` + +## Model optimization (Jetson) + +To optimize a model by latency for Jetson, first run our latency server on Jetson (see the [instructions](https://github.com/ENOT-AutoDL/latency-server-nvidia-jetson-agx-orin-devkit)). + +> **_NOTE:_** Substitute `--host` and `--port` in the commands and `.sh` scripts below with the host and port of your server on Jetson. + +### Pruning + +To prune the model, run the corresponding script (x2/x3 denotes the target latency acceleration): + +```bash +bash commands/x2_jetson/prune_x2.sh +bash commands/x3_jetson/prune_x3.sh +``` + +### Tune + +After pruning, the model should be tuned with the corresponding command: + +```bash +bash commands/x2_jetson/tune_x2.sh +bash commands/x3_jetson/tune_x3.sh +``` + +### Accuracy and latency verification + +Use the corresponding command to verify the optimized model accuracy: + +```bash +bash commands/x2_jetson/test_x2.sh +bash commands/x3_jetson/test_x3.sh +``` + +Use the corresponding command to verify the optimized model latency: + +```bash +bash commands/x2_jetson/measure.sh +bash commands/x3_jetson/measure.sh +``` + +### Our optimization results + +Download our checkpoints from [Google Drive](https://drive.google.com/file/d/1OdKo6zMvtHMg7c_nXrsY_1x9z0jBlPYL/view?usp=sharing). + +To extract `checkpoints`, use the following command: + +```bash +unzip ufld_ckpt.zip +``` + +To check their accuracy, run the following commands: + +```bash +python test.py configs/tusimple_res18.py --model_ckpt checkpoints/baseline/model_best.pth +python test.py configs/tusimple_res18.py --model_ckpt checkpoints/x2_jetson/model_best.pth +python test.py configs/tusimple_res18.py --model_ckpt checkpoints/x3_jetson/model_best.pth +``` + +To check their latency, run the following commands: + +```bash +python measure.py --model_ckpt checkpoints/baseline/model_best.pth --host --port 15003 +python measure.py --model_ckpt checkpoints/x2_jetson/model_best.pth --host --port 15003 +python measure.py --model_ckpt checkpoints/x3_jetson/model_best.pth --host --port 15003 +``` + +## Model optimization (TI) + +To optimize a model by latency for Texas Instruments (TI), you need to run a latency server on the TI board and a compile server on an x86 PC (Linux). +The compile server creates binaries for a model and sends them to the latency server. +The latency server measures model latency using these binaries. +Use our [instructions](https://github.com/ENOT-AutoDL/latency-server-ti-tda4-j721exskg01evm) to run the latency and compile servers.
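+ +As a quick check before launching the search, you can confirm that the compile server is reachable from the optimization machine. The host and port below are hypothetical placeholders; substitute your own: + +```bash +nc -zv 192.168.1.42 15003 +```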
+ +> **_NOTE:_** Substitute `--host` and `--port` in the commands and `.sh` scripts below with the host and port of your compile server on the x86 PC. + +### Pruning + +To optimize a model by latency for TI, run the corresponding script (x4 means latency acceleration): + +```bash +bash commands/x4_ti/prune_x4.sh +``` + +### Tune + +After pruning, the model should be tuned with the following command: + +```bash +bash commands/x4_ti/tune_x4.sh +``` + +### Accuracy and latency verification + +Use this command to verify the optimized model accuracy: + +```bash +bash commands/x4_ti/test_x4.sh +``` + +Use this command to verify the optimized model latency: + +```bash +bash commands/x4_ti/measure.sh +``` + +### Our optimization results + +Download our checkpoints from [Google Drive](https://drive.google.com/file/d/1OdKo6zMvtHMg7c_nXrsY_1x9z0jBlPYL/view?usp=sharing). + +To extract `checkpoints`, use the following command: + +```bash +unzip ufld_ckpt.zip +``` + +To check their accuracy, run the following commands: + +```bash +python test.py configs/tusimple_res18.py --model_ckpt checkpoints/baseline/model_best.pth +python test.py configs/tusimple_res18.py --model_ckpt checkpoints/x4_ti/model_best.pth +``` + +To check their latency, run the following commands: + +```bash +python measure.py --model_ckpt checkpoints/baseline/model_best.pth --host --port 15003 --ti_server +python measure.py --model_ckpt checkpoints/x4_ti/model_best.pth --host --port 15003 --ti_server +``` diff --git a/add_model_ckpt.py b/add_model_ckpt.py new file mode 100644 index 0000000..0776d32 --- /dev/null +++ b/add_model_ckpt.py @@ -0,0 +1,19 @@ +import argparse + +import torch + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model_path") + + args = parser.parse_args() + + ckpt = torch.load(args.model_path, map_location="cpu") + + # duplicate the serialized module under the "model_ckpt" key so other scripts can load the model object directly + if isinstance(ckpt["model"], torch.nn.Module): + ckpt["model_ckpt"] = ckpt["model"] + else: + raise ValueError("model key is not nn.Module") + + torch.save(ckpt, args.model_path) diff --git a/commands/baseline/measure.sh b/commands/baseline/measure.sh new file mode 100644 index 0000000..fe8449f --- /dev/null +++ b/commands/baseline/measure.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +python measure.py \ + --model_ckpt runs/baseline/model_best.pth \ + --host localhost \ + --port 15003 diff --git a/commands/baseline/test_baseline.sh b/commands/baseline/test_baseline.sh new file mode 100644 index 0000000..241a632 --- /dev/null +++ b/commands/baseline/test_baseline.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +python test.py \ + configs/tusimple_res18.py \ + --model_ckpt runs/baseline/model_best.pth diff --git a/commands/baseline/train_baseline.sh b/commands/baseline/train_baseline.sh new file mode 100644 index 0000000..74258c3 --- /dev/null +++ b/commands/baseline/train_baseline.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +python train.py \ + configs/tusimple_res18.py \ + --log_path runs/baseline diff --git a/commands/x2_jetson/measure.sh b/commands/x2_jetson/measure.sh new file mode 100644 index 0000000..350e581 --- /dev/null +++ b/commands/x2_jetson/measure.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +python measure.py \ + --model_ckpt runs/jetson/x2/tune/model_best.pth \ + --host localhost \ + --port 15003 diff --git a/commands/x2_jetson/prune_x2.sh b/commands/x2_jetson/prune_x2.sh new file mode 100644 index 0000000..6236528 --- /dev/null +++ b/commands/x2_jetson/prune_x2.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +python prune.py \ + configs/tusimple_res18.py \ + 
--log_path runs/jetson/x2/prune \ + --latency_type server \ + --acceleration 2.0 \ + --n_search_steps 200 \ + --host localhost \ + --port 15003 \ + --model_ckpt runs/baseline/model_best.pth diff --git a/commands/x2_jetson/test_x2.sh b/commands/x2_jetson/test_x2.sh new file mode 100644 index 0000000..5089f53 --- /dev/null +++ b/commands/x2_jetson/test_x2.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +python test.py \ + configs/tusimple_res18.py \ + --model_ckpt runs/jetson/x2/tune/model_best.pth diff --git a/commands/x2_jetson/tune_x2.sh b/commands/x2_jetson/tune_x2.sh new file mode 100644 index 0000000..19c816c --- /dev/null +++ b/commands/x2_jetson/tune_x2.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +python train.py \ + configs/tusimple_res18_tune.py \ + --log_path runs/jetson/x2/tune \ + --model_ckpt runs/jetson/x2/prune/model_best.pth \ + --teacher runs/baseline/model_best.pth \ + --distill_loss 2.0 \ + --epoch 200 diff --git a/commands/x3_jetson/measure.sh b/commands/x3_jetson/measure.sh new file mode 100644 index 0000000..0c3366a --- /dev/null +++ b/commands/x3_jetson/measure.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +python measure.py \ + --model_ckpt runs/jetson/x3/tune/model_best.pth \ + --host localhost \ + --port 15003 diff --git a/commands/x3_jetson/prune_x3.sh b/commands/x3_jetson/prune_x3.sh new file mode 100644 index 0000000..555faee --- /dev/null +++ b/commands/x3_jetson/prune_x3.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +python prune.py \ + configs/tusimple_res18.py \ + --log_path runs/jetson/x3/prune \ + --latency_type server \ + --acceleration 3.0 \ + --n_search_steps 200 \ + --host localhost \ + --port 15003 \ + --model_ckpt runs/baseline/model_best.pth diff --git a/commands/x3_jetson/test_x3.sh b/commands/x3_jetson/test_x3.sh new file mode 100644 index 0000000..fe490a2 --- /dev/null +++ b/commands/x3_jetson/test_x3.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +python test.py \ + configs/tusimple_res18.py \ + --model_ckpt runs/jetson/x3/tune/model_best.pth diff --git a/commands/x3_jetson/tune_x3.sh b/commands/x3_jetson/tune_x3.sh new file mode 100644 index 0000000..f1beb9b --- /dev/null +++ b/commands/x3_jetson/tune_x3.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +python train.py \ + configs/tusimple_res18_tune.py \ + --log_path runs/jetson/x3/tune \ + --model_ckpt runs/jetson/x3/prune/model_best.pth \ + --teacher runs/baseline/model_best.pth \ + --distill_loss 2.0 \ + --epoch 200 diff --git a/commands/x4_ti/measure.sh b/commands/x4_ti/measure.sh new file mode 100644 index 0000000..c4bd8bc --- /dev/null +++ b/commands/x4_ti/measure.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +python measure.py \ + --model_ckpt runs/ti/x4/tune/model_best.pth \ + --host localhost \ + --port 15003 \ + --ti_server diff --git a/commands/x4_ti/prune_x4.sh b/commands/x4_ti/prune_x4.sh new file mode 100644 index 0000000..04c3e84 --- /dev/null +++ b/commands/x4_ti/prune_x4.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +python prune.py \ + configs/tusimple_res18.py \ + --log_path runs/ti/x4/prune \ + --latency_type server \ + --acceleration 1.12 \ + --n_search_steps 200 \ + --host localhost \ + --port 15003 \ + --model_ckpt checkpoints/x3_jetson/model_best.pth \ + --ti_compatible diff --git a/commands/x4_ti/test_x4.sh b/commands/x4_ti/test_x4.sh new file mode 100644 index 0000000..827477f --- /dev/null +++ b/commands/x4_ti/test_x4.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +python test.py \ + configs/tusimple_res18.py \ + --model_ckpt runs/ti/x4/tune/model_best.pth diff --git 
a/commands/x4_ti/tune_x4.sh b/commands/x4_ti/tune_x4.sh new file mode 100644 index 0000000..0f7ae24 --- /dev/null +++ b/commands/x4_ti/tune_x4.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +python train.py \ + configs/tusimple_res18_tune.py \ + --log_path runs/ti/x4/tune \ + --model_ckpt runs/ti/x4/prune/model_best.pth \ + --teacher checkpoints/baseline/model_best.pth \ + --distill_loss 2.0 \ + --epoch 200 diff --git a/configs/culane_res18.py b/configs/culane_res18.py new file mode 100644 index 0000000..a454e4c --- /dev/null +++ b/configs/culane_res18.py @@ -0,0 +1,37 @@ +dataset = "CULane" +data_root = "" # Need to be modified before running +epoch = 50 +batch_size = 32 +optimizer = "SGD" +learning_rate = 0.05 +weight_decay = 0.0001 +momentum = 0.9 +scheduler = "multi" +steps = [25, 38] +gamma = 0.1 +warmup = "linear" +warmup_iters = 695 +use_aux = False +griding_num = 200 +backbone = "18" +sim_loss_w = 0.0 +shp_loss_w = 0.0 +note = "" +log_path = "" +finetune = None +resume = None +test_model = "" +test_work_dir = "" +tta = True +num_lanes = 4 +var_loss_power = 2.0 +auto_backup = True +num_row = 72 +num_col = 81 +train_width = 1600 +train_height = 320 +num_cell_row = 200 +num_cell_col = 100 +mean_loss_w = 0.05 +fc_norm = True +crop_ratio = 0.6 diff --git a/configs/culane_res34.py b/configs/culane_res34.py new file mode 100644 index 0000000..38307a5 --- /dev/null +++ b/configs/culane_res34.py @@ -0,0 +1,37 @@ +dataset = "CULane" +data_root = "" # Need to be modified before running +epoch = 50 +batch_size = 32 +optimizer = "SGD" +learning_rate = 0.05 +weight_decay = 0.0001 +momentum = 0.9 +scheduler = "multi" +steps = [25, 38] +gamma = 0.1 +warmup = "linear" +warmup_iters = 695 +use_aux = False +griding_num = 200 +backbone = "34" +sim_loss_w = 0.0 +shp_loss_w = 0.0 +note = "" +log_path = "" +finetune = None +resume = None +test_model = "" +test_work_dir = "" +tta = True +num_lanes = 4 +var_loss_power = 2.0 +auto_backup = True +num_row = 72 +num_col = 81 +train_width = 1600 +train_height = 320 +num_cell_row = 200 +num_cell_col = 100 +mean_loss_w = 0.05 +fc_norm = True +crop_ratio = 0.6 diff --git a/configs/curvelanes_res18.py b/configs/curvelanes_res18.py new file mode 100644 index 0000000..2786588 --- /dev/null +++ b/configs/curvelanes_res18.py @@ -0,0 +1,35 @@ +dataset = "CurveLanes" +data_root = "" # Need to be modified before running +epoch = 50 +batch_size = 8 +optimizer = "SGD" +learning_rate = 0.05 +weight_decay = 0.0001 +momentum = 0.9 +scheduler = "multi" +steps = [25, 38] +gamma = 0.1 +warmup = "linear" +warmup_iters = 695 +use_aux = False +backbone = "18" +sim_loss_w = 1.0 +shp_loss_w = 0.0 +note = "" +log_path = "" +finetune = None +resume = None +test_model = "" +test_work_dir = "" +tta = False +num_lanes = 10 +var_loss_power = 2.0 +auto_backup = True +num_row = 72 +num_col = 41 +train_width = 1600 +train_height = 800 +num_cell_row = 200 +num_cell_col = 100 +mean_loss_w = 0.05 +crop_ratio = 0.8 diff --git a/configs/curvelanes_res34.py b/configs/curvelanes_res34.py new file mode 100644 index 0000000..8e55482 --- /dev/null +++ b/configs/curvelanes_res34.py @@ -0,0 +1,35 @@ +dataset = "CurveLanes" +data_root = "" # Need to be modified before running +epoch = 50 +batch_size = 4 +optimizer = "SGD" +learning_rate = 0.025 +weight_decay = 0.0001 +momentum = 0.9 +scheduler = "multi" +steps = [25, 38] +gamma = 0.1 +warmup = "linear" +warmup_iters = 695 +use_aux = False +backbone = "34" +sim_loss_w = 1.0 +shp_loss_w = 0.0 +note = "" +log_path = "" +finetune = None +resume = None +test_model = 
"" +test_work_dir = "" +tta = False +num_lanes = 10 +var_loss_power = 2.0 +auto_backup = True +num_row = 72 +num_col = 81 +train_width = 1600 +train_height = 800 +num_cell_row = 200 +num_cell_col = 100 +mean_loss_w = 0.05 +crop_ratio = 0.8 diff --git a/configs/tusimple_res18.py b/configs/tusimple_res18.py new file mode 100644 index 0000000..b818a75 --- /dev/null +++ b/configs/tusimple_res18.py @@ -0,0 +1,45 @@ +dataset = "Tusimple" +data_root = "dataset" # Need to be modified before running +epoch = 100 +batch_size = 32 +optimizer = "SGD" +learning_rate = 0.05 +weight_decay = 0.0001 +momentum = 0.9 +scheduler = "cos" +steps = [50, 75] +gamma = 0.1 +warmup = "linear" +warmup_iters = 100 +backbone = "18" +griding_num = 100 +use_aux = False +sim_loss_w = 0.0 +shp_loss_w = 0.0 +note = "" +log_path = "" +finetune = None +resume = None +test_model = "" +test_work_dir = "." +num_lanes = 4 +var_loss_power = 2.0 +auto_backup = True +num_row = 56 +num_col = 41 +train_width = 800 +train_height = 320 +num_cell_row = 100 +num_cell_col = 100 +mean_loss_w = 0.05 +fc_norm = False +soft_loss = True +cls_loss_col_w = 1.0 +cls_ext_col_w = 1.0 +mean_loss_col_w = 0.05 +eval_mode = "normal" +crop_ratio = 0.8 +onnx_path = None +model_ckpt = None +teacher = None +ti_compatible = False diff --git a/configs/tusimple_res18_tune.py b/configs/tusimple_res18_tune.py new file mode 100644 index 0000000..1cf040b --- /dev/null +++ b/configs/tusimple_res18_tune.py @@ -0,0 +1,45 @@ +dataset = "Tusimple" +data_root = "dataset" # Need to be modified before running +epoch = 100 +batch_size = 32 +optimizer = "SGD" +learning_rate = 0.005 +weight_decay = 0.0001 +momentum = 0.9 +scheduler = "cos" +steps = [50, 75] +gamma = 0.1 +warmup = "linear" +warmup_iters = 100 +backbone = "18" +griding_num = 100 +use_aux = False +sim_loss_w = 0.0 +shp_loss_w = 0.0 +note = "" +log_path = "" +finetune = None +resume = None +test_model = "" +test_work_dir = "." 
+num_lanes = 4 +var_loss_power = 2.0 +auto_backup = True +num_row = 56 +num_col = 41 +train_width = 800 +train_height = 320 +num_cell_row = 100 +num_cell_col = 100 +mean_loss_w = 0.05 +fc_norm = False +soft_loss = True +cls_loss_col_w = 1.0 +cls_ext_col_w = 1.0 +mean_loss_col_w = 0.05 +eval_mode = "normal" +crop_ratio = 0.8 +onnx_path = None +model_ckpt = None +teacher = None +ti_compatible = False diff --git a/configs/tusimple_res34.py b/configs/tusimple_res34.py new file mode 100644 index 0000000..f7ff0d8 --- /dev/null +++ b/configs/tusimple_res34.py @@ -0,0 +1,41 @@ +dataset = "Tusimple" +data_root = "" # Need to be modified before running +epoch = 100 +batch_size = 32 +optimizer = "SGD" +learning_rate = 0.05 +weight_decay = 0.0001 +momentum = 0.9 +scheduler = "multi" +steps = [50, 75] +gamma = 0.1 +warmup = "linear" +warmup_iters = 100 +backbone = "34" +griding_num = 100 +use_aux = False +sim_loss_w = 0.0 +shp_loss_w = 0.0 +note = "" +log_path = "" +finetune = None +resume = None +test_model = "" +test_work_dir = "" +num_lanes = 4 +var_loss_power = 2.0 +auto_backup = True +num_row = 56 +num_col = 41 +train_width = 800 +train_height = 320 +num_cell_row = 100 +num_cell_col = 100 +mean_loss_w = 0.05 +fc_norm = False +soft_loss = True +cls_loss_col_w = 1.0 +cls_ext_col_w = 1.0 +mean_loss_col_w = 0.05 +eval_mode = "normal" +crop_ratio = 0.8 diff --git a/data/constant.py b/data/constant.py new file mode 100644 index 0000000..423db8b --- /dev/null +++ b/data/constant.py @@ -0,0 +1,107 @@ +# row anchors are a series of pre-defined coordinates in image height to detect lanes +# the row anchors are defined according to the evaluation protocol of CULane and Tusimple +# since our method will resize the image to 288x800 for training, the row anchors are defined with the height of 288 +# you can modify these row anchors according to your training image resolution + +tusimple_row_anchor = [ + 64, + 68, + 72, + 76, + 80, + 84, + 88, + 92, + 96, + 100, + 104, + 108, + 112, + 116, + 120, + 124, + 128, + 132, + 136, + 140, + 144, + 148, + 152, + 156, + 160, + 164, + 168, + 172, + 176, + 180, + 184, + 188, + 192, + 196, + 200, + 204, + 208, + 212, + 216, + 220, + 224, + 228, + 232, + 236, + 240, + 244, + 248, + 252, + 256, + 260, + 264, + 268, + 272, + 276, + 280, + 284, +] +culane_row_anchor = [121, 131, 141, 150, 160, 170, 180, 189, 199, 209, 219, 228, 238, 248, 258, 267, 277, 287] +culane_col_anchor = [ + 0.0, + 20.0, + 40.0, + 60.0, + 80.0, + 100.0, + 120.0, + 140.0, + 160.0, + 180.0, + 200.0, + 220.0, + 240.0, + 260.0, + 280.0, + 300.0, + 320.0, + 340.0, + 360.0, + 380.0, + 400.0, + 420.0, + 440.0, + 460.0, + 480.0, + 500.0, + 520.0, + 540.0, + 560.0, + 580.0, + 600.0, + 620.0, + 640.0, + 660.0, + 680.0, + 700.0, + 720.0, + 740.0, + 760.0, + 780.0, + 800.0, +] diff --git a/data/dali_data.py b/data/dali_data.py new file mode 100644 index 0000000..e87eab6 --- /dev/null +++ b/data/dali_data.py @@ -0,0 +1,447 @@ +import json +import os +import random + +import numpy as np +import nvidia.dali.fn as fn +import nvidia.dali.types as types +import torch +from nvidia.dali.pipeline import Pipeline +from nvidia.dali.plugin.pytorch import DALIGenericIterator +from nvidia.dali.plugin.pytorch import LastBatchPolicy + +import my_interp + + +class LaneExternalIterator: + def __init__( + self, + path, + list_path, + batch_size=None, + shard_id=None, + num_shards=None, + mode="train", + dataset_name=None, + ): + assert mode in ["train", "test"] + self.mode = mode + self.path = path + self.list_path = list_path + 
self.batch_size = batch_size + self.shard_id = shard_id + self.num_shards = num_shards + + if isinstance(list_path, str): + with open(list_path) as f: + total_list = f.readlines() + elif isinstance(list_path, (list, tuple)): + total_list = [] + for lst_path in list_path: + with open(lst_path) as f: + total_list.extend(f.readlines()) + else: + raise NotImplementedError + if self.mode == "train": + if dataset_name == "CULane": + cache_path = os.path.join(path, "culane_anno_cache.json") + elif dataset_name == "Tusimple": + cache_path = os.path.join(path, "tusimple_anno_cache.json") + elif dataset_name == "CurveLanes": + cache_path = os.path.join(path, "train", "curvelanes_anno_cache.json") + else: + raise NotImplementedError + + if shard_id == 0: + print("loading cached data") + with open(cache_path) as cache_fp: + self.cached_points = json.load(cache_fp) + if shard_id == 0: + print("cached data loaded") + + self.total_len = len(total_list) + + self.list = total_list[self.total_len * shard_id // num_shards : self.total_len * (shard_id + 1) // num_shards] + self.n = len(self.list) + + def __iter__(self): + self.i = 0 + if self.mode == "train": + random.shuffle(self.list) + return self + + def _prepare_train_batch(self): + images = [] + seg_images = [] + labels = [] + + for _ in range(self.batch_size): + l = self.list[self.i % self.n] + l_info = l.split() + img_name = l_info[0] + seg_name = l_info[1] + + if img_name[0] == "/": + img_name = img_name[1:] + if seg_name[0] == "/": + seg_name = seg_name[1:] + + img_name = img_name.strip() + seg_name = seg_name.strip() + + img_path = os.path.join(self.path, img_name) + with open(img_path, "rb") as f: + images.append(np.frombuffer(f.read(), dtype=np.uint8)) + + img_path = os.path.join(self.path, seg_name) + with open(img_path, "rb") as f: + seg_images.append(np.frombuffer(f.read(), dtype=np.uint8)) + + points = np.array(self.cached_points[img_name]) + labels.append(points.astype(np.float32)) + + self.i = self.i + 1 + + return (images, seg_images, labels) + + def _prepare_test_batch(self): + images = [] + names = [] + for _ in range(self.batch_size): + img_name = self.list[self.i % self.n].split()[0] + + if img_name[0] == "/": + img_name = img_name[1:] + img_name = img_name.strip() + + img_path = os.path.join(self.path, img_name) + + with open(img_path, "rb") as f: + images.append(np.frombuffer(f.read(), dtype=np.uint8)) + names.append(np.array(list(map(ord, img_name)))) + self.i = self.i + 1 + + return images, names + + def __next__(self): + if self.i >= self.n: + self.__iter__() + raise StopIteration + if self.mode == "train": + res = self._prepare_train_batch() + elif self.mode == "test": + res = self._prepare_test_batch() + else: + raise NotImplementedError + + return res + + def __len__(self): + return self.total_len + + next = __next__ + + +def encoded_images_sizes(jpegs): + shapes = fn.peek_image_shape(jpegs) # the shapes are HWC + h = fn.slice(shapes, 0, 1, axes=[0]) # extract height... + w = fn.slice(shapes, 1, 1, axes=[0]) # ...and width... 
+ return fn.cat(w, h) # ...and concatenate + + +def ExternalSourceTrainPipeline( + batch_size, + num_threads, + device_id, + external_data, + train_width, + train_height, + top_crop, + normalize_image_scale=False, + nscale_w=None, + nscale_h=None, +): + pipe = Pipeline(batch_size, num_threads, device_id) + with pipe: + jpegs, seg_images, labels = fn.external_source(source=external_data, num_outputs=3) + images = fn.decoders.image(jpegs, device="mixed") + seg_images = fn.decoders.image(seg_images, device="mixed") + if normalize_image_scale: + images = fn.resize(images, resize_x=nscale_w, resize_y=nscale_h) + seg_images = fn.resize(seg_images, resize_x=nscale_w, resize_y=nscale_h, interp_type=types.INTERP_NN) + # make all images at the same size + + size = encoded_images_sizes(jpegs) + center = size / 2 + + mt = fn.transforms.scale(scale=fn.random.uniform(range=(0.8, 1.2), shape=[2]), center=center) + mt = fn.transforms.rotation(mt, angle=fn.random.uniform(range=(-6, 6)), center=center) + + off = fn.cat(fn.random.uniform(range=(-200, 200), shape=[1]), fn.random.uniform(range=(-100, 100), shape=[1])) + mt = fn.transforms.translation(mt, offset=off) + + images = fn.warp_affine(images, matrix=mt, fill_value=0, inverse_map=False) + seg_images = fn.warp_affine(seg_images, matrix=mt, fill_value=0, inverse_map=False) + labels = fn.coord_transform(labels.gpu(), MT=mt) + + images = fn.resize(images, resize_x=train_width, resize_y=int(train_height / top_crop)) + seg_images = fn.resize( + seg_images, resize_x=train_width, resize_y=int(train_height / top_crop), interp_type=types.INTERP_NN + ) + + images = fn.crop_mirror_normalize( + images, + dtype=types.FLOAT, + mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], + std=[0.229 * 255, 0.224 * 255, 0.225 * 255], + crop=(train_height, train_width), + crop_pos_x=0.0, + crop_pos_y=1.0, + ) + seg_images = fn.crop_mirror_normalize( + seg_images, + dtype=types.FLOAT, + mean=[0.0, 0.0, 0.0], + std=[1.0, 1.0, 1.0], + crop=(train_height, train_width), + crop_pos_x=0.0, + crop_pos_y=1.0, + ) + pipe.set_outputs(images, seg_images, labels) + return pipe + + +def ExternalSourceValPipeline(batch_size, num_threads, device_id, external_data, train_width, train_height): + pipe = Pipeline(batch_size, num_threads, device_id) + with pipe: + jpegs, labels = fn.external_source(source=external_data, num_outputs=2) + images = fn.decoders.image(jpegs, device="mixed") + images = fn.resize(images, resize_x=train_width, resize_y=int(train_height / 0.6) + 1) + images = fn.crop_mirror_normalize( + images, + dtype=types.FLOAT, + mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], + std=[0.229 * 255, 0.224 * 255, 0.225 * 255], + crop=(train_height, train_width), + crop_pos_x=0.0, + crop_pos_y=1.0, + ) + pipe.set_outputs(images, labels.gpu()) + return pipe + + +def ExternalSourceTestPipeline(batch_size, num_threads, device_id, external_data): + pipe = Pipeline(batch_size, num_threads, device_id) + with pipe: + jpegs, names = fn.external_source(source=external_data, num_outputs=2) + images = fn.decoders.image(jpegs, device="mixed") + + images = fn.resize(images, resize_x=800, resize_y=288) + images = fn.crop_mirror_normalize( + images, + dtype=types.FLOAT, + mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], + std=[0.229 * 255, 0.224 * 255, 0.225 * 255], + ) + + names = fn.pad(names, axes=0, fill_value=-1, shape=46) + pipe.set_outputs(images, names) + return pipe + + +# from data.constant import culane_row_anchor, culane_col_anchor +class TrainCollect: + def __init__( + self, + batch_size, + num_threads, 
+ data_root, + list_path, + shard_id, + num_shards, + row_anchor, + col_anchor, + train_width, + train_height, + num_cell_row, + num_cell_col, + dataset_name, + top_crop, + ): + eii = LaneExternalIterator( + data_root, + list_path, + batch_size=batch_size, + shard_id=shard_id, + num_shards=num_shards, + dataset_name=dataset_name, + ) + + if dataset_name == "CULane": + self.original_image_width = 1640 + self.original_image_height = 590 + elif dataset_name == "Tusimple": + self.original_image_width = 1280 + self.original_image_height = 720 + elif dataset_name == "CurveLanes": + self.original_image_width = 2560 + self.original_image_height = 1440 + + if dataset_name == "CurveLanes": + pipe = ExternalSourceTrainPipeline( + batch_size, + num_threads, + shard_id, + eii, + train_width, + train_height, + top_crop, + normalize_image_scale=True, + nscale_w=2560, + nscale_h=1440, + ) + else: + pipe = ExternalSourceTrainPipeline( + batch_size, num_threads, shard_id, eii, train_width, train_height, top_crop + ) + self.pii = DALIGenericIterator( + pipe, + output_map=["images", "seg_images", "points"], + last_batch_padded=True, + last_batch_policy=LastBatchPolicy.PARTIAL, + ) + self.eii_n = eii.n + self.batch_size = batch_size + + self.interp_loc_row = torch.tensor(row_anchor, dtype=torch.float32).cuda() * self.original_image_height + self.interp_loc_col = torch.tensor(col_anchor, dtype=torch.float32).cuda() * self.original_image_width + self.num_cell_row = num_cell_row + self.num_cell_col = num_cell_col + + def __iter__(self): + return self + + def __next__(self): + data = next(self.pii) + images = data[0]["images"] + seg_images = data[0]["seg_images"] + points = data[0]["points"] + points_row = my_interp.run(points, self.interp_loc_row, 0) + points_row_extend = self._extend(points_row[:, :, :, 0]).transpose(1, 2) + labels_row = (points_row_extend / self.original_image_width * (self.num_cell_row - 1)).long() + labels_row[points_row_extend < 0] = -1 + labels_row[points_row_extend > self.original_image_width] = -1 + labels_row[labels_row < 0] = -1 + labels_row[labels_row > (self.num_cell_row - 1)] = -1 + + points_col = my_interp.run(points, self.interp_loc_col, 1) + points_col = points_col[:, :, :, 1].transpose(1, 2) + labels_col = (points_col / self.original_image_height * (self.num_cell_col - 1)).long() + labels_col[points_col < 0] = -1 + labels_col[points_col > self.original_image_height] = -1 + + labels_col[labels_col < 0] = -1 + labels_col[labels_col > (self.num_cell_col - 1)] = -1 + + labels_row_float = points_row_extend / self.original_image_width + labels_row_float[labels_row_float < 0] = -1 + labels_row_float[labels_row_float > 1] = -1 + + labels_col_float = points_col / self.original_image_height + labels_col_float[labels_col_float < 0] = -1 + labels_col_float[labels_col_float > 1] = -1 + + return { + "images": images, + "seg_images": seg_images, + "labels_row": labels_row, + "labels_col": labels_col, + "labels_row_float": labels_row_float, + "labels_col_float": labels_col_float, + } + + def __len__(self): + return int((self.eii_n + self.batch_size - 1) / self.batch_size) + + def reset(self): + self.pii.reset() + + next = __next__ + + def _extend(self, coords): + # coords : n x num_lane x num_cls + n, num_lanes, num_cls = coords.shape + coords_np = coords.cpu().numpy() + coords_axis = np.arange(num_cls) + fitted_coords = coords.clone() + for i in range(n): + for j in range(num_lanes): + lane = coords_np[i, j] + if lane[-1] > 0: + continue + + valid = lane > 0 + num_valid_pts = np.sum(valid) + if 
num_valid_pts < 6: + continue + p = np.polyfit(coords_axis[valid][num_valid_pts // 2 :], lane[valid][num_valid_pts // 2 :], deg=1) + start_point = coords_axis[valid][num_valid_pts // 2] + fitted_lane = np.polyval(p, np.arange(start_point, num_cls)) + + fitted_coords[i, j, start_point:] = torch.tensor(fitted_lane, device=coords.device) + return fitted_coords + + def _extend_col(self, coords): + pass + + +class TestCollect: + def __init__(self, batch_size, num_threads, data_root, list_path, shard_id, num_shards): + self.batch_size = batch_size + eii = LaneExternalIterator( + data_root, + list_path, + batch_size=batch_size, + shard_id=shard_id, + num_shards=num_shards, + mode="test", + ) + pipe = ExternalSourceTestPipeline( + batch_size, + num_threads, + shard_id, + eii, + ) + self.pii = DALIGenericIterator( + pipe, + output_map=["images", "names"], + last_batch_padded=True, + last_batch_policy=LastBatchPolicy.PARTIAL, + ) + self.eii_n = eii.n + + def __iter__(self): + return self + + def __next__(self): + data = next(self.pii) + images = data[0]["images"] + names = data[0]["names"] + restored_names = [] + for name in names: + if name[-1] == -1: + restored_name = "".join(list(map(chr, name[:-1]))) + else: + restored_name = "".join(list(map(chr, name))) + restored_names.append(restored_name) + + out_dict = {"images": images, "names": restored_names} + return out_dict + + def __len__(self): + return int((self.eii_n + self.batch_size - 1) / self.batch_size) + + def reset(self): + self.pii.reset() + + next = __next__ diff --git a/data/dataloader.py b/data/dataloader.py new file mode 100644 index 0000000..6b15bb9 --- /dev/null +++ b/data/dataloader.py @@ -0,0 +1,163 @@ +import os + +import numpy as np +import torch +import torchvision.transforms as transforms + +import data.mytransforms as mytransforms +from data.constant import culane_row_anchor +from data.constant import tusimple_row_anchor +from data.dataset import LaneClsDataset +from data.dataset import LaneTestDataset + + +def get_train_loader(batch_size, data_root, griding_num, dataset, use_aux, distributed, num_lanes): + target_transform = transforms.Compose( + [ + mytransforms.FreeScaleMask((288, 800)), + mytransforms.MaskToTensor(), + ] + ) + segment_transform = transforms.Compose( + [ + mytransforms.FreeScaleMask((36, 100)), + mytransforms.MaskToTensor(), + ] + ) + img_transform = transforms.Compose( + [ + transforms.Resize((288, 800)), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ] + ) + simu_transform = mytransforms.Compose2( + [mytransforms.RandomRotate(6), mytransforms.RandomUDoffsetLABEL(100), mytransforms.RandomLROffsetLABEL(200)] + ) + if dataset == "CULane": + train_dataset = LaneClsDataset( + data_root, + os.path.join(data_root, "list/train_gt.txt"), + img_transform=img_transform, + target_transform=target_transform, + simu_transform=simu_transform, + segment_transform=segment_transform, + row_anchor=culane_row_anchor, + griding_num=griding_num, + use_aux=use_aux, + num_lanes=num_lanes, + ) + cls_num_per_lane = 18 + + elif dataset == "Tusimple": + train_dataset = LaneClsDataset( + data_root, + os.path.join(data_root, "train_gt.txt"), + img_transform=img_transform, + target_transform=target_transform, + simu_transform=simu_transform, + griding_num=griding_num, + row_anchor=tusimple_row_anchor, + segment_transform=segment_transform, + use_aux=use_aux, + num_lanes=num_lanes, + ) + cls_num_per_lane = 56 + else: + raise NotImplementedError + + if distributed: + sampler = 
torch.utils.data.distributed.DistributedSampler(train_dataset) + else: + sampler = torch.utils.data.RandomSampler(train_dataset) + + train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=sampler, num_workers=4) + + return train_loader, cls_num_per_lane + + +def get_test_loader(batch_size, data_root, dataset, distributed, crop_ratio, train_width, train_height): + if dataset == "CULane": + img_transforms = transforms.Compose( + [ + transforms.Resize((int(train_height / crop_ratio), train_width)), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ] + ) + test_dataset = LaneTestDataset( + data_root, + os.path.join(data_root, "list/test.txt"), + img_transform=img_transforms, + crop_size=train_height, + ) + elif dataset == "Tusimple": + img_transforms = transforms.Compose( + [ + transforms.Resize((int(train_height / crop_ratio), train_width)), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ] + ) + test_dataset = LaneTestDataset( + data_root, + os.path.join(data_root, "test.txt"), + img_transform=img_transforms, + crop_size=train_height, + ) + elif dataset == "CurveLanes": + img_transforms = transforms.Compose( + [ + transforms.Resize((int(train_height / crop_ratio), train_width)), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ] + ) + test_dataset = LaneTestDataset( + data_root, + os.path.join(data_root, "valid/valid_for_culane_style.txt"), + img_transform=img_transforms, + crop_size=train_height, + ) + else: + raise NotImplementedError + if distributed: + sampler = SeqDistributedSampler(test_dataset, shuffle=False) + else: + sampler = torch.utils.data.SequentialSampler(test_dataset) + loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, sampler=sampler, num_workers=4) + return loader + + +class SeqDistributedSampler(torch.utils.data.distributed.DistributedSampler): + """ + Change the behavior of DistributedSampler to sequential distributed sampling. + + Sequential sampling improves the stability of multi-threaded testing, which relies on multi-threaded file I/O. + Without sequential sampling, file I/O on one thread may interfere with other threads. 
+ + """ + + def __init__(self, dataset, num_replicas=None, rank=None, shuffle=False): + super().__init__(dataset, num_replicas, rank, shuffle) + + def __iter__(self): + g = torch.Generator() + g.manual_seed(self.epoch) + if self.shuffle: + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = list(range(len(self.dataset))) + + # add extra samples to make it evenly divisible + indices += indices[: (self.total_size - len(indices))] + assert len(indices) == self.total_size + + num_per_rank = int(self.total_size // self.num_replicas) + + # sequential sampling + indices = indices[num_per_rank * self.rank : num_per_rank * (self.rank + 1)] + + assert len(indices) == self.num_samples + + return iter(indices) diff --git a/data/dataset.py b/data/dataset.py new file mode 100644 index 0000000..d75d811 --- /dev/null +++ b/data/dataset.py @@ -0,0 +1,178 @@ +import os +import pdb + +import cv2 +import numpy as np +import torch +from PIL import Image + +from data.mytransforms import find_start_pos + + +def loader_func(path): + return Image.open(path) + + +class LaneTestDataset(torch.utils.data.Dataset): + def __init__(self, path, list_path, img_transform=None, crop_size=None): + super().__init__() + self.path = path + self.img_transform = img_transform + self.crop_size = crop_size + with open(list_path) as f: + self.list = f.readlines() + self.list = [l[1:] if l[0] == "/" else l for l in self.list] # exclude the incorrect path prefix '/' of CULane + + def __getitem__(self, index): + name = self.list[index].split()[0] + img_path = os.path.join(self.path, name) + img = loader_func(img_path) + + if self.img_transform is not None: + img = self.img_transform(img) + img = img[:, -self.crop_size :, :] + + return img, name + + def __len__(self): + return len(self.list) + + +class LaneClsDataset(torch.utils.data.Dataset): + def __init__( + self, + path, + list_path, + img_transform=None, + target_transform=None, + simu_transform=None, + griding_num=50, + load_name=False, + row_anchor=None, + use_aux=False, + segment_transform=None, + num_lanes=4, + ): + super().__init__() + self.img_transform = img_transform + self.target_transform = target_transform + self.segment_transform = segment_transform + self.simu_transform = simu_transform + self.path = path + self.griding_num = griding_num + self.load_name = load_name + self.use_aux = use_aux + self.num_lanes = num_lanes + + with open(list_path) as f: + self.list = f.readlines() + + self.row_anchor = row_anchor + self.row_anchor.sort() + + def __getitem__(self, index): + l = self.list[index] + l_info = l.split() + img_name, label_name = l_info[0], l_info[1] + if img_name[0] == "/": + img_name = img_name[1:] + label_name = label_name[1:] + + label_path = os.path.join(self.path, label_name) + label = loader_func(label_path) + + img_path = os.path.join(self.path, img_name) + img = loader_func(img_path) + + if self.simu_transform is not None: + img, label = self.simu_transform(img, label) + lane_pts = self._get_index(label) + # get the coordinates of lanes at row anchors + + w, h = img.size + cls_label = self._grid_pts(lane_pts, self.griding_num, w) + # make the coordinates to classification label + if self.use_aux: + assert self.segment_transform is not None + seg_label = self.segment_transform(label) + + if self.img_transform is not None: + img = self.img_transform(img) + + if self.use_aux: + return img, cls_label, seg_label + if self.load_name: + return img, cls_label, img_name + return img, cls_label + + def __len__(self): + return 
len(self.list) + + def _grid_pts(self, pts, num_cols, w): + # pts : numlane,n,2 + num_lane, n, n2 = pts.shape + col_sample = np.linspace(0, w - 1, num_cols) + + assert n2 == 2 + to_pts = np.zeros((n, num_lane)) + for i in range(num_lane): + pti = pts[i, :, 1] + to_pts[:, i] = np.asarray( + [int(pt // (col_sample[1] - col_sample[0])) if pt != -1 else num_cols for pt in pti] + ) + return to_pts.astype(int) + + def _get_index(self, label): + w, h = label.size + + # scale the row anchors (defined for a height of 288) to the actual label height + scale_f = lambda x: int((x * 1.0 / 288) * h) + sample_tmp = list(map(scale_f, self.row_anchor)) + + all_idx = np.zeros((self.num_lanes, len(sample_tmp), 2)) + for i, r in enumerate(sample_tmp): + label_r = np.asarray(label)[int(round(r))] + for lane_idx in range(1, self.num_lanes + 1): + pos = np.where(label_r == lane_idx)[0] + if len(pos) == 0: + all_idx[lane_idx - 1, i, 0] = r + all_idx[lane_idx - 1, i, 1] = -1 + continue + pos = np.mean(pos) + all_idx[lane_idx - 1, i, 0] = r + all_idx[lane_idx - 1, i, 1] = pos + + # data augmentation: extend the lane to the boundary of image + + all_idx_cp = all_idx.copy() + for i in range(self.num_lanes): + if np.all(all_idx_cp[i, :, 1] == -1): + continue + # if there is no lane + + valid = all_idx_cp[i, :, 1] != -1 + # get all valid lane points' index + valid_idx = all_idx_cp[i, valid, :] + # get all valid lane points + if valid_idx[-1, 0] == all_idx_cp[0, -1, 0]: + # if the last valid lane point's y-coordinate is already the last y-coordinate of all rows + # this means this lane has reached the bottom boundary of the image + # so we skip + continue + if len(valid_idx) < 6: + continue + # if the lane is too short to extend + + valid_idx_half = valid_idx[len(valid_idx) // 2 :, :] + p = np.polyfit(valid_idx_half[:, 0], valid_idx_half[:, 1], deg=1) + start_line = valid_idx_half[-1, 0] + pos = find_start_pos(all_idx_cp[i, :, 0], start_line) + 1 + + fitted = np.polyval(p, all_idx_cp[i, pos:, 0]) + fitted = np.array([-1 if y < 0 or y > w - 1 else y for y in fitted]) + + assert np.all(all_idx_cp[i, pos:, 1] == -1) + all_idx_cp[i, pos:, 1] = fitted + if -1 in all_idx[:, :, 0]: + pdb.set_trace() + return all_idx_cp diff --git a/data/mytransforms.py b/data/mytransforms.py new file mode 100644 index 0000000..e53cc73 --- /dev/null +++ b/data/mytransforms.py @@ -0,0 +1,178 @@ +import numbers +import pdb +import random + +import cv2 +import numpy as np +import torch +from PIL import Image +from PIL import ImageFilter +from PIL import ImageOps + +# ===============================img transforms============================ + + +class Compose2: + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, img, mask, bbx=None): + if bbx is None: + for t in self.transforms: + img, mask = t(img, mask) + return img, mask + for t in self.transforms: + img, mask, bbx = t(img, mask, bbx) + return img, mask, bbx + + +class FreeScale: + def __init__(self, size): + self.size = size # (h, w) + + def __call__(self, img, mask): + return img.resize((self.size[1], self.size[0]), Image.BILINEAR), mask.resize( + (self.size[1], self.size[0]), Image.NEAREST + ) + + +class FreeScaleMask: + def __init__(self, size): + self.size = size + + def __call__(self, mask): + return mask.resize((self.size[1], self.size[0]), Image.NEAREST) + + +class Scale: + def __init__(self, size): + self.size = size + + def __call__(self, img, mask): + if img.size != mask.size: + print(img.size) + print(mask.size) + assert img.size == mask.size + w, h = img.size + if (w <= h and w == 
self.size) or (h <= w and h == self.size): + return img, mask + if w < h: + ow = self.size + oh = int(self.size * h / w) + return img.resize((ow, oh), Image.BILINEAR), mask.resize((ow, oh), Image.NEAREST) + else: + oh = self.size + ow = int(self.size * w / h) + return img.resize((ow, oh), Image.BILINEAR), mask.resize((ow, oh), Image.NEAREST) + + +class RandomRotate: + """ + Rotate the given PIL.Image and its label mask by a random angle + sampled uniformly from [-angle, angle]. + + The image is resampled bilinearly; the label mask uses nearest neighbor. + """ + + def __init__(self, angle): + self.angle = angle + + def __call__(self, image, label): + assert label is None or image.size == label.size + + angle = random.randint(0, self.angle * 2) - self.angle + + label = label.rotate(angle, resample=Image.NEAREST) + image = image.rotate(angle, resample=Image.BILINEAR) + + return image, label + + +# ===============================label transforms============================ + + +class DeNormalize: + def __init__(self, mean, std): + self.mean = mean + self.std = std + + def __call__(self, tensor): + for t, m, s in zip(tensor, self.mean, self.std): + t.mul_(s).add_(m) + return tensor + + +class MaskToTensor: + def __call__(self, img): + return torch.from_numpy(np.array(img, dtype=np.int32)).long() + + +def find_start_pos(row_sample, start_line): + # binary search for the index of the first row anchor that is >= start_line + l, r = 0, len(row_sample) - 1 + while True: + mid = int((l + r) / 2) + if r - l == 1: + return r + if row_sample[mid] < start_line: + l = mid + if row_sample[mid] > start_line: + r = mid + if row_sample[mid] == start_line: + return mid + + +class RandomLROffsetLABEL: + def __init__(self, max_offset): + self.max_offset = max_offset + + def __call__(self, img, label): + offset = np.random.randint(-self.max_offset, self.max_offset) + w, h = img.size + + img = np.array(img) + if offset > 0: + img[:, offset:, :] = img[:, 0 : w - offset, :] + img[:, :offset, :] = 0 + if offset < 0: + real_offset = -offset + img[:, 0 : w - real_offset, :] = img[:, real_offset:, :] + img[:, w - real_offset :, :] = 0 + + label = np.array(label) + if offset > 0: + label[:, offset:] = label[:, 0 : w - offset] + label[:, :offset] = 0 + if offset < 0: + offset = -offset + label[:, 0 : w - offset] = label[:, offset:] + label[:, w - offset :] = 0 + return Image.fromarray(img), Image.fromarray(label) + + +class RandomUDoffsetLABEL: + def __init__(self, max_offset): + self.max_offset = max_offset + + def __call__(self, img, label): + offset = np.random.randint(-self.max_offset, self.max_offset) + w, h = img.size + + img = np.array(img) + if offset > 0: + img[offset:, :, :] = img[0 : h - offset, :, :] + img[:offset, :, :] = 0 + if offset < 0: + real_offset = -offset + img[0 : h - real_offset, :, :] = img[real_offset:, :, :] + img[h - real_offset :, :, :] = 0 + + label = np.array(label) + if offset > 0: + label[offset:, :] = label[0 : h - offset, :] + label[:offset, :] = 0 + if offset < 0: + offset = -offset + label[0 : h - offset, :] = label[offset:, :] + label[h - offset :, :] = 0 + return Image.fromarray(img), Image.fromarray(label) diff --git a/demo.py b/demo.py new file mode 100644 index 0000000..b3b1c30 --- /dev/null +++ b/demo.py @@ -0,0 +1,175 @@ +import os + +import cv2 +import torch +import torchvision.transforms as transforms +import tqdm + +from data.dataset import LaneTestDataset +from utils.common import get_model +from 
utils.common import merge_config +from utils.dist_utils import dist_print + + +def pred2coords(pred, row_anchor, col_anchor, local_width=1, original_image_width=1640, original_image_height=590): + batch_size, num_grid_row, num_cls_row, num_lane_row = pred["loc_row"].shape + batch_size, num_grid_col, num_cls_col, num_lane_col = pred["loc_col"].shape + + max_indices_row = pred["loc_row"].argmax(1).cpu() + # n , num_cls, num_lanes + valid_row = pred["exist_row"].argmax(1).cpu() + # n, num_cls, num_lanes + + max_indices_col = pred["loc_col"].argmax(1).cpu() + # n , num_cls, num_lanes + valid_col = pred["exist_col"].argmax(1).cpu() + # n, num_cls, num_lanes + + pred["loc_row"] = pred["loc_row"].cpu() + pred["loc_col"] = pred["loc_col"].cpu() + + coords = [] + + row_lane_idx = [1, 2] + col_lane_idx = [0, 3] + + for i in row_lane_idx: + tmp = [] + if valid_row[0, :, i].sum() > num_cls_row / 2: + for k in range(valid_row.shape[1]): + if valid_row[0, k, i]: + all_ind = torch.tensor( + list( + range( + max(0, max_indices_row[0, k, i] - local_width), + min(num_grid_row - 1, max_indices_row[0, k, i] + local_width) + 1, + ) + ) + ) + + out_tmp = (pred["loc_row"][0, all_ind, k, i].softmax(0) * all_ind.float()).sum() + 0.5 + out_tmp = out_tmp / (num_grid_row - 1) * original_image_width + tmp.append((int(out_tmp), int(row_anchor[k] * original_image_height))) + coords.append(tmp) + + for i in col_lane_idx: + tmp = [] + if valid_col[0, :, i].sum() > num_cls_col / 4: + for k in range(valid_col.shape[1]): + if valid_col[0, k, i]: + all_ind = torch.tensor( + list( + range( + max(0, max_indices_col[0, k, i] - local_width), + min(num_grid_col - 1, max_indices_col[0, k, i] + local_width) + 1, + ) + ) + ) + + out_tmp = (pred["loc_col"][0, all_ind, k, i].softmax(0) * all_ind.float()).sum() + 0.5 + + out_tmp = out_tmp / (num_grid_col - 1) * original_image_height + tmp.append((int(col_anchor[k] * original_image_width), int(out_tmp))) + coords.append(tmp) + + return coords + + +if __name__ == "__main__": + torch.backends.cudnn.benchmark = True + + args, cfg = merge_config() + cfg.batch_size = 1 + print("setting batch_size to 1 for demo generation") + + dist_print("start testing...") + assert cfg.backbone in ["18", "34", "50", "101", "152", "50next", "101next", "50wide", "101wide"] + + if cfg.dataset == "CULane": + cls_num_per_lane = 18 + elif cfg.dataset == "Tusimple": + cls_num_per_lane = 56 + else: + raise NotImplementedError + + if cfg.model_ckpt: + net = torch.load(cfg.model_ckpt, map_location="cpu")["model_ckpt"] + else: + net = get_model(cfg) + + state_dict = torch.load(cfg.test_model, map_location="cpu")["model"] + compatible_state_dict = {} + for k, v in state_dict.items(): + if "module." 
in k: + compatible_state_dict[k[7:]] = v + else: + compatible_state_dict[k] = v + + net.load_state_dict(compatible_state_dict, strict=False) + net.cuda() + net.eval() + + img_transforms = transforms.Compose( + [ + transforms.Resize((int(cfg.train_height / cfg.crop_ratio), cfg.train_width)), + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ] + ) + if cfg.dataset == "CULane": + splits = [ + "test0_normal.txt", + "test1_crowd.txt", + "test2_hlight.txt", + "test3_shadow.txt", + "test4_noline.txt", + "test5_arrow.txt", + "test6_curve.txt", + "test7_cross.txt", + "test8_night.txt", + ] + datasets = [ + LaneTestDataset( + cfg.data_root, + os.path.join(cfg.data_root, "list/test_split/" + split), + img_transform=img_transforms, + crop_size=cfg.train_height, + ) + for split in splits + ] + img_w, img_h = 1640, 590 + elif cfg.dataset == "Tusimple": + splits = ["test.txt"] + datasets = [ + LaneTestDataset( + cfg.data_root, + os.path.join(cfg.data_root, split), + img_transform=img_transforms, + crop_size=cfg.train_height, + ) + for split in splits + ] + img_w, img_h = 1280, 720 + else: + raise NotImplementedError + for split, dataset in zip(splits, datasets): + loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False, num_workers=1) + fourcc = cv2.VideoWriter_fourcc(*"MJPG") + print(split[:-3] + "avi") + vout = cv2.VideoWriter(split[:-3] + "avi", fourcc, 30.0, (img_w, img_h)) + for i, data in enumerate(tqdm.tqdm(loader)): + imgs, names = data + imgs = imgs.cuda() + with torch.no_grad(): + pred = net(imgs) + + vis = cv2.imread(os.path.join(cfg.data_root, names[0])) + coords = pred2coords( + pred, cfg.row_anchor, cfg.col_anchor, original_image_width=img_w, original_image_height=img_h + ) + for lane in coords: + for coord in lane: + cv2.circle(vis, coord, 5, (0, 255, 0), -1) + vout.write(vis) + + vout.release() diff --git a/evaluation/culane/CMakeLists.txt b/evaluation/culane/CMakeLists.txt new file mode 100644 index 0000000..f7e4dc6 --- /dev/null +++ b/evaluation/culane/CMakeLists.txt @@ -0,0 +1,79 @@ +# Thanks for the contribution of zchrissirhcz imzhuo@foxmail.com +cmake_minimum_required(VERSION 3.1) + +project(culane_evaluator) + +set(CMAKE_CXX_STANDARD 11) + +add_definitions( + -DCPU_ONLY +) + +set(SRC_LST + ${CMAKE_SOURCE_DIR}/src/counter.cpp + ${CMAKE_SOURCE_DIR}/src/evaluate.cpp + ${CMAKE_SOURCE_DIR}/src/lane_compare.cpp + ${CMAKE_SOURCE_DIR}/src/spline.cpp +) + +set(HDR_LST + ${CMAKE_SOURCE_DIR}/include/counter.hpp + ${CMAKE_SOURCE_DIR}/include/hungarianGraph.hpp + ${CMAKE_SOURCE_DIR}/include/lane_compare.hpp + ${CMAKE_SOURCE_DIR}/include/spline.hpp +) + +if (CMAKE_SYSTEM_NAME MATCHES "Windows") + list(APPEND SRC_LST ${CMAKE_SOURCE_DIR}/getopt/getopt.c) + list(APPEND HDR_LST ${CMAKE_SOURCE_DIR}/getopt/getopt.h) +endif() + +add_executable(${PROJECT_NAME} + ${SRC_LST} + ${HDR_LST} +) + +set(dep_libs "") + +#--- OpenCV +# You may switch different version of OpenCV like this: +# set(OpenCV_DIR "/usr/local/opencv-4.3.0" CACHE PATH "") +find_package(OpenCV REQUIRED + COMPONENTS core highgui imgproc imgcodecs +) +if(NOT OpenCV_FOUND) # if not OpenCV 4.x/3.x, then imgcodecs are not found + find_package(OpenCV REQUIRED COMPONENTS core highgui imgproc) +endif() + +list(APPEND dep_libs + PUBLIC ${OpenCV_LIBS} +) + +#--- OpenMP +find_package(OpenMP) +if(NOT TARGET OpenMP::OpenMP_CXX AND (OpenMP_CXX_FOUND OR OPENMP_FOUND)) + target_compile_options(${PROJECT_NAME} PRIVATE ${OpenMP_CXX_FLAGS}) +endif() + +if(OpenMP_CXX_FOUND OR OPENMP_FOUND) + 
message(STATUS "Building with OpenMP") + if(OpenMP_CXX_FOUND) + list(APPEND dep_libs PUBLIC OpenMP::OpenMP_CXX) + else() + list(APPEND dep_libs PRIVATE "${OpenMP_CXX_FLAGS}") + endif() +endif() + +set(dep_incs ${CMAKE_SOURCE_DIR}/include) +if (CMAKE_SYSTEM_NAME MATCHES "Windows") + list(APPEND dep_incs "${CMAKE_SOURCE_DIR}/getopt") +endif() + +# --- target config with include dirs / libs +target_link_libraries(${PROJECT_NAME} + ${dep_libs} +) + +target_include_directories(${PROJECT_NAME} + PUBLIC ${dep_incs} +) diff --git a/evaluation/culane/Makefile b/evaluation/culane/Makefile new file mode 100755 index 0000000..5b9f66a --- /dev/null +++ b/evaluation/culane/Makefile @@ -0,0 +1,50 @@ +PROJECT_NAME:= evaluate + +# config ---------------------------------- + +INCLUDE_DIRS := include +LIBRARY_DIRS := lib + +# You may switch different versions of opencv like this: +# export PKG_CONFIG_PATH=/usr/local/opencv-4.1.1/lib/pkgconfig:$PKG_CONFIG_PATH +# then use `pkg-config opencv4 --cflags --libs` since `opencv4.pc` is found + +COMMON_FLAGS := -DCPU_ONLY +CXXFLAGS := -std=c++11 -fopenmp #`pkg-config --cflags opencv` +LDFLAGS := -fopenmp -Wl,-rpath,./lib #`pkg-config --libs opencv` + +BUILD_DIR := build + +# make rules ------------------------------- +CXX ?= g++ +BUILD_DIR ?= ./build + +LIBRARIES += opencv_core opencv_highgui opencv_imgproc opencv_imgcodecs + +CXXFLAGS += $(COMMON_FLAGS) $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) +LDFLAGS += $(COMMON_FLAGS) $(foreach includedir,$(LIBRARY_DIRS),-L$(includedir)) $(foreach library,$(LIBRARIES),-l$(library)) +SRC_DIRS += $(shell find * -type d -exec bash -c "find {} -maxdepth 1 \( -name '*.cpp' -o -name '*.proto' \) | grep -q ." \; -print) +CXX_SRCS += $(shell find src/ -name "*.cpp") +CXX_TARGETS:=$(patsubst %.cpp, $(BUILD_DIR)/%.o, $(CXX_SRCS)) +ALL_BUILD_DIRS := $(sort $(BUILD_DIR) $(addprefix $(BUILD_DIR)/, $(SRC_DIRS))) + +.PHONY: all +all: $(PROJECT_NAME) + +.PHONY: $(ALL_BUILD_DIRS) +$(ALL_BUILD_DIRS): + @mkdir -p $@ + +$(BUILD_DIR)/%.o: %.cpp | $(ALL_BUILD_DIRS) + @echo "CXX" $< + @$(CXX) $(CXXFLAGS) -c -o $@ $< + +$(PROJECT_NAME): $(CXX_TARGETS) + @echo "CXX/LD" $@ + @$(CXX) -o $@ $^ $(LDFLAGS) + +.PHONY: clean +clean: + @rm -rf $(CXX_TARGETS) + @rm -rf $(PROJECT_NAME) + @rm -rf $(BUILD_DIR) diff --git a/evaluation/culane/calTotal.m b/evaluation/culane/calTotal.m new file mode 100755 index 0000000..a0eeb08 --- /dev/null +++ b/evaluation/culane/calTotal.m @@ -0,0 +1,23 @@ +%% Calculate overall Fmeasure from each scenarios +clc; clear; close all; + +allFile = 'output/vgg_SCNN_DULR_w9_iou0.5.txt'; + +all = textread(allFile,'%s'); +TP = 0; +FP = 0; +FN = 0; + +for i=1:9 + tpline = (i-1)*14+4; + tp = str2double(all(tpline)); + fp = str2double(all(tpline+2)); + fn = str2double(all(tpline+4)); + TP = TP + tp; + FP = FP + fp; + FN = FN + fn; +end + +P = TP/(TP + FP) +R = TP/(TP + FN) +F = 2*P*R/(P + R)*100 diff --git a/evaluation/culane/getopt/LICENSE.md b/evaluation/culane/getopt/LICENSE.md new file mode 100644 index 0000000..ff62fbe --- /dev/null +++ b/evaluation/culane/getopt/LICENSE.md @@ -0,0 +1,207 @@ +``` + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ +``` + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +1. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +1. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +1. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +1. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +1. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +1. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +1. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +1. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + +``` + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. +``` + +Copyright \[yyyy\] \[name of copyright owner\] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +``` + http://www.apache.org/licenses/LICENSE-2.0 +``` + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
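The `getopt` files added below vendor a minimal POSIX-style option parser so that `evaluate.cpp` can also be built on Windows, where `getopt()` is not available. The optstring grammar is the usual one: a letter followed by `:` takes an argument, a bare letter is a boolean switch. As a quick illustration only (not part of this patch), Python's standard `getopt` module follows the same convention; the flags shown are ones the CULane run scripts pass to the evaluator, and the defaults are made up:

```python
# Illustrative sketch: parse evaluator-style flags with Python's stdlib getopt,
# which uses the same optstring rules as the vendored C implementation below.
# In "w:t:o:s", the trailing ':' means -w/-t/-o take a value; -s is a bare flag.
import getopt
import sys

opts, rest = getopt.getopt(sys.argv[1:], "w:t:o:s")
config = {"-w": "30", "-t": "0.5", "-o": "./output.txt", "-s": False}  # made-up defaults
for flag, value in opts:
    config[flag] = True if flag == "-s" else value
print(config, rest)
```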
diff --git a/evaluation/culane/getopt/getopt.c b/evaluation/culane/getopt/getopt.c
new file mode 100644
index 0000000..307baf2
--- /dev/null
+++ b/evaluation/culane/getopt/getopt.c
@@ -0,0 +1,51 @@
+/* *****************************************************************
+*
+* Copyright 2016 Microsoft
+*
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************/
+
+#include "getopt.h"
+#include <string.h>
+
+char* optarg = NULL;
+int optind = 1;
+
+int getopt(int argc, char *const argv[], const char *optstring)
+{
+    if ((optind >= argc) || (argv[optind][0] != '-') || (argv[optind][0] == 0))
+    {
+        return -1;
+    }
+
+    int opt = argv[optind][1];
+    const char *p = strchr(optstring, opt);
+
+    if (p == NULL)
+    {
+        return '?';
+    }
+    if (p[1] == ':')
+    {
+        optind++;
+        if (optind >= argc)
+        {
+            return '?';
+        }
+        optarg = argv[optind];
+        optind++;
+    }
+    return opt;
+}
diff --git a/evaluation/culane/getopt/getopt.h b/evaluation/culane/getopt/getopt.h
new file mode 100644
index 0000000..33de8ad
--- /dev/null
+++ b/evaluation/culane/getopt/getopt.h
@@ -0,0 +1,36 @@
+/* *****************************************************************
+*
+* Copyright 2016 Microsoft
+*
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+******************************************************************/
+
+#ifndef GETOPT_H__
+#define GETOPT_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern char *optarg;
+extern int optind;
+
+int getopt(int argc, char *const argv[], const char *optstring);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/evaluation/culane/getopt/readme.txt b/evaluation/culane/getopt/readme.txt
new file mode 100644
index 0000000..e8a6b3d
--- /dev/null
+++ b/evaluation/culane/getopt/readme.txt
@@ -0,0 +1,3 @@
+For Windows builds, `getopt.c` and `getopt.h` are required.
+
+They are taken from the [iotivity](https://github.com/iotivity/iotivity) open source project, under Apache LICENSE 2.0.
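Before the evaluator headers, it helps to spell out the metric that `counter.hpp` and `counter.cpp` accumulate: lanes are matched to annotations per image by a Hungarian assignment on pairwise IoU similarity, matches above the threshold count as true positives, and the per-split counts are folded into an F-measure with the same formulas as `calTotal.m` (P = TP/(TP+FP), R = TP/(TP+FN), F = 2PR/(P+R)). A small sketch of that aggregation step, with invented counts:

```python
# Sketch of the split-level aggregation performed by calTotal.m; the
# (tp, fp, fn) numbers below are invented for illustration.
def f_measure(tp: int, fp: int, fn: int) -> float:
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)

splits = [(32000, 4000, 5000), (28500, 6200, 7100)]  # one tuple per scenario output
TP = sum(s[0] for s in splits)
FP = sum(s[1] for s in splits)
FN = sum(s[2] for s in splits)
print(f"F-measure: {f_measure(TP, FP, FN):.4f}")
```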
diff --git a/evaluation/culane/include/counter.hpp b/evaluation/culane/include/counter.hpp
new file mode 100644
index 0000000..a5d9988
--- /dev/null
+++ b/evaluation/culane/include/counter.hpp
@@ -0,0 +1,47 @@
+#ifndef COUNTER_HPP
+#define COUNTER_HPP
+
+#include "lane_compare.hpp"
+#include "hungarianGraph.hpp"
+#include <opencv2/core/core.hpp>
+#include <vector>
+#include <tuple>
+#include <iostream>
+#include <algorithm>
+
+using namespace std;
+using namespace cv;
+
+// before coming to use functions of this class, the lanes should resize to im_width and im_height using resize_lane() in lane_compare.hpp
+class Counter
+{
+  public:
+    Counter(int _im_width, int _im_height, double _iou_threshold=0.4, int _lane_width=10):tp(0),fp(0),fn(0){
+        im_width = _im_width;
+        im_height = _im_height;
+        sim_threshold = _iou_threshold;
+        lane_compare = new LaneCompare(_im_width, _im_height, _lane_width, LaneCompare::IOU);
+    };
+    double get_precision(void);
+    double get_recall(void);
+    long getTP(void);
+    long getFP(void);
+    long getFN(void);
+    void setTP(long);
+    void setFP(long);
+    void setFN(long);
+    // direct add tp, fp, tn and fn
+    // first match with hungarian
+    tuple<vector<int>, long, long, long, long> count_im_pair(const vector<vector<Point2f> > &anno_lanes, const vector<vector<Point2f> > &detect_lanes);
+    void makeMatch(const vector<vector<double> > &similarity, vector<int> &match1, vector<int> &match2);
+
+  private:
+    double sim_threshold;
+    int im_width;
+    int im_height;
+    long tp;
+    long fp;
+    long fn;
+    LaneCompare *lane_compare;
+};
+#endif
diff --git a/evaluation/culane/include/hungarianGraph.hpp b/evaluation/culane/include/hungarianGraph.hpp
new file mode 100644
index 0000000..058a9b0
--- /dev/null
+++ b/evaluation/culane/include/hungarianGraph.hpp
@@ -0,0 +1,71 @@
+#ifndef HUNGARIAN_GRAPH_HPP
+#define HUNGARIAN_GRAPH_HPP
+#include <vector>
+using namespace std;
+
+struct pipartiteGraph {
+    vector<vector<double> > mat;
+    vector<bool> leftUsed, rightUsed;
+    vector<double> leftWeight, rightWeight;
+    vector<int> rightMatch, leftMatch;
+    int leftNum, rightNum;
+    bool matchDfs(int u) {
+        leftUsed[u] = true;
+        for (int v = 0; v < rightNum; v++) {
+            if (!rightUsed[v] && fabs(leftWeight[u] + rightWeight[v] - mat[u][v]) < 1e-2) {
+                rightUsed[v] = true;
+                if (rightMatch[v] == -1 || matchDfs(rightMatch[v])) {
+                    rightMatch[v] = u;
+                    leftMatch[u] = v;
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+    void resize(int leftNum, int rightNum) {
+        this->leftNum = leftNum;
+        this->rightNum = rightNum;
+        leftMatch.resize(leftNum);
+        rightMatch.resize(rightNum);
+        leftUsed.resize(leftNum);
+        rightUsed.resize(rightNum);
+        leftWeight.resize(leftNum);
+        rightWeight.resize(rightNum);
+        mat.resize(leftNum);
+        for (int i = 0; i < leftNum; i++) mat[i].resize(rightNum);
+    }
+    void match() {
+        for (int i = 0; i < leftNum; i++) leftMatch[i] = -1;
+        for (int i = 0; i < rightNum; i++) rightMatch[i] = -1;
+        for (int i = 0; i < rightNum; i++) rightWeight[i] = 0;
+        for (int i = 0; i < leftNum; i++) {
+            leftWeight[i] = -1e5;
+            for (int j = 0; j < rightNum; j++) {
+                if (leftWeight[i] < mat[i][j]) leftWeight[i] = mat[i][j];
+            }
+        }
+
+        for (int u = 0; u < leftNum; u++) {
+            while (1) {
+                for (int i = 0; i < leftNum; i++) leftUsed[i] = false;
+                for (int i = 0; i < rightNum; i++) rightUsed[i] = false;
+                if (matchDfs(u)) break;
+                double d = 1e10;
+                for (int i = 0; i < leftNum; i++) {
+                    if (leftUsed[i]) {
+                        for (int j = 0; j < rightNum; j++) {
+                            if (!rightUsed[j]) d = min(d, leftWeight[i] + rightWeight[j] - mat[i][j]);
+                        }
+                    }
+                }
+                if (d == 1e10) return;
+                for (int i = 0; i < leftNum; i++) if (leftUsed[i]) leftWeight[i] -= d;
+                for (int i = 0; i < rightNum; i++) if (rightUsed[i]) rightWeight[i] += d;
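+                // Dual update of the Hungarian matching: d is the smallest slack
+                // leftWeight[i] + rightWeight[j] - mat[i][j] over edges between a
+                // visited left vertex and an unvisited right vertex. Shifting the
+                // potentials by d keeps matched edges tight and tightens at least
+                // one new edge, so the augmenting DFS can reach further next round.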
} + } + } +}; + + +#endif // HUNGARIAN_GRAPH_HPP diff --git a/evaluation/culane/include/lane_compare.hpp b/evaluation/culane/include/lane_compare.hpp new file mode 100644 index 0000000..7a77af0 --- /dev/null +++ b/evaluation/culane/include/lane_compare.hpp @@ -0,0 +1,37 @@ +#ifndef LANE_COMPARE_HPP +#define LANE_COMPARE_HPP + +#include "spline.hpp" +#include +#include +#include +#include + +using namespace std; +using namespace cv; + +class LaneCompare{ + public: + enum CompareMode{ + IOU, + Caltech + }; + + LaneCompare(int _im_width, int _im_height, int _lane_width = 10, CompareMode _compare_mode = IOU){ + im_width = _im_width; + im_height = _im_height; + compare_mode = _compare_mode; + lane_width = _lane_width; + } + + double get_lane_similarity(const vector &lane1, const vector &lane2); + void resize_lane(vector &curr_lane, int curr_width, int curr_height); + private: + CompareMode compare_mode; + int im_width; + int im_height; + int lane_width; + Spline splineSolver; +}; + +#endif diff --git a/evaluation/culane/include/spline.hpp b/evaluation/culane/include/spline.hpp new file mode 100644 index 0000000..7ebd7c6 --- /dev/null +++ b/evaluation/culane/include/spline.hpp @@ -0,0 +1,28 @@ +#ifndef SPLINE_HPP +#define SPLINE_HPP +#include +#include +#include +#include + +using namespace cv; +using namespace std; + +struct Func { + double a_x; + double b_x; + double c_x; + double d_x; + double a_y; + double b_y; + double c_y; + double d_y; + double h; +}; +class Spline { +public: + vector splineInterpTimes(const vector &tmp_line, int times); + vector splineInterpStep(vector tmp_line, double step); + vector cal_fun(const vector &point_v); +}; +#endif diff --git a/evaluation/culane/run-full.sh b/evaluation/culane/run-full.sh new file mode 100644 index 0000000..b49c356 --- /dev/null +++ b/evaluation/culane/run-full.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +root=../../ +data_dir=${root}data/CULane/ +exp=vgg_SCNN_DULR_w9 +detect_dir=${root}tools/prob2lines/output/${exp}/ +w_lane=30; +iou=0.5; # Set iou to 0.3 or 0.5 +im_w=1640 +im_h=590 +frame=1 +list0=${data_dir}list/test_split/test0_normal.txt +list1=${data_dir}list/test_split/test1_crowd.txt +list2=${data_dir}list/test_split/test2_hlight.txt +list3=${data_dir}list/test_split/test3_shadow.txt +list4=${data_dir}list/test_split/test4_noline.txt +list5=${data_dir}list/test_split/test5_arrow.txt +list6=${data_dir}list/test_split/test6_curve.txt +list7=${data_dir}list/test_split/test7_cross.txt +list8=${data_dir}list/test_split/test8_night.txt +out0=./output/out0_normal.txt +out1=./output/out1_crowd.txt +out2=./output/out2_hlight.txt +out3=./output/out3_shadow.txt +out4=./output/out4_noline.txt +out5=./output/out5_arrow.txt +out6=./output/out6_curve.txt +out7=./output/out7_cross.txt +out8=./output/out8_night.txt +./evaluate -a $data_dir -d $detect_dir -i $data_dir -l $list0 -w $w_lane -t $iou -c $im_w -r $im_h -f $frame -o $out0 +./evaluate -a $data_dir -d $detect_dir -i $data_dir -l $list1 -w $w_lane -t $iou -c $im_w -r $im_h -f $frame -o $out1 +./evaluate -a $data_dir -d $detect_dir -i $data_dir -l $list2 -w $w_lane -t $iou -c $im_w -r $im_h -f $frame -o $out2 +./evaluate -a $data_dir -d $detect_dir -i $data_dir -l $list3 -w $w_lane -t $iou -c $im_w -r $im_h -f $frame -o $out3 +./evaluate -a $data_dir -d $detect_dir -i $data_dir -l $list4 -w $w_lane -t $iou -c $im_w -r $im_h -f $frame -o $out4 +./evaluate -a $data_dir -d $detect_dir -i $data_dir -l $list5 -w $w_lane -t $iou -c $im_w -r $im_h -f $frame -o $out5 +./evaluate -a $data_dir -d 
$detect_dir -i $data_dir -l $list6 -w $w_lane -t $iou -c $im_w -r $im_h -f $frame -o $out6 +./evaluate -a $data_dir -d $detect_dir -i $data_dir -l $list7 -w $w_lane -t $iou -c $im_w -r $im_h -f $frame -o $out7 +./evaluate -a $data_dir -d $detect_dir -i $data_dir -l $list8 -w $w_lane -t $iou -c $im_w -r $im_h -f $frame -o $out8 +cat ./output/out*.txt>./output/${exp}_iou${iou}_split.txt diff --git a/evaluation/culane/run-lite.sh b/evaluation/culane/run-lite.sh new file mode 100644 index 0000000..710b67e --- /dev/null +++ b/evaluation/culane/run-lite.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +root=../../ +data_dir=${root}data/CULane/ +exp=vgg_SCNN_DULR_w9 +detect_dir=${root}tools/prob2lines/output/${exp}/ +w_lane=30; +iou=0.5; # Set iou to 0.3 or 0.5 +im_w=1640 +im_h=590 +frame=1 +list=${data_dir}list/test.txt +out=./output/${exp}_iou${iou}.txt +./evaluate -a $data_dir -d $detect_dir -i $data_dir -l $list -w $w_lane -t $iou -c $im_w -r $im_h -f $frame -o $out diff --git a/evaluation/culane/src/counter.cpp b/evaluation/culane/src/counter.cpp new file mode 100644 index 0000000..c1728bd --- /dev/null +++ b/evaluation/culane/src/counter.cpp @@ -0,0 +1,134 @@ +/************************************************************************* + > File Name: counter.cpp + > Author: Xingang Pan, Jun Li + > Mail: px117@ie.cuhk.edu.hk + > Created Time: Thu Jul 14 20:23:08 2016 + ************************************************************************/ + +#include "counter.hpp" + +double Counter::get_precision(void) +{ + cerr<<"tp: "<, long, long, long, long> Counter::count_im_pair(const vector > &anno_lanes, const vector > &detect_lanes) +{ + vector anno_match(anno_lanes.size(), -1); + vector detect_match; + if(anno_lanes.empty()) + { + return make_tuple(anno_match, 0, detect_lanes.size(), 0, 0); + } + + if(detect_lanes.empty()) + { + return make_tuple(anno_match, 0, 0, 0, anno_lanes.size()); + } + // hungarian match first + + // first calc similarity matrix + vector > similarity(anno_lanes.size(), vector(detect_lanes.size(), 0)); + for(int i=0; i &curr_anno_lane = anno_lanes[i]; + for(int j=0; j &curr_detect_lane = detect_lanes[j]; + similarity[i][j] = lane_compare->get_lane_similarity(curr_anno_lane, curr_detect_lane); + } + } + + + + makeMatch(similarity, anno_match, detect_match); + + + int curr_tp = 0; + // count and add + for(int i=0; i=0 && similarity[i][anno_match[i]] > sim_threshold) + { + curr_tp++; + } + else + { + anno_match[i] = -1; + } + } + int curr_fn = anno_lanes.size() - curr_tp; + int curr_fp = detect_lanes.size() - curr_tp; + return make_tuple(anno_match, curr_tp, curr_fp, 0, curr_fn); +} + + +void Counter::makeMatch(const vector > &similarity, vector &match1, vector &match2) { + int m = similarity.size(); + int n = similarity[0].size(); + pipartiteGraph gra; + bool have_exchange = false; + if (m > n) { + have_exchange = true; + swap(m, n); + } + gra.resize(m, n); + for (int i = 0; i < gra.leftNum; i++) { + for (int j = 0; j < gra.rightNum; j++) { + if(have_exchange) + gra.mat[i][j] = similarity[j][i]; + else + gra.mat[i][j] = similarity[i][j]; + } + } + gra.match(); + match1 = gra.leftMatch; + match2 = gra.rightMatch; + if (have_exchange) swap(match1, match2); +} diff --git a/evaluation/culane/src/evaluate.cpp b/evaluation/culane/src/evaluate.cpp new file mode 100644 index 0000000..f74240f --- /dev/null +++ b/evaluation/culane/src/evaluate.cpp @@ -0,0 +1,303 @@ +/************************************************************************* + > File Name: evaluate.cpp + > Author: Xingang Pan, 
Jun Li + > Mail: px117@ie.cuhk.edu.hk + > Created Time: 2016年07月14日 星期四 18时28分45秒 + ************************************************************************/ + +#include "counter.hpp" +#include "spline.hpp" +#if __linux__ +#include +#elif _MSC_VER +#include "getopt.h" +#endif +#include +#include +#include +#include +#include +#include +#include +using namespace std; +using namespace cv; + +void help(void) +{ + cout<<"./evaluate [OPTIONS]"< > &lanes, float x_factor, float y_factor); +void visualize(string &full_im_name, vector > &anno_lanes, vector > &detect_lanes, vector anno_match, int width_lane); + +int main(int argc, char **argv) +{ + // process params + string anno_dir = "/data/driving/eval_data/anno_label/"; + string detect_dir = "/data/driving/eval_data/predict_label/"; + string im_dir = "/data/driving/eval_data/img/"; + string list_im_file = "/data/driving/eval_data/img/all.txt"; + string output_file = "./output.txt"; + int width_lane = 10; + double iou_threshold = 0.4; + int im_width = 1920; + int im_height = 1080; + int oc; + bool show = false; + int frame = 1; + double x_factor = 1.0; + double y_factor = 1.0; + while((oc = getopt(argc, argv, "ha:d:i:l:w:t:c:r:sf:o:x:y:")) != -1) + { + switch(oc) + { + case 'h': + help(); + return 0; + case 'a': + anno_dir = optarg; + break; + case 'd': + detect_dir = optarg; + break; + case 'i': + im_dir = optarg; + break; + case 'l': + list_im_file = optarg; + break; + case 'w': + width_lane = atoi(optarg); + break; + case 't': + iou_threshold = atof(optarg); + break; + case 'c': + im_width = atoi(optarg); + break; + case 'r': + im_height = atoi(optarg); + break; + case 's': + show = true; + break; + case 'f': + frame = atoi(optarg); + break; + case 'o': + output_file = optarg; + break; + case 'x': + x_factor = atof(optarg); + break; + case 'y': + y_factor = atof(optarg); + break; + } + } + + + cout<<"------------Configuration---------"< anno_match; + string sub_im_name; + // pre-load filelist + vector filelists; + while (getline(ifs_im_list, sub_im_name)) { + filelists.push_back(sub_im_name); + } + ifs_im_list.close(); + + vector, long, long, long, long>> tuple_lists; + tuple_lists.resize(filelists.size()); + +#pragma omp parallel for + for (int i = 0; i < filelists.size(); i++) + { + auto sub_im_name = filelists[i]; + string full_im_name = im_dir + sub_im_name; + string sub_txt_name = sub_im_name.substr(0, sub_im_name.find_last_of(".")) + ".lines.txt"; + string anno_file_name = anno_dir + sub_txt_name; + string detect_file_name = detect_dir + sub_txt_name; + vector > anno_lanes; + vector > detect_lanes; + read_lane_file(anno_file_name, anno_lanes, x_factor, y_factor); + read_lane_file(detect_file_name, detect_lanes, x_factor, y_factor); + tuple_lists[i] = counter.count_im_pair(anno_lanes, detect_lanes); + } + long tp = 0, fp = 0, tn = 0, fn = 0; + for (auto result: tuple_lists) { + tp += get<1>(result); + fp += get<2>(result); + // tn = get<3>(result); + fn += get<4>(result); + } + counter.setTP(tp); + counter.setFP(fp); + counter.setFN(fn); + + double precision = counter.get_precision(); + double recall = counter.get_recall(); + double F = 2 * precision * recall / (precision + recall); + cerr<<"finished process file"< > &lanes, float x_factor, float y_factor) +{ + lanes.clear(); + ifstream ifs_lane(file_name, ios::in); + if(ifs_lane.fail()) + { + return; + } + + string str_line; + while(getline(ifs_lane, str_line)) + { + vector curr_lane; + stringstream ss; + ss<>x>>y) + { + curr_lane.push_back(Point2f(x* x_factor, y* y_factor)); + } + 
lanes.push_back(curr_lane); + } + + ifs_lane.close(); +} + +void visualize(string &full_im_name, vector > &anno_lanes, vector > &detect_lanes, vector anno_match, int width_lane) +{ + Mat img = imread(full_im_name, 1); + Mat img2 = imread(full_im_name, 1); + vector curr_lane; + vector p_interp; + Spline splineSolver; + Scalar color_B = Scalar(255, 0, 0); + Scalar color_G = Scalar(0, 255, 0); + Scalar color_R = Scalar(0, 0, 255); + Scalar color_P = Scalar(255, 0, 255); + Scalar color; + for (int i=0; i= 0) + { + color = color_G; + } + else + { + color = color_G; + } + for (int n=0; n File Name: lane_compare.cpp + > Author: Xingang Pan, Jun Li + > Mail: px117@ie.cuhk.edu.hk + > Created Time: Fri Jul 15 10:26:32 2016 + ************************************************************************/ + +#include "lane_compare.hpp" + +double LaneCompare::get_lane_similarity(const vector &lane1, const vector &lane2) +{ + if(lane1.size()<2 || lane2.size()<2) + { + cerr<<"lane size must be greater or equal to 2"< p_interp1; + vector p_interp2; + if(lane1.size() == 2) + { + p_interp1 = lane1; + } + else + { + p_interp1 = splineSolver.splineInterpTimes(lane1, 50); + } + + if(lane2.size() == 2) + { + p_interp2 = lane2; + } + else + { + p_interp2 = splineSolver.splineInterpTimes(lane2, 50); + } + + Scalar color_white = Scalar(1); + for(int n=0; n &curr_lane, int curr_width, int curr_height) +{ + if(curr_width == im_width && curr_height == im_height) + { + return; + } + double x_scale = im_width/(double)curr_width; + double y_scale = im_height/(double)curr_height; + for(int n=0; n +#include +#include "spline.hpp" +using namespace std; +using namespace cv; + +vector Spline::splineInterpTimes(const vector& tmp_line, int times) { + vector res; + + if(tmp_line.size() == 2) { + double x1 = tmp_line[0].x; + double y1 = tmp_line[0].y; + double x2 = tmp_line[1].x; + double y2 = tmp_line[1].y; + + for (int k = 0; k <= times; k++) { + double xi = x1 + double((x2 - x1) * k) / times; + double yi = y1 + double((y2 - y1) * k) / times; + res.push_back(Point2f(xi, yi)); + } + } + + else if(tmp_line.size() > 2) + { + vector tmp_func; + tmp_func = this->cal_fun(tmp_line); + if (tmp_func.empty()) { + cout << "in splineInterpTimes: cal_fun failed" << endl; + return res; + } + for(int j = 0; j < tmp_func.size(); j++) + { + double delta = tmp_func[j].h / times; + for(int k = 0; k < times; k++) + { + double t1 = delta*k; + double x1 = tmp_func[j].a_x + tmp_func[j].b_x*t1 + tmp_func[j].c_x*pow(t1,2) + tmp_func[j].d_x*pow(t1,3); + double y1 = tmp_func[j].a_y + tmp_func[j].b_y*t1 + tmp_func[j].c_y*pow(t1,2) + tmp_func[j].d_y*pow(t1,3); + res.push_back(Point2f(x1, y1)); + } + } + res.push_back(tmp_line[tmp_line.size() - 1]); + } + else { + cerr << "in splineInterpTimes: not enough points" << endl; + } + return res; +} +vector Spline::splineInterpStep(vector tmp_line, double step) { + vector res; + /* + if (tmp_line.size() == 2) { + double x1 = tmp_line[0].x; + double y1 = tmp_line[0].y; + double x2 = tmp_line[1].x; + double y2 = tmp_line[1].y; + + for (double yi = std::min(y1, y2); yi < std::max(y1, y2); yi += step) { + double xi; + if (yi == y1) xi = x1; + else xi = (x2 - x1) / (y2 - y1) * (yi - y1) + x1; + res.push_back(Point2f(xi, yi)); + } + }*/ + if (tmp_line.size() == 2) { + double x1 = tmp_line[0].x; + double y1 = tmp_line[0].y; + double x2 = tmp_line[1].x; + double y2 = tmp_line[1].y; + tmp_line[1].x = (x1 + x2) / 2; + tmp_line[1].y = (y1 + y2) / 2; + tmp_line.push_back(Point2f(x2, y2)); + } + if (tmp_line.size() > 2) { + vector 
tmp_func; + tmp_func = this->cal_fun(tmp_line); + double ystart = tmp_line[0].y; + double yend = tmp_line[tmp_line.size() - 1].y; + bool down; + if (ystart < yend) down = 1; + else down = 0; + if (tmp_func.empty()) { + cerr << "in splineInterpStep: cal_fun failed" << endl; + } + + for(int j = 0; j < tmp_func.size(); j++) + { + for(double t1 = 0; t1 < tmp_func[j].h; t1 += step) + { + double x1 = tmp_func[j].a_x + tmp_func[j].b_x*t1 + tmp_func[j].c_x*pow(t1,2) + tmp_func[j].d_x*pow(t1,3); + double y1 = tmp_func[j].a_y + tmp_func[j].b_y*t1 + tmp_func[j].c_y*pow(t1,2) + tmp_func[j].d_y*pow(t1,3); + res.push_back(Point2f(x1, y1)); + } + } + res.push_back(tmp_line[tmp_line.size() - 1]); + } + else { + cerr << "in splineInterpStep: not enough points" << endl; + } + return res; +} + +vector Spline::cal_fun(const vector &point_v) +{ + vector func_v; + int n = point_v.size(); + if(n<=2) { + cout << "in cal_fun: point number less than 3" << endl; + return func_v; + } + + func_v.resize(point_v.size()-1); + + vector Mx(n); + vector My(n); + vector A(n-2); + vector B(n-2); + vector C(n-2); + vector Dx(n-2); + vector Dy(n-2); + vector h(n-1); + //vector func_v(n-1); + + for(int i = 0; i < n-1; i++) + { + h[i] = sqrt(pow(point_v[i+1].x - point_v[i].x, 2) + pow(point_v[i+1].y - point_v[i].y, 2)); + } + + for(int i = 0; i < n-2; i++) + { + A[i] = h[i]; + B[i] = 2*(h[i]+h[i+1]); + C[i] = h[i+1]; + + Dx[i] = 6*( (point_v[i+2].x - point_v[i+1].x)/h[i+1] - (point_v[i+1].x - point_v[i].x)/h[i] ); + Dy[i] = 6*( (point_v[i+2].y - point_v[i+1].y)/h[i+1] - (point_v[i+1].y - point_v[i].y)/h[i] ); + } + + //TDMA + C[0] = C[0] / B[0]; + Dx[0] = Dx[0] / B[0]; + Dy[0] = Dy[0] / B[0]; + for(int i = 1; i < n-2; i++) + { + double tmp = B[i] - A[i]*C[i-1]; + C[i] = C[i] / tmp; + Dx[i] = (Dx[i] - A[i]*Dx[i-1]) / tmp; + Dy[i] = (Dy[i] - A[i]*Dy[i-1]) / tmp; + } + Mx[n-2] = Dx[n-3]; + My[n-2] = Dy[n-3]; + for(int i = n-4; i >= 0; i--) + { + Mx[i+1] = Dx[i] - C[i]*Mx[i+2]; + My[i+1] = Dy[i] - C[i]*My[i+2]; + } + + Mx[0] = 0; + Mx[n-1] = 0; + My[0] = 0; + My[n-1] = 0; + + for(int i = 0; i < n-1; i++) + { + func_v[i].a_x = point_v[i].x; + func_v[i].b_x = (point_v[i+1].x - point_v[i].x)/h[i] - (2*h[i]*Mx[i] + h[i]*Mx[i+1]) / 6; + func_v[i].c_x = Mx[i]/2; + func_v[i].d_x = (Mx[i+1] - Mx[i]) / (6*h[i]); + + func_v[i].a_y = point_v[i].y; + func_v[i].b_y = (point_v[i+1].y - point_v[i].y)/h[i] - (2*h[i]*My[i] + h[i]*My[i+1]) / 6; + func_v[i].c_y = My[i]/2; + func_v[i].d_y = (My[i+1] - My[i]) / (6*h[i]); + + func_v[i].h = h[i]; + } + return func_v; +} diff --git a/evaluation/eval_wrapper.py b/evaluation/eval_wrapper.py new file mode 100644 index 0000000..09ff523 --- /dev/null +++ b/evaluation/eval_wrapper.py @@ -0,0 +1,1353 @@ +import json +import os +import platform + +import numpy as np +import scipy +import torch +from scipy.optimize import leastsq + +from data.constant import culane_col_anchor +from data.constant import culane_row_anchor +from data.dataloader import get_test_loader +from evaluation.tusimple.lane2 import LaneEval +from utils.dist_utils import dist_print +from utils.dist_utils import dist_tqdm +from utils.dist_utils import get_rank +from utils.dist_utils import get_world_size +from utils.dist_utils import is_main_process +from utils.dist_utils import synchronize + + +def generate_lines(out, out_ext, shape, names, output_path, griding_num, localization_type="abs", flip_updown=False): + grid = torch.arange(out.shape[1]) + 0.5 + grid = grid.view(1, -1, 1, 1).cuda() + loc = (out.softmax(1) * grid).sum(1) + + loc = loc / 
(out.shape[1] - 1) * 1640 + # n, num_cls, num_lanes + valid = out_ext.argmax(1) + # n, num_cls, num_lanes + valid = valid.cpu() + loc = loc.cpu() + + for j in range(valid.shape[0]): + name = names[j] + line_save_path = os.path.join(output_path, name[:-3] + "lines.txt") + save_dir, _ = os.path.split(line_save_path) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + with open(line_save_path, "w") as fp: + for i in [1, 2]: + if valid[j, :, i].sum() > 2: + for k in range(valid.shape[1]): + if valid[j, k, i]: + fp.write("{:.3f} {:.3f} ".format(loc[j, k, i], culane_row_anchor[k] * 590)) + fp.write("\n") + + +def generate_lines_col( + out_col, out_col_ext, shape, names, output_path, griding_num, localization_type="abs", flip_updown=False +): + grid = torch.arange(out_col.shape[1]) + 0.5 + grid = grid.view(1, -1, 1, 1).cuda() + loc = (out_col.softmax(1) * grid).sum(1) + + loc = loc / (out_col.shape[1] - 1) * 590 + # n, num_cls, num_lanes + valid = out_col_ext.argmax(1) + # n, num_cls, num_lanes + valid = valid.cpu() + loc = loc.cpu() + + for j in range(valid.shape[0]): + name = names[j] + line_save_path = os.path.join(output_path, name[:-3] + "lines.txt") + save_dir, _ = os.path.split(line_save_path) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + with open(line_save_path, "a") as fp: + for i in [0, 3]: + if valid[j, :, i].sum() > 2: + for k in range(valid.shape[1]): + if valid[j, k, i]: + fp.write("{:.3f} {:.3f} ".format(culane_col_anchor[k] * 1640, loc[j, k, i])) + fp.write("\n") + + +def postprocess( + max_indices: torch.Tensor, + out: torch.Tensor, + i: int, + j: int, + k: int, + local_width: int = 1, +): + indices_from = max(0, max_indices[j, k, i] - local_width) + indices_to = min(out.shape[1] - 1, max_indices[j, k, i] + local_width) + 1 + all_ind = torch.tensor(list(range(indices_from, indices_to))) + + out_tmp = (out[j, all_ind, k, i].softmax(0) * all_ind.float()).sum() + 0.5 + return out_tmp + + +def generate_lines_local(dataset, out, out_ext, names, output_path, mode="normal", row_anchor=None): + batch_size, num_grid_row, num_cls, num_lane = out.shape + max_indices = out.argmax(1).cpu() + # n , num_cls, num_lanes + + valid = out_ext.argmax(1).cpu() + # n, num_cls, num_lanes + out = out.cpu() + + if mode == "normal" or mode == "2row2col": + if dataset == "CULane": + lane_list = [1, 2] + elif dataset == "CurveLanes": + # lane_list = [2, 3, 4, 5, 6, 7] + lane_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + else: + lane_list = range(num_lane) + + local_width = 1 + for j in range(valid.shape[0]): + name = names[j] + line_save_path = os.path.join(output_path, name[:-3] + "lines.txt") + save_dir, _ = os.path.split(line_save_path) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + with open(line_save_path, "w") as fp: + # for i in range(num_lane): + for i in lane_list: + if valid[j, :, i].sum() > num_cls / 2: + for k in range(valid.shape[1]): + if valid[j, k, i]: + # TODO: refactor + out_tmp = postprocess( + max_indices=max_indices, + local_width=local_width, + out=out, + i=i, + j=j, + k=k, + ) + + if dataset == "CULane": + out_tmp = out_tmp / (out.shape[1] - 1) * 1640 + fp.write("{:.3f} {:.3f} ".format(out_tmp, row_anchor[k] * 590)) + elif dataset == "CurveLanes": + out_tmp = out_tmp / (out.shape[1] - 1) * 2560 + fp.write("{:.3f} {:.3f} ".format(out_tmp, row_anchor[k] * 1440)) + else: + raise Exception + fp.write("\n") + elif mode == "all": + fp.write("\n") + + +def generate_lines_col_local(dataset, out_col, out_col_ext, names, output_path, mode="normal", 
col_anchor=None): + batch_size, num_grid_col, num_cls, num_lane = out_col.shape + max_indices = out_col.argmax(1).cpu() + # n, num_cls, num_lanes + valid = out_col_ext.argmax(1).cpu() + # n, num_cls, num_lanes + out_col = out_col.cpu() + local_width = 1 + + if mode == "normal" or mode == "2row2col": + if dataset == "CULane": + lane_list = [0, 3] + elif dataset == "CurveLanes": + # lane_list = [0, 1, 8, 9] + lane_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + else: + lane_list = range(num_lane) + + for j in range(valid.shape[0]): + name = names[j] + line_save_path = os.path.join(output_path, name[:-3] + "lines.txt") + save_dir, _ = os.path.split(line_save_path) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + with open(line_save_path, "a") as fp: + # for i in range(num_lane): + for i in lane_list: + if valid[j, :, i].sum() > num_cls / 4: + for k in range(valid.shape[1]): + if valid[j, k, i]: + out_tmp = postprocess( + max_indices=max_indices, + local_width=local_width, + out=out_col, + i=i, + j=j, + k=k, + ) + if dataset == "CULane": + out_tmp = out_tmp / (out_col.shape[1] - 1) * 590 + fp.write("{:.3f} {:.3f} ".format(col_anchor[k] * 1640, out_tmp)) + elif dataset == "CurveLanes": + out_tmp = out_tmp / (out_col.shape[1] - 1) * 1440 + fp.write("{:.3f} {:.3f} ".format(col_anchor[k] * 2560, out_tmp)) + else: + raise Exception + + fp.write("\n") + elif mode == "all": + fp.write("\n") + + +def generate_lines_local_curve_combine(dataset, out, out_ext, names, output_path, mode="normal", row_anchor=None): + batch_size, num_grid_row, num_cls, num_lane = out.shape + max_indices = out.argmax(1).cpu() + # n , num_cls, num_lanes + + valid = out_ext.argmax(1).cpu() + # n, num_cls, num_lanes + out = out.cpu() + + if mode == "normal" or mode == "2row2col": + if dataset == "CULane": + lane_list = [1, 2] + elif dataset == "CurveLanes": + # lane_list = [2, 3, 4, 5, 6, 7] + lane_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + else: + lane_list = range(num_lane) + + local_width = 1 + for j in range(valid.shape[0]): + # import pdb; pdb.set_trace() + + name = names[j] + line_save_path = os.path.join(output_path, name[:-3] + "lines_row.txt") + save_dir, _ = os.path.split(line_save_path) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + with open(line_save_path, "w") as fp: + # for i in range(num_lane): + for i in lane_list: + if valid[j, :, i].sum() > num_cls / 4: + for k in range(valid.shape[1]): + if valid[j, k, i]: + out_tmp = postprocess( + max_indices=max_indices, + local_width=local_width, + out=out, + i=i, + j=j, + k=k, + ) + if dataset == "CULane": + out_tmp = out_tmp / (out.shape[1] - 1) * 1640 + fp.write("{:.3f} {:.3f} ".format(out_tmp, row_anchor[k] * 590)) + elif dataset == "CurveLanes": + out_tmp = out_tmp / (out.shape[1] - 1) * 2560 + fp.write("{:.3f} {:.3f} ".format(out_tmp, row_anchor[k] * 1440)) + else: + raise Exception + fp.write("\n") + else: + fp.write("\n") + + +def generate_lines_col_local_curve_combine( + dataset, out_col, out_col_ext, names, output_path, mode="normal", col_anchor=None +): + batch_size, num_grid_col, num_cls, num_lane = out_col.shape + max_indices = out_col.argmax(1).cpu() + # n, num_cls, num_lanes + valid = out_col_ext.argmax(1).cpu() + # n, num_cls, num_lanes + out_col = out_col.cpu() + local_width = 1 + + if mode == "normal" or mode == "2row2col": + if dataset == "CULane": + lane_list = [0, 3] + elif dataset == "CurveLanes": + # lane_list = [0, 1, 8, 9] + lane_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + else: + lane_list = range(num_lane) + + for j in 
range(valid.shape[0]): + # import pdb; pdb.set_trace() + + name = names[j] + line_save_path = os.path.join(output_path, name[:-3] + "lines_col.txt") + save_dir, _ = os.path.split(line_save_path) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + with open(line_save_path, "w") as fp: + # for i in range(num_lane): + for i in lane_list: + if valid[j, :, i].sum() > num_cls / 4: + for k in range(valid.shape[1]): + if valid[j, k, i]: + out_tmp = postprocess( + max_indices=max_indices, + local_width=local_width, + out=out_col, + i=i, + j=j, + k=k, + ) + if dataset == "CULane": + out_tmp = out_tmp / (out_col.shape[1] - 1) * 590 + fp.write("{:.3f} {:.3f} ".format(col_anchor[k] * 1640, out_tmp)) + elif dataset == "CurveLanes": + out_tmp = out_tmp / (out_col.shape[1] - 1) * 1440 + fp.write("{:.3f} {:.3f} ".format(col_anchor[k] * 2560, out_tmp)) + else: + raise Exception + + fp.write("\n") + # elif mode == 'all': + # fp.write('\n') + else: + fp.write("\n") + + +def revise_lines_curve_combine(names, output_path): + for name in names: + line_save_path = os.path.join(output_path, name[:-3] + "lines.txt") + row_line_save_path = os.path.join(output_path, name[:-3] + "lines_row.txt") + col_line_save_path = os.path.join(output_path, name[:-3] + "lines_col.txt") + if not os.path.exists(row_line_save_path): + continue + if not os.path.exists(col_line_save_path): + continue + with open(row_line_save_path) as fp: + row_lines = fp.readlines() + with open(col_line_save_path) as fp: + col_lines = fp.readlines() + flag = True + for i in range(10): + x1, y1 = coordinate_parse(row_lines[i]) + x2, y2 = coordinate_parse(col_lines[i]) + x = x1 + x2 + y = y1 + y2 + if x == [] or y == []: + continue + x = np.array(x) + y = np.array(y) + + p_init = np.random.randn(3) + para_x = leastsq(resudual, p_init, args=(x, y)) + y_temp = func(para_x[0], x) + y_error = np.mean(np.square(y_temp - y)) + + para_y = leastsq(resudual, p_init, args=(y, x)) + x_temp = func(para_y[0], y) + x_error = np.mean(np.square(x_temp - x)) + + if x_error > y_error: + x_new = np.linspace(min(x), max(x), 36) + y_new = func(para_x[0], x_new) + else: + y_new = np.linspace(min(y), max(y), 41) + x_new = func(para_y[0], y_new) + + if flag: + fp = open(line_save_path, "w") + flag = False + else: + fp = open(line_save_path, "a") + for i in range(x_new.shape[0]): + fp.write("{:.3f} {:.3f} ".format(x_new[i], y_new[i])) + fp.write("\n") + fp.close() + if flag: + fp = open(line_save_path, "w") + fp.close() + + +def generate_lines_reg(out, out_ext, names, output_path, mode="normal", row_anchor=None): + batch_size, num_grid_row, num_cls, num_lane = out.shape + # n , num_cls, num_lanes + + valid = out_ext.argmax(1).cpu() + # n, num_cls, num_lanes + out = out.cpu().sigmoid() + + if mode == "normal" or mode == "2row2col": + lane_list = [1, 2] + else: + lane_list = range(num_lane) + + local_width = 1 + for j in range(valid.shape[0]): + name = names[j] + line_save_path = os.path.join(output_path, name[:-3] + "lines.txt") + save_dir, _ = os.path.split(line_save_path) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + with open(line_save_path, "w") as fp: + # for i in range(num_lane): + for i in lane_list: + if valid[j, :, i].sum() > num_cls / 2: + for k in range(valid.shape[1]): + if valid[j, k, i]: + # all_ind = torch.tensor(list(range(max(0,max_indices[j,k,i] - local_width), min(out.shape[1]-1, max_indices[j,k,i] + local_width) + 1))) + + out_tmp = out[j, 0, k, i] * 1640 + + fp.write("{:.3f} {:.3f} ".format(out_tmp, row_anchor[k] * 590)) + 
fp.write("\n") + elif mode == "all": + fp.write("\n") + + +def generate_lines_col_reg(out_col, out_col_ext, names, output_path, mode="normal", col_anchor=None): + batch_size, num_grid_col, num_cls, num_lane = out_col.shape + # max_indices = out_col.argmax(1).cpu() + # n, num_cls, num_lanes + valid = out_col_ext.argmax(1).cpu() + # n, num_cls, num_lanes + out_col = out_col.cpu().sigmoid() + local_width = 1 + + if mode == "normal" or mode == "2row2col": + lane_list = [0, 3] + else: + lane_list = range(num_lane) + + for j in range(valid.shape[0]): + name = names[j] + line_save_path = os.path.join(output_path, name[:-3] + "lines.txt") + save_dir, _ = os.path.split(line_save_path) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + with open(line_save_path, "a") as fp: + # for i in range(num_lane): + for i in lane_list: + if valid[j, :, i].sum() > num_cls / 4: + for k in range(valid.shape[1]): + if valid[j, k, i]: + # all_ind = torch.tensor(list(range(max(0,max_indices[j,k,i] - local_width), min(out_col.shape[1]-1, max_indices[j,k,i] + local_width) + 1))) + # out_tmp = (out_col[j,all_ind,k,i].softmax(0) * all_ind.float()).sum() + 0.5 + out_tmp = out_col[j, 0, k, i] * 590 + fp.write("{:.3f} {:.3f} ".format(col_anchor[k] * 1640, out_tmp)) + fp.write("\n") + elif mode == "all": + fp.write("\n") + + +def coordinate_parse(line): + if line == "\n": + return [], [] + + items = line.split(" ")[:-1] + x = [float(items[2 * i]) for i in range(len(items) // 2)] + y = [float(items[2 * i + 1]) for i in range(len(items) // 2)] + + return x, y + + +def func(p, x): + f = np.poly1d(p) + return f(x) + + +def resudual(p, x, y): + error = y - func(p, x) + return error + + +def revise_lines(names, output_path): + for name in names: + line_save_path = os.path.join(output_path, name[:-3] + "lines.txt") + if not os.path.exists(line_save_path): + continue + with open(line_save_path) as fp: + lines = fp.readlines() + flag = True + for i in range(4): + x1, y1 = coordinate_parse(lines[i]) + x2, y2 = coordinate_parse(lines[i + 4]) + x = x1 + x2 + y = y1 + y2 + if x == [] or y == []: + continue + x = np.array(x) + y = np.array(y) + + p_init = np.random.randn(3) + para_x = leastsq(resudual, p_init, args=(x, y)) + y_temp = func(para_x[0], x) + y_error = np.mean(np.square(y_temp - y)) + + para_y = leastsq(resudual, p_init, args=(y, x)) + x_temp = func(para_y[0], y) + x_error = np.mean(np.square(x_temp - x)) + + if x_error > y_error: + x_new = np.linspace(min(x), max(x), 18) + y_new = func(para_x[0], x_new) + else: + y_new = np.linspace(min(y), max(y), 41) + x_new = func(para_y[0], y_new) + + if flag: + fp = open(line_save_path, "w") + flag = False + else: + fp = open(line_save_path, "a") + for i in range(x_new.shape[0]): + fp.write("{:.3f} {:.3f} ".format(x_new[i], y_new[i])) + fp.write("\n") + fp.close() + if flag: + fp = open(line_save_path, "w") + fp.close() + + +def rectify_lines(names, output_path): + for name in names: + line_save_path = os.path.join(output_path, name[:-3] + "lines.txt") + if not os.path.exists(line_save_path): + continue + with open(line_save_path) as fp: + lines = fp.readlines() + flag = True + for line in lines: + x, y = coordinate_parse(line) + if x == [] or y == []: + continue + x = np.array(x) + y = np.array(y) + + p_init = np.random.randn(3) + para_x = leastsq(resudual, p_init, args=(x, y)) + y_temp = func(para_x[0], x) + y_error = np.mean(np.square(y_temp - y)) + + para_y = leastsq(resudual, p_init, args=(y, x)) + x_temp = func(para_y[0], y) + x_error = np.mean(np.square(x_temp - x)) + + 
if x_error > y_error: + x_new = np.linspace(min(x), max(x), 18) + y_new = func(para_x[0], x_new) + else: + y_new = np.linspace(min(y), max(y), 41) + x_new = func(para_y[0], y_new) + + if flag: + fp = open(line_save_path, "w") + flag = False + else: + fp = open(line_save_path, "a") + for i in range(x_new.shape[0]): + fp.write("{:.3f} {:.3f} ".format(x_new[i], y_new[i])) + fp.write("\n") + fp.close() + if flag: + fp = open(line_save_path, "w") + fp.close() + + +def run_test( + dataset, + net, + data_root, + exp_name, + work_dir, + distributed, + crop_ratio, + train_width, + train_height, + batch_size=8, + row_anchor=None, + col_anchor=None, +): + # torch.backends.cudnn.benchmark = True + output_path = os.path.join(work_dir, exp_name) + if not os.path.exists(output_path) and is_main_process(): + os.mkdir(output_path) + synchronize() + loader = get_test_loader(batch_size, data_root, dataset, distributed, crop_ratio, train_width, train_height) + # import pdb;pdb.set_trace() + for i, data in enumerate(dist_tqdm(loader)): + imgs, names = data + imgs = imgs.cuda() + with torch.no_grad(): + pred = net(imgs) + + if dataset == "CULane": + generate_lines_local( + dataset, pred["loc_row"], pred["exist_row"], names, output_path, "normal", row_anchor=row_anchor + ) + generate_lines_col_local( + dataset, pred["loc_col"], pred["exist_col"], names, output_path, "normal", col_anchor=col_anchor + ) + elif dataset == "CurveLanes": + generate_lines_local_curve_combine( + dataset, pred["loc_row"], pred["exist_row"], names, output_path, row_anchor=row_anchor + ) + generate_lines_col_local_curve_combine( + dataset, pred["loc_col"], pred["exist_col"], names, output_path, col_anchor=col_anchor + ) + revise_lines_curve_combine(names, output_path) + else: + raise NotImplementedError + + +def generate_lines_local_tta( + loc_row, loc_row_left, loc_row_right, exist_row, exist_row_left, exist_row_right, names, output_path, row_anchor +): + local_width = 1 + + max_indices = loc_row.argmax(1).cpu() + valid = exist_row.argmax(1).cpu() + loc_row = loc_row.cpu() + + max_indices_left = loc_row_left.argmax(1).cpu() + valid_left = exist_row_left.argmax(1).cpu() + loc_row_left = loc_row_left.cpu() + + max_indices_right = loc_row_right.argmax(1).cpu() + valid_right = exist_row_right.argmax(1).cpu() + loc_row_right = loc_row_right.cpu() + + batch_size, num_grid, num_cls, num_lane = loc_row.shape + + min_lane_length = num_cls / 2 + + for batch_idx in range(batch_size): + name = names[batch_idx] + line_save_path = os.path.join(output_path, name.replace("jpg", "lines.txt")) + save_dir, _ = os.path.split(line_save_path) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + with open(line_save_path, "w") as fp: + # for lane_idx in range(num_lane): + for lane_idx in [1, 2]: + if valid[batch_idx, :, lane_idx].sum() >= min_lane_length: + pt_all = [] + for cls_idx in range(num_cls): + cnt = 0 + out_tmp_all = 0 + if valid[batch_idx, cls_idx, lane_idx]: + all_ind = torch.tensor( + list( + range( + max(0, max_indices[batch_idx, cls_idx, lane_idx] - local_width), + min(num_grid - 1, max_indices[batch_idx, cls_idx, lane_idx] + local_width) + 1, + ) + ) + ) + out_tmp = ( + loc_row[batch_idx, all_ind, cls_idx, lane_idx].softmax(0) * all_ind.float() + ).sum() + 0.5 + out_tmp = out_tmp / (num_grid - 1) * 1640 + cnt += 1 + out_tmp_all = out_tmp_all + out_tmp + + if valid_left[batch_idx, cls_idx, lane_idx]: + all_ind_left = torch.tensor( + list( + range( + max(0, max_indices_left[batch_idx, cls_idx, lane_idx] - local_width), + min(num_grid - 1, 
max_indices_left[batch_idx, cls_idx, lane_idx] + local_width) + + 1, + ) + ) + ) + + out_tmp_left = ( + loc_row_left[batch_idx, all_ind_left, cls_idx, lane_idx].softmax(0) + * all_ind_left.float() + ).sum() + 0.5 + out_tmp_left = out_tmp_left / (num_grid - 1) * 1640 + 1640.0 / 25 + cnt += 1 + out_tmp_all = out_tmp_all + out_tmp_left + + if valid_right[batch_idx, cls_idx, lane_idx]: + all_ind_right = torch.tensor( + list( + range( + max(0, max_indices_right[batch_idx, cls_idx, lane_idx] - local_width), + min(num_grid - 1, max_indices_right[batch_idx, cls_idx, lane_idx] + local_width) + + 1, + ) + ) + ) + + out_tmp_right = ( + loc_row_right[batch_idx, all_ind_right, cls_idx, lane_idx].softmax(0) + * all_ind_right.float() + ).sum() + 0.5 + out_tmp_right = out_tmp_right / (num_grid - 1) * 1640 - 1640.0 / 25 + cnt += 1 + out_tmp_all = out_tmp_all + out_tmp_right + + if cnt >= 2: + pt_all.append((out_tmp_all / cnt, row_anchor[cls_idx] * 590)) + if len(pt_all) < min_lane_length: + continue + for pt in pt_all: + fp.write("%.3f %.3f " % pt) + fp.write("\n") + + +def generate_lines_col_local_tta( + loc_col, loc_col_up, loc_col_down, exist_col, exist_col_up, exist_col_down, names, output_path, col_anchor +): + local_width = 1 + + max_indices = loc_col.argmax(1).cpu() + valid = exist_col.argmax(1).cpu() + loc_col = loc_col.cpu() + + max_indices_up = loc_col_up.argmax(1).cpu() + valid_up = exist_col_up.argmax(1).cpu() + loc_col_up = loc_col_up.cpu() + + max_indices_down = loc_col_down.argmax(1).cpu() + valid_down = exist_col_down.argmax(1).cpu() + loc_col_down = loc_col_down.cpu() + + batch_size, num_grid, num_cls, num_lane = loc_col.shape + + min_lane_length = num_cls / 4 + + for batch_idx in range(batch_size): + name = names[batch_idx] + line_save_path = os.path.join(output_path, name.replace("jpg", "lines.txt")) + save_dir, _ = os.path.split(line_save_path) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + with open(line_save_path, "a") as fp: + # for lane_idx in range(num_lane): + for lane_idx in [0, 3]: + if valid[batch_idx, :, lane_idx].sum() >= min_lane_length: + pt_all = [] + for cls_idx in range(num_cls): + cnt = 0 + out_tmp_all = 0 + if valid[batch_idx, cls_idx, lane_idx]: + all_ind = torch.tensor( + list( + range( + max(0, max_indices[batch_idx, cls_idx, lane_idx] - local_width), + min(num_grid - 1, max_indices[batch_idx, cls_idx, lane_idx] + local_width) + 1, + ) + ) + ) + out_tmp = ( + loc_col[batch_idx, all_ind, cls_idx, lane_idx].softmax(0) * all_ind.float() + ).sum() + 0.5 + out_tmp = out_tmp / (num_grid - 1) * 590 + cnt += 1 + out_tmp_all = out_tmp_all + out_tmp + + if valid_up[batch_idx, cls_idx, lane_idx]: + all_ind_up = torch.tensor( + list( + range( + max(0, max_indices_up[batch_idx, cls_idx, lane_idx] - local_width), + min(num_grid - 1, max_indices_up[batch_idx, cls_idx, lane_idx] + local_width) + + 1, + ) + ) + ) + out_tmp_up = ( + loc_col_up[batch_idx, all_ind_up, cls_idx, lane_idx].softmax(0) * all_ind_up.float() + ).sum() + 0.5 + out_tmp_up = out_tmp_up / (num_grid - 1) * 590 + 32.0 / 534 * 590 + cnt += 1 + out_tmp_all = out_tmp_all + out_tmp_up + if valid_down[batch_idx, cls_idx, lane_idx]: + all_ind_down = torch.tensor( + list( + range( + max(0, max_indices_down[batch_idx, cls_idx, lane_idx] - local_width), + min(num_grid - 1, max_indices_down[batch_idx, cls_idx, lane_idx] + local_width) + + 1, + ) + ) + ) + out_tmp_down = ( + loc_col_down[batch_idx, all_ind_down, cls_idx, lane_idx].softmax(0) + * all_ind_down.float() + ).sum() + 0.5 + out_tmp_down = out_tmp_down 
/ (num_grid - 1) * 590 - 32.0 / 534 * 590 + cnt += 1 + out_tmp_all = out_tmp_all + out_tmp_down + + if cnt >= 2: + pt_all.append((col_anchor[cls_idx] * 1640, out_tmp_all / cnt)) + if len(pt_all) < min_lane_length: + continue + for pt in pt_all: + fp.write("%.3f %.3f " % pt) + fp.write("\n") + + +def run_test_tta( + dataset, + net, + data_root, + exp_name, + work_dir, + distributed, + crop_ratio, + train_width, + train_height, + batch_size=8, + row_anchor=None, + col_anchor=None, +): + output_path = os.path.join(work_dir, exp_name) + if not os.path.exists(output_path) and is_main_process(): + os.mkdir(output_path) + synchronize() + loader = get_test_loader(batch_size, data_root, dataset, distributed, crop_ratio, train_width, train_height) + # import pdb;pdb.set_trace() + for i, data in enumerate(dist_tqdm(loader)): + imgs, names = data + imgs = imgs.cuda() + with torch.no_grad(): + if hasattr(net, "module"): + pred = net.module.forward_tta(imgs) + else: + pred = net.forward_tta(imgs) + + loc_row, loc_row_left, loc_row_right, _, _ = torch.chunk(pred["loc_row"], 5) + loc_col, _, _, loc_col_up, loc_col_down = torch.chunk(pred["loc_col"], 5) + + exist_row, exist_row_left, exist_row_right, _, _ = torch.chunk(pred["exist_row"], 5) + exist_col, _, _, exist_col_up, exist_col_down = torch.chunk(pred["exist_col"], 5) + + generate_lines_local_tta( + loc_row, + loc_row_left, + loc_row_right, + exist_row, + exist_row_left, + exist_row_right, + names, + output_path, + row_anchor, + ) + generate_lines_col_local_tta( + loc_col, loc_col_up, loc_col_down, exist_col, exist_col_up, exist_col_down, names, output_path, col_anchor + ) + + +def generate_tusimple_lines(row_out, row_ext, col_out, col_ext, row_anchor=None, col_anchor=None, mode="2row2col"): + tusimple_h_sample = np.linspace(160, 710, 56) + row_num_grid, row_num_cls, row_num_lane = row_out.shape + row_max_indices = row_out.argmax(0).cpu() + # num_cls, num_lanes + row_valid = row_ext.argmax(0).cpu() + # num_cls, num_lanes + row_out = row_out.cpu() + + col_num_grid, col_num_cls, col_num_lane = col_out.shape + col_max_indices = col_out.argmax(0).cpu() + # num_cls, num_lanes + col_valid = col_ext.argmax(0).cpu() + # num_cls, num_lanes + col_out = col_out.cpu() + + # mode = '2row2col' + + if mode == "normal" or mode == "2row2col": + row_lane_list = [1, 2] + col_lane_list = [0, 3] + elif mode == "4row": + row_lane_list = range(row_num_lane) + col_lane_list = [] + elif mode == "4col": + row_lane_list = [] + col_lane_list = range(col_num_lane) + else: + raise NotImplementedError + + local_width_row = 14 + local_width_col = 14 + min_lanepts_row = 3 + min_lanepts_col = 3 + + # local_width = 2 + all_lanes = [] + + for row_lane_idx in row_lane_list: + if row_valid[:, row_lane_idx].sum() > min_lanepts_row: + cur_lane = [] + for row_cls_idx in range(row_num_cls): + if row_valid[row_cls_idx, row_lane_idx]: + all_ind = torch.tensor( + list( + range( + max(0, row_max_indices[row_cls_idx, row_lane_idx] - local_width_row), + min(row_num_grid - 1, row_max_indices[row_cls_idx, row_lane_idx] + local_width_row) + 1, + ) + ) + ) + coord = (row_out[all_ind, row_cls_idx, row_lane_idx].softmax(0) * all_ind.float()).sum() + 0.5 + coord_x = coord / (row_num_grid - 1) * 1280 + coord_y = row_anchor[row_cls_idx] * 720 + cur_lane.append(int(coord_x)) + else: + cur_lane.append(-2) + # cur_lane.append((coord_x, coord_y)) + # cur_lane = np.array(cur_lane) + # p = np.polyfit(cur_lane[:,1], cur_lane[:,0], deg = 2) + # top_lim = min(cur_lane[:,1]) + # # all_lane_interps.append((p, 
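+ # Resampling sketch (made-up points): the active column-lane branch further
+ # below fits x as a quadratic in y and resamples it onto TuSimple's 56 fixed rows:
+ #   p = np.polyfit([300, 400, 500], [700, 650, 640], deg=2)
+ #   xs = np.round(np.polyval(p, np.linspace(160, 710, 56)))
+ # entries outside the lane's [top_lim, bot_lim] span are then set to -2,
+ # TuSimple's marker for "no point at this h_sample".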
top_lim)) + # lanes_on_tusimple = np.polyval(p, tusimple_h_sample) + # lanes_on_tusimple = np.round(lanes_on_tusimple) + # lanes_on_tusimple = lanes_on_tusimple.astype(int) + # lanes_on_tusimple[lanes_on_tusimple < 0] = -2 + # lanes_on_tusimple[lanes_on_tusimple > 1280] = -2 + # lanes_on_tusimple[tusimple_h_sample < top_lim] = -2 + # all_lanes.append(lanes_on_tusimple.tolist()) + all_lanes.append(cur_lane) + else: + # all_lanes.append([-2]*56) + pass + + for col_lane_idx in col_lane_list: + if col_valid[:, col_lane_idx].sum() > min_lanepts_col: + cur_lane = [] + for col_cls_idx in range(col_num_cls): + if col_valid[col_cls_idx, col_lane_idx]: + all_ind = torch.tensor( + list( + range( + max(0, col_max_indices[col_cls_idx, col_lane_idx] - local_width_col), + min(col_num_grid - 1, col_max_indices[col_cls_idx, col_lane_idx] + local_width_col) + 1, + ) + ) + ) + coord = (col_out[all_ind, col_cls_idx, col_lane_idx].softmax(0) * all_ind.float()).sum() + 0.5 + coord_y = coord / (col_num_grid - 1) * 720 + coord_x = col_anchor[col_cls_idx] * 1280 + cur_lane.append((coord_x, coord_y)) + cur_lane = np.array(cur_lane) + top_lim = min(cur_lane[:, 1]) + bot_lim = max(cur_lane[:, 1]) + + p = np.polyfit(cur_lane[:, 1], cur_lane[:, 0], deg=2) + lanes_on_tusimple = np.polyval(p, tusimple_h_sample) + + # cur_lane_x = cur_lane[:,0] + # cur_lane_y = cur_lane[:,1] + # cur_lane_x_sorted = [x for _, x in sorted(zip(cur_lane_y, cur_lane_x))] + # cur_lane_y_sorted = sorted(cur_lane_y) + # p = InterpolatedUnivariateSpline(cur_lane_y_sorted, cur_lane_x_sorted, k=min(3, len(cur_lane_x_sorted) - 1)) + # lanes_on_tusimple = p(tusimple_h_sample) + + lanes_on_tusimple = np.round(lanes_on_tusimple) + lanes_on_tusimple = lanes_on_tusimple.astype(int) + lanes_on_tusimple[lanes_on_tusimple < 0] = -2 + lanes_on_tusimple[lanes_on_tusimple > 1280] = -2 + lanes_on_tusimple[tusimple_h_sample < top_lim] = -2 + lanes_on_tusimple[tusimple_h_sample > bot_lim] = -2 + all_lanes.append(lanes_on_tusimple.tolist()) + else: + # all_lanes.append([-2]*56) + pass + # for (p, top_lim) in all_lane_interps: + # lanes_on_tusimple = np.polyval(p, tusimple_h_sample) + # lanes_on_tusimple = np.round(lanes_on_tusimple) + # lanes_on_tusimple = lanes_on_tusimple.astype(int) + # lanes_on_tusimple[lanes_on_tusimple < 0] = -2 + # lanes_on_tusimple[lanes_on_tusimple > 1280] = -2 + # lanes_on_tusimple[tusimple_h_sample < top_lim] = -2 + # all_lanes.append(lanes_on_tusimple.tolist()) + return all_lanes + + +def run_test_tusimple( + net, + cfg, +): + output_path = os.path.join(cfg.test_work_dir, "tusimple_eval_tmp" + ".%d.txt" % get_rank()) + fp = open(output_path, "w") + loader = get_test_loader( + batch_size=cfg.batch_size, + data_root=cfg.data_root, + dataset="Tusimple", + distributed=cfg.distributed, + crop_ratio=cfg.crop_ratio, + train_width=cfg.train_width, + train_height=cfg.train_height, + ) + for data in dist_tqdm(loader): + imgs, names = data + imgs = imgs.cuda() + with torch.no_grad(): + pred = net(imgs) + + if not isinstance(net, torch.nn.Module): + # For ort inference on input image size 800*320 and default hyperparameters + pred = pred[0] + num_grid_row = cfg.num_cell_row + num_cls_row = cfg.num_row + num_lane_on_row = cfg.num_lanes + num_grid_col = cfg.num_cell_col + num_cls_col = cfg.num_col + num_lane_on_col = cfg.num_lanes + + dim1 = num_grid_row * num_cls_row * num_lane_on_row + dim2 = num_grid_col * num_cls_col * num_lane_on_col + dim3 = 2 * num_cls_row * num_lane_on_row + dim4 = 2 * num_cls_col * num_lane_on_col + + pred = { + "loc_row": 
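+ # Slicing sketch for the flat ONNX/ORT output vector; the sizes assume the
+ # stock tusimple_res18 hyperparameters (num_cell_row=100, num_row=56,
+ # num_cell_col=100, num_col=41, num_lanes=4):
+ #   dim1 = 100 * 56 * 4 = 22400  # row location logits
+ #   dim2 = 100 * 41 * 4 = 16400  # col location logits
+ #   dim3 = 2 * 56 * 4   = 448    # row existence logits
+ #   dim4 = 2 * 41 * 4   = 328    # col existence logits
+ # so the flat (1, 39576) output is cut back into the four heads below.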
pred[:, :dim1].view(-1, num_grid_row, num_cls_row, num_lane_on_row), + "loc_col": pred[:, dim1 : dim1 + dim2].view(-1, num_grid_col, num_cls_col, num_lane_on_col), + "exist_row": pred[:, dim1 + dim2 : dim1 + dim2 + dim3].view(-1, 2, num_cls_row, num_lane_on_row), + "exist_col": pred[:, -dim4:].view(-1, 2, num_cls_col, num_lane_on_col), + "out": pred, + } + + for b_idx, name in enumerate(names): + tmp_dict = {} + tmp_dict["lanes"] = generate_tusimple_lines( + pred["loc_row"][b_idx], + pred["exist_row"][b_idx], + pred["loc_col"][b_idx], + pred["exist_col"][b_idx], + row_anchor=cfg.row_anchor, + col_anchor=cfg.col_anchor, + mode="4row", + ) + tmp_dict["h_samples"] = list(range(160, 720, 10)) + tmp_dict["raw_file"] = name + tmp_dict["run_time"] = 10 + json_str = json.dumps(tmp_dict) + + fp.write(json_str + "\n") + fp.close() + + +def combine_tusimple_test(work_dir, exp_name): + size = get_world_size() + all_res = [] + for i in range(size): + output_path = os.path.join(work_dir, exp_name + ".%d.txt" % i) + with open(output_path) as fp: + res = fp.readlines() + all_res.extend(res) + names = set() + all_res_no_dup = [] + for i, res in enumerate(all_res): + pos = res.find("clips") + name = res[pos:].split('"')[0] + if name not in names: + names.add(name) + all_res_no_dup.append(res) + + output_path = os.path.join(work_dir, exp_name + ".txt") + with open(output_path, "w") as fp: + fp.writelines(all_res_no_dup) + + +def eval_lane(net, cfg, ep=None, logger=None): + if isinstance(net, torch.nn.Module): + net.eval() # don't need to change mode to .eval() for onnx model + if cfg.dataset == "CurveLanes": + if not cfg.tta: + run_test( + cfg.dataset, + net, + cfg.data_root, + "curvelanes_eval_tmp", + cfg.test_work_dir, + cfg.distributed, + cfg.crop_ratio, + cfg.train_width, + cfg.train_height, + row_anchor=cfg.row_anchor, + col_anchor=cfg.col_anchor, + ) + else: + run_test_tta( + cfg.dataset, + net, + cfg.data_root, + "curvelanes_eval_tmp", + cfg.test_work_dir, + cfg.distributed, + cfg.crop_ratio, + cfg.train_width, + cfg.train_height, + row_anchor=cfg.row_anchor, + col_anchor=cfg.col_anchor, + ) + synchronize() # wait for all results + if is_main_process(): + res = call_curvelane_eval(cfg.data_root, "curvelanes_eval_tmp", cfg.test_work_dir) + TP, FP, FN = 0, 0, 0 + for k, v in res.items(): + val = float(v["Fmeasure"]) if "nan" not in v["Fmeasure"] else 0 + val_tp, val_fp, val_fn = int(v["tp"]), int(v["fp"]), int(v["fn"]) + TP += val_tp + FP += val_fp + FN += val_fn + dist_print(k, val) + if logger is not None: + if k == "res_cross": + logger.add_scalar("CuEval_cls/" + k, val_fp, global_step=ep) + continue + logger.add_scalar("CuEval_cls/" + k, val, global_step=ep) + if TP + FP == 0: + P = 0 + print("nearly no results!") + else: + P = TP * 1.0 / (TP + FP) + if TP + FN == 0: + R = 0 + print("nearly no results!") + else: + R = TP * 1.0 / (TP + FN) + if (P + R) == 0: + F = 0 + else: + F = 2 * P * R / (P + R) + dist_print(F) + if logger is not None: + logger.add_scalar("CuEval/total", F, global_step=ep) + logger.add_scalar("CuEval/P", P, global_step=ep) + logger.add_scalar("CuEval/R", R, global_step=ep) + + synchronize() + if is_main_process(): + return F + else: + return None + elif cfg.dataset == "CULane": + if not cfg.tta: + run_test( + cfg.dataset, + net, + cfg.data_root, + "culane_eval_tmp", + cfg.test_work_dir, + cfg.distributed, + cfg.crop_ratio, + cfg.train_width, + cfg.train_height, + row_anchor=cfg.row_anchor, + col_anchor=cfg.col_anchor, + ) + else: + run_test_tta( + cfg.dataset, + net, + 
cfg.data_root, + "culane_eval_tmp", + cfg.test_work_dir, + cfg.distributed, + cfg.crop_ratio, + cfg.train_width, + cfg.train_height, + row_anchor=cfg.row_anchor, + col_anchor=cfg.col_anchor, + ) + synchronize() # wait for all results + if is_main_process(): + res = call_culane_eval(cfg.data_root, "culane_eval_tmp", cfg.test_work_dir) + TP, FP, FN = 0, 0, 0 + for k, v in res.items(): + val = float(v["Fmeasure"]) if "nan" not in v["Fmeasure"] else 0 + val_tp, val_fp, val_fn = int(v["tp"]), int(v["fp"]), int(v["fn"]) + TP += val_tp + FP += val_fp + FN += val_fn + dist_print(k, val) + if logger is not None: + if k == "res_cross": + logger.add_scalar("CuEval_cls/" + k, val_fp, global_step=ep) + continue + logger.add_scalar("CuEval_cls/" + k, val, global_step=ep) + if TP + FP == 0: + P = 0 + print("nearly no results!") + else: + P = TP * 1.0 / (TP + FP) + if TP + FN == 0: + R = 0 + print("nearly no results!") + else: + R = TP * 1.0 / (TP + FN) + if (P + R) == 0: + F = 0 + else: + F = 2 * P * R / (P + R) + dist_print(F) + if logger is not None: + logger.add_scalar("CuEval/total", F, global_step=ep) + logger.add_scalar("CuEval/P", P, global_step=ep) + logger.add_scalar("CuEval/R", R, global_step=ep) + + synchronize() + if is_main_process(): + return F + else: + return None + elif cfg.dataset == "Tusimple": + exp_name = "tusimple_eval_tmp" + run_test_tusimple(net, cfg) + synchronize() # wait for all results + if is_main_process(): + combine_tusimple_test(cfg.test_work_dir, exp_name) + res = LaneEval.bench_one_submit( + os.path.join(cfg.test_work_dir, exp_name + ".txt"), os.path.join(cfg.data_root, "test_label.json") + ) + res = json.loads(res) + res.append({"name": "epoch", "value": ep}) + for r in res: + dist_print(r["name"], r["value"]) + if logger is not None: + logger.add_scalar("TuEval/" + r["name"], r["value"], global_step=ep) + synchronize() + if is_main_process(): + for r in res: + if r["name"] == "F1": + return r["value"] + else: + return None + + +def read_helper(path): + lines = open(path).readlines()[1:] + lines = " ".join(lines) + values = lines.split(" ")[1::2] + keys = lines.split(" ")[0::2] + keys = [key[:-1] for key in keys] + res = {k: v for k, v in zip(keys, values)} + return res + + +def call_culane_eval(data_dir, exp_name, output_path): + if data_dir[-1] != "/": + data_dir = data_dir + "/" + detect_dir = os.path.join(output_path, exp_name) + "/" + + w_lane = 30 + iou = 0.5 # Set iou to 0.3 or 0.5 + im_w = 1640 + im_h = 590 + frame = 1 + list0 = os.path.join(data_dir, "list/test_split/test0_normal.txt") + list1 = os.path.join(data_dir, "list/test_split/test1_crowd.txt") + list2 = os.path.join(data_dir, "list/test_split/test2_hlight.txt") + list3 = os.path.join(data_dir, "list/test_split/test3_shadow.txt") + list4 = os.path.join(data_dir, "list/test_split/test4_noline.txt") + list5 = os.path.join(data_dir, "list/test_split/test5_arrow.txt") + list6 = os.path.join(data_dir, "list/test_split/test6_curve.txt") + list7 = os.path.join(data_dir, "list/test_split/test7_cross.txt") + list8 = os.path.join(data_dir, "list/test_split/test8_night.txt") + if not os.path.exists(os.path.join(output_path, "txt")): + os.mkdir(os.path.join(output_path, "txt")) + out0 = os.path.join(output_path, "txt", "out0_normal.txt") + out1 = os.path.join(output_path, "txt", "out1_crowd.txt") + out2 = os.path.join(output_path, "txt", "out2_hlight.txt") + out3 = os.path.join(output_path, "txt", "out3_shadow.txt") + out4 = os.path.join(output_path, "txt", "out4_noline.txt") + out5 = os.path.join(output_path, 
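+ # read_helper sketch (field values illustrative): the C++ evaluator writes
+ # space-separated "key: value" pairs after a header line, e.g.
+ #   tp: 1200 fp: 300 fn: 250 Fmeasure: 0.813
+ # read_helper skips the first line, strips the trailing ':' from each key and
+ # keeps the values as strings, hence the int()/float() casts in eval_lane.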
"txt", "out5_arrow.txt") + out6 = os.path.join(output_path, "txt", "out6_curve.txt") + out7 = os.path.join(output_path, "txt", "out7_cross.txt") + out8 = os.path.join(output_path, "txt", "out8_night.txt") + + eval_cmd = "./evaluation/culane/evaluate" + if platform.system() == "Windows": + eval_cmd = eval_cmd.replace("/", os.sep) + + # print('./evaluate -a %s -d %s -i %s -l %s -w %s -t %s -c %s -r %s -f %s -o %s'%(data_dir,detect_dir,data_dir,list0,w_lane,iou,im_w,im_h,frame,out0)) + os.system( + "%s -a %s -d %s -i %s -l %s -w %s -t %s -c %s -r %s -f %s -o %s" + % (eval_cmd, data_dir, detect_dir, data_dir, list0, w_lane, iou, im_w, im_h, frame, out0) + ) + # print('./evaluate -a %s -d %s -i %s -l %s -w %s -t %s -c %s -r %s -f %s -o %s'%(data_dir,detect_dir,data_dir,list1,w_lane,iou,im_w,im_h,frame,out1)) + os.system( + "%s -a %s -d %s -i %s -l %s -w %s -t %s -c %s -r %s -f %s -o %s" + % (eval_cmd, data_dir, detect_dir, data_dir, list1, w_lane, iou, im_w, im_h, frame, out1) + ) + # print('./evaluate -a %s -d %s -i %s -l %s -w %s -t %s -c %s -r %s -f %s -o %s'%(data_dir,detect_dir,data_dir,list2,w_lane,iou,im_w,im_h,frame,out2)) + os.system( + "%s -a %s -d %s -i %s -l %s -w %s -t %s -c %s -r %s -f %s -o %s" + % (eval_cmd, data_dir, detect_dir, data_dir, list2, w_lane, iou, im_w, im_h, frame, out2) + ) + # print('./evaluate -a %s -d %s -i %s -l %s -w %s -t %s -c %s -r %s -f %s -o %s'%(data_dir,detect_dir,data_dir,list3,w_lane,iou,im_w,im_h,frame,out3)) + os.system( + "%s -a %s -d %s -i %s -l %s -w %s -t %s -c %s -r %s -f %s -o %s" + % (eval_cmd, data_dir, detect_dir, data_dir, list3, w_lane, iou, im_w, im_h, frame, out3) + ) + # print('./evaluate -a %s -d %s -i %s -l %s -w %s -t %s -c %s -r %s -f %s -o %s'%(data_dir,detect_dir,data_dir,list4,w_lane,iou,im_w,im_h,frame,out4)) + os.system( + "%s -a %s -d %s -i %s -l %s -w %s -t %s -c %s -r %s -f %s -o %s" + % (eval_cmd, data_dir, detect_dir, data_dir, list4, w_lane, iou, im_w, im_h, frame, out4) + ) + # print('./evaluate -a %s -d %s -i %s -l %s -w %s -t %s -c %s -r %s -f %s -o %s'%(data_dir,detect_dir,data_dir,list5,w_lane,iou,im_w,im_h,frame,out5)) + os.system( + "%s -a %s -d %s -i %s -l %s -w %s -t %s -c %s -r %s -f %s -o %s" + % (eval_cmd, data_dir, detect_dir, data_dir, list5, w_lane, iou, im_w, im_h, frame, out5) + ) + # print('./evaluate -a %s -d %s -i %s -l %s -w %s -t %s -c %s -r %s -f %s -o %s'%(data_dir,detect_dir,data_dir,list6,w_lane,iou,im_w,im_h,frame,out6)) + os.system( + "%s -a %s -d %s -i %s -l %s -w %s -t %s -c %s -r %s -f %s -o %s" + % (eval_cmd, data_dir, detect_dir, data_dir, list6, w_lane, iou, im_w, im_h, frame, out6) + ) + # print('./evaluate -a %s -d %s -i %s -l %s -w %s -t %s -c %s -r %s -f %s -o %s'%(data_dir,detect_dir,data_dir,list7,w_lane,iou,im_w,im_h,frame,out7)) + os.system( + "%s -a %s -d %s -i %s -l %s -w %s -t %s -c %s -r %s -f %s -o %s" + % (eval_cmd, data_dir, detect_dir, data_dir, list7, w_lane, iou, im_w, im_h, frame, out7) + ) + # print('./evaluate -a %s -d %s -i %s -l %s -w %s -t %s -c %s -r %s -f %s -o %s'%(data_dir,detect_dir,data_dir,list8,w_lane,iou,im_w,im_h,frame,out8)) + os.system( + "%s -a %s -d %s -i %s -l %s -w %s -t %s -c %s -r %s -f %s -o %s" + % (eval_cmd, data_dir, detect_dir, data_dir, list8, w_lane, iou, im_w, im_h, frame, out8) + ) + res_all = {} + res_all["res_normal"] = read_helper(out0) + res_all["res_crowd"] = read_helper(out1) + res_all["res_night"] = read_helper(out8) + res_all["res_noline"] = read_helper(out4) + res_all["res_shadow"] = read_helper(out3) + res_all["res_arrow"] = 
read_helper(out5) + res_all["res_hlight"] = read_helper(out2) + res_all["res_curve"] = read_helper(out6) + res_all["res_cross"] = read_helper(out7) + return res_all + + +def call_curvelane_eval(data_dir, exp_name, output_path): + if data_dir[-1] != "/": + data_dir = data_dir + "/" + detect_dir = os.path.join(output_path, exp_name) + "/" + + w_lane = 5 + iou = 0.5 # Set iou to 0.3 or 0.5 + im_w = 224 + im_h = 224 + x_factor = 224 / 2560 + y_factor = 224 / 1440 + frame = 1 + list0 = os.path.join(data_dir, "valid", "valid_for_culane_style.txt") + if not os.path.exists(os.path.join(output_path, "txt")): + os.mkdir(os.path.join(output_path, "txt")) + out0 = os.path.join(output_path, "txt", "out0_curve.txt") + + eval_cmd = "./evaluation/culane/evaluate" + if platform.system() == "Windows": + eval_cmd = eval_cmd.replace("/", os.sep) + + print( + "./evaluate -s -a %s -d %s -i %s -l %s -w %s -t %s -c %s -r %s -f %s -o %s -x %s -y %s" + % (data_dir, detect_dir, data_dir, list0, w_lane, iou, im_w, im_h, frame, out0, x_factor, y_factor) + ) + os.system( + "%s -a %s -d %s -i %s -l %s -w %s -t %s -c %s -r %s -f %s -o %s -x %s -y %s" + % (eval_cmd, data_dir, detect_dir, data_dir, list0, w_lane, iou, im_w, im_h, frame, out0, x_factor, y_factor) + ) + res_all = {} + res_all["res_curve"] = read_helper(out0) + return res_all diff --git a/evaluation/tusimple/lane.py b/evaluation/tusimple/lane.py new file mode 100644 index 0000000..8b40c75 --- /dev/null +++ b/evaluation/tusimple/lane.py @@ -0,0 +1,109 @@ +import json + +import numpy as np +from sklearn.linear_model import LinearRegression + + +class LaneEval: + lr = LinearRegression() + pixel_thresh = 20 + pt_thresh = 0.85 + + @staticmethod + def get_angle(xs, y_samples): + xs, ys = xs[xs >= 0], y_samples[xs >= 0] + if len(xs) > 1: + LaneEval.lr.fit(ys[:, None], xs) + k = LaneEval.lr.coef_[0] + theta = np.arctan(k) + else: + theta = 0 + return theta + + @staticmethod + def line_accuracy(pred, gt, thresh): + pred = np.array([p if p >= 0 else -100 for p in pred]) + gt = np.array([g if g >= 0 else -100 for g in gt]) + return np.sum(np.where(np.abs(pred - gt) < thresh, 1.0, 0.0)) / len(gt) + + @staticmethod + def bench(pred, gt, y_samples, running_time): + if any(len(p) != len(y_samples) for p in pred): + raise Exception("Format of lanes error.") + if running_time > 200 or len(gt) + 2 < len(pred): + return 0.0, 0.0, 1.0 + angles = [LaneEval.get_angle(np.array(x_gts), np.array(y_samples)) for x_gts in gt] + threshs = [LaneEval.pixel_thresh / np.cos(angle) for angle in angles] + line_accs = [] + fp, fn = 0.0, 0.0 + matched = 0.0 + for x_gts, thresh in zip(gt, threshs): + accs = [LaneEval.line_accuracy(np.array(x_preds), np.array(x_gts), thresh) for x_preds in pred] + max_acc = np.max(accs) if len(accs) > 0 else 0.0 + if max_acc < LaneEval.pt_thresh: + fn += 1 + else: + matched += 1 + line_accs.append(max_acc) + fp = len(pred) - matched + if len(gt) > 4 and fn > 0: + fn -= 1 + s = sum(line_accs) + if len(gt) > 4: + s -= min(line_accs) + return ( + s / max(min(4.0, len(gt)), 1.0), + fp / len(pred) if len(pred) > 0 else 0.0, + fn / max(min(len(gt), 4.0), 1.0), + ) + + @staticmethod + def bench_one_submit(pred_file, gt_file): + try: + json_pred = [json.loads(line) for line in open(pred_file).readlines()] + except BaseException as e: + raise Exception("Fail to load json file of the prediction.") + json_gt = [json.loads(line) for line in open(gt_file).readlines()] + if len(json_gt) != len(json_pred): + raise Exception("We do not get the predictions of all the prune 
tasks") + gts = {l["raw_file"]: l for l in json_gt} + accuracy, fp, fn = 0.0, 0.0, 0.0 + for pred in json_pred: + if "raw_file" not in pred or "lanes" not in pred or "run_time" not in pred: + raise Exception("raw_file or lanes or run_time not in some predictions.") + raw_file = pred["raw_file"] + pred_lanes = pred["lanes"] + run_time = pred["run_time"] + if raw_file not in gts: + raise Exception("Some raw_file from your predictions do not exist in the prune tasks.") + gt = gts[raw_file] + gt_lanes = gt["lanes"] + y_samples = gt["h_samples"] + try: + a, p, n = LaneEval.bench(pred_lanes, gt_lanes, y_samples, run_time) + except BaseException as e: + raise Exception("Format of lanes error.") + accuracy += a + fp += p + fn += n + num = len(gts) + # the first return parameter is the default ranking parameter + return json.dumps( + [ + {"name": "Accuracy", "value": accuracy / num, "order": "desc"}, + {"name": "FP", "value": fp / num, "order": "asc"}, + {"name": "FN", "value": fn / num, "order": "asc"}, + ] + ) + + +if __name__ == "__main__": + import sys + + try: + if len(sys.argv) != 3: + raise Exception("Invalid input arguments") + print(LaneEval.bench_one_submit(sys.argv[1], sys.argv[2])) + except Exception as e: + print(e.message) + sys.exit(e.message) diff --git a/evaluation/tusimple/lane2.py b/evaluation/tusimple/lane2.py new file mode 100644 index 0000000..db871ae --- /dev/null +++ b/evaluation/tusimple/lane2.py @@ -0,0 +1,111 @@ +import numpy as np +import ujson as json +from sklearn.linear_model import LinearRegression + + +class LaneEval: + lr = LinearRegression() + pixel_thresh = 20 + pt_thresh = 0.85 + + @staticmethod + def get_angle(xs, y_samples): + xs, ys = xs[xs >= 0], y_samples[xs >= 0] + if len(xs) > 1: + LaneEval.lr.fit(ys[:, None], xs) + k = LaneEval.lr.coef_[0] + theta = np.arctan(k) + else: + theta = 0 + return theta + + @staticmethod + def line_accuracy(pred, gt, thresh): + pred = np.array([p if p >= 0 else -100 for p in pred]) + gt = np.array([g if g >= 0 else -100 for g in gt]) + return np.sum(np.where(np.abs(pred - gt) < thresh, 1.0, 0.0)) / len(gt) + + @staticmethod + def bench(pred, gt, y_samples, running_time): + if any(len(p) != len(y_samples) for p in pred): + raise Exception("Format of lanes error.") + if running_time > 200 or len(gt) + 2 < len(pred): + return 0.0, 0.0, 1.0 + angles = [LaneEval.get_angle(np.array(x_gts), np.array(y_samples)) for x_gts in gt] + threshs = [LaneEval.pixel_thresh / np.cos(angle) for angle in angles] + line_accs = [] + fp, fn = 0.0, 0.0 + matched = 0.0 + for x_gts, thresh in zip(gt, threshs): + accs = [LaneEval.line_accuracy(np.array(x_preds), np.array(x_gts), thresh) for x_preds in pred] + max_acc = np.max(accs) if len(accs) > 0 else 0.0 + if max_acc < LaneEval.pt_thresh: + fn += 1 + else: + matched += 1 + line_accs.append(max_acc) + fp = len(pred) - matched + if len(gt) > 4 and fn > 0: + fn -= 1 + s = sum(line_accs) + if len(gt) > 4: + s -= min(line_accs) + return ( + s / max(min(4.0, len(gt)), 1.0), + fp / len(pred) if len(pred) > 0 else 0.0, + fn / max(min(len(gt), 4.0), 1.0), + ) + + @staticmethod + def bench_one_submit(pred_file, gt_file): + # try: + json_pred = [json.loads(line) for line in open(pred_file).readlines()] + # except BaseException as e: + # raise Exception('Fail to load json file of the prediction.') + json_gt = [json.loads(line) for line in open(gt_file).readlines()] + if len(json_gt) != len(json_pred): + raise Exception("We do not get the predictions of all the prune tasks") + gts = {l["raw_file"]: l for l in 
json_gt} + accuracy, fp, fn = 0.0, 0.0, 0.0 + for pred in json_pred: + if "raw_file" not in pred or "lanes" not in pred or "run_time" not in pred: + raise Exception("raw_file or lanes or run_time not in some predictions.") + raw_file = pred["raw_file"] + pred_lanes = pred["lanes"] + run_time = pred["run_time"] + if raw_file not in gts: + raise Exception("Some raw_file from your predictions do not exist in the prune tasks.") + gt = gts[raw_file] + gt_lanes = gt["lanes"] + y_samples = gt["h_samples"] + try: + a, p, n = LaneEval.bench(pred_lanes, gt_lanes, y_samples, run_time) + except BaseException as e: + raise Exception("Format of lanes error.") + accuracy += a + fp += p + fn += n + num = len(gts) + # the first return parameter is the default ranking parameter + pr = 1 - fp / num + re = 1 - fn / num + if (pr + re) == 0: + f1 = 0 + else: + f1 = 2 * pr * re / (pr + re) + return json.dumps( + [ + {"name": "Accuracy", "value": accuracy / num, "order": "desc"}, + {"name": "FP", "value": fp / num, "order": "asc"}, + {"name": "FN", "value": fn / num, "order": "asc"}, + {"name": "F1", "value": f1, "order": "asc"}, + ] + ) + + +if __name__ == "__main__": + import sys + + if len(sys.argv) != 3: + raise Exception("Invalid input arguments") + print(LaneEval.bench_one_submit(sys.argv[1], sys.argv[2])) diff --git a/export.py b/export.py new file mode 100644 index 0000000..b81cef6 --- /dev/null +++ b/export.py @@ -0,0 +1,100 @@ +import argparse +from pathlib import Path + +import torch +from torch import nn + +from utils.common import get_model +from utils.config import Config + + +class TiCompatibleClsLinear(nn.Module): + def __init__(self, linear: nn.Linear): + super().__init__() + + self.in_features = linear.in_features + self.out_features = linear.out_features + self.conv = nn.Conv2d( + in_channels=self.in_features, + out_channels=self.out_features, + kernel_size=(1, 1), + ) + + with torch.no_grad(): + self.conv.weight.copy_(linear.weight.unsqueeze(-1).unsqueeze(-1)) + self.conv.bias.copy_(linear.bias) + + def forward(self, input_tensor: torch.Tensor) -> torch.Tensor: + input_tensor = input_tensor.reshape(1, self.in_features, 1, 1) + out_tensor = self.conv(input_tensor) + return out_tensor + + +if __name__ == "__main__": + current_dir = Path(__file__).parent + default_config_path = current_dir / "configs/tusimple_res18.py" + + parser = argparse.ArgumentParser() + parser.add_argument( + "-o", + "--output-onnx", + type=str, + required=True, + help="path to output onnx", + ) + parser.add_argument( + "-c", + "--checkpoint", + type=str, + required=True, + help="path to model checkpoint", + ) + parser.add_argument( + "--config", + type=str, + default=str(default_config_path), + help="path to model config", + ) + parser.add_argument( + "--height", + type=int, + default=320, + help="input image height", + ) + parser.add_argument( + "--width", + type=int, + default=800, + help="input image width", + ) + parser.add_argument( + "--ti-compatible", + action="store_true", + default=False, + help="replace last Linear with conv1x1 for Texas Instruments compatibility", + ) + parser.add_argument("--opset-version", type=int, default=9, help="opset version") + args = parser.parse_args() + + checkpoint = torch.load(args.checkpoint, map_location="cpu")["model_ckpt"] + if isinstance(checkpoint, nn.Module): + model = checkpoint + else: + config = Config.fromfile(args.config) + model = get_model(config).eval().cpu() + model.load_state_dict(checkpoint, strict=False) + + if args.ti_compatible: + model.cls[3] = 
TiCompatibleClsLinear(linear=model.cls[3]) + + torch.onnx.export( + f=args.output_onnx, + model=model, + args=torch.ones( + [1, 3, args.height, args.width], + dtype=torch.float32, + ), + input_names=["input"], + output_names=["output"], + opset_version=args.opset_version, + ) diff --git a/measure.py b/measure.py new file mode 100644 index 0000000..ff62ccb --- /dev/null +++ b/measure.py @@ -0,0 +1,46 @@ +import argparse + +import onnx +import torch +from enot_latency_server.client import measure_latency_remote + +from prune import measure_latency_on_server + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model_ckpt", default=None, help="Path to model checkpoint for latency measurement") + parser.add_argument( + "--image_size", + type=int, + nargs=2, + default=(320, 800), + help="Size of image for latency measurement", + ) + parser.add_argument("--host", default="localhost", type=str, help="Host of latency measurement server") + parser.add_argument("--port", default=15003, type=int, help="Port of latency measurement server") + parser.add_argument("--num_runs", default=1, type=int, help="Number of measurement runs") + parser.add_argument("--ti_server", action="store_true", help="Whether to measure on TI server.") + parser.add_argument("--onnx", default=None, type=str, help="Path to model ONNX to measure latency.") + + args = parser.parse_args() + + if args.onnx: + onnx_model = onnx.load(args.onnx) + latency = measure_latency_remote(onnx_model.SerializeToString(), host=args.host, port=args.port) + print(latency) + exit() + + model = torch.load(args.model_ckpt, map_location="cpu")["model_ckpt"] + if torch.cuda.is_available(): + model = model.cuda() + print("model loaded") + + latency = measure_latency_on_server( + model=model, + device="cuda" if torch.cuda.is_available() else "cpu", + image_size=args.image_size, + port=args.port, + host=args.host, + ti_server=args.ti_server, + ) + print(latency) diff --git a/measure_macs.py b/measure_macs.py new file mode 100644 index 0000000..132d5d2 --- /dev/null +++ b/measure_macs.py @@ -0,0 +1,23 @@ +import argparse + +import torch +from fvcore.nn import FlopCountAnalysis + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-ckpt", required=True, help="Path to model") + parser.add_argument( + "--image-size", + type=int, + nargs=2, + default=(800, 320), + help="Size of image for MACs measurement", + ) + + args = parser.parse_args() + + model = torch.load(args.model_ckpt, map_location="cpu")["model_ckpt"] + + counter = FlopCountAnalysis(model=model, inputs=torch.ones(1, 3, *args.image_size)) + + print(counter.total()) diff --git a/model/backbone.py b/model/backbone.py new file mode 100644 index 0000000..64dc7bb --- /dev/null +++ b/model/backbone.py @@ -0,0 +1,63 @@ +import pdb + +import torch +import torch.nn.modules +import torchvision + + +class vgg16bn(torch.nn.Module): + def __init__(self, pretrained=False): + super().__init__() + model = list(torchvision.models.vgg16_bn(pretrained=pretrained).features.children()) + model = model[:33] + model[34:43] + self.model = torch.nn.Sequential(*model) + + def forward(self, x): + return self.model(x) + + +class resnet(torch.nn.Module): + def __init__(self, layers, pretrained=False): + super().__init__() + if layers == "18": + model = torchvision.models.resnet18(pretrained=pretrained) + elif layers == "34": + model = torchvision.models.resnet34(pretrained=pretrained) + elif layers == "50": + model = torchvision.models.resnet50(pretrained=pretrained) + elif layers == "101": + model = 
torchvision.models.resnet101(pretrained=pretrained) + elif layers == "152": + model = torchvision.models.resnet152(pretrained=pretrained) + elif layers == "50next": + model = torchvision.models.resnext50_32x4d(pretrained=pretrained) + elif layers == "101next": + model = torchvision.models.resnext101_32x8d(pretrained=pretrained) + elif layers == "50wide": + model = torchvision.models.wide_resnet50_2(pretrained=pretrained) + elif layers == "101wide": + model = torchvision.models.wide_resnet101_2(pretrained=pretrained) + elif layers == "34fca": + model = torch.hub.load("cfzd/FcaNet", "fca34", pretrained=True) + else: + raise NotImplementedError + + self.conv1 = model.conv1 + self.bn1 = model.bn1 + self.relu = model.relu + self.maxpool = model.maxpool + self.layer1 = model.layer1 + self.layer2 = model.layer2 + self.layer3 = model.layer3 + self.layer4 = model.layer4 + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + x = self.layer1(x) + x2 = self.layer2(x) + x3 = self.layer3(x2) + x4 = self.layer4(x3) + return x2, x3, x4 diff --git a/model/layer.py b/model/layer.py new file mode 100644 index 0000000..2d8517c --- /dev/null +++ b/model/layer.py @@ -0,0 +1,111 @@ +import torch +from torch import nn + + +class AddCoordinates: + r"""Coordinate Adder Module as defined in 'An Intriguing Failing of + Convolutional Neural Networks and the CoordConv Solution' + (https://arxiv.org/pdf/1807.03247.pdf). + This module concatenates coordinate information (`x`, `y`, and `r`) with + given input tensor. + `x` and `y` coordinates are scaled to `[-1, 1]` range where origin is the + center. `r` is the Euclidean distance from the center and is scaled to + `[0, 1]`. + Args: + with_r (bool, optional): If `True`, adds radius (`r`) coordinate + information to input image. Default: `False` + Shape: + - Input: `(N, C_{in}, H_{in}, W_{in})` + - Output: `(N, (C_{in} + 2) or (C_{in} + 3), H_{in}, W_{in})` + Examples: + >>> coord_adder = AddCoordinates(True) + >>> input = torch.randn(8, 3, 64, 64) + >>> output = coord_adder(input) + >>> coord_adder = AddCoordinates(True) + >>> input = torch.randn(8, 3, 64, 64).cuda() + >>> output = coord_adder(input) + >>> device = torch.device("cuda:0") + >>> coord_adder = AddCoordinates(True) + >>> input = torch.randn(8, 3, 64, 64).to(device) + >>> output = coord_adder(input) + """ + + def __init__(self, with_r=False): + self.with_r = with_r + + def __call__(self, image): + batch_size, _, image_height, image_width = image.size() + + y_coords = ( + 2.0 * torch.arange(image_height).unsqueeze(1).expand(image_height, image_width) / (image_height - 1.0) - 1.0 + ) + x_coords = ( + 2.0 * torch.arange(image_width).unsqueeze(0).expand(image_height, image_width) / (image_width - 1.0) - 1.0 + ) + + coords = torch.stack((y_coords, x_coords), dim=0) + + if self.with_r: + rs = ((y_coords**2) + (x_coords**2)) ** 0.5 + rs = rs / torch.max(rs) + rs = torch.unsqueeze(rs, dim=0) + coords = torch.cat((coords, rs), dim=0) + + coords = torch.unsqueeze(coords, dim=0).repeat(batch_size, 1, 1, 1) + + image = torch.cat((coords.to(image.device), image), dim=1) + + return image + + +class CoordConv(nn.Module): + r"""2D Convolution Module Using Extra Coordinate Information as defined + in 'An Intriguing Failing of Convolutional Neural Networks and the + CoordConv Solution' (https://arxiv.org/pdf/1807.03247.pdf). 
+ Args: + Same as `torch.nn.Conv2d` with two additional arguments + with_r (bool, optional): If `True`, adds radius (`r`) coordinate + information to input image. Default: `False` + Shape: + - Input: `(N, C_{in}, H_{in}, W_{in})` + - Output: `(N, C_{out}, H_{out}, W_{out})` + Examples: + >>> coord_conv = CoordConv(3, 16, 3, with_r=True) + >>> input = torch.randn(8, 3, 64, 64) + >>> output = coord_conv(input) + >>> coord_conv = CoordConv(3, 16, 3, with_r=True).cuda() + >>> input = torch.randn(8, 3, 64, 64).cuda() + >>> output = coord_conv(input) + >>> device = torch.device("cuda:0") + >>> coord_conv = CoordConv(3, 16, 3, with_r=True).to(device) + >>> input = torch.randn(8, 3, 64, 64).to(device) + >>> output = coord_conv(input) + """ + + def __init__( + self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, with_r=False + ): + super().__init__() + + in_channels += 2 + if with_r: + in_channels += 1 + + self.conv_layer = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + ) + + self.coord_adder = AddCoordinates(with_r) + + def forward(self, x): + x = self.coord_adder(x) + x = self.conv_layer(x) + + return x diff --git a/model/model_culane.py b/model/model_culane.py new file mode 100644 index 0000000..37a4ff3 --- /dev/null +++ b/model/model_culane.py @@ -0,0 +1,147 @@ +import numpy as np +import torch + +from model.backbone import resnet +from model.layer import CoordConv +from model.seg_model import SegHead +from utils.common import initialize_weights + + +class parsingNet(torch.nn.Module): + def __init__( + self, + pretrained=True, + backbone="50", + num_grid_row=None, + num_cls_row=None, + num_grid_col=None, + num_cls_col=None, + num_lane_on_row=None, + num_lane_on_col=None, + use_aux=False, + input_height=None, + input_width=None, + fc_norm=False, + ): + super().__init__() + self.num_grid_row = num_grid_row + self.num_cls_row = num_cls_row + self.num_grid_col = num_grid_col + self.num_cls_col = num_cls_col + self.num_lane_on_row = num_lane_on_row + self.num_lane_on_col = num_lane_on_col + self.use_aux = use_aux + self.dim1 = self.num_grid_row * self.num_cls_row * self.num_lane_on_row + self.dim2 = self.num_grid_col * self.num_cls_col * self.num_lane_on_col + self.dim3 = 2 * self.num_cls_row * self.num_lane_on_row + self.dim4 = 2 * self.num_cls_col * self.num_lane_on_col + self.total_dim = self.dim1 + self.dim2 + self.dim3 + self.dim4 + mlp_mid_dim = 2048 + self.input_dim = input_height // 32 * input_width // 32 * 8 + + self.model = resnet(backbone, pretrained=pretrained) + + # for avg pool experiment + # self.pool = torch.nn.AdaptiveAvgPool2d(1) + # self.pool = torch.nn.AdaptiveMaxPool2d(1) + + # self.register_buffer('coord', torch.stack([torch.linspace(0.5,9.5,10).view(-1,1).repeat(1,50), torch.linspace(0.5,49.5,50).repeat(10,1)]).view(1,2,10,50)) + + self.cls = torch.nn.Sequential( + torch.nn.LayerNorm(self.input_dim) if fc_norm else torch.nn.Identity(), + torch.nn.Linear(self.input_dim, mlp_mid_dim), + torch.nn.ReLU(), + torch.nn.Linear(mlp_mid_dim, self.total_dim), + ) + self.pool = torch.nn.Conv2d(512, 8, 1) if backbone in ["34", "18", "34fca"] else torch.nn.Conv2d(2048, 8, 1) + if self.use_aux: + self.seg_head = SegHead(backbone, num_lane_on_row + num_lane_on_col) + initialize_weights(self.cls) + + def forward(self, x): + x2, x3, fea = self.model(x) + if self.use_aux: + seg_out = self.seg_head(x2, x3, fea) + fea = self.pool(fea) + # 
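+ # Shape sketch (assuming a 320x800 input, e.g. the TuSimple config): the last
+ # backbone map is (N, 512, 10, 25) for ResNet-18/34, self.pool projects it to
+ # (N, 8, 10, 25), and flattening yields input_dim = 10 * 25 * 8 = 2000
+ # features for the cls MLP.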
print(fea.shape) + # print(self.coord.shape) + # fea = torch.cat([fea, self.coord.repeat(fea.shape[0],1,1,1)], dim = 1) + + fea = fea.view(-1, self.input_dim) + out = self.cls(fea) + + if torch.onnx.is_in_onnx_export(): + return out + # print(f'dim1: {self.dim1}, dim2: {self.dim2}, dim3: {self.dim3}, dim4: {self.dim4}, num_grid_row: {self.num_grid_row}, num_cls_row: {self.num_cls_row}, num_lane_on_row: {self.num_lane_on_row}, num_grid_col: {self.num_grid_col}, num_cls_col: {self.num_cls_col}, num_lane_on_col: {self.num_lane_on_col}, ') + else: + pred_dict = { + "loc_row": out[:, : self.dim1].view(-1, self.num_grid_row, self.num_cls_row, self.num_lane_on_row), + "loc_col": out[:, self.dim1 : self.dim1 + self.dim2].view( + -1, self.num_grid_col, self.num_cls_col, self.num_lane_on_col + ), + "exist_row": out[:, self.dim1 + self.dim2 : self.dim1 + self.dim2 + self.dim3].view( + -1, 2, self.num_cls_row, self.num_lane_on_row + ), + "exist_col": out[:, -self.dim4 :].view(-1, 2, self.num_cls_col, self.num_lane_on_col), + "out": out, + } + if self.use_aux: + pred_dict["seg_out"] = seg_out + + return pred_dict + + def forward_tta(self, x): + x2, x3, fea = self.model(x) + + pooled_fea = self.pool(fea) + n, c, h, w = pooled_fea.shape + + left_pooled_fea = torch.zeros_like(pooled_fea) + right_pooled_fea = torch.zeros_like(pooled_fea) + up_pooled_fea = torch.zeros_like(pooled_fea) + down_pooled_fea = torch.zeros_like(pooled_fea) + + left_pooled_fea[:, :, :, : w - 1] = pooled_fea[:, :, :, 1:] + left_pooled_fea[:, :, :, -1] = pooled_fea.mean(-1) + + right_pooled_fea[:, :, :, 1:] = pooled_fea[:, :, :, : w - 1] + right_pooled_fea[:, :, :, 0] = pooled_fea.mean(-1) + + up_pooled_fea[:, :, : h - 1, :] = pooled_fea[:, :, 1:, :] + up_pooled_fea[:, :, -1, :] = pooled_fea.mean(-2) + + down_pooled_fea[:, :, 1:, :] = pooled_fea[:, :, : h - 1, :] + down_pooled_fea[:, :, 0, :] = pooled_fea.mean(-2) + # 10 x 25 + fea = torch.cat([pooled_fea, left_pooled_fea, right_pooled_fea, up_pooled_fea, down_pooled_fea], dim=0) + fea = fea.view(-1, self.input_dim) + + out = self.cls(fea) + + return { + "loc_row": out[:, : self.dim1].view(-1, self.num_grid_row, self.num_cls_row, self.num_lane_on_row), + "loc_col": out[:, self.dim1 : self.dim1 + self.dim2].view( + -1, self.num_grid_col, self.num_cls_col, self.num_lane_on_col + ), + "exist_row": out[:, self.dim1 + self.dim2 : self.dim1 + self.dim2 + self.dim3].view( + -1, 2, self.num_cls_row, self.num_lane_on_row + ), + "exist_col": out[:, -self.dim4 :].view(-1, 2, self.num_cls_col, self.num_lane_on_col), + } + + +def get_model(cfg): + return parsingNet( + pretrained=True, + backbone=cfg.backbone, + num_grid_row=cfg.num_cell_row, + num_cls_row=cfg.num_row, + num_grid_col=cfg.num_cell_col, + num_cls_col=cfg.num_col, + num_lane_on_row=cfg.num_lanes, + num_lane_on_col=cfg.num_lanes, + use_aux=cfg.use_aux, + input_height=cfg.train_height, + input_width=cfg.train_width, + fc_norm=cfg.fc_norm, + ).cuda() diff --git a/model/model_curvelanes.py b/model/model_curvelanes.py new file mode 100644 index 0000000..ed07a9f --- /dev/null +++ b/model/model_curvelanes.py @@ -0,0 +1,113 @@ +import numpy as np +import torch + +from model.backbone import resnet +from model.seg_model import SegHead +from utils.common import initialize_weights + + +class parsingNet(torch.nn.Module): + def __init__( + self, + pretrained=True, + backbone="50", + num_grid_row=None, + num_cls_row=None, + num_grid_col=None, + num_cls_col=None, + num_lane_on_row=None, + num_lane_on_col=None, + use_aux=False, + input_height=None, + 
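+ # Design note (sketch): unlike the CULane head, this CurveLanes variant emits
+ # 20 per-lane tokens from cls_distribute (10 row lanes + 10 col lanes); each
+ # token map is stacked onto the 8 pooled channels, which is why input_dim
+ # below is (input_height // 32) * (input_width // 32) * 9, and the shared MLP
+ # output is then split between the cls_row and cls_col branches.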
input_width=None, + ): + super().__init__() + self.num_grid_row = num_grid_row + self.num_cls_row = num_cls_row + self.num_grid_col = num_grid_col + self.num_cls_col = num_cls_col + self.num_lane_on_row = num_lane_on_row + self.num_lane_on_col = num_lane_on_col + self.use_aux = use_aux + + self.input_height = input_height + self.input_width = input_width + + self.dim1 = self.num_grid_row * self.num_cls_row + self.dim2 = 2 * self.num_cls_row + self.dim3 = self.num_grid_col * self.num_cls_col + self.dim4 = 2 * self.num_cls_col + self.total_dim_row = self.dim1 + self.dim2 + self.total_dim_col = self.dim3 + self.dim4 + mlp_mid_dim = 2048 + + self.input_dim = (self.input_height // 32) * (self.input_width // 32) * 9 + + self.model = resnet(backbone, pretrained=pretrained) + + self.cls_distribute = torch.nn.Sequential( + torch.nn.Conv2d(512, 128, 3, padding=1), + torch.nn.ReLU(), + torch.nn.Conv2d(128, 20, 3, padding=1), + ) + self.cls = torch.nn.Sequential( + torch.nn.LayerNorm(self.input_dim), torch.nn.Linear(self.input_dim, mlp_mid_dim), torch.nn.ReLU() + ) + self.cls_row = torch.nn.Linear(mlp_mid_dim, self.total_dim_row) + self.cls_col = torch.nn.Linear(mlp_mid_dim, self.total_dim_col) + self.pool = torch.nn.Conv2d(512, 8, 1) if backbone in ["34", "18", "34fca"] else torch.nn.Conv2d(2048, 8, 1) + if self.use_aux: + self.seg_head = SegHead(backbone, num_lane_on_row + num_lane_on_col) + initialize_weights(self.cls_distribute) + initialize_weights(self.cls) + initialize_weights([self.cls_row]) + initialize_weights([self.cls_col]) + + def forward(self, x): + x2, x3, fea = self.model(x) + if self.use_aux: + seg_out = self.seg_head(x2, x3, fea) + lane_token = self.cls_distribute(fea).reshape(-1, 20, 1, self.input_height // 32, self.input_width // 32) + fea = self.pool(fea).unsqueeze(1).repeat(1, 20, 1, 1, 1) + fea = torch.cat([fea, lane_token], 2) + + fea = fea.view(-1, self.input_dim) + out = self.cls(fea).reshape(-1, 20, 2048) + out_row = self.cls_row(out[:, :10, :]).permute(0, 2, 1) + out_col = self.cls_col(out[:, 10:, :]).permute(0, 2, 1) + + pred_dict = { + "loc_row": out_row[:, : self.dim1, :].view(-1, self.num_grid_row, self.num_cls_row, self.num_lane_on_row), + "loc_col": out_col[:, : self.dim3, :].view(-1, self.num_grid_col, self.num_cls_col, self.num_lane_on_col), + "exist_row": out_row[:, self.dim1 : self.dim1 + self.dim2, :].view( + -1, 2, self.num_cls_row, self.num_lane_on_row + ), + "exist_col": out_col[:, self.dim3 : self.dim3 + self.dim4, :].view( + -1, 2, self.num_cls_col, self.num_lane_on_col + ), + "lane_token_row": lane_token[:, :10, :, :].sum(1), + "lane_token_col": lane_token[:, 10:, :, :].sum(1), + } + if self.use_aux: + pred_dict["seg_out"] = seg_out + + return pred_dict + + def forward_tta(self, x): + raise NotImplementedError + + +def get_model(cfg): + return parsingNet( + pretrained=True, + backbone=cfg.backbone, + num_grid_row=cfg.num_cell_row, + num_cls_row=cfg.num_row, + num_grid_col=cfg.num_cell_col, + num_cls_col=cfg.num_col, + num_lane_on_row=cfg.num_lanes, + num_lane_on_col=cfg.num_lanes, + use_aux=cfg.use_aux, + input_height=cfg.train_height, + input_width=cfg.train_width, + ).cuda() diff --git a/model/model_tusimple.py b/model/model_tusimple.py new file mode 100644 index 0000000..b493a47 --- /dev/null +++ b/model/model_tusimple.py @@ -0,0 +1,18 @@ +from model.model_culane import parsingNet + + +def get_model(cfg): + return parsingNet( + pretrained=True, + backbone=cfg.backbone, + num_grid_row=cfg.num_cell_row, + num_cls_row=cfg.num_row, + 
num_grid_col=cfg.num_cell_col, + num_cls_col=cfg.num_col, + num_lane_on_row=cfg.num_lanes, + num_lane_on_col=cfg.num_lanes, + use_aux=cfg.use_aux, + input_height=cfg.train_height, + input_width=cfg.train_width, + fc_norm=cfg.fc_norm, + ).cuda() diff --git a/model/seg_model.py b/model/seg_model.py new file mode 100644 index 0000000..ebf11cb --- /dev/null +++ b/model/seg_model.py @@ -0,0 +1,68 @@ +import torch + +from utils.common import initialize_weights + + +class conv_bn_relu(torch.nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, bias=False): + super().__init__() + self.conv = torch.nn.Conv2d( + in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, bias=bias + ) + self.bn = torch.nn.BatchNorm2d(out_channels) + self.relu = torch.nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + + +class SegHead(torch.nn.Module): + def __init__(self, backbone, num_lanes): + super().__init__() + + self.aux_header2 = torch.nn.Sequential( + conv_bn_relu(128, 128, kernel_size=3, stride=1, padding=1) + if backbone in ["34", "18"] + else conv_bn_relu(512, 128, kernel_size=3, stride=1, padding=1), + conv_bn_relu(128, 128, 3, padding=1), + conv_bn_relu(128, 128, 3, padding=1), + conv_bn_relu(128, 128, 3, padding=1), + ) + self.aux_header3 = torch.nn.Sequential( + conv_bn_relu(256, 128, kernel_size=3, stride=1, padding=1) + if backbone in ["34", "18"] + else conv_bn_relu(1024, 128, kernel_size=3, stride=1, padding=1), + conv_bn_relu(128, 128, 3, padding=1), + conv_bn_relu(128, 128, 3, padding=1), + ) + self.aux_header4 = torch.nn.Sequential( + conv_bn_relu(512, 128, kernel_size=3, stride=1, padding=1) + if backbone in ["34", "18"] + else conv_bn_relu(2048, 128, kernel_size=3, stride=1, padding=1), + conv_bn_relu(128, 128, 3, padding=1), + ) + self.aux_combine = torch.nn.Sequential( + conv_bn_relu(384, 256, 3, padding=2, dilation=2), + conv_bn_relu(256, 128, 3, padding=2, dilation=2), + conv_bn_relu(128, 128, 3, padding=2, dilation=2), + conv_bn_relu(128, 128, 3, padding=4, dilation=4), + torch.nn.Conv2d(128, num_lanes + 1, 1) + # output : n, num_of_lanes+1, h, w + ) + + initialize_weights(self.aux_header2, self.aux_header3, self.aux_header4, self.aux_combine) + + # self.droput = torch.nn.Dropout(0.1) + + def forward(self, x2, x3, fea): + x2 = self.aux_header2(x2) + x3 = self.aux_header3(x3) + x3 = torch.nn.functional.interpolate(x3, scale_factor=2, mode="bilinear") + x4 = self.aux_header4(fea) + x4 = torch.nn.functional.interpolate(x4, scale_factor=4, mode="bilinear") + aux_seg = torch.cat([x2, x3, x4], dim=1) + aux_seg = self.aux_combine(aux_seg) + return aux_seg diff --git a/my_interp/build.sh b/my_interp/build.sh new file mode 100644 index 0000000..ad4d984 --- /dev/null +++ b/my_interp/build.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +rm -rf build && +rm -rf ./*.egg-info && +rm -rf dist && +python setup.py install diff --git a/my_interp/my_interp_cuda.cpp b/my_interp/my_interp_cuda.cpp new file mode 100644 index 0000000..376d792 --- /dev/null +++ b/my_interp/my_interp_cuda.cpp @@ -0,0 +1,30 @@ +#include <torch/extension.h> + +#include <vector> + +// CUDA forward declarations + +torch::Tensor my_interp_cuda( + torch::Tensor input, torch::Tensor interp_loc, int direction); + // direction 0 for horizontal, 1 for vertical + + +// C++ interface + +// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4. 
+#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +torch::Tensor run( + torch::Tensor input, torch::Tensor interp_loc, int direction) { + CHECK_INPUT(input); + CHECK_INPUT(interp_loc); + // direction 0 for horizontal, 1 for vertical + auto res = my_interp_cuda(input, interp_loc, direction); + return res; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("run", &run, "run my interp"); +} diff --git a/my_interp/my_interp_cuda_kernel.cu b/my_interp/my_interp_cuda_kernel.cu new file mode 100644 index 0000000..20c7f01 --- /dev/null +++ b/my_interp/my_interp_cuda_kernel.cu @@ -0,0 +1,96 @@ +#include <torch/extension.h> + +#include <cuda.h> +#include <cuda_runtime.h> + +#include <vector> +#include <iostream> + +namespace { + + __global__ void my_interp_cuda_kernel( + const torch::PackedTensorAccessor<float,4,torch::RestrictPtrTraits,size_t> d_input, + const torch::PackedTensorAccessor<float,1,torch::RestrictPtrTraits,size_t> d_interp_loc, + torch::PackedTensorAccessor<float,4,torch::RestrictPtrTraits,size_t> d_output, + int direction + ){ + // direction 0 for horizontal, 1 for vertical + + + int blockId = blockIdx.x + blockIdx.y * gridDim.x; + int idx = blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x; + + const int bs = d_input.size(0); + const int ls = d_input.size(1); + const int cs = d_input.size(2); + + const int newcs = d_interp_loc.size(0); + + if (idx >= bs*ls*newcs){ + return; + } + int newcs_idx = idx % newcs; + int ls_idx = (idx / newcs) % ls; + int bs_idx = (idx / newcs) / ls; + + float current_loc = d_interp_loc[newcs_idx]; + d_output[bs_idx][ls_idx][newcs_idx][1-direction] = current_loc; + + int pos = -1; + for (int i = cs - 1; i > 0; i-- ){ + if (d_input[bs_idx][ls_idx][i][direction] < 0 || d_input[bs_idx][ls_idx][i-1][direction] < 0){ + continue; + } + if (d_input[bs_idx][ls_idx][i][1-direction] < 0 || d_input[bs_idx][ls_idx][i-1][1-direction] < 0){ + continue; + } + if ( (d_input[bs_idx][ls_idx][i][1-direction] - current_loc) * (d_input[bs_idx][ls_idx][i-1][1-direction] - current_loc) <= 0){ + pos = i; + break; + } + } + if (pos == -1){ return; } + + float len = abs(d_input[bs_idx][ls_idx][pos][1-direction] - d_input[bs_idx][ls_idx][pos-1][1-direction]); + float part1 = abs( d_input[bs_idx][ls_idx][pos][1-direction] - current_loc ); + // float part2 = abs( d_input[bs_idx][ls_idx][pos-1][1-direction] - current_loc ); + float factor1 = 1 - part1 / len; + float factor2 = 1 - factor1; + + float value = d_input[bs_idx][ls_idx][pos][direction] * factor1 + d_input[bs_idx][ls_idx][pos-1][direction] * factor2; + + + d_output[bs_idx][ls_idx][newcs_idx][direction] = value; + + } + + +} + +torch::Tensor my_interp_cuda( + torch::Tensor input, torch::Tensor interp_loc, int direction){ + // input is : num_batch, num_lane, num_cls_per_lane, 2 + // interp_loc: new_num_cls_per_lane + + const int bs = input.size(0); + const int ls = input.size(1); + const int cs = input.size(2); + + const int newcs = interp_loc.size(0); + + auto options = torch::TensorOptions().dtype(torch::kFloat32).layout(torch::kStrided).device(torch::kCUDA).requires_grad(false); + auto res = torch::zeros({bs, ls, newcs, 2}, options) -1; + + + const int threads = 1024; + const int blocks = (bs*ls*newcs + threads - 1) / threads; + + my_interp_cuda_kernel<<<blocks, threads>>>( + input.packed_accessor<float,4,torch::RestrictPtrTraits,size_t>(), + interp_loc.packed_accessor<float,1,torch::RestrictPtrTraits,size_t>(), + res.packed_accessor<float,4,torch::RestrictPtrTraits,size_t>(), + direction + ); + // direction 0 for horizontal, 1 for vertical + return res; +} diff --git a/my_interp/setup.py b/my_interp/setup.py new file mode 100644 index 
0000000..21f80bb --- /dev/null +++ b/my_interp/setup.py @@ -0,0 +1,17 @@ +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension +from torch.utils.cpp_extension import CUDAExtension + +setup( + name="my_interp", + ext_modules=[ + CUDAExtension( + "my_interp", + [ + "my_interp_cuda.cpp", + "my_interp_cuda_kernel.cu", + ], + ), + ], + cmdclass={"build_ext": BuildExtension}, +) diff --git a/my_interp/test.py b/my_interp/test.py new file mode 100644 index 0000000..80f1abf --- /dev/null +++ b/my_interp/test.py @@ -0,0 +1,105 @@ +import os + +import cv2 +import numpy as np +import torch + +import my_interp + + +def draw_points(img, points, color): + points = points.view(-1, 2).cpu().numpy() + for x, y in points: + if x < 0 or y < 0: + continue + img = cv2.circle(img, (int(x), int(y)), 5, color, -1) + return img + + +def test(culane_root): + # points = torch.rand(32, 4, 35, 2) + test_lines_txt_path = os.path.join(culane_root, "driver_161_90frame/06031919_0929.MP4/00000.lines.txt") + lanes = open(test_lines_txt_path).readlines() + + all_points = np.zeros((4, 35, 2), dtype=np.float64) + the_anno_row_anchor = np.array( + [ + 250, + 260, + 270, + 280, + 290, + 300, + 310, + 320, + 330, + 340, + 350, + 360, + 370, + 380, + 390, + 400, + 410, + 420, + 430, + 440, + 450, + 460, + 470, + 480, + 490, + 500, + 510, + 520, + 530, + 540, + 550, + 560, + 570, + 580, + 590, + ] + ) + all_points[:, :, 1] = np.tile(the_anno_row_anchor, (4, 1)) + all_points[:, :, 0] = -99999 + + label_img = cv2.imread( + os.path.join(culane_root, "laneseg_label_w16/driver_161_90frame/06031919_0929.MP4/00000.png") + )[:, :, 0] + + for lane_idx, lane in enumerate(lanes): + ll = lane.strip().split(" ") + point_x = ll[::2] + point_y = ll[1::2] + + mid_x = int(float(point_x[int(len(point_x) / 2)])) + mid_y = int(float(point_y[int(len(point_x) / 2)])) + lane_order = label_img[mid_y - 1, mid_x - 1] + if lane_order == 0: + import pdb + + pdb.set_trace() + + for i in range(len(point_x)): + p1x = float(point_x[i]) + pos = (int(point_y[i]) - 250) / 10 + all_points[lane_order - 1, int(pos), 0] = p1x + all_points = torch.tensor(all_points).cuda().view(1, 4, 35, 2) + new_interp_locations = torch.linspace(0, 590, 30).cuda() + new_all_points = my_interp.run(all_points.float(), new_interp_locations.float(), 0) + # new_interp_locations = torch.linspace(0,1640,100).cuda() + # new_all_points = my_interp.run(all_points.float(), new_interp_locations.float(), 1) + img = ( + cv2.imread(os.path.join(culane_root, "laneseg_label_w16/driver_161_90frame/06031919_0929.MP4/00000.png")) * 128 + ) + img = draw_points(img, all_points, (0, 255, 0)) + img = draw_points(img, new_all_points, (0, 0, 255)) + cv2.imwrite("test.png", img) + torch.set_printoptions(sci_mode=False) + + +if __name__ == "__main__": + test("path/to/your/culane") diff --git a/prune.py b/prune.py new file mode 100644 index 0000000..752391c --- /dev/null +++ b/prune.py @@ -0,0 +1,226 @@ +import datetime +import os +import time +from copy import deepcopy +from functools import partial +from typing import Dict + +import onnx +import torch +from enot.pruning.label_selector import OptimalPruningLabelSelector +from enot.pruning.prune import prune_model +from enot.pruning.prune_calibrator import PruningCalibrator +from enot_latency_server.client import measure_latency_remote +from fvcore.nn import FlopCountAnalysis + +import utils.dist_utils +from evaluation.eval_wrapper import eval_lane +from export import TiCompatibleClsLinear +from utils.common import 
diff --git a/prune.py b/prune.py
new file mode 100644
index 0000000..752391c
--- /dev/null
+++ b/prune.py
@@ -0,0 +1,226 @@
+import datetime
+import os
+import time
+from copy import deepcopy
+from functools import partial
+from typing import Dict
+
+import onnx
+import torch
+from enot.pruning.label_selector import OptimalPruningLabelSelector
+from enot.pruning.prune import prune_model
+from enot.pruning.prune_calibrator import PruningCalibrator
+from enot_latency_server.client import measure_latency_remote
+from fvcore.nn import FlopCountAnalysis
+
+import utils.dist_utils
+from evaluation.eval_wrapper import eval_lane
+from export import TiCompatibleClsLinear
+from utils.common import calc_loss
+from utils.common import get_logger
+from utils.common import get_model
+from utils.common import get_train_loader
+from utils.common import get_work_dir
+from utils.common import inference
+from utils.common import merge_config
+from utils.common import save_model
+from utils.dist_utils import dist_print
+from utils.dist_utils import dist_tqdm
+from utils.dist_utils import synchronize
+from utils.factory import get_loss_dict
+from utils.factory import get_metric_dict
+from utils.factory import get_optimizer
+from utils.factory import get_scheduler
+
+
+def calibrate(
+    net: torch.nn.Module,
+    data_loader: torch.utils.data.DataLoader,
+    loss_dict: Dict,
+    logger: utils.dist_utils.DistSummaryWriter,
+    epoch: int,
+    dataset: torch.utils.data.Dataset,
+):
+    net.eval()
+    pruning_calibrator = PruningCalibrator(model=net)
+    progress_bar = dist_tqdm(data_loader)  # iterate over the argument, not the global train_loader
+    with pruning_calibrator:
+        for b_idx, data_label in enumerate(progress_bar):
+            global_step = epoch * len(data_loader) + b_idx
+            results = inference(net, data_label, dataset)
+
+            loss = calc_loss(
+                loss_dict=loss_dict,
+                results=results,
+                logger=logger,
+                global_step=global_step,
+                epoch=epoch,
+            )
+            loss.backward()
+
+    return pruning_calibrator.pruning_info
+
+
+def tune_bn(net, data_loader, dataset):
+    net.train()
+    progress_bar = dist_tqdm(data_loader)
+    for b_idx, data_label in enumerate(progress_bar):
+        _ = inference(net, data_label, dataset)
+
+    return net
+
+
+def measure_latency_on_server(model, device, image_size, port, host, ti_server=False):
+    model = deepcopy(model)
+    model.eval()
+    if ti_server:
+        opset = 9
+        model.cls[3] = TiCompatibleClsLinear(linear=model.cls[3]).to(device)
+    else:
+        opset = 11
+
+    torch.onnx.export(
+        model=model,
+        args=torch.ones((1, 3, *image_size), device=device),
+        f="model.onnx",
+        opset_version=opset,
+        input_names=["input"],
+        output_names=["output"],
+    )
+
+    onnx_model = onnx.load("model.onnx")
+    result = measure_latency_remote(onnx_model.SerializeToString(), host=host, port=port)
+    if isinstance(result, float):
+        return result
+    print(result)
+    return result["NPU_execution_ms"]
+
+
+def measure_flops(model):
+    model.eval()
+    # cfg comes from module scope; it is set in the __main__ block below.
+    flops = FlopCountAnalysis(model, torch.ones((1, 3, cfg.train_height, cfg.train_width)))
+    flops = flops.total()
+    return flops
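An aside on the MAC path: measure_flops relies on fvcore's FlopCountAnalysis. A minimal standalone sketch of the same pattern (the torchvision model is just a placeholder, not part of this repo):

```python
import torch
from fvcore.nn import FlopCountAnalysis
from torchvision.models import resnet18  # placeholder model

model = resnet18().eval()
flops = FlopCountAnalysis(model, torch.ones(1, 3, 224, 224))
print(flops.total())  # total multiply-accumulate count, used above as the "MAC" latency proxy
```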
ValueError("--model_ckpt should be passed to pruning script.") + + if distributed: + net = torch.nn.parallel.DistributedDataParallel(net, device_ids=[args.local_rank]) + optimizer = get_optimizer(net, cfg) + + if cfg.finetune is not None: + dist_print("finetune from ", cfg.finetune) + state_all = torch.load(cfg.finetune, map_location="cpu")["model"] + state_clip = {} # only use backbone parameters + for k, v in state_all.items(): + if "model" in k: + state_clip[k] = v + net.load_state_dict(state_clip, strict=False) + + scheduler = get_scheduler(optimizer, cfg, len(train_loader)) + dist_print(len(train_loader)) + metric_dict = get_metric_dict(cfg) + loss_dict = get_loss_dict(cfg) + logger = get_logger(work_dir, cfg) + epoch = 1 + + max_res = 0 + res = None + + if cfg.latency_type == "MAC": + latency_measurement_func = measure_flops + elif cfg.latency_type == "server": + latency_measurement_func = partial( + measure_latency_on_server, + device="cpu", + image_size=(cfg.train_height, cfg.train_width), + host=cfg.host, + port=cfg.port, + ti_server=cfg.ti_compatible, + ) + else: + raise ValueError(f"latency_type {cfg.latency_type} is not supported.") + + net.cpu() + baseline_latency = latency_measurement_func(net) + dist_print("baseline latency:", baseline_latency) + net.cuda() + + pruning_info = calibrate( + net=net, + data_loader=train_loader, + loss_dict=loss_dict, + logger=logger, + epoch=epoch, + dataset=cfg.dataset, + ) + + net.cpu() + label_selector = OptimalPruningLabelSelector( + model=net, + latency_calculation_function=latency_measurement_func, + target_latency=baseline_latency / cfg.acceleration, + n_search_steps=cfg.n_search_steps, + architecture_optimization_strategy=lambda x: (8, 1), + ) + labels = label_selector.select(pruning_info) + + pruned_model = prune_model(model=net, pruning_info=pruning_info, prune_labels=labels) + + train_loader.reset() + net.cuda() + tune_bn(net=pruned_model, data_loader=train_loader, dataset=cfg.dataset) + + res = eval_lane(pruned_model, cfg, ep=epoch, logger=logger) + + pruned_model.cpu() + pruned_model_latency = latency_measurement_func(pruned_model) + dist_print("pruned model latency:", pruned_model_latency) + dist_print("acceleration:", baseline_latency / pruned_model_latency) + + save_model(pruned_model, optimizer, epoch, work_dir, distributed) + logger.add_scalar("CuEval/X", max_res, global_step=epoch) + + logger.close() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..f82e0c0 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,21 @@ +[tool.black] +line-length = 120 +target-version = ["py38", "py39"] +include = '\.pyi?$' + +[tool.isort] +profile = "black" +line_length = 120 +ensure_newline_before_comments = true +force_single_line = true + +[tool.docformatter] +recursive = true +wrap-summaries = 0 +wrap-descriptions = 0 +blank = true +black = true +pre-summary-newline = true + +[tool.pyupgrade] +py38plus = true diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2a4554e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +addict +enot-autodl==3.3.2 +enot-latency-server==1.1.0 +imagesize +opencv-python +pathspec +scikit-learn +tensorboard +tqdm +ujson diff --git a/scripts/cache_culane_ponits.py b/scripts/cache_culane_ponits.py new file mode 100644 index 0000000..8916f9e --- /dev/null +++ b/scripts/cache_culane_ponits.py @@ -0,0 +1,96 @@ +import argparse +import json +import os + +import cv2 +import numpy as np +import tqdm + + +def get_args(): + parser = argparse.ArgumentParser() + 
parser.add_argument("--root", required=True, help="The root of the dataset") + return parser + + +if __name__ == "__main__": + args = get_args().parse_args() + culane_root = args.root + train_list = os.path.join(culane_root, "list/train_gt.txt") + with open(train_list) as fp: + res = fp.readlines() + cache_dict = {} + for line in tqdm.tqdm(res): + info = line.split(" ") + + label_path = os.path.join(culane_root, info[1][1:]) + label_img = cv2.imread(label_path)[:, :, 0] + + txt_path = info[0][1:].replace("jpg", "lines.txt") + txt_path = os.path.join(culane_root, txt_path) + lanes = open(txt_path).readlines() + + all_points = np.zeros((4, 35, 2), dtype=np.float) + the_anno_row_anchor = np.array( + [ + 250, + 260, + 270, + 280, + 290, + 300, + 310, + 320, + 330, + 340, + 350, + 360, + 370, + 380, + 390, + 400, + 410, + 420, + 430, + 440, + 450, + 460, + 470, + 480, + 490, + 500, + 510, + 520, + 530, + 540, + 550, + 560, + 570, + 580, + 590, + ] + ) + all_points[:, :, 1] = np.tile(the_anno_row_anchor, (4, 1)) + all_points[:, :, 0] = -99999 + # init using no lane + + for lane_idx, lane in enumerate(lanes): + ll = lane.strip().split(" ") + point_x = ll[::2] + point_y = ll[1::2] + + mid_x = int(float(point_x[int(len(point_x) / 2)])) + mid_y = int(float(point_y[int(len(point_x) / 2)])) + lane_order = label_img[mid_y - 1, mid_x - 1] + if lane_order == 0: + import pdb + + pdb.set_trace() + + for i in range(len(point_x)): + p1x = float(point_x[i]) + pos = (int(point_y[i]) - 250) / 10 + all_points[lane_order - 1, int(pos), 0] = p1x + cache_dict[info[0][1:]] = all_points.tolist() + with open(os.path.join(culane_root, "culane_anno_cache.json"), "w") as f: + json.dump(cache_dict, f) diff --git a/scripts/convert_curvelanes.py b/scripts/convert_curvelanes.py new file mode 100644 index 0000000..1c81e85 --- /dev/null +++ b/scripts/convert_curvelanes.py @@ -0,0 +1,198 @@ +import argparse +import json +import os + +import cv2 +import imagesize +import numpy as np +import tqdm + + +def calc_k(line, height, width, angle=False): + """Calculate the direction of lanes.""" + line_x = line[::2] + line_y = line[1::2] + + length = np.sqrt((line_x[0] - line_x[-1]) ** 2 + (line_y[0] - line_y[-1]) ** 2) + if length < 90: + return -10 + p = np.polyfit(line_x, line_y, deg=1) + rad = np.arctan(p[0]) + + if angle: + return rad + + try: + curve = np.polyfit(line_x[:2], line_y[:2], deg=1) + except Exception: + curve = np.polyfit(line_x[:3], line_y[:3], deg=1) + + try: + curve1 = np.polyfit(line_y[:2], line_x[:2], deg=1) + except Exception: + curve1 = np.polyfit(line_y[:3], line_x[:3], deg=1) + + if rad < 0: + y = np.poly1d(curve)(0) + if y > height: + result = np.poly1d(curve1)(height) + else: + result = -(height - y) + else: + y = np.poly1d(curve)(width) + if y > height: + result = np.poly1d(curve1)(height) + else: + result = width + (height - y) + + return result + + +def draw(im, line, idx, ratio_height=1, ratio_width=1, show=False): + """Generate the segmentation label according to json annotation.""" + line_x = np.array(line[::2]) * ratio_width + line_y = np.array(line[1::2]) * ratio_height + pt0 = (int(line_x[0]), int(line_y[0])) + if show: + cv2.putText( + im, + str(idx), + (int(line_x[len(line_x) // 2]), int(line_y[len(line_x) // 2]) - 20), + cv2.FONT_HERSHEY_SIMPLEX, + 1.0, + (255, 255, 255), + lineType=cv2.LINE_AA, + ) + idx = idx * 60 + + for i in range(len(line_x) - 1): + cv2.line(im, pt0, (int(line_x[i + 1]), int(line_y[i + 1])), (idx,), thickness=16) + pt0 = (int(line_x[i + 1]), int(line_y[i + 1])) + + +def 
diff --git a/scripts/convert_curvelanes.py b/scripts/convert_curvelanes.py
new file mode 100644
index 0000000..1c81e85
--- /dev/null
+++ b/scripts/convert_curvelanes.py
@@ -0,0 +1,198 @@
+import argparse
+import json
+import os
+
+import cv2
+import imagesize
+import numpy as np
+import tqdm
+
+
+def calc_k(line, height, width, angle=False):
+    """Calculate the direction of lanes."""
+    line_x = line[::2]
+    line_y = line[1::2]
+
+    length = np.sqrt((line_x[0] - line_x[-1]) ** 2 + (line_y[0] - line_y[-1]) ** 2)
+    if length < 90:
+        return -10
+    p = np.polyfit(line_x, line_y, deg=1)
+    rad = np.arctan(p[0])
+
+    if angle:
+        return rad
+
+    try:
+        curve = np.polyfit(line_x[:2], line_y[:2], deg=1)
+    except Exception:
+        curve = np.polyfit(line_x[:3], line_y[:3], deg=1)
+
+    try:
+        curve1 = np.polyfit(line_y[:2], line_x[:2], deg=1)
+    except Exception:
+        curve1 = np.polyfit(line_y[:3], line_x[:3], deg=1)
+
+    if rad < 0:
+        y = np.poly1d(curve)(0)
+        if y > height:
+            result = np.poly1d(curve1)(height)
+        else:
+            result = -(height - y)
+    else:
+        y = np.poly1d(curve)(width)
+        if y > height:
+            result = np.poly1d(curve1)(height)
+        else:
+            result = width + (height - y)
+
+    return result
+
+
+def draw(im, line, idx, ratio_height=1, ratio_width=1, show=False):
+    """Generate the segmentation label according to json annotation."""
+    line_x = np.array(line[::2]) * ratio_width
+    line_y = np.array(line[1::2]) * ratio_height
+    pt0 = (int(line_x[0]), int(line_y[0]))
+    if show:
+        cv2.putText(
+            im,
+            str(idx),
+            (int(line_x[len(line_x) // 2]), int(line_y[len(line_x) // 2]) - 20),
+            cv2.FONT_HERSHEY_SIMPLEX,
+            1.0,
+            (255, 255, 255),
+            lineType=cv2.LINE_AA,
+        )
+        idx = idx * 60  # scale up only for visualization; real labels keep values 1..10
+
+    for i in range(len(line_x) - 1):
+        cv2.line(im, pt0, (int(line_x[i + 1]), int(line_y[i + 1])), (idx,), thickness=16)
+        pt0 = (int(line_x[i + 1]), int(line_y[i + 1]))
+
+
+def spline(arr, the_anno_row_anchor, ratio_height=1, ratio_width=1):
+    # Fit x = f(y) with a low-order polynomial and resample it on the fixed row anchors.
+    arr = np.array(arr)
+    arr[1::2] = arr[1::2] * ratio_height
+    arr[::2] = arr[::2] * ratio_width
+    curve = np.polyfit(arr[1::2], arr[::2], min(len(arr[::2]) - 1, 3))
+    _min = arr[1::2].min()
+    _max = arr[1::2].max()
+    valid = ~((the_anno_row_anchor <= _max) & (the_anno_row_anchor >= _min))
+    new_x = np.polyval(curve, the_anno_row_anchor)
+    final_anno_list = np.concatenate([new_x.reshape(-1, 1), the_anno_row_anchor.reshape(-1, 1)], -1)
+    final_anno_list[valid, 0] = -99999
+
+    return final_anno_list
+
+
+def get_curvelanes_list(root, label_dir):
+    """Get all the files' names from the json annotation."""
+    l = os.path.join(root, label_dir)
+    line_txt = []
+    names = []
+    for img_name in tqdm.tqdm(os.listdir(os.path.join(l, "images"))):
+        temp_img_name = os.path.join("images", img_name)
+        names.append(temp_img_name)
+        label = img_name.replace("jpg", "lines.json")
+        f = open(os.path.join(l, "labels", label))
+        lines = json.load(f)["Lines"]
+        f.close()
+        temp_lines = []
+        for line in lines:
+            temp_line = []
+            line = sorted(line, key=lambda x: -float(x["y"]))
+            for point in line:
+                temp_line.append(float(point["x"]))
+                temp_line.append(float(point["y"]))
+            temp_lines.append(temp_line)
+        line_txt.append(temp_lines)
+
+    return names, line_txt
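The spline helper above fits a polynomial x = f(y) and resamples it on the fixed row anchors, marking anchors outside the annotated span as missing. The same idea in isolation, with toy values:

```python
import numpy as np

ys = np.array([1400.0, 1100.0, 800.0, 500.0])  # annotated y coordinates (toy values)
xs = np.array([1200.0, 1150.0, 1080.0, 990.0])  # matching x coordinates
anchors = np.arange(200, 1450, 10)  # CurveLanes row anchors

curve = np.polyfit(ys, xs, deg=min(len(xs) - 1, 3))
resampled = np.polyval(curve, anchors)
resampled[(anchors < ys.min()) | (anchors > ys.max())] = -99999  # outside the annotated span
```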
+
+
+def generate_segmentation_and_train_list(
+    root, line_txt, names, file_name="train_gt.txt", json_name="curvelanes_anno_cache.json"
+):
+    """
+    Like Tusimple, the lane annotations of CurveLanes are not strictly ordered, so we need to determine the
+    correct lane order for segmentation.
+
+    We use the same definition as CULane, in which the lanes from left to right are numbered consecutively in
+    the segmentation label.
+
+    """
+    assert os.path.exists(root)
+    train_gt_fp = open(os.path.join(root, file_name), "w")
+    cache_dict = {}
+    if not os.path.exists(os.path.join(root, "segs")):
+        os.mkdir(os.path.join(root, "segs"))
+
+    for i in tqdm.tqdm(range(len(line_txt))):
+        lines = line_txt[i]
+
+        width, height = imagesize.get(os.path.join(root, names[i]))
+
+        ks = np.array([calc_k(line, height, width) for line in lines])  # get the direction of each lane
+        ks_theta = np.array(
+            [calc_k(line, height, width, angle=True) for line in lines]
+        )  # get the angle of each lane
+
+        k_neg = ks[ks_theta < 0].copy()
+        k_neg_theta = ks_theta[ks_theta < 0].copy()
+        k_pos = ks[ks_theta > 0].copy()
+        k_pos_theta = ks_theta[ks_theta > 0].copy()
+        k_neg = k_neg[k_neg_theta != -10]  # -10 means the lane is too short and is discarded
+        k_pos = k_pos[k_pos_theta != -10]
+        k_neg.sort()
+        k_pos.sort()
+        label_path = "segs" + names[i][6:-3] + "png"
+        label = np.zeros((height, width), dtype=np.uint8)
+        bin_label = [0] * 10
+
+        all_points = np.zeros((10, 125, 2), dtype=np.float64)
+        the_anno_row_anchor = np.arange(200, 1450, 10)  # 200, 210, ..., 1440
+        all_points[:, :, 1] = np.tile(the_anno_row_anchor, (10, 1))
+        all_points[:, :, 0] = -99999
+
+        rw = 2560 / width
+        rh = 1440 / height
+        for idx in range(len(k_neg))[:5]:
+            which_lane = np.where(ks == k_neg[idx])[0][0]
+            draw(label, lines[which_lane], 5 - idx)
+            bin_label[4 - idx] = 1
+            all_points[4 - idx] = spline(
+                np.array(lines[which_lane]), the_anno_row_anchor, ratio_height=rh, ratio_width=rw
+            )
+
+        for idx in range(len(k_pos))[:5]:
+            which_lane = np.where(ks == k_pos[-(idx + 1)])[0][0]
+            draw(label, lines[which_lane], 6 + idx)
+            bin_label[5 + idx] = 1
+            all_points[5 + idx] = spline(
+                np.array(lines[which_lane]), the_anno_row_anchor, ratio_height=rh, ratio_width=rw
+            )
+
+        cv2.imwrite(os.path.join(root, label_path), label)
+        cache_dict["train/" + names[i]] = all_points.tolist()
+
+        train_gt_fp.write(
+            "train/" + names[i] + " " + "train/" + label_path + " " + " ".join(list(map(str, bin_label))) + "\n"
+        )
+    train_gt_fp.close()
+    with open(os.path.join(root, json_name), "w") as f:
+        json.dump(cache_dict, f)
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--root", required=True, help="The root of the CurveLanes dataset")
+    return parser
+
+
+if __name__ == "__main__":
+    args = get_args().parse_args()
+
+    names, line_txt = get_curvelanes_list(args.root, "train")
+    # generate training list for training
+    generate_segmentation_and_train_list(os.path.join(args.root, "train"), line_txt, names)
+
+    # names, line_txt = get_curvelanes_list(args.root, 'valid')
+    # generate_segmentation_and_train_list(os.path.join(args.root, 'valid'), line_txt, names, file_name='valid_gt.txt', json_name='culane_anno_cache_val.json')
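Both converters emit CULane-style list lines: image path, segmentation-label path, then one existence bit per lane slot (10 slots for CurveLanes, 4 for Tusimple). Parsing such a line is straightforward:

```python
line = "train/images/0001.jpg train/segs/0001.png 0 1 1 1 1 0 0 0 0 0"  # illustrative values
img_path, seg_path, *bits = line.split()
lane_exists = [int(b) for b in bits]  # which lane slots are present in this frame
```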
+ + """ + train_gt_fp = open(os.path.join(root, "train_gt.txt"), "w") + + cache_dict = {} + + for i in tqdm.tqdm(range(len(line_txt))): + tmp_line = line_txt[i] + lines = [] + for j in range(len(tmp_line)): + lines.append(list(map(float, tmp_line[j]))) + + ks = np.array([calc_k(line) for line in lines]) # get the direction of each lane + + k_neg = ks[ks < 0].copy() + k_pos = ks[ks > 0].copy() + k_neg = k_neg[k_neg != -10] # -10 means the lane is too short and is discarded + k_pos = k_pos[k_pos != -10] + k_neg.sort() + k_pos.sort() + + label_path = names[i][:-3] + "png" + label = np.zeros((720, 1280), dtype=np.uint8) + all_points = np.zeros((4, 56, 2), dtype=np.float) + the_anno_row_anchor = np.array( + [ + 160, + 170, + 180, + 190, + 200, + 210, + 220, + 230, + 240, + 250, + 260, + 270, + 280, + 290, + 300, + 310, + 320, + 330, + 340, + 350, + 360, + 370, + 380, + 390, + 400, + 410, + 420, + 430, + 440, + 450, + 460, + 470, + 480, + 490, + 500, + 510, + 520, + 530, + 540, + 550, + 560, + 570, + 580, + 590, + 600, + 610, + 620, + 630, + 640, + 650, + 660, + 670, + 680, + 690, + 700, + 710, + ] + ) + all_points[:, :, 1] = np.tile(the_anno_row_anchor, (4, 1)) + all_points[:, :, 0] = -99999 + bin_label = [0, 0, 0, 0] + if len(k_neg) == 1: # for only one lane in the left + which_lane = np.where(ks == k_neg[0])[0][0] + draw(label, lines[which_lane], 2) + xx = np.array(lines[which_lane][::2]) + yy = ((np.array(lines[which_lane][1::2]) - 160) / 10).astype(int) + all_points[1, yy, 0] = xx + bin_label[1] = 1 + elif len(k_neg) == 2: # for two lanes in the left + which_lane = np.where(ks == k_neg[1])[0][0] + draw(label, lines[which_lane], 1) + xx = np.array(lines[which_lane][::2]) + yy = ((np.array(lines[which_lane][1::2]) - 160) / 10).astype(int) + all_points[0, yy, 0] = xx + which_lane = np.where(ks == k_neg[0])[0][0] + draw(label, lines[which_lane], 2) + xx = np.array(lines[which_lane][::2]) + yy = ((np.array(lines[which_lane][1::2]) - 160) / 10).astype(int) + all_points[1, yy, 0] = xx + bin_label[0] = 1 + bin_label[1] = 1 + elif len(k_neg) > 2: # for more than two lanes in the left, + which_lane = np.where(ks == k_neg[1])[0][0] # we only choose the two lanes that are closest to the center + draw(label, lines[which_lane], 1) + xx = np.array(lines[which_lane][::2]) + yy = ((np.array(lines[which_lane][1::2]) - 160) / 10).astype(int) + all_points[0, yy, 0] = xx + which_lane = np.where(ks == k_neg[0])[0][0] + draw(label, lines[which_lane], 2) + xx = np.array(lines[which_lane][::2]) + yy = ((np.array(lines[which_lane][1::2]) - 160) / 10).astype(int) + all_points[1, yy, 0] = xx + bin_label[0] = 1 + bin_label[1] = 1 + + if len(k_pos) == 1: # For the lanes in the right, the same logical is adopted. 
+ which_lane = np.where(ks == k_pos[0])[0][0] + draw(label, lines[which_lane], 3) + xx = np.array(lines[which_lane][::2]) + yy = ((np.array(lines[which_lane][1::2]) - 160) / 10).astype(int) + all_points[2, yy, 0] = xx + bin_label[2] = 1 + elif len(k_pos) == 2: + which_lane = np.where(ks == k_pos[1])[0][0] + draw(label, lines[which_lane], 3) + xx = np.array(lines[which_lane][::2]) + yy = ((np.array(lines[which_lane][1::2]) - 160) / 10).astype(int) + all_points[2, yy, 0] = xx + which_lane = np.where(ks == k_pos[0])[0][0] + draw(label, lines[which_lane], 4) + xx = np.array(lines[which_lane][::2]) + yy = ((np.array(lines[which_lane][1::2]) - 160) / 10).astype(int) + all_points[3, yy, 0] = xx + bin_label[2] = 1 + bin_label[3] = 1 + elif len(k_pos) > 2: + which_lane = np.where(ks == k_pos[-1])[0][0] + draw(label, lines[which_lane], 3) + xx = np.array(lines[which_lane][::2]) + yy = ((np.array(lines[which_lane][1::2]) - 160) / 10).astype(int) + all_points[2, yy, 0] = xx + which_lane = np.where(ks == k_pos[-2])[0][0] + draw(label, lines[which_lane], 4) + xx = np.array(lines[which_lane][::2]) + yy = ((np.array(lines[which_lane][1::2]) - 160) / 10).astype(int) + all_points[3, yy, 0] = xx + bin_label[2] = 1 + bin_label[3] = 1 + + cv2.imwrite(os.path.join(root, label_path), label) + + cache_dict[names[i]] = all_points.tolist() + train_gt_fp.write(names[i] + " " + label_path + " " + " ".join(list(map(str, bin_label))) + "\n") + train_gt_fp.close() + with open(os.path.join(root, "tusimple_anno_cache.json"), "w") as f: + json.dump(cache_dict, f) + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--root", required=True, help="The root of the Tusimple dataset") + return parser + + +if __name__ == "__main__": + args = get_args().parse_args() + + # training set + names, line_txt = get_tusimple_list( + args.root, ["label_data_0601.json", "label_data_0531.json", "label_data_0313.json"] + ) + # generate segmentation and training list for training + generate_segmentation_and_train_list(args.root, line_txt, names) + + # testing set + names, line_txt = get_tusimple_list(args.root, ["test_tasks_0627.json"]) + # generate testing set for testing + with open(os.path.join(args.root, "test.txt"), "w") as fp: + for name in names: + fp.write(name + "\n") diff --git a/scripts/make_curvelane_as_culane_test.py b/scripts/make_curvelane_as_culane_test.py new file mode 100644 index 0000000..e00a3ca --- /dev/null +++ b/scripts/make_curvelane_as_culane_test.py @@ -0,0 +1,73 @@ +import argparse +import json +import os + +import imagesize +import tqdm + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--root", required=True, help="The root of the dataset") + return parser + + +def read_label(label_path, x_factor, y_factor): + js = json.load(open(label_path))["Lines"] + all_lanes = [] + for ll in js: + cur_lane_x = [] + cur_lane_y = [] + for pt in ll: + loc_x = float(pt["x"]) * x_factor + loc_y = float(pt["y"]) * y_factor + cur_lane_x.append(loc_x) + cur_lane_y.append(loc_y) + # img = cv2.circle(img, (int(loc_x), int(loc_y)), 5, (0,0,255), -1) + + cur_lane_x_sorted = [x for _, x in sorted(zip(cur_lane_y, cur_lane_x))] + cur_lane_y_sorted = sorted(cur_lane_y) + + all_lanes.append([str(val) for pair in zip(cur_lane_x_sorted, cur_lane_y_sorted) for val in pair]) + + return all_lanes + + +def generate_linestxt_on_curvelane_val(): + args = get_args().parse_args() + curvelane_val_root = os.path.join(args.root, "valid") + + assert os.path.exists(curvelane_val_root) + + assert 
os.path.exists(os.path.join(curvelane_val_root, "images"))
+
+    list_file = os.path.join(curvelane_val_root, "valid.txt")
+
+    all_files = open(list_file).readlines()
+
+    for file in tqdm.tqdm(all_files):
+        file = file.strip()
+        label_path = file.replace("images", "labels")
+        label_path = label_path.replace(".jpg", ".lines.json")
+
+        label_path = os.path.join(curvelane_val_root, label_path)
+        file_path = os.path.join(curvelane_val_root, file)
+
+        width, height = imagesize.get(file_path)
+
+        culane_style_label = read_label(label_path, x_factor=2560 / width, y_factor=1440 / height)
+        culane_style_label_store_path = os.path.join(curvelane_val_root, file).replace("jpg", "lines.txt")
+        with open(culane_style_label_store_path, "w") as f:
+            for culane_style_label_i in culane_style_label:
+                f.write(" ".join(culane_style_label_i) + "\n")
+
+    fp = open(os.path.join(curvelane_val_root, "valid.txt"))
+    res = fp.readlines()
+    fp.close()
+    res = [os.path.join("valid", r) for r in res]
+    with open(os.path.join(curvelane_val_root, "valid_for_culane_style.txt"), "w") as fp:
+        fp.writelines(res)
+
+
+if __name__ == "__main__":
+    generate_linestxt_on_curvelane_val()
diff --git a/speed_simple.py b/speed_simple.py
new file mode 100644
index 0000000..de96b52
--- /dev/null
+++ b/speed_simple.py
@@ -0,0 +1,32 @@
+import time
+
+import numpy as np
+import torch
+
+from utils.common import get_model
+from utils.common import merge_config
+
+torch.backends.cudnn.benchmark = True
+args, cfg = merge_config()
+net = get_model(cfg)
+net.eval()
+
+x = torch.ones((1, 3, cfg.train_height, cfg.train_width)).cuda()
+for i in range(10):  # warm up
+    y = net(x)
+
+t_all = []
+for i in range(100):
+    t1 = time.time()
+    y = net(x)
+    t2 = time.time()
+    t_all.append(t2 - t1)
+
+print("average time:", np.mean(t_all))
+print("average fps:", 1 / np.mean(t_all))
+
+print("fastest time:", min(t_all))
+print("fastest fps:", 1 / min(t_all))
+
+print("slowest time:", max(t_all))
+print("slowest fps:", 1 / max(t_all))
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..e520811
--- /dev/null
+++ b/test.py
@@ -0,0 +1,43 @@
+import os
+
+import torch
+
+from evaluation.eval_wrapper import eval_lane
+from utils.common import get_model
+from utils.common import merge_config
+
+if __name__ == "__main__":
+    torch.backends.cudnn.benchmark = True
+
+    args, cfg = merge_config()
+
+    distributed = False
+    if "WORLD_SIZE" in os.environ:
+        distributed = int(os.environ["WORLD_SIZE"]) > 1
+    cfg.distributed = distributed
+    if distributed:
+        torch.cuda.set_device(args.local_rank)
+        torch.distributed.init_process_group(backend="nccl", init_method="env://")
+
+    net = get_model(cfg)
+
+    if cfg.model_ckpt:
+        net = torch.load(cfg.model_ckpt, map_location="cpu")["model_ckpt"].cuda()
+    else:
+        state_dict = torch.load(cfg.test_model, map_location="cpu")["model"]  # a state_dict cannot be moved with .cuda()
+        compatible_state_dict = {}
+        for k, v in state_dict.items():
+            if "module."
in k: + compatible_state_dict[k[7:]] = v + else: + compatible_state_dict[k] = v + + net.load_state_dict(compatible_state_dict, strict=True) + + if distributed: + net = torch.nn.parallel.DistributedDataParallel(net, device_ids=[args.local_rank]) + + if not os.path.exists(cfg.test_work_dir): + os.mkdir(cfg.test_work_dir) + + eval_lane(net, cfg) diff --git a/train.py b/train.py new file mode 100644 index 0000000..e4fdf4d --- /dev/null +++ b/train.py @@ -0,0 +1,243 @@ +import datetime +import os +import time +from typing import Callable +from typing import Dict + +import torch +from enot.optimize import GTBaselineOptimizer + +from evaluation.eval_wrapper import eval_lane +from utils.common import ExponentialMovingAverage +from utils.common import calc_loss +from utils.common import get_logger +from utils.common import get_model +from utils.common import get_train_loader +from utils.common import get_work_dir +from utils.common import inference +from utils.common import merge_config +from utils.common import save_model +from utils.dist_utils import dist_print +from utils.dist_utils import dist_tqdm +from utils.dist_utils import synchronize +from utils.factory import get_loss_dict +from utils.factory import get_metric_dict +from utils.factory import get_optimizer +from utils.factory import get_scheduler +from utils.metrics import reset_metrics +from utils.metrics import update_metrics + + +def train( + net: torch.nn.Module, + data_loader: torch.utils.data.DataLoader, + loss_dict: Dict[str, Callable], + optimizer, + scheduler, + logger, + epoch: int, + metric_dict: Dict[str, Callable], + dataset: str, + teacher: torch.nn.Module = None, + distill_loss_weight: float = None, + distill_loss_fn: Callable = None, + model_ema: ExponentialMovingAverage = None, +): + net.train() + progress_bar = dist_tqdm(data_loader) + for b_idx, data_label in enumerate(progress_bar): + global_step = epoch * len(data_loader) + b_idx + + results = None + common_loss = None + task_loss = None + distill_loss = None + + optimizer.zero_grad() + + def closure(): + nonlocal results + nonlocal common_loss + nonlocal task_loss + nonlocal distill_loss + results = inference(net, data_label, dataset, teacher=teacher) + task_loss = calc_loss(loss_dict, results, logger, global_step, epoch) + if teacher: + distill_loss = distill_loss_fn(results["student_out"], results["teacher_out"]) * distill_loss_weight + common_loss = task_loss + distill_loss + else: + common_loss = task_loss + common_loss.backward() + + if model_ema and b_idx % cfg.model_ema_steps == 0: + model_ema.update_parameters(net) + if epoch < args.ema_warmup_epochs: + # Reset ema buffer to keep copying weights during warmup period + model_ema.n_averaged.fill_(0) + + return common_loss + + optimizer.step(closure) + scheduler.step(global_step) + + if global_step % 20 == 0: + reset_metrics(metric_dict) + update_metrics(metric_dict, results) + for me_name, me_op in zip(metric_dict["name"], metric_dict["op"]): + logger.add_scalar("metric/" + me_name, me_op.get(), global_step=global_step) + logger.add_scalar("meta/lr", optimizer.param_groups[0]["lr"], global_step=global_step) + logger.add_scalar("train/task_loss", task_loss, global_step=global_step) + if teacher: + logger.add_scalar("train/distill_loss", distill_loss, global_step=global_step) + logger.add_scalar("train/common_loss", common_loss, global_step=global_step) + + if hasattr(progress_bar, "set_postfix"): + kwargs = { + me_name: "%.3f" % me_op.get() for me_name, me_op in zip(metric_dict["name"], metric_dict["op"]) + } + 
new_kwargs = {} + for k, v in kwargs.items(): + if "lane" in k: + continue + new_kwargs[k] = v + progress_bar.set_postfix(loss="%.3f" % float(common_loss), **new_kwargs) + + +if __name__ == "__main__": + torch.backends.cudnn.benchmark = True + + args, cfg = merge_config() + + if args.local_rank == 0: + work_dir = get_work_dir(cfg) + + distributed = False + if "WORLD_SIZE" in os.environ: + distributed = int(os.environ["WORLD_SIZE"]) > 1 + if distributed: + torch.cuda.set_device(args.local_rank) + torch.distributed.init_process_group(backend="nccl", init_method="env://") + + if args.local_rank == 0: + with open(".work_dir_tmp_file.txt", "w") as f: + f.write(work_dir) + else: + while not os.path.exists(".work_dir_tmp_file.txt"): + time.sleep(0.1) + with open(".work_dir_tmp_file.txt") as f: + work_dir = f.read().strip() + + synchronize() + cfg.test_work_dir = work_dir + cfg.distributed = distributed + + if args.local_rank == 0: + os.system("rm .work_dir_tmp_file.txt") + + dist_print(datetime.datetime.now().strftime("[%Y/%m/%d %H:%M:%S]") + " start training...") + dist_print(cfg) + assert cfg.backbone in ["18", "34", "50", "101", "152", "50next", "101next", "50wide", "101wide", "34fca"] + + train_loader = get_train_loader(cfg) + + resume_epoch = 0 + net = get_model(cfg) + if args.model_ckpt is not None: + net = torch.load(args.model_ckpt, map_location="cpu")["model_ckpt"].cuda() + optimizer = get_optimizer(net, cfg) + # resume now work as model ckpt + if cfg.resume is not None: + dist_print("==> Resume model from " + cfg.resume) + resume_dict = torch.load(cfg.resume, map_location="cpu") + net.load_state_dict(resume_dict["model"]) + net.cuda() + if "optimizer" in resume_dict.keys(): + optimizer.load_state_dict(resume_dict["optimizer"]) + + if distributed: + net = torch.nn.parallel.DistributedDataParallel(net, device_ids=[args.local_rank]) + + optimizer = GTBaselineOptimizer(model=net, optimizer=optimizer, rho=0.05) + model_ema = None + if cfg.model_ema: + # Decay adjustment that aims to keep the decay independent of other hyper-parameters originally proposed at: + # https://github.com/facebookresearch/pycls/blob/f8cd9627/pycls/core/net.py#L123 + # + # total_ema_updates = (Dataset_size / n_GPUs) * epochs / (batch_size_per_gpu * EMA_steps) + # We consider constant = Dataset_size for a given dataset/setup and omit it. 
Thus: + # adjust = 1 / total_ema_updates ~= n_GPUs * batch_size_per_gpu * EMA_steps / epochs + adjust = 1 * cfg.batch_size * cfg.model_ema_steps / cfg.epoch + alpha = 1.0 - cfg.model_ema_decay + alpha = min(1.0, alpha * adjust) + print(1.0 - alpha) + model_ema = ExponentialMovingAverage( + net, + decay=1.0 - alpha, + device="cuda", + ) + + if cfg.finetune is not None: + dist_print("finetune from ", cfg.finetune) + state_all = torch.load(cfg.finetune, map_location="cpu")["model"] + state_clip = {} # only use backbone parameters + for k, v in state_all.items(): + if "model" in k: + state_clip[k] = v + net.load_state_dict(state_clip, strict=False) + + if cfg.teacher: + teacher = torch.load(cfg.teacher, map_location="cpu")["model_ckpt"].cuda() + else: + teacher = None + + scheduler = get_scheduler(optimizer, cfg, len(train_loader)) + metric_dict = get_metric_dict(cfg) + loss_dict = get_loss_dict(cfg) + logger = get_logger(work_dir, cfg) + # cp_projects(cfg.auto_backup, work_dir) + max_res = 0 + res = None + for epoch in range(resume_epoch, cfg.epoch): + train( + net=net, + data_loader=train_loader, + loss_dict=loss_dict, + optimizer=optimizer, + scheduler=scheduler, + logger=logger, + epoch=epoch, + metric_dict=metric_dict, + dataset=cfg.dataset, + teacher=teacher, + distill_loss_weight=cfg.distill_loss_weight, + distill_loss_fn=torch.nn.MSELoss() if teacher else None, + model_ema=model_ema, + ) + train_loader.reset() + + if cfg.model_ema: + res = eval_lane(model_ema, cfg, ep=epoch, logger=logger) + else: + res = eval_lane(net, cfg, ep=epoch, logger=logger) + + if res is not None and res > max_res: + max_res = res + if cfg.model_ema: + save_model( + net=model_ema, + optimizer=optimizer, + epoch=epoch, + save_path=work_dir, + distributed=distributed, + ) + else: + save_model( + net=net, + optimizer=optimizer, + epoch=epoch, + save_path=work_dir, + distributed=distributed, + ) + logger.add_scalar("CuEval/X", max_res, global_step=epoch) + + logger.close() diff --git a/ufldv2.png b/ufldv2.png new file mode 100644 index 0000000..f615c4c Binary files /dev/null and b/ufldv2.png differ diff --git a/utils/common.py b/utils/common.py new file mode 100644 index 0000000..fe84f1b --- /dev/null +++ b/utils/common.py @@ -0,0 +1,481 @@ +import argparse +import datetime +import importlib +import os +import time + +import numpy as np +import pathspec +import torch + +from data.dali_data import TrainCollect +from utils.config import Config +from utils.dist_utils import DistSummaryWriter +from utils.dist_utils import dist_print +from utils.dist_utils import get_rank +from utils.dist_utils import get_world_size +from utils.dist_utils import is_main_process + + +def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("Boolean value expected.") + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("config", help="path to config file") + parser.add_argument("--local_rank", type=int, default=0) + + parser.add_argument("--dataset", default=None, type=str) + parser.add_argument("--data_root", default=None, type=str) + parser.add_argument("--epoch", default=None, type=int) + parser.add_argument("--batch_size", default=None, type=int) + parser.add_argument("--optimizer", default=None, type=str) + parser.add_argument("--learning_rate", default=None, type=float) + parser.add_argument("--weight_decay", default=None, type=float) + 
parser.add_argument("--momentum", default=None, type=float) + parser.add_argument("--scheduler", default=None, type=str) + parser.add_argument("--steps", default=None, type=int, nargs="+") + parser.add_argument("--gamma", default=None, type=float) + parser.add_argument("--warmup", default=None, type=str) + parser.add_argument("--warmup_iters", default=None, type=int) + parser.add_argument("--backbone", default=None, type=str) + parser.add_argument("--griding_num", default=None, type=int) + parser.add_argument("--use_aux", default=None, type=str2bool) + parser.add_argument("--sim_loss_w", default=None, type=float) + parser.add_argument("--shp_loss_w", default=None, type=float) + parser.add_argument("--note", default=None, type=str) + parser.add_argument("--log_path", default=None, type=str) + parser.add_argument( + "--finetune", + default=None, + type=str, + help="Path to checkpoint with model state_dict, to start training from", + ) + parser.add_argument( + "--resume", + default=None, + type=str, + help="Path ot checkpoint with model and optimizer states to start train from", + ) + parser.add_argument("--test_model", default=None, type=str) + parser.add_argument("--test_work_dir", default=None, type=str) + parser.add_argument("--num_lanes", default=None, type=int) + parser.add_argument("--auto_backup", action="store_false", help="automatically backup current code in the log path") + parser.add_argument("--var_loss_power", default=None, type=float) + parser.add_argument("--num_row", default=None, type=int) + parser.add_argument("--num_col", default=None, type=int) + parser.add_argument("--train_width", default=None, type=int) + parser.add_argument("--train_height", default=None, type=int) + parser.add_argument("--num_cell_row", default=None, type=int) + parser.add_argument("--num_cell_col", default=None, type=int) + parser.add_argument("--mean_loss_w", default=None, type=float) + parser.add_argument("--fc_norm", default=None, type=str2bool) + parser.add_argument("--soft_loss", default=None, type=str2bool) + parser.add_argument("--cls_loss_col_w", default=None, type=float) + parser.add_argument("--cls_ext_col_w", default=None, type=float) + parser.add_argument("--mean_loss_col_w", default=None, type=float) + parser.add_argument("--eval_mode", default=None, type=str) + parser.add_argument("--eval_during_training", default=None, type=str2bool) + parser.add_argument("--split_channel", default=None, type=str2bool) + parser.add_argument("--match_method", default=None, type=str, choices=["fixed", "hungarian"]) + parser.add_argument("--selected_lane", default=None, type=int, nargs="+") + parser.add_argument("--cumsum", default=None, type=str2bool) + parser.add_argument("--masked", default=None, type=str2bool) + parser.add_argument( + "--model_ckpt", + type=str, + help="Path to checkpoint to resume model as nn.Module (not state_diсt)", + ) + + # EMA + parser.add_argument("--model_ema", action="store_true", help="Flag for EMA using during model training") + parser.add_argument( + "--model_ema_steps", + type=int, + default=32, + help="the number of iterations that controls how often to update the EMA model (default: 32)", + ) + parser.add_argument( + "--model_ema_decay", + type=float, + default=0.99998, + help="decay factor for Exponential Moving Average of model parameters (default: 0.99998)", + ) + parser.add_argument("--ema_warmup_epochs", type=int, default=10, help="Number of epochs to train without EMA") + + # distillation args + parser.add_argument("--teacher", type=str, default=None, help="Path 
+
+    # distillation args
+    parser.add_argument("--teacher", type=str, default=None, help="Path to teacher checkpoint.")
+    parser.add_argument("--distill_loss_weight", type=float, default=1.0, help="Weight for distillation loss")
+
+    # pruning args
+    parser.add_argument(
+        "--latency_type",
+        default="MAC",
+        type=str,
+        choices=["MAC", "server"],
+        help="Type of latency for pruning, MAC used as default",
+    )
+    parser.add_argument(
+        "--acceleration",
+        type=float,
+        default=2.0,
+        help="Pruned model will be acceleration times faster than baseline",
+    )
+    parser.add_argument(
+        "--n_search_steps",
+        default=200,
+        type=int,
+        help="Number of search steps for optimal architecture",
+    )
+    parser.add_argument(
+        "--host",
+        default="localhost",
+        type=str,
+        help="Host of latency measurement server",
+    )
+    parser.add_argument(
+        "--port",
+        default=15003,
+        type=int,
+        help="Port of latency measurement server",
+    )
+    parser.add_argument(
+        "--ti_compatible",
+        action="store_true",
+        help="Flag for TI compatible ONNX export during pruning.",
+    )
+
+    # for evaluation on onnx
+    parser.add_argument("--onnx_path", default=None, type=str, help="Path to onnx for TensorRT inference.")
+
+    return parser
+
+
+def merge_config():
+    args = get_args().parse_args()
+    cfg = Config.fromfile(args.config)
+
+    items = [
+        "dataset",
+        "data_root",
+        "epoch",
+        "batch_size",
+        "optimizer",
+        "learning_rate",
+        "weight_decay",
+        "momentum",
+        "scheduler",
+        "steps",
+        "gamma",
+        "warmup",
+        "warmup_iters",
+        "use_aux",
+        "griding_num",
+        "backbone",
+        "sim_loss_w",
+        "shp_loss_w",
+        "note",
+        "log_path",
+        "finetune",
+        "resume",
+        "test_model",
+        "test_work_dir",
+        "num_lanes",
+        "var_loss_power",
+        "num_row",
+        "num_col",
+        "train_width",
+        "train_height",
+        "num_cell_row",
+        "num_cell_col",
+        "mean_loss_w",
+        "fc_norm",
+        "soft_loss",
+        "cls_loss_col_w",
+        "cls_ext_col_w",
+        "mean_loss_col_w",
+        "eval_mode",
+        "eval_during_training",
+        "split_channel",
+        "match_method",
+        "selected_lane",
+        "cumsum",
+        "masked",
+        "model_ckpt",
+        "model_ema",
+        "model_ema_steps",
+        "model_ema_decay",
+        "ema_warmup_epochs",
+        "teacher",
+        "distill_loss_weight",
+        "latency_type",
+        "acceleration",
+        "n_search_steps",
+        "host",
+        "port",
+        "ti_compatible",
+        "onnx_path",
+    ]
+    for item in items:
+        if getattr(args, item) is not None:
+            dist_print("merge ", item, " config")
+            setattr(cfg, item, getattr(args, item))
+
+    if cfg.dataset == "CULane":
+        cfg.row_anchor = np.linspace(0.42, 1, cfg.num_row)
+        cfg.col_anchor = np.linspace(0, 1, cfg.num_col)
+    elif cfg.dataset == "Tusimple":
+        cfg.row_anchor = np.linspace(160, 710, cfg.num_row) / 720
+        cfg.col_anchor = np.linspace(0, 1, cfg.num_col)
+    elif cfg.dataset == "CurveLanes":
+        cfg.row_anchor = np.linspace(0.4, 1, cfg.num_row)
+        cfg.col_anchor = np.linspace(0, 1, cfg.num_col)
+
+    return args, cfg
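As a concrete example of the anchor setup in merge_config: for Tusimple the row anchors are the pixel rows 160..710 normalized by the 720-pixel image height:

```python
import numpy as np

num_row = 56  # illustrative; the real value comes from the config file
row_anchor = np.linspace(160, 710, num_row) / 720
print(row_anchor[0], row_anchor[-1])  # ~0.2222 and ~0.9861
```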
+
+
+def save_model(net, optimizer, epoch, save_path, distributed, model_path="model_best.pth"):
+    if is_main_process():
+        model_state_dict = net.state_dict()
+        state = {"model_ckpt": net, "model": model_state_dict, "optimizer": optimizer.state_dict()}
+        assert os.path.exists(save_path)
+        model_path = os.path.join(save_path, model_path)
+        torch.save(state, model_path)
+
+
+def cp_projects(auto_backup, to_path):
+    if is_main_process() and auto_backup:
+        with open("./.gitignore") as fp:
+            ign = fp.read()
+        ign += "\n.git"
+        spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, ign.splitlines())
+        all_files = {os.path.join(root, name) for root, dirs, files in os.walk("./") for name in files}
+        matches = spec.match_files(all_files)
+        matches = set(matches)
+        to_cp_files = all_files - matches
+        dist_print("Copying projects to " + to_path + " for backup")
+        t0 = time.time()
+        warning_flag = True
+        for f in to_cp_files:
+            dirs = os.path.join(to_path, "code", os.path.split(f[2:])[0])
+            if not os.path.exists(dirs):
+                os.makedirs(dirs)
+            os.system("cp {} {}".format(f, os.path.join(to_path, "code", f[2:])))
+            elapsed_time = time.time() - t0
+            if elapsed_time > 5 and warning_flag:
+                dist_print(
+                    "If the program is stuck, it might be copying large files in this directory. Please don't set"
+                    " --auto_backup, or make your working directory clean, i.e., don't"
+                    " place large files like datasets or log results under this directory."
+                )
+                warning_flag = False
+
+
+def get_work_dir(cfg):
+    work_dir = os.path.join(cfg.log_path, cfg.note)
+    return work_dir
+
+
+def get_logger(work_dir, cfg):
+    logger = DistSummaryWriter(work_dir)
+    config_txt = os.path.join(work_dir, "cfg.txt")
+    if is_main_process():
+        with open(config_txt, "w") as fp:
+            fp.write(str(cfg))
+
+    return logger
+
+
+def initialize_weights(*models):
+    for model in models:
+        real_init_weights(model)
+
+
+def real_init_weights(m):
+    if isinstance(m, list):
+        for mini_m in m:
+            real_init_weights(mini_m)
+    else:
+        if isinstance(m, torch.nn.Conv2d):
+            torch.nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
+            if m.bias is not None:
+                torch.nn.init.constant_(m.bias, 0)
+        elif isinstance(m, torch.nn.Linear):
+            m.weight.data.normal_(0.0, std=0.01)
+        elif isinstance(m, torch.nn.BatchNorm2d):
+            torch.nn.init.constant_(m.weight, 1)
+            torch.nn.init.constant_(m.bias, 0)
+        elif isinstance(m, torch.nn.Module):
+            for mini_m in m.children():
+                real_init_weights(mini_m)
+        else:
+            print("unknown module", m)
+
+
+def get_model(cfg):
+    return importlib.import_module("model.model_" + cfg.dataset.lower()).get_model(cfg)
+
+
+def get_train_loader(cfg):
+    # The three datasets share the same loader arguments; only the train-list path differs.
+    train_list_paths = {
+        "CULane": os.path.join(cfg.data_root, "list/train_gt.txt"),
+        "Tusimple": os.path.join(cfg.data_root, "train_gt.txt"),
+        "CurveLanes": os.path.join(cfg.data_root, "train", "train_gt.txt"),
+    }
+    if cfg.dataset not in train_list_paths:
+        raise NotImplementedError
+    train_loader = TrainCollect(
+        cfg.batch_size,
+        4,
+        cfg.data_root,
+        train_list_paths[cfg.dataset],
+        get_rank(),
+        get_world_size(),
+        cfg.row_anchor,
+        cfg.col_anchor,
+        cfg.train_width,
+        cfg.train_height,
+        cfg.num_cell_row,
+        cfg.num_cell_col,
+        cfg.dataset,
+        cfg.crop_ratio,
+    )
+    return train_loader
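A note on the checkpoint layout used throughout: save_model above stores the full pickled module under "model_ckpt", its state_dict under "model", and the optimizer state under "optimizer"; test.py and prune.py rely on exactly these keys. Loading one back (path illustrative):

```python
import torch

ckpt = torch.load("model_best.pth", map_location="cpu")  # hypothetical path
net = ckpt["model_ckpt"]    # the whole nn.Module, as consumed via --model_ckpt
state_dict = ckpt["model"]  # plain state_dict, as consumed via --test_model
```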
data_label["labels_row"], + "cls_out_col": pred["loc_col"], + "cls_label_col": data_label["labels_col"], + "cls_out_ext": pred["exist_row"], + "cls_out_ext_label": cls_out_ext_label, + "cls_out_col_ext": pred["exist_col"], + "cls_out_col_ext_label": cls_out_col_ext_label, + "labels_row_float": data_label["labels_row_float"], + "labels_col_float": data_label["labels_col_float"], + } + if teacher: + res_dict["student_out"] = pred["out"] + res_dict["teacher_out"] = teacher(data_label["images"])["out"] + if "seg_out" in pred.keys(): + res_dict["seg_out"] = pred["seg_out"] + res_dict["seg_label"] = data_label["seg_images"] + + return res_dict + + +def inference_curvelanes(net, data_label, teacher=None): + pred = net(data_label["images"]) + cls_out_ext_label = (data_label["labels_row"] != -1).long() + cls_out_col_ext_label = (data_label["labels_col"] != -1).long() + + res_dict = { + "cls_out": pred["loc_row"], + "cls_label": data_label["labels_row"], + "cls_out_col": pred["loc_col"], + "cls_label_col": data_label["labels_col"], + "cls_out_ext": pred["exist_row"], + "cls_out_ext_label": cls_out_ext_label, + "cls_out_col_ext": pred["exist_col"], + "cls_out_col_ext_label": cls_out_col_ext_label, + "seg_label": data_label["seg_images"], + "seg_out_row": pred["lane_token_row"], + "seg_out_col": pred["lane_token_col"], + } + if teacher: + res_dict["student_out"] = pred["out"] + res_dict["teacher_out"] = teacher(data_label["images"])["out"] + if "seg_out" in pred.keys(): + res_dict["seg_out"] = pred["seg_out"] + res_dict["seg_label"] = data_label["segs"] + return res_dict + + +def calc_loss(loss_dict, results, logger, global_step, epoch): + loss = 0 + + for i in range(len(loss_dict["name"])): + if loss_dict["weight"][i] == 0: + continue + + data_src = loss_dict["data_src"][i] + + datas = [results[src] for src in data_src] + + loss_cur = loss_dict["op"][i](*datas) + + if global_step % 20 == 0: + logger.add_scalar("loss/" + loss_dict["name"][i], loss_cur, global_step) + + loss += loss_cur * loss_dict["weight"][i] + + return loss + + +class ExponentialMovingAverage(torch.optim.swa_utils.AveragedModel): + """ + Maintains moving averages of model parameters using an exponential decay. + + ``ema_avg = decay * avg_model_param + (1 - decay) * model_param`` + `torch.optim.swa_utils.AveragedModel `_ + is used to compute the EMA. 
+ + """ + + def __init__(self, model, decay, device="cpu"): + def ema_avg(avg_model_param, model_param, num_averaged): + return decay * avg_model_param + (1 - decay) * model_param + + super().__init__(model, device, ema_avg, use_buffers=True) diff --git a/utils/config.py b/utils/config.py new file mode 100644 index 0000000..f7dcb1d --- /dev/null +++ b/utils/config.py @@ -0,0 +1,348 @@ +import json +import os.path as osp +import shutil +import sys +import tempfile +from argparse import Action +from argparse import ArgumentParser +from collections import abc +from importlib import import_module + +from addict import Dict + +BASE_KEY = "_base_" +DELETE_KEY = "_delete_" + + +class ConfigDict(Dict): + def __missing__(self, name): + raise KeyError(name) + + def __getattr__(self, name): + try: + value = super().__getattr__(name) + except KeyError: + ex = AttributeError(f"'{self.__class__.__name__}' object has no " f"attribute '{name}'") + except Exception as e: + ex = e + else: + return value + raise ex + + +def add_args(parser, cfg, prefix=""): + for k, v in cfg.items(): + if isinstance(v, str): + parser.add_argument("--" + prefix + k) + elif isinstance(v, int): + parser.add_argument("--" + prefix + k, type=int) + elif isinstance(v, float): + parser.add_argument("--" + prefix + k, type=float) + elif isinstance(v, bool): + parser.add_argument("--" + prefix + k, action="store_true") + elif isinstance(v, dict): + add_args(parser, v, prefix + k + ".") + elif isinstance(v, abc.Iterable): + parser.add_argument("--" + prefix + k, type=type(v[0]), nargs="+") + else: + print(f"cannot parse key {prefix + k} of type {type(v)}") + return parser + + +class Config: + """ + A facility for config and config files. + + It supports common file formats as configs: python/json/yaml. The interface + is the same as a dict object and also allows access config values as + attributes. 
+ Example: + >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1]))) + >>> cfg.a + 1 + >>> cfg.b + {'b1': [0, 1]} + >>> cfg.b.b1 + [0, 1] + >>> cfg = Config.fromfile('tests/data/config/a.py') + >>> cfg.filename + "/home/kchen/projects/mmcv/tests/data/config/a.py" + >>> cfg.item4 + 'prune' + >>> cfg + "Config [path: /home/kchen/projects/mmcv/tests/data/config/a.py]: " + "{'item1': [1, 2], 'item2': {'a': 0}, 'item3': True, 'item4': 'prune'}" + + """ + + @staticmethod + def _file2dict(filename): + filename = osp.abspath(osp.expanduser(filename)) + if filename.endswith(".py"): + with tempfile.TemporaryDirectory() as temp_config_dir: + temp_config_file = tempfile.NamedTemporaryFile(dir=temp_config_dir, suffix=".py") + temp_config_name = osp.basename(temp_config_file.name) + # close temp file + temp_config_file.close() + shutil.copyfile(filename, osp.join(temp_config_dir, temp_config_name)) + temp_module_name = osp.splitext(temp_config_name)[0] + sys.path.insert(0, temp_config_dir) + mod = import_module(temp_module_name) + sys.path.pop(0) + cfg_dict = {name: value for name, value in mod.__dict__.items() if not name.startswith("__")} + # delete imported module + del sys.modules[temp_module_name] + + elif filename.endswith((".yml", ".yaml", ".json")): + import mmcv + + cfg_dict = mmcv.load(filename) + else: + raise OSError("Only py/yml/yaml/json type are supported now!") + + cfg_text = filename + "\n" + with open(filename) as f: + cfg_text += f.read() + + if BASE_KEY in cfg_dict: + cfg_dir = osp.dirname(filename) + base_filename = cfg_dict.pop(BASE_KEY) + base_filename = base_filename if isinstance(base_filename, list) else [base_filename] + + cfg_dict_list = list() + cfg_text_list = list() + for f in base_filename: + _cfg_dict, _cfg_text = Config._file2dict(osp.join(cfg_dir, f)) + cfg_dict_list.append(_cfg_dict) + cfg_text_list.append(_cfg_text) + + base_cfg_dict = dict() + for c in cfg_dict_list: + if len(base_cfg_dict.keys() & c.keys()) > 0: + raise KeyError("Duplicate key is not allowed among bases") + base_cfg_dict.update(c) + + base_cfg_dict = Config._merge_a_into_b(cfg_dict, base_cfg_dict) + cfg_dict = base_cfg_dict + + # merge cfg_text + cfg_text_list.append(cfg_text) + cfg_text = "\n".join(cfg_text_list) + + return cfg_dict, cfg_text + + @staticmethod + def _merge_a_into_b(a, b): + # merge dict `a` into dict `b` (non-inplace). values in `a` will + # overwrite `b`. + # copy first to avoid inplace modification + b = b.copy() + for k, v in a.items(): + if isinstance(v, dict) and k in b and not v.pop(DELETE_KEY, False): + if not isinstance(b[k], dict): + raise TypeError( + f"{k}={v} in child config cannot inherit from base " + f"because {k} is a dict in the child config but is of " + f"type {type(b[k])} in base config. 
You may set " + f"`{DELETE_KEY}=True` to ignore the base config" + ) + b[k] = Config._merge_a_into_b(v, b[k]) + else: + b[k] = v + return b + + @staticmethod + def fromfile(filename): + cfg_dict, cfg_text = Config._file2dict(filename) + return Config(cfg_dict, cfg_text=cfg_text, filename=filename) + + @staticmethod + def auto_argparser(description=None): + """Generate argparser from config file automatically (experimental)""" + partial_parser = ArgumentParser(description=description) + partial_parser.add_argument("config", help="config file path") + cfg_file = partial_parser.parse_known_args()[0].config + cfg = Config.fromfile(cfg_file) + parser = ArgumentParser(description=description) + parser.add_argument("config", help="config file path") + add_args(parser, cfg) + return parser, cfg + + def __init__(self, cfg_dict=None, cfg_text=None, filename=None): + if cfg_dict is None: + cfg_dict = dict() + elif not isinstance(cfg_dict, dict): + raise TypeError("cfg_dict must be a dict, but " f"got {type(cfg_dict)}") + + super().__setattr__("_cfg_dict", ConfigDict(cfg_dict)) + super().__setattr__("_filename", filename) + if cfg_text: + text = cfg_text + elif filename: + with open(filename) as f: + text = f.read() + else: + text = "" + super().__setattr__("_text", text) + + @property + def filename(self): + return self._filename + + @property + def text(self): + return self._text + + @property + def pretty_text(self): + indent = 4 + + def _indent(s_, num_spaces): + s = s_.split("\n") + if len(s) == 1: + return s_ + first = s.pop(0) + s = [(num_spaces * " ") + line for line in s] + s = "\n".join(s) + s = first + "\n" + s + return s + + def _format_basic_types(k, v): + if isinstance(v, str): + v_str = f"'{v}'" + else: + v_str = str(v) + attr_str = f"{str(k)}={v_str}" + attr_str = _indent(attr_str, indent) + + return attr_str + + def _format_list(k, v): + # check if all items in the list are dict + if all(isinstance(_, dict) for _ in v): + v_str = "[\n" + v_str += "\n".join(f"dict({_indent(_format_dict(v_), indent)})," for v_ in v).rstrip(",") + attr_str = f"{str(k)}={v_str}" + attr_str = _indent(attr_str, indent) + "]" + else: + attr_str = _format_basic_types(k, v) + return attr_str + + def _format_dict(d, outest_level=False): + r = "" + s = [] + for idx, (k, v) in enumerate(d.items()): + is_last = idx >= len(d) - 1 + end = "" if outest_level or is_last else "," + if isinstance(v, dict): + v_str = "\n" + _format_dict(v) + attr_str = f"{str(k)}=dict({v_str}" + attr_str = _indent(attr_str, indent) + ")" + end + elif isinstance(v, list): + attr_str = _format_list(k, v) + end + else: + attr_str = _format_basic_types(k, v) + end + + s.append(attr_str) + r += "\n".join(s) + return r + + cfg_dict = self._cfg_dict.to_dict() + text = _format_dict(cfg_dict, outest_level=True) + + return text + + def __repr__(self): + return f"Config (path: {self.filename}): {self._cfg_dict.__repr__()}" + + def __len__(self): + return len(self._cfg_dict) + + def __getattr__(self, name): + return getattr(self._cfg_dict, name) + + def __getitem__(self, name): + return self._cfg_dict.__getitem__(name) + + def __setattr__(self, name, value): + if isinstance(value, dict): + value = ConfigDict(value) + self._cfg_dict.__setattr__(name, value) + + def __setitem__(self, name, value): + if isinstance(value, dict): + value = ConfigDict(value) + self._cfg_dict.__setitem__(name, value) + + def __iter__(self): + return iter(self._cfg_dict) + + def dump(self): + cfg_dict = super().__getattribute__("_cfg_dict") + format_text = 
json.dumps(cfg_dict, indent=2)
+        return format_text
+
+    def merge_from_dict(self, options):
+        """Merge the dict parsed by MultipleKVAction into this cfg.
+
+        This lets dotted command-line overrides be folded into the config.
+
+        Examples:
+            >>> options = {'model.backbone.depth': 50,
+            ...            'model.backbone.with_cp': True}
+            >>> cfg = Config(dict(model=dict(backbone=dict(type='ResNet'))))
+            >>> cfg.merge_from_dict(options)
+            >>> cfg_dict = super(Config, self).__getattribute__('_cfg_dict')
+            >>> assert cfg_dict == dict(
+            ...     model=dict(backbone=dict(depth=50, with_cp=True)))
+
+        Args:
+            options (dict): dict of configs to merge from.
+        """
+        option_cfg_dict = {}
+        for full_key, v in options.items():
+            d = option_cfg_dict
+            key_list = full_key.split(".")
+            for subkey in key_list[:-1]:
+                d.setdefault(subkey, ConfigDict())
+                d = d[subkey]
+            subkey = key_list[-1]
+            d[subkey] = v
+
+        cfg_dict = super().__getattribute__("_cfg_dict")
+        super().__setattr__("_cfg_dict", Config._merge_a_into_b(option_cfg_dict, cfg_dict))
+
+
+class DictAction(Action):
+    """Argparse action to split each argument into a KEY=VALUE pair at the
+    first "=" and collect the pairs into a dictionary.
+
+    List options should be passed as comma-separated values,
+    e.g. KEY=V1,V2,V3; each value is parsed to int, float, or bool
+    when possible and kept as a string otherwise.
+    """
+
+    @staticmethod
+    def _parse_int_float_bool(val):
+        try:
+            return int(val)
+        except ValueError:
+            pass
+        try:
+            return float(val)
+        except ValueError:
+            pass
+        if val.lower() in ["true", "false"]:
+            return val.lower() == "true"
+        return val
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        options = {}
+        for kv in values:
+            key, val = kv.split("=", maxsplit=1)
+            val = [self._parse_int_float_bool(v) for v in val.split(",")]
+            if len(val) == 1:
+                val = val[0]
+            options[key] = val
+        setattr(namespace, self.dest, options)
diff --git a/utils/dist_utils.py b/utils/dist_utils.py
new file mode 100644
index 0000000..e2e68af
--- /dev/null
+++ b/utils/dist_utils.py
@@ -0,0 +1,169 @@
+import pickle
+
+import torch
+import torch.distributed as dist
+
+
+def get_world_size():
+    if not dist.is_available():
+        return 1
+    if not dist.is_initialized():
+        return 1
+    return dist.get_world_size()
+
+
+def to_python_float(t):
+    if hasattr(t, "item"):
+        return t.item()
+    else:
+        return t[0]
+
+
+def get_rank():
+    if not dist.is_available():
+        return 0
+    if not dist.is_initialized():
+        return 0
+    return dist.get_rank()
+
+
+def is_main_process():
+    return get_rank() == 0
+
+
+def can_log():
+    return is_main_process()
+
+
+def dist_print(*args, **kwargs):
+    if can_log():
+        print(*args, **kwargs)
+
+
+def synchronize():
+    """Helper function to synchronize (barrier) among all processes when
+    using distributed training."""
+    if not dist.is_available():
+        return
+    if not dist.is_initialized():
+        return
+    world_size = dist.get_world_size()
+    if world_size == 1:
+        return
+    dist.barrier()
+
+
+def dist_cat_reduce_tensor(tensor):
+    if not dist.is_available():
+        return tensor
+    if not dist.is_initialized():
+        return tensor
+    # gather a copy of the tensor from every rank and concatenate along dim 0
+    rt = tensor.clone()
+    all_list = [torch.zeros_like(tensor) for _ in range(get_world_size())]
+    dist.all_gather(all_list, rt)
+    return torch.cat(all_list, dim=0)
+
+
+def dist_sum_reduce_tensor(tensor):
+    if not dist.is_available():
+        return tensor
+    if not dist.is_initialized():
+        return 
tensor
+    if not isinstance(tensor, torch.Tensor):
+        return tensor
+    rt = tensor.clone()
+    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
+    return rt
+
+
+def dist_mean_reduce_tensor(tensor):
+    rt = dist_sum_reduce_tensor(tensor)
+    rt /= get_world_size()
+    return rt
+
+
+def all_gather(data):
+    """Run all_gather on arbitrary picklable data (not necessarily tensors).
+
+    Args:
+        data: any picklable object
+    Returns:
+        list[data]: list of data gathered from each rank
+    """
+    world_size = get_world_size()
+    if world_size == 1:
+        return [data]
+
+    # serialize the object into a byte tensor
+    buffer = pickle.dumps(data)
+    storage = torch.ByteStorage.from_buffer(buffer)
+    tensor = torch.ByteTensor(storage).to("cuda")
+
+    # obtain the payload size on each rank
+    local_size = torch.LongTensor([tensor.numel()]).to("cuda")
+    size_list = [torch.LongTensor([0]).to("cuda") for _ in range(world_size)]
+    dist.all_gather(size_list, local_size)
+    size_list = [int(size.item()) for size in size_list]
+    max_size = max(size_list)
+
+    # receive tensors from all ranks: we pad each tensor to max_size
+    # because torch's all_gather does not support gathering tensors
+    # of different shapes
+    tensor_list = []
+    for _ in size_list:
+        tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda"))
+    if local_size != max_size:
+        padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda")
+        tensor = torch.cat((tensor, padding), dim=0)
+    dist.all_gather(tensor_list, tensor)
+
+    data_list = []
+    for size, tensor in zip(size_list, tensor_list):
+        buffer = tensor.cpu().numpy().tobytes()[:size]
+        data_list.append(pickle.loads(buffer))
+
+    return data_list
+
+
+from torch.utils.tensorboard import SummaryWriter
+
+
+class DistSummaryWriter(SummaryWriter):
+    def __init__(self, *args, **kwargs):
+        if can_log():
+            super().__init__(*args, **kwargs)
+
+    def add_scalar(self, *args, **kwargs):
+        if can_log():
+            super().add_scalar(*args, **kwargs)
+
+    def add_figure(self, *args, **kwargs):
+        if can_log():
+            super().add_figure(*args, **kwargs)
+
+    def add_graph(self, *args, **kwargs):
+        if can_log():
+            super().add_graph(*args, **kwargs)
+
+    def add_histogram(self, *args, **kwargs):
+        if can_log():
+            super().add_histogram(*args, **kwargs)
+
+    def add_image(self, *args, **kwargs):
+        if can_log():
+            super().add_image(*args, **kwargs)
+
+    def close(self):
+        if can_log():
+            super().close()
+
+
+import tqdm
+
+
+def dist_tqdm(obj, *args, **kwargs):
+    if can_log():
+        return tqdm.tqdm(obj, *args, **kwargs)
+    else:
+        return obj
diff --git a/utils/factory.py b/utils/factory.py
new file mode 100644
index 0000000..f09ca90
--- /dev/null
+++ b/utils/factory.py
@@ -0,0 +1,267 @@
+import math
+
+import torch
+
+from utils.loss import MeanLoss
+from utils.loss import ParsingRelationDis
+from utils.loss import ParsingRelationLoss
+from utils.loss import SoftmaxFocalLoss
+from utils.loss import TokenSegLoss
+from utils.loss import VarLoss
+from utils.metrics import AccTopk
+from utils.metrics import Metric_mIoU
+from utils.metrics import MultiLabelAcc
+
+
+def get_optimizer(net, cfg):
+    training_params = filter(lambda p: p.requires_grad, net.parameters())
+    if cfg.optimizer == "Adam":
+        optimizer = torch.optim.Adam(
+            training_params,
+            lr=cfg.learning_rate,
+            weight_decay=cfg.weight_decay,
+        )
+    elif cfg.optimizer == "SGD":
+        optimizer = torch.optim.SGD(
+            training_params, lr=cfg.learning_rate, momentum=cfg.momentum, weight_decay=cfg.weight_decay
+        )
+    else:
+        raise NotImplementedError
+    return optimizer
+
+
+def get_scheduler(optimizer, cfg, iters_per_epoch):
+    if cfg.scheduler == 
"multi": + scheduler = MultiStepLR( + optimizer, + cfg.steps, + cfg.gamma, + iters_per_epoch, + cfg.warmup, + iters_per_epoch if cfg.warmup_iters is None else cfg.warmup_iters, + ) + elif cfg.scheduler == "cos": + scheduler = CosineAnnealingLR( + optimizer, cfg.epoch * iters_per_epoch, eta_min=0, warmup=cfg.warmup, warmup_iters=cfg.warmup_iters + ) + else: + raise NotImplementedError + return scheduler + + +def get_loss_dict(cfg): + if cfg.dataset == "CurveLanes": + loss_dict = { + "name": [ + "cls_loss", + "relation_loss", + "relation_dis", + "cls_loss_col", + "cls_ext", + "cls_ext_col", + "mean_loss_row", + "mean_loss_col", + "var_loss_row", + "var_loss_col", + "lane_token_seg_loss_row", + "lane_token_seg_loss_col", + ], + "op": [ + SoftmaxFocalLoss(2, ignore_lb=-1), + ParsingRelationLoss(), + ParsingRelationDis(), + SoftmaxFocalLoss(2, ignore_lb=-1), + torch.nn.CrossEntropyLoss(), + torch.nn.CrossEntropyLoss(), + MeanLoss(), + MeanLoss(), + VarLoss(cfg.var_loss_power), + VarLoss(cfg.var_loss_power), + TokenSegLoss(), + TokenSegLoss(), + ], + "weight": [ + 1.0, + cfg.sim_loss_w, + cfg.shp_loss_w, + 1.0, + 1.0, + 1.0, + cfg.mean_loss_w, + cfg.mean_loss_w, + 0.01, + 0.01, + 1.0, + 1.0, + ], + "data_src": [ + ("cls_out", "cls_label"), + ("cls_out",), + ("cls_out",), + ("cls_out_col", "cls_label_col"), + ("cls_out_ext", "cls_out_ext_label"), + ("cls_out_col_ext", "cls_out_col_ext_label"), + ("cls_out", "cls_label"), + ("cls_out_col", "cls_label_col"), + ("cls_out", "cls_label"), + ("cls_out_col", "cls_label_col"), + ("seg_out_row", "seg_label"), + ("seg_out_col", "seg_label"), + ], + } + elif cfg.dataset in ["Tusimple", "CULane"]: + loss_dict = { + "name": [ + "cls_loss", + "relation_loss", + "relation_dis", + "cls_loss_col", + "cls_ext", + "cls_ext_col", + "mean_loss_row", + "mean_loss_col", + ], + "op": [ + SoftmaxFocalLoss(2, ignore_lb=-1), + ParsingRelationLoss(), + ParsingRelationDis(), + SoftmaxFocalLoss(2, ignore_lb=-1), + torch.nn.CrossEntropyLoss(), + torch.nn.CrossEntropyLoss(), + MeanLoss(), + MeanLoss(), + ], + "weight": [ + 1.0, + cfg.sim_loss_w, + cfg.shp_loss_w, + 1.0, + 1.0, + 1.0, + cfg.mean_loss_w, + cfg.mean_loss_w, + ], + "data_src": [ + ("cls_out", "cls_label"), + ("cls_out",), + ("cls_out",), + ("cls_out_col", "cls_label_col"), + ("cls_out_ext", "cls_out_ext_label"), + ("cls_out_col_ext", "cls_out_col_ext_label"), + ("cls_out", "cls_label"), + ("cls_out_col", "cls_label_col"), + ], + } + else: + raise NotImplementedError + + if cfg.use_aux: + loss_dict["name"].append("seg_loss") + loss_dict["op"].append(torch.nn.CrossEntropyLoss(weight=torch.tensor([0.6, 1.0, 1.0, 1.0, 1.0])).cuda()) + loss_dict["weight"].append(1.0) + loss_dict["data_src"].append(("seg_out", "seg_label")) + + assert len(loss_dict["name"]) == len(loss_dict["op"]) == len(loss_dict["data_src"]) == len(loss_dict["weight"]) + return loss_dict + + +def get_metric_dict(cfg): + metric_dict = { + "name": ["top1", "top2", "top3", "ext_row", "ext_col"], + "op": [AccTopk(-1, 1), AccTopk(-1, 2), AccTopk(-1, 3), MultiLabelAcc(), MultiLabelAcc()], + "data_src": [ + ("cls_out", "cls_label"), + ("cls_out", "cls_label"), + ("cls_out", "cls_label"), + ("cls_out_ext", "cls_out_ext_label"), + ("cls_out_col_ext", "cls_out_col_ext_label"), + ], + } + metric_dict["name"].extend(["col_top1", "col_top2", "col_top3"]) + metric_dict["op"].extend( + [ + AccTopk(-1, 1), + AccTopk(-1, 2), + AccTopk(-1, 3), + ] + ) + metric_dict["data_src"].extend( + [ + ("cls_out_col", "cls_label_col"), + ("cls_out_col", "cls_label_col"), + 
("cls_out_col", "cls_label_col"), + ] + ) + + if cfg.use_aux: + metric_dict["name"].append("iou") + metric_dict["op"].append(Metric_mIoU(5)) + metric_dict["data_src"].append(("seg_out", "seg_label")) + + assert len(metric_dict["name"]) == len(metric_dict["op"]) == len(metric_dict["data_src"]) + return metric_dict + + +class MultiStepLR: + def __init__(self, optimizer, steps, gamma=0.1, iters_per_epoch=None, warmup=None, warmup_iters=None): + self.warmup = warmup + self.warmup_iters = warmup_iters + self.optimizer = optimizer + self.steps = steps + self.steps.sort() + self.gamma = gamma + self.iters_per_epoch = iters_per_epoch + self.iters = 0 + self.base_lr = [group["lr"] for group in optimizer.param_groups] + + def step(self, external_iter=None): + self.iters += 1 + if external_iter is not None: + self.iters = external_iter + if self.warmup == "linear" and self.iters < self.warmup_iters: + rate = self.iters / self.warmup_iters + for group, lr in zip(self.optimizer.param_groups, self.base_lr): + group["lr"] = lr * rate + return + + # multi policy + if self.iters % self.iters_per_epoch == 0: + epoch = int(self.iters / self.iters_per_epoch) + power = -1 + for i, st in enumerate(self.steps): + if epoch < st: + power = i + break + if power == -1: + power = len(self.steps) + # print(self.iters, self.iters_per_epoch, self.steps, power) + + for group, lr in zip(self.optimizer.param_groups, self.base_lr): + group["lr"] = lr * (self.gamma**power) + + +class CosineAnnealingLR: + def __init__(self, optimizer, T_max, eta_min=0, warmup=None, warmup_iters=None): + self.warmup = warmup + self.warmup_iters = warmup_iters + self.optimizer = optimizer + self.T_max = T_max + self.eta_min = eta_min + + self.iters = 0 + self.base_lr = [group["lr"] for group in optimizer.param_groups] + + def step(self, external_iter=None): + self.iters += 1 + if external_iter is not None: + self.iters = external_iter + if self.warmup == "linear" and self.iters < self.warmup_iters: + rate = self.iters / self.warmup_iters + for group, lr in zip(self.optimizer.param_groups, self.base_lr): + group["lr"] = lr * rate + return + + # cos policy + + for group, lr in zip(self.optimizer.param_groups, self.base_lr): + group["lr"] = self.eta_min + (lr - self.eta_min) * (1 + math.cos(math.pi * self.iters / self.T_max)) / 2 diff --git a/utils/loss.py b/utils/loss.py new file mode 100644 index 0000000..d8175b4 --- /dev/null +++ b/utils/loss.py @@ -0,0 +1,233 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class OhemCELoss(nn.Module): + def __init__(self, thresh, n_min, ignore_lb=255, *args, **kwargs): + super().__init__() + self.thresh = -torch.log(torch.tensor(thresh, dtype=torch.float)).cuda() + self.n_min = n_min + self.ignore_lb = ignore_lb + self.criteria = nn.CrossEntropyLoss(ignore_index=ignore_lb, reduction="none") + + def forward(self, logits, labels): + N, C, H, W = logits.size() + loss = self.criteria(logits, labels).view(-1) + loss, _ = torch.sort(loss, descending=True) + if loss[self.n_min] > self.thresh: + loss = loss[loss > self.thresh] + else: + loss = loss[: self.n_min] + return torch.mean(loss) + + +def soft_nll(pred, target, ignore_index=-1): + C = pred.shape[1] + invalid_target_index = target == ignore_index + + ttarget = target.clone() + ttarget[invalid_target_index] = C + + target_l = target - 1 + target_r = target + 1 + + invalid_part_l = target_l == -1 + invalid_part_r = target_r == C + + invalid_target_l_index = torch.logical_or(invalid_target_index, invalid_part_l) + 
target_l[invalid_target_l_index] = C + + invalid_target_r_index = torch.logical_or(invalid_target_index, invalid_part_r) + target_r[invalid_target_r_index] = C + + supp_part_l = target.clone() + supp_part_r = target.clone() + supp_part_l[target != 0] = C + supp_part_r[target != C - 1] = C + + target_onehot = torch.nn.functional.one_hot(ttarget, num_classes=C + 1) + target_onehot = target_onehot[..., :-1].permute(0, 3, 1, 2) + + target_l_onehot = torch.nn.functional.one_hot(target_l, num_classes=C + 1) + target_l_onehot = target_l_onehot[..., :-1].permute(0, 3, 1, 2) + + target_r_onehot = torch.nn.functional.one_hot(target_r, num_classes=C + 1) + target_r_onehot = target_r_onehot[..., :-1].permute(0, 3, 1, 2) + + supp_part_l_onehot = torch.nn.functional.one_hot(supp_part_l, num_classes=C + 1) + supp_part_l_onehot = supp_part_l_onehot[..., :-1].permute(0, 3, 1, 2) + + supp_part_r_onehot = torch.nn.functional.one_hot(supp_part_r, num_classes=C + 1) + supp_part_r_onehot = supp_part_r_onehot[..., :-1].permute(0, 3, 1, 2) + + target_fusion = ( + 0.9 * target_onehot + + 0.05 * target_l_onehot + + 0.05 * target_r_onehot + + 0.05 * supp_part_l_onehot + + 0.05 * supp_part_r_onehot + ) + # import pdb; pdb.set_trace() + return -(target_fusion * pred).sum() / (target != ignore_index).sum() + + +class SoftmaxFocalLoss(nn.Module): + def __init__(self, gamma, ignore_lb=255, soft_loss=True, *args, **kwargs): + super().__init__() + self.gamma = gamma + self.ignore_lb = ignore_lb + self.soft_loss = soft_loss + if not self.soft_loss: + self.nll = nn.NLLLoss(ignore_index=ignore_lb) + + def forward(self, logits, labels): + scores = F.softmax(logits, dim=1) + factor = torch.pow(1.0 - scores, self.gamma) + log_score = F.log_softmax(logits, dim=1) + log_score = factor * log_score + if self.soft_loss: + loss = soft_nll(log_score, labels, ignore_index=self.ignore_lb) + else: + loss = self.nll(log_score, labels) + + # import pdb; pdb.set_trace() + return loss + + +class ParsingRelationLoss(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, logits): + n, c, h, w = logits.shape + loss_all = [] + for i in range(0, h - 1): + loss_all.append(logits[:, :, i, :] - logits[:, :, i + 1, :]) + # loss0 : n,c,w + loss = torch.cat(loss_all) + return torch.nn.functional.smooth_l1_loss(loss, torch.zeros_like(loss)) + + +class MeanLoss(nn.Module): + def __init__(self): + super().__init__() + self.l1 = nn.SmoothL1Loss(reduction="none") + + def forward(self, logits, label): + n, c, h, w = logits.shape + grid = torch.arange(c, device=logits.device).view(1, c, 1, 1) + logits = (logits.softmax(1) * grid).sum(1) + loss = self.l1(logits, label.float())[label != -1] + return loss.mean() + + +class VarLoss(nn.Module): + def __init__(self, power=2): + super().__init__() + self.power = power + + def forward(self, logits, label): + n, c, h, w = logits.shape + grid = torch.arange(c, device=logits.device).view(1, c, 1, 1) + logits = logits.softmax(1) + mean = (logits * grid).sum(1).view(n, 1, h, w) + # n,1,h,w + var = (mean - grid).abs().pow(self.power) * logits + # var = ((mean - grid).abs() - 4) * logits + # n,c,h,w + loss = var.sum(1)[(label != -1) & ((label - mean.squeeze()).abs() < 1)] + return loss.mean() + + +class EMDLoss(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, logits, label): + n, c, h, w = logits.shape + grid = torch.arange(c, device=logits.device).view(1, c, 1, 1) + logits = logits.softmax(1) + # n,1,h,w + var = (label.reshape(n, 1, h, w) - grid) * (label.reshape(n, 1, h, w) 
- grid) * logits + # n,c,h,w + loss = var.sum(1)[label != -1] + return loss.mean() + + +class ParsingRelationDis(nn.Module): + def __init__(self): + super().__init__() + self.l1 = torch.nn.L1Loss() + # self.l1 = torch.nn.MSELoss() + + def forward(self, x): + n, dim, num_rows, num_cols = x.shape + x = torch.nn.functional.softmax(x[:, : dim - 1, :, :], dim=1) + embedding = torch.Tensor(np.arange(dim - 1)).float().to(x.device).view(1, -1, 1, 1) + pos = torch.sum(x * embedding, dim=1) + + diff_list1 = [] + for i in range(0, num_rows // 2): + diff_list1.append(pos[:, i, :] - pos[:, i + 1, :]) + + loss = 0 + for i in range(len(diff_list1) - 1): + loss += self.l1(diff_list1[i], diff_list1[i + 1]) + loss /= len(diff_list1) - 1 + return loss + + +def cross_entropy(pred, target, reduction="elementwise_mean"): + res = -target * torch.nn.functional.log_softmax(pred, dim=1) + if reduction == "elementwise_mean": + return torch.mean(torch.sum(res, dim=1)) + elif reduction == "sum": + return torch.sum(torch.sum(res, dim=1)) + else: + return res + + +class RegLoss(nn.Module): + def __init__(self): + super().__init__() + self.l1 = nn.L1Loss(reduction="none") + + def forward(self, logits, label): + n, c, h, w = logits.shape + assert c == 1 + logits = logits.sigmoid() + loss = self.l1(logits[:, 0], label)[label != -1] + # print(logits[0], label[0]) + # import pdb; pdb.set_trace() + return loss.mean() + + +class TokenSegLoss(nn.Module): + def __init__(self): + super().__init__() + self.criterion = nn.BCELoss() + self.max_pool = nn.MaxPool2d(4) + + def forward(self, logits, labels): + return self.criterion( + F.interpolate(logits, size=(200, 400), mode="bilinear").sigmoid(), + (self.max_pool(labels[:, 0:1, :, :]) != 0).float(), + ) + + +def test_cross_entropy(): + pred = torch.rand(10, 200, 33, 66) + target = torch.randint(200, (10, 33, 66)) + target_one_hot = torch.nn.functional.one_hot(target, num_classes=200).permute(0, 3, 1, 2) + print(torch.nn.functional.cross_entropy(pred, target)) + print(cross_entropy(pred, target_one_hot)) + print(soft_nll(torch.nn.functional.log_softmax(pred, dim=1), torch.randint(-1, 200, (10, 33, 66)))) + + # assert torch.nn.functional.cross_entropy(pred,target) == cross_entropy(pred,target_one_hot) + print("OK") + + +if __name__ == "__main__": + test_cross_entropy() diff --git a/utils/metrics.py b/utils/metrics.py new file mode 100644 index 0000000..0d2fe05 --- /dev/null +++ b/utils/metrics.py @@ -0,0 +1,150 @@ +import pdb +import time + +import numpy as np +import torch + + +def converter(data): + if isinstance(data, torch.Tensor): + data = data.cpu().data.numpy().flatten() + return data.flatten() + + +def fast_hist(label_pred, label_true, num_classes): + # pdb.set_trace() + hist = np.bincount(num_classes * label_true.astype(int) + label_pred, minlength=num_classes**2) + hist = hist.reshape(num_classes, num_classes) + return hist + + +class Metric_mIoU: + def __init__(self, class_num): + self.class_num = class_num + self.hist = np.zeros((self.class_num, self.class_num)) + + def update(self, predict, target): + predict, target = converter(predict), converter(target) + + self.hist += fast_hist(predict, target, self.class_num) + + def reset(self): + self.hist = np.zeros((self.class_num, self.class_num)) + + def get_miou(self): + miou = np.diag(self.hist) / (np.sum(self.hist, axis=1) + np.sum(self.hist, axis=0) - np.diag(self.hist)) + miou = np.nanmean(miou) + return miou + + def get_acc(self): + acc = np.diag(self.hist) / self.hist.sum(axis=1) + acc = np.nanmean(acc) + return acc + + 
def get(self):
+        return self.get_miou()
+
+
+class MultiLabelAcc:
+    def __init__(self):
+        self.cnt = 0
+        self.correct = 0
+
+    def reset(self):
+        self.cnt = 0
+        self.correct = 0
+
+    def update(self, predict, target):
+        predict = predict.argmax(1)
+        predict, target = converter(predict), converter(target)
+        self.cnt += len(predict)
+        self.correct += np.sum(predict == target)
+
+    def get_acc(self):
+        return self.correct * 1.0 / self.cnt
+
+    def get(self):
+        return self.get_acc()
+
+
+class AccTopk:
+    """Top-k accuracy: a prediction on a non-background sample counts as
+    correct when |predict - target| < k."""
+
+    def __init__(self, background_classes, k):
+        self.background_classes = background_classes
+        self.k = k
+        self.cnt = 0
+        self.top5_correct = 0
+
+    def reset(self):
+        self.cnt = 0
+        self.top5_correct = 0
+
+    def update(self, predict, target):
+        predict = predict.argmax(1)
+        predict, target = converter(predict), converter(target)
+        self.cnt += len(predict)
+        background_idx = target == self.background_classes
+        not_background_idx = np.logical_not(background_idx)
+        self.top5_correct += np.sum(np.absolute(predict[not_background_idx] - target[not_background_idx]) < self.k)
+
+    def get(self):
+        return self.top5_correct * 1.0 / self.cnt
+
+
+class Mae:
+    def __init__(self, dim_sel, ignore=-1):
+        self.dim_sel = dim_sel
+        self.ignore = ignore
+        self.all_res = []
+
+    def reset(self):
+        self.all_res = []
+
+    def update(self, predict, target):
+        predict = predict[..., self.dim_sel]
+        target = target[..., self.dim_sel]
+
+        # expected position under the softmax distribution, normalized to [0, 1]
+        cls_dim = predict.shape[1]
+        grid = torch.arange(cls_dim, device=predict.device).view(1, cls_dim, 1)
+        predict = predict.softmax(1)
+        predict = (predict * grid).sum(1) / (cls_dim - 1)
+        res = (predict - target).abs()[target != self.ignore]
+        res = converter(res)
+        if len(res) != 0:
+            self.all_res.append(res)
+
+    def get(self):
+        if len(self.all_res) == 0:
+            return 1
+        return np.mean(np.concatenate(self.all_res))
+
+
+def update_metrics(metric_dict, pair_data):
+    for i in range(len(metric_dict["name"])):
+        metric_op = metric_dict["op"][i]
+        data_src = metric_dict["data_src"][i]
+        metric_op.update(pair_data[data_src[0]], pair_data[data_src[1]])
+
+
+def reset_metrics(metric_dict):
+    for op in metric_dict["op"]:
+        op.reset()
+
+
+if __name__ == "__main__":
+    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 0])
+    b = np.array([1, 1, 2, 2, 2, 3, 3, 4, 4, 0])
+    me = AccTopk(0, 5)
+    # update() calls argmax(1), so feed one-hot "logits" whose argmax recovers b
+    me.update(np.eye(10)[b], a)
+    print(me.get())
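+
+    # Two extra sanity checks (minimal sketches on the same toy data, not
+    # part of the training pipeline). With predictions identical to the
+    # targets, the confusion matrix of Metric_mIoU is purely diagonal, so
+    # the mean IoU must come out as exactly 1.0.
+    miou = Metric_mIoU(10)
+    miou.update(a, a)
+    assert miou.get() == 1.0
+
+    # Likewise, MultiLabelAcc must report 1.0 when the one-hot "logits"
+    # argmax to exactly the target labels.
+    acc = MultiLabelAcc()
+    acc.update(np.eye(5)[b], b)
+    assert acc.get() == 1.0
+    print(miou.get(), acc.get())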