<!-- index.html -->

<!DOCTYPE html>
<html lang="en">
<head>
  <!-- Encoding declaration comes first so it is seen before any other content is parsed. -->
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">

  <!-- A document may carry exactly one <title>; short name and paper title are combined here. -->
  <title>DecisionNCE: Embodied Multimodal Representations via Implicit Preference Learning</title>
  <meta name="description"
        content="Embodied Multimodal Representations via Implicit Preference Learning">
  <meta name="keywords" content="DecisionNCE, Representation Learning, Embodied AI, AI">

  <!-- <link rel="icon" href="./assets/icon.png"> -->

  <style>
    /* Utility class that removes an element from rendering. */
    .hidden {
      display: none;
    }
  </style>

  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">

  <link rel="stylesheet" href="./assets/css/bulma.min.css">
  <link rel="stylesheet" href="./assets/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./assets/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./assets/css/fontawesome.all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">

  <!-- Scripts are deferred so they do not block parsing; deferred scripts still run in document order. -->
  <!-- NOTE(review): nothing visible in this page references Chart.js; confirm whether this include is still needed. -->
  <script defer src="https://cdn.jsdelivr.net/npm/chart.js"></script>
  <script defer src="./assets/js/fontawesome.all.min.js"></script>

</head>

<body>

<!-- <nav class="navbar" role="navigation" aria-label="main navigation">
    <div class="navbar-brand">
        <a role="button" class="navbar-burger" aria-label="menu" aria-expanded="false">
        <span aria-hidden="true"></span>
        <span aria-hidden="true"></span>
        <span aria-hidden="true"></span>
        </a>
    </div>
    <div class="navbar-menu">
        <div class="navbar-start" style="flex-grow: 1; justify-content: center;">
        <div class="navbar-item has-dropdown is-hoverable">
            <a class="navbar-link">
            More Research
            </a>
            <div class="navbar-dropdown">
            <a class="navbar-item" href="https://github.com/ZhengYinan-AIR/OMIGA">
                <b>OMIGA</b> <p style="font-size:18px; display: inline; margin-left: 5px;"></p>
            </a>
            </a>
            </div>
        </div>
        </div>
    
    </div>
</nav> -->

      
<!-- Hero header: paper title, author list, affiliations, contact details and resource links. -->
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-1 publication-title is-bold">
            <!-- <img src="./assets/icon.png" style="width:1em;vertical-align: middle" alt="Logo"/>  -->
            <span class="mmmu" style="vertical-align: middle">DecisionNCE</span>
          </h1>
          <h2 class="subtitle is-3 publication-subtitle">
            Embodied Multimodal Representations via Implicit Preference Learning
            <!-- <br>
            and Reasoning Benchmark for Expert AGI -->
          </h2>

          <!-- Authors. Coloured superscripts refer to the numbered affiliations listed below. -->
          <div class="is-size-5 publication-authors">
            <span class="author-block">Jianxiong Li* <sup>†</sup> <sup style="color:#DC4437;">1</sup>,</span>
            <span class="author-block">Jinliang Zheng* <sup style="color:#DC4437;">1</sup> <sup style="color:#4385F5;">2</sup>,</span>
            <span class="author-block">Yinan Zheng* <sup style="color:#DC4437;">1</sup>,</span><br>
            <span class="author-block">Liyuan Mao<sup style="color:#109D59;">3</sup>,</span>
            <span class="author-block">Xiao Hu<sup style="color:#DC4437;">1</sup>,</span>
            <span class="author-block">Sijie Cheng<sup style="color:#DC4437;">1</sup>,</span>
            <span class="author-block">Haoyi Niu<sup style="color:#DC4437;">1</sup>,</span>
            <span class="author-block">Jihao Liu<sup style="color:#F5B400;">4</sup> <sup style="color:#4385F5;">2</sup>,</span><br>
            <span class="author-block">Yu Liu<sup style="color:#4385F5;">2</sup>,</span>
            <span class="author-block">Jingjing Liu<sup style="color:#DC4437;">1</sup>,</span>
            <span class="author-block">Ya-Qin Zhang<sup style="color:#DC4437;">1</sup>,</span>
            <!-- Last author: no trailing comma. -->
            <span class="author-block">Xianyuan Zhan<sup>✉</sup> <sup style="color:#DC4437;">1</sup> <sup style="color:#f542dd;">5</sup></span>
          </div>

          <br>

          <!-- Affiliations. -->
          <div class="is-size-5 publication-authors">
            <span class="author-block"><sup style="color:#DC4437;">1</sup>AIR, Tsinghua University</span>
            <span class="author-block"><sup style="color:#4385F5;">2</sup>SenseTime Research</span>
            <span class="author-block"><sup style="color:#109D59;">3</sup>Shanghai Jiao Tong University</span><br>
            <span class="author-block"><sup style="color:#F5B400;">4</sup>CUHK MMLab</span>
            <span class="author-block"><sup style="color:#f542dd;">5</sup>Shanghai AI Lab</span>
          </div>

          <br>

          <!-- Footnotes for the author markers and contact addresses. -->
          <div class="is-size-5 publication-authors">
            <span class="author-block">*Equal contribution, </span><br>
            <span class="author-block">†Project Lead:</span>
            <span class="author-block"><a href="mailto:li-jx21@mails.tsinghua.edu.cn">li-jx21@mails.tsinghua.edu.cn</a></span><br>
            <span class="author-block">✉Corresponding author:</span>
            <span class="author-block"><a href="mailto:zhanxianyuan@air.tsinghua.edu.cn">zhanxianyuan@air.tsinghua.edu.cn</a></span>
          </div>

          <div class="column has-text-centered">
            <div class="publication-links">
              <!-- PDF Link. -->
              <span class="link-block">
                <a href="https://arxiv.org/pdf/2402.18137.pdf"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <!-- Decorative icon: the adjacent text already names the link. -->
                    <i class="fas fa-file-pdf" aria-hidden="true"></i>
                  </span>
                  <span>arXiv</span>
                </a>
              </span>
              <!-- <span class="link-block">
                  <a href="https://openreview.net/forum?id=j5JvZCaDM0"
                      class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fas fa-file-pdf"></i>
                  </span>
                  <span>Openreview</span>
                  </a>
              </span> -->
              <!-- <span class="link-block">
                  <a href="https://cloud.tsinghua.edu.cn/d/0d2939f7f7234cf68f1d/"
                      class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <p style="font-size:18px">🔗</p>
                  </span>
                  <span>Dataset</span>
                  </a>
              </span> -->
              <!-- Code link. -->
              <span class="link-block">
                <a href="https://github.com/2toinf/DecisionNCE"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fab fa-github" aria-hidden="true"></i>
                  </span>
                  <span>Code</span>
                </a>
              </span>

            </div>

          </div>
        </div>
      </div>
    </div>
  </div>
</section>
<style>
  /* Figure helper: a block-level element 80% as wide as its container,
     horizontally centred through automatic left and right margins. */
  .center {
    width: 80%;
    display: block;
    margin-right: auto;
    margin-left: auto;
  }
</style>

<!-- Teaser: full-width overview video that autoplays muted and exposes playback controls. -->
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="hero-body">
      <video
        id="matting-video"
        width="100%"
        autoplay
        muted
        controls
        playsinline
      >
        <source type="video/mp4" src="./assets/web/teaser.mp4">
      </video>
    </div>
  </div>
</section>

<!-- Abstract of the paper. -->
<section class="section">
  <div class="container" style="margin-bottom: 2vh;">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <!-- Second-level heading: the page title in the hero is the only <h1>. The Bulma
             "title is-2" classes set the visual size, so the element level does not affect it. -->
        <h2 class="title is-2">Abstract</h2>
        <div class="content has-text-justified">
          <p>
            Multimodal pretraining has emerged as an effective strategy for
            the trinity of goals of representation learning in autonomous robots:
            1) extracting both local and global task progression information; 2) enforcing temporal consistency of visual representation; 3) capturing trajectory-level language grounding.
            Most existing methods approach these via separate objectives, which often reach sub-optimal solutions.
            In this paper, we propose a universal unified objective that can simultaneously extract meaningful task progression information from image sequences and seamlessly align them with language instructions.
            We discover that via implicit preferences, where a visual trajectory inherently aligns better with its corresponding language instruction than mismatched pairs, the popular Bradley-Terry model can transform into representation learning through proper reward reparameterizations. The resulting framework, <b><i>DecisionNCE</i></b>, mirrors an InfoNCE-style objective but is distinctively tailored for decision-making tasks, providing an embodied representation learning framework that elegantly <b>extracts both local and global task progression features</b>, with temporal consistency enforced through implicit time contrastive learning, while <b>ensuring trajectory-level instruction grounding</b> via multimodal joint encoding.
            Evaluation on both simulated and real robots demonstrates that DecisionNCE effectively facilitates diverse downstream policy learning tasks, offering a versatile solution for unified representation and reward learning.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>


<!--
  Downstream control results: real-robot figure and rollout videos, then simulation results.
  Element nesting is unchanged (the simulation block stays inside the real-robot content
  wrapper, so accumulated column padding is preserved); indentation now reflects it.
  Videos carry no id attribute: an id must be unique within the document.
-->
<section class="section">
  <div class="container">
    <div class="columns is-centered has-text-centered">
      <!-- <div class="column is-full-width has-text-centered"> -->
      <div class="column is-four-fifths">
        <h2 class="title is-2">Downstream Control Tasks Results</h2>
        <div class="content has-text-justified">
          <p>
            The DecisionNCE encoders are pretrained using the large-scale human video dataset <a href="https://epic-kitchens.github.io/2023">EpicKitchen</a>.
            We freeze the pretrained vision-language encoders and use their output representations as input to a 256-256 MLP to train LCBC policies.
          </p>
          <div class="column">
            <h3 style="text-align:left" class="title is-3">Results on Real Robots</h3>
            <div class="content has-text-justified">

              <div class="content has-text-centered">
                <img src="./assets/web/realrobot.jpg" alt="Real robot LCBC success rate results" class="center">
                <p><b><i>Figure 1:</i></b> Real robot LCBC experimental results. Success rate is averaged over 10 episodes and 3 seeds.</p>
              </div>

              <!-- One block per task: task name followed by three rollout videos shown side by side. -->
              <div class="column">
                <h4 style="text-align:left" class="title is-4">Red cup on silver pan</h4>
                <div class="content has-text-justified">
                </div>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/redcupsilverpan/1.mp4" type="video/mp4">
                </video>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/redcupsilverpan/2.mp4" type="video/mp4">
                </video>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/redcupsilverpan/3.mp4" type="video/mp4">
                </video>
              </div>

              <div class="column">
                <h4 style="text-align:left" class="title is-4">Red cup on red plate</h4>
                <div class="content has-text-justified">
                </div>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/redcupredplate/1.mp4" type="video/mp4">
                </video>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/redcupredplate/2.mp4" type="video/mp4">
                </video>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/redcupredplate/3.mp4" type="video/mp4">
                </video>
              </div>

              <div class="column">
                <h4 style="text-align:left" class="title is-4">Duck on green plate</h4>
                <div class="content has-text-justified">
                </div>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/duckgreenplate/1.mp4" type="video/mp4">
                </video>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/duckgreenplate/2.mp4" type="video/mp4">
                </video>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/duckgreenplate/3.mp4" type="video/mp4">
                </video>
              </div>

              <div class="column">
                <h4 style="text-align:left" class="title is-4">Duck in pot</h4>
                <div class="content has-text-justified">
                </div>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/duckpot/1.mp4" type="video/mp4">
                </video>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/duckpot/2.mp4" type="video/mp4">
                </video>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/duckpot/3.mp4" type="video/mp4">
                </video>
              </div>

              <div class="column">
                <h4 style="text-align:left" class="title is-4">Move pot</h4>
                <div class="content has-text-justified">
                </div>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/movepot/1.mp4" type="video/mp4">
                </video>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/movepot/2.mp4" type="video/mp4">
                </video>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/movepot/3.mp4" type="video/mp4">
                </video>
              </div>

              <div class="column">
                <h4 style="text-align:left" class="title is-4">Fold cloth</h4>
                <div class="content has-text-justified">
                </div>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/foldcloth/1.mp4" type="video/mp4">
                </video>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/foldcloth/2.mp4" type="video/mp4">
                </video>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/foldcloth/3.mp4" type="video/mp4">
                </video>
              </div>

              <div class="column">
                <h4 style="text-align:left" class="title is-4">Flip the red cup upright</h4>
                <div class="content has-text-justified">
                </div>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/flip/1.mp4" type="video/mp4">
                </video>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/flip/2.mp4" type="video/mp4">
                </video>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/flip/3.mp4" type="video/mp4">
                </video>
              </div>

              <div class="column">
                <h4 style="text-align:left" class="title is-4">Open the microwave</h4>
                <div class="content has-text-justified">
                </div>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/openmicrowave/1.mp4" type="video/mp4">
                </video>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/openmicrowave/2.mp4" type="video/mp4">
                </video>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/openmicrowave/3.mp4" type="video/mp4">
                </video>
              </div>

              <div class="column">
                <h4 style="text-align:left" class="title is-4">Close the microwave</h4>
                <div class="content has-text-justified">
                </div>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/closemicrowave/1.mp4" type="video/mp4">
                </video>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/closemicrowave/2.mp4" type="video/mp4">
                </video>
                <video autoplay controls muted loop playsinline width="32.5%" height="100%">
                  <source src="./assets/web/closemicrowave/3.mp4" type="video/mp4">
                </video>
              </div>

              <!-- Simulation results. -->
              <div class="column">
                <h3 style="text-align:left" class="title is-3">Results on Simulation</h3>
                <div class="content has-text-justified">
                  <p>
                    We also evaluate on the FrankaKitchen benchmark.
                    We train LCBC policies on 5 tasks in FrankaKitchen environment using 1/3/5 demonstrations for each task.
                    DecisionNCE achieves the highest success rate across diverse dataset quantities,
                    demonstrating its effectiveness in extracting valuable information from out-of-domain data.
                  </p>

                  <div class="content has-text-centered">
                    <img src="./assets/web/simulation.png" alt="Simulation LCBC success rate results" class="center">
                    <p><b><i>Figure 2:</i></b> Simulation LCBC results. Max success rate averaged over 25 evaluation episodes and 3 seeds.</p>
                  </div>

                  <img src="./assets/web/simulation/1.gif" alt="Simulation animation 1 of 5" style="width: 19%;">
                  <img src="./assets/web/simulation/2.gif" alt="Simulation animation 2 of 5" style="width: 19%;">
                  <img src="./assets/web/simulation/3.gif" alt="Simulation animation 3 of 5" style="width: 19%;">
                  <img src="./assets/web/simulation/4.gif" alt="Simulation animation 4 of 5" style="width: 19%;">
                  <img src="./assets/web/simulation/5.gif" alt="Simulation animation 5 of 5" style="width: 19%;">
                </div>
              </div>

            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>


<!-- <section class="section">
    <div class="container">
      <div class="columns is-centered has-text-centered">
        <div class="column is-four-fifths">
            <h2 class="title is-3">Feasibility-Guided Diffusion Model</h2>
          <div class="content has-text-justified">
          <p>
            We propose a feasibility-dependent objective, i.e., 
            <b>maximizing reward value within the feasible region while minimizing safety risks in the infeasible region</b>.
            In FISOR, the optimal policy for the optimization problem can be derived in a special form of weighted behavior cloning.
            Moreover, we propose a novel energy-guided sampling method that <b>does not require training a complicated time-dependent classifier</b> to simplify the training. <b>No more Lagrangian.</b>
            
            </p>
          <div class="content has-text-centered">
            <img src="./assets/framework.jpg" alt="algebraic reasoning" class="center">
            <p><b><i>Figure 2:</i></b> Feasibility-guided diffusion model with time-independent classifier-guided sampling method.</p>
          </div>
        </div>
        </div>
      </div>

</section>     -->


<!-- @PAN TODO: bibtex -->
<!-- Citation block. -->
<section class="section" id="BibTeX">
  <div class="container is-max-desktop content">
    <h2 class="title is-3 has-text-centered">BibTeX</h2>
    <!-- Whitespace inside <pre> is rendered and copied verbatim, so the entry starts
         directly after <code> and is kept flush-left with no trailing blank line. -->
    <pre><code>@article{li2024decisionnce,
  title={DecisionNCE: Embodied Multimodal Representations via Implicit Preference Learning},
  author={Li, Jianxiong and Zheng, Jinliang and Zheng, Yinan and Mao, Liyuan and Hu, Xiao and Cheng, Sijie and Niu, Haoyi and Liu, Jihao and Liu, Yu and Liu, Jingjing and others},
  journal={arXiv preprint arXiv:2402.18137},
  year={2024}
}</code></pre>
  </div>
</section>
  
<!-- Page footer: template attribution and licence notice. -->
<footer class="footer">
  <!-- <div class="container"> -->
  <div class="content has-text-centered">
  </div>
  <div class="columns is-centered">
    <div class="column is-8">
      <div class="content has-text-centered">
        <p>
          This website is adapted from <a href="https://mmmu-benchmark.github.io/">MMMU</a>, licensed under a
          <a rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>.
        </p>
      </div>
    </div>
  </div>
  <!-- </div> -->

</footer>


</body>