index.html

<!DOCTYPE html>
<html lang="en">
<head>
  <!-- Character encoding comes first so it is declared before any other head content. -->
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>Cutscene: Active vision for Next Best View Planning in outdoor scenes</title>
  <meta name="description"
        content="Cutscene: Active vision for Next Best View Planning in outdoor scenes">
  <meta name="keywords" content="cutscene,FoundVLAD, Foundation Models, Visual Place Recognition, VPR, DINOv2, DINO, SAM">
  <meta property="og:image" content="assets/Thumbnail.png">

  <!-- The favicon is a PNG file, so it is declared as image/png rather than image/x-icon. -->
  <link rel="icon" href="assets/title_image.png" type="image/png">

  <!--
    Google Analytics bootstrap: creates the dataLayer queue and pushes the
    initial "js" and "config" commands onto it.
    NOTE(review): no gtag.js loader script is present in this head, so these
    commands are only queued. Confirm whether analytics is meant to be active
    and whether the measurement ID below belongs to this project.
  -->
  <script>
    window.dataLayer = window.dataLayer || [];

    function gtag() {
      dataLayer.push(arguments);
    }

    gtag('js', new Date());

    gtag('config', 'G-PYVRSFMDRL');
  </script>

  <!-- Web fonts. -->
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
        rel="stylesheet">

  <!-- Stylesheets: Bulma and its plugins, icon fonts, then the page-specific rules. -->
  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
        href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">
  <!-- NOTE(review): this is a second icon declaration; confirm which of the two icons is intended. -->
  <link rel="icon" href="./static/images/favicon.svg">

  <!-- Scripts are kept in their original order; the scripts in the body are loaded after these. -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>
  <script src="./static/js/index.js"></script>
  <!-- NOTE(review): the "plotly-latest" CDN alias is no longer updated (frozen at v1.58.5); consider pinning an explicit version. -->
  <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>

</head>
<body>

<!-- Top navigation bar: a burger toggle for narrow screens and a single home link. -->
<nav class="navbar" role="navigation" aria-label="main navigation">
  <div class="navbar-brand">
    <a role="button" class="navbar-burger" aria-label="menu" aria-expanded="false">
      <span aria-hidden="true"></span>
      <span aria-hidden="true"></span>
      <span aria-hidden="true"></span>
    </a>
  </div>
  <div class="navbar-menu">
    <div class="navbar-start" style="flex-grow: 1; justify-content: center;">
      <!-- The link contains only an icon, so it needs an explicit accessible name. -->
      <a class="navbar-item" href="https://adityarauniyar.com/" aria-label="Home">
        <span class="icon">
          <i class="fas fa-home" aria-hidden="true"></i>
        </span>
      </a>
    </div>

  </div>
</nav>


<!-- Title block: project title, authors, affiliation and external resource links. -->
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-2 publication-title"><a href="https://visual-learning.cs.cmu.edu/">16824</a> Course Project: <br>
             Active vision for Next Best View Planning in outdoor scenes.</h1>
          <div class="is-size-5 publication-authors">
            <span class="author-block">
              <a href="https://adityarauniyar.com/">Aditya Rauniyar</a>,</span>
            <span class="author-block">
              <a href="https://www.linkedin.com/in/omaralama/">Omar Alama</a>,</span>
            <span class="author-block">
              <a href="https://yuechuanhou.com/">Yuechuan Hou</a>,</span>
            <!-- Last author in the list: no trailing comma. -->
            <span class="author-block">
              <a href="https://www.linkedin.com/in/mukul-ganwal/">Mukul Ganwal</a></span>
            <br>
            <!-- <span class="author-block">
            <a href="https://theairlab.org/">Sebastian Scherer</a><sup>1</sup>,</span>
            <span class="author-block">
              <a href="https://robotics.iiit.ac.in/">Madhava Krishna</a><sup>2</sup>,</span>
            <span class="author-block">
            <a href="https://scholar.google.co.in/citations?user=oVS3HHIAAAAJ&hl=en">Sourav Garg</a><sup>4</sup></span> -->
          </div>

          <div class="is-size-6 publication-authors">
            <span class="author-block">
              <a href="https://www.cmu.edu/" style="color: rgb(179, 8, 8);">Carnegie Mellon University</a>
            </span>
          </div>

          <!-- <div class="is-size-7 publication-authors">
            <span class="author-block">* denotes equal contribution</span>
          </div> -->

          <div class="column has-text-centered">
            <div class="publication-links">
              <!-- Dataset link. -->
              <span class="link-block">
                <a href="https://vcc.tech/UrbanScene3D"
                   class="external-link button is-normal ">
                  <span class="icon">
                      <i class="far fa-images" aria-hidden="true"></i>
                  </span>
                  <span>Dataset</span>
                </a>
              </span>
            </div>
          </div>
        </div>
      </div>
    </div>
  </div><!-- /.hero-body: this closing tag was missing in the original markup -->
</section>

<!-- <section class="hero teaser"> -->
  <!-- <div class="container is-max-desktop"> -->
    <!-- <div class="hero-body"> -->
      <!-- <video id="teaser" autoplay muted loop playsinline height="100%"> -->
        <!-- <img source src="./data/method_viz/Splash GIF.gif" /> -->
        <!-- <video id="dinov2_gardens" autoplay controls muted loop playsinline height="100%"> -->
          <!-- <source src="./data/method_viz/splash_vid_compressed.mp4" -->
                  <!-- type="video/mp4"> -->
        <!-- </video> -->
      <!-- </video> -->
      <!-- <h2 class="subtitle has-text-centered"> -->
        <!-- <span class="coolname">cutscene</span> enables <span>universal visual place recognition (VPR) <i>anywhere</i>, <i>anytime</i> and under <i>anyview</i>.</span>  -->
      <!-- </h2> -->
    <!-- </div> -->
  <!-- </div> -->
<!-- </section> -->

<!-- Abstract section. -->
<section class="hero is-small is-light">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <!-- Abstract. -->
      <div class="columns is-centered has-text-centered">
        <div class="column is-four-fifths">
          <h2 class="title is-3">Abstract</h2>
          <div class="content has-text-justified">
            <p>
              In this project, we extend the exploration of autonomous robotic
              tasks in the context of larger outdoor scenes. Building upon the
              referenced work, we focus on planning views for these expansive
              environments, addressing questions related to optimal data collection
              given a set of reference images. A significant contribution of our
              approach lies in the introduction of a cutscene augmentation method.
              This innovative technique involves semantically dividing larger
              outdoor scenes into smaller components. Our model is then trained
              to predict uncertainty and RGB values for novel poses within these
              segmented scenes. This cutscene augmentation method serves a dual
              purpose. First, it effectively increases the size of the dataset by
              a significant percentage, enhancing the efficiency of the training
              process, and reducing overfitting on a scene. Second, and more importantly, it substantially increases
              the accuracy of novel view predictions. By leveraging this method,
              our project aims to overcome challenges associated with data
              collection in large-scale outdoor scenarios, providing a valuable
              contribution to the broader field of autonomous robotic tasks.
              Our experiments, using both synthetic and real-world data,
              demonstrate the effectiveness of our proposed uncertainty-guided
              approach. The results showcase improved accuracy in scene representations
              compared to baseline methods, validating the utility and generalizability
              of our methodology.
            </p>
          </div>
        </div>
      </div>
    </div>
    <!-- Paper video (disabled). The commented-out block is self-balanced so it can be re-enabled as is. -->
    <!-- <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Video</h2>
        <div class="publication-video">
          <a id="overview_video"></a>
          <iframe
            src="./data/">
          </iframe>
        </div>
      </div>
    </div> -->
    <!--/ Paper video. -->
  </div><!-- /.hero-body: previously its closing tag was trapped inside the commented-out video block -->
</section>

<!-- Introduction and related work: intro text followed by two related-work animations. -->
<section class="section">
  <div class="container is-max-desktop">

    <div class="columns is-centered has-text-centered">
      <div class="column">
        <a id="interactive_demo"></a>
        <h2 class="title is-3">Introduction and Related work</h2>
        <div class="content has-text-justified">
          <p>
            Embodied robotic intelligence relies on active perception and
            exploration, essential for various applications like robotic
            manipulation, inspection, and vision-based navigation.
            The autonomous collection of data plays a pivotal role in scene
            understanding and subsequent tasks, with a focus on novel view
            synthesis using multiple UAVs to make the process more robust [1].
            However, a significant challenge lies in efficiently planning a
            sequence of views for sensors, ensuring the acquisition of the
            most valuable information while adhering to platform-specific
            constraints and enhancing scene understanding for manipulation
            tasks [2]. Addressing this challenge is crucial for enhancing the
            training process, particularly in scenarios involving larger scenes.
          </p>
        </div>
      </div>
    </div>


    <div class="columns is-centered">

      <div class="column has-text-centered">
        <div class="content">
          <h2 class="title is-4">Multi-UAV Data Gathering</h2>
          <img id="q_image" src="data/related_work/multi_uav_data_gathering.gif"
               alt="Animation of multi-UAV data gathering" width="500">
        </div>
      </div>

      <div class="column has-text-centered">
        <div class="content">
          <h2 class="title is-4">Scene Understanding for Manipulation</h2>
          <img id="db_image" src="data/related_work/nbv_manipulation.gif"
               alt="Animation of scene understanding for robotic manipulation" width="500">
        </div>
      </div>

      <!-- NOTE(review): these scripts are loaded here for the first time; the purpose of plot.js on this page should be confirmed. -->
      <script src="data/trajectory_data/hawkins.js"></script>
      <script src="demo/plot.js"></script>

    </div>

    <p></p>
  </div>
</section>


<!-- Contributions: summary paragraph followed by three illustrative animations. -->
<section class="hero is-small is-light">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <!-- Contributions. -->
      <div class="columns is-centered has-text-centered">
        <div class="column is-four-fifths">
          <h2 class="title is-3">Contributions</h2>
          <div class="content has-text-justified">
            <p>
              In our research, we explored 3D scene understanding,
              focusing on larger outdoor environments represented by
              UrbanScene3D. We integrated this environment into Blender,
              generating datasets emulating camera poses and renders
              akin to the DTU dataset. We conducted a viewing range sensitivity analysis,
              which involved various viewing ranges, creating datasets
              with different parameters, including a double dome
              configuration and maximum view angle changes aligned with
              specified view ranges of 30, 60, and 90 degrees. Additionally,
              we introduced cutscene augmentation, which augments the dataset by
              extracting more subscenes from each big scene, improving the utility
              of small datasets and giving the network more room to learn.
            </p>
          </div>
        </div>

      </div>

      <!-- Images carry no id here: "db_image" is already used earlier on the page and ids must be unique. -->
      <div class="container is-max-desktop">
        <div class="columns is-centered">

          <div class="column has-text-centered">
            <div class="content">
              <h2 class="title is-4">Training on larger outdoor scenes</h2>
              <!-- The space in the file name is percent-encoded so the URL is valid. -->
              <img src="data/method_viz/View%20capture_180_renders.gif"
                   alt="Animation of view capture renders for a large outdoor scene" width="500">
              <!-- <img src = "data/viz_gifs/row3_col3.gif"> -->

            </div>
          </div>

          <div class="column has-text-centered">
            <div class="content">
              <h2 class="title is-4">Viewing range sensitivity analysis</h2>
              <img src="data/method_viz/Selection_180_90_30.gif"
                   alt="Animation of the viewing range selections used in the sensitivity analysis" width="500">
              <!-- <img src = "data/viz_gifs/row3_col3.gif"> -->

            </div>
          </div>

          <div class="column has-text-centered">
            <div class="content">
              <h2 class="title is-4">Cutscene Augmentation</h2>
              <img src="data/method_viz/cutscene.gif"
                   alt="Animation of cutscene augmentation" width="500">
              <!-- <img src = "data/viz_gifs/row3_col3.gif"> -->

            </div>
          </div>
        </div>
      </div>
    </div>
  </div><!-- /.hero-body: this closing tag was missing in the original markup -->
</section>

<!-- Background: short primer on NeRF and PixelNeRF with one embedded video each. -->
<section class="section">
  <div class="container is-max-desktop">

    <div class="columns is-centered has-text-centered">
      <div class="column">
        <!-- Section anchor with its own id; "interactive_demo" is already used earlier on the page. -->
        <a id="background"></a>
        <h2 class="title is-3">Background</h2>
        <div class="content has-text-justified">
          <p>
            "NeRF" (Neural Radiance Fields) is a pioneering research paper
            in computer vision that presents a revolutionary method for 3D
            scene reconstruction and novel view synthesis. This approach
            employs neural networks to model volumetric scenes from 2D images,
            enabling the creation of immersive 3D environments and realistic
            rendering of novel viewpoints. NeRF has had a profound impact
            on various fields, including computer graphics, virtual reality,
            and robotics.
            <br>
            "PixelNeRF" is an extension of the NeRF framework; it attempts to
            generalize the NeRF network to multiple scenes by incorporating
            pixel level features from input reference views. This advancement further
            solidifies the potential of NeRF-based methods for tasks such as image synthesis, view
            interpolation, and scene reconstruction, making them valuable
            tools in computer vision and related domains.
          </p>
        </div>
      </div>
    </div>


    <div class="columns is-centered">

      <div class="column has-text-centered">
        <div class="content">
          <h2 class="title is-4">NeRF</h2>
          <div class="NeRF-video">
            <!-- Each iframe gets a distinct title so assistive technology can tell the two videos apart. -->
            <iframe width="480" height="270" src="https://www.youtube.com/embed/JuH79E8rdKc?si=AsJpnK8WQT1vKl5q" title="NeRF video (YouTube)" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen></iframe>
          </div>
        </div>
      </div>

      <div class="column has-text-centered">
        <div class="content">
          <h2 class="title is-4">PixelNeRF</h2>
          <div class="PixelNeRF-video">
            <iframe width="480" height="270" src="https://www.youtube.com/embed/voebZx7f32g?si=K-HQnVwn3tHiTIwu" title="PixelNeRF video (YouTube)" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen></iframe>
          </div>
        </div>
      </div>

      <!-- hawkins.js and plot.js are already loaded earlier on the page, so they are not loaded again here. -->

    </div>

    <p></p>
  </div>
</section>


<!-- Our Approach: description of the NeU-NBV basis plus three explanatory figures. -->
<section class="hero is-small is-light">
  <div class="container is-max-desktop">
    <div class="hero-body">
      <div class="columns is-centered has-text-centered">
        <div class="column">
          <!-- Section anchor with its own id; "interactive_demo" is already used earlier on the page. -->
          <a id="our_approach"></a>
          <h2 class="title is-3">Our Approach</h2>
          <div class="content has-text-justified">
            <p>
              Our approach is based on the Next Best View Planning Using Uncertainty Estimation
              in <a href="https://arxiv.org/abs/2303.01284" style="color: blue;">Image-Based Neural Rendering (NeU-NBV) work by Jin</a> which was built
              on top of the PixelNeRF framework. The NeU-NBV framework adapts PixelNeRF
              to next best view planning using two techniques: 1. It enhances the speed of
              volumetric rendering through using an LSTM to predict the jumping distance
              in ray tracing. 2. It incorporates uncertainty estimation by pushing the network
              to output both a mean RGB value and a variance. The uncertainty and final RGB values
              are then extracted by sampling from the log normal distribution 100 times and taking
              the mean and variance of those samples.
              <br>
              Furthermore, the (NeU-NBV) work adds a planning framework on top of their
              adapted PixelNeRF model as follows: Given two starting reference views,
              they sample multiple candidate views within a limited viewing range.
              They then feed all of those candidate views through the trained network which
              in turn outputs both an RGB and uncertainty prediction for each pixel. The next
              best view is then chosen such that it maximizes the information gain by targeting
              the novel view with the highest average pixel uncertainty. The novel view is then
              collected and the process is then repeated until the capture budget has been expended.
              <br>

            </p>
          </div>
        </div>
      </div>
    </div>

    <!-- Figures carry no id here: "db_image" is already used earlier on the page and ids must be unique. -->
    <div class="container is-max-desktop">
      <div class="columns is-centered">

        <div class="column has-text-centered">
          <div class="content">
            <h2 class="title is-4">Uncertainty prediction network architecture</h2>
            <!-- The file name keeps its original spelling so the path still resolves. -->
            <img src="data/method_viz/network_architechture.png"
                 alt="Diagram of the uncertainty prediction network architecture" width="500">
            <!-- <img src = "data/viz_gifs/row3_col3.gif"> -->
          </div>
        </div>

        <div class="column has-text-centered">
          <div class="content">
            <h2 class="title is-4">Uncertainty estimation loss function</h2>
            <img src="data/method_viz/loss_function.png"
                 alt="Uncertainty estimation loss function" width="500">
            <!-- <img src = "data/viz_gifs/row3_col3.gif"> -->
          </div>
        </div>

        <div class="column has-text-centered">
          <div class="content">
            <h2 class="title is-4">NBV planning architecture</h2>
            <img src="data/planning/nbv_planning_arc.png"
                 alt="Diagram of the NBV planning architecture" width="500">
            <!-- <img src = "data/viz_gifs/row3_col3.gif"> -->
          </div>
        </div>
      </div>

    </div>
  </div>
</section>

<!-- Method: overview paragraph followed by three rows, each pairing a figure with its explanation. -->
<section class="section">
  <div class="container is-max-desktop">

    <div class="columns is-centered has-text-centered">
      <div class="column">
        <!-- Section anchor with its own id; "interactive_demo" is already used earlier on the page. -->
        <a id="method"></a>
        <h2 class="title is-3">Method</h2>
        <div class="content has-text-justified">
          <p>
            Challenges arise when applying DTU pre-trained models for novel view
            prediction on larger outdoor navigational scenes, which they weren't
            originally trained on. To address this, we embarked on an experimentation
            journey to develop a robust training strategy. We employed the
            UrbanScene3D environment, imported its .obj representation into
            Blender, and scripted a custom dataset generation process aligned
            with DTU's configuration. We also conducted a comprehensive
            sensitivity analysis to explore various viewing angles, adjusting
            scene parameters, such as varying radii for different configurations
            and introducing a range of maximum view angle changes. However,
            the complexity of larger scenes necessitates a more effective
            training strategy, prompting us to delve into innovative techniques
            like cutscene augmentation. This method involves dividing extensive
            scenes into manageable chunks, enhancing scene analysis and
            understanding for improved model performance.
          </p>
        </div>
      </div>
    </div>


    <!-- Row 1: dataset generation on larger outdoor scenes. -->
    <div class="columns is-centered">

      <div class="column has-text-centered">
        <div class="content">
          <h2 class="title is-4">Training on larger outdoor scenes</h2>
          <!-- No id: "db_image" is already used earlier on the page. The space in the file name is percent-encoded. -->
          <img src="data/method_viz/View%20capture_180_renders.gif"
               alt="Animation of view capture renders for a large outdoor scene" width="500">
          <!-- <img src = "data/viz_gifs/row3_col3.gif"> -->

        </div>
      </div>

      <div class="column has-text-centered">
        <div class="content">
          <!-- Spacer that keeps the text level with the figure; a div replaces the empty heading so no blank heading is announced. -->
          <div class="title is-4" aria-hidden="true"></div>
          <div class="content has-text-justified">
            <p>
              For our research, we employed a larger outdoor navigational environment
              known as UrbanScene3D. To facilitate our experiments, we imported the .obj
              file representing this environment into Blender. Subsequently, we set up a
              custom script within Blender to generate a dataset. This dataset was designed
              to emulate camera poses and renders configured similarly to the DTU dataset,
              enabling us to conduct our experiments effectively and gather valuable
              insights from this rich outdoor environment.
            </p>
          </div>
        </div>
      </div>


    </div>

    <!-- Row 2: viewing range configuration. -->
    <div class="columns is-centered">

      <div class="column has-text-centered">
        <div class="content">
          <h2 class="title is-4">Setting Viewing ranges</h2>
          <img src="data/method_viz/Selection_180_90_30.gif"
               alt="Animation of the viewing range selections used in the sensitivity analysis" width="500">
          <!-- <img src = "data/viz_gifs/row3_col3.gif"> -->

        </div>
      </div>

      <div class="column has-text-centered">
        <div class="content">
          <!-- Spacer in place of an empty heading (see row 1). -->
          <div class="title is-4" aria-hidden="true"></div>
          <div class="content has-text-justified">
            <p>
              Utilizing the scene file within Blender, we conducted a comprehensive
              sensitivity analysis to explore various viewing angles. This approach
              allowed us to generate multiple datasets by manipulating the scene
              parameters. Specifically, we created a double dome setup within the scene,
              varying the radii for two different configurations. Moreover, we
              introduced a range of maximum view angle changes for each dataset,
              aligning the maximum change with the specified view range. This rigorous
              analysis encompassed view ranges of 30 degrees, 60 degrees, and 90 degrees,
              providing us with valuable datasets that capture a wide array of
              perspectives and visual information for our research.
            </p>
          </div>
        </div>
      </div>


    </div>

    <!-- Row 3: cutscene augmentation. -->
    <div class="columns is-centered">

      <div class="column has-text-centered">
        <div class="content">
          <h2 class="title is-4">Cutscene Augmentation</h2>
          <img src="data/method_viz/how_to_cutscene.gif"
               alt="Animation showing how cutscene augmentation is applied" width="500">
          <!-- <img src = "data/viz_gifs/row3_col3.gif"> -->
        </div>
      </div>

      <div class="column has-text-centered">
        <div class="content">
          <!-- Spacer in place of an empty heading (see row 1). -->
          <div class="title is-4" aria-hidden="true"></div>
          <div class="content has-text-justified">
            <p>
              Taking inspiration from simple yet effective 2D augmentation methods
              such as cutout and cutmix, and stemming from the observations that
              large scale outdoor scenes are scarce and such scenes are often comprised
              of many smaller subscenes, we introduce cutscene augmentation.
              The cutscene augmentation method increases the variation of the data
              by cutting out semantically divisible subscenes out of a larger scene.
              Those cutout scenes are scaled up and added as different data points
              pushing the network to generalize and avoid overfitting when data
              is scarce as it often is for large outdoor scenes.
            </p>
          </div>
        </div>
      </div>


    </div>

    <p></p>
  </div>
</section>

<!-- Experimental Setup: hardware and training configuration. -->
<section class="section">
  <div class="container is-max-desktop">

    <div class="columns is-centered has-text-centered">
      <div class="column">
        <!-- Section anchor with its own id; "interactive_demo" is already used earlier on the page. -->
        <a id="experimental_setup"></a>
        <h2 class="title is-3">Experimental Setup</h2>

      </div>
    </div>


    <div class="columns is-centered">


      <div class="column has-text-centered">
        <div class="content">
          <div class="content has-text-justified">
            <p>
              In our experimental setup, we employed a state-of-the-art approach
              for training our computer vision model. We utilized a high-performance
              computing environment with a 3090Ti GPU to accelerate our training
              process. Our training procedure involved optimizing a range of
              hyperparameters, including learning rates and batch sizes, to achieve optimal performance. We conducted training
              for a total of 200 epochs to ensure that our model converged to a
              robust and accurate solution. This rigorous experimental setup allowed
              us to achieve state-of-the-art results and validate the effectiveness
              of our proposed approach.
            </p>
          </div>
        </div>
      </div>

      <!-- hawkins.js and plot.js are already loaded earlier on the page, so they are not loaded again here. -->

    </div>

    <p></p>
  </div>
</section>


<section class="hero is-small is-light">
  <div class="hero-body">
  <div class="columns is-centered has-text-centered">
    <div class="container is-max-desktop">
      <h2 class="title is-3">Viewing Range Sensitivity Analysis Results</h2>
      <p>Below, we show qualitative results by fixing the reference views such that
        they are at most 30 degrees apart and fixing the targeted novel view. We then
        vary the model used to make the RGB and uncertainty estimation for this fixed input.
      </p>
    </div>
  </div>

  <div class="container is-max-desktop">

    <div class="column has-text-centered">
        <div class="content">
          <h2 class="title is-4">Reference Images </h2>
          <img id="db_image" src = "data/viewing_range_analysis/scan0_reference_images.jpg" width="500">

          <!-- <img src = "data/viz_gifs/row3_col3.gif"> -->
        </div>
      </div>

      <div class="column has-text-centered">
        <div class="content">
          <h2 class="title is-4">Ground Truth </h2>
          <img id="db_image" src = "data/viewing_range_analysis/scan0_ground_truth.jpg" width="167">

          <!-- <img src = "data/viz_gifs/row3_col3.gif"> -->
        </div>
      </div>

  </div>

  <div class="container is-max-desktop">
    <div class="columns is-centered">

      <div class="column has-text-centered">
        <div class="content">
          <h2 class="title is-4">90deg </h2>
          <img id="db_image" src = "data/viewing_range_analysis/scan0_best_90_rgb_53.jpg" width="500">
          RGB
          <img id="db_image" src = "data/viewing_range_analysis/scan0_best_90_uncertainty_53.jpg" width="500">
          Uncertainty
          <img id="db_image" src = "data/viewing_range_analysis/scan0_best_90_error_53.jpg" width="500">
          Error
          <!-- <img src = "data/viz_gifs/row3_col3.gif"> -->
        </div>
      </div>

      <div class="column has-text-centered">
        <div class="content">
          <h2 class="title is-4">60deg </h2>
          <img id="db_image" src = "data/viewing_range_analysis/scan0_best_60_rgb_53.jpg" width="500">
          RGB
          <img id="db_image" src = "data/viewing_range_analysis/scan0_best_60_uncertainty_53.jpg" width="500">
          Uncertainty
          <img id="db_image" src = "data/viewing_range_analysis/scan0_best_60_error_53.jpg" width="500">
          Error
          <!-- <img src = "data/viz_gifs/row3_col3.gif"> -->
        </div>
      </div>

      <div class="column has-text-centered">
        <div class="content">
          <h2 class="title is-4">30deg</h2>
          <img id="db_image" src = "data/viewing_range_analysis/scan0_best_30_rgb_53.jpg" width="500">
          RGB
          <img id="db_image" src = "data/viewing_range_analysis/scan0_best_30_uncertainty_53.jpg" width="500">
          Uncertainty
          <img id="db_image" src = "data/viewing_range_analysis/scan0_best_30_error_53.jpg" width="500">
          Error
          <!-- <img src = "data/viz_gifs/row3_col3.gif"> -->
        </div>
      </div>
    </div>
    <p>
      We further measure PSNR and SSIM as measures of reconstruction quality
      during training. The curves show that reconstruction quality improves as the viewing range is reduced.
    </p>
    <div class="columns is-centered">
      <div class="column has-text-centered">
        <div class="content">
          <h2 class="title is-4">PSNR</h2>
          <img id="db_image" src = "data/combined_psnr.png" width="500">
        </div>
      </div>

      <div class="column has-text-centered">
        <div class="content">
          <h2 class="title is-4">SSIM</h2>
          <img id="db_image" src = "data/combined_ssim.png" width="500">
        </div>
      </div>
    </div>
    <p>
      The sensitivity analysis we conducted affirms that our framework is
      very sensitive to the viewing range. Having too big of a range will 
      affect the stability of training and the network's capability to learn, while
      having too small of a range would limit the network's novel view estimation.
      Note that even though the reconstruction quality reduces with a bigger viewing
      range, the uncertainty still correlates well with the error which is desirable.
      Thus, such models can still be utilized to estimate the next best view, albeit
      less reliably as the uncertainty predictions would be highly noisy.
      An interesting approach to study would be to incrementally increase the viewing
      range during training to stabilize training while simultaneously pushing 
      the network to expand its viewing range uncertainty estimation capabilities.
      We leave this idea for future work.
    </p>
  </div>

  <br><br><br><br>
  <div class="hero-body">
    <div class="columns is-centered has-text-centered">
      <div class="container is-max-desktop">
        <h2 class="title is-3">Cutscene Augmentation Results</h2>
        <p>
          For qualitative results, we conducted a more challenging analysis
          in which we varied the view 90 degrees in the x and y direction smoothly
          from -45,-45 to +45,+45 centered on the top bird's eye view. We then fixed
          the input views to be [-45,-45],[-35,-35],[+35,+35],[+45,+45].
        </p>
      </div>
    </div>
  
    <div class="container is-max-desktop">
  
      <div class="column has-text-centered">
          <div class="content">
            <h2 class="title is-4">Reference Images </h2>
            <img id="db_image" src = "data/cutscene_animation/town_reference_images.jpg" width="500">
  
            <!-- <img src = "data/viz_gifs/row3_col3.gif"> -->
          </div>
        </div>
  
        <div class="column has-text-centered">
          <div class="content">
            <h2 class="title is-4">Ground Truth </h2>
            <img id="db_image" src = "data/cutscene_animation/town_ground_truth.gif" width="167">
  
            <!-- <img src = "data/viz_gifs/row3_col3.gif"> -->
          </div>
        </div>
  
    </div>
  
    <div class="container is-max-desktop">
      <!-- Side-by-side comparison: left column is the 60deg run, right column is
           60deg + Cutscene. Each column shows an RGB animation followed by its
           uncertainty animation. -->
      <div class="columns is-centered">

        <div class="column has-text-centered">
          <div class="content">
            <h2 class="title is-4">60deg</h2>
            <img id="db_image" src="data/cutscene_animation/town_60deg.gif" width="500"
                 alt="60deg RGB rendering animation of the town scene">
            RGB
            <img id="db_image" src="data/cutscene_animation/town_60deg_uncertainty.gif" width="500"
                 alt="60deg uncertainty animation of the town scene">
            Uncertainty
          </div>
        </div>

        <div class="column has-text-centered">
          <div class="content">
            <h2 class="title is-4">60deg + Cutscene</h2>
            <img id="db_image" src="data/cutscene_animation/town_60deg_ca.gif" width="500"
                 alt="60deg + Cutscene RGB rendering animation of the town scene">
            RGB
            <img id="db_image" src="data/cutscene_animation/town_60deg_ca_uncertainty.gif" width="500"
                 alt="60deg + Cutscene uncertainty animation of the town scene">
            Uncertainty
          </div>
        </div>

      </div>
      <p>
        We again measure PSNR and SSIM as measures of reconstruction quality
        during training with cutscene augmentation.
      </p>
      <!-- Reconstruction-quality plots for training with cutscene augmentation. -->
      <div class="columns is-centered">

        <div class="column has-text-centered">
          <div class="content">
            <h2 class="title is-4">PSNR</h2>
            <img id="db_image" src="data/combined_cutscene_psnr.png" width="500"
                 alt="PSNR plot for training with cutscene augmentation">
          </div>
        </div>

        <div class="column has-text-centered">
          <div class="content">
            <h2 class="title is-4">SSIM</h2>
            <img id="db_image" src="data/combined_cutscene_ssim.png" width="500"
                 alt="SSIM plot for training with cutscene augmentation">
          </div>
        </div>
      </div>
      <p>
        The performance improvements introduced by a very simple augmentation technique
        such as cutscene are significant. Even though the model has never seen more than
        60 degrees of viewing range in a single datapoint, it was able to generalize
        to a broader viewing range (90 degrees), surpassing the baseline. The visualization
        further serves to show how high uncertainty can hinder the NBV planning capabilities
        of the model. On the right side, we can readily observe that the cutscene model's uncertainty
        is maximized at the bird's eye view [0,0], whereas on the left side, for the baseline,
        it is visually unclear which view is the next best, as it suffers from high amounts of
        noise.
      </p>
    </div>
    <br><br><br><br>

    <div class="container is-max-desktop">
      <div class="columns is-centered has-text-centered">
        <div class="column">
          <h2 class="title is-3">Next Best View Planning Results</h2>
          <p>
              To evaluate planning performance, we used the trained model in a
              planning loop. We start with a random set of two views, sample from all available
              ground truth views, and choose the next best view based on one of three different policies:
              our model (reported as "ours"), the DTU-pretrained model ("DTU_baseline"), and a
              max-view-distance policy. We iterate until 6 views are collected, and we measure
              the improvement in reconstruction with each new view collected using
              PSNR and SSIM, as before.
          </p>
        </div>
      </div>
    
      <div class="columns is-centered">
        <div class="column has-text-centered">
            <div class="content">
              <!-- Planning results plot; the paragraph that follows discusses it. -->
              <img id="db_image" src="data/planning/30deg_planning.png" width="1000"
                   alt="Next best view planning results plot">
            </div>
            <p>
              As shown above, measured by reconstruction quality, our method outperforms
              the geometric and DTU-trained baselines, successfully extending the NeU-NBV
              framework to large outdoor scenes.
          </p>
        </div>
    
      </div>
      </div>
      </div>

  </div>
</section>

<!-- Conclusion and future work. The closing </section> tag is required: without it
     the following sections and the footer are parsed as children of this section. -->
<section class="section">
  <div class="container is-max-desktop">

    <div class="columns is-centered has-text-centered">
      <div class="column">
        <h2 class="title is-3">Conclusion and Future Work</h2>
        <p>To conclude, in this work we have explored the application of <a href="https://arxiv.org/abs/2303.01284">NeU-NBV</a> planning to
          large outdoor scenes. We conducted an evaluation on the sensitivity of the approach
          to viewing angle range and showed that the method is highly sensitive. We further introduced
          and evaluated a novel augmentation technique, "cutscene". We demonstrated how cutscene
          can significantly enhance the performance on large outdoor scenes.

          In the future, we plan to further expand on the idea of cutscene, possibly
          automating the approach of semantically dividing the scene into multiple subscenes
          by utilizing a 2D bird's eye view building/POI detector or perhaps using
          geometric clustering of the floor vertices to expose buildings.
          Another interesting direction would be to add on to this augmentation technique by
          rearranging the cutout scenes into novel configurations, pushing the
          variety of the scarce scene data we have even further. Furthermore,
          our viewing range sensitivity analysis hints at the possibility that the
          network could benefit from a scheduled viewing range in which we increase
          the range gradually as the network learns.</p>
      </div>
    </div>

    <!-- Empty two-column row retained from the original layout. -->
    <div class="columns is-centered">
      <div class="column has-text-centered">
        <div class="content">
          <p>
          </p>
        </div>
      </div>

      <div class="column has-text-centered">
        <div class="content">

        </div>
      </div>

    </div>

    <p></p>
  </div>
</section>

<section class="section" id="BibTeX">
  <div class="container is-max-desktop content">
    <h2 class="title is-3">BibTeX</h2>
    <!-- Text inside <pre> is rendered verbatim, so the entry below is shown exactly
         as written. Every field except the last must end with a comma for the
         entry to be valid BibTeX when copied. -->
    <pre><code>
      @article{2023cutscene,
        title={Cutscene: Active vision for Next Best View Planning in outdoor scenes},
        author={Rauniyar, Aditya and Alama, Omar and Hou, Yuechuan and Ganwal, Mukul},
        url={https://adityarauniyar.com/cutscene.github.io/},
        year={2023}
      }
    </code></pre>
  </div>
</section>

<!-- Page footer: license badge and attribution to the Nerfies website template. -->
<footer class="footer">
  <div class="container">
    <div class="content has-text-centered">
      <a rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/">
        <img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png">
      </a>
    </div>
    <div class="columns is-centered">
      <div class="column is-8">
        <div class="content">
          <p>
            This website is adapted from the Nerfies template, which is licensed under a <a rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>.
          </p>
          <p>
            If you use the <a href="https://github.com/cutscene/cutscene.github.io">source code</a> of this website,
            please also link back to the <a href="https://github.com/nerfies/nerfies.github.io">Nerfies source code</a> in your footer.
          </p>
        </div>
      </div>
    </div>
  </div>
</footer>

</body>
</html>