diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index ff3b3681..af0223f9 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,10 +10,9 @@ add_library(popsift
         popsift/sift_pyramid.cu popsift/sift_pyramid.h
         popsift/sift_octave.cu popsift/sift_octave.h
         popsift/s_pyramid_build.cu
-        popsift/s_pyramid_build_aa.cu popsift/s_pyramid_build_aa.h
-        popsift/s_pyramid_build_ai.cu popsift/s_pyramid_build_ai.h
-        popsift/s_pyramid_build_ra.cu popsift/s_pyramid_build_ra.h
-        popsift/s_pyramid_fixed.cu
+        popsift/s_pyramid_build_aa.cu
+        popsift/s_pyramid_build_ai.cu
+        popsift/s_pyramid_build_ra.cu
         popsift/sift_extremum.h
         popsift/sift_extremum.cu popsift/s_extrema.cu
         popsift/s_orientation.cu
diff --git a/src/application/main.cpp b/src/application/main.cpp
index bf1128ff..843b7e68 100755
--- a/src/application/main.cpp
+++ b/src/application/main.cpp
@@ -71,28 +71,20 @@ static void parseargs(int argc, char** argv, popsift::Config& config, string& in
         ( "gauss-mode", value<std::string>()->notifier([&](const std::string& s) { config.setGaussMode(s); }),
           popsift::Config::getGaussModeUsage() )
         // "Choice of span (1-sided) for Gauss filters. Default is VLFeat-like computation depending on sigma. "
-        // "Options are: vlfeat, relative, relative-all, opencv, fixed9, fixed15"
+        // "Options are: vlfeat, relative, relative-all, opencv"
         ("desc-mode", value<std::string>()->notifier([&](const std::string& s) { config.setDescMode(s); }),
         "Choice of descriptor extraction modes:\n"
         "loop, iloop, grid, igrid, notile\n"
-	"Default is loop\n"
+        "Default is loop\n"
         "loop is OpenCV-like horizontal scanning, computing only valid points, grid extracts only useful points but rounds them, iloop uses linear texture and rotated gradiant fetching. igrid is grid with linear interpolation. notile is like igrid but avoids redundant gradiant fetching.")
-        ("popsift-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::PopSift); }),
+        ("popsift-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::RefineInOctave); }),
         "During the initial upscale, shift pixels by 1. In extrema refinement, steps up to 0.6, do not reject points when reaching max iterations, "
         "first contrast threshold is .8 * peak thresh. Shift feature coords octave 0 back to original pos.")
-        ("vlfeat-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::VLFeat); }),
+        ("vlfeat-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::RefineInLevel); }),
         "During the initial upscale, shift pixels by 1. That creates a sharper upscaled image. "
         "In extrema refinement, steps up to 0.6, levels remain unchanged, "
         "do not reject points when reaching max iterations, "
         "first contrast threshold is .8 * peak thresh.")
-        ("opencv-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::OpenCV); }),
-        "During the initial upscale, shift pixels by 0.5. "
-        "In extrema refinement, steps up to 0.5, "
-        "reject points when reaching max iterations, "
-        "first contrast threshold is floor(.5 * peak thresh). "
-        "Computed filter width are lower than VLFeat/PopSift")
-        ("direct-scaling", bool_switch()->notifier([&](bool b) { if(b) config.setScalingMode(popsift::Config::ScaleDirect); }),
-         "Direct each octave from upscaled orig instead of blurred level.")
         ("norm-multi", value<int>()->notifier([&](int i) {config.setNormalizationMultiplier(i); }), "Multiply the descriptor by pow(2,<int>).")
         ( "norm-mode", value<std::string>()->notifier([&](const std::string& s) { config.setNormMode(s); }),
           popsift::Config::getNormModeUsage() )
diff --git a/src/application/match.cpp b/src/application/match.cpp
index 3460975d..86909e81 100755
--- a/src/application/match.cpp
+++ b/src/application/match.cpp
@@ -70,28 +70,16 @@ static void parseargs(int argc, char** argv, popsift::Config& config, string& lF
     modes.add_options()
         ( "gauss-mode", value<std::string>()->notifier([&](const std::string& s) { config.setGaussMode(s); }),
           popsift::Config::getGaussModeUsage() )
-        ("desc-mode", value<std::string>()->notifier([&](const std::string& s) { config.setDescMode(s); }),
+        ( "desc-mode", value<std::string>()->notifier([&](const std::string& s) { config.setDescMode(s); }),
         "Choice of descriptor extraction modes:\n"
         "loop, iloop, grid, igrid, notile\n"
-	"Default is loop\n"
+        "Default is loop\n"
         "loop is OpenCV-like horizontal scanning, computing only valid points, grid extracts only useful points but rounds them, iloop uses linear texture and rotated gradiant fetching. igrid is grid with linear interpolation. notile is like igrid but avoids redundant gradiant fetching.")
-        ("popsift-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::PopSift); }),
-        "During the initial upscale, shift pixels by 1. In extrema refinement, steps up to 0.6, do not reject points when reaching max iterations, "
-        "first contrast threshold is .8 * peak thresh. Shift feature coords octave 0 back to original pos.")
-        ("vlfeat-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::VLFeat); }),
-        "During the initial upscale, shift pixels by 1. That creates a sharper upscaled image. "
-        "In extrema refinement, steps up to 0.6, levels remain unchanged, "
-        "do not reject points when reaching max iterations, "
-        "first contrast threshold is .8 * peak thresh.")
-        ("opencv-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::OpenCV); }),
-        "During the initial upscale, shift pixels by 0.5. "
-        "In extrema refinement, steps up to 0.5, "
-        "reject points when reaching max iterations, "
-        "first contrast threshold is floor(.5 * peak thresh). "
-        "Computed filter width are lower than VLFeat/PopSift")
-        ("direct-scaling", bool_switch()->notifier([&](bool b) { if(b) config.setScalingMode(popsift::Config::ScaleDirect); }),
-         "Direct each octave from upscaled orig instead of blurred level.")
-        ("norm-multi", value<int>()->notifier([&](int i) {config.setNormalizationMultiplier(i); }), "Multiply the descriptor by pow(2,<int>).")
+        ( "popsift-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::RefineInOctave); }),
+        "In extrema refinement, it is possible to move extrema within a level but also between the levels of an octave.")
+        ( "vlfeat-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::RefineInLevel); }),
+        "In extrema refinement, it is possible to move extrema within a level only.")
+        ( "norm-multi", value<int>()->notifier([&](int i) {config.setNormalizationMultiplier(i); }), "Multiply the descriptor by pow(2,<int>).")
         ( "norm-mode", value<std::string>()->notifier([&](const std::string& s) { config.setNormMode(s); }),
           popsift::Config::getNormModeUsage() )
         ( "root-sift", bool_switch()->notifier([&](bool b) { if(b) config.setNormMode(popsift::Config::RootSift); }),
diff --git a/src/popsift/gauss_filter.cu b/src/popsift/gauss_filter.cu
index 7c425f7f..362845e7 100755
--- a/src/popsift/gauss_filter.cu
+++ b/src/popsift/gauss_filter.cu
@@ -67,42 +67,7 @@ void print_gauss_filter_symbol( int columns )
 
     printf( "\n"
             "Gauss tables\n"
-            "      level span sigma : center value -> edge value\n"
-            "      absolute filters octave 0 (compute level 0, all other levels directly from level 0)\n");
-
-    for( int lvl=0; lvl<d_gauss.required_filter_stages; lvl++ ) {
-        int span = d_gauss.abs_o0.span[lvl] + d_gauss.abs_o0.span[lvl] - 1;
-
-        printf("      %d %d %2.6f: ", lvl, span, d_gauss.abs_o0.sigma[lvl] );
-        int m = min( d_gauss.abs_o0.span[lvl], columns );
-        for( int x=0; x<m; x++ ) {
-            printf("%0.8f ", d_gauss.abs_o0.filter[lvl*GAUSS_ALIGN+x] );
-        }
-        if( m < d_gauss.abs_o0.span[lvl] )
-            printf("...\n");
-        else
-            printf("\n");
-    }
-    printf( "\n"
-            "      absolute filters other octaves\n"
-            "      (level 0 via downscaling, all other levels directly from level 0)\n");
-
-    for( int lvl=0; lvl<d_gauss.required_filter_stages; lvl++ ) {
-        int span = d_gauss.abs_oN.span[lvl] + d_gauss.abs_oN.span[lvl] - 1;
-
-        printf("      %d %d %2.6f: ", lvl, span, d_gauss.abs_oN.sigma[lvl] );
-        int m = min( d_gauss.abs_oN.span[lvl], columns );
-        for( int x=0; x<m; x++ ) {
-            printf("%0.8f ", d_gauss.abs_oN.filter[lvl*GAUSS_ALIGN+x] );
-        }
-        if( m < d_gauss.abs_oN.span[lvl] )
-            printf("...\n");
-        else
-            printf("\n");
-    }
-    printf("\n");
-
-    printf("    level 0-filters for direct downscaling\n");
+            "    level 0-filters for direct downscaling\n");
 
     for( int lvl=0; lvl<MAX_OCTAVES; lvl++ ) {
         int span = d_gauss.dd.span[lvl] + d_gauss.dd.span[lvl] - 1;
@@ -187,33 +152,6 @@ void init_filter( const Config& conf,
 
     h_gauss.inc.computeBlurTable( &h_gauss );
 
-    /* abs_o0 :
-     * Gauss table to create octave 0 of the absolute filters directly from
-     * input images.
-     */
-    for( int lvl=0; lvl<h_gauss.required_filter_stages; lvl++ ) {
-        const float sigmaS = sigma0 * pow( 2.0f, (float)(lvl)/(float)levels );
-        h_gauss.abs_o0.sigma[lvl]  = sqrt( fabs( sigmaS * sigmaS - initial_blur * initial_blur ) );
-    }
-
-    h_gauss.abs_o0.computeBlurTable( &h_gauss );
-
-    /* abs_oN :
-     * Gauss tables to create levels 1 and above directly from level 0 of every
-     * octave. Could be used on octave 0, but abs_o0 is better.
-     * Level 0 must be created by other means (downscaling from previous octave,
-     * direct downscaling from input image, ...) before using abs_oN.
-     * 
-     */
-    h_gauss.abs_oN.sigma[0] = 0;
-    for( int lvl=1; lvl<h_gauss.required_filter_stages; lvl++ ) {
-        const float sigmaP = sigma0; // level 0 has already reached sigma0 blur
-        const float sigmaS = sigma0 * pow( 2.0f, (float)(lvl)/(float)levels );
-        h_gauss.abs_oN.sigma[lvl] = sqrt( sigmaS * sigmaS - sigmaP * sigmaP );
-    }
-
-    h_gauss.abs_oN.computeBlurTable( &h_gauss );
-
     /* dd :
      * The direct-downscaling kernels make use of the assumption that downscaling
      * from MAX_LEVEL-3 is identical to applying 2*sigma on the identical image
@@ -224,17 +162,13 @@ void init_filter( const Config& conf,
      * octaves, where it is also good for performance.
      * dd is only for creating level 0 of all octave directly from the input image.
      */
-    for( int oct=0; oct<MAX_OCTAVES; oct++ ) {
-        // sigma * 2^i
-        float oct_sigma = scalbnf( sigma0, oct );
 
-        // subtract initial blur
-        float b = sqrt( fabs( oct_sigma * oct_sigma - initial_blur * initial_blur ) );
+    // subtract initial blur
+    const float b = sqrt( fabs( sigma0 * sigma0 - initial_blur * initial_blur ) );
 
-        // sigma / 2^i
-        h_gauss.dd.sigma[oct] = scalbnf( b, -oct );
-        h_gauss.dd.computeBlurTable( &h_gauss );
-    }
+    // sigma / 2^i
+    h_gauss.dd.sigma[0] = b;
+    h_gauss.dd.computeBlurTable( &h_gauss );
 
     cudaError_t err;
     err = cudaMemcpyToSymbol( d_gauss,
@@ -260,8 +194,6 @@ __host__
 void GaussInfo::clearTables( )
 {
     inc            .clearTables();
-    abs_o0         .clearTables();
-    abs_oN         .clearTables();
     dd             .clearTables();
 }
 
@@ -276,20 +208,10 @@ int GaussInfo::getSpan( float sigma ) const
 {
     switch( _span_mode )
     {
-    case Config::VLFeat_Relative_All :
-        // return GaussInfo::vlFeatRelativeSpan( sigma );
-        return GaussInfo::vlFeatSpan( sigma );
-
     case Config::VLFeat_Compute :
         return GaussInfo::vlFeatSpan( sigma );
     case Config::VLFeat_Relative :
         return GaussInfo::vlFeatRelativeSpan( sigma );
-    case Config::OpenCV_Compute :
-        return GaussInfo::openCVSpan( sigma );
-    case Config::Fixed9 :
-        return 5;
-    case Config::Fixed15 :
-        return 8;
     default :
         stringstream ss;
         ss << "ERROR: The mode for computing Gauss filter scan is invalid";
@@ -317,15 +239,6 @@ int GaussInfo::vlFeatRelativeSpan( float sigma )
     return spn;
 }
 
-__host__
-int GaussInfo::openCVSpan( float sigma )
-{
-    int span = int( roundf( 2.0f * 4.0f * sigma + 1.0f ) ) | 1;
-    span >>= 1;
-    span  += 1;
-    return std::min<int>( span, GAUSS_ALIGN - 1 );
-}
-
 template<int LEVELS>
 __host__
 void GaussTable<LEVELS>::clearTables( )
diff --git a/src/popsift/gauss_filter.h b/src/popsift/gauss_filter.h
index db1a8c25..0a2c50cb 100755
--- a/src/popsift/gauss_filter.h
+++ b/src/popsift/gauss_filter.h
@@ -61,25 +61,11 @@ struct GaussInfo
      */
     GaussTable<GAUSS_LEVELS> inc;
 
-    /* Compute the 1D Gauss tables for all levels of octave 0.
-     * For octave 0, all of these tables derive from the input
-     * image.
+    /* This is the 1D Gauss table for filtering the input image.
+     * The input image is downscaled and blurred with sigma or by
+     * blurring the input image with 2*sigma and downscaling afterwards.
      */
-    GaussTable<GAUSS_LEVELS> abs_o0;
-
-    /* Compute the 1D Gauss tables for all levels of octaves 1 and up.
-     * Level 0 is empty, since it is created by other means.
-     * All other levels blur from level 0, not considering any
-     * initial blur.
-     */
-    GaussTable<GAUSS_LEVELS> abs_oN;
-
-    /* In theory, level 0 of octave 2 contains the same information
-     * whether it is constructed by downscaling and blurring the
-     * input image with sigma or by blurring the input image with 2*sigma
-     * and downscaling afterwards.
-     */
-    GaussTable<MAX_OCTAVES> dd;
+    GaussTable<1> dd;
 
     __host__
     void clearTables( );
@@ -99,9 +85,6 @@ struct GaussInfo
 
     __host__
     static int vlFeatRelativeSpan( float sigma );
-
-    __host__
-    static int openCVSpan( float sigma );
 };
 
 extern __device__ __constant__ GaussInfo d_gauss;
diff --git a/src/popsift/s_extrema.cu b/src/popsift/s_extrema.cu
index 5c1acc44..2ef04523 100644
--- a/src/popsift/s_extrema.cu
+++ b/src/popsift/s_extrema.cu
@@ -123,101 +123,30 @@ template<int sift_mode>
 class ModeFunctions
 {
 public:
-    inline __device__
-    bool first_contrast_ok( float val ) const;
-
     /* refine
-     * returns -1 : break loop and fail
-     *          0 : continue looping
-     *          1 : break loop and succeed
+     * returns 0 : continue looping
+     *         1 : break loop and succeed
      */
     inline __device__
     int refine( float3& d, int3& n, int width, int height, int maxlevel, bool last_it );
-
-    /*
-     * returns true  : values after refine make sense
-     *         false : they do not
-     */
-    inline __device__
-    bool verify( float xn, float yn, float sn, int width, int height, int maxlevel ) const;
-};
-
-template<>
-class ModeFunctions<Config::OpenCV>
-{
-public:
-    inline __device__
-    bool first_contrast_ok( float val ) const
-    {
-        return ( fabsf( val ) >= floorf( d_consts.threshold ) );
-    }
-
-    inline __device__
-    int refine( float3& d, int3& n, int width, int height, int maxlevel, bool last_it ) const
-    {
-        // OpenCV mode is a special case because d remains unmodified.
-        // Either we return 1, and n has not been modified.
-        // Or we quit the loop by exceeding the limit, and reject the point anyway.
-
-        const float3 t = make_float3( fabsf(d.x), fabsf(d.y), fabsf(d.z) );
-
-        if( t.x < 0.5f && t.y < 0.5f && t.z < 0.5f ) {
-            // return false, quit the loop, success
-            return 1;
-        }
-
-        // This test in OpenCV is totally useless in CUDA because the thread
-        // would simply idle before failing 7 instructions below anyway.
-        // if( t.x > (float)(INT_MAX/3) || t.y > (float)(INT_MAX/3) || t.z > (float)(INT_MAX/3) ) {
-            // return false, quit the loop, fail
-            // return -1;
-        // }
-
-        n.x += roundf( d.x );  // choose rintf or roundf
-        n.y += roundf( d.y );  // rintf is quicker, roundf is more exact
-        n.z += roundf( d.z );
-
-        const int retval = ( n.x < 5 || n.x >= width-5 ||
-                             n.y < 5 || n.y >= height-5 ||
-                             n.z < 1 || n.z > maxlevel-2 ) ? -1 : 0;
-            // if outside of all DoG images (minus border),
-            // quit the loop, fail
-
-        return retval;
-    }
-
-    inline __device__
-    bool verify( float xn, float yn, float sn, int width, int height, int maxlevel ) const
-    {
-        return true;
-    }
 };
 
 template<>
-class ModeFunctions<Config::VLFeat>
+class ModeFunctions<Config::RefineInLevel>
 {
 public:
-    inline __device__
-    bool first_contrast_ok( const float val ) const
-    {
-        return ( fabsf( val ) >= 0.8f * 2.0f * d_consts.threshold );
-    }
-
     inline __device__
     int refine( float3& d, int3& n, int width, int height, int maxlevel, bool last_it ) const
     {
         if( last_it ) return 0;
 
-        float2 t;
+        int2 t;
 
-        t.x = ((d.x >= 0.6f && n.x < width-2) ?  1.0f : 0.0f )
-            + ((d.x <= -0.6f && n.x > 1)? -1.0f : 0.0f );
-
-        t.y = ((d.y >= 0.6f && n.y < height-2)  ?  1.0f : 0.0f )
-            + ((d.y <= -0.6f && n.y > 1) ? -1.0f : 0.0f );
+        t.x = ((d.x >=  0.6f && n.x < width-2) ?  1 : 0 )
+            + ((d.x <= -0.6f && n.x > 1)       ? -1 : 0 );
 
-        // t.z = ((d.z >= 0.6f && n.z < maxlevel-1)  ?  1 : 0 )
-            // + ((d.z <= -0.6f && n.z > 1) ? -1 : 0 );
+        t.y = ((d.y >=  0.6f && n.y < height-2)  ?  1 : 0 )
+            + ((d.y <= -0.6f && n.y > 1)         ? -1 : 0 );
 
         if( t.x == 0 && t.y == 0 ) {
             // no more changes
@@ -230,31 +159,12 @@ public:
 
         return 0;
     }
-
-    inline __device__
-    bool verify( float xn, float yn, float sn, int width, int height, int maxlevel ) const
-    {
-        // reject if outside of image bounds or far outside DoG bounds
-        return ( ( xn < 0.0f ||
-                   xn > width - 1.0f ||
-                   yn < 0.0f ||
-                   yn > height - 1.0f ||
-                   sn < 0.0f ||
-                   sn > maxlevel ) ? false
-                                   : true );
-    }
 };
 
 template<>
-class ModeFunctions<Config::PopSift>
+class ModeFunctions<Config::RefineInOctave>
 {
 public:
-    inline __device__
-    bool first_contrast_ok( const float val ) const
-    {
-        return ( fabsf( val ) >= 1.6f * d_consts.threshold );
-    }
-
     inline __device__
     int refine( float3& d, int3& n, int width, int height, int maxlevel, bool last_it ) const
     {
@@ -282,21 +192,32 @@ public:
 
         return 0;
     }
-
-    inline __device__
-    bool verify( float xn, float yn, float sn, int width, int height, int maxlevel ) const
-    {
-        // reject if outside of image bounds or far outside DoG bounds
-        return ( ( xn < 0.0f ||
-                   xn > width - 1.0f ||
-                   yn < 0.0f ||
-                   yn > height - 1.0f ||
-                   sn < -0.0f ||
-                   sn > maxlevel ) ? false
-                                   : true );
-    }
 };
 
+__device__ inline static
+bool first_contrast_ok( const float val )
+{
+    return ( fabsf( val ) >= 1.6f * d_consts.threshold );
+}
+
+/** verify() checks whether a refine position is outside the image boundaries or
+ *  outside the DoG boundaries.
+ *  returns true  : values after refine make sense
+ *          false : they do not
+ */
+__device__ inline static
+bool verify( float xn, float yn, float sn, int width, int height, int maxlevel )
+{
+    // reject if outside of image bounds or far outside DoG bounds
+    return ( ( xn < 0.0f ||
+               xn > width - 1.0f ||
+               yn < 0.0f ||
+               yn > height - 1.0f ||
+               sn < -0.0f ||
+               sn > maxlevel ) ? false
+                               : true );
+}
+
 template<int sift_mode>
 __device__ inline bool find_extrema_in_dog_sub(cudaTextureObject_t dog,
                                                int debug_octave,
@@ -332,16 +253,10 @@ __device__ inline bool find_extrema_in_dog_sub(cudaTextureObject_t dog,
     const int x       = block_x + threadIdx.x + 1;
     const int level   = block_z + 1;
 
-    if( sift_mode == Config::OpenCV ) {
-        if( x < 5 || y < 5 || x >= width-5 || y >= height-5 ) {
-            return false;
-        }
-    }
-
     const float val = readTex( dog, x, y, level );
 
     ModeFunctions<sift_mode> f;
-    if( ! f.first_contrast_ok( val ) ) return false;
+    if( ! first_contrast_ok( val ) ) return false;
 
     if( ! is_extremum( dog, x-1, y-1, level-1 ) ) {
         // if( debug_octave==0 && level==2 && x==14 && y==73 ) printf("But I fail\n");
@@ -436,33 +351,22 @@ __device__ inline bool find_extrema_in_dog_sub(cudaTextureObject_t dog,
          */
         const int retval = f.refine( d, n, width, height, maxlevel, iter==MAX_ITERATIONS );
 
-        if( retval == -1 ) {
-            return false;
-        } else if( retval == 1 ) {
+        if( retval == 1 ) {
             break;
         }
     }
     while( iter < MAX_ITERATIONS ); /* go to next iter */
 
-    if (iter >= MAX_ITERATIONS) {
-        if( sift_mode == Config::OpenCV ) {
-            /* ensure convergence of interpolation */
-            return false;
-        }
-    }
-
-    if( sift_mode == Config::PopSift || sift_mode == Config::VLFeat ) {
-        if( d.x >= 1.5f || d.y >= 1.5f || d.z >= 1.5f ) {
-            // excessive pixel movement in at least dimension, reject
-            return false;
-        }
+    if( d.x >= 1.5f || d.y >= 1.5f || d.z >= 1.5f ) {
+        // excessive pixel movement in at least dimension, reject
+        return false;
     }
 
     const float xn      = n.x + d.x;
     const float yn      = n.y + d.y;
     const float sn      = n.z + d.z;
 
-    if( ! f.verify( xn, yn, sn, width, height, maxlevel ) ) {
+    if( ! verify( xn, yn, sn, width, height, maxlevel ) ) {
         return false;
     }
 
@@ -472,9 +376,6 @@ __device__ inline bool find_extrema_in_dog_sub(cudaTextureObject_t dog,
     const float det     = DD.x * DD.y - DX.x * DX.x;
     const float edgeval = tr * tr / det;
 
-    // redundant check, verify() is stricter
-    // if( sift_mode == Config::PopSift && iter >= MAX_ITERATIONS && ( sn<0 || sn>maxlevel) ) { return false; }
-
     /* negative determinant => curvatures have different signs -> reject it */
     if (det <= 0.0f) {
         return false;
@@ -580,32 +481,12 @@ void Pyramid::find_extrema( const Config& conf )
 
         int*  num_blocks      = extrema_num_blocks;
 
-#ifdef USE_DOG_TEX_LINEAR
-#define getDogTexture getDogTextureLinear
-#else
-#define getDogTexture getDogTexturePoint
-#endif
         switch( conf.getSiftMode() )
         {
-        case Config::VLFeat :
-                find_extrema_in_dog<HEIGHT,Config::VLFeat>
-                    <<<grid,block,0,oct_str>>>
-                    ( oct_obj.getDogTexture( ),
-                      octave,
-                      cols,
-                      rows,
-                      _levels-1,
-                      num_blocks,
-                      grid.x * grid.y,
-                      oct_obj.getWGridDivider(),
-                      oct_obj.getHGridDivider(),
-                      conf.getFilterGridSize() );
-                POP_SYNC_CHK;
-                break;
-        case Config::OpenCV :
-                find_extrema_in_dog<HEIGHT,Config::OpenCV>
+        case Config::RefineInLevel :
+                find_extrema_in_dog<HEIGHT,Config::RefineInLevel>
                     <<<grid,block,0,oct_str>>>
-                    ( oct_obj.getDogTexture( ),
+                    ( oct_obj.getDogTexturePoint( ),
                       octave,
                       cols,
                       rows,
@@ -618,9 +499,9 @@ void Pyramid::find_extrema( const Config& conf )
                 POP_SYNC_CHK;
                 break;
         default :
-                find_extrema_in_dog<HEIGHT,Config::PopSift>
+                find_extrema_in_dog<HEIGHT,Config::RefineInOctave>
                     <<<grid,block,0,oct_str>>>
-                    ( oct_obj.getDogTexture( ),
+                    ( oct_obj.getDogTexturePoint( ),
                       octave,
                       cols,
                       rows,
@@ -633,7 +514,6 @@ void Pyramid::find_extrema( const Config& conf )
                 POP_SYNC_CHK;
                 break;
         }
-#undef getDogTexture
 
         cuda::event_record( oct_obj.getEventExtremaDone(), oct_str, __FILE__, __LINE__ );
     }
diff --git a/src/popsift/s_pyramid_build.cu b/src/popsift/s_pyramid_build.cu
index 8873ca5c..b2921eba 100755
--- a/src/popsift/s_pyramid_build.cu
+++ b/src/popsift/s_pyramid_build.cu
@@ -9,9 +9,6 @@
 #include "common/clamp.h"
 #include "common/debug_macros.h"
 #include "gauss_filter.h"
-#include "s_pyramid_build_aa.h"
-#include "s_pyramid_build_ai.h"
-#include "s_pyramid_build_ra.h"
 #include "sift_constants.h"
 #include "sift_pyramid.h"
 
@@ -29,24 +26,6 @@ namespace popsift {
 
 namespace gauss {
 
-__global__
-void get_by_2_interpolate( cudaTextureObject_t src_data,
-                           const int           src_level,
-                           cudaSurfaceObject_t dst_data,
-                           const int           dst_w,
-                           const int           dst_h )
-{
-    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    const int idy = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if( idx >= dst_w ) return;
-    if( idy >= dst_h ) return;
-
-    const float val = readTex( src_data, 2.0f * idx + 1.0f, 2.0f * idy + 1.0f, src_level );
-
-    surf2DLayeredwrite( val, dst_data, idx*4, idy, 0, cudaBoundaryModeZero ); // dst_data.ptr(idy)[idx] = val;
-}
-
 __global__
 void get_by_2_pick_every_second( cudaTextureObject_t src_data,
                                  const int           src_w,
@@ -94,115 +73,7 @@ void make_dog( cudaTextureObject_t src_data,
 } // namespace gauss
 
 __host__
-inline void Pyramid::horiz_from_input_image( const Config& conf, ImageBase* base, int octave, cudaStream_t stream )
-{
-    Octave&   oct_obj = _octaves[octave];
-
-    const int width   = oct_obj.getWidth();
-    const int height  = oct_obj.getHeight();
-
-    dim3 block( 128, 1 );
-    dim3 grid;
-    grid.x  = grid_divide( width,  128 );
-    grid.y  = height;
-
-    const Config::SiftMode& mode = conf.getSiftMode();
-    float shift  = 0.5f;
-
-    if( octave == 0 && ( mode == Config::PopSift || mode == Config::VLFeat ) ) {
-        shift  = 0.5f * powf( 2.0f, conf.getUpscaleFactor() - octave );
-    }
-
-    gauss::normalizedSource::horiz
-        <<<grid,block,0,stream>>>
-        ( base->getInputTexture(),
-          oct_obj.getIntermediateSurface(),
-          width,
-          height,
-          octave,
-          shift );
-
-    POP_SYNC_CHK;
-}
-
-__host__
-inline void Pyramid::horiz_level_from_input_image( const Config& conf, ImageBase* base, int octave, int level, cudaStream_t stream )
-{
-    if( octave != 0 )
-    {
-        POP_FATAL( "Unsupported parameter octave != 0" );
-    }
-
-    Octave&   oct_obj = _octaves[octave];
-
-    const int width   = oct_obj.getWidth();
-    const int height  = oct_obj.getHeight();
-
-    dim3 block( 128, 1 );
-    dim3 grid;
-    grid.x  = grid_divide( width,  128 );
-    grid.y  = height;
-
-    const Config::SiftMode& mode = conf.getSiftMode();
-    float shift  = 0.5f;
-
-    if( octave == 0 && ( mode == Config::PopSift || mode == Config::VLFeat ) ) {
-        shift  = 0.5f * powf( 2.0f, conf.getUpscaleFactor() - octave );
-    }
-
-    gauss::normalizedSource::horiz_level
-        <<<grid,block,0,stream>>>
-        ( base->getInputTexture(),
-          oct_obj.getIntermediateSurface(),
-          width,
-          height,
-          octave,
-          level,
-          shift );
-
-    POP_SYNC_CHK;
-}
-
-__host__
-inline void Pyramid::horiz_all_from_input_image( const Config& conf, ImageBase* base, int octave, int startlevel, int maxlevel, cudaStream_t stream )
-{
-    if( octave != 0 )
-    {
-        POP_FATAL( "Unsupported parameter octave != 0" );
-    }
-
-    Octave&      oct_obj = _octaves[octave];
-
-    const int width  = oct_obj.getWidth();
-    const int height = oct_obj.getHeight();
-
-    dim3 block( 128, 1 );
-    dim3 grid;
-    grid.x  = grid_divide( width,  128 );
-    grid.y  = height;
-
-    const Config::SiftMode& mode = conf.getSiftMode();
-    float shift  = 0.5f;
-
-    if( mode == Config::PopSift || mode == Config::VLFeat ) {
-        shift  = 0.5f * powf( 2.0f, conf.getUpscaleFactor() );
-    }
-
-    gauss::normalizedSource::horiz_all
-        <<<grid,block,0,stream>>>
-        ( base->getInputTexture(),
-          oct_obj.getIntermediateSurface( ),
-          width,
-          height,
-          shift,
-          maxlevel );
-
-    POP_SYNC_CHK;
-}
-
-
-__host__
-inline void Pyramid::downscale_from_prev_octave( int octave, cudaStream_t stream, Config::SiftMode mode )
+inline void Pyramid::downscale_from_prev_octave( int octave, cudaStream_t stream )
 {
     Octave&      oct_obj = _octaves[octave];
     Octave& prev_oct_obj = _octaves[octave-1];
@@ -215,86 +86,34 @@ inline void Pyramid::downscale_from_prev_octave( int octave, cudaStream_t stream
     h_grid.x = (unsigned int)grid_divide( width,  h_block.x );
     h_grid.y = (unsigned int)grid_divide( height, h_block.y );
 
-    switch( mode )
-    {
-    case Config::PopSift :
-    case Config::VLFeat :
-    case Config::OpenCV :
-        gauss::get_by_2_pick_every_second
-            <<<h_grid,h_block,0,stream>>>
-            ( prev_oct_obj.getDataTexPoint( ),
-              prev_oct_obj.getWidth(),
-              prev_oct_obj.getHeight(),
-              _levels-PREV_LEVEL,
-              oct_obj.getDataSurface( ),
-              oct_obj.getWidth(),
-              oct_obj.getHeight() );
-
-        POP_SYNC_CHK;
-        break;
-    default :
-        gauss::get_by_2_interpolate
-            <<<h_grid,h_block,0,stream>>>
-            ( prev_oct_obj.getDataTexLinear( ).tex,
-              _levels-PREV_LEVEL,
-              oct_obj.getDataSurface( ),
-              oct_obj.getWidth(),
-              oct_obj.getHeight() );
-
-        POP_SYNC_CHK;
-        break;
-    }
+    gauss::get_by_2_pick_every_second
+        <<<h_grid,h_block,0,stream>>>
+        ( prev_oct_obj.getDataTexPoint( ),
+          prev_oct_obj.getWidth(),
+          prev_oct_obj.getHeight(),
+          _levels-PREV_LEVEL,
+          oct_obj.getDataSurface( ),
+          oct_obj.getWidth(),
+          oct_obj.getHeight() );
+
+    POP_SYNC_CHK;
 }
 
 __host__
 inline void Pyramid::horiz_from_prev_level( int octave, int level, cudaStream_t stream, GaussTableChoice useInterpolatedGauss )
 {
-    Octave&      oct_obj = _octaves[octave];
-
-    const int width  = oct_obj.getWidth();
-    const int height = oct_obj.getHeight();
-
     switch( useInterpolatedGauss )
     {
     case Interpolated_FromPrevious :
-        {
-            dim3 block( 128, 1 );
-            dim3 grid;
-            grid.x  = grid_divide( width,  128 );
-            grid.y  = height;
-
-            gauss::absoluteSourceInterpolated::horiz
-                <<<grid,block,0,stream>>>
-                ( oct_obj.getDataTexLinear( ).tex,
-                  oct_obj.getIntermediateSurface( ),
-                  level );
-        }
+        horiz_from_prev_level_pairs( octave, level, stream );
         break;
     case NotInterpolated_FromPrevious :
-        {
-            dim3 block( 32,  8 ); // most stable good perf on GTX 980 TI
-            // similar speed: dim3 block( 32,  4 ); dim3 block( 32,  3 ); dim3 block( 32,  2 );
-
-            dim3 grid;
-            grid.x  = grid_divide( width,  32 );
-            grid.y  = grid_divide( height, block.y );
-
-            gauss::absoluteSource::horiz
-                <<<grid,block,0,stream>>>
-                ( oct_obj.getDataTexPoint( ),
-                  oct_obj.getIntermediateSurface( ),
-                  level );
-        }
-        break;
-    case Interpolated_FromFirst :
-    case NotInterpolated_FromFirst :
-        POP_FATAL( "Case horizontal Gauss filtering from first level makes not sense in case horizontal Gauss filter from previous level" );
+        horiz_from_prev_level_basic( octave, level, stream );
         break;
     default :
         POP_FATAL( "Missing case in horizontal Gauss filter from previous level" );
         break;
     }
-    POP_SYNC_CHK;
 }
 
 __host__
@@ -302,69 +121,16 @@ inline void Pyramid::vert_from_interm( int octave, int level, cudaStream_t strea
 {
     Octave& oct_obj = _octaves[octave];
 
-    /* waiting for any events is not necessary, it's in the same stream as horiz
-     */
-
     const int width  = oct_obj.getWidth();
     const int height = oct_obj.getHeight();
 
     switch( useInterpolatedGauss )
     {
     case Interpolated_FromPrevious :
-        {
-            dim3 block( 4, 32 );
-            dim3 grid;
-            grid.x = (unsigned int)grid_divide( width,  block.y );
-            grid.y = (unsigned int)grid_divide( height, block.x );
-
-            gauss::absoluteSourceInterpolated::vert
-                <<<grid,block,0,stream>>>
-                ( oct_obj.getIntermDataTexLinear( ).tex,
-                  oct_obj.getDataSurface( ),
-                  level );
-        }
-        break;
-    case Interpolated_FromFirst :
-        {
-            dim3 block( 4, 32 );
-            dim3 grid;
-            grid.x = (unsigned int)grid_divide( width,  block.y );
-            grid.y = (unsigned int)grid_divide( height, block.x );
-
-            gauss::absoluteSourceInterpolated::vert_abs0
-                <<<grid,block,0,stream>>>
-                ( oct_obj.getIntermDataTexLinear( ).tex,
-                  oct_obj.getDataSurface( ),
-                  level );
-        }
+        vert_from_interm_pairs( octave, level, stream );
         break;
     case NotInterpolated_FromPrevious :
-        {
-            dim3 block( 64, 2 );
-            dim3 grid;
-            grid.x = (unsigned int)grid_divide( width,  block.x );
-            grid.y = (unsigned int)grid_divide( height, block.y );
-
-            gauss::absoluteSource::vert
-                <<<grid,block,0,stream>>>
-                ( oct_obj.getIntermDataTexPoint( ),
-                  oct_obj.getDataSurface( ),
-                  level );
-        }
-        break;
-    case NotInterpolated_FromFirst :
-        {
-            dim3 block( 64, 2 );
-            dim3 grid;
-            grid.x = (unsigned int)grid_divide( width,  block.x );
-            grid.y = (unsigned int)grid_divide( height, block.y );
-
-            gauss::absoluteSource::vert_abs0
-                <<<grid,block,0,stream>>>
-                ( oct_obj.getIntermDataTexPoint( ),
-                  oct_obj.getDataSurface( ),
-                  level );
-        }
+        vert_from_interm_basic( octave, level, stream );
         break;
     default :
         {
@@ -375,60 +141,6 @@ inline void Pyramid::vert_from_interm( int octave, int level, cudaStream_t strea
     POP_SYNC_CHK;
 }
 
-__host__
-inline void Pyramid::vert_all_from_interm( int octave, int start_level, int max_level, cudaStream_t stream, GaussTableChoice useInterpolatedGauss )
-{
-    Octave& oct_obj = _octaves[octave];
-
-    /* waiting for any events is not necessary, it's in the same stream as horiz
-     */
-
-    const int width  = oct_obj.getWidth();
-    const int height = oct_obj.getHeight();
-
-    switch( useInterpolatedGauss )
-    {
-    case Interpolated_FromFirst :
-        {
-            dim3 block( 4, 32 );
-            dim3 grid;
-            grid.x = (unsigned int)grid_divide( width,  block.y );
-            grid.y = (unsigned int)grid_divide( height, block.x );
-
-            gauss::absoluteSourceInterpolated::vert_all_abs0
-                <<<grid,block,0,stream>>>
-                ( oct_obj.getIntermDataTexLinear( ).tex,
-                  oct_obj.getDataSurface( ),
-                  start_level,
-                  max_level );
-        }
-        break;
-    case NotInterpolated_FromFirst :
-        {
-            dim3 block( 64, 2 );
-            dim3 grid;
-            grid.x = (unsigned int)grid_divide( width,  block.x );
-            grid.y = (unsigned int)grid_divide( height, block.y );
-
-            gauss::absoluteSource::vert_all_abs0
-                <<<grid,block,0,stream>>>
-                ( oct_obj.getIntermDataTexPoint( ),
-                  oct_obj.getDataSurface( ),
-                  start_level,
-                  max_level );
-        }
-        break;
-    case Interpolated_FromPrevious :
-    case NotInterpolated_FromPrevious :
-        POP_FATAL( "Case horizontal Gauss filtering from intermediate level makes not sense in case vertial-all Gauss filter from previous level" );
-        break;
-    default :
-        POP_FATAL( "Missing case in vertical-all Gauss filter from intermediate buffer" );
-        break;
-    }
-    POP_SYNC_CHK;
-}
-
 __host__
 inline void Pyramid::dogs_from_blurred( int octave, int max_level, cudaStream_t stream )
 {
@@ -471,121 +183,54 @@ void Pyramid::build_pyramid( const Config& conf, ImageBase* base )
 
     cudaDeviceSynchronize();
 
-    for( uint32_t octave=0; octave<_num_octaves; octave++ ) {
+    GaussTableChoice gaussTableChoice;
+
+    if( conf.getGaussMode() == Config::VLFeat_Relative )
+        gaussTableChoice = Interpolated_FromPrevious;
+    else
+        gaussTableChoice = NotInterpolated_FromPrevious;
+
+    for( uint32_t octave=0; octave<_num_octaves; octave++ )
+    {
         Octave&      oct_obj = _octaves[octave];
         cudaStream_t stream  = oct_obj.getStream();
 
-        if( ( conf.getScalingMode() == Config::ScaleDirect ) &&
-            ( conf.getGaussMode() == Config::Fixed9 || conf.getGaussMode() == Config::Fixed15 ) ) {
-            if( octave == 0 ) {
-                make_octave( conf, base, oct_obj, stream, true );
-            } else {
-                horiz_from_input_image( conf, base, octave, stream );
-                vert_from_interm( octave, 0, stream, NotInterpolated_FromPrevious );
-                make_octave( conf, base, oct_obj, stream, false );
-            }
-        } else if( conf.getGaussMode() == Config::Fixed9 || conf.getGaussMode() == Config::Fixed15 ) {
-            if( octave == 0 ) {
-                make_octave( conf, base, oct_obj, stream, true );
-            } else {
-                Octave& prev_oct_obj = _octaves[octave-1];
-                cuda::event_wait( prev_oct_obj.getEventScaleDone(), stream, __FILE__, __LINE__ );
-
-                downscale_from_prev_octave( octave, stream, conf.getSiftMode() );
-                make_octave( conf, base, oct_obj, stream, false );
-            }
-
-            cuda::event_record( oct_obj.getEventScaleDone(), stream, __FILE__, __LINE__ );
-        } else if( conf.getScalingMode() == Config::ScaleDirect ) {
-            GaussTableChoice useGauss = ( conf.getGaussMode() == Config::VLFeat_Relative ) ? Interpolated_FromPrevious
-                                                                                           : NotInterpolated_FromPrevious;
-            for( int level=0; level<_levels; level++ )
-            {
-                if( level == 0 )
-                {
-                    horiz_from_input_image( conf, base, octave, stream );
-                    vert_from_interm( octave, level, stream, useGauss );
-                }
-                else
-                {
-                    horiz_from_prev_level( octave, level, stream, useGauss );
-                    vert_from_interm( octave, level, stream, useGauss );
-                }
-            }
-        } else if( conf.getGaussMode() == Config::VLFeat_Relative ) {
-            for( int level=0; level<_levels; level++ )
+        for( int level=0; level<_levels; level++ )
+        {
+            if( level == 0 )
             {
-                if( level == 0 )
+                if( octave == 0 )
                 {
-                    if( octave == 0 )
-                    {
-                        horiz_from_input_image( conf, base, 0, stream );
-                        vert_from_interm( octave, 0, stream, Interpolated_FromPrevious );
-                    }
-                    else
-                    {
-                        Octave& prev_oct_obj = _octaves[octave-1];
-                        cuda::event_wait( prev_oct_obj.getEventScaleDone(), stream, __FILE__, __LINE__ );
-
-                        downscale_from_prev_octave( octave, stream, conf.getSiftMode() );
-                    }
+                    horiz_from_input_image( conf, base, stream );
+                    vert_from_interm( octave, 0, stream, gaussTableChoice );
                 }
                 else
                 {
-                    horiz_from_prev_level( octave, level, stream, Interpolated_FromPrevious );
-                    vert_from_interm( octave, level, stream, Interpolated_FromPrevious );
-
-                    if( level == _levels - PREV_LEVEL ) {
-                        cuda::event_record( oct_obj.getEventScaleDone(), stream, __FILE__, __LINE__ );
-                    }
+                    Octave& prev_oct_obj = _octaves[octave-1];
+                    cuda::event_wait( prev_oct_obj.getEventScaleDone(), stream, __FILE__, __LINE__ );
+                    downscale_from_prev_octave( octave, stream );
                 }
             }
-        } else if( octave == 0 && conf.getGaussMode() == Config::VLFeat_Relative_All ) {
-            horiz_all_from_input_image( conf, base, octave, 0, _levels, stream );
-            vert_all_from_interm( octave, 0, _levels, stream, NotInterpolated_FromFirst );
-            cuda::event_record( oct_obj.getEventScaleDone(), stream, __FILE__, __LINE__ );
-        } else {
-            for( int level=0; level<_levels; level++ )
+            else
             {
-                if( level == 0 )
+                horiz_from_prev_level( octave, level, stream, gaussTableChoice );
+                vert_from_interm( octave, level, stream, gaussTableChoice );
+                if( level == _levels - PREV_LEVEL )
                 {
-                    if( octave == 0 )
-                    {
-                        horiz_from_input_image( conf, base, 0, stream );
-                        vert_from_interm( octave, 0, stream, NotInterpolated_FromPrevious );
-                    }
-                    else
-                    {
-                        Octave& prev_oct_obj = _octaves[octave-1];
-                        cuda::event_wait( prev_oct_obj.getEventScaleDone(), stream, __FILE__, __LINE__ );
-
-                        downscale_from_prev_octave( octave, stream, conf.getSiftMode() );
-                    }
-                }
-                else
-                {
-                    horiz_from_prev_level( octave, level, stream, NotInterpolated_FromPrevious );
-                    vert_from_interm( octave, level, stream, NotInterpolated_FromPrevious );
-
-                    if( level == _levels - PREV_LEVEL ) {
-                        cuda::event_record( oct_obj.getEventScaleDone(), stream, __FILE__, __LINE__ );
-                    }
+                    cuda::event_record( oct_obj.getEventScaleDone(), stream, __FILE__, __LINE__ );
                 }
             }
         }
     }
-    // for( int octave=_num_octaves-1; octave>=0; octave-- )
+
     for( int octave=0; octave<_num_octaves; octave++ )
     {
-        if( conf.getGaussMode() == Config::Fixed9 || conf.getGaussMode() == Config::Fixed15 ) {
-        } else {
-            Octave&      oct_obj = _octaves[octave];
-            cudaStream_t stream  = oct_obj.getStream();
-            dogs_from_blurred( octave, _levels, stream );
-        }
+        Octave&      oct_obj = _octaves[octave];
+        cudaStream_t stream  = oct_obj.getStream();
+        dogs_from_blurred( octave, _levels, stream );
     }
+
     for( int octave=0; octave<_num_octaves; octave++ )
-    // for( int octave=_num_octaves-1; octave>=0; octave-- )
     {
         Octave&      oct_obj = _octaves[octave];
         cudaStream_t stream  = oct_obj.getStream();
diff --git a/src/popsift/s_pyramid_build_aa.cu b/src/popsift/s_pyramid_build_aa.cu
index c026a8b7..a7c1ac28 100755
--- a/src/popsift/s_pyramid_build_aa.cu
+++ b/src/popsift/s_pyramid_build_aa.cu
@@ -1,5 +1,6 @@
 /*
  * Copyright 2016-2017, Simula Research Laboratory
+ *           2018-2024, University of Oslo
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -7,165 +8,123 @@
  */
 #include "common/assist.h"
 #include "gauss_filter.h"
-#include "s_pyramid_build_aa.h"
+#include "sift_pyramid.h"
 #include "sift_constants.h"
 
 namespace popsift {
-namespace gauss {
 namespace absoluteSource {
 
-__global__ void horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level)
+__global__ static void horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level)
 {
     const int    src_level = dst_level - 1;
     const int    span      =  d_gauss.inc.span[dst_level];
     const float* filter    = &d_gauss.inc.filter[dst_level*GAUSS_ALIGN];
+    const int    block_x   = blockIdx.x * blockDim.x;
+    const int    block_y   = blockIdx.y * blockDim.y;
+    const int    xpos      = block_x + threadIdx.x;
+    const int    ypos      = block_y + threadIdx.y;
 
-    const int off_x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int off_y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    float out = 0.0f;
-
-    float A = readTex( src_point_texture, off_x - span, off_y, src_level );
-    float B = readTex( src_point_texture, off_x + span, off_y, src_level );
-    float C = readTex( src_point_texture, off_x       , off_y, src_level );
-    float g  = filter[0];
-    out += C * g;
-    g    = filter[span];
-    out += ( A + B ) * g;
-
-    int shiftval = 0;
-    for( int offset=span-1; offset>0; offset-- ) {
-        shiftval += 1;
-        const float D1 = popsift::shuffle_down( A, shiftval );
-        const float D2 = popsift::shuffle_up  ( C, span - shiftval );
-        const float D  = threadIdx.x < (32 - shiftval) ? D1 : D2;
-        const float E1 = popsift::shuffle_up  ( B, shiftval );
-        const float E2 = popsift::shuffle_down( C, span - shiftval );
-        const float E  = threadIdx.x > shiftval        ? E1 : E2;
-        g = filter[offset];
-        out += ( D + E ) * g;
-    }
-
-    surf2DLayeredwrite( out, dst_data, off_x*4, off_y, dst_level, cudaBoundaryModeZero );
-}
-
-__global__ void vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level)
-{
-    const int    span   =  d_gauss.inc.span[dst_level];
-    const float* filter = &d_gauss.inc.filter[dst_level*GAUSS_ALIGN];
-    int block_x = blockIdx.x * blockDim.x;
-    int block_y = blockIdx.y * blockDim.y;
-    int idx     = threadIdx.x;
-    int idy;
-
+    int   idx;
     float g;
     float val;
-    float out = 0;
+    float out = 0.0f;
 
     for( int offset = span; offset>0; offset-- ) {
         g  = filter[offset];
 
-        idy = threadIdx.y - offset;
-        val = readTex( src_point_texture, block_x + idx, block_y + idy, dst_level );
+        idx = xpos - offset;
+        val = readTex( src_point_texture, idx, ypos, src_level );
         out += ( val * g );
 
-        idy = threadIdx.y + offset;
-        val = readTex( src_point_texture, block_x + idx, block_y + idy, dst_level );
+        idx = xpos + offset;
+        val = readTex( src_point_texture, idx, ypos, src_level );
         out += ( val * g );
     }
 
     g  = filter[0];
-    idy = threadIdx.y;
-    val = readTex( src_point_texture, block_x + idx, block_y + idy, dst_level );
+    val = readTex( src_point_texture, xpos, ypos, src_level );
     out += ( val * g );
 
-    idx = block_x+threadIdx.x;
-    idy = block_y+threadIdx.y;
-
-    surf2DLayeredwrite( out, dst_data, idx*4, idy, dst_level, cudaBoundaryModeZero );
+    surf2DLayeredwrite( out, dst_data, xpos*4, ypos, dst_level, cudaBoundaryModeZero );
 }
 
-__global__ void vert_abs0(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level)
+__global__ static void vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level)
 {
-    const int    span   =  d_gauss.abs_o0.span[dst_level];
-    const float* filter = &d_gauss.abs_o0.filter[dst_level*GAUSS_ALIGN];
-    int block_x = blockIdx.x * blockDim.x;
-    int block_y = blockIdx.y * blockDim.y;
-    int idx     = threadIdx.x;
-    int idy;
-
+    const int    span    =  d_gauss.inc.span[dst_level];
+    const float* filter  = &d_gauss.inc.filter[dst_level*GAUSS_ALIGN];
+    const int    block_x = blockIdx.x * blockDim.x;
+    const int    block_y = blockIdx.y * blockDim.y;
+    const int    xpos    = block_x + threadIdx.x;
+    const int    ypos    = block_y + threadIdx.y;
+
+    int   idy;
     float g;
     float val;
-    float out = 0;
+    float out = 0.0f;
 
     for( int offset = span; offset>0; offset-- ) {
         g  = filter[offset];
 
-        idy = threadIdx.y - offset;
-        val = readTex( src_point_texture, block_x + idx, block_y + idy, dst_level );
+        idy = ypos - offset;
+        val = readTex( src_point_texture, xpos, idy, dst_level );
         out += ( val * g );
 
-        idy = threadIdx.y + offset;
-        val = readTex( src_point_texture, block_x + idx, block_y + idy, dst_level );
+        idy = ypos + offset;
+        val = readTex( src_point_texture, xpos, idy, dst_level );
         out += ( val * g );
     }
 
     g  = filter[0];
-    idy = threadIdx.y;
-    val = readTex( src_point_texture, block_x + idx, block_y + idy, dst_level );
+    val = readTex( src_point_texture, xpos, ypos, dst_level );
     out += ( val * g );
 
-    idx = block_x+threadIdx.x;
-    idy = block_y+threadIdx.y;
-
-    surf2DLayeredwrite( out, dst_data, idx*4, idy, dst_level, cudaBoundaryModeZero );
+    surf2DLayeredwrite( out, dst_data, xpos*4, ypos, dst_level, cudaBoundaryModeZero );
 }
 
-__global__ void vert_all_abs0(cudaTextureObject_t src_point_texture,
-                              cudaSurfaceObject_t dst_data,
-                              int start_level,
-                              int max_level)
-{
-    const int block_x = blockIdx.x * blockDim.x;
-    const int block_y = blockIdx.y * blockDim.y;
-
-    for( int dst_level=start_level; dst_level<max_level; dst_level++ )
-    {
-        const int    span   =  d_gauss.abs_o0.span[dst_level];
-        const float* filter = &d_gauss.abs_o0.filter[dst_level*GAUSS_ALIGN];
-
-        int idx = threadIdx.x;
-        int idy;
-
-        float g;
-        float val;
-        float out = 0;
-
-        for( int offset = span; offset>0; offset-- ) {
-            g  = filter[offset];
-
-            idy = threadIdx.y - offset;
-            val = readTex( src_point_texture, block_x + idx, block_y + idy, dst_level );
-            out += ( val * g );
-
-            idy = threadIdx.y + offset;
-            val = readTex( src_point_texture, block_x + idx, block_y + idy, dst_level );
-            out += ( val * g );
-        }
-
-        g  = filter[0];
-        idy = threadIdx.y;
-        val = readTex( src_point_texture, block_x + idx, block_y + idy, dst_level );
-        out += ( val * g );
+} // namespace absoluteSource
 
-        idx = block_x+threadIdx.x;
-        idy = block_y+threadIdx.y;
+__host__
+void Pyramid::horiz_from_prev_level_basic( int octave, int level, cudaStream_t stream )
+{
+    Octave&      oct_obj = _octaves[octave];
+
+    const int width  = oct_obj.getWidth();
+    const int height = oct_obj.getHeight();
+
+    // similar speed: dim3 block( 32,  4 ); dim3 block( 32,  3 ); dim3 block( 32,  2 );
+    dim3 block( 32,  8 ); // most stable good perf on GTX 980 TI
+    dim3 grid;
+    grid.x  = grid_divide( width,  32 );
+    grid.y  = grid_divide( height, block.y );
+
+    absoluteSource::horiz
+        <<<grid,block,0,stream>>>
+        ( oct_obj.getDataTexPoint( ),
+          oct_obj.getIntermediateSurface( ),
+          level );
+    POP_SYNC_CHK;
+}
 
-        surf2DLayeredwrite( out, dst_data, idx*4, idy, dst_level, cudaBoundaryModeZero );
-    }
+__host__
+void Pyramid::vert_from_interm_basic( int octave, int level, cudaStream_t stream )
+{
+    Octave& oct_obj = _octaves[octave];
+
+    const int width  = oct_obj.getWidth();
+    const int height = oct_obj.getHeight();
+
+    dim3 block( 64, 2 );
+    dim3 grid;
+    grid.x = (unsigned int)grid_divide( width,  block.x );
+    grid.y = (unsigned int)grid_divide( height, block.y );
+
+    absoluteSource::vert
+        <<<grid,block,0,stream>>>
+        ( oct_obj.getIntermDataTexPoint( ),
+          oct_obj.getDataSurface( ),
+          level );
+    POP_SYNC_CHK;
 }
 
-} // namespace absoluteSource
-} // namespace gauss
 } // namespace popsift
 
diff --git a/src/popsift/s_pyramid_build_aa.h b/src/popsift/s_pyramid_build_aa.h
deleted file mode 100755
index 4d3423cf..00000000
--- a/src/popsift/s_pyramid_build_aa.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright 2017, Simula Research Laboratory
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-#include "common/plane_2d.h"
-
-namespace popsift {
-namespace gauss {
-namespace absoluteSource {
-
-__global__ void horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level);
-
-__global__ void vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level);
-
-__global__ void vert_abs0(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level);
-
-__global__ void vert_all_abs0(cudaTextureObject_t src_point_texture,
-                              cudaSurfaceObject_t dst_data,
-                              int start_level,
-                              int max_level);
-
-} // namespace absoluteSource
-} // namespace gauss
-} // namespace popsift
-
diff --git a/src/popsift/s_pyramid_build_ai.cu b/src/popsift/s_pyramid_build_ai.cu
index c16d636e..2349ad34 100755
--- a/src/popsift/s_pyramid_build_ai.cu
+++ b/src/popsift/s_pyramid_build_ai.cu
@@ -1,5 +1,6 @@
 /*
  * Copyright 2016-2017, Simula Research Laboratory
+ *           2018-2024, University of Oslo
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -7,131 +8,107 @@
  */
 #include "common/assist.h"
 #include "gauss_filter.h"
-#include "s_pyramid_build_aa.h"
+#include "sift_pyramid.h"
 #include "sift_constants.h"
 
 namespace popsift {
-namespace gauss {
 namespace absoluteSourceInterpolated {
 
-__global__ void horiz(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level)
+__global__ static void horiz(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level)
 {
     const int    src_level = dst_level - 1;
     const int    span      =  d_gauss.inc.i_span[dst_level];
     const float* filter    = &d_gauss.inc.i_filter[dst_level*GAUSS_ALIGN];
-
-    const int off_x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int    idx       = blockIdx.x * blockDim.x + threadIdx.x;
+    const int    idy       = blockIdx.x * blockDim.x + threadIdx.x;
 
     float out = 0.0f;
 
     for( int offset = 1; offset<=span; offset += 2 ) {
         const float u    = filter[offset];
         const float off  = offset + ( 1.0f - u );
-        const float val = readTex( src_linear_tex, off_x - off, blockIdx.y, src_level )
-                        + readTex( src_linear_tex, off_x + off, blockIdx.y, src_level );
+        const float val = readTex( src_linear_tex, idx - off, idy, src_level )
+                        + readTex( src_linear_tex, idx + off, idy, src_level );
 
         const float v = filter[offset+1];
         out += val * v;
     }
     const float& g  = filter[0];
-    const float v3 = readTex( src_linear_tex, off_x, blockIdx.y, src_level );
+    const float v3 = readTex( src_linear_tex, idx, idy, src_level );
     out += ( v3 * g );
 
-    surf2DLayeredwrite( out, dst_data, off_x*4, blockIdx.y, dst_level, cudaBoundaryModeZero );
+    surf2DLayeredwrite( out, dst_data, idx*4, idy, dst_level, cudaBoundaryModeZero );
 }
 
-__global__ void vert(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level)
+__global__ static void vert(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level)
 {
     const int    span   =  d_gauss.inc.i_span[dst_level];
     const float* filter = &d_gauss.inc.i_filter[dst_level*GAUSS_ALIGN];
-    int block_x   = blockIdx.x * blockDim.y;
-    int block_y   = blockIdx.y * blockDim.x;
-    const int idx = threadIdx.y;
-    const int idy = threadIdx.x;
+    const int    idx    = blockIdx.y * blockDim.y + threadIdx.y;
+    const int    idy    = blockIdx.x * blockDim.x + threadIdx.x;
 
     float out = 0;
 
     for( int offset = 1; offset<=span; offset += 2 ) {
         const float u    = filter[offset];
         const float off  = offset + ( 1.0f - u );
-        const float val = readTex( src_linear_tex, block_x + idx, block_y + idy - off, dst_level )
-                        + readTex( src_linear_tex, block_x + idx, block_y + idy + off, dst_level );
+        const float val = readTex( src_linear_tex, idx, idy - off, dst_level )
+                        + readTex( src_linear_tex, idx, idy + off, dst_level );
 
         const float v = filter[offset+1];
         out += val * v;
     }
 
-    float g   = filter[0];
-    float val = readTex( src_linear_tex, block_x + idx, block_y + idy, dst_level );
-    out += ( val * g );
+    const float g   = filter[0];
+    const float v3 = readTex( src_linear_tex, idx, idy, dst_level );
+    out += ( v3 * g );
 
-    surf2DLayeredwrite( out, dst_data, (block_x+idx)*4, block_y+idy, dst_level, cudaBoundaryModeZero );
+    surf2DLayeredwrite( out, dst_data, idx*4, idy, dst_level, cudaBoundaryModeZero );
 }
 
-__global__ void vert_abs0(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level)
-{
-    const int    span   =  d_gauss.abs_o0.i_span[dst_level];
-    const float* filter = &d_gauss.abs_o0.i_filter[dst_level*GAUSS_ALIGN];
-    const int block_x   = blockIdx.x * blockDim.y;
-    const int block_y   = blockIdx.y * blockDim.x;
-    const int idx       = threadIdx.y;
-    const int idy       = threadIdx.x;
-
-    float out = 0;
-
-    for( int offset = 1; offset<=span; offset += 2 ) {
-        const float u    = filter[offset];
-        const float off  = offset + ( 1.0f - u );
-        const float val = readTex( src_linear_tex, block_x + idx, block_y + idy - off, dst_level )
-                        + readTex( src_linear_tex, block_x + idx, block_y + idy + off, dst_level );
-
-        const float v = filter[offset+1];
-        out += val * v;
-    }
-
-    float g   = filter[0];
-    float val = readTex( src_linear_tex, block_x + idx, block_y + idy, dst_level );
-    out += ( val * g );
+} // namespace absoluteSourceInterpolated
 
-    surf2DLayeredwrite( out, dst_data, (block_x+idx)*4, block_y+idy, dst_level, cudaBoundaryModeZero );
+__host__
+void Pyramid::horiz_from_prev_level_pairs( int octave, int level, cudaStream_t stream )
+{
+    Octave&      oct_obj = _octaves[octave];
+
+    const int width  = oct_obj.getWidth();
+    const int height = oct_obj.getHeight();
+
+    dim3 block( 128, 1 );
+    dim3 grid;
+    grid.x  = grid_divide( width,  128 );
+    grid.y  = height;
+
+    absoluteSourceInterpolated::horiz
+        <<<grid,block,0,stream>>>
+        ( oct_obj.getDataTexLinear( ).tex,
+          oct_obj.getIntermediateSurface( ),
+          level );
+    POP_SYNC_CHK;
 }
 
-__global__ void vert_all_abs0(cudaTextureObject_t src_linear_tex,
-                              cudaSurfaceObject_t dst_data,
-                              int start_level,
-                              int max_level)
+__host__
+void Pyramid::vert_from_interm_pairs( int octave, int level, cudaStream_t stream )
 {
-    const int block_x = blockIdx.x * blockDim.y;
-    const int block_y = blockIdx.y * blockDim.x;
-    const int idx     = threadIdx.y;
-    const int idy     = threadIdx.x;
-
-    for( int dst_level=start_level; dst_level<max_level; dst_level++ )
-    {
-        const int    span   =  d_gauss.abs_o0.i_span[dst_level];
-        const float* filter = &d_gauss.abs_o0.i_filter[dst_level*GAUSS_ALIGN];
-
-        float out = 0;
-
-        for( int offset = 1; offset<=span; offset += 2 ) {
-            const float u    = filter[offset];
-            const float off  = offset + ( 1.0f - u );
-            const float val = readTex( src_linear_tex, block_x + idx, block_y + idy - off, dst_level )
-                            + readTex( src_linear_tex, block_x + idx, block_y + idy + off, dst_level );
-
-            const float v = filter[offset+1];
-            out += val * v;
-        }
-
-        const float& g   = filter[0];
-        float        val = readTex( src_linear_tex, block_x + idx, block_y + idy, dst_level );
-        out += ( val * g );
-
-        surf2DLayeredwrite( out, dst_data, (block_x+idx)*4, block_y+idy, dst_level, cudaBoundaryModeZero );
-    }
+    Octave& oct_obj = _octaves[octave];
+
+    const int width  = oct_obj.getWidth();
+    const int height = oct_obj.getHeight();
+
+    dim3 block( 4, 32 );
+    dim3 grid;
+    grid.y = (unsigned int)grid_divide( width,  block.y );
+    grid.x = (unsigned int)grid_divide( height, block.x );
+
+    absoluteSourceInterpolated::vert
+        <<<grid,block,0,stream>>>
+        ( oct_obj.getIntermDataTexLinear( ).tex,
+          oct_obj.getDataSurface( ),
+          level );
+    POP_SYNC_CHK;
 }
 
-} // namespace absoluteSourceInterpolated
-} // namespace gauss
 } // namespace popsift
 
diff --git a/src/popsift/s_pyramid_build_ai.h b/src/popsift/s_pyramid_build_ai.h
deleted file mode 100755
index d3431fe7..00000000
--- a/src/popsift/s_pyramid_build_ai.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright 2017, Simula Research Laboratory
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-#include "common/plane_2d.h"
-
-namespace popsift {
-namespace gauss {
-namespace absoluteSourceInterpolated {
-
-__global__ void horiz(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level);
-
-__global__ void vert(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level);
-
-__global__ void vert_abs0(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level);
-
-__global__ void vert_all_abs0(cudaTextureObject_t src_linear_tex,
-                              cudaSurfaceObject_t dst_data,
-                              int start_level,
-                              int max_level);
-
-} // namespace absoluteSourceInterpolated
-} // namespace gauss
-} // namespace popsift
-
diff --git a/src/popsift/s_pyramid_build_ra.cu b/src/popsift/s_pyramid_build_ra.cu
index 2b32e62c..7506b933 100755
--- a/src/popsift/s_pyramid_build_ra.cu
+++ b/src/popsift/s_pyramid_build_ra.cu
@@ -1,39 +1,34 @@
 /*
  * Copyright 2016-2017, Simula Research Laboratory
+ *           2018-2024, University of Oslo
  *
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 #include "common/assist.h"
+#include "common/plane_2d.h"
 #include "gauss_filter.h"
-#include "s_pyramid_build_ra.h"
+#include "sift_pyramid.h"
 #include "sift_constants.h"
 
 namespace popsift {
-namespace gauss {
 namespace normalizedSource {
 
-__global__
-void horiz( cudaTextureObject_t src_linear_tex,
-            cudaSurfaceObject_t dst_data,
-            int                 dst_w,
-            int                 dst_h,
-            int                 octave,
-            float               shift )
+__global__ static void horiz( cudaTextureObject_t src_linear_tex,
+                              cudaSurfaceObject_t dst_data,
+                              int                 dst_w,
+                              int                 dst_h,
+                              float               shift )
 {
-    // Create level-0 for any octave from the input image.
-    // Since we are computing the direct-downscaling gauss filter tables
-    // and the first entry in that table is identical to the "normal"
-    // table, we do not need a special case.
-
+    // Create octave-0 - level-0 from the input image.
     const int    write_x = blockIdx.x * blockDim.x + threadIdx.x;
     const int    write_y = blockIdx.y;
 
     if( write_x >= dst_w ) return;
 
-    const int    span    =  d_gauss.dd.span[octave];
-    const float* filter  = &d_gauss.dd.filter[octave*GAUSS_ALIGN];
+    const int    span    =  d_gauss.dd.span[0];
+    const float* filter  = &d_gauss.dd.filter[0];
     const float  read_x  = ( blockIdx.x * blockDim.x + threadIdx.x + shift ) / dst_w;
     const float  read_y  = ( blockIdx.y + shift ) / dst_h;
 
@@ -54,81 +49,33 @@ void horiz( cudaTextureObject_t src_linear_tex,
     surf2DLayeredwrite( out * 255.0f, dst_data, write_x*4, write_y, 0, cudaBoundaryModeZero );
 }
 
-__global__
-void horiz_level( cudaTextureObject_t src_linear_tex,
-                  cudaSurfaceObject_t dst_data,
-                  int                 dst_w,
-                  int                 dst_h,
-                  int                 /* octave */,
-                  int                 level,
-                  float               shift )
-{
-    const int    write_x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int    write_y = blockIdx.y;
-
-    if( write_x >= dst_w ) return;
-
-    const float  read_x  = ( blockIdx.x * blockDim.x + threadIdx.x + shift ) / dst_w;
-    const float  read_y  = ( blockIdx.y + shift ) / dst_h;
-
-    const int    span      =  d_gauss.abs_o0.span[level];
-    const float* filter    = &d_gauss.abs_o0.filter[level*GAUSS_ALIGN];
-
-    float out = 0.0f;
+} // namespace normalizedSource
 
-    for( int offset = span; offset>0; offset-- ) {
-        const float& g  = filter[offset];
-        const float  offrel = float(offset) / dst_w;
-        const float  v1 = tex2D<float>( src_linear_tex, read_x - offrel, read_y );
-        const float  v2 = tex2D<float>( src_linear_tex, read_x + offrel, read_y );
-        out += ( ( v1 + v2 ) * g );
-    }
-    const float& g  = filter[0];
-    const float  v3 = tex2D<float>( src_linear_tex, read_x, read_y );
-    out += ( v3 * g );
+__host__
+void Pyramid::horiz_from_input_image( const Config& conf, ImageBase* base, cudaStream_t stream )
+{
+    Octave&   oct_obj = _octaves[0];
 
-    surf2DLayeredwrite( out * 255.0f, dst_data, write_x*4, write_y, level, cudaBoundaryModeZero );
-}
+    const int width   = oct_obj.getWidth();
+    const int height  = oct_obj.getHeight();
 
-__global__
-void horiz_all( cudaTextureObject_t src_linear_tex,
-                cudaSurfaceObject_t dst_data,
-                int                 dst_w,
-                int                 dst_h,
-                float               shift,
-                const int           max_level ) // dst_level )
-{
-    const int    write_x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int    write_y = blockIdx.y;
+    dim3 block( 128, 1 );
+    dim3 grid;
+    grid.x  = grid_divide( width,  128 );
+    grid.y  = height;
 
-    if( write_x >= dst_w ) return;
+    float shift  = 0.5f * powf( 2.0f, conf.getUpscaleFactor() );
 
-    const float  read_x  = ( blockIdx.x * blockDim.x + threadIdx.x + shift ) / dst_w;
-    const float  read_y  = ( blockIdx.y + shift ) / dst_h;
+    normalizedSource::horiz
+        <<<grid,block,0,stream>>>
+        ( base->getInputTexture(),
+          oct_obj.getIntermediateSurface(),
+          width,
+          height,
+          shift );
 
-    for( int dst_level=0; dst_level < max_level; dst_level++ )
-    {
-        const int    span      =  d_gauss.abs_o0.span[dst_level];
-        const float* filter    = &d_gauss.abs_o0.filter[dst_level*GAUSS_ALIGN];
-
-        float out = 0.0f;
-
-        for( int offset = span; offset>0; offset-- ) {
-            const float& g  = filter[offset];
-            const float  offrel = float(offset) / dst_w;
-            const float  v1 = tex2D<float>( src_linear_tex, read_x - offrel, read_y );
-            const float  v2 = tex2D<float>( src_linear_tex, read_x + offrel, read_y );
-            out += ( ( v1 + v2 ) * g );
-        }
-        const float& g  = filter[0];
-        const float  v3 = tex2D<float>( src_linear_tex, read_x, read_y );
-        out += ( v3 * g );
-
-        surf2DLayeredwrite( out * 255.0f, dst_data, write_x*4, write_y, dst_level, cudaBoundaryModeZero );
-    }
+    POP_SYNC_CHK;
 }
 
-} // namespace normalizedSource
-} // namespace gauss
 } // namespace popsift
 
diff --git a/src/popsift/s_pyramid_build_ra.h b/src/popsift/s_pyramid_build_ra.h
deleted file mode 100755
index 0b628bc4..00000000
--- a/src/popsift/s_pyramid_build_ra.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright 2016-2017, Simula Research Laboratory
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-#include "common/plane_2d.h"
-
-namespace popsift {
-namespace gauss {
-namespace normalizedSource {
-
-__global__ void horiz(cudaTextureObject_t src_data,
-                      cudaSurfaceObject_t dst_data,
-                      int dst_w,
-                      int dst_h,
-                      int octave,
-                      float shift);
-
-__global__ void horiz_level(cudaTextureObject_t src_linear_tex,
-                            cudaSurfaceObject_t dst_data,
-                            int dst_w,
-                            int dst_h,
-                            int /* octave - must be 0 */,
-                            int level,
-                            float shift);
-
-__global__ void horiz_all(
-  cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_w, int dst_h, float shift, int max_level);
-
-} // namespace normalizedSource
-} // namespace gauss
-} // namespace popsift
-
diff --git a/src/popsift/s_pyramid_fixed.cu b/src/popsift/s_pyramid_fixed.cu
deleted file mode 100755
index 9e3d52aa..00000000
--- a/src/popsift/s_pyramid_fixed.cu
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- * Copyright 2017, Simula Research Laboratory
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
-#include "sift_pyramid.h"
-#include "sift_constants.h"
-#include "gauss_filter.h"
-#include "common/debug_macros.h"
-#include "common/assist.h"
-#include "common/clamp.h"
-
-#include <iostream>
-#include <cstdio>
-
-namespace popsift {
-
-namespace gauss {
-
-namespace fixedSpan {
-
-template<int SHIFT>
-__device__
-inline float octave_fixed_horiz( float fval, const float* filter )
-{
-    /* Example:
-     * SHIFT is 4
-     * input  fval of thread N is extracted from image index N-4
-     * output fval of thread N should be filtered sum from N-4 to N+4
-     */
-    float out = fval * filter[0];
-    #pragma unroll
-    for( int i=1; i<=SHIFT; i++ ) {
-        float val  = popsift::shuffle_up( fval, i ) + popsift::shuffle_down( fval, i );
-        out += val * filter[i];
-    }
-
-    fval = popsift::shuffle_down( out, SHIFT );
-
-    return fval;
-}
-
-namespace absoluteTexAddress {
-/* read from point-addressable texture of image from previous octave */
-
-template<int SHIFT>
-__device__
-inline float octave_fixed_vert( cudaTextureObject_t src_data, int idx, int idy, int level, const float* filter )
-{
-    /* Input thread N takes as input the (idx,idy) position of the pixel that it
-     * will eventually write (The 2*SHIFT rightmost threads will not write anything).
-     * Thread N computes and returns the vertical filter at position N-SHIFT.
-     */
-    float       val    = readTex( src_data, idx-SHIFT, idy, level );
-
-    float       fval   = val * filter[0];
-    #pragma unroll
-    for( int i=1; i<=SHIFT; i++ ) {
-        val   = readTex( src_data, idx-SHIFT, idy-i, level )
-              + readTex( src_data, idx-SHIFT, idy+i, level );
-        fval += val * filter[i];
-    }
-
-    return fval;
-}
-
-template<int SHIFT, int WIDTH, int HEIGHT, int LEVELS>
-__global__
-void octave_fixed( cudaTextureObject_t src_data,
-                   cudaSurfaceObject_t dst_data,
-                   const int           w,
-                   const int           h,
-                   cudaSurfaceObject_t dog_data )
-{
-    const int IDx   = threadIdx.x;
-    const int IDy   = threadIdx.y;
-    const int IDz   = threadIdx.z;
-    const int level = IDz + 1;
-
-    const float* filter = &d_gauss.abs_oN.filter[level*GAUSS_ALIGN];
-
-    const int idx = blockIdx.x * WIDTH      + IDx;
-    const int idy = blockIdx.y * blockDim.y + IDy;
-
-    float fval;
-    
-    fval = octave_fixed_vert<SHIFT>( src_data, idx, idy, 0, filter );
-
-    fval = octave_fixed_horiz<SHIFT>( fval, filter );
-
-    __shared__ float lx_val[HEIGHT][WIDTH][LEVELS];
-
-    if( IDx < WIDTH ) {
-        lx_val[IDy][IDx][IDz] = fval;
-    }
-    __syncthreads();
-
-    if( IDx < WIDTH ) {
-        const float l0_val = readTex( src_data, idx, idy, 0 );
-        const float dogval = ( IDz == 0 )
-                           ? fval - l0_val
-                           : fval - lx_val[IDy][IDx][IDz-1];
-
-        const bool i_write = ( idx < w && idy < h );
-
-        if( i_write ) {
-            surf2DLayeredwrite( fval, dst_data,
-                                idx*4, idy,
-                                IDz + 1,
-                                cudaBoundaryModeZero );
-
-            surf2DLayeredwrite( dogval, dog_data,
-                                idx*4, idy,
-                                IDz,
-                                cudaBoundaryModeZero );
-        }
-    }
-}
-
-} // namespace absoluteTexAddress
-
-namespace relativeTexAddress {
-/* read from ratio-addressable texture of input image */
-
-/* reading from the texture laid over the input image */
-template<int SHIFT>
-__device__
-inline float octave_fixed_vert( cudaTextureObject_t src_data, int idx, int idy, const float mul_w, const float mul_h, float tshift, const float* filter )
-{
-    /* Like above, but reading uses relative input image positions */
-    const float xpos = ( idx - SHIFT + tshift ) * mul_w;
-    const float ypos = ( idy + tshift ) * mul_h;
-    float       val  = tex2D<float>( src_data, xpos, ypos );
-
-    float       fval = val * filter[0];
-    #pragma unroll
-    for( int i=1; i<=SHIFT; i++ ) {
-        val  = tex2D<float>( src_data, xpos, ypos - i * mul_h );
-        val += tex2D<float>( src_data, xpos, ypos + i * mul_h );
-        fval += val * filter[i];
-    }
-
-    return fval;
-}
-
-template<int SHIFT, int WIDTH, int HEIGHT, int LEVELS>
-__global__
-void octave_fixed( cudaTextureObject_t src_data,
-                   cudaSurfaceObject_t dst_data,
-                   cudaSurfaceObject_t dog_data,
-                   const int           w,
-                   const int           h,
-                   const float         tshift )
-{
-    const int IDx   = threadIdx.x;
-    const int IDy   = threadIdx.y;
-    const int level = threadIdx.z;
-
-    const float* filter = &d_gauss.abs_o0.filter[level*GAUSS_ALIGN];
-
-    const int idx = blockIdx.x * WIDTH      + IDx;
-    const int idy = blockIdx.y * blockDim.y + IDy;
-
-    const float mul_w  = __frcp_rn( float(w) );
-    const float mul_h  = __frcp_rn( float(h) );
-    float fval;
-
-    fval = octave_fixed_vert<SHIFT>( src_data, idx, idy, mul_w, mul_h, tshift, filter );
-
-    fval = octave_fixed_horiz<SHIFT>( fval, filter );
-
-    fval *= 255.0f; // don't forget to upscale
-
-    __shared__ float lx_val[HEIGHT][WIDTH][LEVELS];
-
-    if( IDx < WIDTH ) {
-        lx_val[IDy][IDx][level] = fval;
-    }
-    __syncthreads();
-
-    const bool i_write = ( idx < w && idy < h );
-
-    if( IDx < WIDTH && i_write ) {
-            // destination.ptr(idy)[idx] = fval;
-            surf2DLayeredwrite( fval, dst_data,
-                                idx*4, idy,
-                                level,
-                                cudaBoundaryModeZero );
-
-        if( level > 0 ) {
-            float dogval = fval - lx_val[IDy][IDx][level-1];
-            // left side great
-            // right side buggy
-            surf2DLayeredwrite( dogval, dog_data,
-                                idx*4, idy,
-                                level-1,
-                                cudaBoundaryModeZero );
-        }
-    }
-}
-
-} // namespace relativeTexAddress
-
-} // namespace fixedSpan
-
-} // namespace gauss
-
-template<int SHIFT, bool OCT_0, int LEVELS>
-__host__
-inline void make_octave_sub( const Config& conf, ImageBase* base, Octave& oct_obj, cudaStream_t stream )
-{
-    const int width  = oct_obj.getWidth();
-    const int height = oct_obj.getHeight();
-
-    if( OCT_0 ) {
-        const int x_size = 32;
-        const int l_conf = LEVELS;
-        const int w_conf = x_size - 2 * SHIFT;
-        const int h_conf = 1; // 1024 / ( x_size * l_conf );
-        dim3 block( x_size, h_conf, l_conf );
-        dim3 grid;
-        grid.x = grid_divide( width, w_conf );
-        grid.y = grid_divide( height, block.y );
-
-        assert( block.x * block.y * block.z < 1024 );
-        
-        // cerr << "calling relative with " << block.x * block.y * block.z << " threads per block" << endl
-             // << "                 and  " << grid.x * grid.y * grid.z << " blocks" << endl;
-
-        const float tshift = 0.5f * powf( 2.0f, conf.getUpscaleFactor() );
-
-        gauss::fixedSpan::relativeTexAddress::octave_fixed
-            <SHIFT,w_conf,h_conf,l_conf>
-            <<<grid,block,0,stream>>>
-            ( base->getInputTexture( ),
-              oct_obj.getDataSurface( ),
-              oct_obj.getDogSurface( ),
-              oct_obj.getWidth(),
-              oct_obj.getHeight(),
-              tshift );
-    } else {
-        const int x_size = 32;
-        const int l_conf = LEVELS-1;
-        const int w_conf = x_size - 2 * SHIFT;
-        const int h_conf = 1024 / ( x_size * l_conf );
-        dim3 block( x_size, h_conf, l_conf );
-        dim3 grid;
-        grid.x = grid_divide( width, w_conf );
-        grid.y = grid_divide( height, block.y );
-
-        assert( block.x * block.y * block.z < 1024 );
-
-        // cerr << "calling absolute with " << block.x * block.y * block.z << " threads per block" << endl
-             // << "                 and  " << grid.x * grid.y * grid.z << " blocks" << endl;
-
-        gauss::fixedSpan::absoluteTexAddress::octave_fixed
-            <SHIFT,w_conf,h_conf,l_conf>
-            <<<grid,block,0,stream>>>
-            ( oct_obj.getDataTexPoint( ),
-              oct_obj.getDataSurface( ),
-              oct_obj.getWidth(),
-              oct_obj.getHeight(),
-              oct_obj.getDogSurface( ) );
-    }
-}
-
-void Pyramid::make_octave( const Config& conf, ImageBase* base, Octave& oct_obj, cudaStream_t stream, bool isOctaveZero )
-{
-    if( _levels == 6 ) {
-        if( conf.getGaussMode() == Config::Fixed9 ) {
-            if( isOctaveZero )
-                make_octave_sub<4,true,6> ( conf, base, oct_obj, stream );
-            else
-                make_octave_sub<4,false,6>( conf, base, oct_obj, stream );
-        } else if( conf.getGaussMode() == Config::Fixed15 ) {
-            if( isOctaveZero )
-                make_octave_sub<7,true,6> ( conf, base, oct_obj, stream );
-            else
-                make_octave_sub<7,false,6>( conf, base, oct_obj, stream );
-        } else {
-            POP_FATAL("Unsupported Gauss filter mode for making all octaves at once");
-        }
-    } else {
-        POP_FATAL("Unsupported number of levels for making all octaves at once");
-    }
-}
-
-} // namespace popsift
-
diff --git a/src/popsift/sift_conf.cu b/src/popsift/sift_conf.cu
index 251f58ff..1f6ff601 100644
--- a/src/popsift/sift_conf.cu
+++ b/src/popsift/sift_conf.cu
@@ -23,9 +23,8 @@ Config::Config( )
     , _edge_limit( 10.0f )
     , _threshold( 0.04 ) // ( 10.0f / 256.0f )
     , _gauss_mode( getGaussModeDefault() )
-    , _sift_mode( Config::PopSift )
+    , _sift_mode( Config::RefineInOctave )
     , _log_mode( Config::None )
-    , _scaling_mode( Config::ScaleDefault )
     , _desc_mode( Config::Loop )
     , _grid_filter_mode( Config::RandomScale )
     , verbose( false )
@@ -89,14 +88,6 @@ void Config::setGaussMode( const std::string& m )
         setGaussMode( Config::VLFeat_Relative );
     else if( m == "relative" )
         setGaussMode( Config::VLFeat_Relative );
-    else if( m == "vlfeat-direct" )
-        setGaussMode( Config::VLFeat_Relative_All );
-    else if( m == "opencv" )
-        setGaussMode( Config::OpenCV_Compute );
-    else if( m == "fixed9" )
-        setGaussMode( Config::Fixed9 );
-    else if( m == "fixed15" )
-        setGaussMode( Config::Fixed15 );
     else
         POP_FATAL( string("Bad Gauss mode.\n") + getGaussModeUsage() );
 }
@@ -113,10 +104,6 @@ const char* Config::getGaussModeUsage( )
         "Options are: "
         "vlfeat (default), "
         "vlfeat-hw-interpolated, "
-        "vlfeat-direct, "
-        "opencv, "
-        "fixed9, "
-        "fixed15, "
         "relative (synonym for vlfeat-hw-interpolated)";
 }
 
@@ -161,11 +148,6 @@ Config::LogMode Config::getLogMode( ) const
     return _log_mode;
 }
 
-void Config::setScalingMode( ScalingMode mode )
-{
-    _scaling_mode = mode;
-}
-
 /**
  * Normalization mode
  * Should the descriptor normalization use L2-like classic normalization
@@ -292,7 +274,6 @@ bool Config::equal( const Config& other ) const
         COMPARE( _edge_limit ) ||
         COMPARE( _threshold ) ||
         COMPARE( _upscale_factor ) ||
-        COMPARE( _scaling_mode ) ||
         COMPARE( _max_extrema ) ||
         COMPARE( _gauss_mode ) ||
         COMPARE( _sift_mode ) ||
diff --git a/src/popsift/sift_conf.h b/src/popsift/sift_conf.h
index 583a958c..7a320945 100644
--- a/src/popsift/sift_conf.h
+++ b/src/popsift/sift_conf.h
@@ -12,8 +12,6 @@
 #define MAX_OCTAVES   20
 #define MAX_LEVELS    10
 
-#undef USE_DOG_TEX_LINEAR
-
 #ifdef _MSC_VER
 #define DEPRECATED(func) __declspec(deprecated) func
 #elif defined(__GNUC__) || defined(__clang__)
@@ -38,11 +36,7 @@ struct Config
     enum GaussMode
     {
         VLFeat_Compute,
-        VLFeat_Relative,
-        VLFeat_Relative_All,
-        OpenCV_Compute,
-        Fixed9,
-        Fixed15
+        VLFeat_Relative
     };
 
     /**
@@ -50,14 +44,14 @@ struct Config
      */
     enum SiftMode
     {
-        /// Popsift implementation
-        PopSift,
-        /// OpenCV implementation
-        OpenCV,
-        /// VLFeat implementation
-        VLFeat,
-        /// Default implementation is PopSift
-        Default = PopSift
+        /// refining an initial extremum stays in the same level of an octave
+        RefineInLevel,
+        /// refining an initial extremum can change level but stays in the same octave
+        RefineInOctave,
+
+        // PopSift = RefineInOctave, ///< Popsift implementation
+        // VLFeat  = RefineInLevel,  ///< VLFeat implementation
+        Default = RefineInOctave
     };
 
     /**
@@ -69,16 +63,6 @@ struct Config
         All
     };
 
-    /**
-     * @brief The scaling mode.
-     */
-    enum ScalingMode
-    {
-        ScaleDirect,
-        /// Indirect - only working method
-        ScaleDefault
-    };
-
     /**
      * @brief Modes for descriptor extraction.
      */
@@ -160,7 +144,6 @@ struct Config
      * @see LogMode
      */
     void setLogMode( LogMode mode = All );
-    void setScalingMode( ScalingMode mode = ScaleDefault );
 
     /**
      * @brief Enable/desable verbose mode.
@@ -319,13 +302,6 @@ struct Config
      */
     GridFilterMode getFilterSorting() const { return _grid_filter_mode; }
 
-    /**
-     * @brief Get the scaling mode.
-     * @return the descriptor extraction mode.
-     * @see ScalingMode
-     */
-    inline ScalingMode getScalingMode() const { return _scaling_mode; }
-
     /**
      * @brief Get the descriptor extraction mode
      * @return the descriptor extraction mode
@@ -352,9 +328,6 @@ struct Config
     /// default LogMode::None
     LogMode  _log_mode;
 
-    /// default: ScalingMode::DownscaledOctaves
-    ScalingMode _scaling_mode;
-
     /// default: DescMode::Loop
     DescMode    _desc_mode;
 
diff --git a/src/popsift/sift_octave.cu b/src/popsift/sift_octave.cu
index 430bc298..63b3be05 100755
--- a/src/popsift/sift_octave.cu
+++ b/src/popsift/sift_octave.cu
@@ -390,19 +390,12 @@ void Octave::alloc_dog_tex()
 
         err = cudaCreateTextureObject(&_dog_3d_tex_point, &dog_res_desc, &dog_tex_desc, 0);
         POP_CUDA_FATAL_TEST(err, "Could not create DoG texture: ");
-
-        dog_tex_desc.filterMode = cudaFilterModeLinear; // linear interpolation
-        err = cudaCreateTextureObject(&_dog_3d_tex_linear.tex, &dog_res_desc, &dog_tex_desc, 0);
-        POP_CUDA_FATAL_TEST(err, "Could not create DoG texture: ");
 }
 
 void Octave::free_dog_tex()
 {
     cudaError_t err;
 
-    err = cudaDestroyTextureObject(_dog_3d_tex_linear.tex);
-    POP_CUDA_FATAL_TEST(err, "Could not destroy DoG texture: ");
-
     err = cudaDestroyTextureObject(_dog_3d_tex_point);
     POP_CUDA_FATAL_TEST(err, "Could not destroy DoG texture: ");
 
diff --git a/src/popsift/sift_octave.h b/src/popsift/sift_octave.h
index fc2ad13b..96ea7635 100755
--- a/src/popsift/sift_octave.h
+++ b/src/popsift/sift_octave.h
@@ -53,7 +53,6 @@ class Octave
     cudaExtent            _dog_3d_ext{};
     cudaSurfaceObject_t   _dog_3d_surf{};
     cudaTextureObject_t   _dog_3d_tex_point{};
-    LinearTexture         _dog_3d_tex_linear{};
 
     // one CUDA stream per level
     // consider whether some of them can be removed
@@ -127,9 +126,6 @@ class Octave
     inline cudaTextureObject_t& getDogTexturePoint( ) {
         return _dog_3d_tex_point;
     }
-    inline LinearTexture& getDogTextureLinear( ) {
-        return _dog_3d_tex_linear;
-    }
 
      /**
       * @brief Allocates all GPU memories for one octave.
diff --git a/src/popsift/sift_pyramid.cu b/src/popsift/sift_pyramid.cu
index c03b0d61..28289dae 100644
--- a/src/popsift/sift_pyramid.cu
+++ b/src/popsift/sift_pyramid.cu
@@ -79,7 +79,7 @@ __global__
 void Pyramid::download_and_save_array( const char* basename )
 {
     for( int o=0; o<_num_octaves; o++ )
-    _octaves[o].download_and_save_array( basename, o );
+        _octaves[o].download_and_save_array( basename, o );
 }
 
 /*
diff --git a/src/popsift/sift_pyramid.h b/src/popsift/sift_pyramid.h
index 837fc3b1..9bdda8de 100755
--- a/src/popsift/sift_pyramid.h
+++ b/src/popsift/sift_pyramid.h
@@ -79,9 +79,7 @@ class Pyramid
 public:
     enum GaussTableChoice {
         Interpolated_FromPrevious,
-        Interpolated_FromFirst,
         NotInterpolated_FromPrevious,
-        NotInterpolated_FromFirst
     };
 
 public:
@@ -114,32 +112,19 @@ class Pyramid
     inline Octave& getOctave(const int o){ return _octaves[o]; }
 
 private:
-    inline void horiz_from_input_image( const Config&    conf,
-                                        ImageBase*       base,
-					                    int              octave,
-					                    cudaStream_t     stream );
-    inline void horiz_level_from_input_image( const Config&    conf,
-                                              ImageBase*       base,
-					                          int              octave,
-                                              int              level,
-					                          cudaStream_t     stream );
-    inline void horiz_all_from_input_image( const Config&    conf,
-                                            ImageBase*       base,
-                                            int              octave,
-                                            int              startlevel,
-                                            int              maxlevel,
-                                            cudaStream_t     stream );
-    inline void downscale_from_prev_octave( int octave, cudaStream_t stream, Config::SiftMode mode );
+    void horiz_from_input_image( const Config&    conf,
+                                 ImageBase*       base,
+					             cudaStream_t     stream );
+    inline void downscale_from_prev_octave( int octave, cudaStream_t stream );
+
+    void        horiz_from_prev_level_basic( int octave, int level, cudaStream_t stream );
+    void        horiz_from_prev_level_pairs( int octave, int level, cudaStream_t stream );
     inline void horiz_from_prev_level( int octave, int level, cudaStream_t stream, GaussTableChoice useInterpolatedGauss );
+    void        vert_from_interm_basic( int octave, int level, cudaStream_t stream );
+    void        vert_from_interm_pairs( int octave, int level, cudaStream_t stream );
     inline void vert_from_interm( int octave, int level, cudaStream_t stream, GaussTableChoice useInterpolatedGauss );
-    inline void vert_all_from_interm( int octave,
-                                      int start_level,
-                                      int max_level,
-                                      cudaStream_t stream,
-                                      GaussTableChoice useInterpolatedGauss );
-    inline void dogs_from_blurred( int octave, int max_level, cudaStream_t stream );
 
-    void make_octave( const Config& conf, ImageBase* base, Octave& oct_obj, cudaStream_t stream, bool isOctaveZero );
+    inline void dogs_from_blurred( int octave, int max_level, cudaStream_t stream );
 
     void reset_extrema_mgmt( );
     void build_pyramid( const Config& conf, ImageBase* base );