From c3252c362c62a71e9fa93a7abfc707bf2af46757 Mon Sep 17 00:00:00 2001
From: Rene Winchenbach <rene.winchenbach@gmail.com>
Date: Tue, 12 Nov 2024 10:55:41 +0100
Subject: [PATCH] Moved utils from sub branches into a single branch so all
 problems are solvable from the primary branch. (#18)

---
 CMakeLists.txt                  |   9 +
 src/util/CollisionDetection.cpp | 456 +++++++++++++++++
 src/util/CollisionDetection.h   |  62 +++
 src/util/CollisionInfo.h        |  13 +
 src/util/pcgsolver.h            | 871 ++++++++++++++++++++++++++++++++
 5 files changed, 1411 insertions(+)
 create mode 100644 src/util/CollisionDetection.cpp
 create mode 100644 src/util/CollisionDetection.h
 create mode 100644 src/util/CollisionInfo.h
 create mode 100644 src/util/pcgsolver.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8b4ddcc..29ae811 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -62,6 +62,15 @@ file(GLOB_RECURSE PIPELINE_HEADERS
 )
 target_sources(Template PRIVATE ${PIPELINE_SOURCES} ${PIPELINE_HEADERS})
 
+file(GLOB_RECURSE UTIL_SOURCES
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/util/*.cpp
+)
+
+file(GLOB_RECURSE UTIL_HEADERS
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/util/*.h
+)
+target_sources(Template PRIVATE ${UTIL_SOURCES} ${UTIL_HEADERS})
+
 if(DEV_MODE)
 	# In dev mode, we load resources from the source tree, so that when we
 	# dynamically edit resources (like shaders), these are correctly
diff --git a/src/util/CollisionDetection.cpp b/src/util/CollisionDetection.cpp
new file mode 100644
index 0000000..fb027a2
--- /dev/null
+++ b/src/util/CollisionDetection.cpp
@@ -0,0 +1,456 @@
+#include <vector>
+#include <glm/glm.hpp>
+#include <glm/gtx/norm.hpp>
+#include <iostream>
+#include <util/CollisionInfo.h>
+#include <util/CollisionDetection.h>
+#include <glm/gtx/string_cast.hpp>
+
+// tool data structures/functions called by the collision detection method, you can ignore the details here
+namespace collisionTools
+{
+    using vec3 = glm::vec3;
+    using mat4 = glm::mat4;
+    using vec4 = glm::vec4;
+
+    vec3 getVectorConnnectingCenters(const mat4 &worldFromObj_A, const mat4 &worldFromObj_B)
+    {
+
+        const vec3 worldCenter_A = worldFromObj_A * vec4(0, 0, 0, 1);
+        const vec3 worldCenter_B = worldFromObj_B * vec4(0, 0, 0, 1);
+        return worldCenter_B - worldCenter_A;
+    }
+
+    // Get Corners
+    std::vector<vec3> getCorners(const mat4 &worldFromObj)
+    {
+        const vec3 worldCenter = worldFromObj * vec4(0, 0, 0, 1);
+        vec3 worldEdges[3];
+        for (size_t i = 0; i < 3; ++i)
+        {
+            vec3 objEdge = vec3(0.0);
+            objEdge[i] = 0.5f;
+            worldEdges[i] = worldFromObj * vec4(objEdge, 0);
+        }
+        std::vector<vec3> results;
+        results.push_back(worldCenter - worldEdges[0] - worldEdges[1] - worldEdges[2]);
+        results.push_back(worldCenter + worldEdges[0] - worldEdges[1] - worldEdges[2]);
+        results.push_back(worldCenter - worldEdges[0] + worldEdges[1] - worldEdges[2]);
+        results.push_back(worldCenter + worldEdges[0] + worldEdges[1] - worldEdges[2]); // this +,+,-
+        results.push_back(worldCenter - worldEdges[0] - worldEdges[1] + worldEdges[2]);
+        results.push_back(worldCenter + worldEdges[0] - worldEdges[1] + worldEdges[2]); // this +,-,+
+        results.push_back(worldCenter - worldEdges[0] + worldEdges[1] + worldEdges[2]); // this -,+,+
+        results.push_back(worldCenter + worldEdges[0] + worldEdges[1] + worldEdges[2]); // this +,+,+
+        return results;
+    }
+
+    // Get Rigid Box Size
+    vec3 getBoxSize(const mat4 &worldFromObj)
+    {
+        vec3 size = vec3(0.0);
+        vec3 edges[3];
+        for (size_t i = 0; i < 3; ++i)
+        {
+            vec3 objEdge = vec3(0.0);
+            objEdge[i] = 0.5f;
+            edges[i] = worldFromObj * vec4(objEdge, 0);
+            size[i] = 2.0f * glm::length(edges[i]);
+        }
+        return size;
+    }
+
+    // Get the Normal to the faces
+    std::vector<vec3> getAxisNormalToFaces(const mat4 &worldFromObj)
+    {
+        std::vector<vec3> edges;
+        vec4 objX = vec4(1, 0, 0, 0);
+        vec4 objY = vec4(0, 1, 0, 0);
+        vec4 objZ = vec4(0, 0, 1, 0);
+        vec3 edge1 = glm::normalize(worldFromObj * objX);
+        vec3 edge2 = glm::normalize(worldFromObj * objY);
+        vec3 edge3 = glm::normalize(worldFromObj * objZ);
+        std::vector<vec3> results;
+        edges.push_back(edge1);
+        edges.push_back(edge2);
+        edges.push_back(edge3);
+        return edges;
+    }
+
+    // Get the pair of edges
+    std::vector<vec3> getPairOfEdges(const mat4 &worldFromObj_A, const mat4 &worldFromObj_B)
+    {
+        std::vector<vec3> worldEdges1 = getAxisNormalToFaces(worldFromObj_A);
+        std::vector<vec3> worldEdges2 = getAxisNormalToFaces(worldFromObj_B);
+
+        std::vector<vec3> results;
+        for (int i = 0; i < worldEdges1.size(); i++)
+        {
+            for (int j = 0; j < worldEdges2.size(); j++)
+            {
+                vec3 vector = glm::cross(worldEdges1[i], worldEdges2[j]);
+                if (glm::length(vector) > 0)
+                    results.push_back(glm::normalize(vector));
+            }
+        }
+        return results;
+    }
+
+    // project a shape on an axis
+    Projection project(const mat4 &worldFromObj, vec3 axis)
+    {
+        // Get corners
+        std::vector<vec3> worldCorners = getCorners(worldFromObj);
+        float min = glm::dot(worldCorners[0], axis);
+        float max = min;
+        for (int i = 1; i < worldCorners.size(); i++)
+        {
+            float p = glm::dot(worldCorners[i], axis);
+            if (p < min)
+            {
+                min = p;
+            }
+            else if (p > max)
+            {
+                max = p;
+            }
+        }
+        Projection p;
+        p.max = max;
+        p.min = min;
+        return p;
+    }
+
+    bool overlap(Projection p1, Projection p2)
+    {
+        return !((p1.max > p2.max && p1.min > p2.max) || (p2.max > p1.max && p2.min > p1.max));
+    }
+
+    float getOverlap(Projection p1, Projection p2)
+    {
+        return glm::min(p1.max, p2.max) - glm::max(p1.min, p2.min);
+    }
+
+    static vec3 contactPoint(
+        const vec3 &pOne,
+        const vec3 &dOne,
+        float oneSize,
+        const vec3 &pTwo,
+        const vec3 &dTwo,
+        float twoSize,
+
+        // If this is true, and the contact point is outside
+        // the edge (in the case of an edge-face contact) then
+        // we use one's midpoint, otherwise we use two's.
+        bool useOne)
+    {
+        vec3 cOne, cTwo;
+
+        float smOne = glm::length2(dOne);
+        float smTwo = glm::length2(dTwo);
+        float dpOneTwo = glm::dot(dTwo, dOne);
+
+        vec3 toSt = pOne - pTwo;
+        float dpStaOne = glm::dot(dOne, toSt);
+        float dpStaTwo = glm::dot(dTwo, toSt);
+
+        float denom = smOne * smTwo - dpOneTwo * dpOneTwo;
+
+        // Zero denominator indicates parrallel lines
+        if (abs(denom) < 0.0001f)
+        {
+            return useOne ? pOne : pTwo;
+        }
+
+        float mua = (dpOneTwo * dpStaTwo - smTwo * dpStaOne) / denom;
+        float mub = (smOne * dpStaTwo - dpOneTwo * dpStaOne) / denom;
+
+        // If either of the edges has the nearest point out
+        // of bounds, then the edges aren't crossed, we have
+        // an edge-face contact. Our point is on the edge, which
+        // we know from the useOne parameter.
+        if (mua > oneSize ||
+            mua < -oneSize ||
+            mub > twoSize ||
+            mub < -twoSize)
+        {
+            return useOne ? pOne : pTwo;
+        }
+        else
+        {
+            cOne = pOne + dOne * mua;
+            cTwo = pTwo + dTwo * mub;
+
+            return cOne * 0.5f + cTwo * 0.5f;
+        }
+    }
+
+    vec3 handleVertexToface(const mat4 &worldFromObj, const vec3 &toCenter)
+    {
+        std::vector<vec3> corners = getCorners(worldFromObj);
+        float min = 1000;
+        vec3 vertex;
+        for (int i = 0; i < corners.size(); i++)
+        {
+            float value = glm::dot(corners[i], toCenter);
+            if (value < min)
+            {
+                vertex = corners[i];
+                min = value;
+            }
+        }
+
+        return vertex;
+    }
+
+    CollisionInfo checkCollisionSATHelper(const mat4 &worldFromObj_A, const mat4 &worldFromObj_B, vec3 size_A, vec3 size_B)
+    {
+        CollisionInfo info;
+        info.isColliding = false;
+        vec3 collisionPoint = vec3(0.0);
+        float smallOverlap = 10000.0f;
+        vec3 axis;
+        int index;
+        int fromWhere = -1;
+        bool bestSingleAxis = false;
+        vec3 toCenter = getVectorConnnectingCenters(worldFromObj_A, worldFromObj_B);
+        std::vector<vec3> axes1 = getAxisNormalToFaces(worldFromObj_A);
+        std::vector<vec3> axes2 = getAxisNormalToFaces(worldFromObj_B);
+        std::vector<vec3> axes3 = getPairOfEdges(worldFromObj_A, worldFromObj_B);
+        // loop over the axes1
+        for (int i = 0; i < axes1.size(); i++)
+        {
+            // project both shapes onto the axis
+            Projection p1 = project(worldFromObj_A, axes1[i]);
+            Projection p2 = project(worldFromObj_B, axes1[i]);
+            // do the projections overlap?
+            if (!overlap(p1, p2))
+            {
+                // then we can guarantee that the shapes do not overlap
+                return info;
+            }
+            else
+            {
+                // get the overlap
+                float o = getOverlap(p1, p2);
+                // check for minimum
+                if (o < smallOverlap)
+                {
+                    // then set this one as the smallest
+                    smallOverlap = o;
+                    axis = axes1[i];
+                    index = i;
+                    fromWhere = 0;
+                }
+            }
+        }
+        // loop over the axes2
+        for (int i = 0; i < axes2.size(); i++)
+        {
+            // project both shapes onto the axis
+            Projection p1 = project(worldFromObj_A, axes2[i]);
+            Projection p2 = project(worldFromObj_B, axes2[i]);
+            // do the projections overlap?
+            if (!overlap(p1, p2))
+            {
+                // then we can guarantee that the shapes do not overlap
+                return info;
+            }
+            else
+            {
+                // get the overlap
+                float o = getOverlap(p1, p2);
+                // check for minimum
+                if (o < smallOverlap)
+                {
+                    // then set this one as the smallest
+                    smallOverlap = o;
+                    axis = axes2[i];
+                    index = i;
+                    fromWhere = 1;
+                    bestSingleAxis = true;
+                }
+            }
+        }
+        int whichEdges = 0;
+        // loop over the axes3
+        for (int i = 0; i < axes3.size(); i++)
+        {
+            // project both shapes onto the axis
+            Projection p1 = project(worldFromObj_A, axes3[i]);
+            Projection p2 = project(worldFromObj_B, axes3[i]);
+            // do the projections overlap?
+            if (!overlap(p1, p2))
+            {
+                // then we can guarantee that the shapes do not overlap
+                return info;
+            }
+            else
+            {
+                // get the overlap
+                float o = getOverlap(p1, p2);
+                // check for minimum
+                if (o < smallOverlap)
+                {
+                    // then set this one as the smallest
+                    smallOverlap = o;
+                    axis = axes3[i];
+                    index = i;
+                    whichEdges = i;
+                    fromWhere = 2;
+                }
+            }
+        }
+        // if we get here then we know that every axis had overlap on it
+        // so we can guarantee an intersection
+        vec3 normal;
+        switch (fromWhere)
+        {
+        case 0:
+        {
+            normal = axis;
+            if (glm::dot(axis, toCenter) <= 0)
+            {
+                normal = -normal;
+            }
+            collisionPoint = handleVertexToface(worldFromObj_B, toCenter);
+        }
+        break;
+        case 1:
+        {
+            normal = axis;
+            if (glm::dot(axis, toCenter) <= 0)
+            {
+                normal = -normal;
+            }
+            collisionPoint = handleVertexToface(worldFromObj_A, toCenter * -1.0f);
+        }
+        break;
+        case 2:
+        {
+            vec3 axis = glm::normalize(glm::cross(axes1[whichEdges / 3], axes2[whichEdges % 3]));
+            normal = axis;
+            if (glm::dot(axis, toCenter) <= 0)
+            {
+                normal = -normal;
+            }
+            vec4 ptOnOneEdge = vec4(0.5, 0.5, 0.5, 1);
+            vec4 ptOnTwoEdge = vec4(0.5, 0.5, 0.5, 1);
+
+            for (int i = 0; i < 3; i++)
+            {
+                if (i == whichEdges / 3)
+                    ptOnOneEdge[i] = 0;
+                else if (glm::dot(axes1[i], normal) < 0)
+                    ptOnOneEdge[i] = -ptOnOneEdge[i];
+
+                if (i == whichEdges % 3)
+                    ptOnTwoEdge[i] = 0;
+                else if (glm::dot(axes2[i], normal) > 0)
+                    ptOnTwoEdge[i] = -ptOnTwoEdge[i];
+            }
+            ptOnOneEdge = worldFromObj_A * ptOnOneEdge;
+            ptOnTwoEdge = worldFromObj_B * ptOnTwoEdge;
+            collisionPoint = contactPoint(ptOnOneEdge,
+                                          axes1[whichEdges / 3],
+                                          size_A[whichEdges / 3],
+                                          ptOnTwoEdge,
+                                          axes2[whichEdges % 3],
+                                          size_B[whichEdges % 3],
+                                          bestSingleAxis);
+        }
+        break;
+        }
+
+        info.isColliding = true;
+        info.collisionPointWorld = collisionPoint;
+        info.depth = smallOverlap;
+        info.normalWorld = -normal;
+        return info;
+    }
+
+    CollisionInfo checkCollisionSAT(glm::mat4 &worldFromObj_A, glm::mat4 &worldFromObj_B)
+    {
+        using namespace collisionTools;
+        vec3 calSizeA = getBoxSize(worldFromObj_A);
+        vec3 calSizeB = getBoxSize(worldFromObj_B);
+
+        return checkCollisionSATHelper(worldFromObj_A, worldFromObj_B, calSizeA, calSizeB);
+    }
+
+    // example of using the checkCollisionSAT function
+    void testCheckCollision(int caseid)
+    {
+
+        if (caseid == 1)
+        {                                                                            // simple examples, suppose that boxes A and B are cubes and have no rotation
+            glm::mat4 AM = glm::translate(glm::mat4(1.0), glm::vec3(1.0, 1.0, 1.0)); // box A at (1.0,1.0,1.0)
+            glm::mat4 BM = glm::translate(glm::mat4(1.0), glm::vec3(2.0, 2.0, 2.0)); // box B at (2.0,2.0,2.0)
+
+            // check for collision
+            CollisionInfo simpletest = checkCollisionSAT(AM, BM); // should find out a collision here
+            if (!simpletest.isColliding)
+                std::cout << "No Collision" << std::endl;
+            else
+            {
+                std::cout << "collision detected at normal: " << simpletest.normalWorld.x << ", " << simpletest.normalWorld.y << ", " << simpletest.normalWorld.z << std::endl;
+                std::cout << "collision point : " << simpletest.collisionPointWorld.x << ", " << simpletest.collisionPointWorld.y << ", " << simpletest.collisionPointWorld.z << std::endl;
+            }
+            // case 1 result:
+            // collision detected at normal: -1.000000, -0.000000, -0.000000
+            // collision point : 1.500000, 1.500000, 1.500000
+            // Box A should be pushed to the left
+        }
+        else if (caseid == 2)
+        { // case 2, collide at a corner of Box B:
+            mat4 scale_A = glm::scale(mat4(1.0), vec3(9.0, 2.0, 3.0));
+            mat4 translate_A = glm::translate(mat4(1.0), vec3(0.2, 5.0, 1.0));
+            mat4 AM = translate_A * scale_A;
+
+            float s = 5.656855f;
+            mat4 scale_B = glm::scale(mat4(1.0), vec3(s, s, 2.0));
+            mat4 rotate_B = glm::rotate(mat4(1.0), glm::radians(45.0f), vec3(0, 0, 1));
+            mat4 BM = rotate_B * scale_B;
+
+            // check for collision
+            CollisionInfo simpletest = checkCollisionSAT(AM, BM); // should find out a collision here
+
+            if (!simpletest.isColliding)
+                std::cout << "No Collision" << std::endl;
+            else
+            {
+                std::cout << "collision detected at normal: " << simpletest.normalWorld.x << ", " << simpletest.normalWorld.y << ", " << simpletest.normalWorld.z << std::endl;
+                std::cout << "collision point : " << simpletest.collisionPointWorld.x << ", " << simpletest.collisionPointWorld.y << ", " << simpletest.collisionPointWorld.z << std::endl;
+            }
+            // case 2 result:
+            // collision detected at normal : 0.000000, 1.000000, 0.000000
+            // collision point : 0.000000, 4.000000, 1.000000
+        }
+        else if (caseid == 3)
+        { // case 3, collide at a corner of Box A:
+
+            mat4 scale_A = glm::scale(mat4(1.0), vec3(2.829f, 2.829f, 2.0f));
+            mat4 rotate_A = glm::rotate(mat4(1.0), glm::radians(45.0f), vec3(0, 0, 1));
+            mat4 translate_A = glm::translate(mat4(1.0), vec3(-2.0, 0.0, 1.0));
+            mat4 AM = translate_A * rotate_A * scale_A;
+
+            mat4 scale_B = glm::scale(mat4(1.0), vec3(9.0f, 2.0f, 4.0f));
+            mat4 rotate_B = glm::rotate(mat4(1.0), glm::radians(90.0f), vec3(0, 0, 1));
+            mat4 translate_B = glm::translate(mat4(1.0), vec3(1.0, 0.5, 0.0));
+            mat4 BM = translate_B * rotate_B * scale_B;
+
+            // check for collision
+            CollisionInfo simpletest = checkCollisionSAT(AM, BM); // should find out a collision here
+
+            if (!simpletest.isColliding)
+                std::cout << "No Collision" << std::endl;
+            else
+            {
+                std::cout << "collision detected at normal: " << simpletest.normalWorld.x << ", " << simpletest.normalWorld.y << ", " << simpletest.normalWorld.z << std::endl;
+                std::cout << "collision point : " << simpletest.collisionPointWorld.x << ", " << simpletest.collisionPointWorld.y << ", " << simpletest.collisionPointWorld.z << std::endl;
+            }
+            // case 3 result:
+            // collision detected at normal: -1.000000, 0.000000, -0.000000
+            // collision point : 0.000405, 0.000000, 0.000000
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/util/CollisionDetection.h b/src/util/CollisionDetection.h
new file mode 100644
index 0000000..4ce120f
--- /dev/null
+++ b/src/util/CollisionDetection.h
@@ -0,0 +1,62 @@
+#pragma once
+#include <vector>
+#include <glm/glm.hpp>
+#include <glm/gtx/norm.hpp>
+#include <iostream>
+#include <util/CollisionInfo.h>
+
+// tool data structures/functions called by the collision detection method, you can ignore the details here
+namespace collisionTools
+{
+
+    struct Projection
+    {
+        float min, max;
+    };
+
+    glm::vec3 getVectorConnnectingCenters(const glm::mat4 &worldFromObj_A, const glm::mat4 &worldFromObj_B);
+    // Get Corners
+    std::vector<glm::vec3> getCorners(const glm::mat4 &worldFromObj);
+
+    // Get Rigid Box Size
+    glm::vec3 getBoxSize(const glm::mat4 &worldFromObj);
+
+    // Get the Normal to the faces
+    std::vector<glm::vec3> getAxisNormalToFaces(const glm::mat4 &worldFromObj);
+
+    // Get the pair of edges
+    std::vector<glm::vec3> getPairOfEdges(const glm::mat4 &worldFromObj_A, const glm::mat4 &worldFromObj_B);
+
+    // project a shape on an axis
+    Projection project(const glm::mat4 &worldFromObj, glm::vec3 axis);
+
+    bool overlap(Projection p1, Projection p2);
+
+    float getOverlap(Projection p1, Projection p2);
+
+    static glm::vec3 contactPoint(
+        const glm::vec3 &pOne,
+        const glm::vec3 &dOne,
+        float oneSize,
+        const glm::vec3 &pTwo,
+        const glm::vec3 &dTwo,
+        float twoSize,
+
+        // If this is true, and the contact point is outside
+        // the edge (in the case of an edge-face contact) then
+        // we use one's midpoint, otherwise we use two's.
+        bool useOne);
+
+    glm::vec3 handleVertexToface(const glm::mat4 &worldFromObj, const glm::vec3 &toCenter);
+
+    CollisionInfo checkCollisionSATHelper(const glm::mat4 &worldFromObj_A, const glm::mat4 &worldFromObj_B, glm::vec3 size_A, glm::vec3 size_B);
+
+    /* params:
+    obj2World_A, the transfer matrix from object space of A to the world space
+    obj2World_B, the transfer matrix from object space of B to the world space
+    */
+    CollisionInfo checkCollisionSAT(glm::mat4 &worldFromObj_A, glm::mat4 &worldFromObj_B);
+
+    // example of using the checkCollisionSAT function
+    void testCheckCollision(int caseid);
+}
\ No newline at end of file
diff --git a/src/util/CollisionInfo.h b/src/util/CollisionInfo.h
new file mode 100644
index 0000000..4fd97cb
--- /dev/null
+++ b/src/util/CollisionInfo.h
@@ -0,0 +1,13 @@
+#pragma once
+#include <glm/glm.hpp>
+
+// the return structure, with these values, you should be able to calculate the impulse
+// the depth shouldn't be used in your impulse calculation, it is a redundant value
+// if the normalWorld == XMVectorZero(), no collision
+struct CollisionInfo
+{
+    bool isColliding;              // whether there is a collision point, true for yes
+    glm::vec3 collisionPointWorld; // the position of the collision point in world space
+    glm::vec3 normalWorld;         // the direction of the impulse to A, negative of the collision face of A
+    float depth;                   // the distance of the collision point to the surface, not necessary.
+};
\ No newline at end of file
diff --git a/src/util/pcgsolver.h b/src/util/pcgsolver.h
new file mode 100644
index 0000000..0d3454d
--- /dev/null
+++ b/src/util/pcgsolver.h
@@ -0,0 +1,871 @@
+//
+//  Preconditioned conjugate gradient solver
+//
+//  Created by Robert Bridson, Ryoichi Ando and Nils Thuerey
+//
+
+#ifndef RCMATRIX3_H
+#define RCMATRIX3_H
+
+#include <iterator>
+#include <cassert>
+#include <vector>
+#include <fstream>
+#include <cmath>
+#include <functional>
+
+// index type
+#define int_index long long
+
+// parallelization disabled
+
+#define parallel_for(size)                                                                   \
+   {                                                                                         \
+      int thread_number = 0;                                                                 \
+      int_index parallel_index = 0;                                                          \
+      for (int_index parallel_index = 0; parallel_index < (int_index)size; parallel_index++) \
+      {
+#define parallel_end                   \
+   }                                   \
+   thread_number = parallel_index = 0; \
+   }
+
+#define parallel_block
+#define do_parallel
+#define do_end
+#define block_end
+
+// note - "Int" instead of "N" here, the latter is size!
+template <class Int, class T>
+struct InstantBLAS
+{
+   static inline Int offset(Int N, Int incX) { return ((incX) > 0 ? 0 : ((N)-1) * (-(incX))); }
+   static T cblas_ddot(const Int N, const T *X, const Int incX, const T *Y, const Int incY)
+   {
+      double r = 0.0; // always use double precision internally here...
+      Int i;
+      Int ix = offset(N, incX);
+      Int iy = offset(N, incY);
+      for (i = 0; i < N; i++)
+      {
+         r += X[ix] * Y[iy];
+         ix += incX;
+         iy += incY;
+      }
+      return (T)r;
+   }
+   static void cblas_daxpy(const Int N, const T alpha, const T *X, const Int incX, T *Y, const Int incY)
+   {
+      Int i;
+      if (N <= 0)
+         return;
+      if (alpha == 0.0)
+         return;
+      if (incX == 1 && incY == 1)
+      {
+         const Int m = N % 4;
+         for (i = 0; i < m; i++)
+            Y[i] += alpha * X[i];
+         for (i = m; i + 3 < N; i += 4)
+         {
+            Y[i] += alpha * X[i];
+            Y[i + 1] += alpha * X[i + 1];
+            Y[i + 2] += alpha * X[i + 2];
+            Y[i + 3] += alpha * X[i + 3];
+         }
+      }
+      else
+      {
+         Int ix = offset(N, incX);
+         Int iy = offset(N, incY);
+         for (i = 0; i < N; i++)
+         {
+            Y[iy] += alpha * X[ix];
+            ix += incX;
+            iy += incY;
+         }
+      }
+   }
+   // dot products ==============================================================
+   static inline T dot(const std::vector<T> &x, const std::vector<T> &y)
+   {
+      return cblas_ddot((int)x.size(), &x[0], 1, &y[0], 1);
+   }
+
+   // inf-norm (maximum absolute value: index of max returned) ==================
+   static inline Int index_abs_max(const std::vector<T> &x)
+   {
+      int maxind = 0;
+      T maxvalue = 0;
+      for (Int i = 0; i < (Int)x.size(); ++i)
+      {
+         if (std::abs(x[i]) > maxvalue)
+         {
+            maxvalue = fabs(x[i]);
+            maxind = i;
+         }
+      }
+      return maxind;
+   }
+
+   // inf-norm (maximum absolute value) =========================================
+   // technically not part of BLAS, but useful
+   static inline T abs_max(const std::vector<T> &x)
+   {
+      return std::abs(x[index_abs_max(x)]);
+   }
+
+   // saxpy (y=alpha*x+y) =======================================================
+   static inline void add_scaled(T alpha, const std::vector<T> &x, std::vector<T> &y)
+   {
+      cblas_daxpy((Int)x.size(), alpha, &x[0], 1, &y[0], 1);
+   }
+};
+
+template <class T>
+void zero(std::vector<T> &v)
+{
+   for (int i = (int)v.size() - 1; i >= 0; --i)
+      v[i] = 0;
+}
+
+template <class T>
+void insert(std::vector<T> &a, unsigned int index, T e)
+{
+   a.push_back(a.back());
+   for (unsigned int i = (unsigned int)a.size() - 1; i > index; --i)
+      a[i] = a[i - 1];
+   a[index] = e;
+}
+
+template <class T>
+void erase(std::vector<T> &a, unsigned int index)
+{
+   for (unsigned int i = index; i < a.size() - 1; ++i)
+      a[i] = a[i + 1];
+   a.pop_back();
+}
+
+//============================================================================
+// Dynamic compressed sparse row matrix.
+
+template <class T>
+struct SparseMatrix
+{
+   int n;                               // dimension
+   std::vector<std::vector<int>> index; // for each row, a list of all column indices (sorted)
+   std::vector<std::vector<T>> value;   // values corresponding to index
+
+   explicit SparseMatrix(int n_ = 0, int expected_nonzeros_per_row = 7)
+       : n(n_), index(n_), value(n_)
+   {
+      for (int i = 0; i < n; ++i)
+      {
+         index[i].reserve(expected_nonzeros_per_row);
+         value[i].reserve(expected_nonzeros_per_row);
+      }
+   }
+
+   void clear(void)
+   {
+      n = 0;
+      index.clear();
+      value.clear();
+   }
+
+   void zero(void)
+   {
+      for (int i = 0; i < n; ++i)
+      {
+         index[i].resize(0);
+         value[i].resize(0);
+      }
+   }
+
+   void resize(int n_)
+   {
+      n = n_;
+      index.resize(n);
+      value.resize(n);
+   }
+
+   T operator()(int i, int j) const
+   {
+      for (int k = 0; k < (int)index[i].size(); ++k)
+      {
+         if (index[i][k] == j)
+            return value[i][k];
+         else if (index[i][k] > j)
+            return 0;
+      }
+      return 0;
+   }
+
+   void set_element(int i, int j, T new_value)
+   {
+      int k = 0;
+      for (; k < (int)index[i].size(); ++k)
+      {
+         if (index[i][k] == j)
+         {
+            value[i][k] = new_value;
+            return;
+         }
+         else if (index[i][k] > j)
+         {
+            insert(index[i], k, j);
+            insert(value[i], k, new_value);
+            return;
+         }
+      }
+      index[i].push_back(j);
+      value[i].push_back(new_value);
+   }
+
+   void add_to_element(int i, int j, T increment_value)
+   {
+      int k = 0;
+      for (; k < (int)index[i].size(); ++k)
+      {
+         if (index[i][k] == j)
+         {
+            value[i][k] += increment_value;
+            return;
+         }
+         else if (index[i][k] > j)
+         {
+            insert(index[i], k, j);
+            insert(value[i], k, increment_value);
+            return;
+         }
+      }
+      index[i].push_back(j);
+      value[i].push_back(increment_value);
+   }
+
+   // assumes indices is already sorted
+   void add_sparse_row(int i, const std::vector<int> &indices, const std::vector<T> &values)
+   {
+      int j = 0, k = 0;
+      while (j < indices.size() && k < (int)index[i].size())
+      {
+         if (index[i][k] < indices[j])
+         {
+            ++k;
+         }
+         else if (index[i][k] > indices[j])
+         {
+            insert(index[i], k, indices[j]);
+            insert(value[i], k, values[j]);
+            ++j;
+         }
+         else
+         {
+            value[i][k] += values[j];
+            ++j;
+            ++k;
+         }
+      }
+      for (; j < indices.size(); ++j)
+      {
+         index[i].push_back(indices[j]);
+         value[i].push_back(values[j]);
+      }
+   }
+
+   // assumes matrix has symmetric structure - so the indices in row i tell us which columns to delete i from
+   void symmetric_remove_row_and_column(int i)
+   {
+      for (int a = 0; a < index[i].size(); ++a)
+      {
+         int j = index[i][a]; //
+         for (int b = 0; b < index[j].size(); ++b)
+         {
+            if (index[j][b] == i)
+            {
+               erase(index[j], b);
+               erase(value[j], b);
+               break;
+            }
+         }
+      }
+      index[i].resize(0);
+      value[i].resize(0);
+   }
+
+   void write_matlab(std::ostream &output, const char *variable_name)
+   {
+      output << variable_name << "=sparse([";
+      for (int i = 0; i < n; ++i)
+      {
+         for (int j = 0; j < index[i].size(); ++j)
+         {
+            output << i + 1 << " ";
+         }
+      }
+      output << "],...\n  [";
+      for (int i = 0; i < n; ++i)
+      {
+         for (int j = 0; j < index[i].size(); ++j)
+         {
+            output << index[i][j] + 1 << " ";
+         }
+      }
+      output << "],...\n  [";
+      for (int i = 0; i < n; ++i)
+      {
+         for (int j = 0; j < value[i].size(); ++j)
+         {
+            output << value[i][j] << " ";
+         }
+      }
+      output << "], " << n << ", " << n << ");" << std::endl;
+   }
+};
+
+typedef SparseMatrix<float> SparseMatrixf;
+typedef SparseMatrix<double> SparseMatrixd;
+
+// perform result=matrix*x
+template <class T>
+void multiply(const SparseMatrix<T> &matrix, const std::vector<T> &x, std::vector<T> &result)
+{
+   assert(matrix.n == x.size());
+   result.resize(matrix.n);
+   // for(int i=0; i<matrix.n; ++i)
+   parallel_for(matrix.n)
+   {
+      unsigned i(parallel_index);
+      T value = 0;
+      for (int j = 0; j < (int)matrix.index[i].size(); ++j)
+      {
+         value += matrix.value[i][j] * x[matrix.index[i][j]];
+      }
+      result[i] = value;
+   }
+   parallel_end
+}
+
+// perform result=result-matrix*x
+template <class T>
+void multiply_and_subtract(const SparseMatrix<T> &matrix, const std::vector<T> &x, std::vector<T> &result)
+{
+   assert(matrix.n == x.size());
+   result.resize(matrix.n);
+   for (int i = 0; i < (int)matrix.n; ++i)
+   {
+      for (int j = 0; j < (int)matrix.index[i].size(); ++j)
+      {
+         result[i] -= matrix.value[i][j] * x[matrix.index[i][j]];
+      }
+   }
+}
+
+//============================================================================
+// Fixed version of SparseMatrix. This is not a good structure for dynamically
+// modifying the matrix, but can be significantly faster for matrix-vector
+// multiplies due to better data locality.
+
+template <class T>
+struct FixedSparseMatrix
+{
+   int n;                     // dimension
+   std::vector<T> value;      // nonzero values row by row
+   std::vector<int> colindex; // corresponding column indices
+   std::vector<int> rowstart; // where each row starts in value and colindex (and last entry is one past the end, the number of nonzeros)
+
+   explicit FixedSparseMatrix(int n_ = 0)
+       : n(n_), value(0), colindex(0), rowstart(n_ + 1)
+   {
+   }
+
+   void clear(void)
+   {
+      n = 0;
+      value.clear();
+      colindex.clear();
+      rowstart.clear();
+   }
+
+   void resize(int n_)
+   {
+      n = n_;
+      rowstart.resize(n + 1);
+   }
+
+   void construct_from_matrix(const SparseMatrix<T> &matrix)
+   {
+      resize(matrix.n);
+      rowstart[0] = 0;
+      for (int i = 0; i < n; ++i)
+      {
+         rowstart[i + 1] = rowstart[i] + matrix.index[i].size();
+      }
+      value.resize(rowstart[n]);
+      colindex.resize(rowstart[n]);
+      int j = 0;
+      for (int i = 0; i < n; ++i)
+      {
+         for (int k = 0; k < (int)matrix.index[i].size(); ++k)
+         {
+            value[j] = matrix.value[i][k];
+            colindex[j] = matrix.index[i][k];
+            ++j;
+         }
+      }
+   }
+
+   void write_matlab(std::ostream &output, const char *variable_name)
+   {
+      output << variable_name << "=sparse([";
+      for (int i = 0; i < n; ++i)
+      {
+         for (int j = rowstart[i]; j < rowstart[i + 1]; ++j)
+         {
+            output << i + 1 << " ";
+         }
+      }
+      output << "],...\n  [";
+      for (int i = 0; i < n; ++i)
+      {
+         for (int j = rowstart[i]; j < rowstart[i + 1]; ++j)
+         {
+            output << colindex[j] + 1 << " ";
+         }
+      }
+      output << "],...\n  [";
+      for (int i = 0; i < n; ++i)
+      {
+         for (int j = rowstart[i]; j < rowstart[i + 1]; ++j)
+         {
+            output << value[j] << " ";
+         }
+      }
+      output << "], " << n << ", " << n << ");" << std::endl;
+   }
+};
+
+// perform result=matrix*x
+template <class T>
+void multiply(const FixedSparseMatrix<T> &matrix, const std::vector<T> &x, std::vector<T> &result)
+{
+   assert(matrix.n == x.size());
+   result.resize(matrix.n);
+   // for(int i=0; i<matrix.n; ++i)
+   parallel_for(matrix.n)
+   {
+      unsigned i(parallel_index);
+      T value = 0;
+      for (int j = matrix.rowstart[i]; j < matrix.rowstart[i + 1]; ++j)
+      {
+         value += matrix.value[j] * x[matrix.colindex[j]];
+      }
+      result[i] = value;
+   }
+   parallel_end
+}
+
+// perform result=result-matrix*x
+template <class T>
+void multiply_and_subtract(const FixedSparseMatrix<T> &matrix, const std::vector<T> &x, std::vector<T> &result)
+{
+   assert(matrix.n == x.size());
+   result.resize(matrix.n);
+   for (int i = 0; i < matrix.n; ++i)
+   {
+      for (int j = matrix.rowstart[i]; j < matrix.rowstart[i + 1]; ++j)
+      {
+         result[i] -= matrix.value[j] * x[matrix.colindex[j]];
+      }
+   }
+}
+
+//============================================================================
+// A simple compressed sparse column data structure (with separate diagonal)
+// for lower triangular matrices
+
+template <class T>
+struct SparseColumnLowerFactor
+{
+   int n;
+   std::vector<T> invdiag;    // reciprocals of diagonal elements
+   std::vector<T> value;      // values below the diagonal, listed column by column
+   std::vector<int> rowindex; // a list of all row indices, for each column in turn
+   std::vector<int> colstart; // where each column begins in rowindex (plus an extra entry at the end, of #nonzeros)
+   std::vector<T> adiag;      // just used in factorization: minimum "safe" diagonal entry allowed
+
+   explicit SparseColumnLowerFactor(int n_ = 0)
+       : n(n_), invdiag(n_), colstart(n_ + 1), adiag(n_)
+   {
+   }
+
+   void clear(void)
+   {
+      n = 0;
+      invdiag.clear();
+      value.clear();
+      rowindex.clear();
+      colstart.clear();
+      adiag.clear();
+   }
+
+   void resize(int n_)
+   {
+      n = n_;
+      invdiag.resize(n);
+      colstart.resize(n + 1);
+      adiag.resize(n);
+   }
+
+   void write_matlab(std::ostream &output, const char *variable_name)
+   {
+      output << variable_name << "=sparse([";
+      for (int i = 0; i < n; ++i)
+      {
+         output << " " << i + 1;
+         for (int j = colstart[i]; j < colstart[i + 1]; ++j)
+         {
+            output << " " << rowindex[j] + 1;
+         }
+      }
+      output << "],...\n  [";
+      for (int i = 0; i < n; ++i)
+      {
+         output << " " << i + 1;
+         for (int j = colstart[i]; j < colstart[i + 1]; ++j)
+         {
+            output << " " << i + 1;
+         }
+      }
+      output << "],...\n  [";
+      for (int i = 0; i < n; ++i)
+      {
+         output << " " << (invdiag[i] != 0 ? 1 / invdiag[i] : 0);
+         for (int j = colstart[i]; j < colstart[i + 1]; ++j)
+         {
+            output << " " << value[j];
+         }
+      }
+      output << "], " << n << ", " << n << ");" << std::endl;
+   }
+};
+
+//============================================================================
+// Incomplete Cholesky factorization, level zero, with option for modified version.
+// Set modification_parameter between zero (regular incomplete Cholesky) and
+// one (fully modified version), with values close to one usually giving the best
+// results. The min_diagonal_ratio parameter is used to detect and correct
+// problems in factorization: if a pivot is this much less than the diagonal
+// entry from the original matrix, the original matrix entry is used instead.
+
+template <class T>
+void factor_modified_incomplete_cholesky0(const SparseMatrix<T> &matrix, SparseColumnLowerFactor<T> &factor,
+                                          T modification_parameter = 0.97, T min_diagonal_ratio = 0.25)
+{
+   // first copy lower triangle of matrix into factor (Note: assuming A is symmetric of course!)
+   factor.resize(matrix.n);
+   zero(factor.invdiag); // important: eliminate old values from previous solves!
+   factor.value.resize(0);
+   factor.rowindex.resize(0);
+   zero(factor.adiag);
+   for (int i = 0; i < matrix.n; ++i)
+   {
+      factor.colstart[i] = (int)factor.rowindex.size();
+      for (int j = 0; j < (int)matrix.index[i].size(); ++j)
+      {
+         if (matrix.index[i][j] > i)
+         {
+            factor.rowindex.push_back(matrix.index[i][j]);
+            factor.value.push_back(matrix.value[i][j]);
+         }
+         else if (matrix.index[i][j] == i)
+         {
+            factor.invdiag[i] = factor.adiag[i] = matrix.value[i][j];
+         }
+      }
+   }
+   factor.colstart[matrix.n] = (int)factor.rowindex.size();
+   // now do the incomplete factorization (figure out numerical values)
+
+   // MATLAB code:
+   // L=tril(A);
+   // for k=1:size(L,2)
+   //   L(k,k)=sqrt(L(k,k));
+   //   L(k+1:end,k)=L(k+1:end,k)/L(k,k);
+   //   for j=find(L(:,k))'
+   //     if j>k
+   //       fullupdate=L(:,k)*L(j,k);
+   //       incompleteupdate=fullupdate.*(A(:,j)~=0);
+   //       missing=sum(fullupdate-incompleteupdate);
+   //       L(j:end,j)=L(j:end,j)-incompleteupdate(j:end);
+   //       L(j,j)=L(j,j)-omega*missing;
+   //     end
+   //   end
+   // end
+
+   for (int k = 0; k < matrix.n; ++k)
+   {
+      if (factor.adiag[k] == 0)
+         continue; // null row/column
+      // figure out the final L(k,k) entry
+      if (factor.invdiag[k] < min_diagonal_ratio * factor.adiag[k])
+         factor.invdiag[k] = 1 / sqrt(factor.adiag[k]); // drop to Gauss-Seidel here if the pivot looks dangerously small
+      else
+         factor.invdiag[k] = 1 / sqrt(factor.invdiag[k]);
+      // finalize the k'th column L(:,k)
+      for (int p = factor.colstart[k]; p < factor.colstart[k + 1]; ++p)
+      {
+         factor.value[p] *= factor.invdiag[k];
+      }
+      // incompletely eliminate L(:,k) from future columns, modifying diagonals
+      for (int p = factor.colstart[k]; p < factor.colstart[k + 1]; ++p)
+      {
+         int j = factor.rowindex[p]; // work on column j
+         T multiplier = factor.value[p];
+         T missing = 0;
+         int a = factor.colstart[k];
+         // first look for contributions to missing from dropped entries above the diagonal in column j
+         int b = 0;
+         while (a < factor.colstart[k + 1] && factor.rowindex[a] < j)
+         {
+            // look for factor.rowindex[a] in matrix.index[j] starting at b
+            while (b < (int)matrix.index[j].size())
+            {
+               if (matrix.index[j][b] < factor.rowindex[a])
+                  ++b;
+               else if (matrix.index[j][b] == factor.rowindex[a])
+                  break;
+               else
+               {
+                  missing += factor.value[a];
+                  break;
+               }
+            }
+            ++a;
+         }
+         // adjust the diagonal j,j entry
+         if (a < factor.colstart[k + 1] && factor.rowindex[a] == j)
+         {
+            factor.invdiag[j] -= multiplier * factor.value[a];
+         }
+         ++a;
+         // and now eliminate from the nonzero entries below the diagonal in column j (or add to missing if we can't)
+         b = factor.colstart[j];
+         while (a < factor.colstart[k + 1] && b < factor.colstart[j + 1])
+         {
+            if (factor.rowindex[b] < factor.rowindex[a])
+               ++b;
+            else if (factor.rowindex[b] == factor.rowindex[a])
+            {
+               factor.value[b] -= multiplier * factor.value[a];
+               ++a;
+               ++b;
+            }
+            else
+            {
+               missing += factor.value[a];
+               ++a;
+            }
+         }
+         // and if there's anything left to do, add it to missing
+         while (a < factor.colstart[k + 1])
+         {
+            missing += factor.value[a];
+            ++a;
+         }
+         // and do the final diagonal adjustment from the missing entries
+         factor.invdiag[j] -= modification_parameter * multiplier * missing;
+      }
+   }
+}
+
+//============================================================================
+// Solution routines with lower triangular matrix.
+
+// solve L*result=rhs
+template <class T>
+void solve_lower(const SparseColumnLowerFactor<T> &factor, const std::vector<T> &rhs, std::vector<T> &result)
+{
+   assert(factor.n == rhs.size());
+   assert(factor.n == result.size());
+   result = rhs;
+   for (int i = 0; i < factor.n; ++i)
+   {
+      result[i] *= factor.invdiag[i];
+      for (int j = factor.colstart[i]; j < factor.colstart[i + 1]; ++j)
+      {
+         result[factor.rowindex[j]] -= factor.value[j] * result[i];
+      }
+   }
+}
+
+// solve L^T*result=rhs
+template <class T>
+void solve_lower_transpose_in_place(const SparseColumnLowerFactor<T> &factor, std::vector<T> &x)
+{
+   assert(factor.n == (int)x.size());
+   assert(factor.n > 0);
+   int i = factor.n;
+   do
+   {
+      --i;
+      for (int j = factor.colstart[i]; j < factor.colstart[i + 1]; ++j)
+      {
+         x[i] -= factor.value[j] * x[factor.rowindex[j]];
+      }
+      x[i] *= factor.invdiag[i];
+   } while (i != 0);
+}
+
+//============================================================================
+// Encapsulates the Conjugate Gradient algorithm with incomplete Cholesky
+// factorization preconditioner.
+
+template <class T>
+struct SparsePCGSolver
+{
+   SparsePCGSolver(void)
+   {
+      set_solver_parameters(1e-5, 100, 0.97, 0.25);
+   }
+
+   void set_solver_parameters(T tolerance_factor_, int max_iterations_, T modified_incomplete_cholesky_parameter_ = 0.97, T min_diagonal_ratio_ = 0.25)
+   {
+      tolerance_factor = tolerance_factor_;
+      if (tolerance_factor < 1e-30)
+         tolerance_factor = 1e-30;
+      max_iterations = max_iterations_;
+      modified_incomplete_cholesky_parameter = modified_incomplete_cholesky_parameter_;
+      min_diagonal_ratio = min_diagonal_ratio_;
+   }
+
+   bool solve(const SparseMatrix<T> &matrix, const std::vector<T> &rhs, std::vector<T> &result, T &relative_residual_out, int &iterations_out, int precondition = 2)
+   {
+      int n = matrix.n;
+      if ((int)m.size() != n)
+      {
+         m.resize(n);
+         s.resize(n);
+         z.resize(n);
+         r.resize(n);
+      }
+      zero(result);
+      r = rhs;
+      double residual_out = InstantBLAS<int, T>::abs_max(r);
+      if (residual_out == 0)
+      {
+         iterations_out = 0;
+         return true;
+      }
+      // double tol=tolerance_factor*residual_out; // relative residual
+      double tol = tolerance_factor;
+      double residual_0 = residual_out;
+
+      form_preconditioner(matrix, precondition);
+      apply_preconditioner(r, z, precondition);
+      double rho = InstantBLAS<int, T>::dot(z, r);
+      if (rho == 0 || rho != rho)
+      {
+         iterations_out = 0;
+         return false;
+      }
+
+      s = z;
+      fixed_matrix.construct_from_matrix(matrix);
+      int iteration;
+      for (iteration = 0; iteration < max_iterations; ++iteration)
+      {
+         multiply(fixed_matrix, s, z);
+         double alpha = rho / InstantBLAS<int, T>::dot(s, z);
+         InstantBLAS<int, T>::add_scaled(alpha, s, result);
+         InstantBLAS<int, T>::add_scaled(-alpha, z, r);
+         residual_out = InstantBLAS<int, T>::abs_max(r);
+         relative_residual_out = residual_out / residual_0;
+         if (residual_out <= tol)
+         {
+            iterations_out = iteration + 1;
+            return true;
+         }
+         apply_preconditioner(r, z, precondition);
+         double rho_new = InstantBLAS<int, T>::dot(z, r);
+         double beta = rho_new / rho;
+         InstantBLAS<int, T>::add_scaled(beta, s, z);
+         s.swap(z); // s=beta*s+z
+         rho = rho_new;
+      }
+      iterations_out = iteration;
+      relative_residual_out = residual_out / residual_0;
+      return false;
+   }
+
+protected:
+   // internal structures
+   SparseColumnLowerFactor<T> ic_factor; // modified incomplete cholesky factor
+   std::vector<T> m, z, s, r;            // temporary vectors for PCG
+   FixedSparseMatrix<T> fixed_matrix;    // used within loop
+
+   // parameters
+   T tolerance_factor;
+   int max_iterations;
+   T modified_incomplete_cholesky_parameter;
+   T min_diagonal_ratio;
+
+   void form_preconditioner(const SparseMatrix<T> &matrix, int precondition = 2)
+   {
+      if (precondition == 2)
+      {
+         // incomplete cholesky
+         factor_modified_incomplete_cholesky0(matrix, ic_factor, modified_incomplete_cholesky_parameter, min_diagonal_ratio);
+      }
+      else if (precondition == 1)
+      {
+         // diagonal
+         ic_factor.resize(matrix.n);
+         zero(ic_factor.invdiag);
+         for (int i = 0; i < matrix.n; ++i)
+         {
+            for (int j = 0; j < (int)matrix.index[i].size(); ++j)
+            {
+               if (matrix.index[i][j] == i)
+               {
+                  ic_factor.invdiag[i] = 1. / matrix.value[i][j];
+               }
+            }
+         }
+      }
+   }
+
+   void apply_preconditioner(const std::vector<T> &x, std::vector<T> &result, int precondition = 2)
+   {
+      if (precondition == 2)
+      {
+         // incomplete cholesky
+         solve_lower(ic_factor, x, result);
+         solve_lower_transpose_in_place(ic_factor, result);
+      }
+      else if (precondition == 1)
+      {
+         // diagonal
+         for (int_index i = 0; i < (int_index)result.size(); ++i)
+         {
+            result[i] = x[i] * ic_factor.invdiag[i];
+         }
+      }
+      else
+      {
+         // off
+         result = x;
+      }
+   }
+};
+
+#undef parallel_for
+#undef parallel_end
+#undef int_index
+
+#undef parallel_block
+#undef do_parallel
+#undef do_end
+#undef block_end
+
+#endif
\ No newline at end of file