From fb3e38b161a7e8d9ea4c6baf83f3d586a40ce773 Mon Sep 17 00:00:00 2001
From: Frank Dellaert <dellaert@gmail.com>
Date: Sat, 16 Jan 2010 04:57:58 +0000
Subject: [PATCH] 25% performance increase by improving weighted_eliminate

---
 cpp/Matrix.cpp                  | 56 +++++++++++++++++++--------------
 cpp/Vector.cpp                  | 46 ++++++++++++++++++++-------
 cpp/Vector.h                    |  7 +++++
 cpp/timeGaussianFactor.cpp      |  3 +-
 cpp/timeGaussianFactorGraph.cpp |  5 +--
 5 files changed, 79 insertions(+), 38 deletions(-)

diff --git a/cpp/Matrix.cpp b/cpp/Matrix.cpp
index edbec380a..494a466f0 100644
--- a/cpp/Matrix.cpp
+++ b/cpp/Matrix.cpp
@@ -296,43 +296,65 @@ void householder_update(Matrix &A, int j, double beta, const Vector& vjm) {
   }
 }
 
+/* ************************************************************************* */
+// update A, b
+// A' \define A_{S}-ar and b'\define b-ad
+__attribute__ ((noinline))	// uncomment to prevent inlining when profiling
+void updateAb(Matrix& A, Vector& b, int j, const Vector& a, const Vector& r, double d) {
+	const size_t m = A.size1(), n = A.size2();
+	for (int i = 0; i < m; ++i) { // update all rows
+		double ai = a(i);
+		b(i) -= ai * d;
+		double *Aptr = A.data().begin() + i * n + j + 1;
+		const double *rptr = r.data().begin() + j + 1;
+		for (int j2 = j + 1; j2 < n; ++j2) { // limit to only columns in separator
+			//A(i,j2) -= ai*r(j2);
+			*Aptr -= ai * *rptr;
+			Aptr++;
+			rptr++;
+		}
+	}
+}
+
 /* ************************************************************************* */
 list<boost::tuple<Vector, double, double> >
 weighted_eliminate(Matrix& A, Vector& b, const Vector& sigmas) {
-	bool verbose = false;
 	size_t m = A.size1(), n = A.size2(); // get size(A)
 	size_t maxRank = min(m,n);
 
 	// create list
 	list<boost::tuple<Vector, double, double> > results;
 
+	Vector pseudo(m); // allocate storage for pseudo-inverse
+
+	// TODO: calculate weights once
+	// Vector weights =
+
 	// We loop over all columns, because the columns that can be eliminated
 	// are not necessarily contiguous. For each one, estimate the corresponding
 	// scalar variable x as d-rS, with S the separator (remaining columns).
 	// Then update A and b by substituting x with d-rS, zero-ing out x's column.
 	for (int j=0; j<n; ++j) {
 		// extract the first column of A
+		// TODO: this is an allocate and a copy
+		// can we somehow make a "reference" vector, boost magic
 		Vector a(column(A, j));
-		if (verbose) print(a,"a = ");
 
 		// Calculate weighted pseudo-inverse and corresponding precision
-		Vector pseudo; double precision;
-		boost::tie(pseudo, precision) = weightedPseudoinverse(a, sigmas);
-		if (verbose) cout << "precision = " << precision << endl;
+		// TODO: pass in weights which are calculated once
+		// TODO return variance
+		double precision = weightedPseudoinverse(a, sigmas, pseudo);
 
 		// if precision is zero, no information on this column
 		if (precision < 1e-8) continue;
-		if (verbose) print(pseudo, "pseudo = ");
 
 		// create solution and copy into r
 		Vector r(basis(n, j));
 		for (int j2=j+1; j2<n; ++j2)
 			r(j2) = inner_prod(pseudo, column(A, j2));
-		if (verbose) print(r, "r = ");
 
 		// create the rhs
 		double d = inner_prod(pseudo, b);
-		if (verbose) cout << "d = " << d << endl;
 
 		// construct solution (r, d, sigma)
 		results.push_back(boost::make_tuple(r, d, 1./sqrt(precision)));
@@ -340,21 +362,9 @@ weighted_eliminate(Matrix& A, Vector& b, const Vector& sigmas) {
 		// exit after rank exhausted
 		if (results.size()>=maxRank) break;
 
-		// update A, b
-		for (int i=0;i<m;++i) { // update all rows
-			double ai = a(i);
-			b(i) -= ai*d;
-			double *Aptr = A.data().begin()+i*n+j+1;
-			double *rptr = r.data().begin()+j+1;
-			for (int j2=j+1;j2<n;++j2){ // limit to only columns in separator
-				//A(i,j2) -= ai*r(j2);
-				*Aptr -= ai* *rptr;
-				Aptr++;
-				rptr++;
-			}
-		}
-		if (verbose) print(sub(A,0,m,j+1,n), "updated A");
-		if (verbose) print(b, "updated b ");
+		// update A, b, expensive, suing outer product
+		// A' \define A_{S}-a*r and b'\define b-d*a
+		updateAb(A, b, j, a, r, d);
 	}
 
 	return results;
diff --git a/cpp/Vector.cpp b/cpp/Vector.cpp
index 09518a6cd..940b5a052 100644
--- a/cpp/Vector.cpp
+++ b/cpp/Vector.cpp
@@ -222,32 +222,54 @@ namespace gtsam {
   }
   
   /* ************************************************************************* */
-  pair<Vector, double> weightedPseudoinverse(const Vector& a, const Vector& sigmas) {
+  // Fast version *no error checking* !
+  // Pass in initialized vector of size m or will crash !
+  double weightedPseudoinverse(const Vector& a, const Vector& sigmas, Vector& pseudo) {
 	  size_t m = sigmas.size();
-	  if (a.size() != m)
-		  throw invalid_argument("V and precisions have different sizes!");
 
 	  // If there is a valid (a!=0) constraint (sigma==0) return the first one
 	  for(int i=0; i<m; ++i)
-		  if (sigmas[i] < 1e-9 && fabs(a[i]) > 1e-9)
-			  return make_pair(delta(m,i,1/a[i]), std::numeric_limits<double>::infinity());
+		  if (sigmas[i] < 1e-9 && fabs(a[i]) > 1e-9) {
+		  	pseudo=delta(m,i,1/a[i]);
+			  return std::numeric_limits<double>::infinity();
+		  }
 
 	  // Form psuedo-inverse inv(a'inv(Sigma)a)a'inv(Sigma)
 	  // For diagonal Sigma, inv(Sigma) = diag(precisions)
 	  double precision = 0;
-	  Vector precisions(m);
+	  // pseudo will be used to store both precisions (an intermediate) and result
+	  Vector& precisions = pseudo;
 	  for(int i = 0; i<m; i++) {
-		  if (fabs(a[i]) < 1e-9) // also catches remaining sigma==0 rows
+	  	double ai=a[i];
+		  if (fabs(ai) < 1e-9) // also catches remaining sigma==0 rows
 			  precisions[i] = 0.;
 		  else {
-			  precisions[i] = 1./(sigmas[i]*sigmas[i]);
-			  precision += a[i]*a[i]*precisions[i];
+		  	double si=sigmas[i],pi = 1./(si*si);
+			  precision += ai*ai*pi;
+			  precisions[i] = pi;
 		  }
 	  }
 	  // precision = a'inv(Sigma)a
-	  if (precision<1e-9) return make_pair(zero(m), precision);
-	  Vector pseudo(emul(precisions,a));
-	  return make_pair(pseudo/precision, precision);
+	  if (precision<1e-9)
+	  	for(int i = 0; i<m; i++) pseudo[i]=0;
+	  else {
+	  	// emul(precisions,a)/precision
+	  	double f = 1.0/precision;
+	  	for(int i = 0; i<m; i++)
+	  		pseudo[i]=f*precisions[i]*a[i];
+	  }
+	  return precision;
+  }
+
+  /* ************************************************************************* */
+  // Slow version with error checking
+  pair<Vector, double> weightedPseudoinverse(const Vector& a, const Vector& sigmas) {
+	  size_t m = sigmas.size();
+	  if (a.size() != m)
+		  throw invalid_argument("V and precisions have different sizes!");
+	  Vector pseudo(m);
+  	double precision = weightedPseudoinverse(a, sigmas, pseudo);
+	  return make_pair(pseudo, precision);
   }
 
   /* ************************************************************************* */
diff --git a/cpp/Vector.h b/cpp/Vector.h
index c073393cf..01076a565 100644
--- a/cpp/Vector.h
+++ b/cpp/Vector.h
@@ -192,6 +192,13 @@ std::pair<double,Vector> house(Vector &x);
  */
 std::pair<Vector, double> weightedPseudoinverse(const Vector& v, const Vector& sigmas);
 
+/*
+ * Fast version *no error checking* !
+ * Pass in initialized vector pseudo of size(sigma) or will crash !
+ * @return the precision, pseudoinverse in third argument
+ */
+double weightedPseudoinverse(const Vector& a, const Vector& sigmas, Vector& pseudo);
+
 /**
  * concatenate Vectors
  */
diff --git a/cpp/timeGaussianFactor.cpp b/cpp/timeGaussianFactor.cpp
index ac16901b1..9a55ecd12 100644
--- a/cpp/timeGaussianFactor.cpp
+++ b/cpp/timeGaussianFactor.cpp
@@ -13,6 +13,7 @@ using namespace std;
 #include <boost/tuple/tuple.hpp>
 #include "Matrix.h"
 #include "GaussianFactor.h"
+#include "GaussianConditional.h"
 
 using namespace gtsam;
 
@@ -66,7 +67,7 @@ int main()
   b2(6) = 2;
   b2(7) = -1;
   
-  GaussianFactor combined("x2", Ax2,  "l1", Al1, "x1", Ax1, b2);
+  GaussianFactor combined("x2", Ax2,  "l1", Al1, "x1", Ax1, b2,1);
   long timeLog = clock();
   int n = 1000000;
   GaussianConditional::shared_ptr conditional;
diff --git a/cpp/timeGaussianFactorGraph.cpp b/cpp/timeGaussianFactorGraph.cpp
index bcc9fadef..a783492ee 100644
--- a/cpp/timeGaussianFactorGraph.cpp
+++ b/cpp/timeGaussianFactorGraph.cpp
@@ -51,10 +51,11 @@ TEST(timeGaussianFactorGraph, linearTime)
 /* ************************************************************************* */
 TEST(timeGaussianFactorGraph, planar)
 {
-	// 1740: 8.12, 8.12, 8.12, 8.16, 8.14
+	// 1741: 8.12, 8.12, 8.12, 8.16, 8.14
+	// 1742: 5.99, 5.97, 5.97, 6.02, 5.97
 	int N = 30;
 	double time = timePlanarSmoother(N); cout << time << endl;
-	DOUBLES_EQUAL(8.12,time,0.1);
+	DOUBLES_EQUAL(5.97,time,0.1);
 }
 
 /* ************************************************************************* */