Improved performance of updateAb in NoiseModel with GSL/ATLAS. Various other small optimizations were made.

release/4.3a0
Alex Cunningham 2010-01-21 18:51:59 +00:00
parent 2183e09c67
commit ac746ccead
8 changed files with 200 additions and 91 deletions

View File

@ -171,16 +171,19 @@ Vector column_(const Matrix& A, size_t j) {
}
/* ************************************************************************* */
Vector row(const Matrix& A, size_t i) {
Vector row_(const Matrix& A, size_t i) {
if (i>=A.size1())
throw invalid_argument("Row index out of bounds!");
const double * Aptr = A.data().begin() + A.size2() * i;
return Vector_(A.size2(), Aptr);
// TODO: improve this
size_t n = A.size2();
Vector a(n);
for (size_t j=0; j<n; ++j)
a(j) = A(i,j);
return a;
// size_t n = A.size2();
// Vector a(n);
// for (size_t j=0; j<n; ++j)
// a(j) = A(i,j);
// return a;
}
/* ************************************************************************* */
@ -322,8 +325,30 @@ void householder_update(Matrix &A, int j, double beta, const Vector& vjm) {
// __attribute__ ((noinline)) // uncomment to prevent inlining when profiling
static void updateAb(Matrix& A, Vector& b, int j, const Vector& a,
const Vector& r, double d) {
// TODO: do some optimization here if possible
const size_t m = A.size1(), n = A.size2();
#ifdef GSL
// update A
// A(0:m,j+1:end) = A(0:m,j+1:end) - a(0:m)*r(j+1:end)'
// get a view for A
// gsl_matrix_view Ag = gsl_matrix_view_array(A.data().begin(), m, n);
// gsl_matrix_view Ag_view = gsl_matrix_submatrix (&(Ag.matrix), 0, j+1, m, n-j-1);
// // get a view for r
// gsl_vector_const_view rg = gsl_vector_const_view_array(r.data().begin()+j+1, n-j-1);
// // get a view for a
// gsl_vector_const_view ag = gsl_vector_const_view_array(a.data().begin(), m);
//
// // rank one update
// gsl_blas_dger (-1.0, &(ag.vector), &(rg.vector), &(Ag_view.matrix));
//
// // update b
// double * bptr = b.data().begin();
// const double * aptr = a.data().begin();
// for (size_t i = 0; i < m; i++) {
// *(bptr+i) -= d* *(aptr+i);
// }
// original
for (size_t i = 0; i < m; i++) { // update all rows
double ai = a(i);
b(i) -= ai * d;
@ -333,6 +358,18 @@ static void updateAb(Matrix& A, Vector& b, int j, const Vector& a,
for (size_t j2 = j + 1; j2 < n; j2++, Aij++, rptr++)
*Aij -= ai * (*rptr);
}
#else
for (size_t i = 0; i < m; i++) { // update all rows
double ai = a(i);
b(i) -= ai * d;
double *Aij = A.data().begin() + i * n + j + 1;
const double *rptr = r.data().begin() + j + 1;
// A(i,j+1:end) -= ai*r(j+1:end)
for (size_t j2 = j + 1; j2 < n; j2++, Aij++, rptr++)
*Aij -= ai * (*rptr);
}
#endif
}
/* ************************************************************************* */
@ -396,7 +433,54 @@ void householder_(Matrix &A, size_t k)
{
const size_t m = A.size1(), n = A.size2(), kprime = min(k,min(m,n));
// Original version
#ifdef GSL
// loop over the kprime first columns
for(size_t j=0; j < kprime; j++){
// below, the indices r,c always refer to original A
// copy column from matrix to xjm, i.e. x(j:m) = A(j:m,j)
Vector xjm(m-j);
for(size_t r = j ; r < m; r++)
xjm(r-j) = A(r,j);
// calculate the Householder vector
// COPIED IN: boost::tie(beta,vjm) = house(xjm);
const double x0 = xjm(0);
const double x02 = x0*x0;
const double sigma = inner_prod(trans(xjm),xjm) - x02;
double beta = 0.0; Vector vjm(xjm); vjm(0) = 1.0;
if( sigma == 0.0 )
beta = 0.0;
else {
double mu = sqrt(x02 + sigma);
if( x0 <= 0.0 )
vjm(0) = x0 - mu;
else
vjm(0) = -sigma / (x0 + mu);
const double v02 = vjm(0)*vjm(0);
beta = 2.0 * v02 / (sigma + v02);
vjm = vjm / vjm(0);
}
// do outer product update A = (I-beta vv')*A = A - v*(beta*A'*v)' = A - v*w'
//householder_update(A, j, beta, vjm);
// inlined use GSL version
gsl_vector_const_view v = gsl_vector_const_view_array(vjm.data().begin(), m-j);
gsl_matrix_view Ag = gsl_matrix_view_array(A.data().begin(), m, n);
gsl_matrix_view Ag_view = gsl_matrix_submatrix (&(Ag.matrix), j, 0, m-j, n);
gsl_linalg_householder_hm (beta, &(v.vector), &(Ag_view.matrix));
// the Householder vector is copied in the zeroed out part
for( size_t r = j+1 ; r < m ; r++ )
A(r,j) = vjm(r-j);
} // column j
#else
// loop over the kprime first columns
for(size_t j=0; j < kprime; j++){
// below, the indices r,c always refer to original A
@ -418,6 +502,7 @@ void householder_(Matrix &A, size_t k)
A(r,j) = vjm(r-j);
} // column j
#endif
}
/* ************************************************************************* */
@ -530,27 +615,6 @@ Matrix collect(const std::vector<const Matrix *>& matrices, size_t m, size_t n)
dimA2 += M->size2();
}
// original version
// Matrix A(dimA1, dimA2);
// size_t hindex = 0;
// BOOST_FOREACH(const Matrix* M, matrices) {
// for(size_t d1 = 0; d1 < M->size1(); d1++)
// for(size_t d2 = 0; d2 < M->size2(); d2++)
// A(d1, d2+hindex) = (*M)(d1, d2);
// hindex += M->size2();
// }
// matrix_range version
// Result: slower
// Matrix A(dimA1, dimA2);
// size_t hindex = 0;
// BOOST_FOREACH(const Matrix* M, matrices) {
// ublas::matrix_range<Matrix> mr(A, ublas::range(0, dimA1),
// ublas::range(hindex, hindex+M->size2()));
// noalias(mr) = *M;
// hindex += M->size2();
// }
// memcpy version
Matrix A(dimA1, dimA2);
double * Aptr = A.data().begin();

View File

@ -157,7 +157,7 @@ Vector column_(const Matrix& A, size_t j);
* @param index of the row
* @return the row in vector form
*/
Vector row(const Matrix& A, size_t j);
Vector row_(const Matrix& A, size_t j);
// left multiply with scalar
//inline Matrix operator*(double s, const Matrix& A) { return A*s;}

View File

@ -11,6 +11,11 @@
#include <typeinfo>
#include <stdexcept>
#ifdef GSL
#include <gsl/gsl_blas.h> // needed for gsl blas
#include <gsl/gsl_linalg.h>
#endif
#include <boost/numeric/ublas/lu.hpp>
#include <boost/numeric/ublas/io.hpp>
#include <boost/foreach.hpp>
@ -37,6 +42,23 @@ namespace gtsam {
// __attribute__ ((noinline)) // uncomment to prevent inlining when profiling
static void updateAb(Matrix& Ab, int j, const Vector& a, const Vector& rd) {
size_t m = Ab.size1(), n = Ab.size2()-1;
#ifdef GSL
// Ab(0:m,j+1:n) = Ab(0:m,j+1:n) - a(0:m)*rd(j+1:end)'
// get a view for Ab
gsl_matrix_view Abg = gsl_matrix_view_array(Ab.data().begin(), m, n+1);
gsl_matrix_view Abg_view = gsl_matrix_submatrix (&(Abg.matrix), 0, j+1, m, n-j);
// get a view for a
gsl_vector_const_view ag = gsl_vector_const_view_array(a.data().begin(), m);
// get a view for r
gsl_vector_const_view rdg = gsl_vector_const_view_array(rd.data().begin()+j+1, n-j);
// rank one update
gsl_blas_dger (-1.0, &(ag.vector), &(rdg.vector), &(Abg_view.matrix));
#else
for (int i = 0; i < m; i++) { // update all rows
double ai = a(i);
double *Aij = Ab.data().begin() + i * (n+1) + j + 1;
@ -45,6 +67,7 @@ namespace gtsam {
for (int j2 = j + 1; j2 < n+1; j2++, Aij++, rptr++)
*Aij -= ai * (*rptr);
}
#endif
}
/* ************************************************************************* */

View File

@ -18,6 +18,11 @@
#include <Windows.h>
#endif
#ifdef GSL
#include <gsl/gsl_blas.h> // needed for gsl blas
#include <gsl/gsl_linalg.h>
#endif
#include <boost/random/normal_distribution.hpp>
#include <boost/random/variate_generator.hpp>
@ -232,9 +237,11 @@ namespace gtsam {
}
/* ************************************************************************* */
pair<double, Vector > house(Vector &x)
{
const double x02 = x(0)*x(0);
pair<double, Vector > house(const Vector &x) {
const double x0 = x(0);
const double x02 = x0*x0;
// old code - GSL verison was actually a bit slower
const double sigma = inner_prod(trans(x),x) - x02;
double beta = 0.0;
@ -244,16 +251,15 @@ namespace gtsam {
beta = 0.0;
else {
double mu = sqrt(x02 + sigma);
if( x(0) <= 0.0 )
v(0) = x(0) - mu;
if( x0 <= 0.0 )
v(0) = x0 - mu;
else
v(0) = -sigma / (x(0) + mu);
v(0) = -sigma / (x0 + mu);
const double v02 = v(0)*v(0);
beta = 2.0 * v02 / (sigma + v02);
v = v / v(0);
}
return make_pair(beta, v);
}

View File

@ -211,7 +211,7 @@ Vector operator/(double s, const Vector& v);
* from x, such that the corresponding Householder reflection zeroes out
* all but x.(j), j is base 0. Golub & Van Loan p 210.
*/
std::pair<double,Vector> house(Vector &x);
std::pair<double,Vector> house(const Vector &x);
/**
* Weighted Householder solution vector,

View File

@ -174,15 +174,15 @@ TEST( matrix, row )
0., -1., 0., 1., 0., 0., 0.3,
1., 0., 0., 0., -1., 0., 0.2,
0., 1., 0., 0., 0., -1., -0.1);
Vector a1 = row(A, 0);
Vector a1 = row_(A, 0);
Vector exp1 = Vector_(7, -1., 0., 1., 0., 0., 0., -0.2);
CHECK(assert_equal(a1, exp1));
Vector a2 = row(A, 2);
Vector a2 = row_(A, 2);
Vector exp2 = Vector_(7, 1., 0., 0., 0., -1., 0., 0.2);
CHECK(assert_equal(a2, exp2));
Vector a3 = row(A, 3);
Vector a3 = row_(A, 3);
Vector exp3 = Vector_(7, 0., 1., 0., 0., 0., -1., -0.1);
CHECK(assert_equal(a3, exp3));
}

View File

@ -55,7 +55,7 @@ TEST(timeGaussianFactorGraph, linearTime)
int T = 100000;
double time1 = timeKalmanSmoother( T); cout << "timeKalmanSmoother( T): " << time1;
double time2 = timeKalmanSmoother(2*T); cout << " (2*T): " << time2 << endl;
DOUBLES_EQUAL(2*time1,time2,0.1);
DOUBLES_EQUAL(2*time1,time2,0.2);
}
/* ************************************************************************* */
@ -74,6 +74,7 @@ TEST(timeGaussianFactorGraph, planar)
// (N = 100) : 16.36
// Improved (int->size_t)
// (N = 100) : 15.39
// Using GSL/BLAS for updateAb : 12.87
// Switch to 100*100 grid = 10K poses
// 1879: 15.6498 15.3851 15.5279

View File

@ -125,9 +125,12 @@ double timeVScaleRow(size_t m, size_t n, size_t reps) {
/**
* Results:
* Alex's Machine
* Alex's Machine (reps = 200000)
* - ublas matrix_column : 4.63 sec
* - naive implementation : 4.70 sec
*
* reps = 2000000
* -
*/
double timeColumn(size_t reps) {
// create a matrix
@ -144,8 +147,8 @@ double timeColumn(size_t reps) {
boost::timer t;
for (size_t i=0; i<reps; ++i)
for (size_t j = 0; j<n; ++j)
result = ublas::matrix_column<Matrix>(M, j);
//result = column(M, j);
//result = ublas::matrix_column<Matrix>(M, j);
result = column_(M, j);
elapsed = t.elapsed();
}
return elapsed;
@ -154,10 +157,22 @@ double timeColumn(size_t reps) {
/*
* Results
* Alex's machine
*
* Runs at reps = 500000
* Baseline (no householder, just matrix copy) : 0.05 sec
* Initial : 8.20 sec
* All in one function : 7.89 sec
* Replace householder update with GSL, ATLAS : 0.92 sec
*
* Runs at reps = 2000000
* Baseline (GSL/ATLAS householder update) : 3.61 sec
*
* Runs at reps = 5000000
* Baseline : 8.76 sec
* GSL/Atlas version of updateAb : 9.03 sec // Why does this have an effect?
* Inlining house() : 6.33 sec
* Inlining householder_update [GSL] : 6.15 sec
*
*/
double timeHouseholder(size_t reps) {
// create a matrix
@ -182,35 +197,35 @@ double timeHouseholder(size_t reps) {
int main(int argc, char ** argv) {
// // Time collect()
// cout << "Starting Matrix::collect() Timing" << endl;
// //size_t p = 100000; size_t m = 10; size_t n = 12; size_t reps = 50;
// size_t p = 10; size_t m = 10; size_t n = 12; size_t reps = 10000000;
// double collect_time1 = timeCollect(p, m, n, false, reps);
// double collect_time2 = timeCollect(p, m, n, true, reps);
// cout << "Average Elapsed time for collect (no pass) [" << p << " (" << m << ", " << n << ") matrices] : " << collect_time1 << endl;
// cout << "Average Elapsed time for collect (pass) [" << p << " (" << m << ", " << n << ") matrices] : " << collect_time2 << endl;
//
// // Time vector_scale_column
// cout << "Starting Matrix::vector_scale(column) Timing" << endl;
// size_t m1 = 400; size_t n1 = 480; size_t reps1 = 1000;
// double vsColumn_time = timeVScaleColumn(m1, n1, reps1);
// cout << "Elapsed time for vector_scale(column) [(" << m1 << ", " << n1 << ") matrix] : " << vsColumn_time << endl;
//
// // Time vector_scale_row
// cout << "Starting Matrix::vector_scale(row) Timing" << endl;
// double vsRow_time = timeVScaleRow(m1, n1, reps1);
// cout << "Elapsed time for vector_scale(row) [(" << m1 << ", " << n1 << ") matrix] : " << vsRow_time << endl;
//
// // Time column_() NOTE: using the ublas version
// cout << "Starting column_() Timing" << endl;
// size_t reps2 = 200000;
// double column_time = timeColumn(reps2);
// cout << "Time: " << column_time << " sec" << endl;
// Time collect()
cout << "Starting Matrix::collect() Timing" << endl;
//size_t p = 100000; size_t m = 10; size_t n = 12; size_t reps = 50;
size_t p = 10; size_t m = 10; size_t n = 12; size_t reps = 10000000;
double collect_time1 = timeCollect(p, m, n, false, reps);
double collect_time2 = timeCollect(p, m, n, true, reps);
cout << "Average Elapsed time for collect (no pass) [" << p << " (" << m << ", " << n << ") matrices] : " << collect_time1 << endl;
cout << "Average Elapsed time for collect (pass) [" << p << " (" << m << ", " << n << ") matrices] : " << collect_time2 << endl;
// Time vector_scale_column
cout << "Starting Matrix::vector_scale(column) Timing" << endl;
size_t m1 = 400; size_t n1 = 480; size_t reps1 = 1000;
double vsColumn_time = timeVScaleColumn(m1, n1, reps1);
cout << "Elapsed time for vector_scale(column) [(" << m1 << ", " << n1 << ") matrix] : " << vsColumn_time << endl;
// Time vector_scale_row
cout << "Starting Matrix::vector_scale(row) Timing" << endl;
double vsRow_time = timeVScaleRow(m1, n1, reps1);
cout << "Elapsed time for vector_scale(row) [(" << m1 << ", " << n1 << ") matrix] : " << vsRow_time << endl;
// Time column_() NOTE: using the ublas version
cout << "Starting column_() Timing" << endl;
size_t reps2 = 2000000;
double column_time = timeColumn(reps2);
cout << "Time: " << column_time << " sec" << endl;
// Time householder_ function
cout << "Starting householder_() Timing" << endl;
size_t reps_house = 500000;
size_t reps_house = 5000000;
double house_time = timeHouseholder(reps_house);
cout << "Elapsed time for householder_() : " << house_time << " sec" << endl;