xerus
a general purpose tensor library
blasLapackWrapper.cpp
Go to the documentation of this file.
1 // Xerus - A General Purpose Tensor Library
2 // Copyright (C) 2014-2017 Benjamin Huber and Sebastian Wolf.
3 //
4 // Xerus is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU Affero General Public License as published
6 // by the Free Software Foundation, either version 3 of the License,
7 // or (at your option) any later version.
8 //
9 // Xerus is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU Affero General Public License for more details.
13 //
14 // You should have received a copy of the GNU Affero General Public License
15 // along with Xerus. If not, see <http://www.gnu.org/licenses/>.
16 //
17 // For further information on Xerus visit https://libXerus.org
18 // or contact us at contact@libXerus.org.
19 
26 #include <complex.h>
27 // fix for non standard-conform complex implementation
28 #undef I
29 
30 // workaround for broken Lapack
31 #define lapack_complex_float float _Complex
32 #define lapack_complex_double double _Complex
33 extern "C"
34 {
35  #include <cblas.h>
36 }
37 
38 #ifdef __has_include
39  #if __has_include(<lapacke.h>)
40  #include <lapacke.h>
41  #elif __has_include(<lapacke/lapacke.h>)
42  #include <lapacke/lapacke.h>
43  #else
44  #pragma error no lapacke found
45  #endif
46 #else
47  #include <lapacke.h>
48 #endif
49 
50 
51 #include <memory>
52 #include <xerus/misc/standard.h>
54 #include <xerus/misc/check.h>
55 
57 #include <xerus/basic.h>
58 
61 #include <xerus/misc/math.h>
62 #include <xerus/misc/internal.h>
63 
64 
65 
66 namespace xerus {
67  namespace blasWrapper {
68  // the following routines use a work array as temporary storage
69  // to avoid the overhead of repeated reallocation for many small calls, every thread pre-allocates a small scratch-space
70 // const size_t DEFAULT_WORKSPACE_SIZE = 1024*1024;
71 // thread_local value_t defaultWorkspace[DEFAULT_WORKSPACE_SIZE]; // NOTE recheck compatibility with eigen (dolfin) when reinserting this!
72 
73 
74  //----------------------------------------------- LEVEL I BLAS ----------------------------------------------------------
75 
76  double one_norm(const double* const _x, const size_t _n) {
77  REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
78 
80 
81  const double result = cblas_dasum(static_cast<int>(_n), _x, 1);
82 
83  XERUS_PA_END("Dense BLAS", "One Norm", misc::to_string(_n));
84 
85  return result;
86  }
87 
88  double two_norm(const double* const _x, const size_t _n) {
89  REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
90 
92 
93  const double result = cblas_dnrm2(static_cast<int>(_n), _x, 1);
94 
95  XERUS_PA_END("Dense BLAS", "Two Norm", misc::to_string(_n));
96 
97  return result;
98  }
99 
100  double dot_product(const double* const _x, const size_t _n, const double* const _y) {
101  REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
102 
104 
105  const double result = cblas_ddot(static_cast<int>(_n), _x, 1, _y, 1);
106 
107  XERUS_PA_END("Dense BLAS", "Dot Product", misc::to_string(_n)+"*"+misc::to_string(_n));
108 
109  return result;
110  }
111 
112 
113  //----------------------------------------------- LEVEL II BLAS ---------------------------------------------------------
114 
115  void matrix_vector_product(double* const _x, const size_t _m, const double _alpha, const double* const _A, const size_t _n, const bool _transposed, const double* const _y) {
116  // Delegate call if appropriate
117  if(_m == 1) { *_x = _alpha*dot_product(_A, _n, _y); return;}
118 
119  REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
120  REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
121 
123  if(!_transposed) {
124  cblas_dgemv(CblasRowMajor, CblasNoTrans, static_cast<int>(_m), static_cast<int>(_n), _alpha, _A, static_cast<int>(_n), _y, 1, 0.0, _x, 1);
125  } else {
126  cblas_dgemv(CblasRowMajor, CblasTrans, static_cast<int>(_n), static_cast<int>(_m), _alpha, _A, static_cast<int>(_m) , _y, 1, 0.0, _x, 1);
127  }
128 
129  XERUS_PA_END("Dense BLAS", "Matrix Vector Product", misc::to_string(_m)+"x"+misc::to_string(_n)+" * "+misc::to_string(_n));
130  }
131 
132  void dyadic_vector_product(double* _A, const size_t _m, const size_t _n, const double _alpha, const double*const _x, const double* const _y) {
133  REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
134  REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
135 
137 
138  //Blas wants to add the product to A, but we don't.
139  misc::set_zero(_A, _m*_n);
140 
141  cblas_dger(CblasRowMajor, static_cast<int>(_m), static_cast<int>(_n), _alpha, _x, 1, _y, 1, _A, static_cast<int>(_n));
142 
143  XERUS_PA_END("Dense BLAS", "Dyadic Vector Product", misc::to_string(_m)+" o "+misc::to_string(_n));
144  }
145 
146 
147  //----------------------------------------------- LEVEL III BLAS --------------------------------------------------------
149  void matrix_matrix_product( double* const _C,
150  const size_t _leftDim,
151  const size_t _rightDim,
152  const double _alpha,
153  const double* const _A,
154  const size_t _lda,
155  const bool _transposeA,
156  const size_t _middleDim,
157  const double* const _B,
158  const size_t _ldb,
159  const bool _transposeB) {
160  //Delegate call if appropriate
161  if(_leftDim == 1) {
162  matrix_vector_product(_C, _rightDim, _alpha, _B, _middleDim, !_transposeB, _A);
163  } else if(_rightDim == 1) {
164  matrix_vector_product(_C, _leftDim, _alpha, _A, _middleDim, _transposeA, _B);
165  } else if(_middleDim == 1) {
166  dyadic_vector_product(_C, _leftDim, _rightDim, _alpha, _A, _B);
167  } else {
168 
169  REQUIRE(_leftDim <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
170  REQUIRE(_middleDim <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
171  REQUIRE(_rightDim <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
172  REQUIRE(_lda <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
173  REQUIRE(_ldb <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
174 
176 
177  cblas_dgemm( CblasRowMajor, // Array storage format
178  _transposeA ? CblasTrans : CblasNoTrans, // LHS transposed?
179  _transposeB ? CblasTrans : CblasNoTrans, // RHS transposed?
180  static_cast<int>(_leftDim), // Left dimension
181  static_cast<int>(_rightDim), // Right dimension
182  static_cast<int>(_middleDim), // Middle dimension
183  _alpha, // Factor to the Product
184  _A, // Pointer to LHS
185  static_cast<int>(_lda), // LDA
186  _B, // Pointer to RHS
187  static_cast<int>(_ldb), // LDB
188  0.0, // Factor of C (Zero if only the product is required)
189  _C, // Pointer to result
190  static_cast<int>(_rightDim) // LDC
191  );
192 
193  XERUS_PA_END("Dense BLAS", "Matrix-Matrix-Multiplication", misc::to_string(_leftDim)+"x"+misc::to_string(_middleDim)+" * "+misc::to_string(_middleDim)+"x"+misc::to_string(_rightDim));
194  }
195  }
196 
197 
198 
199  //----------------------------------------------- LAPACK ----------------------------------------------------------------
200 
201  void svd( double* const _U, double* const _S, double* const _Vt, const double* const _A, const size_t _m, const size_t _n) {
202  //Create copy of A
203  const std::unique_ptr<double[]> tmpA(new double[_m*_n]);
204  misc::copy(tmpA.get(), _A, _m*_n);
205 
206  svd_destructive(_U, _S, _Vt, tmpA.get(), _m, _n);
207  }
208 
209 
210  void svd_destructive( double* const _U, double* const _S, double* const _Vt, double* const _A, const size_t _m, const size_t _n) {
211  REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
212  REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
213 
215  std::unique_ptr<double[]> tmpA(new double[_m*_n]);
216  misc::copy(tmpA.get(), _A, _m*_n);
217 
218  int lapackAnswer = LAPACKE_dgesdd(LAPACK_ROW_MAJOR, 'S', static_cast<int>(_m), static_cast<int>(_n), _A, static_cast<int>(_n), _S, _U, static_cast<int>(std::min(_m, _n)), _Vt, static_cast<int>(_n));
219  CHECK(lapackAnswer == 0, warning, "Lapack failed to compute SVD. Answer is: " << lapackAnswer);
220  CHECK(lapackAnswer == 0, warning, "Call was: LAPACKE_dgesdd(LAPACK_ROW_MAJOR, 'S', " << static_cast<int>(_m) << ", " << static_cast<int>(_n) << ", " << _A << ", " << static_cast<int>(_n) <<", "
221  << _S <<", " << _U << ", " << static_cast<int>(std::min(_m, _n)) << ", " << _Vt << ", " << static_cast<int>(_n) << ");");
222  if(lapackAnswer != 0) {
223  LOG(warning, "SVD failed ");
224 // for(size_t i=0; i < _m; ++i) {
225 // for(size_t j=0; j < _n; ++j) {
226 // LOG(warning, tmpA[i*_n+j]);
227 // }
228 // }
229  }
230 
231  XERUS_PA_END("Dense LAPACK", "Singular Value Decomposition", misc::to_string(_m)+"x"+misc::to_string(_n));
232  }
233 
234 
235  std::tuple<std::unique_ptr<double[]>, std::unique_ptr<double[]>, size_t> qc(const double* const _A, const size_t _m, const size_t _n) {
236  const std::unique_ptr<double[]> tmpA(new double[_m*_n]);
237  misc::copy(tmpA.get(), _A, _m*_n);
238 
239  return qc_destructive(tmpA.get(), _m, _n);
240  }
241 
242 
243  std::tuple<std::unique_ptr<double[]>, std::unique_ptr<double[]>, size_t> qc_destructive(double* const _A, const size_t _m, const size_t _n) {
244  REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
245  REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
246 
247  REQUIRE(_n > 0, "Dimension n must be larger than zero");
248  REQUIRE(_m > 0, "Dimension m must be larger than zero");
249 
251 
252  // Maximal rank is used by Lapacke
253  const size_t maxRank = std::min(_m, _n);
254 
255  // Tmp Array for Lapacke
256  const std::unique_ptr<double[]> tau(new double[maxRank]);
257 
258  const std::unique_ptr<int[]> permutation(new int[_n]());
259  misc::set_zero(permutation.get(), _n); // Lapack requires the entries to be zero.
260 
261  // Calculate QR factorisations with column pivoting
262  IF_CHECK(int lapackAnswer = ) LAPACKE_dgeqp3(LAPACK_ROW_MAJOR, static_cast<int>(_m), static_cast<int>(_n), _A, static_cast<int>(_n), permutation.get(), tau.get());
263  REQUIRE(lapackAnswer == 0, "Unable to perform QC factorisaton (dgeqp3). Lapacke says: " << lapackAnswer );
264 
265 
266  // Determine the actual rank
267  size_t rank;
268  for (rank = 1; rank <= maxRank; ++rank) {
269  if (rank == maxRank || std::abs(_A[rank+rank*_n]) < 16*std::numeric_limits<double>::epsilon()*_A[0]) {
270  break;
271  }
272  }
273 
274 
275  // Create the matrix C
276  std::unique_ptr<double[]> C(new double[rank*_n]);
277  misc::set_zero(C.get(), rank*_n);
278 
279  // Copy the upper triangular Matrix C (rank x _n) into position
280  for (size_t col = 0; col < _n; ++col) {
281  const size_t targetCol = static_cast<size_t>(permutation[col]-1); // For Lapack numbers start at 1 (instead of 0).
282  for(size_t row = 0; row < rank && row < col+1; ++row) {
283  C[row*_n + targetCol] = _A[row*_n + col];
284  }
285  }
286 
287 
288  // Create orthogonal matrix Q
289  IF_CHECK(lapackAnswer = ) LAPACKE_dorgqr(LAPACK_ROW_MAJOR, static_cast<int>(_m), static_cast<int>(maxRank), static_cast<int>(maxRank), _A, static_cast<int>(_n), tau.get());
290  CHECK(lapackAnswer == 0, error, "Unable to reconstruct Q from the QC factorisation. Lapacke says: " << lapackAnswer);
291 
292  // Copy the newly created Q into position
293  std::unique_ptr<double[]> Q(new double[_m*rank]);
294  if(rank == _n) {
295  misc::copy(Q.get(), _A, _m*rank);
296  } else {
297  for(size_t row = 0; row < _m; ++row) {
298  misc::copy(Q.get()+row*rank, _A+row*_n, rank);
299  }
300  }
301 
302  XERUS_PA_END("Dense LAPACK", "QRP Factorisation", misc::to_string(_m)+"x"+misc::to_string(rank)+" * "+misc::to_string(rank)+"x"+misc::to_string(_n));
303 
304  return std::make_tuple(std::move(Q), std::move(C), rank);
305  }
306 
307 
308  std::tuple<std::unique_ptr<double[]>, std::unique_ptr<double[]>, size_t> cq(const double* const _A, const size_t _m, const size_t _n) {
309  const std::unique_ptr<double[]> tmpA(new double[_m*_n]);
310  misc::copy(tmpA.get(), _A, _m*_n);
311 
312  return cq_destructive(tmpA.get(), _m, _n);
313  }
314 
315 
316  // We use that in col-major we get At = Qt * Ct => A = C * Q, i.e. doing the calculation in col-major and switching Q and C give the desired result.
317  std::tuple<std::unique_ptr<double[]>, std::unique_ptr<double[]>, size_t> cq_destructive(double* const _A, const size_t _m, const size_t _n) {
318  REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
319  REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
320 
321  REQUIRE(_m > 0, "Dimension m must be larger than zero");
322  REQUIRE(_n > 0, "Dimension n must be larger than zero");
323 
325 
326  // Maximal rank is used by Lapacke
327  const size_t maxRank = std::min(_n, _m);
328 
329  // Tmp Array for Lapacke
330  const std::unique_ptr<double[]> tau(new double[maxRank]);
331 
332  const std::unique_ptr<int[]> permutation(new int[_m]());
333  misc::set_zero(permutation.get(), _m); // Lapack requires the entries to be zero.
334 
335  // Calculate QR factorisations with column pivoting
336  IF_CHECK(int lapackAnswer = ) LAPACKE_dgeqp3(LAPACK_COL_MAJOR, static_cast<int>(_n), static_cast<int>(_m), _A, static_cast<int>(_n), permutation.get(), tau.get());
337  REQUIRE(lapackAnswer == 0, "Unable to perform QC factorisaton (dgeqp3). Lapacke says: " << lapackAnswer );
338 
339 
340  // Determine the actual rank
341  size_t rank;
342  for (rank = 1; rank <= maxRank; ++rank) {
343  if (rank == maxRank || std::abs(_A[rank+rank*_n]) < 16*std::numeric_limits<double>::epsilon()*_A[0]) {
344  break;
345  }
346  }
347 
348 
349  // Create the matrix C
350  std::unique_ptr<double[]> C(new double[rank*_m]);
351  misc::set_zero(C.get(), rank*_m);
352 
353  // Copy the upper triangular Matrix C (rank x _m) into position
354  for (size_t col = 0; col < _m; ++col) {
355  const size_t targetCol = static_cast<size_t>(permutation[col]-1); // For Lapack numbers start at 1 (instead of 0).
356  misc::copy(C.get()+targetCol*rank, _A+col*_n, std::min(rank, col+1));
357  }
358 
359 
360  // Create orthogonal matrix Q
361  IF_CHECK(lapackAnswer = ) LAPACKE_dorgqr(LAPACK_COL_MAJOR, static_cast<int>(_n), static_cast<int>(maxRank), static_cast<int>(maxRank), _A, static_cast<int>(_n), tau.get());
362  CHECK(lapackAnswer == 0, error, "Unable to reconstruct Q from the QC factorisation. Lapacke says: " << lapackAnswer);
363 
364  // Copy the newly created Q into position
365  std::unique_ptr<double[]> Q(new double[_n*rank]);
366  misc::copy(Q.get(), _A, _n*rank);
367 
368  XERUS_PA_END("Dense LAPACK", "QRP Factorisation", misc::to_string(_n)+"x"+misc::to_string(rank)+" * "+misc::to_string(rank)+"x"+misc::to_string(_m));
369 
370  return std::make_tuple(std::move(C), std::move(Q), rank);
371  }
372 
373 
374  void qr(double* const _Q, double* const _R, const double* const _A, const size_t _m, const size_t _n) {
375  // Create tmp copy of A since Lapack wants to destroy it
376  const std::unique_ptr<double[]> tmpA(new double[_m*_n]);
377  misc::copy(tmpA.get(), _A, _m*_n);
378 
379  qr_destructive(_Q, _R, tmpA.get(), _m, _n);
380  }
381 
382 
383  void inplace_qr(double* const _AtoQ, double* const _R, const size_t _m, const size_t _n) {
384  qr_destructive(_AtoQ, _R, _AtoQ, _m, _n);
385  }
386 
387 
388  void qr_destructive( double* const _Q, double* const _R, double* const _A, const size_t _m, const size_t _n) {
389  REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
390  REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
391 
392  REQUIRE(_n > 0, "Dimension n must be larger than zero");
393  REQUIRE(_m > 0, "Dimension m must be larger than zero");
394 
395  REQUIRE(_Q && _R && _A, "QR decomposition must not be called with null pointers: Q:" << _Q << " R: " << _R << " A: " << _A);
396  REQUIRE(_A != _R, "_A and _R must be different, otherwise qr call will fail.");
397 
399 
400  // Maximal rank is used by Lapacke
401  const size_t rank = std::min(_m, _n);
402 
403  // Tmp Array for Lapacke
404  const std::unique_ptr<double[]> tau(new double[rank]);
405 
406  // Calculate QR factorisations
407 // LOG(Lapacke, "Call to dorgqr with parameters: " << LAPACK_ROW_MAJOR << ", " << static_cast<int>(_m) << ", " << static_cast<int>(_n) << ", " << _A << ", " << static_cast<int>(_n) << ", " << tau.get());
408  IF_CHECK( int lapackAnswer = ) LAPACKE_dgeqrf(LAPACK_ROW_MAJOR, static_cast<int>(_m), static_cast<int>(_n), _A, static_cast<int>(_n), tau.get());
409  CHECK(lapackAnswer == 0, error, "Unable to perform QR factorisaton. Lapacke says: " << lapackAnswer );
410 
411  // Copy the upper triangular Matrix R (rank x _n) into position
412  for(size_t row =0; row < rank; ++row) {
413  misc::set_zero(_R+row*_n, row); // Set starting zeros
414  misc::copy(_R+row*_n+row, _A+row*_n+row, _n-row); // Copy upper triangular part from lapack result.
415  }
416 
417  // Create orthogonal matrix Q (in tmpA)
418  // LOG(Lapacke, "Call to dorgqr with parameters: " << LAPACK_ROW_MAJOR << ", " << static_cast<int>(_m) << ", " << static_cast<int>(rank) << ", " << static_cast<int>(rank) << ", " << _A << ", " << static_cast<int>(_n) << ", " << tau.get());
419  IF_CHECK( lapackAnswer = ) LAPACKE_dorgqr(LAPACK_ROW_MAJOR, static_cast<int>(_m), static_cast<int>(rank), static_cast<int>(rank), _A, static_cast<int>(_n), tau.get());
420  CHECK(lapackAnswer == 0, error, "Unable to reconstruct Q from the QR factorisation. Lapacke says: " << lapackAnswer);
421 
422  // Copy Q (_m x rank) into position
423  if(_A != _Q) {
424  if(_n == rank) {
425  misc::copy(_Q, _A, _m*_n);
426  } else {
427  for(size_t row = 0; row < _m; ++row) {
428  misc::copy(_Q+row*rank, _A+row*_n, rank);
429  }
430  }
431  } else if(_n != rank) { // Note extra treatmeant to avoid memcpy overlap
432  for(size_t row = 1; row < _m; ++row) {
433  misc::copy_inplace(_Q+row*rank, _A+row*_n, rank);
434  }
435  }
436 
437  XERUS_PA_END("Dense LAPACK", "QR Factorisation", misc::to_string(_m)+"x"+misc::to_string(_n));
438  }
439 
440 
441  void rq( double* const _R, double* const _Q, const double* const _A, const size_t _m, const size_t _n) {
442  // Create tmp copy of A since Lapack wants to destroy it
443  const std::unique_ptr<double[]> tmpA(new double[_m*_n]);
444  misc::copy(tmpA.get(), _A, _m*_n);
445 
446  rq_destructive(_R, _Q, tmpA.get(), _m, _n);
447  }
448 
449 
450  void inplace_rq( double* const _R, double* const _AtoQ, const size_t _m, const size_t _n) {
451  rq_destructive(_R, _AtoQ, _AtoQ, _m, _n);
452  }
453 
454 
455  void rq_destructive( double* const _R, double* const _Q, double* const _A, const size_t _m, const size_t _n) {
456  REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
457  REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
458 
459  REQUIRE(_n > 0, "Dimension n must be larger than zero");
460  REQUIRE(_m > 0, "Dimension m must be larger than zero");
461 
462  REQUIRE(_Q && _R && _A, "QR decomposition must not be called with null pointers: R " << _R << " Q: " << _Q << " A: " << _A);
463  REQUIRE(_A != _R, "_A and _R must be different, otherwise qr call will fail.");
464 
466 
467  // Maximal rank is used by Lapacke
468  const size_t rank = std::min(_m, _n);
469 
470  // Tmp Array for Lapacke
471  const std::unique_ptr<double[]> tau(new double[rank]);
472 
473  IF_CHECK( int lapackAnswer = ) LAPACKE_dgerqf(LAPACK_ROW_MAJOR, static_cast<int>(_m), static_cast<int>(_n), _A, static_cast<int>(_n), tau.get());
474  CHECK(lapackAnswer == 0, error, "Unable to perform QR factorisaton. Lapacke says: " << lapackAnswer << ". Call was: LAPACKE_dgerqf(LAPACK_ROW_MAJOR, "<<static_cast<int>(_m)<<", "<<static_cast<int>(_n)<<", "<<_A<<", "<<static_cast<int>(_n)<<", "<<tau.get()<<");" );
475 
476 
477  // Copy the upper triangular Matrix R (_m x rank) into position.
478  size_t row = 0;
479  for( ; row < _m - rank; ++row) {
480  misc::copy(_R+row*rank, _A+row*_n+_n-rank, rank);
481  }
482  for(size_t skip = 0; row < _m; ++row, ++skip) {
483  misc::set_zero(_R+row*rank, skip); // Set starting zeros
484  misc::copy(_R+row*rank+skip, _A+row*_n+_n-rank+skip, rank-skip); // Copy upper triangular part from lapack result.
485  }
486 
487  // Create orthogonal matrix Q (in _A). Lapacke expects to get the last rank rows of A...
488  IF_CHECK( lapackAnswer = ) LAPACKE_dorgrq(LAPACK_ROW_MAJOR, static_cast<int>(rank), static_cast<int>(_n), static_cast<int>(rank), _A+(_m-rank)*_n, static_cast<int>(_n), tau.get());
489  CHECK(lapackAnswer == 0, error, "Unable to reconstruct Q from the RQ factorisation. Lapacke says: " << lapackAnswer << ". Call was: LAPACKE_dorgrq(LAPACK_ROW_MAJOR, "<<static_cast<int>(rank)<<", "<<static_cast<int>(_n)<<", "<<static_cast<int>(rank)<<", "<<_A+(_m-rank)*_n<<", "<<static_cast<int>(_n)<<", "<<tau.get()<<");");
490 
491 
492  //Copy Q (rank x _n) into position
493  if(_A != _Q) {
494  misc::copy(_Q, _A+(_m-rank)*_n, rank*_n);
495  }
496 
497  XERUS_PA_END("Dense LAPACK", "RQ Factorisation", misc::to_string(_m)+"x"+misc::to_string(_n));
498  }
499 
500 
501  static bool is_symmetric(const double* const _A, const size_t _n) {
502  double max = 0;
503  for (size_t i=0; i<_n*_n; ++i) {
504  max = std::max(max, _A[i]);
505  }
506 
507  for (size_t i=0; i<_n; ++i) {
508  for (size_t j=i+1; j<_n; ++j) {
509  if (std::abs(_A[i*_n + j] - _A[i + j*_n]) >= 4 * max * std::numeric_limits<double>::epsilon()) {
510 // LOG(aslkdjj, std::abs(_A[i*_n + j] - _A[i + j*_n]) << " / " << _A[i*_n + j] << " " << max);
511  return false;
512  }
513  }
514  }
515  return true;
516  }
517 
519  static bool pos_neg_definite_diagonal(const double* const _A, const size_t _n) {
520  bool positive = (_A[0] > 0);
521  const size_t steps=_n+1;
522  if (positive) {
523  for (size_t i=1; i<_n; ++i) {
524  if (_A[i*steps] < std::numeric_limits<double>::epsilon()) {
525  return false;
526  }
527  }
528  return true;
529  } else {
530  for (size_t i=1; i<_n; ++i) {
531  if (_A[i*steps] > -std::numeric_limits<double>::epsilon()) {
532  return false;
533  }
534  }
535  return true;
536  }
537 
538  }
539 
542  void solve(double* const _x, const double* const _A, const size_t _m, const size_t _n, const double* const _b, const size_t _nrhs) {
543  REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
544  REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
545  REQUIRE(_nrhs <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
546 
547  const std::unique_ptr<double[]> tmpA(new double[_m*_n]);
548  misc::copy(tmpA.get(), _A, _m*_n);
549 
550  LOG(debug, "solving with...");
551 
552  // not rectangular -> fallback to least-squares (QR or SVD decomposition to solve)
553  if (_m != _n) {
554  LOG(debug, "SVD");
555  const std::unique_ptr<double[]> tmpB(new double[_m*_nrhs]);
556  misc::copy(tmpB.get(), _b, _m*_nrhs);
557  solve_least_squares_destructive(_x, tmpA.get(), _m, _n, tmpB.get(), _nrhs);
558  return;
559  }
560 
561  // not symmetric -> LU solver
562  if (!is_symmetric(_A, _n)) {
563  LOG(debug, "LU");
565 
566  std::unique_ptr<int[]> pivot(new int[_n]);
567 
568  misc::copy(_x, _b, _n);
569 
570  IF_CHECK( int lapackAnswer = ) LAPACKE_dgesv(
571  LAPACK_ROW_MAJOR,
572  static_cast<int>(_n), // Dimensions of A (nxn)
573  static_cast<int>(_nrhs), // Number of rhs b
574  tmpA.get(), // input: A, output: L and U
575  static_cast<int>(_n), // LDA
576  pivot.get(), // output: permutation P
577  _x, // input: rhs b, output: solution x
578  static_cast<int>(_nrhs) // ldb
579  );
580  CHECK(lapackAnswer == 0, error, "Unable to solve Ax = b (PLU solver). Lapacke says: " << lapackAnswer);
581 
582  XERUS_PA_END("Dense LAPACK", "Solve (PLU)", misc::to_string(_n)+"x"+misc::to_string(_n)+"x"+misc::to_string(_nrhs));
583  return;
584  }
585 
586  // positive or negative diagonal -> try cholesky
587  if (pos_neg_definite_diagonal(_A, _n)) {
588  LOG(debug, "trying cholesky");
589  int lapackAnswer = 0;
590  {
592 
593  lapackAnswer = LAPACKE_dpotrf2(
594  LAPACK_ROW_MAJOR,
595  'U', // Upper triangle of A is read
596  static_cast<int>(_n), // dimensions of A
597  tmpA.get(), // input: A, output: cholesky factorisation
598  static_cast<int>(_n) // LDA
599  );
600 
601  XERUS_PA_END("Dense LAPACK", "Cholesky decomposition", misc::to_string(_n)+"x"+misc::to_string(_n));
602  }
603 
604  if (lapackAnswer == 0) {
605  LOG(debug, "cholesky");
607 
608  misc::copy(_x, _b, _n);
609 
610  lapackAnswer = LAPACKE_dpotrs(
611  LAPACK_ROW_MAJOR,
612  'U', // upper triangle of cholesky decomp is stored in tmpA
613  static_cast<int>(_n), // dimensions of A
614  static_cast<int>(_nrhs),// number of rhs
615  tmpA.get(), // input: cholesky decomp
616  static_cast<int>(_n), // lda
617  _x, // input: rhs b, output: solution x
618  static_cast<int>(_nrhs) // ldb
619  );
620  CHECK(lapackAnswer == 0, error, "Unable to solve Ax = b (cholesky solver). Lapacke says: " << lapackAnswer);
621 
622  XERUS_PA_END("Dense LAPACK", "Solve (Cholesky)", misc::to_string(_n)+"x"+misc::to_string(_n)+"x"+misc::to_string(_nrhs));
623 
624  return;
625  } else {
626  // restore tmpA
627  misc::copy(tmpA.get(), _A, _m*_n);
628  }
629  }
630 
631  LOG(debug, "LDL");
632  // non-definite diagonal or choleksy failed -> fallback to LDL^T decomposition
634 
635  misc::copy(_x, _b, _n);
636  std::unique_ptr<int[]> pivot(new int[_n]);
637 
638  LAPACKE_dsysv(
639  LAPACK_ROW_MAJOR,
640  'U', // upper triangular part of _A is accessed
641  static_cast<int>(_n), // dimension of A
642  static_cast<int>(_nrhs),// number of rhs
643  tmpA.get(), // input: A, output: part of the LDL decomposition
644  static_cast<int>(_n), // lda
645  pivot.get(), // output: details of blockstructure of D
646  _x, // input: rhs b, output: solution x
647  static_cast<int>(_nrhs) // ldb
648  );
649 
650  XERUS_PA_END("Dense LAPACK", "Solve (LDL)", misc::to_string(_n)+"x"+misc::to_string(_n)+"x"+misc::to_string(_nrhs));
651  }
652 
653 
654  void solve_least_squares( double* const _x, const double* const _A, const size_t _m, const size_t _n, const double* const _b, const size_t _p){
655  const std::unique_ptr<double[]> tmpA(new double[_m*_n]);
656  misc::copy(tmpA.get(), _A, _m*_n);
657 
658  const std::unique_ptr<double[]> tmpB(new double[_m*_p]);
659  misc::copy(tmpB.get(), _b, _m*_p);
660 
661  solve_least_squares_destructive(_x, tmpA.get(), _m, _n, tmpB.get(), _p);
662  }
663 
664 
665  void solve_least_squares_destructive( double* const _x, double* const _A, const size_t _m, const size_t _n, double* const _b, const size_t _p){
666  REQUIRE(_m <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
667  REQUIRE(_n <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
668  REQUIRE(_p <= static_cast<size_t>(std::numeric_limits<int>::max()), "Dimension to large for BLAS/Lapack");
669 
671 
672  std::unique_ptr<int[]> pivot(new int[_n]);
673  misc::set_zero(pivot.get(), _n);
674 
675  std::unique_ptr<double[]> signulars(new double[std::min(_n, _m)]);
676 
677  int rank;
678 
679  double* bOrX;
680  if(_m >= _n) {
681  bOrX = _b;
682  } else {
683  bOrX = _x;
684  misc::copy(bOrX, _b, _m*_p);
685  misc::set_zero(bOrX+_m*_p, (_n-_m)*_p); // Lapacke is unhappy if the array contains NANs...
686  }
687 
688 // IF_CHECK( int lapackAnswer = ) LAPACKE_dgelsy(
689 // LAPACK_ROW_MAJOR,
690 // static_cast<int>(_m), // Left dimension of A
691 // static_cast<int>(_n), // Right dimension of A
692 // static_cast<int>(_p), // Number of rhss
693 // _A, // Matrix A
694 // static_cast<int>(_n), // LDA
695 // bOrX, // On input b, on output x
696 // static_cast<int>(_p), // LDB
697 // pivot.get(), // Pivot, entries must be zero to allow pivoting
698 // xerus::EPSILON, // Used to determine the accuracy of the Lapacke call. Basically all singular values smaller than RCOND*s[0] are ignored. (s[0] is the largest signular value)
699 // &rank); // Outputs the rank of A
700 
701  IF_CHECK( int lapackAnswer = ) LAPACKE_dgelsd(
702  LAPACK_ROW_MAJOR,
703  static_cast<int>(_m), // Left dimension of A
704  static_cast<int>(_n), // Right dimension of A
705  static_cast<int>(_p), // Number of rhss
706  _A, // Matrix A
707  static_cast<int>(_n), // LDA
708  bOrX, // On input b, on output x
709  static_cast<int>(_p), // LDB
710  signulars.get(), // Pivot, entries must be zero to allow pivoting
711  xerus::EPSILON, // Used to determine the accuracy of the Lapacke call. Basically all singular values smaller than RCOND*s[0] are ignored. (s[0] is the largest signular value)
712  &rank); // Outputs the rank of A
713 
714  CHECK(lapackAnswer == 0, error, "Unable to solves min ||Ax - b||_2 for x. Lapacke says: " << lapackAnswer << " sizes are " << _m << " x " << _n << " * " << _p);
715 
716  if(_m >= _n) { // I.e. bOrX is _b
717  misc::copy(_x, bOrX, _n*_p);
718  }
719 
720  XERUS_PA_END("Dense LAPACK", "Solve Least Squares", misc::to_string(_m)+"x"+misc::to_string(_n)+" * "+misc::to_string(_p));
721  }
722 
723  } // namespace blasWrapper
724 
725 } // namespace xerus
726 
Header file for some additional math functions.
std::tuple< std::unique_ptr< double[]>, std::unique_ptr< double[]>, size_t > cq(const double *const _A, const size_t _m, const size_t _n)
: splits A = C*Q, with _C an rxm matrix (where r is the rank of _A) and _Q orthogonal.
Header file for CHECK and REQUIRE macros.
Header file for the performance analysis global objects and analysis function.
std::tuple< std::unique_ptr< double[]>, std::unique_ptr< double[]>, size_t > qc(const double *const _A, const size_t _m, const size_t _n)
: splits A = Q*C, with _C an rxn matrix (where r is the rank of _A) and _Q orthogonal.
#define CHECK
Definition: internal.h:34
Header file for the blas and lapack wrapper functions.
void qr(double *const _Q, double *const _R, const double *const _A, const size_t _m, const size_t _n)
: Performs (Q,R) = QR(A)
static bool pos_neg_definite_diagonal(const double *const _A, const size_t _n)
checks whether the diagonal of _A is all positive or all negative. returns false otherwise ...
#define REQUIRE
Definition: internal.h:33
void copy_inplace(T *const _x, const T *const _y, const size_t _n) noexcept
Copys _n entries from _y to _x, allowing the accessed memry regions of _y and _x to overlap...
#define LOG
Definition: internal.h:38
void matrix_matrix_product(double *const _C, const size_t _leftDim, const size_t _rightDim, const double _alpha, const double *const _A, const size_t _lda, const bool _transposeA, const size_t _middleDim, const double *const _B, const size_t _ldb, const bool _transposeB)
: Performs the Matrix-Matrix product C = alpha*OP(A) * OP(B)
#define IF_CHECK
Definition: internal.h:35
void inplace_rq(double *const _R, double *const _AtoQ, const size_t _m, const size_t _n)
: Performs (R,AtoQ) = RQ(AtoQ)
The main namespace of xerus.
Definition: basic.h:37
void inplace_qr(double *const _AtoQ, double *const _R, const size_t _m, const size_t _n)
: Performs (AtoQ,R) = QR(AtoQ)
void svd_destructive(double *const _U, double *const _S, double *const _Vt, double *const _A, const size_t _m, const size_t _n)
: Performs (U,S,V) = SVD(A). Destroys A.
void rq(double *const _R, double *const _Q, const double *const _A, const size_t _m, const size_t _n)
: Performs (R,Q) = RQ(A)
double dot_product(const double *const _x, const size_t _n, const double *const _y)
: Computes the dot product = x^T*y
double one_norm(const double *const _x, const size_t _n)
: Computes the one norm =||x||_1
void dyadic_vector_product(double *_A, const size_t _m, const size_t _n, const double _alpha, const double *const _x, const double *const _y)
: Performs A = alpha*x*y^T
std::tuple< std::unique_ptr< double[]>, std::unique_ptr< double[]>, size_t > qc_destructive(double *const _A, const size_t _m, const size_t _n)
: splits A = Q*C, with _C an rxn matrix (where r is the rank of _A) and _Q orthogonal. Destroys A.
void solve(double *_x, const double *_A, size_t _m, size_t _n, const double *_b, size_t _p)
: Solves Ax = b for x
static bool is_symmetric(const double *const _A, const size_t _n)
Header file for the low level array support function.
Header file for comfort functions and macros that should not be exported in the library.
constexpr const value_t EPSILON
The default epsilon value in xerus.
Definition: basic.h:50
Header file for some elementary string manipulation routines.
double two_norm(const double *const _x, const size_t _n)
: Computes the two norm =||x||_2
void qr_destructive(double *const _Q, double *const _R, double *const _A, const size_t _m, const size_t _n)
: Performs (Q,R) = QR(A), destroys A in the process
Header file for shorthand notations that are xerus specific but used throughout the library...
#define XERUS_PA_END(group, name, parameter)
void svd(double *const _U, double *const _S, double *const _Vt, const double *const _A, const size_t _m, const size_t _n)
: Performs (U,S,V) = SVD(A)
void copy(T *const __restrict _dest, const T *const __restrict _src, const size_t _n) noexcept
Copys _n entries from _y to _x, where _y and _x must be disjunkt memory regions.
#define XERUS_PA_START
static XERUS_force_inline std::string to_string(const bool obj)
Definition: stringFromTo.h:41
void set_zero(T *const __restrict _x, const size_t _n) noexcept
Sets all entries equal to zero.
void matrix_vector_product(double *const _x, const size_t _m, const double _alpha, const double *const _A, const size_t _n, const bool _transposed, const double *const _y)
: Perfroms x = alpha*OP(A)*y
void rq_destructive(double *const _R, double *const _Q, double *const _A, const size_t _m, const size_t _n)
: Performs (R,Q) = RQ(A), destroys A in the process
void solve_least_squares_destructive(double *const _x, double *const _A, const size_t _m, const size_t _n, double *const _b, const size_t _p)
: Solves min ||Ax - b||_2 for x
std::tuple< std::unique_ptr< double[]>, std::unique_ptr< double[]>, size_t > cq_destructive(double *const _A, const size_t _m, const size_t _n)
: splits A = C*Q, with _C an rxm matrix (where r is the rank of _A) and _Q orthogonal. Destroys A.
Header file for global shorthand notations of elementary integer types and attribute lists...
void solve_least_squares(double *const _x, const double *const _A, const size_t _m, const size_t _n, const double *const _b, const size_t _p)
: Solves min ||Ax - b||_2 for x