ug4
gpusparsematrix_impl.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2013-2015: G-CSC, Goethe University Frankfurt
3  * Author: Martin Rupp
4  *
5  * This file is part of UG4.
6  *
7  * UG4 is free software: you can redistribute it and/or modify it under the
8  * terms of the GNU Lesser General Public License version 3 (as published by the
9  * Free Software Foundation) with the following additional attribution
10  * requirements (according to LGPL/GPL v3 §7):
11  *
12  * (1) The following notice must be displayed in the Appropriate Legal Notices
13  * of covered and combined works: "Based on UG4 (www.ug4.org/license)".
14  *
15  * (2) The following notice must be displayed at a prominent place in the
16  * terminal output of covered works: "Based on UG4 (www.ug4.org/license)".
17  *
18  * (3) The following bibliography is recommended for citation and must be
19  * preserved in all covered files:
20  * "Reiter, S., Vogel, A., Heppner, I., Rupp, M., and Wittum, G. A massively
21  * parallel geometric multigrid solver on hierarchically distributed grids.
22  * Computing and visualization in science 16, 4 (2013), 151-164"
23  * "Vogel, A., Reiter, S., Rupp, M., Nägel, A., and Wittum, G. UG4 -- a novel
24  * flexible software system for simulating pde based models on high performance
25  * computers. Computing and visualization in science 16, 4 (2013), 165-179"
26  *
27  * This program is distributed in the hope that it will be useful,
28  * but WITHOUT ANY WARRANTY; without even the implied warranty of
29  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30  * GNU Lesser General Public License for more details.
31  */
32 
33 #ifndef __H__UG__CPU_ALGEBRA__GPUSparseMatrix_IMPL__
34 #define __H__UG__CPU_ALGEBRA__GPUSparseMatrix_IMPL__
35 
36 // creation etc
37 
38 #include <fstream>
39 #include <cstring>
40 
43 #include "gpusparsematrix.h"
44 #include <vector>
45 #include <algorithm>
46 #include "cuda/cuda_manager.h"
47 
48 
49 namespace ug{
50 
51 template<typename T>
53 {
54  initGPU();
55  PROFILE_GPUMATRIX(GPUSparseMatrix_constructor);
56  bNeedsValues = true;
57  iIterators=0;
58  nnz = 0;
59  m_numCols = 0;
60  maxValues = 0;
61  cols.resize(32);
62  if(bNeedsValues) values.resize(32);
63 }
64 
65 template<typename T>
66 bool GPUSparseMatrix<T>::resize_and_clear(size_t newRows, size_t newCols)
67 {
68  PROFILE_GPUMATRIX(GPUSparseMatrix_resize_and_clear);
69  UG_LOG(this << "GPUSparseMatrix::resize_and_clear(" << newRows << ", " << newCols << ")");
70  rowStart.clear(); rowStart.resize(newRows+1, -1);
71  rowMax.clear(); rowMax.resize(newRows);
72  rowEnd.clear(); rowEnd.resize(newRows, -1);
73  m_numCols = newCols;
74  nnz = 0;
75 
76  cols.clear(); cols.resize(newRows);
77  values.clear();
78  if(bNeedsValues) values.resize(newRows);
79  maxValues = 0;
80  return true;
81 }
82 
83 template<typename T>
84 bool GPUSparseMatrix<T>::resize_and_keep_values(size_t newRows, size_t newCols)
85 {
86  PROFILE_GPUMATRIX(GPUSparseMatrix_resize_and_keep_values);
87  UG_LOG(this << "GPUSparseMatrix_resize_and_keep_values(" << newRows << ", " << newCols << ")");
88  //UG_LOG("GPUSparseMatrix resize " << newRows << "x" << newCols << "\n");
89  if(newRows == 0 && newCols == 0)
90  return resize_and_clear(0,0);
91 
92  if(newRows != num_rows())
93  {
94  size_t oldrows = num_rows();
95  rowStart.resize(newRows+1, -1);
96  rowMax.resize(newRows);
97  rowEnd.resize(newRows, -1);
98  for(size_t i=oldrows; i<newRows; i++)
99  {
100  rowStart[i] = -1;
101  rowEnd[i] = -1;
102  }
103  }
104  if((int)newCols < m_numCols)
105  copyToNewSize(get_nnz_max_cols(newCols), newCols);
106 
107  m_numCols = newCols;
108  return true;
109 }
110 
111 
112 template<typename T>
114 {
115  PROFILE_GPUMATRIX(GPUSparseMatrix_set_as_transpose_of);
116  resize_and_clear(B.num_cols(), B.num_rows());
117  /*rowStart.resize(B.num_cols(), 0);
118  rowMax.resize(B.num_cols(), 0);
119  rowEnd.resize(B.num_cols(), 0);
120  nnz = B.nnz;
121  bNeedsValues = B.bNeedsValues;
122  if(bNeedsValues) values.resize(nnz);
123  cols.resize(nnz);
124  maxValues = B.maxValues;
125  fragmented = 0;
126  m_numCols = B.num_rows();
127  size_t r, c;
128  for(r=0; r<num_rows(); r++)
129  for(const_row_iterator it = B.begin_row(r); it != B.end_row(r); ++it)
130  rowMax[it.index()]++;
131  rowEnd[0] = rowMax[0];
132  rowEnd[0] = 0;
133  for(c=1; c<B.num_cols(); c++)
134  {
135  rowStart[c] = rowMax[c-1];
136  rowMax[c] = rowStart[c]+rowMax[c];
137  rowEnd[c] = rowStart[c];
138  }*/
139 
140  for(size_t r=0; r<B.num_rows(); r++)
141  for(const_row_iterator it = B.begin_row(r); it != B.end_row(r); ++it)
142  operator()(it.index(), r) = scale*it.value();
143  // todo: sort rows
144  return true;
145 }
146 
147 template<typename T>
148 template<typename vector_t>
149 inline void GPUSparseMatrix<T>::mat_mult_add_row(size_t row, typename vector_t::value_type &dest, double alpha, const vector_t &v) const
150 {
151  for(const_row_iterator conn = begin_row(row); conn != end_row(row); ++conn)
152  MatMultAdd(dest, 1.0, dest, alpha, conn.value(), v[conn.index()]);
153 }
154 
156 template<typename T>
157 template<typename vector_t>
158 void GPUSparseMatrix<T>::axpy(double alpha, vector_t &x, double beta, const vector_t &y) const
159 {
160  check_device();
161  cusparseDcsrmv(CUDAManager::get_cusparseHandle(), CUSPARSE_OPERATION_NON_TRANSPOSE, num_rows(), num_cols(), get_nnz(),
162  &beta, get_matrix_descr(), get_device_value_ptr(), get_device_rowStart(), get_device_cols(), y.get_dev_ptr(),
163  &alpha, x.get_dev_ptr());
164 }
165 
166 
167 // calculate dest = alpha1*v1 + beta1*A*w1 (A = this matrix)
168 template<typename T>
169 template<typename vector_t>
170 bool GPUSparseMatrix<T>::axpy(vector_t &dest,
171  const number &alpha1, const vector_t &v1,
172  const number &beta1, const vector_t &w1) const
173 {
174 // UG_LOG("GPUSparseMatrix axpy\n");
175  PROFILE_GPUMATRIX(GPUSparseMatrix_axpy);
176 
177  if(alpha1 == 0.0 || &dest == &v1)
178  axpy(alpha1, dest, beta1, w1);
179  else
180  {
181  axpy(0.0, dest, beta1, w1);
182  dest.add(alpha1, v1);
183  }
184  return true;
185 }
186 
187 // calculate dest = alpha1*v1 + beta1*A^T*w1 (A = this matrix)
188 template<typename T>
189 template<typename vector_t>
191  const number &alpha1, const vector_t &v1,
192  const number &beta1, const vector_t &w1) const
193 {
194  PROFILE_GPUMATRIX(GPUSparseMatrix_axpy_transposed);
195  UG_ASSERT(0, "not implemented");
196  return true;
197 }
198 
199 template<typename T>
201 {
202  PROFILE_GPUMATRIX(GPUSparseMatrix_set);
203  check_fragmentation();
204  for(size_t row=0; row<num_rows(); row++)
205  for(row_iterator it = begin_row(row); it != end_row(row); ++it)
206  {
207  if(it.index() == row)
208  it.value() = a;
209  else
210  it.value() = 0.0;
211  }
212 
213  return true;
214 }
215 
216 
217 template<typename T>
218 inline bool GPUSparseMatrix<T>::is_isolated(size_t i) const
219 {
220  UG_ASSERT(i < num_rows() && i >= 0, *this << ": " << i << " out of bounds.");
221 
222  for(const_row_iterator it = begin_row(i); it != end_row(i); ++it)
223  if(it.index() != i && it.value() != 0.0)
224  return false;
225  return true;
226 }
227 
228 
229 template<typename T>
230 void GPUSparseMatrix<T>::set_matrix_row(size_t row, connection *c, size_t nr)
231 {
232  /*PROFILE_GPUMATRIX(GPUSparseMatrix_set_matrix_row);
233  bool bSorted=false;
234  for(size_t i=0; bSorted && i+1 < nr; i++)
235  bSorted = c[i].iIndex < c[i+1].iIndex;
236  if(bSorted)
237  {
238  int start;
239  if(rowStart[row] == -1 || rowMax[row] - rowStart[row] < (int)nr)
240  {
241  assureValuesSize(maxValues+nr);
242  start = maxValues;
243  rowMax[row] = start+nr;
244  rowStart[row] = start;
245  maxValues+=nr;
246  }
247  else
248  start = rowStart[row];
249  rowEnd[row] = start+nr;
250  for(size_t i=0; i<nr; i++)
251  {
252  cols[start+i] = c[i].iIndex;
253  values[start+i] = c[i].dValue;
254  }
255  }
256  else*/
257  {
258  for(size_t i=0; i<nr; i++)
259  operator()(row, c[i].iIndex) = c[i].dValue;
260  }
261 }
262 
263 template<typename T>
264 void GPUSparseMatrix<T>::add_matrix_row(size_t row, connection *c, size_t nr)
265 {
266  //PROFILE_GPUMATRIX(GPUSparseMatrix_add_matrix_row);
267  for(size_t i=0; i<nr; i++)
268  operator()(row, c[i].iIndex) += c[i].dValue;
269 }
270 
271 
272 template<typename T>
274 {
275  //PROFILE_GPUMATRIX(GPUSparseMatrix_set_as_copy_of);
276  resize_and_clear(B.num_rows(), B.num_cols());
277  for(size_t i=0; i < B.num_rows(); i++)
278  for(const_row_iterator it = B.begin_row(i); it != B.end_row(i); ++it)
279  operator()(i, it.index()) = scale*it.value();
280  return true;
281 }
282 
283 
284 
285 template<typename T>
287 {
288  //PROFILE_GPUMATRIX(GPUSparseMatrix_scale);
289  for(size_t i=0; i < num_rows(); i++)
290  for(row_iterator it = begin_row(i); it != end_row(i); ++it)
291  it.value() *= d;
292  return true;
293 }
294 
295 
296 
297 
298 
299 //======================================================================================================
300 // submatrix set/get
301 
302 template<typename T>
303 template<typename M>
304 void GPUSparseMatrix<T>::add(const M &mat)
305 {
306  for(size_t i=0; i < mat.num_rows(); i++)
307  {
308  int r = mat.row_index(i);
309  for(size_t j=0; j < mat.num_cols(); j++)
310  {
311  int c = mat.col_index(j);
312  (*this)(r,c) += mat(i,j);
313  }
314  }
315 }
316 
317 
318 template<typename T>
319 template<typename M>
320 void GPUSparseMatrix<T>::set(const M &mat)
321 {
322  for(size_t i=0; i < mat.num_rows(); i++)
323  {
324  int r = mat.row_index(i);
325  for(size_t j=0; j < mat.num_cols(); j++)
326  {
327  int c = mat.col_index(j);
328  (*this)(r,c) = mat(i,j);
329  }
330  }
331 }
332 
333 template<typename T>
334 template<typename M>
335 void GPUSparseMatrix<T>::get(M &mat) const
336 {
337  for(size_t i=0; i < mat.num_rows(); i++)
338  {
339  int r = mat.row_index(i);
340  for(size_t j=0; j < mat.num_cols(); j++)
341  {
342  int c = mat.col_index(j);
343  mat(i,j) = (*this)(r,c);
344  }
345  }
346 }
347 
348 
349 template<typename T>
350 int GPUSparseMatrix<T>::get_index_internal(size_t row, int col) const
351 {
352 // PROFILE_GPUMATRIX(SP_get_index_internal);
353  assert(rowStart[row] != -1);
354  int l = rowStart[row];
355  int r = rowEnd[row];
356  int mid=0;
357  while(l < r)
358  {
359  mid = (l+r)/2;
360  if(cols[mid] < col)
361  l = mid+1;
362  else if(cols[mid] > col)
363  r = mid-1;
364  else
365  return mid;
366  }
367  mid = (l+r)/2;
368  int ret;
369  if(mid < rowStart[row])
370  ret = rowStart[row];
371  else if(mid == rowEnd[row] || col <= cols[mid])
372  ret = mid;
373  else ret = mid+1;
374  UG_ASSERT(ret <= rowEnd[row] && ret >= rowStart[row], "row iterator row " << row << " pos " << ret << " out of bounds [" << rowStart[row] << ", " << rowEnd[row] << "]");
375  return ret;
376 }
377 
378 
379 
380 template<typename T>
381 int GPUSparseMatrix<T>::get_index_const(int r, int c) const
382 {
383  if(rowStart[r] == -1 || rowStart[r] == rowEnd[r]) return -1;
384  int index=get_index_internal(r, c);
385  if(index >= rowStart[r] && index < rowEnd[r] && cols[index] == c)
386  return index;
387  else
388  return -1;
389 }
390 
391 
392 template<typename T>
394 {
395 // UG_LOG("get_index " << r << ", " << c << "\n");
396 // UG_LOG(rowStart[r] << " - " << rowMax[r] << " - " << rowEnd[r] << " - " << cols.size() << " - " << maxValues << "\n");
397  if(rowStart[r] == -1 || rowStart[r] == rowEnd[r])
398  {
399 // UG_LOG("new row\n");
400  // row did not start, start new row at the end of cols array
401  assureValuesSize(maxValues+1);
402  rowStart[r] = maxValues;
403  rowEnd[r] = maxValues+1;
404  rowMax[r] = maxValues+1;
405  if(bNeedsValues) values[maxValues] = 0.0;
406  cols[maxValues] = c;
407  maxValues++;
408  nnz++;
409  return maxValues-1;
410  }
411 
412  /* for(int i=rowStart[r]; i<rowEnd[r]; i++)
413  if(cols[i] == c)
414  return i;*/
415 
416  // get the index where (r,c) _should_ be
417  int index=get_index_internal(r, c);
418 
419  if(index < rowEnd[r]
420  && cols[index] == c)
421  {
422 // UG_LOG("found\n");
423  return index; // we found it
424  }
425 
426  // we did not find it, so we have to add it
427 
428 #ifndef NDEBUG
429  assert(index == rowEnd[r] || cols[index] > c);
430  assert(index == rowStart[r] || cols[index-1] < c);
431  for(int i=rowStart[r]+1; i<rowEnd[r]; i++)
432  assert(cols[i] > cols[i-1]);
433 #endif
434  if(rowEnd[r] == rowMax[r] && rowEnd[r] == maxValues
435  && maxValues < (int)cols.size())
436  {
437 // UG_LOG("at end\n");
438  // this row is stored at the end, so we can easily make more room
439  rowMax[r]++;
440  maxValues++;
441  }
442 
443  if(rowEnd[r] == rowMax[r])
444  {
445 // UG_LOG("renew\n");
446  // row is full, we need to copy it to another place
447  int newSize = (rowEnd[r]-rowStart[r])*2;
448  if(maxValues+newSize > (int)cols.size())
449  {
450 // UG_LOG("ass\n");
451  assureValuesSize(maxValues+newSize);
452  index=get_index_internal(r, c);
453  }
454  // copy it to the end and insert index
455  fragmented += rowEnd[r]-rowStart[r];
456  index = index-rowStart[r]+maxValues;
457  int j=rowEnd[r]-rowStart[r]+maxValues;
458  if(rowEnd[r] != 0)
459  for(int i=rowEnd[r]-1; i>=rowStart[r]; i--, j--)
460  {
461  if(j==index) j--;
462  if(bNeedsValues) values[j] = values[i];
463  cols[j] = cols[i];
464  if(i==rowStart[r]) break;
465  }
466  rowEnd[r] = maxValues+rowEnd[r]-rowStart[r]+1;
467  rowStart[r] = maxValues;
468  rowMax[r] = maxValues+newSize;
469  maxValues += newSize;
470  }
471  else
472  {
473 // UG_LOG("enlength\n");
474  // move part > index so we can insert the index
475  if(rowEnd[r] != 0)
476  for(int i=rowEnd[r]-1; i>=index; i--)
477  {
478  if(bNeedsValues) values[i+1] = values[i];
479  cols[i+1] = cols[i];
480  if(i==index) break;
481  }
482  rowEnd[r]++;
483  }
484  if(bNeedsValues) values[index] = 0.0;
485  cols[index] = c;
486 
487  nnz++;
488 #ifndef NDEBUG
489  assert(index >= rowStart[r] && index < rowEnd[r]);
490  for(int i=rowStart[r]+1; i<rowEnd[r]; i++)
491  assert(cols[i] > cols[i-1]);
492 #endif
493  return index;
494 
495 }
496 
497 template<typename T>
498 void GPUSparseMatrix<T>::copyToNewSize(size_t newSize, size_t maxCol)
499 {
500  PROFILE_GPUMATRIX(GPUSparseMatrix_copyToNewSize);
501  /*UG_LOG("copyToNewSize: from " << values.size() << " to " << newSize << "\n");
502  UG_LOG("sizes are " << cols.size() << " and " << values.size() << ", ");
503  UG_LOG(reset_floats << "capacities are " << cols.capacity() << " and " << values.capacity() << ", NNZ = " << nnz << ", fragmentation = " <<
504  (1-((double)nnz)/cols.size())*100.0 << "%\n");
505  if(newSize == nnz) { UG_LOG("Defragmenting to NNZ."); }*/
506  if( (iIterators > 0)
507  || (newSize > values.size() && (100.0*nnz)/newSize < 20 && newSize <= cols.capacity()) )
508  {
509  UG_ASSERT(newSize > values.size(), "no nnz-defragmenting while using iterators.");
510  //UG_LOG("copyToNew not defragmenting because of iterators or low fragmentation.\n");}
511  cols.resize(newSize);
512  cols.resize(cols.capacity());
513  if(bNeedsValues) { values.resize(newSize); values.resize(cols.size()); }
514  return;
515  }
516 
517  std::vector<value_type> v(newSize);
518  std::vector<int> c(newSize);
519  size_t j=0;
520  for(size_t r=0; r<num_rows(); r++)
521  {
522  if(rowStart[r] == -1)
523  rowStart[r] = rowEnd[r] = rowMax[r] = j;
524  else
525  {
526  size_t start=j;
527  for(int k=rowStart[r]; k<rowEnd[r]; k++)
528  {
529  if(cols[k] < (int)maxCol)
530  {
531  if(bNeedsValues) v[j] = values[k];
532  c[j] = cols[k];
533  j++;
534  }
535  }
536  rowStart[r] = start;
537  rowEnd[r] = rowMax[r] = j;
538  }
539  }
540  rowStart[num_rows()] = rowEnd[num_rows()-1];
541  fragmented = 0;
542  maxValues = j;
543  if(bNeedsValues) std::swap(values, v);
544  std::swap(cols, c);
545 }
546 
547 template<typename T>
549 {
550  if((double)nnz/(double)maxValues < 0.9)
551  (const_cast<this_type*>(this))->defragment();
552 }
553 
554 template<typename T>
556 {
557  if(s <= cols.size()) return;
558  size_t newSize = nnz*2;
559  if(newSize < s) newSize = s;
560  copyToNewSize(newSize);
561 
562 }
563 
564 template<typename T>
566 {
567  int j=0;
568  for(size_t r=0; r<num_rows(); r++)
569  {
570  if(rowStart[r] == -1) continue;
571  for(int k=rowStart[r]; k<rowEnd[r]; k++)
572  if(cols[k] < (int)maxCols)
573  j++;
574  }
575  return j;
576 }
577 
578 } // namespace ug
579 
580 #endif
581 
parameterString s
Definition: connection.h:40
size_t iIndex
Definition: connection.h:42
static cusparseHandle_t get_cusparseHandle()
Definition: cuda_manager.h:102
Definition: gpusparsematrix.h:400
Definition: gpusparsematrix.h:383
sparse matrix for big, variable sparse matrices.
Definition: gpusparsematrix.h:102
bool is_isolated(size_t i) const
check for isolated condition of an index
Definition: gpusparsematrix_impl.h:218
bool resize_and_keep_values(size_t newRows, size_t newCols)
Definition: gpusparsematrix_impl.h:84
row_iterator end_row(size_t r)
Definition: gpusparsematrix.h:421
void set(const M &mat)
set local matrix
Definition: gpusparsematrix_impl.h:320
bool set_as_transpose_of(const GPUSparseMatrix< value_type > &B, double scale=1.0)
write in a empty GPUSparseMatrix (this) the transpose GPUSparseMatrix of B.
Definition: gpusparsematrix_impl.h:113
void add_matrix_row(size_t row, connection *c, size_t nr)
Definition: gpusparsematrix_impl.h:264
row_iterator begin_row(size_t r)
Definition: gpusparsematrix.h:420
size_t num_rows() const
returns number of rows
Definition: gpusparsematrix.h:337
void assureValuesSize(size_t s)
Definition: gpusparsematrix_impl.h:555
int get_nnz_max_cols(size_t maxCols)
Definition: gpusparsematrix_impl.h:565
int get_index_internal(size_t row, int col) const
Definition: gpusparsematrix_impl.h:350
bool set_as_copy_of(const GPUSparseMatrix< value_type > &B, double scale=1.0)
create/recreate this as a copy of GPUSparseMatrix B
Definition: gpusparsematrix_impl.h:273
void set_matrix_row(size_t row, connection *c, size_t nr)
Definition: gpusparsematrix_impl.h:230
bool axpy(vector_t &dest, const number &alpha1, const vector_t &v1, const number &beta1, const vector_t &w1) const
calculate dest = alpha1*v1 + beta1*A*w1 (A = this matrix)
Definition: gpusparsematrix_impl.h:170
bool scale(double d)
Definition: gpusparsematrix_impl.h:286
int get_index_const(int r, int c) const
Definition: gpusparsematrix_impl.h:381
void check_fragmentation() const
Definition: gpusparsematrix_impl.h:548
void add(const M &mat)
Definition: gpusparsematrix_impl.h:304
void mat_mult_add_row(size_t row, typename vector_t::value_type &dest, double alpha, const vector_t &v) const
calculates dest += alpha * A[row, .] v;
Definition: gpusparsematrix_impl.h:149
void copyToNewSize(size_t newSize)
Definition: gpusparsematrix.h:564
void get(M &mat) const
get local matrix
Definition: gpusparsematrix_impl.h:335
bool axpy_transposed(vector_t &dest, const number &alpha1, const vector_t &v1, const number &beta1, const vector_t &w1) const
calculate dest = alpha1*v1 + beta1*A^T*w1 (A = this matrix)
Definition: gpusparsematrix_impl.h:190
GPUSparseMatrix()
constructor for empty GPUSparseMatrix
Definition: gpusparsematrix_impl.h:52
bool resize_and_clear(size_t newRows, size_t newCols)
resizes the GPUSparseMatrix
Definition: gpusparsematrix_impl.h:66
size_t num_cols() const
returns the number of cols
Definition: gpusparsematrix.h:340
int get_index(int r, int c)
Definition: gpusparsematrix_impl.h:393
#define PROFILE_GPUMATRIX(name)
Definition: gpusparsematrix.h:52
number alpha
#define UG_ASSERT(expr, msg)
Definition: assert.h:70
#define UG_LOG(msg)
Definition: log.h:367
double number
Definition: types.h:124
the ug namespace
bool MatMultAdd(vector_t &dest, const number &alpha1, const vector_t &v1, const number &beta1, const matrix_t &A1, const vector_t &w1)
calculates dest = alpha1*v1 + beta1 * A1 *w1;
Definition: operations_mat.h:68
T value_type
Definition: sparsematrix_interface.h:2
size_t num_rows() const
Definition: sparsematrix_interface.h:38
const_row_iterator end_row(size_t row) const
value_type & operator()(size_t r, size_t c)
const_row_iterator begin_row(size_t row) const
size_t num_cols() const
Definition: sparsematrix_interface.h:39