Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
kernel.h
#ifndef __KERNEL_H__
#define __KERNEL_H__

#include "../sparse_formats/csr.h"
namespace CTF{
  #ifdef __CUDACC__
  #define NBLK 15
  #define NTRD 512
  template<typename dtype_A, typename dtype_B, typename dtype_C, dtype_C(*f)(dtype_A, dtype_B), void(*g)(dtype_C, dtype_C&)>
  __global__ void cuda_gemmf(char            tA,
                             char            tB,
                             int             m,
                             int             n,
                             int             k,
                             dtype_A const * A,
                             dtype_B const * B,
                             dtype_C *       C){
    int bidx = blockIdx.x;
    int tidx = threadIdx.x;
    // strides between consecutive rows/columns depend on transposition
    int lda_A_m = tA == 'N' ? 1 : k;
    int lda_A_k = tA == 'N' ? m : 1;
    int lda_B_k = tB == 'N' ? 1 : n;
    int lda_B_n = tB == 'N' ? k : 1;
    // grid-stride loops: blocks stride over rows of C, threads over columns
    for (int mi=bidx; mi<m; mi+=NBLK){
      for (int ni=tidx; ni<n; ni+=NTRD){
        for (int ki=0; ki<k; ki++){
          g(f(A[mi*lda_A_m+ki*lda_A_k],
              B[ki*lda_B_k+ni*lda_B_n]),
            C[mi+ni*m]);
        }
      }
    }
  }
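  // Launch sketch (illustrative, not part of the file): cuda_gemmf applies f
  // elementwise and folds results into C with g, mirroring how offload_gemm
  // below launches it. The element functions mul/add and the device buffers
  // dA/dB/dC are hypothetical names.
  //
  //   __device__ double mul(double a, double b){ return a*b; }
  //   __device__ void   add(double c, double & d){ d += c; }
  //   // dA: m*k, dB: k*n, dC: m*n device allocations
  //   cuda_gemmf<double,double,double,mul,add><<<NBLK,NTRD>>>
  //              ('N','N',m,n,k,dA,dB,dC);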

  template<typename dtype_A, typename dtype_B, typename dtype_C, dtype_C(*f)(dtype_A, dtype_B), void(*g)(dtype_C, dtype_C&)>
  __device__
  void cuda_csrmmf(int             m,
                   int             n,
                   int             k,
                   dtype_A const * A,
                   int const *     JA,
                   int const *     IA,
                   dtype_B const * B,
                   dtype_C *       C){
    int bidx = blockIdx.x;
    int tidx = threadIdx.x;
    // blocks stride over columns of B, threads over rows of A
    for (int col_B=bidx; col_B<n; col_B+=NBLK){
      for (int row_A=tidx; row_A<m; row_A+=NTRD){
        for (int i_A=IA[row_A]-1; i_A<IA[row_A+1]-1; i_A++){
          int col_A = JA[i_A]-1;
          g(f(A[i_A],B[col_B*k+col_A]),C[col_B*m+row_A]);
        }
      }
    }
  }
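  // Note: IA and JA use 1-based (Fortran/MKL-style) CSR indexing, hence the
  // -1 adjustments above. Illustrative example (not from the file): the 2x2
  // matrix [[5,0],[0,7]] is stored as
  //   A  = {5, 7}      values
  //   JA = {1, 2}      1-based column index of each value
  //   IA = {1, 2, 3}   row r occupies entries A[IA[r]-1 .. IA[r+1]-2]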

  //FIXME there is code replication here with ../sparse_formats/csr.cxx
  #define ALIGN 256

  template<typename dtype_A, typename dtype_B, typename dtype_C, dtype_C(*f)(dtype_A, dtype_B), void(*g)(dtype_C, dtype_C&)>
  __global__
  void offload_csrmm(int             m,
                     int             n,
                     int             k,
                     char const *    all_data,
                     dtype_B const * B,
                     dtype_C *       C){
    // unpack the serialized CSR matrix: header, values, row offsets, then
    // column indices, with each section aligned to an ALIGN-byte boundary
    int64_t nnz_A = ((int64_t*)all_data)[0];
    int64_t offset = 3*sizeof(int64_t);
    if (offset % ALIGN != 0) offset += ALIGN-(offset%ALIGN);
    dtype_A const * A = (dtype_A const *)(all_data + offset);
    offset += nnz_A*sizeof(dtype_A);
    if (offset % ALIGN != 0) offset += ALIGN-(offset%ALIGN);
    int const * IA = (int*)(all_data + offset);
    offset += (m+1)*sizeof(int);
    if (offset % ALIGN != 0) offset += ALIGN-(offset%ALIGN);
    int const * JA = (int*)(all_data + offset);
    cuda_csrmmf<dtype_A,dtype_B,dtype_C,f,g>(m,n,k,A,JA,IA,B,C);
  }
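  // Host-side layout sketch (illustrative): a packer for all_data would
  // compute the same offsets as the unpacking above. That the 24-byte header
  // holds nnz followed by two more int64_t fields is an assumption read off
  // the 3*sizeof(int64_t) skip; the real serialization is owned by
  // CTF_int::CSR_Matrix.
  //
  //   int64_t off = 3*sizeof(int64_t);               // header (nnz, ...)
  //   if (off % ALIGN) off += ALIGN - off%ALIGN;
  //   int64_t off_vals = off;                        // nnz_A values
  //   off += nnz_A*sizeof(dtype_A);
  //   if (off % ALIGN) off += ALIGN - off%ALIGN;
  //   int64_t off_IA = off;                          // m+1 row offsets
  //   off += (m+1)*sizeof(int);
  //   if (off % ALIGN) off += ALIGN - off%ALIGN;
  //   int64_t off_JA = off;                          // nnz_A column indices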

  #undef ALIGN

  #endif

  template<typename dtype>
  #ifdef __CUDACC__
  __device__ __host__
  #endif
  void default_monoid(dtype a, dtype & b){ b = a+b; }

  template<typename dtype=double, void(*g)(dtype, dtype&)=default_monoid<dtype> >
  class Monoid_Kernel : public CTF_int::accumulatable {
    public:
    static MPI_Op get_MPI_Op(){
      MPI_Op moo;

      //FIXME: assumes monoid is commutative
      MPI_Op_create(
        [](void * a, void * b, int * n, MPI_Datatype*){
          for (int i=0; i<*n; i++){
            g(((dtype*)a)[i], ((dtype*)b)[i]);
          }
        },
        1, &moo);

      return moo;
    }
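    // Usage sketch (illustrative, not part of the file): the returned op
    // reduces arrays of dtype with g; the MPI datatype passed to the
    // reduction (here MPI_DOUBLE) must match dtype, and the op should be
    // freed when no longer needed.
    //
    //   MPI_Op op = CTF::Monoid_Kernel<double>::get_MPI_Op();
    //   MPI_Allreduce(MPI_IN_PLACE, buf, len, MPI_DOUBLE, op, MPI_COMM_WORLD);
    //   MPI_Op_free(&op);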

    Monoid_Kernel(){
      this->el_size = sizeof(dtype);
    }

    // b += a
    void accum(char const * a,
               char * b) const {
      g(((dtype const *)a)[0], ((dtype *)b)[0]);
    }

    static void xpy(int n,
                    dtype const * X,
                    int incX,
                    dtype * Y,
                    int incY){
      for (int i=0; i<n; i++){
        g(X[incX*i],Y[incY*i]);
      }
    }

    // initialize n objects to the default value of dtype
    virtual void init_shell(int64_t n, char * arr) const {
      dtype dummy = dtype();
      for (int i=0; i<n; i++){
        memcpy(arr+i*el_size,(char*)&dummy,el_size);
      }
    }
  };
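  // Illustrative instantiation (not part of the file): any associative
  // accumulator with signature void(dtype, dtype&) can replace the default
  // sum, e.g. a max monoid:
  //
  //   void max_monoid(double a, double & b){ if (a > b) b = a; }
  //   CTF::Monoid_Kernel<double, max_monoid> mk;  // mk.accum now takes a max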


  template<typename dtype_A, typename dtype_B, typename dtype_C, dtype_C(*f)(dtype_A, dtype_B), void(*g)(dtype_C, dtype_C&)=default_monoid<dtype_C> >
  class Bivar_Kernel : public Monoid_Kernel<dtype_C, g>, public Bivar_Function<dtype_A, dtype_B, dtype_C> {
    public:
    Bivar_Kernel() : Bivar_Function<dtype_A, dtype_B, dtype_C>(f) {
      this->has_kernel = true;
#ifdef __CUDACC__
      this->has_off_gemm = true;
#endif
      this->el_size = sizeof(dtype_C);
    }

    Bivar_Kernel(bool is_comm) : Bivar_Function<dtype_A, dtype_B, dtype_C>(f, is_comm) {
      this->has_kernel = true;
#ifdef __CUDACC__
      this->has_off_gemm = true;
#endif
    }

    static void gemm(char            tA,
                     char            tB,
                     int             m,
                     int             n,
                     int             k,
                     dtype_A const * A,
                     dtype_B const * B,
                     dtype_C *       C){
      int lda_A_m = tA == 'N' ? 1 : k;
      int lda_A_k = tA == 'N' ? m : 1;
      int lda_B_k = tB == 'N' ? 1 : n;
      int lda_B_n = tB == 'N' ? k : 1;
#ifdef _OPENMP
      #pragma omp parallel for
#endif
      for (int mi=0; mi<m; mi++){
#ifdef _OPENMP
        // nested region: inert unless nested parallelism is enabled
        #pragma omp parallel for
#endif
        for (int ni=0; ni<n; ni++){
          for (int ki=0; ki<k; ki++){
            g(f(A[mi*lda_A_m+ki*lda_A_k],
                B[ki*lda_B_k+ni*lda_B_n]),
              C[mi+ni*m]);
          }
        }
      }
    }
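    // Example call (illustrative, not from the file): with 'N','N' the
    // operands are read as column-major, A as m x k and B as k x n, and C is
    // accumulated as a column-major m x n matrix. mul is a hypothetical
    // name; g defaults to the sum monoid.
    //
    //   double mul(double a, double b){ return a*b; }
    //   ...
    //   CTF::Bivar_Kernel<double,double,double,mul>::gemm('N','N',m,n,k,A,B,C);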

    static void coomm(int             m,
                      int             n,
                      int             k,
                      dtype_A const * A,
                      int const *     rows_A,
                      int const *     cols_A,
                      int             nnz_A,
                      dtype_B const * B,
                      dtype_C *       C){
      //TAU_FSTART(default_fcoomm);
      for (int i=0; i<nnz_A; i++){
        int row_A = rows_A[i]-1;
        int col_A = cols_A[i]-1;
        for (int col_C=0; col_C<n; col_C++){
          g(f(A[i],B[col_C*k+col_A]),C[col_C*m+row_A]);
        }
      }
      //TAU_FSTOP(default_fcoomm);
    }
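    // As in the CSR routines, rows_A and cols_A are 1-based. Illustrative
    // example (not from the file): a nonzero at row 0, column 1 with value
    // 3.0 is stored as A[i] = 3.0, rows_A[i] = 1, cols_A[i] = 2.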

    void ccoomm(int          m,
                int          n,
                int          k,
                char const * A,
                int const *  rows_A,
                int const *  cols_A,
                int64_t      nnz_A,
                char const * B,
                char *       C) const {
      coomm(m, n, k, (dtype_A const *)A, rows_A, cols_A, nnz_A,
            (dtype_B const *)B, (dtype_C *)C);
    }

    static void csrmm(int             m,
                      int             n,
                      int             k,
                      dtype_A const * A,
                      int const *     JA,
                      int const *     IA,
                      int64_t         nnz_A,
                      dtype_B const * B,
                      dtype_C *       C){
      //TAU_FSTART(3type_csrmm);
#ifdef _OPENMP
      #pragma omp parallel for
#endif
      for (int row_A=0; row_A<m; row_A++){
#ifdef _OPENMP
        #pragma omp parallel for
#endif
        for (int col_B=0; col_B<n; col_B++){
          for (int i_A=IA[row_A]-1; i_A<IA[row_A+1]-1; i_A++){
            int col_A = JA[i_A]-1;
            g(f(A[i_A],B[col_B*k+col_A]),C[col_B*m+row_A]);
          }
        }
      }
      //TAU_FSTOP(3type_csrmm);
    }

    void cgemm(char         tA,
               char         tB,
               int          m,
               int          n,
               int          k,
               char const * A,
               char const * B,
               char *       C) const {
      gemm(tA, tB, m, n, k,
           (dtype_A const *)A, (dtype_B const *)B, (dtype_C *)C);
    }

    // FIXME: below kernels replicate code from src/interface/semiring.h

    static void csrmultd
                 (int             m,
                  int             n,
                  int             k,
                  dtype_A const * A,
                  int const *     JA,
                  int const *     IA,
                  int             nnz_A,
                  dtype_B const * B,
                  int const *     JB,
                  int const *     IB,
                  int             nnz_B,
                  dtype_C *       C){
#ifdef _OPENMP
      #pragma omp parallel for
#endif
      for (int row_A=0; row_A<m; row_A++){
        for (int i_A=IA[row_A]-1; i_A<IA[row_A+1]-1; i_A++){
          int row_B = JA[i_A]-1; //=col_A
          for (int i_B=IB[row_B]-1; i_B<IB[row_B+1]-1; i_B++){
            int col_B = JB[i_B]-1;
            g(f(A[i_A],B[i_B]),C[col_B*m+row_A]);
          }
        }
      }
    }
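    // csrmultd accumulates the sparse-times-sparse product directly into a
    // dense column-major m x n C, whereas csrmultcsr below produces a sparse
    // (CSR-serialized) result.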


    void csrmultcsr_old
                 (int             m,
                  int             n,
                  int             k,
                  dtype_A const * A,
                  int const *     JA,
                  int const *     IA,
                  int             nnz_A,
                  dtype_B const * B,
                  int const *     JB,
                  int const *     IB,
                  int             nnz_B,
                  char *&         C_CSR) const {
      int * IC = (int*)CTF_int::alloc(sizeof(int)*(m+1));
      int * has_col = (int*)CTF_int::alloc(sizeof(int)*n);
      IC[0] = 1;
      // first pass: count the distinct nonzero columns of each output row
      for (int i=0; i<m; i++){
        memset(has_col, 0, sizeof(int)*n);
        IC[i+1] = IC[i];
        CTF_int::CSR_Matrix::compute_has_col(JA, IA, JB, IB, i, has_col);
        for (int j=0; j<n; j++){
          IC[i+1] += has_col[j];
        }
      }
      CTF_int::CSR_Matrix C(IC[m]-1, m, n, this);
      dtype_C * vC = (dtype_C*)C.vals();
      int * JC = C.JA();
      memcpy(C.IA(), IC, sizeof(int)*(m+1));
      CTF_int::cdealloc(IC);
      IC = C.IA();
      // second pass: fill in column indices and accumulate values
      int64_t * rev_col = (int64_t*)CTF_int::alloc(sizeof(int64_t)*n);
      for (int i=0; i<m; i++){
        memset(has_col, 0, sizeof(int)*n);
        CTF_int::CSR_Matrix::compute_has_col(JA, IA, JB, IB, i, has_col);
        int vs = 0;
        for (int j=0; j<n; j++){
          if (has_col[j]){
            JC[IC[i]+vs-1] = j+1;
            rev_col[j] = IC[i]+vs-1;
            vs++;
          }
        }
        memset(has_col, 0, sizeof(int)*n);
        for (int j=0; j<IA[i+1]-IA[i]; j++){
          int row_B = JA[IA[i]+j-1]-1;
          int idx_A = IA[i]+j-1;
          for (int l=0; l<IB[row_B+1]-IB[row_B]; l++){
            int idx_B = IB[row_B]+l-1;
            if (has_col[JB[idx_B]-1])
              g(f(A[idx_A],B[idx_B]), vC[rev_col[JB[idx_B]-1]]);
            else
              vC[rev_col[JB[idx_B]-1]] = f(A[idx_A],B[idx_B]);
            has_col[JB[idx_B]-1] = 1;
          }
        }
      }
      CTF_int::CSR_Matrix C_in(C_CSR);
      if (C_CSR == NULL || C_in.nnz() == 0){
        C_CSR = C.all_data;
      } else {
        char * ans = CTF_int::CSR_Matrix::csr_add(C_CSR, C.all_data, this);
        // free the old serialized C, superseded by the sum
        CTF_int::cdealloc(C_CSR);
        C_CSR = ans;
      }
      CTF_int::cdealloc(has_col);
      CTF_int::cdealloc(rev_col);
    }

    void csrmultcsr
                 (int             m,
                  int             n,
                  int             k,
                  dtype_A const * A,  // A is m by k
                  int const *     JA,
                  int const *     IA,
                  int             nnz_A,
                  dtype_B const * B,  // B is k by n
                  int const *     JB,
                  int const *     IB,
                  int             nnz_B,
                  char *&         C_CSR) const {
      int * IC = (int*)CTF_int::alloc(sizeof(int)*(m+1));
      memset(IC, 0, sizeof(int)*(m+1));
#ifdef _OPENMP
      #pragma omp parallel
      {
#endif
      int * has_col = (int*)CTF_int::alloc(sizeof(int)*(n+1)); // n is the number of columns of B
      int nnz = 0;
#ifdef _OPENMP
      #pragma omp for schedule(dynamic) // TODO: test other schedules
#endif
      // first pass: count the distinct nonzero columns of each output row
      for (int i=0; i<m; i++){
        memset(has_col, 0, sizeof(int)*(n+1));
        nnz = 0;
        for (int j=0; j<IA[i+1]-IA[i]; j++){
          int row_B = JA[IA[i]+j-1]-1;
          for (int kk=0; kk<IB[row_B+1]-IB[row_B]; kk++){
            int idx_B = IB[row_B]+kk-1;
            if (has_col[JB[idx_B]] == 0){
              nnz++;
              has_col[JB[idx_B]] = 1;
            }
          }
          IC[i+1] = nnz;
        }
      }
      CTF_int::cdealloc(has_col);
#ifdef _OPENMP
      } // END PARALLEL
#endif
      // prefix sum converts per-row counts into 1-based row offsets
      int ic_prev = 1;
      for (int i=0; i<m+1; i++){
        ic_prev += IC[i];
        IC[i] = ic_prev;
      }
      CTF_int::CSR_Matrix C(IC[m]-1, m, n, this);
      dtype_C * vC = (dtype_C*)C.vals();
      int * JC = C.JA();
      memcpy(C.IA(), IC, sizeof(int)*(m+1));
      CTF_int::cdealloc(IC);
      IC = C.IA();
#ifdef _OPENMP
      #pragma omp parallel
      {
#endif
      int ins = 0;
      int * dcol = (int *) CTF_int::alloc(n*sizeof(int));
      dtype_C * acc_data = new dtype_C[n];
#ifdef _OPENMP
      #pragma omp for
#endif
      // second pass: accumulate each output row densely, then compact it
      for (int i=0; i<m; i++){
        memset(dcol, 0, sizeof(int)*(n));
        ins = 0;
        for (int j=0; j<IA[i+1]-IA[i]; j++){
          int row_b = JA[IA[i]+j-1]-1; // JA is 1-based
          int idx_a = IA[i]+j-1;
          for (int ii = 0; ii < IB[row_b+1]-IB[row_b]; ii++){
            int col_b = IB[row_b]+ii-1;
            int col_c = JB[col_b]-1; // JB is 1-based
            if (dcol[col_c] == 0){
              dcol[col_c] = JB[col_b];
              acc_data[col_c] = f(A[idx_a],B[col_b]);
            } else {
              g(f(A[idx_a],B[col_b]), acc_data[col_c]);
            }
          }
        }
        for (int jj = 0; jj < n; jj++){
          if (dcol[jj] != 0){
            JC[IC[i]+ins-1] = dcol[jj];
            vC[IC[i]+ins-1] = acc_data[jj];
            ++ins;
          }
        }
      }
      CTF_int::cdealloc(dcol);
      delete [] acc_data;
#ifdef _OPENMP
      } // END PARALLEL
#endif
      CTF_int::CSR_Matrix C_in(C_CSR);
      if (C_CSR == NULL || C_in.nnz() == 0){
        C_CSR = C.all_data;
      } else {
        char * ans = CTF_int::CSR_Matrix::csr_add(C_CSR, C.all_data, this);
        // free the old serialized C, superseded by the sum
        CTF_int::cdealloc(C_CSR);
        C_CSR = ans;
      }
    }
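    // The two passes above form a common SpGEMM pattern: the first pass
    // sizes each output row so the CSR buffer can be allocated exactly, and
    // the second recomputes the products and writes them in place. The dense
    // scratch arrays dcol and acc_data trade O(n) memory per thread for O(1)
    // accumulation per product.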


    void ccsrmultd
                 (int          m,
                  int          n,
                  int          k,
                  char const * A,
                  int const *  JA,
                  int const *  IA,
                  int          nnz_A,
                  char const * B,
                  int const *  JB,
                  int const *  IB,
                  int          nnz_B,
                  char *       C,
                  CTF_int::algstrct const * sr_C) const {
      csrmultd(m,n,k,(dtype_A const *)A,JA,IA,nnz_A,(dtype_B const *)B,JB,IB,nnz_B,(dtype_C *)C);
    }

    void ccsrmultcsr
                 (int          m,
                  int          n,
                  int          k,
                  char const * A,
                  int const *  JA,
                  int const *  IA,
                  int          nnz_A,
                  char const * B,
                  int const *  JB,
                  int const *  IB,
                  int          nnz_B,
                  char *&      C_CSR,
                  CTF_int::algstrct const * sr_C) const {
      csrmultcsr(m,n,k,(dtype_A const *)A,JA,IA,nnz_A,(dtype_B const *)B, JB, IB, nnz_B, C_CSR);
    }

    void ccsrmm(int          m,
                int          n,
                int          k,
                char const * A,
                int const *  JA,
                int const *  IA,
                int64_t      nnz_A,
                char const * B,
                char *       C,
                CTF_int::algstrct const * sr_C) const {
      csrmm(m,n,k,(dtype_A const *)A,JA,IA,nnz_A,(dtype_B const *)B, (dtype_C *)C);
    }


    static void offload_gemm(char            tA,
                             char            tB,
                             int             m,
                             int             n,
                             int             k,
                             dtype_A const * A,
                             dtype_B const * B,
                             dtype_C *       C){
#ifdef __CUDACC__
#ifdef PROFILE_CUGEMM
      //TAU_FSTART(3type_cugemm);
#endif
      cuda_gemmf<dtype_A,dtype_B,dtype_C,f,g><<<NBLK,NTRD>>>(tA, tB, m, n, k, A, B, C);
#ifdef PROFILE_CUGEMM
      cudaDeviceSynchronize();
      //TAU_FSTOP(3type_cugemm);
#endif
#else
      assert(0);
#endif
    }

    void coffload_gemm(char         tA,
                       char         tB,
                       int          m,
                       int          n,
                       int          k,
                       char const * A,
                       char const * B,
                       char *       C) const {
      offload_gemm(tA, tB, m, n, k, (dtype_A const *)A, (dtype_B const *)B, (dtype_C*)C);
    }

/*
    void coffload_csrmm(int          m,
                        int          n,
                        int          k,
                        char const * A,
                        int const *  JA,
                        int const *  IA,
                        int64_t      nnz_A,
                        char const * B,
                        char *       C) const {
      offload_csrmm(m, n, k, (dtype_A const *)A, JA, IA, nnz_A, (dtype_B const *)B, (dtype_C*)C);
    }*/

    void coffload_csrmm(int          m,
                        int          n,
                        int          k,
                        char const * all_data,
                        char const * B,
                        char *       C) const {
#ifdef __CUDACC__
#ifdef PROFILE_CUGEMM
      //TAU_FSTART(3type_cucsrmm);
#endif
      offload_csrmm<dtype_A,dtype_B,dtype_C,f,g><<<NBLK,NTRD>>>(m, n, k, all_data, (dtype_B const *)B, (dtype_C *)C);
#ifdef PROFILE_CUGEMM
      cudaDeviceSynchronize();
      //TAU_FSTOP(3type_cucsrmm);
#endif
#else
      assert(0);
#endif
    }



/*  static void axpy(int             n,
                     dtype_C         alpha,
                     dtype_C const * X,
                     int             incX,
                     dtype_C *       Y,
                     int             incY){

      for (int i=0; i<n; i++){
        g(f(alpha,X[incX*i]),Y[incY*i]);
      }
    }*/
  };
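
  // End-to-end usage sketch (illustrative; assumes the elementwise call
  // syntax that Bivar_Function provides in functions.h, an initialized MPI
  // world w, and hypothetical element functions mul/add):
  //
  //   double mul(double a, double b){ return a*b; }
  //   void   add(double c, double & d){ d += c; }
  //   ...
  //   CTF::Bivar_Kernel<double,double,double,mul,add> k;
  //   CTF::Matrix<double> A(n,n,w), B(n,n,w), C(n,n,w);
  //   C["ij"] += k(A["ik"],B["kj"]);  // contraction evaluated with f=mul, g=add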

}
#endif