Guest User

Untitled

a guest
Feb 19th, 2018
66
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.75 KB | None | 0 0
  1. - (Old) Loopy Kernel:
  2.  
  3. ```c++
  4. __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) tsfc_kernel(__global double *__restrict__ A_0, __global double const *__restrict__ coords, __global double const *__restrict__ w_0){
  5. double acc_i12;
  6. double cse;
  7. double cse_0;
  8.  
  9. cse_0 = -1.0 * coords[1];
  10. cse = -1.0 * coords[0];
  11. for (int i1 = 0; i1 <= 2; ++i1){
  12. acc_i12 = 0.0;
  13. for (int i12 = 0; i12 <= 2; ++i12)
  14. acc_i12 = acc_i12 + cnst[3 * i12 + i1] * (cnst[3 * i12 + 2] * w_0[2] + cnst[3 * i12] * w_0[0] + cnst[3 * i12 + 1] * w_0[1]) * cnst_0[i12] * fabs((cse + coords[2]) * (cse_0 + coords[5]) + -1.0 * (cse + coords[4]) * (cse_0 + coords[3]));
  15. A_0[i1] = acc_i12;
  16. }
  17. }
  18. ```
  19. - MatFree kernel:
  20.  
  21. ```c++
  /*
   * Accumulates a 3x3 cell contribution into A.
   *
   * For every (j, k) in 0..2 x 0..2 this adds
   *
   *   sum over ip in 0..2 of  tab[ip][j] * tab[ip][k] * wts[ip] * abs_det
   *
   * to A[j][k], where abs_det is the absolute value of
   *
   *   (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0)
   *
   * with (xi, yi) = (coords[i][0], coords[i][1]).
   *
   * A      - in/out; entries are incremented, not overwritten, so the
   *          caller decides the starting values.
   * coords - three pointers, each to at least two doubles.
   *
   * NOTE(review): tab/wts look like a tabulated basis and quadrature weights
   * for a 3-point rule - confirm against the generator.
   */
  static inline void form00_cell_integral_otherwise (double A[3][3] , const double *const restrict *restrict coords ){
    /* Rows are declared 4 wide but only 3 entries are given; the last column
       is zero-initialised and never read. */
    static const double tab[3][4] = {
      {0.666666666666667, 0.166666666666667, 0.166666666666667},
      {0.166666666666667, 0.166666666666667, 0.666666666666667},
      {0.166666666666667, 0.666666666666667, 0.166666666666667}
    };
    /* Same layout: 4 slots, 3 used. */
    static const double wts[4] = {0.166666666666667, 0.166666666666667, 0.166666666666667};

    const double neg_x0 = (-1 * coords[0][0]);
    const double neg_y0 = (-1 * coords[0][1]);
    const double main_diag = (neg_x0 + coords[1][0]) * (neg_y0 + coords[2][1]);
    const double anti_diag = (neg_x0 + coords[2][0]) * (neg_y0 + coords[1][1]);
    const double abs_det = fabs(main_diag + (-1 * anti_diag));

    int ip = 0;
    while (ip < 3) {
      const double scaled_wt = (wts[ip] * abs_det);
      int j = 0;
      while (j < 3) {
        const double left = (tab[ip][j] * scaled_wt);
        for (int k = 0; k < 3; ++k) {
          #pragma coffee expression
          A[j][k] += tab[ip][k] * left;
        }
        ++j;
      }
      ++ip;
    }
  }
  42. ```
  43. - New Loopy kernel, after taking into account the extra CSEs (common subexpressions) that we were previously dropping:
  44.  
  45. ```c++
  46. __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) tsfc_kernel(__global double *__restrict__ A_0, __global double const *__restrict__ coords, __global double const *__restrict__ w_0)
  47. {
  48. double acc_i12;
  49. double cse;
  50. double cse_0;
  51. double cse_1;
  52. double cse_2[3];
  53. double cse_3[3];
  54. double cse_4[3];
  55. double cse_5[3];
  56. double cse_6[3];
  57. double cse_7[3];
  58. double cse_8[3];
  59. double cse_9[3 * 3];
  60.  
  61. cse_0 = -1.0 * coords[1];
  62. cse = -1.0 * coords[0];
  63. cse_1 = fabs((cse + coords[2]) * (cse_0 + coords[5]) + -1.0 * (cse + coords[4]) * (cse_0 + coords[3]));
  64. for (int i1 = 0; i1 <= 2; ++i1)
  65. {
  66. acc_i12 = 0.0;
  67. for (int i12 = 0; i12 <= 2; ++i12)
  68. {
  69. cse_6[i12] = cnst[3 * i12 + 2] * w_0[2];
  70. cse_4[i12] = cnst[3 * i12 + 1] * w_0[1];
  71. cse_3[i12] = cnst[3 * i12] * w_0[0];
  72. cse_5[i12] = cse_3 + cse_4;
  73. cse_7[i12] = cse_5 + cse_6;
  74. cse_2[i12] = cnst_0[i12] * cse_1;
  75. cse_8[i12] = cse_2 * cse_7;
  76. cse_9[3 * i12 + i1] = cnst[3 * i12 + i1] * cse_8;
  77. acc_i12 = acc_i12 + cse_9;
  78. }
  79. A_0[i1] = acc_i12;
  80. }
  81. }
  82. ```
Add Comment
Please, Sign In to add comment