Advertisement
Guest User

Untitled

a guest
Feb 29th, 2016
109
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 61.14 KB | None | 0 0
  1. User@User-PC ~/PyFR/euler_vortex_2d-1.3Test
  2. $ export PYFR_DEBUG_OMP_KEEP_LIBS=1
  3. (env4)
  4. User@User-PC ~/PyFR/euler_vortex_2d-1.3Test
  5. $ pyfr run -b openmp -p euler_vortex_2d.pyfrm euler_vortex_2d.ini
  6.  
  7. <pyfr.util.memoize object at 0x6fffe85c278>
  8. --self
  9. <pyfr.solvers.euler.elements.EulerElements object at 0x6fffe567630> upts
  10. --args
  11.  
  12. --kwargs
  13.  
  14. res cache key bad
  15. [[[-9.93056816 -8.93056816 -7.93056816 ..., 7.06943184 8.06943184
  16. 9.06943184]
  17. [-9.93056816 -9.93056816 -9.93056816 ..., 9.06943184 9.06943184
  18. 9.06943184]]
  19.  
  20. [[-9.66999052 -8.66999052 -7.66999052 ..., 7.33000948 8.33000948
  21. 9.33000948]
  22. [-9.93056816 -9.93056816 -9.93056816 ..., 9.06943184 9.06943184
  23. 9.06943184]]
  24.  
  25. [[-9.33000948 -8.33000948 -7.33000948 ..., 7.66999052 8.66999052
  26. 9.66999052]
  27. [-9.93056816 -9.93056816 -9.93056816 ..., 9.06943184 9.06943184
  28. 9.06943184]]
  29.  
  30. ...,
  31. [[-9.66999052 -8.66999052 -7.66999052 ..., 7.33000948 8.33000948
  32. 9.33000948]
  33. [-9.06943184 -9.06943184 -9.06943184 ..., 9.93056816 9.93056816
  34. 9.93056816]]
  35.  
  36. [[-9.33000948 -8.33000948 -7.33000948 ..., 7.66999052 8.66999052
  37. 9.66999052]
  38. [-9.06943184 -9.06943184 -9.06943184 ..., 9.93056816 9.93056816
  39. 9.93056816]]
  40.  
  41. [[-9.06943184 -8.06943184 -7.06943184 ..., 7.93056816 8.93056816
  42. 9.93056816]
  43. [-9.06943184 -9.06943184 -9.06943184 ..., 9.93056816 9.93056816
  44. 9.93056816]]]
  45. new res cache key
  46.  
  47. <pyfr.util.memoize object at 0x6fffe85c278>
  48. --self
  49. <pyfr.solvers.euler.elements.EulerElements object at 0x6fffe567630> upts
  50. --args
  51.  
  52. --kwargs
  53.  
  54. [[[-9.93056816 -8.93056816 -7.93056816 ..., 7.06943184 8.06943184
  55. 9.06943184]
  56. [-9.93056816 -9.93056816 -9.93056816 ..., 9.06943184 9.06943184
  57. 9.06943184]]
  58.  
  59. [[-9.66999052 -8.66999052 -7.66999052 ..., 7.33000948 8.33000948
  60. 9.33000948]
  61. [-9.93056816 -9.93056816 -9.93056816 ..., 9.06943184 9.06943184
  62. 9.06943184]]
  63.  
  64. [[-9.33000948 -8.33000948 -7.33000948 ..., 7.66999052 8.66999052
  65. 9.66999052]
  66. [-9.93056816 -9.93056816 -9.93056816 ..., 9.06943184 9.06943184
  67. 9.06943184]]
  68.  
  69. ...,
  70. [[-9.66999052 -8.66999052 -7.66999052 ..., 7.33000948 8.33000948
  71. 9.33000948]
  72. [-9.06943184 -9.06943184 -9.06943184 ..., 9.93056816 9.93056816
  73. 9.93056816]]
  74.  
  75. [[-9.33000948 -8.33000948 -7.33000948 ..., 7.66999052 8.66999052
  76. 9.66999052]
  77. [-9.06943184 -9.06943184 -9.06943184 ..., 9.93056816 9.93056816
  78. 9.93056816]]
  79.  
  80. [[-9.06943184 -8.06943184 -7.06943184 ..., 7.93056816 8.93056816
  81. 9.93056816]
  82. [-9.06943184 -9.06943184 -9.06943184 ..., 9.93056816 9.93056816
  83. 9.93056816]]]
  84. res cache key good
  85.  
  86. <pyfr.util.memoize object at 0x6fffe85c278>
  87. --self
  88. <pyfr.solvers.euler.elements.EulerElements object at 0x6fffe567630> mpts
  89. --args
  90.  
  91. --kwargs
  92.  
  93. res cache key bad
  94. [[[-10. -9. -8. ..., 7. 8. 9.]
  95. [-10. -10. -10. ..., 9. 9. 9.]]
  96.  
  97. [[ -9. -8. -7. ..., 8. 9. 10.]
  98. [-10. -10. -10. ..., 9. 9. 9.]]
  99.  
  100. [[-10. -9. -8. ..., 7. 8. 9.]
  101. [ -9. -9. -9. ..., 10. 10. 10.]]
  102.  
  103. [[ -9. -8. -7. ..., 8. 9. 10.]
  104. [ -9. -9. -9. ..., 10. 10. 10.]]]
  105. new res cache key
  106.  
  107. <pyfr.util.memoize object at 0x6fffe85c160>
  108. --self
  109. <pyfr.solvers.euler.elements.EulerElements object at 0x6fffe567630> M3
  110. --args
  111.  
  112. --kwargs
  113.  
  114. res cache key bad
  115. <pyfr.backends.openmp.types.OpenMPConstMatrix object at 0x6ffffd5ed68>
  116. new res cache key
  117.  
  118. <pyfr.util.memoize object at 0x6fffe7267b8>
  119. --self
  120. <pyfr.backends.openmp.cblas.OpenMPCBLASKernels object at 0x6fffe55f9e8> par_gemm
  121.  
  122. #include <omp.h>
  123. #include <stdlib.h>
  124. #include <tgmath.h>
  125.  
  126. #define PYFR_ALIGN_BYTES 32
  127. #define PYFR_NOINLINE __attribute__ ((noinline))
  128.  
  129. #define min(a, b) ((a) < (b) ? (a) : (b))
  130. #define max(a, b) ((a) > (b) ? (a) : (b))
  131.  
  132. // Typedefs
  133. typedef double fpdtype_t;
  134.  
  135. // OpenMP static loop scheduling functions
  136.  
  137. static inline int
  138. gcd(int a, int b)
  139. {
  140. return (a == 0) ? b : gcd(b % a, a);
  141. }
  142.  
  143. static inline void
  144. loop_sched_1d(int n, int align, int *b, int *e)
  145. {
  146. int tid = omp_get_thread_num();
  147. int nth = omp_get_num_threads();
  148.  
  149. // Round up n to be a multiple of nth
  150. int rn = n + nth - 1 - (n - 1) % nth;
  151.  
  152. // Nominal tile size
  153. int sz = rn / nth;
  154.  
  155. // Handle alignment
  156. sz += align - 1 - (sz - 1) % align;
  157.  
  158. // Assign the starting and ending index
  159. *b = sz * tid;
  160. *e = min(*b + sz, n);
  161.  
  162. // Clamp
  163. if (*b >= n)
  164. *b = *e = 0;
  165. }
  166.  
  167. static inline void
  168. loop_sched_2d(int nrow, int ncol, int colalign,
  169. int *rowb, int *rowe, int *colb, int *cole)
  170. {
  171. int tid = omp_get_thread_num();
  172. int nth = omp_get_num_threads();
  173.  
  174. // Distribute threads
  175. int nrowth = gcd(nrow, nth);
  176. int ncolth = nth / nrowth;
  177.  
  178. // Row and column indices for our thread
  179. int rowix = tid / ncolth;
  180. int colix = tid % ncolth;
  181.  
  182. // Round up ncol to be a multiple of ncolth
  183. int rncol = ncol + ncolth - 1 - (ncol - 1) % ncolth;
  184.  
  185. // Nominal tile size
  186. int ntilerow = nrow / nrowth;
  187. int ntilecol = rncol / ncolth;
  188.  
  189. // Handle column alignment
  190. ntilecol += colalign - 1 - (ntilecol - 1) % colalign;
  191.  
  192. // Assign the starting and ending row to each thread
  193. *rowb = ntilerow * rowix;
  194. *rowe = *rowb + ntilerow;
  195.  
  196. // Assign the starting and ending column to each thread
  197. *colb = ntilecol * colix;
  198. *cole = min(*colb + ntilecol, ncol);
  199.  
  200. // Clamp
  201. if (*colb >= ncol)
  202. *colb = *cole = 0;
  203. }
  204.  
  205.  
  206.  
  207.  
  208. // CBLAS GEMM constants
  209. #define ROW_MAJOR 101
  210. #define NO_TRANS 111
  211.  
  212. // CBLAS GEMM prototype
  213. typedef void (*cblas_gemm_t)(int, int, int,
  214. int, int, int,
  215. fpdtype_t, const fpdtype_t *, int,
  216. const fpdtype_t *, int,
  217. fpdtype_t, fpdtype_t *, int);
  218.  
  219. void
  220. par_gemm(cblas_gemm_t gemm, int M, int N, int K,
  221. fpdtype_t alpha, const fpdtype_t *A, int lda,
  222. const fpdtype_t *B, int ldb,
  223. fpdtype_t beta, fpdtype_t *C, int ldc)
  224. {
  225. #pragma omp parallel
  226. {
  227. int begin, end;
  228. loop_sched_1d(N, PYFR_ALIGN_BYTES / sizeof(fpdtype_t), &begin, &end);
  229.  
  230. gemm(ROW_MAJOR, NO_TRANS, NO_TRANS, M, end - begin, K,
  231. alpha, A, lda, B + begin, ldb, beta, C + begin, ldc);
  232. }
  233. }
  234.  
  235. [<class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.int32'>, <class 'numpy.int32'>, <class 'numpy.float64'>, <class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.float64'>, <class 'numpy.int64'>, <class 'numpy.int32'>]
  236. --args
  237.  
  238. --kwargs
  239.  
  240. res cache key bad
  241. <_FuncPtr object at 0x6fffe5694f8>
  242. new res cache key
  243.  
  244. <pyfr.util.memoize object at 0x6fffe85c240>
  245. --self
  246. <pyfr.solvers.euler.elements.EulerElements object at 0x6fffe567630> upts
  247. --args
  248.  
  249. --kwargs
  250.  
  251.  
  252. <pyfr.util.memoize object at 0x6fffe85c208>
  253. --self
  254. <pyfr.solvers.euler.elements.EulerElements object at 0x6fffe567630> upts
  255. --args
  256.  
  257. --kwargs
  258.  
  259. res cache key bad
  260. [[ 4. 4. 4. ..., 4. 4. 4.]
  261. [ 4. 4. 4. ..., 4. 4. 4.]
  262. [ 4. 4. 4. ..., 4. 4. 4.]
  263. ...,
  264. [ 4. 4. 4. ..., 4. 4. 4.]
  265. [ 4. 4. 4. ..., 4. 4. 4.]
  266. [ 4. 4. 4. ..., 4. 4. 4.]]
  267. new res cache key
  268. res cache key bad
  269. <pyfr.backends.openmp.types.OpenMPConstMatrix object at 0x6ffffca9f98>
  270. new res cache key
  271.  
  272. <pyfr.util.memoize object at 0x6fffebbf470>
  273. --self
  274. <pyfr.backends.openmp.provider.OpenMPPointwiseKernelProvider object at 0x6fffe826f60> negdivconf pyfr.solvers.baseadvec.kernels.negdivconf {'ndims': 2, 'srcex': ['(0.)', '(0.)', '(0.)', '(0.)'], 'nvars': 4}
  275. --args
  276.  
  277. --kwargs
  278.  
  279. res cache key bad
  280. ('\n\n#include <omp.h>\n#include <stdlib.h>\n#include <tgmath.h>\n\n#define PYFR_ALIGN_BYTES 32\n#define PYFR_NOINLINE __attribute__ ((noinline))\n\n#define min(a, b) ((a) < (b) ? (a) : (b))\n#define max(a, b) ((a) > (b) ? (a) : (b))\n\n// Typedefs\ntypedef double fpdtype_t;\n\n// OpenMP static loop scheduling functions\n\nstatic inline int\ngcd(int a, int b)\n{\n return (a == 0) ? b : gcd(b % a, a);\n}\n\nstatic inline void\nloop_sched_1d(int n, int align, int *b, int *e)\n{\n int tid = omp_get_thread_num();\n int nth = omp_get_num_threads();\n\n // Round up n to be a multiple of nth\n int rn = n + nth - 1 - (n - 1) % nth;\n\n // Nominal tile size\n int sz = rn / nth;\n\n // Handle alignment\n sz += align - 1 - (sz - 1) % align;\n\n // Assign the starting and ending index\n *b = sz * tid;\n *e = min(*b + sz, n);\n\n // Clamp\n if (*b >= n)\n *b = *e = 0;\n}\n\nstatic inline void\nloop_sched_2d(int nrow, int ncol, int colalign,\n int *rowb, int *rowe, int *colb, int *cole)\n{\n int tid = omp_get_thread_num();\n int nth = omp_get_num_threads();\n\n // Distribute threads\n int nrowth = gcd(nrow, nth);\n int ncolth = nth / nrowth;\n\n // Row and column indices for our thread\n int rowix = tid / ncolth;\n int colix = tid % ncolth;\n\n // Round up ncol to be a multiple of ncolth\n int rncol = ncol + ncolth - 1 - (ncol - 1) % ncolth;\n\n // Nominal tile size\n int ntilerow = nrow / nrowth;\n int ntilecol = rncol / ncolth;\n\n // Handle column alignment\n ntilecol += colalign - 1 - (ntilecol - 1) % colalign;\n\n // Assign the starting and ending row to each thread\n *rowb = ntilerow * rowix;\n *rowe = *rowb + ntilerow;\n\n // Assign the starting and ending column to each thread\n *colb = ntilecol * colix;\n *cole = min(*colb + ntilecol, ncol);\n\n // Clamp\n if (*colb >= ncol)\n *colb = *cole = 0;\n}\n\n\n\n\n\nstatic PYFR_NOINLINE void negdivconf_inner(int _nx, const fpdtype_t *__restrict__ rcpdjac_v, fpdtype_t *__restrict__ tdivtconf_v0, fpdtype_t *__restrict__ tdivtconf_v1, fpdtype_t *__restrict__ tdivtconf_v2, fpdtype_t *__restrict__ tdivtconf_v3)\n {\n for (int _x = 0; _x < _nx; _x++)\n {\n \n tdivtconf_v0[_x] = -rcpdjac_v[_x]*tdivtconf_v0[_x] + (0.);\n tdivtconf_v1[_x] = -rcpdjac_v[_x]*tdivtconf_v1[_x] + (0.);\n tdivtconf_v2[_x] = -rcpdjac_v[_x]*tdivtconf_v2[_x] + (0.);\n tdivtconf_v3[_x] = -rcpdjac_v[_x]*tdivtconf_v3[_x] + (0.);\n\n }\n }\n void negdivconf(int _ny, int _nx, const fpdtype_t* __restrict__ rcpdjac_v, int lsdrcpdjac, fpdtype_t* __restrict__ tdivtconf_v, int lsdtdivtconf)\n {\n #pragma omp parallel\n {\n int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);\n int rb, re, cb, ce;\n loop_sched_2d(_ny, _nx, align, &rb, &re, &cb, &ce);\n for (int _y = rb; _y < re; _y++)\n {\n negdivconf_inner(ce - cb, rcpdjac_v + _y*lsdrcpdjac + cb, tdivtconf_v + (_y*4 + 0)*lsdtdivtconf + cb, tdivtconf_v + (_y*4 + 1)*lsdtdivtconf + cb, tdivtconf_v + (_y*4 + 2)*lsdtdivtconf + cb, tdivtconf_v + (_y*4 + 3)*lsdtdivtconf + cb);\n }\n }\n }\n\n', 2, ['_ny', '_nx', 'rcpdjac', 'tdivtconf'], [[<class 'numpy.int32'>], [<class 'numpy.int32'>], [<class 'numpy.int64'>, <class 'numpy.int32'>], [<class 'numpy.int64'>, <class 'numpy.int32'>]])
  281. new res cache key
  282.  
  283. <pyfr.util.memoize object at 0x6fffe7267b8>
  284. --self
  285. <pyfr.backends.openmp.provider.OpenMPPointwiseKernelProvider object at 0x6fffe826f60> negdivconf
  286.  
  287. #include <omp.h>
  288. #include <stdlib.h>
  289. #include <tgmath.h>
  290.  
  291. #define PYFR_ALIGN_BYTES 32
  292. #define PYFR_NOINLINE __attribute__ ((noinline))
  293.  
  294. #define min(a, b) ((a) < (b) ? (a) : (b))
  295. #define max(a, b) ((a) > (b) ? (a) : (b))
  296.  
  297. // Typedefs
  298. typedef double fpdtype_t;
  299.  
  300. // OpenMP static loop scheduling functions
  301.  
  302. static inline int
  303. gcd(int a, int b)
  304. {
  305. return (a == 0) ? b : gcd(b % a, a);
  306. }
  307.  
  308. static inline void
  309. loop_sched_1d(int n, int align, int *b, int *e)
  310. {
  311. int tid = omp_get_thread_num();
  312. int nth = omp_get_num_threads();
  313.  
  314. // Round up n to be a multiple of nth
  315. int rn = n + nth - 1 - (n - 1) % nth;
  316.  
  317. // Nominal tile size
  318. int sz = rn / nth;
  319.  
  320. // Handle alignment
  321. sz += align - 1 - (sz - 1) % align;
  322.  
  323. // Assign the starting and ending index
  324. *b = sz * tid;
  325. *e = min(*b + sz, n);
  326.  
  327. // Clamp
  328. if (*b >= n)
  329. *b = *e = 0;
  330. }
  331.  
  332. static inline void
  333. loop_sched_2d(int nrow, int ncol, int colalign,
  334. int *rowb, int *rowe, int *colb, int *cole)
  335. {
  336. int tid = omp_get_thread_num();
  337. int nth = omp_get_num_threads();
  338.  
  339. // Distribute threads
  340. int nrowth = gcd(nrow, nth);
  341. int ncolth = nth / nrowth;
  342.  
  343. // Row and column indices for our thread
  344. int rowix = tid / ncolth;
  345. int colix = tid % ncolth;
  346.  
  347. // Round up ncol to be a multiple of ncolth
  348. int rncol = ncol + ncolth - 1 - (ncol - 1) % ncolth;
  349.  
  350. // Nominal tile size
  351. int ntilerow = nrow / nrowth;
  352. int ntilecol = rncol / ncolth;
  353.  
  354. // Handle column alignment
  355. ntilecol += colalign - 1 - (ntilecol - 1) % colalign;
  356.  
  357. // Assign the starting and ending row to each thread
  358. *rowb = ntilerow * rowix;
  359. *rowe = *rowb + ntilerow;
  360.  
  361. // Assign the starting and ending column to each thread
  362. *colb = ntilecol * colix;
  363. *cole = min(*colb + ntilecol, ncol);
  364.  
  365. // Clamp
  366. if (*colb >= ncol)
  367. *colb = *cole = 0;
  368. }
  369.  
  370.  
  371.  
  372.  
  373.  
  374. static PYFR_NOINLINE void negdivconf_inner(int _nx, const fpdtype_t *__restrict__ rcpdjac_v, fpdtype_t *__restrict__ tdivtconf_v0, fpdtype_t *__restrict__ tdivtconf_v1, fpdtype_t *__restrict__ tdivtconf_v2, fpdtype_t *__restrict__ tdivtconf_v3)
  375. {
  376. for (int _x = 0; _x < _nx; _x++)
  377. {
  378.  
  379. tdivtconf_v0[_x] = -rcpdjac_v[_x]*tdivtconf_v0[_x] + (0.);
  380. tdivtconf_v1[_x] = -rcpdjac_v[_x]*tdivtconf_v1[_x] + (0.);
  381. tdivtconf_v2[_x] = -rcpdjac_v[_x]*tdivtconf_v2[_x] + (0.);
  382. tdivtconf_v3[_x] = -rcpdjac_v[_x]*tdivtconf_v3[_x] + (0.);
  383.  
  384. }
  385. }
  386. void negdivconf(int _ny, int _nx, const fpdtype_t* __restrict__ rcpdjac_v, int lsdrcpdjac, fpdtype_t* __restrict__ tdivtconf_v, int lsdtdivtconf)
  387. {
  388. #pragma omp parallel
  389. {
  390. int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
  391. int rb, re, cb, ce;
  392. loop_sched_2d(_ny, _nx, align, &rb, &re, &cb, &ce);
  393. for (int _y = rb; _y < re; _y++)
  394. {
  395. negdivconf_inner(ce - cb, rcpdjac_v + _y*lsdrcpdjac + cb, tdivtconf_v + (_y*4 + 0)*lsdtdivtconf + cb, tdivtconf_v + (_y*4 + 1)*lsdtdivtconf + cb, tdivtconf_v + (_y*4 + 2)*lsdtdivtconf + cb, tdivtconf_v + (_y*4 + 3)*lsdtdivtconf + cb);
  396. }
  397. }
  398. }
  399.  
  400. [<class 'numpy.int32'>, <class 'numpy.int32'>, <class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.int64'>, <class 'numpy.int32'>]
  401. --args
  402.  
  403. --kwargs
  404.  
  405. res cache key bad
  406. <_FuncPtr object at 0x6fffe5695c0>
  407. new res cache key
  408.  
  409. <pyfr.util.memoize object at 0x6fffe85c160>
  410. --self
  411. <pyfr.solvers.euler.elements.EulerElements object at 0x6fffe567630> M1 - M3*M2
  412. --args
  413.  
  414. --kwargs
  415.  
  416. res cache key bad
  417. <pyfr.backends.openmp.types.OpenMPConstMatrix object at 0x6fffe108a20>
  418. new res cache key
  419.  
  420. <pyfr.util.memoize object at 0x6fffe7267b8>
  421. --self
  422. <pyfr.backends.openmp.cblas.OpenMPCBLASKernels object at 0x6fffe55f9e8> par_gemm
  423.  
  424. #include <omp.h>
  425. #include <stdlib.h>
  426. #include <tgmath.h>
  427.  
  428. #define PYFR_ALIGN_BYTES 32
  429. #define PYFR_NOINLINE __attribute__ ((noinline))
  430.  
  431. #define min(a, b) ((a) < (b) ? (a) : (b))
  432. #define max(a, b) ((a) > (b) ? (a) : (b))
  433.  
  434. // Typedefs
  435. typedef double fpdtype_t;
  436.  
  437. // OpenMP static loop scheduling functions
  438.  
  439. static inline int
  440. gcd(int a, int b)
  441. {
  442. return (a == 0) ? b : gcd(b % a, a);
  443. }
  444.  
  445. static inline void
  446. loop_sched_1d(int n, int align, int *b, int *e)
  447. {
  448. int tid = omp_get_thread_num();
  449. int nth = omp_get_num_threads();
  450.  
  451. // Round up n to be a multiple of nth
  452. int rn = n + nth - 1 - (n - 1) % nth;
  453.  
  454. // Nominal tile size
  455. int sz = rn / nth;
  456.  
  457. // Handle alignment
  458. sz += align - 1 - (sz - 1) % align;
  459.  
  460. // Assign the starting and ending index
  461. *b = sz * tid;
  462. *e = min(*b + sz, n);
  463.  
  464. // Clamp
  465. if (*b >= n)
  466. *b = *e = 0;
  467. }
  468.  
  469. static inline void
  470. loop_sched_2d(int nrow, int ncol, int colalign,
  471. int *rowb, int *rowe, int *colb, int *cole)
  472. {
  473. int tid = omp_get_thread_num();
  474. int nth = omp_get_num_threads();
  475.  
  476. // Distribute threads
  477. int nrowth = gcd(nrow, nth);
  478. int ncolth = nth / nrowth;
  479.  
  480. // Row and column indices for our thread
  481. int rowix = tid / ncolth;
  482. int colix = tid % ncolth;
  483.  
  484. // Round up ncol to be a multiple of ncolth
  485. int rncol = ncol + ncolth - 1 - (ncol - 1) % ncolth;
  486.  
  487. // Nominal tile size
  488. int ntilerow = nrow / nrowth;
  489. int ntilecol = rncol / ncolth;
  490.  
  491. // Handle column alignment
  492. ntilecol += colalign - 1 - (ntilecol - 1) % colalign;
  493.  
  494. // Assign the starting and ending row to each thread
  495. *rowb = ntilerow * rowix;
  496. *rowe = *rowb + ntilerow;
  497.  
  498. // Assign the starting and ending column to each thread
  499. *colb = ntilecol * colix;
  500. *cole = min(*colb + ntilecol, ncol);
  501.  
  502. // Clamp
  503. if (*colb >= ncol)
  504. *colb = *cole = 0;
  505. }
  506.  
  507.  
  508.  
  509.  
  510. // CBLAS GEMM constants
  511. #define ROW_MAJOR 101
  512. #define NO_TRANS 111
  513.  
  514. // CBLAS GEMM prototype
  515. typedef void (*cblas_gemm_t)(int, int, int,
  516. int, int, int,
  517. fpdtype_t, const fpdtype_t *, int,
  518. const fpdtype_t *, int,
  519. fpdtype_t, fpdtype_t *, int);
  520.  
  521. void
  522. par_gemm(cblas_gemm_t gemm, int M, int N, int K,
  523. fpdtype_t alpha, const fpdtype_t *A, int lda,
  524. const fpdtype_t *B, int ldb,
  525. fpdtype_t beta, fpdtype_t *C, int ldc)
  526. {
  527. #pragma omp parallel
  528. {
  529. int begin, end;
  530. loop_sched_1d(N, PYFR_ALIGN_BYTES / sizeof(fpdtype_t), &begin, &end);
  531.  
  532. gemm(ROW_MAJOR, NO_TRANS, NO_TRANS, M, end - begin, K,
  533. alpha, A, lda, B + begin, ldb, beta, C + begin, ldc);
  534. }
  535. }
  536.  
  537. [<class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.int32'>, <class 'numpy.int32'>, <class 'numpy.float64'>, <class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.float64'>, <class 'numpy.int64'>, <class 'numpy.int32'>]
  538. --args
  539.  
  540. --kwargs
  541.  
  542. <_FuncPtr object at 0x6fffe5694f8>
  543. res cache key good
  544.  
  545. <pyfr.util.memoize object at 0x6fffe85c160>
  546. --self
  547. <pyfr.solvers.euler.elements.EulerElements object at 0x6fffe567630> M0
  548. --args
  549.  
  550. --kwargs
  551.  
  552. res cache key bad
  553. <pyfr.backends.openmp.types.OpenMPConstMatrix object at 0x6fffe108978>
  554. new res cache key
  555.  
  556. <pyfr.util.memoize object at 0x6fffe7267b8>
  557. --self
  558. <pyfr.backends.openmp.cblas.OpenMPCBLASKernels object at 0x6fffe55f9e8> par_gemm
  559.  
  560. #include <omp.h>
  561. #include <stdlib.h>
  562. #include <tgmath.h>
  563.  
  564. #define PYFR_ALIGN_BYTES 32
  565. #define PYFR_NOINLINE __attribute__ ((noinline))
  566.  
  567. #define min(a, b) ((a) < (b) ? (a) : (b))
  568. #define max(a, b) ((a) > (b) ? (a) : (b))
  569.  
  570. // Typedefs
  571. typedef double fpdtype_t;
  572.  
  573. // OpenMP static loop scheduling functions
  574.  
  575. static inline int
  576. gcd(int a, int b)
  577. {
  578. return (a == 0) ? b : gcd(b % a, a);
  579. }
  580.  
  581. static inline void
  582. loop_sched_1d(int n, int align, int *b, int *e)
  583. {
  584. int tid = omp_get_thread_num();
  585. int nth = omp_get_num_threads();
  586.  
  587. // Round up n to be a multiple of nth
  588. int rn = n + nth - 1 - (n - 1) % nth;
  589.  
  590. // Nominal tile size
  591. int sz = rn / nth;
  592.  
  593. // Handle alignment
  594. sz += align - 1 - (sz - 1) % align;
  595.  
  596. // Assign the starting and ending index
  597. *b = sz * tid;
  598. *e = min(*b + sz, n);
  599.  
  600. // Clamp
  601. if (*b >= n)
  602. *b = *e = 0;
  603. }
  604.  
  605. static inline void
  606. loop_sched_2d(int nrow, int ncol, int colalign,
  607. int *rowb, int *rowe, int *colb, int *cole)
  608. {
  609. int tid = omp_get_thread_num();
  610. int nth = omp_get_num_threads();
  611.  
  612. // Distribute threads
  613. int nrowth = gcd(nrow, nth);
  614. int ncolth = nth / nrowth;
  615.  
  616. // Row and column indices for our thread
  617. int rowix = tid / ncolth;
  618. int colix = tid % ncolth;
  619.  
  620. // Round up ncol to be a multiple of ncolth
  621. int rncol = ncol + ncolth - 1 - (ncol - 1) % ncolth;
  622.  
  623. // Nominal tile size
  624. int ntilerow = nrow / nrowth;
  625. int ntilecol = rncol / ncolth;
  626.  
  627. // Handle column alignment
  628. ntilecol += colalign - 1 - (ntilecol - 1) % colalign;
  629.  
  630. // Assign the starting and ending row to each thread
  631. *rowb = ntilerow * rowix;
  632. *rowe = *rowb + ntilerow;
  633.  
  634. // Assign the starting and ending column to each thread
  635. *colb = ntilecol * colix;
  636. *cole = min(*colb + ntilecol, ncol);
  637.  
  638. // Clamp
  639. if (*colb >= ncol)
  640. *colb = *cole = 0;
  641. }
  642.  
  643.  
  644.  
  645.  
  646. // CBLAS GEMM constants
  647. #define ROW_MAJOR 101
  648. #define NO_TRANS 111
  649.  
  650. // CBLAS GEMM prototype
  651. typedef void (*cblas_gemm_t)(int, int, int,
  652. int, int, int,
  653. fpdtype_t, const fpdtype_t *, int,
  654. const fpdtype_t *, int,
  655. fpdtype_t, fpdtype_t *, int);
  656.  
  657. void
  658. par_gemm(cblas_gemm_t gemm, int M, int N, int K,
  659. fpdtype_t alpha, const fpdtype_t *A, int lda,
  660. const fpdtype_t *B, int ldb,
  661. fpdtype_t beta, fpdtype_t *C, int ldc)
  662. {
  663. #pragma omp parallel
  664. {
  665. int begin, end;
  666. loop_sched_1d(N, PYFR_ALIGN_BYTES / sizeof(fpdtype_t), &begin, &end);
  667.  
  668. gemm(ROW_MAJOR, NO_TRANS, NO_TRANS, M, end - begin, K,
  669. alpha, A, lda, B + begin, ldb, beta, C + begin, ldc);
  670. }
  671. }
  672.  
  673. [<class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.int32'>, <class 'numpy.int32'>, <class 'numpy.float64'>, <class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.float64'>, <class 'numpy.int64'>, <class 'numpy.int32'>]
  674. --args
  675.  
  676. --kwargs
  677.  
  678. <_FuncPtr object at 0x6fffe5694f8>
  679. res cache key good
  680.  
  681. <pyfr.util.memoize object at 0x6fffe85c1d0>
  682. --self
  683. <pyfr.solvers.euler.elements.EulerElements object at 0x6fffe567630> upts
  684. --args
  685.  
  686. --kwargs
  687.  
  688.  
  689. <pyfr.util.memoize object at 0x6fffe85c198>
  690. --self
  691. <pyfr.solvers.euler.elements.EulerElements object at 0x6fffe567630> upts
  692. --args
  693.  
  694. --kwargs
  695.  
  696. res cache key bad
  697. [[[[ 5.00000000e-01 5.00000000e-01 5.00000000e-01 ...,
  698. 5.00000000e-01 5.00000000e-01 5.00000000e-01]
  699. [ -6.16678657e-17 -8.26510554e-16 3.91590947e-15 ...,
  700. 5.16569096e-14 0.00000000e+00 0.00000000e+00]]
  701.  
  702. [[ 5.00000000e-01 5.00000000e-01 5.00000000e-01 ...,
  703. 5.00000000e-01 5.00000000e-01 5.00000000e-01]
  704. [ -2.93107297e-16 -5.95071123e-16 1.86123133e-14 ...,
  705. 3.71919452e-14 0.00000000e+00 0.00000000e+00]]
  706.  
  707. [[ 5.00000000e-01 5.00000000e-01 5.00000000e-01 ...,
  708. 5.00000000e-01 5.00000000e-01 5.00000000e-01]
  709. [ -5.95071123e-16 -2.93107297e-16 3.77870163e-14 ...,
  710. 1.83192061e-14 0.00000000e+00 0.00000000e+00]]
  711.  
  712. ...,
  713. [[ 5.00000000e-01 5.00000000e-01 5.00000000e-01 ...,
  714. 5.00000000e-01 5.00000000e-01 5.00000000e-01]
  715. [ -2.93107297e-16 -5.95071123e-16 1.86123133e-14 ...,
  716. 3.71919452e-14 0.00000000e+00 0.00000000e+00]]
  717.  
  718. [[ 5.00000000e-01 5.00000000e-01 5.00000000e-01 ...,
  719. 5.00000000e-01 5.00000000e-01 5.00000000e-01]
  720. [ -5.95071123e-16 -2.93107297e-16 3.77870163e-14 ...,
  721. 1.83192061e-14 0.00000000e+00 0.00000000e+00]]
  722.  
  723. [[ 5.00000000e-01 5.00000000e-01 5.00000000e-01 ...,
  724. 5.00000000e-01 5.00000000e-01 5.00000000e-01]
  725. [ -8.26510554e-16 -6.16678657e-17 5.24834202e-14 ...,
  726. 3.85424160e-15 0.00000000e+00 0.00000000e+00]]]
  727.  
  728.  
  729. [[[ -6.16678657e-17 6.16678657e-17 -1.23335731e-16 ...,
  730. 0.00000000e+00 -8.26510554e-16 8.26510554e-16]
  731. [ 5.00000000e-01 5.00000000e-01 5.00000000e-01 ...,
  732. 5.00000000e-01 5.00000000e-01 5.00000000e-01]]
  733.  
  734. [[ -6.16678657e-17 6.16678657e-17 -1.23335731e-16 ...,
  735. 0.00000000e+00 -8.26510554e-16 8.26510554e-16]
  736. [ 5.00000000e-01 5.00000000e-01 5.00000000e-01 ...,
  737. 5.00000000e-01 5.00000000e-01 5.00000000e-01]]
  738.  
  739. [[ -6.16678657e-17 6.16678657e-17 -1.23335731e-16 ...,
  740. 0.00000000e+00 -8.26510554e-16 8.26510554e-16]
  741. [ 5.00000000e-01 5.00000000e-01 5.00000000e-01 ...,
  742. 5.00000000e-01 5.00000000e-01 5.00000000e-01]]
  743.  
  744. ...,
  745. [[ -8.26510554e-16 8.26510554e-16 -1.65302111e-15 ...,
  746. 0.00000000e+00 -6.16678657e-17 6.16678657e-17]
  747. [ 5.00000000e-01 5.00000000e-01 5.00000000e-01 ...,
  748. 5.00000000e-01 5.00000000e-01 5.00000000e-01]]
  749.  
  750. [[ -8.26510554e-16 8.26510554e-16 -1.65302111e-15 ...,
  751. 0.00000000e+00 -6.16678657e-17 6.16678657e-17]
  752. [ 5.00000000e-01 5.00000000e-01 5.00000000e-01 ...,
  753. 5.00000000e-01 5.00000000e-01 5.00000000e-01]]
  754.  
  755. [[ -8.26510554e-16 8.26510554e-16 -1.65302111e-15 ...,
  756. 0.00000000e+00 -6.16678657e-17 6.16678657e-17]
  757. [ 5.00000000e-01 5.00000000e-01 5.00000000e-01 ...,
  758. 5.00000000e-01 5.00000000e-01 5.00000000e-01]]]]
  759. new res cache key
  760. res cache key bad
  761. <pyfr.backends.openmp.types.OpenMPConstMatrix object at 0x6fffe159b70>
  762. new res cache key
  763.  
  764. <pyfr.util.memoize object at 0x6fffebbf470>
  765. --self
  766. <pyfr.backends.openmp.provider.OpenMPPointwiseKernelProvider object at 0x6fffe826f60> tflux pyfr.solvers.euler.kernels.tflux {'ndims': 2, 'c': OrderedDict([('gamma', 1.4), ('S', 13.5), ('M', 0.4), ('R', 1.5)]), 'nvars': 4}
  767. --args
  768.  
  769. --kwargs
  770.  
  771. res cache key bad
  772. ('\n\n#include <omp.h>\n#include <stdlib.h>\n#include <tgmath.h>\n\n#define PYFR_ALIGN_BYTES 32\n#define PYFR_NOINLINE __attribute__ ((noinline))\n\n#define min(a, b) ((a) < (b) ? (a) : (b))\n#define max(a, b) ((a) > (b) ? (a) : (b))\n\n// Typedefs\ntypedef double fpdtype_t;\n\n// OpenMP static loop scheduling functions\n\nstatic inline int\ngcd(int a, int b)\n{\n return (a == 0) ? b : gcd(b % a, a);\n}\n\nstatic inline void\nloop_sched_1d(int n, int align, int *b, int *e)\n{\n int tid = omp_get_thread_num();\n int nth = omp_get_num_threads();\n\n // Round up n to be a multiple of nth\n int rn = n + nth - 1 - (n - 1) % nth;\n\n // Nominal tile size\n int sz = rn / nth;\n\n // Handle alignment\n sz += align - 1 - (sz - 1) % align;\n\n // Assign the starting and ending index\n *b = sz * tid;\n *e = min(*b + sz, n);\n\n // Clamp\n if (*b >= n)\n *b = *e = 0;\n}\n\nstatic inline void\nloop_sched_2d(int nrow, int ncol, int colalign,\n int *rowb, int *rowe, int *colb, int *cole)\n{\n int tid = omp_get_thread_num();\n int nth = omp_get_num_threads();\n\n // Distribute threads\n int nrowth = gcd(nrow, nth);\n int ncolth = nth / nrowth;\n\n // Row and column indices for our thread\n int rowix = tid / ncolth;\n int colix = tid % ncolth;\n\n // Round up ncol to be a multiple of ncolth\n int rncol = ncol + ncolth - 1 - (ncol - 1) % ncolth;\n\n // Nominal tile size\n int ntilerow = nrow / nrowth;\n int ntilecol = rncol / ncolth;\n\n // Handle column alignment\n ntilecol += colalign - 1 - (ntilecol - 1) % colalign;\n\n // Assign the starting and ending row to each thread\n *rowb = ntilerow * rowix;\n *rowe = *rowb + ntilerow;\n\n // Assign the starting and ending column to each thread\n *colb = ntilecol * colix;\n *cole = min(*colb + ntilecol, ncol);\n\n // Clamp\n if (*colb >= ncol)\n *colb = *cole = 0;\n}\n\n\n\n\n\n\n\n\n\nstatic PYFR_NOINLINE void tflux_inner(int _nx, fpdtype_t *__restrict__ f_v0v0, fpdtype_t *__restrict__ f_v0v1, fpdtype_t *__restrict__ f_v0v2, fpdtype_t *__restrict__ f_v0v3, fpdtype_t *__restrict__ f_v1v0, fpdtype_t *__restrict__ f_v1v1, fpdtype_t *__restrict__ f_v1v2, fpdtype_t *__restrict__ f_v1v3, const fpdtype_t *__restrict__ smats_v0v0, const fpdtype_t *__restrict__ smats_v0v1, const fpdtype_t *__restrict__ smats_v1v0, const fpdtype_t *__restrict__ smats_v1v1, const fpdtype_t *__restrict__ u_v0, const fpdtype_t *__restrict__ u_v1, const fpdtype_t *__restrict__ u_v2, const fpdtype_t *__restrict__ u_v3)\n {\n for (int _x = 0; _x < _nx; _x++)\n {\n \n // Compute the flux\n fpdtype_t ftemp[2][4];\n fpdtype_t p, v[2];\n {\n\n fpdtype_t invrho_ = 1.0/u_v0[_x], E_ = u_v3[_x];\n\n // Compute the velocities\n fpdtype_t rhov_[2];\n rhov_[0] = u_v1[_x];\n v[0] = invrho_*rhov_[0];\n rhov_[1] = u_v2[_x];\n v[1] = invrho_*rhov_[1];\n\n // Compute the pressure\n p = 0.3999999999999999*(E_ - 0.5*invrho_*((rhov_[0])*(rhov_[0]) + (rhov_[1])*(rhov_[1])));\n\n // Density and energy fluxes\n ftemp[0][0] = rhov_[0];\n ftemp[0][3] = (E_ + p)*v[0];\n ftemp[1][0] = rhov_[1];\n ftemp[1][3] = (E_ + p)*v[1];\n\n // Momentum fluxes\n ftemp[0][1] = rhov_[0]*v[0] + p;\n ftemp[0][2] = rhov_[0]*v[1];\n ftemp[1][1] = rhov_[1]*v[0];\n ftemp[1][2] = rhov_[1]*v[1] + p;\n\n};\n\n // Transform the fluxes\n f_v0v0[_x] = smats_v0v0[_x]*ftemp[0][0] + smats_v0v1[_x]*ftemp[1][0];\n f_v0v1[_x] = smats_v0v0[_x]*ftemp[0][1] + smats_v0v1[_x]*ftemp[1][1];\n f_v0v2[_x] = smats_v0v0[_x]*ftemp[0][2] + smats_v0v1[_x]*ftemp[1][2];\n f_v0v3[_x] = smats_v0v0[_x]*ftemp[0][3] + smats_v0v1[_x]*ftemp[1][3];\n f_v1v0[_x] = smats_v1v0[_x]*ftemp[0][0] + smats_v1v1[_x]*ftemp[1][0];\n f_v1v1[_x] = smats_v1v0[_x]*ftemp[0][1] + smats_v1v1[_x]*ftemp[1][1];\n f_v1v2[_x] = smats_v1v0[_x]*ftemp[0][2] + smats_v1v1[_x]*ftemp[1][2];\n f_v1v3[_x] = smats_v1v0[_x]*ftemp[0][3] + smats_v1v1[_x]*ftemp[1][3];\n\n }\n }\n void tflux(int _ny, int _nx, fpdtype_t* __restrict__ f_v, int lsdf, const fpdtype_t* __restrict__ smats_v, int lsdsmats, const fpdtype_t* __restrict__ u_v, int lsdu)\n {\n #pragma omp parallel\n {\n int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);\n int rb, re, cb, ce;\n loop_sched_2d(_ny, _nx, align, &rb, &re, &cb, &ce);\n for (int _y = rb; _y < re; _y++)\n {\n tflux_inner(ce - cb, f_v + ((0*_ny + _y)*4 + 0)*lsdf + cb, f_v + ((0*_ny + _y)*4 + 1)*lsdf + cb, f_v + ((0*_ny + _y)*4 + 2)*lsdf + cb, f_v + ((0*_ny + _y)*4 + 3)*lsdf + cb, f_v + ((1*_ny + _y)*4 + 0)*lsdf + cb, f_v + ((1*_ny + _y)*4 + 1)*lsdf + cb, f_v + ((1*_ny + _y)*4 + 2)*lsdf + cb, f_v + ((1*_ny + _y)*4 + 3)*lsdf + cb, smats_v + ((0*_ny + _y)*2 + 0)*lsdsmats + cb, smats_v + ((0*_ny + _y)*2 + 1)*lsdsmats + cb, smats_v + ((1*_ny + _y)*2 + 0)*lsdsmats + cb, smats_v + ((1*_ny + _y)*2 + 1)*lsdsmats + cb, u_v + (_y*4 + 0)*lsdu + cb, u_v + (_y*4 + 1)*lsdu + cb, u_v + (_y*4 + 2)*lsdu + cb, u_v + (_y*4 + 3)*lsdu + cb);\n }\n }\n }\n\n', 2, ['_ny', '_nx', 'f', 'smats', 'u'], [[<class 'numpy.int32'>], [<class 'numpy.int32'>], [<class 'numpy.int64'>, <class 'numpy.int32'>], [<class 'numpy.int64'>, <class 'numpy.int32'>], [<class 'numpy.int64'>, <class 'numpy.int32'>]])
  773. new res cache key
  774.  
  775. <pyfr.util.memoize object at 0x6fffe7267b8>
  776. --self
  777. <pyfr.backends.openmp.provider.OpenMPPointwiseKernelProvider object at 0x6fffe826f60> tflux
  778.  
  779. #include <omp.h>
  780. #include <stdlib.h>
  781. #include <tgmath.h>
  782.  
  783. #define PYFR_ALIGN_BYTES 32
  784. #define PYFR_NOINLINE __attribute__ ((noinline))
  785.  
  786. #define min(a, b) ((a) < (b) ? (a) : (b))
  787. #define max(a, b) ((a) > (b) ? (a) : (b))
  788.  
  789. // Typedefs
  790. typedef double fpdtype_t;
  791.  
  792. // OpenMP static loop scheduling functions
  793.  
  794. static inline int
  795. gcd(int a, int b)
  796. {
  797. return (a == 0) ? b : gcd(b % a, a);
  798. }
  799.  
  800. static inline void
  801. loop_sched_1d(int n, int align, int *b, int *e)
  802. {
  803. int tid = omp_get_thread_num();
  804. int nth = omp_get_num_threads();
  805.  
  806. // Round up n to be a multiple of nth
  807. int rn = n + nth - 1 - (n - 1) % nth;
  808.  
  809. // Nominal tile size
  810. int sz = rn / nth;
  811.  
  812. // Handle alignment
  813. sz += align - 1 - (sz - 1) % align;
  814.  
  815. // Assign the starting and ending index
  816. *b = sz * tid;
  817. *e = min(*b + sz, n);
  818.  
  819. // Clamp
  820. if (*b >= n)
  821. *b = *e = 0;
  822. }
  823.  
  824. static inline void
  825. loop_sched_2d(int nrow, int ncol, int colalign,
  826. int *rowb, int *rowe, int *colb, int *cole)
  827. {
  828. int tid = omp_get_thread_num();
  829. int nth = omp_get_num_threads();
  830.  
  831. // Distribute threads
  832. int nrowth = gcd(nrow, nth);
  833. int ncolth = nth / nrowth;
  834.  
  835. // Row and column indices for our thread
  836. int rowix = tid / ncolth;
  837. int colix = tid % ncolth;
  838.  
  839. // Round up ncol to be a multiple of ncolth
  840. int rncol = ncol + ncolth - 1 - (ncol - 1) % ncolth;
  841.  
  842. // Nominal tile size
  843. int ntilerow = nrow / nrowth;
  844. int ntilecol = rncol / ncolth;
  845.  
  846. // Handle column alignment
  847. ntilecol += colalign - 1 - (ntilecol - 1) % colalign;
  848.  
  849. // Assign the starting and ending row to each thread
  850. *rowb = ntilerow * rowix;
  851. *rowe = *rowb + ntilerow;
  852.  
  853. // Assign the starting and ending column to each thread
  854. *colb = ntilecol * colix;
  855. *cole = min(*colb + ntilecol, ncol);
  856.  
  857. // Clamp
  858. if (*colb >= ncol)
  859. *colb = *cole = 0;
  860. }
  861.  
  862.  
  863.  
  864.  
  865.  
  866.  
  867.  
  868.  
  869.  
  870. static PYFR_NOINLINE void tflux_inner(int _nx, fpdtype_t *__restrict__ f_v0v0, fpdtype_t *__restrict__ f_v0v1, fpdtype_t *__restrict__ f_v0v2, fpdtype_t *__restrict__ f_v0v3, fpdtype_t *__restrict__ f_v1v0, fpdtype_t *__restrict__ f_v1v1, fpdtype_t *__restrict__ f_v1v2, fpdtype_t *__restrict__ f_v1v3, const fpdtype_t *__restrict__ smats_v0v0, const fpdtype_t *__restrict__ smats_v0v1, const fpdtype_t *__restrict__ smats_v1v0, const fpdtype_t *__restrict__ smats_v1v1, const fpdtype_t *__restrict__ u_v0, const fpdtype_t *__restrict__ u_v1, const fpdtype_t *__restrict__ u_v2, const fpdtype_t *__restrict__ u_v3)
  871. {
  872. for (int _x = 0; _x < _nx; _x++)
  873. {
  874.  
  875. // Compute the flux
  876. fpdtype_t ftemp[2][4];
  877. fpdtype_t p, v[2];
  878. {
  879.  
  880. fpdtype_t invrho_ = 1.0/u_v0[_x], E_ = u_v3[_x];
  881.  
  882. // Compute the velocities
  883. fpdtype_t rhov_[2];
  884. rhov_[0] = u_v1[_x];
  885. v[0] = invrho_*rhov_[0];
  886. rhov_[1] = u_v2[_x];
  887. v[1] = invrho_*rhov_[1];
  888.  
  889. // Compute the pressure
  890. p = 0.3999999999999999*(E_ - 0.5*invrho_*((rhov_[0])*(rhov_[0]) + (rhov_[1])*(rhov_[1])));
  891.  
  892. // Density and energy fluxes
  893. ftemp[0][0] = rhov_[0];
  894. ftemp[0][3] = (E_ + p)*v[0];
  895. ftemp[1][0] = rhov_[1];
  896. ftemp[1][3] = (E_ + p)*v[1];
  897.  
  898. // Momentum fluxes
  899. ftemp[0][1] = rhov_[0]*v[0] + p;
  900. ftemp[0][2] = rhov_[0]*v[1];
  901. ftemp[1][1] = rhov_[1]*v[0];
  902. ftemp[1][2] = rhov_[1]*v[1] + p;
  903.  
  904. };
  905.  
  906. // Transform the fluxes
  907. f_v0v0[_x] = smats_v0v0[_x]*ftemp[0][0] + smats_v0v1[_x]*ftemp[1][0];
  908. f_v0v1[_x] = smats_v0v0[_x]*ftemp[0][1] + smats_v0v1[_x]*ftemp[1][1];
  909. f_v0v2[_x] = smats_v0v0[_x]*ftemp[0][2] + smats_v0v1[_x]*ftemp[1][2];
  910. f_v0v3[_x] = smats_v0v0[_x]*ftemp[0][3] + smats_v0v1[_x]*ftemp[1][3];
  911. f_v1v0[_x] = smats_v1v0[_x]*ftemp[0][0] + smats_v1v1[_x]*ftemp[1][0];
  912. f_v1v1[_x] = smats_v1v0[_x]*ftemp[0][1] + smats_v1v1[_x]*ftemp[1][1];
  913. f_v1v2[_x] = smats_v1v0[_x]*ftemp[0][2] + smats_v1v1[_x]*ftemp[1][2];
  914. f_v1v3[_x] = smats_v1v0[_x]*ftemp[0][3] + smats_v1v1[_x]*ftemp[1][3];
  915.  
  916. }
  917. }
  918. void tflux(int _ny, int _nx, fpdtype_t* __restrict__ f_v, int lsdf, const fpdtype_t* __restrict__ smats_v, int lsdsmats, const fpdtype_t* __restrict__ u_v, int lsdu)
  919. {
  920. #pragma omp parallel
  921. {
  922. int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
  923. int rb, re, cb, ce;
  924. loop_sched_2d(_ny, _nx, align, &rb, &re, &cb, &ce);
  925. for (int _y = rb; _y < re; _y++)
  926. {
  927. tflux_inner(ce - cb, f_v + ((0*_ny + _y)*4 + 0)*lsdf + cb, f_v + ((0*_ny + _y)*4 + 1)*lsdf + cb, f_v + ((0*_ny + _y)*4 + 2)*lsdf + cb, f_v + ((0*_ny + _y)*4 + 3)*lsdf + cb, f_v + ((1*_ny + _y)*4 + 0)*lsdf + cb, f_v + ((1*_ny + _y)*4 + 1)*lsdf + cb, f_v + ((1*_ny + _y)*4 + 2)*lsdf + cb, f_v + ((1*_ny + _y)*4 + 3)*lsdf + cb, smats_v + ((0*_ny + _y)*2 + 0)*lsdsmats + cb, smats_v + ((0*_ny + _y)*2 + 1)*lsdsmats + cb, smats_v + ((1*_ny + _y)*2 + 0)*lsdsmats + cb, smats_v + ((1*_ny + _y)*2 + 1)*lsdsmats + cb, u_v + (_y*4 + 0)*lsdu + cb, u_v + (_y*4 + 1)*lsdu + cb, u_v + (_y*4 + 2)*lsdu + cb, u_v + (_y*4 + 3)*lsdu + cb);
  928. }
  929. }
  930. }
  931.  
  932. [<class 'numpy.int32'>, <class 'numpy.int32'>, <class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.int64'>, <class 'numpy.int32'>]
  933. --args
  934.  
  935. --kwargs
  936.  
  937. res cache key bad
  938. <_FuncPtr object at 0x6fffe569688>
  939. new res cache key
  940.  
  941. <pyfr.util.memoize object at 0x6fffebbf470>
  942. --self
  943. <pyfr.backends.openmp.provider.OpenMPPointwiseKernelProvider object at 0x6fffe826f60> intcflux pyfr.solvers.euler.kernels.intcflux {'ndims': 2, 'rsolver': 'rusanov', 'c': OrderedDict([('gamma', 1.4), ('S', 13.5), ('M', 0.4), ('R', 1.5)]), 'nvars': 4}
  944. --args
  945.  
  946. --kwargs
  947.  
  948. res cache key bad
  949. ('\n\n#include <omp.h>\n#include <stdlib.h>\n#include <tgmath.h>\n\n#define PYFR_ALIGN_BYTES 32\n#define PYFR_NOINLINE __attribute__ ((noinline))\n\n#define min(a, b) ((a) < (b) ? (a) : (b))\n#define max(a, b) ((a) > (b) ? (a) : (b))\n\n// Typedefs\ntypedef double fpdtype_t;\n\n// OpenMP static loop scheduling functions\n\nstatic inline int\ngcd(int a, int b)\n{\n return (a == 0) ? b : gcd(b % a, a);\n}\n\nstatic inline void\nloop_sched_1d(int n, int align, int *b, int *e)\n{\n int tid = omp_get_thread_num();\n int nth = omp_get_num_threads();\n\n // Round up n to be a multiple of nth\n int rn = n + nth - 1 - (n - 1) % nth;\n\n // Nominal tile size\n int sz = rn / nth;\n\n // Handle alignment\n sz += align - 1 - (sz - 1) % align;\n\n // Assign the starting and ending index\n *b = sz * tid;\n *e = min(*b + sz, n);\n\n // Clamp\n if (*b >= n)\n *b = *e = 0;\n}\n\nstatic inline void\nloop_sched_2d(int nrow, int ncol, int colalign,\n int *rowb, int *rowe, int *colb, int *cole)\n{\n int tid = omp_get_thread_num();\n int nth = omp_get_num_threads();\n\n // Distribute threads\n int nrowth = gcd(nrow, nth);\n int ncolth = nth / nrowth;\n\n // Row and column indices for our thread\n int rowix = tid / ncolth;\n int colix = tid % ncolth;\n\n // Round up ncol to be a multiple of ncolth\n int rncol = ncol + ncolth - 1 - (ncol - 1) % ncolth;\n\n // Nominal tile size\n int ntilerow = nrow / nrowth;\n int ntilecol = rncol / ncolth;\n\n // Handle column alignment\n ntilecol += colalign - 1 - (ntilecol - 1) % colalign;\n\n // Assign the starting and ending row to each thread\n *rowb = ntilerow * rowix;\n *rowe = *rowb + ntilerow;\n\n // Assign the starting and ending column to each thread\n *colb = ntilecol * colix;\n *cole = min(*colb + ntilecol, ncol);\n\n // Clamp\n if (*colb >= ncol)\n *colb = *cole = 0;\n}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n void intcflux(int _nx, const fpdtype_t* __restrict__ magnl_v, const fpdtype_t* __restrict__ nl_v, int lsdnl, fpdtype_t* __restrict__ ul_v, const int* __restrict__ ul_vix, const int* __restrict__ ul_vcstri, fpdtype_t* __restrict__ ur_v, const int* __restrict__ ur_vix, const int* __restrict__ ur_vcstri)\n {\n #pragma omp parallel\n {\n int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);\n int cb, ce;\n loop_sched_1d(_nx, align, &cb, &ce);\n for (int _x = cb; _x < ce; _x++)\n {\n \n // Perform the Riemann solve\n fpdtype_t fn[4];\n {\n\n // Compute the left and right fluxes + velocities and pressures\n fpdtype_t fl_[2][4], fr_[2][4];\n fpdtype_t vl_[2], vr_[2];\n fpdtype_t pl_, pr_;\n\n {\n\n fpdtype_t invrho__ = 1.0/ul_v[ul_vix[_x] + ul_vcstri[_x]*0], E__ = ul_v[ul_vix[_x] + ul_vcstri[_x]*3];\n\n // Compute the velocities\n fpdtype_t rhov__[2];\n rhov__[0] = ul_v[ul_vix[_x] + ul_vcstri[_x]*1];\n vl_[0] = invrho__*rhov__[0];\n rhov__[1] = ul_v[ul_vix[_x] + ul_vcstri[_x]*2];\n vl_[1] = invrho__*rhov__[1];\n\n // Compute the pressure\n pl_ = 0.3999999999999999*(E__ - 0.5*invrho__*((rhov__[0])*(rhov__[0]) + (rhov__[1])*(rhov__[1])));\n\n // Density and energy fluxes\n fl_[0][0] = rhov__[0];\n fl_[0][3] = (E__ + pl_)*vl_[0];\n fl_[1][0] = rhov__[1];\n fl_[1][3] = (E__ + pl_)*vl_[1];\n\n // Momentum fluxes\n fl_[0][1] = rhov__[0]*vl_[0] + pl_;\n fl_[0][2] = rhov__[0]*vl_[1];\n fl_[1][1] = rhov__[1]*vl_[0];\n fl_[1][2] = rhov__[1]*vl_[1] + pl_;\n\n};\n {\n\n fpdtype_t invrho__ = 1.0/ur_v[ur_vix[_x] + ur_vcstri[_x]*0], E__ = ur_v[ur_vix[_x] + ur_vcstri[_x]*3];\n\n // Compute the velocities\n fpdtype_t rhov__[2];\n rhov__[0] = ur_v[ur_vix[_x] + ur_vcstri[_x]*1];\n vr_[0] = invrho__*rhov__[0];\n rhov__[1] = ur_v[ur_vix[_x] + ur_vcstri[_x]*2];\n vr_[1] = invrho__*rhov__[1];\n\n // Compute the pressure\n pr_ = 0.3999999999999999*(E__ - 0.5*invrho__*((rhov__[0])*(rhov__[0]) + (rhov__[1])*(rhov__[1])));\n\n // Density and energy fluxes\n fr_[0][0] = rhov__[0];\n fr_[0][3] = (E__ + pr_)*vr_[0];\n fr_[1][0] = rhov__[1];\n fr_[1][3] = (E__ + pr_)*vr_[1];\n\n // Momentum fluxes\n fr_[0][1] = rhov__[0]*vr_[0] + pr_;\n fr_[0][2] = rhov__[0]*vr_[1];\n fr_[1][1] = rhov__[1]*vr_[0];\n fr_[1][2] = rhov__[1]*vr_[1] + pr_;\n\n};\n\n // Sum the left and right velocities and take the normal\n fpdtype_t nv_ = ((nl_v[lsdnl*0 + _x])*(vl_[0] + vr_[0]) + (nl_v[lsdnl*1 + _x])*(vl_[1] + vr_[1]));\n\n // Estimate the maximum wave speed / 2\n fpdtype_t a_ = sqrt(0.35*(pl_ + pr_)/(ul_v[ul_vix[_x] + ul_vcstri[_x]*0] + ur_v[ur_vix[_x] + ur_vcstri[_x]*0]))\n + 0.25*fabs(nv_);\n\n // Output\n fn[0] = 0.5*(nl_v[lsdnl*0 + _x]*(fl_[0][0] + fr_[0][0]) + nl_v[lsdnl*1 + _x]*(fl_[1][0] + fr_[1][0]))\n + a_*(ul_v[ul_vix[_x] + ul_vcstri[_x]*0] - ur_v[ur_vix[_x] + ur_vcstri[_x]*0]);\n fn[1] = 0.5*(nl_v[lsdnl*0 + _x]*(fl_[0][1] + fr_[0][1]) + nl_v[lsdnl*1 + _x]*(fl_[1][1] + fr_[1][1]))\n + a_*(ul_v[ul_vix[_x] + ul_vcstri[_x]*1] - ur_v[ur_vix[_x] + ur_vcstri[_x]*1]);\n fn[2] = 0.5*(nl_v[lsdnl*0 + _x]*(fl_[0][2] + fr_[0][2]) + nl_v[lsdnl*1 + _x]*(fl_[1][2] + fr_[1][2]))\n + a_*(ul_v[ul_vix[_x] + ul_vcstri[_x]*2] - ur_v[ur_vix[_x] + ur_vcstri[_x]*2]);\n fn[3] = 0.5*(nl_v[lsdnl*0 + _x]*(fl_[0][3] + fr_[0][3]) + nl_v[lsdnl*1 + _x]*(fl_[1][3] + fr_[1][3]))\n + a_*(ul_v[ul_vix[_x] + ul_vcstri[_x]*3] - ur_v[ur_vix[_x] + ur_vcstri[_x]*3]);\n\n};\n\n // Scale and write out the common normal fluxes\n ul_v[ul_vix[_x] + ul_vcstri[_x]*0] = magnl_v[_x]*fn[0];\n ur_v[ur_vix[_x] + ur_vcstri[_x]*0] = -magnl_v[_x]*fn[0];\n ul_v[ul_vix[_x] + ul_vcstri[_x]*1] = magnl_v[_x]*fn[1];\n ur_v[ur_vix[_x] + ur_vcstri[_x]*1] = -magnl_v[_x]*fn[1];\n ul_v[ul_vix[_x] + ul_vcstri[_x]*2] = magnl_v[_x]*fn[2];\n ur_v[ur_vix[_x] + ur_vcstri[_x]*2] = -magnl_v[_x]*fn[2];\n ul_v[ul_vix[_x] + ul_vcstri[_x]*3] = magnl_v[_x]*fn[3];\n ur_v[ur_vix[_x] + ur_vcstri[_x]*3] = -magnl_v[_x]*fn[3];\n\n }\n }\n }\n\n', 1, ['_nx', 'magnl', 'nl', 'ul', 'ur'], [[<class 'numpy.int32'>], [<class 'numpy.int64'>], [<class 'numpy.int64'>, <class 'numpy.int32'>], [<class 'numpy.int64'>, <class 'numpy.int64'>, <class 'numpy.int64'>], [<class 'numpy.int64'>, <class 'numpy.int64'>, <class 'numpy.int64'>]])
  950. new res cache key
  951.  
  952. <pyfr.util.memoize object at 0x6fffe7267b8>
  953. --self
  954. <pyfr.backends.openmp.provider.OpenMPPointwiseKernelProvider object at 0x6fffe826f60> intcflux
  955.  
  956. #include <omp.h>
  957. #include <stdlib.h>
  958. #include <tgmath.h>
  959.  
  960. #define PYFR_ALIGN_BYTES 32
  961. #define PYFR_NOINLINE __attribute__ ((noinline))
  962.  
  963. #define min(a, b) ((a) < (b) ? (a) : (b))
  964. #define max(a, b) ((a) > (b) ? (a) : (b))
  965.  
  966. // Typedefs
  967. typedef double fpdtype_t;
  968.  
  969. // OpenMP static loop scheduling functions
  970.  
  971. static inline int
  972. gcd(int a, int b)
  973. {
  974. return (a == 0) ? b : gcd(b % a, a);
  975. }
  976.  
  977. static inline void
  978. loop_sched_1d(int n, int align, int *b, int *e)
  979. {
  980. int tid = omp_get_thread_num();
  981. int nth = omp_get_num_threads();
  982.  
  983. // Round up n to be a multiple of nth
  984. int rn = n + nth - 1 - (n - 1) % nth;
  985.  
  986. // Nominal tile size
  987. int sz = rn / nth;
  988.  
  989. // Handle alignment
  990. sz += align - 1 - (sz - 1) % align;
  991.  
  992. // Assign the starting and ending index
  993. *b = sz * tid;
  994. *e = min(*b + sz, n);
  995.  
  996. // Clamp
  997. if (*b >= n)
  998. *b = *e = 0;
  999. }
  1000.  
  1001. static inline void
  1002. loop_sched_2d(int nrow, int ncol, int colalign,
  1003. int *rowb, int *rowe, int *colb, int *cole)
  1004. {
  1005. int tid = omp_get_thread_num();
  1006. int nth = omp_get_num_threads();
  1007.  
  1008. // Distribute threads
  1009. int nrowth = gcd(nrow, nth);
  1010. int ncolth = nth / nrowth;
  1011.  
  1012. // Row and column indices for our thread
  1013. int rowix = tid / ncolth;
  1014. int colix = tid % ncolth;
  1015.  
  1016. // Round up ncol to be a multiple of ncolth
  1017. int rncol = ncol + ncolth - 1 - (ncol - 1) % ncolth;
  1018.  
  1019. // Nominal tile size
  1020. int ntilerow = nrow / nrowth;
  1021. int ntilecol = rncol / ncolth;
  1022.  
  1023. // Handle column alignment
  1024. ntilecol += colalign - 1 - (ntilecol - 1) % colalign;
  1025.  
  1026. // Assign the starting and ending row to each thread
  1027. *rowb = ntilerow * rowix;
  1028. *rowe = *rowb + ntilerow;
  1029.  
  1030. // Assign the starting and ending column to each thread
  1031. *colb = ntilecol * colix;
  1032. *cole = min(*colb + ntilecol, ncol);
  1033.  
  1034. // Clamp
  1035. if (*colb >= ncol)
  1036. *colb = *cole = 0;
  1037. }
  1038.  
  1039.  
  1040.  
  1041.  
  1042.  
  1043.  
  1044.  
  1045.  
  1046.  
  1047.  
  1048.  
  1049.  
  1050.  
  1051.  
  1052.  
  1053. void intcflux(int _nx, const fpdtype_t* __restrict__ magnl_v, const fpdtype_t* __restrict__ nl_v, int lsdnl, fpdtype_t* __restrict__ ul_v, const int* __restrict__ ul_vix, const int* __restrict__ ul_vcstri, fpdtype_t* __restrict__ ur_v, const int* __restrict__ ur_vix, const int* __restrict__ ur_vcstri)
  1054. {
  1055. #pragma omp parallel
  1056. {
  1057. int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
  1058. int cb, ce;
  1059. loop_sched_1d(_nx, align, &cb, &ce);
  1060. for (int _x = cb; _x < ce; _x++)
  1061. {
  1062.  
  1063. // Perform the Riemann solve
  1064. fpdtype_t fn[4];
  1065. {
  1066.  
  1067. // Compute the left and right fluxes + velocities and pressures
  1068. fpdtype_t fl_[2][4], fr_[2][4];
  1069. fpdtype_t vl_[2], vr_[2];
  1070. fpdtype_t pl_, pr_;
  1071.  
  1072. {
  1073.  
  1074. fpdtype_t invrho__ = 1.0/ul_v[ul_vix[_x] + ul_vcstri[_x]*0], E__ = ul_v[ul_vix[_x] + ul_vcstri[_x]*3];
  1075.  
  1076. // Compute the velocities
  1077. fpdtype_t rhov__[2];
  1078. rhov__[0] = ul_v[ul_vix[_x] + ul_vcstri[_x]*1];
  1079. vl_[0] = invrho__*rhov__[0];
  1080. rhov__[1] = ul_v[ul_vix[_x] + ul_vcstri[_x]*2];
  1081. vl_[1] = invrho__*rhov__[1];
  1082.  
  1083. // Compute the pressure
  1084. pl_ = 0.3999999999999999*(E__ - 0.5*invrho__*((rhov__[0])*(rhov__[0]) + (rhov__[1])*(rhov__[1])));
  1085.  
  1086. // Density and energy fluxes
  1087. fl_[0][0] = rhov__[0];
  1088. fl_[0][3] = (E__ + pl_)*vl_[0];
  1089. fl_[1][0] = rhov__[1];
  1090. fl_[1][3] = (E__ + pl_)*vl_[1];
  1091.  
  1092. // Momentum fluxes
  1093. fl_[0][1] = rhov__[0]*vl_[0] + pl_;
  1094. fl_[0][2] = rhov__[0]*vl_[1];
  1095. fl_[1][1] = rhov__[1]*vl_[0];
  1096. fl_[1][2] = rhov__[1]*vl_[1] + pl_;
  1097.  
  1098. };
  1099. {
  1100.  
  1101. fpdtype_t invrho__ = 1.0/ur_v[ur_vix[_x] + ur_vcstri[_x]*0], E__ = ur_v[ur_vix[_x] + ur_vcstri[_x]*3];
  1102.  
  1103. // Compute the velocities
  1104. fpdtype_t rhov__[2];
  1105. rhov__[0] = ur_v[ur_vix[_x] + ur_vcstri[_x]*1];
  1106. vr_[0] = invrho__*rhov__[0];
  1107. rhov__[1] = ur_v[ur_vix[_x] + ur_vcstri[_x]*2];
  1108. vr_[1] = invrho__*rhov__[1];
  1109.  
  1110. // Compute the pressure
  1111. pr_ = 0.3999999999999999*(E__ - 0.5*invrho__*((rhov__[0])*(rhov__[0]) + (rhov__[1])*(rhov__[1])));
  1112.  
  1113. // Density and energy fluxes
  1114. fr_[0][0] = rhov__[0];
  1115. fr_[0][3] = (E__ + pr_)*vr_[0];
  1116. fr_[1][0] = rhov__[1];
  1117. fr_[1][3] = (E__ + pr_)*vr_[1];
  1118.  
  1119. // Momentum fluxes
  1120. fr_[0][1] = rhov__[0]*vr_[0] + pr_;
  1121. fr_[0][2] = rhov__[0]*vr_[1];
  1122. fr_[1][1] = rhov__[1]*vr_[0];
  1123. fr_[1][2] = rhov__[1]*vr_[1] + pr_;
  1124.  
  1125. };
  1126.  
  1127. // Sum the left and right velocities and take the normal
  1128. fpdtype_t nv_ = ((nl_v[lsdnl*0 + _x])*(vl_[0] + vr_[0]) + (nl_v[lsdnl*1 + _x])*(vl_[1] + vr_[1]));
  1129.  
  1130. // Estimate the maximum wave speed / 2
  1131. fpdtype_t a_ = sqrt(0.35*(pl_ + pr_)/(ul_v[ul_vix[_x] + ul_vcstri[_x]*0] + ur_v[ur_vix[_x] + ur_vcstri[_x]*0]))
  1132. + 0.25*fabs(nv_);
  1133.  
  1134. // Output
  1135. fn[0] = 0.5*(nl_v[lsdnl*0 + _x]*(fl_[0][0] + fr_[0][0]) + nl_v[lsdnl*1 + _x]*(fl_[1][0] + fr_[1][0]))
  1136. + a_*(ul_v[ul_vix[_x] + ul_vcstri[_x]*0] - ur_v[ur_vix[_x] + ur_vcstri[_x]*0]);
  1137. fn[1] = 0.5*(nl_v[lsdnl*0 + _x]*(fl_[0][1] + fr_[0][1]) + nl_v[lsdnl*1 + _x]*(fl_[1][1] + fr_[1][1]))
  1138. + a_*(ul_v[ul_vix[_x] + ul_vcstri[_x]*1] - ur_v[ur_vix[_x] + ur_vcstri[_x]*1]);
  1139. fn[2] = 0.5*(nl_v[lsdnl*0 + _x]*(fl_[0][2] + fr_[0][2]) + nl_v[lsdnl*1 + _x]*(fl_[1][2] + fr_[1][2]))
  1140. + a_*(ul_v[ul_vix[_x] + ul_vcstri[_x]*2] - ur_v[ur_vix[_x] + ur_vcstri[_x]*2]);
  1141. fn[3] = 0.5*(nl_v[lsdnl*0 + _x]*(fl_[0][3] + fr_[0][3]) + nl_v[lsdnl*1 + _x]*(fl_[1][3] + fr_[1][3]))
  1142. + a_*(ul_v[ul_vix[_x] + ul_vcstri[_x]*3] - ur_v[ur_vix[_x] + ur_vcstri[_x]*3]);
  1143.  
  1144. };
  1145.  
  1146. // Scale and write out the common normal fluxes
  1147. ul_v[ul_vix[_x] + ul_vcstri[_x]*0] = magnl_v[_x]*fn[0];
  1148. ur_v[ur_vix[_x] + ur_vcstri[_x]*0] = -magnl_v[_x]*fn[0];
  1149. ul_v[ul_vix[_x] + ul_vcstri[_x]*1] = magnl_v[_x]*fn[1];
  1150. ur_v[ur_vix[_x] + ur_vcstri[_x]*1] = -magnl_v[_x]*fn[1];
  1151. ul_v[ul_vix[_x] + ul_vcstri[_x]*2] = magnl_v[_x]*fn[2];
  1152. ur_v[ur_vix[_x] + ur_vcstri[_x]*2] = -magnl_v[_x]*fn[2];
  1153. ul_v[ul_vix[_x] + ul_vcstri[_x]*3] = magnl_v[_x]*fn[3];
  1154. ur_v[ur_vix[_x] + ur_vcstri[_x]*3] = -magnl_v[_x]*fn[3];
  1155.  
  1156. }
  1157. }
  1158. }
  1159.  
  1160. [<class 'numpy.int32'>, <class 'numpy.int64'>, <class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.int64'>, <class 'numpy.int64'>, <class 'numpy.int64'>, <class 'numpy.int64'>, <class 'numpy.int64'>, <class 'numpy.int64'>]
  1161. --args
  1162.  
  1163. --kwargs
  1164.  
  1165. 0 [main] python3 4796 child_info_fork::abort: unable to remap libtmp.so to same address as parent (0x1F0000) - try running rebaseall
  1166. Traceback (most recent call last):
  1167. File "/home/User/PyFR/env4/src/pyfr/pyfr/util.py", line 39, in __call__
  1168. res = cache[key]
  1169. KeyError: (<function OpenMPKernelProvider._build_kernel at 0x6fffe54a6a8>, b'\x80\x03X\x08\x00\x00\x00intcfluxq\x00XZ\x18\x00\x00\n\n#include <omp.h>\n#include <stdlib.h>\n#include <tgmath.h>\n\n#define PYFR_ALIGN_BYTES 32\n#define PYFR_NOINLINE __attribute__ ((noinline))\n\n#define min(a, b) ((a) < (b) ? (a) : (b))\n#define max(a, b) ((a) > (b) ? (a) : (b))\n\n// Typedefs\ntypedef double fpdtype_t;\n\n// OpenMP static loop scheduling functions\n\nstatic inline int\ngcd(int a, int b)\n{\n return (a == 0) ? b : gcd(b % a, a);\n}\n\nstatic inline void\nloop_sched_1d(int n, int align, int *b, int *e)\n{\n int tid = omp_get_thread_num();\n int nth = omp_get_num_threads();\n\n // Round up n to be a multiple of nth\n int rn = n + nth - 1 - (n - 1) % nth;\n\n // Nominal tile size\n int sz = rn / nth;\n\n // Handle alignment\n sz += align - 1 - (sz - 1) % align;\n\n // Assign the starting and ending index\n *b = sz * tid;\n *e = min(*b + sz, n);\n\n // Clamp\n if (*b >= n)\n *b = *e = 0;\n}\n\nstatic inline void\nloop_sched_2d(int nrow, int ncol, int colalign,\n int *rowb, int *rowe, int *colb, int *cole)\n{\n int tid = omp_get_thread_num();\n int nth = omp_get_num_threads();\n\n // Distribute threads\n int nrowth = gcd(nrow, nth);\n int ncolth = nth / nrowth;\n\n // Row and column indices for our thread\n int rowix = tid / ncolth;\n int colix = tid % ncolth;\n\n // Round up ncol to be a multiple of ncolth\n int rncol = ncol + ncolth - 1 - (ncol - 1) % ncolth;\n\n // Nominal tile size\n int ntilerow = nrow / nrowth;\n int ntilecol = rncol / ncolth;\n\n // Handle column alignment\n ntilecol += colalign - 1 - (ntilecol - 1) % colalign;\n\n // Assign the starting and ending row to each thread\n *rowb = ntilerow * rowix;\n *rowe = *rowb + ntilerow;\n\n // Assign the starting and ending column to each thread\n *colb = ntilecol * colix;\n *cole = min(*colb + ntilecol, ncol);\n\n // Clamp\n if (*colb >= ncol)\n *colb = *cole = 0;\n}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n void intcflux(int _nx, const fpdtype_t* __restrict__ magnl_v, const fpdtype_t* __restrict__ nl_v, int lsdnl, fpdtype_t* __restrict__ ul_v, const int* __restrict__ ul_vix, const int* __restrict__ ul_vcstri, fpdtype_t* __restrict__ ur_v, const int* __restrict__ ur_vix, const int* __restrict__ ur_vcstri)\n {\n #pragma omp parallel\n {\n int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);\n int cb, ce;\n loop_sched_1d(_nx, align, &cb, &ce);\n for (int _x = cb; _x < ce; _x++)\n {\n \n // Perform the Riemann solve\n fpdtype_t fn[4];\n {\n\n // Compute the left and right fluxes + velocities and pressures\n fpdtype_t fl_[2][4], fr_[2][4];\n fpdtype_t vl_[2], vr_[2];\n fpdtype_t pl_, pr_;\n\n {\n\n fpdtype_t invrho__ = 1.0/ul_v[ul_vix[_x] + ul_vcstri[_x]*0], E__ = ul_v[ul_vix[_x] + ul_vcstri[_x]*3];\n\n // Compute the velocities\n fpdtype_t rhov__[2];\n rhov__[0] = ul_v[ul_vix[_x] + ul_vcstri[_x]*1];\n vl_[0] = invrho__*rhov__[0];\n rhov__[1] = ul_v[ul_vix[_x] + ul_vcstri[_x]*2];\n vl_[1] = invrho__*rhov__[1];\n\n // Compute the pressure\n pl_ = 0.3999999999999999*(E__ - 0.5*invrho__*((rhov__[0])*(rhov__[0]) + (rhov__[1])*(rhov__[1])));\n\n // Density and energy fluxes\n fl_[0][0] = rhov__[0];\n fl_[0][3] = (E__ + pl_)*vl_[0];\n fl_[1][0] = rhov__[1];\n fl_[1][3] = (E__ + pl_)*vl_[1];\n\n // Momentum fluxes\n fl_[0][1] = rhov__[0]*vl_[0] + pl_;\n fl_[0][2] = rhov__[0]*vl_[1];\n fl_[1][1] = rhov__[1]*vl_[0];\n fl_[1][2] = rhov__[1]*vl_[1] + pl_;\n\n};\n {\n\n fpdtype_t invrho__ = 1.0/ur_v[ur_vix[_x] + ur_vcstri[_x]*0], E__ = ur_v[ur_vix[_x] + ur_vcstri[_x]*3];\n\n // Compute the velocities\n fpdtype_t rhov__[2];\n rhov__[0] = ur_v[ur_vix[_x] + ur_vcstri[_x]*1];\n vr_[0] = invrho__*rhov__[0];\n rhov__[1] = ur_v[ur_vix[_x] + ur_vcstri[_x]*2];\n vr_[1] = invrho__*rhov__[1];\n\n // Compute the pressure\n pr_ = 0.3999999999999999*(E__ - 0.5*invrho__*((rhov__[0])*(rhov__[0]) + (rhov__[1])*(rhov__[1])));\n\n // Density and energy fluxes\n fr_[0][0] = rhov__[0];\n fr_[0][3] = (E__ + pr_)*vr_[0];\n fr_[1][0] = rhov__[1];\n fr_[1][3] = (E__ + pr_)*vr_[1];\n\n // Momentum fluxes\n fr_[0][1] = rhov__[0]*vr_[0] + pr_;\n fr_[0][2] = rhov__[0]*vr_[1];\n fr_[1][1] = rhov__[1]*vr_[0];\n fr_[1][2] = rhov__[1]*vr_[1] + pr_;\n\n};\n\n // Sum the left and right velocities and take the normal\n fpdtype_t nv_ = ((nl_v[lsdnl*0 + _x])*(vl_[0] + vr_[0]) + (nl_v[lsdnl*1 + _x])*(vl_[1] + vr_[1]));\n\n // Estimate the maximum wave speed / 2\n fpdtype_t a_ = sqrt(0.35*(pl_ + pr_)/(ul_v[ul_vix[_x] + ul_vcstri[_x]*0] + ur_v[ur_vix[_x] + ur_vcstri[_x]*0]))\n + 0.25*fabs(nv_);\n\n // Output\n fn[0] = 0.5*(nl_v[lsdnl*0 + _x]*(fl_[0][0] + fr_[0][0]) + nl_v[lsdnl*1 + _x]*(fl_[1][0] + fr_[1][0]))\n + a_*(ul_v[ul_vix[_x] + ul_vcstri[_x]*0] - ur_v[ur_vix[_x] + ur_vcstri[_x]*0]);\n fn[1] = 0.5*(nl_v[lsdnl*0 + _x]*(fl_[0][1] + fr_[0][1]) + nl_v[lsdnl*1 + _x]*(fl_[1][1] + fr_[1][1]))\n + a_*(ul_v[ul_vix[_x] + ul_vcstri[_x]*1] - ur_v[ur_vix[_x] + ur_vcstri[_x]*1]);\n fn[2] = 0.5*(nl_v[lsdnl*0 + _x]*(fl_[0][2] + fr_[0][2]) + nl_v[lsdnl*1 + _x]*(fl_[1][2] + fr_[1][2]))\n + a_*(ul_v[ul_vix[_x] + ul_vcstri[_x]*2] - ur_v[ur_vix[_x] + ur_vcstri[_x]*2]);\n fn[3] = 0.5*(nl_v[lsdnl*0 + _x]*(fl_[0][3] + fr_[0][3]) + nl_v[lsdnl*1 + _x]*(fl_[1][3] + fr_[1][3]))\n + a_*(ul_v[ul_vix[_x] + ul_vcstri[_x]*3] - ur_v[ur_vix[_x] + ur_vcstri[_x]*3]);\n\n};\n\n // Scale and write out the common normal fluxes\n ul_v[ul_vix[_x] + ul_vcstri[_x]*0] = magnl_v[_x]*fn[0];\n ur_v[ur_vix[_x] + ur_vcstri[_x]*0] = -magnl_v[_x]*fn[0];\n ul_v[ul_vix[_x] + ul_vcstri[_x]*1] = magnl_v[_x]*fn[1];\n ur_v[ur_vix[_x] + ur_vcstri[_x]*1] = -magnl_v[_x]*fn[1];\n ul_v[ul_vix[_x] + ul_vcstri[_x]*2] = magnl_v[_x]*fn[2];\n ur_v[ur_vix[_x] + ur_vcstri[_x]*2] = -magnl_v[_x]*fn[2];\n ul_v[ul_vix[_x] + ul_vcstri[_x]*3] = magnl_v[_x]*fn[3];\n ur_v[ur_vix[_x] + ur_vcstri[_x]*3] = -magnl_v[_x]*fn[3];\n\n }\n }\n }\n\nq\x01]q\x02(cnumpy\nint32\nq\x03cnumpy\nint64\nq\x04h\x04h\x03h\x04h\x04h\x04h\x04h\x04h\x04e\x87q\x05.', b'\x80\x03}q\x00.')
  1170.  
  1171. During handling of the above exception, another exception occurred:
  1172.  
  1173. Traceback (most recent call last):
  1174. File "/home/User/PyFR/env4/lib/python3.4/site-packages/pytools/prefork.py", line 46, in call_capture_output
  1175. popen = Popen(cmdline, cwd=cwd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
  1176. File "/usr/lib/python3.4/subprocess.py", line 859, in __init__
  1177. restore_signals, start_new_session)
  1178. File "/usr/lib/python3.4/subprocess.py", line 1395, in _execute_child
  1179. restore_signals, start_new_session, preexec_fn)
  1180. BlockingIOError: [Errno 11] Resource temporarily unavailable
  1181.  
  1182. During handling of the above exception, another exception occurred:
  1183.  
  1184. Traceback (most recent call last):
  1185. File "/home/User/PyFR/env4/bin/pyfr", line 9, in <module>
  1186. load_entry_point('pyfr==1.3.0', 'console_scripts', 'pyfr')()
  1187. File "/home/User/PyFR/env4/src/pyfr/pyfr/scripts/main.py", line 109, in main
  1188. args.process(args)
  1189. File "/home/User/PyFR/env4/src/pyfr/pyfr/scripts/main.py", line 230, in process_run
  1190. args, NativeReader(args.mesh), None, Inifile.load(args.cfg)
  1191. File "/home/User/PyFR/env4/src/pyfr/pyfr/scripts/main.py", line 214, in _process_common
  1192. solver = get_solver(backend, rallocs, mesh, soln, cfg)
  1193. File "/home/User/PyFR/env4/src/pyfr/pyfr/solvers/__init__.py", line 14, in get_solver
  1194. return get_integrator(backend, systemcls, rallocs, mesh, initsoln, cfg)
  1195. File "/home/User/PyFR/env4/src/pyfr/pyfr/integrators/__init__.py", line 26, in get_integrator
  1196. return integrator(backend, systemcls, rallocs, mesh, initsoln, cfg)
  1197. File "/home/User/PyFR/env4/src/pyfr/pyfr/integrators/controllers.py", line 14, in __init__
  1198. super().__init__(*args, **kwargs)
  1199. File "/home/User/PyFR/env4/src/pyfr/pyfr/integrators/steppers.py", line 9, in __init__
  1200. super().__init__(*args, **kwargs)
  1201. File "/home/User/PyFR/env4/src/pyfr/pyfr/integrators/base.py", line 41, in __init__
  1202. self.system = systemcls(backend, rallocs, mesh, initsoln, nreg, cfg)
  1203. File "/home/User/PyFR/env4/src/pyfr/pyfr/solvers/base/system.py", line 59, in __init__
  1204. self._gen_kernels(eles, int_inters, mpi_inters, bc_inters)
  1205. File "/home/User/PyFR/env4/src/pyfr/pyfr/solvers/base/system.py", line 167, in _gen_kernels
  1206. kernels[pn, kn].append(kgetter())
  1207. File "/home/User/PyFR/env4/src/pyfr/pyfr/solvers/euler/inters.py", line 21, in <lambda>
  1208. magnl=self._mag_pnorm_lhs, nl=self._norm_pnorm_lhs
  1209. File "/home/User/PyFR/env4/src/pyfr/pyfr/backends/base/backend.py", line 154, in kernel
  1210. return kern(*args, **kwargs)
  1211. File "/home/User/PyFR/env4/src/pyfr/pyfr/backends/base/kernels.py", line 162, in kernel_meth
  1212. fun = self._build_kernel(name, src, list(it.chain(*argt)))
  1213. File "/home/User/PyFR/env4/src/pyfr/pyfr/util.py", line 43, in __call__
  1214. res = cache[key] = self.func(*args, **kwargs)
  1215. File "/home/User/PyFR/env4/src/pyfr/pyfr/backends/openmp/provider.py", line 13, in _build_kernel
  1216. mod = GccSourceModule(src, self.backend.cfg)
  1217. File "/home/User/PyFR/env4/src/pyfr/pyfr/backends/openmp/compiler.py", line 61, in __init__
  1218. super().__init__(src, cfg)
  1219. File "/home/User/PyFR/env4/src/pyfr/pyfr/backends/openmp/compiler.py", line 30, in __init__
  1220. lname = self._build(tmpdir)
  1221. File "/home/User/PyFR/env4/src/pyfr/pyfr/backends/openmp/compiler.py", line 80, in _build
  1222. call_capture_output(cmd + self._cflags, cwd=tmpdir)
  1223. File "/home/User/PyFR/env4/lib/python3.4/site-packages/pytools/prefork.py", line 197, in call_capture_output
  1224. return forker[0].call_capture_output(cmdline, cwd, error_on_nonzero)
  1225. File "/home/User/PyFR/env4/lib/python3.4/site-packages/pytools/prefork.py", line 54, in call_capture_output
  1226. % ( " ".join(cmdline), e))
  1227. pytools.prefork.ExecError: error invoking 'gcc -shared -std=c99 -Ofast -march=native -fopenmp -fPIC -o libtmp.so tmp.c': [Errno 11] Resource temporarily unavailable
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement