Untitled

User@User-PC ~/PyFR/euler_vortex_2d-1.3Test
$ export PYFR_DEBUG_OMP_KEEP_LIBS=1
(env4)
User@User-PC ~/PyFR/euler_vortex_2d-1.3Test
$ pyfr run -b openmp -p euler_vortex_2d.pyfrm euler_vortex_2d.ini

<pyfr.util.memoize object at 0x6fffe85c278>
--self
<pyfr.solvers.euler.elements.EulerElements object at 0x6fffe567630> upts
--args

--kwargs

res cache key bad
[[[-9.93056816 -8.93056816 -7.93056816 ...,  7.06943184  8.06943184
    9.06943184]
  [-9.93056816 -9.93056816 -9.93056816 ...,  9.06943184  9.06943184
    9.06943184]]

 [[-9.66999052 -8.66999052 -7.66999052 ...,  7.33000948  8.33000948
    9.33000948]
  [-9.93056816 -9.93056816 -9.93056816 ...,  9.06943184  9.06943184
    9.06943184]]

 [[-9.33000948 -8.33000948 -7.33000948 ...,  7.66999052  8.66999052
    9.66999052]
  [-9.93056816 -9.93056816 -9.93056816 ...,  9.06943184  9.06943184
    9.06943184]]

 ...,
 [[-9.66999052 -8.66999052 -7.66999052 ...,  7.33000948  8.33000948
    9.33000948]
  [-9.06943184 -9.06943184 -9.06943184 ...,  9.93056816  9.93056816
    9.93056816]]

 [[-9.33000948 -8.33000948 -7.33000948 ...,  7.66999052  8.66999052
    9.66999052]
  [-9.06943184 -9.06943184 -9.06943184 ...,  9.93056816  9.93056816
    9.93056816]]

 [[-9.06943184 -8.06943184 -7.06943184 ...,  7.93056816  8.93056816
    9.93056816]
  [-9.06943184 -9.06943184 -9.06943184 ...,  9.93056816  9.93056816
    9.93056816]]]
new res cache key

<pyfr.util.memoize object at 0x6fffe85c278>
--self
<pyfr.solvers.euler.elements.EulerElements object at 0x6fffe567630> upts
--args

--kwargs

[[[-9.93056816 -8.93056816 -7.93056816 ...,  7.06943184  8.06943184
    9.06943184]
  [-9.93056816 -9.93056816 -9.93056816 ...,  9.06943184  9.06943184
    9.06943184]]

 [[-9.66999052 -8.66999052 -7.66999052 ...,  7.33000948  8.33000948
    9.33000948]
  [-9.93056816 -9.93056816 -9.93056816 ...,  9.06943184  9.06943184
    9.06943184]]

 [[-9.33000948 -8.33000948 -7.33000948 ...,  7.66999052  8.66999052
    9.66999052]
  [-9.93056816 -9.93056816 -9.93056816 ...,  9.06943184  9.06943184
    9.06943184]]

 ...,
 [[-9.66999052 -8.66999052 -7.66999052 ...,  7.33000948  8.33000948
    9.33000948]
  [-9.06943184 -9.06943184 -9.06943184 ...,  9.93056816  9.93056816
    9.93056816]]

 [[-9.33000948 -8.33000948 -7.33000948 ...,  7.66999052  8.66999052
    9.66999052]
  [-9.06943184 -9.06943184 -9.06943184 ...,  9.93056816  9.93056816
    9.93056816]]

 [[-9.06943184 -8.06943184 -7.06943184 ...,  7.93056816  8.93056816
    9.93056816]
  [-9.06943184 -9.06943184 -9.06943184 ...,  9.93056816  9.93056816
    9.93056816]]]
res cache key good

<pyfr.util.memoize object at 0x6fffe85c278>
--self
<pyfr.solvers.euler.elements.EulerElements object at 0x6fffe567630> mpts
--args

--kwargs

res cache key bad
[[[-10.  -9.  -8. ...,   7.   8.   9.]
  [-10. -10. -10. ...,   9.   9.   9.]]

 [[ -9.  -8.  -7. ...,   8.   9.  10.]
  [-10. -10. -10. ...,   9.   9.   9.]]

 [[-10.  -9.  -8. ...,   7.   8.   9.]
  [ -9.  -9.  -9. ...,  10.  10.  10.]]

 [[ -9.  -8.  -7. ...,   8.   9.  10.]
  [ -9.  -9.  -9. ...,  10.  10.  10.]]]
new res cache key

<pyfr.util.memoize object at 0x6fffe85c160>
--self
<pyfr.solvers.euler.elements.EulerElements object at 0x6fffe567630> M3
--args

--kwargs

res cache key bad
<pyfr.backends.openmp.types.OpenMPConstMatrix object at 0x6ffffd5ed68>
new res cache key

<pyfr.util.memoize object at 0x6fffe7267b8>
--self
<pyfr.backends.openmp.cblas.OpenMPCBLASKernels object at 0x6fffe55f9e8> par_gemm

#include <omp.h>
#include <stdlib.h>
#include <tgmath.h>

// Alignment granularity in bytes; the kernels divide it by sizeof(fpdtype_t)
// to get the element multiple they hand to the loop schedulers
#define PYFR_ALIGN_BYTES 32
// GCC-style attribute that prevents the annotated function from being inlined
#define PYFR_NOINLINE __attribute__ ((noinline))

// Function-like min/max; the selected argument is evaluated twice, so avoid
// passing expressions with side effects
#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))

// Floating-point type used throughout the kernels (double here)
typedef double fpdtype_t;

// OpenMP static loop scheduling functions

// Greatest common divisor by Euclid's algorithm; gcd(0, b) yields b
static inline int
gcd(int a, int b)
{
    // Repeatedly replace (a, b) with (b % a, a) until a reaches zero
    while (a != 0)
    {
        int rem = b % a;
        b = a;
        a = rem;
    }

    return b;
}

// Computes the half-open range [*b, *e) of the indices 0..n-1 assigned to the
// calling OpenMP thread. All threads use the same chunk length: n divided by
// the team size, rounded up, and then rounded up to a multiple of align.
// A thread whose chunk would begin at or beyond n gets the empty range [0, 0).
static inline void
loop_sched_1d(int n, int align, int *b, int *e)
{
    const int nthreads = omp_get_num_threads();
    const int me = omp_get_thread_num();

    // Smallest multiple of nthreads that is not below n
    const int padded_n = n + nthreads - 1 - (n - 1) % nthreads;

    // Per-thread chunk length, padded up to the alignment multiple
    int chunk = padded_n / nthreads;
    chunk += align - 1 - (chunk - 1) % align;

    const int first = chunk * me;

    // Nothing left for this thread
    if (first >= n)
    {
        *b = 0;
        *e = 0;
        return;
    }

    // The final chunk is cut short at n
    const int last = first + chunk;
    *b = first;
    *e = (last < n) ? last : n;
}

// Computes the tile [*rowb, *rowe) x [*colb, *cole) of an nrow-by-ncol
// iteration space assigned to the calling OpenMP thread. The team is arranged
// as a grid with gcd(nrow, nthreads) thread rows; columns are split into
// equal chunks whose length is rounded up to a multiple of colalign. A thread
// whose column chunk would begin at or beyond ncol gets an empty column range.
static inline void
loop_sched_2d(int nrow, int ncol, int colalign,
              int *rowb, int *rowe, int *colb, int *cole)
{
    const int nthreads = omp_get_num_threads();
    const int me = omp_get_thread_num();

    // Shape of the thread grid
    const int row_groups = gcd(nrow, nthreads);
    const int col_groups = nthreads / row_groups;

    // Where this thread sits in the grid
    const int grid_row = me / col_groups;
    const int grid_col = me % col_groups;

    // Rows per tile
    const int tile_rows = nrow / row_groups;

    // Columns per tile: round ncol up to a multiple of col_groups, divide,
    // then pad up to the alignment multiple
    const int padded_cols = ncol + col_groups - 1 - (ncol - 1) % col_groups;
    int tile_cols = padded_cols / col_groups;
    tile_cols += colalign - 1 - (tile_cols - 1) % colalign;

    // Row extent of the tile
    const int row_first = tile_rows * grid_row;
    *rowb = row_first;
    *rowe = row_first + tile_rows;

    // Column extent of the tile, cut short at ncol or emptied entirely
    const int col_first = tile_cols * grid_col;
    if (col_first >= ncol)
    {
        *colb = 0;
        *cole = 0;
    }
    else
    {
        const int col_last = col_first + tile_cols;
        *colb = col_first;
        *cole = (col_last < ncol) ? col_last : ncol;
    }
}


// Layout and transpose selectors forwarded to the GEMM routine; the values
// correspond to CBLAS's CblasRowMajor (101) and CblasNoTrans (111)
#define ROW_MAJOR 101
#define NO_TRANS  111

// Type of the GEMM routine handed to par_gemm. par_gemm calls it as
// (layout, transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc)
typedef void (*cblas_gemm_t)(int, int, int,
                             int, int, int,
                             fpdtype_t, const fpdtype_t *, int,
                             const fpdtype_t *, int,
                             fpdtype_t, fpdtype_t *, int);

// Splits the N columns of B and C across the OpenMP team and has every thread
// invoke gemm on its own column range. A, M, K and all leading dimensions are
// passed through unchanged; threads with an empty range still call gemm with
// a column count of zero.
void
par_gemm(cblas_gemm_t gemm, int M, int N, int K,
         fpdtype_t alpha, const fpdtype_t *A, int lda,
         const fpdtype_t *B, int ldb,
         fpdtype_t beta, fpdtype_t *C, int ldc)
{
    #pragma omp parallel
    {
        // Columns [col_lo, col_hi) belong to this thread
        int col_lo, col_hi;
        const int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
        loop_sched_1d(N, align, &col_lo, &col_hi);

        // Shift B and C to the first column of the range
        const int ncols = col_hi - col_lo;
        const fpdtype_t *Bsub = B + col_lo;
        fpdtype_t *Csub = C + col_lo;

        gemm(ROW_MAJOR, NO_TRANS, NO_TRANS, M, ncols, K,
             alpha, A, lda, Bsub, ldb, beta, Csub, ldc);
    }
}

 [<class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.int32'>, <class 'numpy.int32'>, <class 'numpy.float64'>, <class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.float64'>, <class 'numpy.int64'>, <class 'numpy.int32'>]
--args

--kwargs

res cache key bad
<_FuncPtr object at 0x6fffe5694f8>
new res cache key

<pyfr.util.memoize object at 0x6fffe85c240>
--self
<pyfr.solvers.euler.elements.EulerElements object at 0x6fffe567630> upts
--args

--kwargs


<pyfr.util.memoize object at 0x6fffe85c208>
--self
<pyfr.solvers.euler.elements.EulerElements object at 0x6fffe567630> upts
--args

--kwargs

res cache key bad
[[ 4.  4.  4. ...,  4.  4.  4.]
 [ 4.  4.  4. ...,  4.  4.  4.]
 [ 4.  4.  4. ...,  4.  4.  4.]
 ...,
 [ 4.  4.  4. ...,  4.  4.  4.]
 [ 4.  4.  4. ...,  4.  4.  4.]
 [ 4.  4.  4. ...,  4.  4.  4.]]
new res cache key
res cache key bad
<pyfr.backends.openmp.types.OpenMPConstMatrix object at 0x6ffffca9f98>
new res cache key

<pyfr.util.memoize object at 0x6fffebbf470>
--self
<pyfr.backends.openmp.provider.OpenMPPointwiseKernelProvider object at 0x6fffe826f60> negdivconf pyfr.solvers.baseadvec.kernels.negdivconf {'ndims': 2, 'srcex': ['(0.)', '(0.)', '(0.)', '(0.)'], 'nvars': 4}
--args

--kwargs

res cache key bad
('\n\n#include <omp.h>\n#include <stdlib.h>\n#include <tgmath.h>\n\n#define PYFR_ALIGN_BYTES 32\n#define PYFR_NOINLINE __attribute__ ((noinline))\n\n#define min(a, b) ((a) < (b) ? (a) : (b))\n#define max(a, b) ((a) > (b) ? (a) : (b))\n\n// Typedefs\ntypedef double fpdtype_t;\n\n// OpenMP static loop scheduling functions\n\nstatic inline int\ngcd(int a, int b)\n{\n    return (a == 0) ? b : gcd(b % a, a);\n}\n\nstatic inline void\nloop_sched_1d(int n, int align, int *b, int *e)\n{\n    int tid = omp_get_thread_num();\n    int nth = omp_get_num_threads();\n\n    // Round up n to be a multiple of nth\n    int rn = n + nth - 1 - (n - 1) % nth;\n\n    // Nominal tile size\n    int sz = rn / nth;\n\n    // Handle alignment\n    sz += align - 1 - (sz - 1) % align;\n\n    // Assign the starting and ending index\n    *b = sz * tid;\n    *e = min(*b + sz, n);\n\n    // Clamp\n    if (*b >= n)\n        *b = *e = 0;\n}\n\nstatic inline void\nloop_sched_2d(int nrow, int ncol, int colalign,\n              int *rowb, int *rowe, int *colb, int *cole)\n{\n    int tid = omp_get_thread_num();\n    int nth = omp_get_num_threads();\n\n    // Distribute threads\n    int nrowth = gcd(nrow, nth);\n    int ncolth = nth / nrowth;\n\n    // Row and column indices for our thread\n    int rowix = tid / ncolth;\n    int colix = tid % ncolth;\n\n    // Round up ncol to be a multiple of ncolth\n    int rncol = ncol + ncolth - 1 - (ncol - 1) % ncolth;\n\n    // Nominal tile size\n    int ntilerow = nrow / nrowth;\n    int ntilecol = rncol / ncolth;\n\n    // Handle column alignment\n    ntilecol += colalign - 1 - (ntilecol - 1) % colalign;\n\n    // Assign the starting and ending row to each thread\n    *rowb = ntilerow * rowix;\n    *rowe = *rowb + ntilerow;\n\n    // Assign the starting and ending column to each thread\n    *colb = ntilecol * colix;\n    *cole = min(*colb + ntilecol, ncol);\n\n    // Clamp\n    if (*colb >= ncol)\n        *colb = *cole = 0;\n}\n\n\n\n\n\nstatic PYFR_NOINLINE void 
negdivconf_inner(int _nx, const fpdtype_t *__restrict__ rcpdjac_v, fpdtype_t *__restrict__ tdivtconf_v0, fpdtype_t *__restrict__ tdivtconf_v1, fpdtype_t *__restrict__ tdivtconf_v2, fpdtype_t *__restrict__ tdivtconf_v3)\n               {\n                   for (int _x = 0; _x < _nx; _x++)\n                   {\n                       \n    tdivtconf_v0[_x] = -rcpdjac_v[_x]*tdivtconf_v0[_x] + (0.);\n    tdivtconf_v1[_x] = -rcpdjac_v[_x]*tdivtconf_v1[_x] + (0.);\n    tdivtconf_v2[_x] = -rcpdjac_v[_x]*tdivtconf_v2[_x] + (0.);\n    tdivtconf_v3[_x] = -rcpdjac_v[_x]*tdivtconf_v3[_x] + (0.);\n\n                   }\n               }\n                   void negdivconf(int _ny, int _nx, const fpdtype_t* __restrict__ rcpdjac_v, int lsdrcpdjac, fpdtype_t* __restrict__ tdivtconf_v, int lsdtdivtconf)\n                   {\n                       #pragma omp parallel\n                       {\n                           int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);\n                           int rb, re, cb, ce;\n                           loop_sched_2d(_ny, _nx, align, &rb, &re, &cb, &ce);\n                           for (int _y = rb; _y < re; _y++)\n                           {\n                               negdivconf_inner(ce - cb, rcpdjac_v + _y*lsdrcpdjac + cb, tdivtconf_v + (_y*4 + 0)*lsdtdivtconf + cb, tdivtconf_v + (_y*4 + 1)*lsdtdivtconf + cb, tdivtconf_v + (_y*4 + 2)*lsdtdivtconf + cb, tdivtconf_v + (_y*4 + 3)*lsdtdivtconf + cb);\n                           }\n                       }\n                   }\n\n', 2, ['_ny', '_nx', 'rcpdjac', 'tdivtconf'], [[<class 'numpy.int32'>], [<class 'numpy.int32'>], [<class 'numpy.int64'>, <class 'numpy.int32'>], [<class 'numpy.int64'>, <class 'numpy.int32'>]])
new res cache key

<pyfr.util.memoize object at 0x6fffe7267b8>
--self
<pyfr.backends.openmp.provider.OpenMPPointwiseKernelProvider object at 0x6fffe826f60> negdivconf

#include <omp.h>
#include <stdlib.h>
#include <tgmath.h>

// Alignment granularity in bytes; the kernels divide it by sizeof(fpdtype_t)
// to get the element multiple they hand to the loop schedulers
#define PYFR_ALIGN_BYTES 32
// GCC-style attribute that prevents the annotated function from being inlined
#define PYFR_NOINLINE __attribute__ ((noinline))

// Function-like min/max; the selected argument is evaluated twice, so avoid
// passing expressions with side effects
#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))

// Floating-point type used throughout the kernels (double here)
typedef double fpdtype_t;

// OpenMP static loop scheduling functions

// Greatest common divisor by Euclid's algorithm; gcd(0, b) yields b
static inline int
gcd(int a, int b)
{
    // Repeatedly replace (a, b) with (b % a, a) until a reaches zero
    while (a != 0)
    {
        int rem = b % a;
        b = a;
        a = rem;
    }

    return b;
}

// Computes the half-open range [*b, *e) of the indices 0..n-1 assigned to the
// calling OpenMP thread. All threads use the same chunk length: n divided by
// the team size, rounded up, and then rounded up to a multiple of align.
// A thread whose chunk would begin at or beyond n gets the empty range [0, 0).
static inline void
loop_sched_1d(int n, int align, int *b, int *e)
{
    const int nthreads = omp_get_num_threads();
    const int me = omp_get_thread_num();

    // Smallest multiple of nthreads that is not below n
    const int padded_n = n + nthreads - 1 - (n - 1) % nthreads;

    // Per-thread chunk length, padded up to the alignment multiple
    int chunk = padded_n / nthreads;
    chunk += align - 1 - (chunk - 1) % align;

    const int first = chunk * me;

    // Nothing left for this thread
    if (first >= n)
    {
        *b = 0;
        *e = 0;
        return;
    }

    // The final chunk is cut short at n
    const int last = first + chunk;
    *b = first;
    *e = (last < n) ? last : n;
}

// Computes the tile [*rowb, *rowe) x [*colb, *cole) of an nrow-by-ncol
// iteration space assigned to the calling OpenMP thread. The team is arranged
// as a grid with gcd(nrow, nthreads) thread rows; columns are split into
// equal chunks whose length is rounded up to a multiple of colalign. A thread
// whose column chunk would begin at or beyond ncol gets an empty column range.
static inline void
loop_sched_2d(int nrow, int ncol, int colalign,
              int *rowb, int *rowe, int *colb, int *cole)
{
    const int nthreads = omp_get_num_threads();
    const int me = omp_get_thread_num();

    // Shape of the thread grid
    const int row_groups = gcd(nrow, nthreads);
    const int col_groups = nthreads / row_groups;

    // Where this thread sits in the grid
    const int grid_row = me / col_groups;
    const int grid_col = me % col_groups;

    // Rows per tile
    const int tile_rows = nrow / row_groups;

    // Columns per tile: round ncol up to a multiple of col_groups, divide,
    // then pad up to the alignment multiple
    const int padded_cols = ncol + col_groups - 1 - (ncol - 1) % col_groups;
    int tile_cols = padded_cols / col_groups;
    tile_cols += colalign - 1 - (tile_cols - 1) % colalign;

    // Row extent of the tile
    const int row_first = tile_rows * grid_row;
    *rowb = row_first;
    *rowe = row_first + tile_rows;

    // Column extent of the tile, cut short at ncol or emptied entirely
    const int col_first = tile_cols * grid_col;
    if (col_first >= ncol)
    {
        *colb = 0;
        *cole = 0;
    }
    else
    {
        const int col_last = col_first + tile_cols;
        *colb = col_first;
        *cole = (col_last < ncol) ? col_last : ncol;
    }
}


// Updates _nx consecutive entries of four tdivtconf component arrays in
// place: each entry is multiplied by the negated rcpdjac_v value at the same
// index and the (here all-zero) source expression is added.
static PYFR_NOINLINE void
negdivconf_inner(int _nx,
                 const fpdtype_t *__restrict__ rcpdjac_v,
                 fpdtype_t *__restrict__ tdivtconf_v0,
                 fpdtype_t *__restrict__ tdivtconf_v1,
                 fpdtype_t *__restrict__ tdivtconf_v2,
                 fpdtype_t *__restrict__ tdivtconf_v3)
{
    for (int i = 0; i < _nx; i++)
    {
        // Same negated factor for all four components at this index
        const fpdtype_t scale = -rcpdjac_v[i];

        tdivtconf_v0[i] = scale*tdivtconf_v0[i] + (0.);
        tdivtconf_v1[i] = scale*tdivtconf_v1[i] + (0.);
        tdivtconf_v2[i] = scale*tdivtconf_v2[i] + (0.);
        tdivtconf_v3[i] = scale*tdivtconf_v3[i] + (0.);
    }
}
                   // Entry point of the negdivconf kernel. Partitions the _ny-by-_nx
                   // iteration space across the OpenMP team with loop_sched_2d and runs
                   // negdivconf_inner on this thread's column range for each of its rows.
                   // For row y, rcpdjac_v is indexed at row y and tdivtconf_v at the four
                   // consecutive rows 4*y .. 4*y + 3, using the given leading dimensions.
                   void negdivconf(int _ny, int _nx, const fpdtype_t* __restrict__ rcpdjac_v, int lsdrcpdjac, fpdtype_t* __restrict__ tdivtconf_v, int lsdtdivtconf)
                   {
                       #pragma omp parallel
                       {
                           // This thread's tile: rows [row_lo, row_hi), columns [col_lo, col_hi)
                           int row_lo, row_hi, col_lo, col_hi;
                           const int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
                           loop_sched_2d(_ny, _nx, align, &row_lo, &row_hi, &col_lo, &col_hi);

                           const int width = col_hi - col_lo;

                           for (int y = row_lo; y < row_hi; y++)
                           {
                               // Start of this thread's column range in each operand row
                               const fpdtype_t *rcp = rcpdjac_v + y*lsdrcpdjac + col_lo;
                               fpdtype_t *t0 = tdivtconf_v + (y*4 + 0)*lsdtdivtconf + col_lo;
                               fpdtype_t *t1 = tdivtconf_v + (y*4 + 1)*lsdtdivtconf + col_lo;
                               fpdtype_t *t2 = tdivtconf_v + (y*4 + 2)*lsdtdivtconf + col_lo;
                               fpdtype_t *t3 = tdivtconf_v + (y*4 + 3)*lsdtdivtconf + col_lo;

                               negdivconf_inner(width, rcp, t0, t1, t2, t3);
                           }
                       }
                   }

 [<class 'numpy.int32'>, <class 'numpy.int32'>, <class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.int64'>, <class 'numpy.int32'>]
--args

--kwargs

res cache key bad
<_FuncPtr object at 0x6fffe5695c0>
new res cache key

<pyfr.util.memoize object at 0x6fffe85c160>
--self
<pyfr.solvers.euler.elements.EulerElements object at 0x6fffe567630> M1 - M3*M2
--args

--kwargs

res cache key bad
<pyfr.backends.openmp.types.OpenMPConstMatrix object at 0x6fffe108a20>
new res cache key

<pyfr.util.memoize object at 0x6fffe7267b8>
--self
<pyfr.backends.openmp.cblas.OpenMPCBLASKernels object at 0x6fffe55f9e8> par_gemm

#include <omp.h>
#include <stdlib.h>
#include <tgmath.h>

// Alignment granularity in bytes; the kernels divide it by sizeof(fpdtype_t)
// to get the element multiple they hand to the loop schedulers
#define PYFR_ALIGN_BYTES 32
// GCC-style attribute that prevents the annotated function from being inlined
#define PYFR_NOINLINE __attribute__ ((noinline))

// Function-like min/max; the selected argument is evaluated twice, so avoid
// passing expressions with side effects
#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))

// Floating-point type used throughout the kernels (double here)
typedef double fpdtype_t;

// OpenMP static loop scheduling functions

// Greatest common divisor by Euclid's algorithm; gcd(0, b) yields b
static inline int
gcd(int a, int b)
{
    // Repeatedly replace (a, b) with (b % a, a) until a reaches zero
    while (a != 0)
    {
        int rem = b % a;
        b = a;
        a = rem;
    }

    return b;
}

// Computes the half-open range [*b, *e) of the indices 0..n-1 assigned to the
// calling OpenMP thread. All threads use the same chunk length: n divided by
// the team size, rounded up, and then rounded up to a multiple of align.
// A thread whose chunk would begin at or beyond n gets the empty range [0, 0).
static inline void
loop_sched_1d(int n, int align, int *b, int *e)
{
    const int nthreads = omp_get_num_threads();
    const int me = omp_get_thread_num();

    // Smallest multiple of nthreads that is not below n
    const int padded_n = n + nthreads - 1 - (n - 1) % nthreads;

    // Per-thread chunk length, padded up to the alignment multiple
    int chunk = padded_n / nthreads;
    chunk += align - 1 - (chunk - 1) % align;

    const int first = chunk * me;

    // Nothing left for this thread
    if (first >= n)
    {
        *b = 0;
        *e = 0;
        return;
    }

    // The final chunk is cut short at n
    const int last = first + chunk;
    *b = first;
    *e = (last < n) ? last : n;
}

// Computes the tile [*rowb, *rowe) x [*colb, *cole) of an nrow-by-ncol
// iteration space assigned to the calling OpenMP thread. The team is arranged
// as a grid with gcd(nrow, nthreads) thread rows; columns are split into
// equal chunks whose length is rounded up to a multiple of colalign. A thread
// whose column chunk would begin at or beyond ncol gets an empty column range.
static inline void
loop_sched_2d(int nrow, int ncol, int colalign,
              int *rowb, int *rowe, int *colb, int *cole)
{
    const int nthreads = omp_get_num_threads();
    const int me = omp_get_thread_num();

    // Shape of the thread grid
    const int row_groups = gcd(nrow, nthreads);
    const int col_groups = nthreads / row_groups;

    // Where this thread sits in the grid
    const int grid_row = me / col_groups;
    const int grid_col = me % col_groups;

    // Rows per tile
    const int tile_rows = nrow / row_groups;

    // Columns per tile: round ncol up to a multiple of col_groups, divide,
    // then pad up to the alignment multiple
    const int padded_cols = ncol + col_groups - 1 - (ncol - 1) % col_groups;
    int tile_cols = padded_cols / col_groups;
    tile_cols += colalign - 1 - (tile_cols - 1) % colalign;

    // Row extent of the tile
    const int row_first = tile_rows * grid_row;
    *rowb = row_first;
    *rowe = row_first + tile_rows;

    // Column extent of the tile, cut short at ncol or emptied entirely
    const int col_first = tile_cols * grid_col;
    if (col_first >= ncol)
    {
        *colb = 0;
        *cole = 0;
    }
    else
    {
        const int col_last = col_first + tile_cols;
        *colb = col_first;
        *cole = (col_last < ncol) ? col_last : ncol;
    }
}


// Layout and transpose selectors forwarded to the GEMM routine; the values
// correspond to CBLAS's CblasRowMajor (101) and CblasNoTrans (111)
#define ROW_MAJOR 101
#define NO_TRANS  111

// Type of the GEMM routine handed to par_gemm. par_gemm calls it as
// (layout, transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc)
typedef void (*cblas_gemm_t)(int, int, int,
                             int, int, int,
                             fpdtype_t, const fpdtype_t *, int,
                             const fpdtype_t *, int,
                             fpdtype_t, fpdtype_t *, int);

// Splits the N columns of B and C across the OpenMP team and has every thread
// invoke gemm on its own column range. A, M, K and all leading dimensions are
// passed through unchanged; threads with an empty range still call gemm with
// a column count of zero.
void
par_gemm(cblas_gemm_t gemm, int M, int N, int K,
         fpdtype_t alpha, const fpdtype_t *A, int lda,
         const fpdtype_t *B, int ldb,
         fpdtype_t beta, fpdtype_t *C, int ldc)
{
    #pragma omp parallel
    {
        // Columns [col_lo, col_hi) belong to this thread
        int col_lo, col_hi;
        const int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
        loop_sched_1d(N, align, &col_lo, &col_hi);

        // Shift B and C to the first column of the range
        const int ncols = col_hi - col_lo;
        const fpdtype_t *Bsub = B + col_lo;
        fpdtype_t *Csub = C + col_lo;

        gemm(ROW_MAJOR, NO_TRANS, NO_TRANS, M, ncols, K,
             alpha, A, lda, Bsub, ldb, beta, Csub, ldc);
    }
}

 [<class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.int32'>, <class 'numpy.int32'>, <class 'numpy.float64'>, <class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.float64'>, <class 'numpy.int64'>, <class 'numpy.int32'>]
--args

--kwargs

<_FuncPtr object at 0x6fffe5694f8>
res cache key good

<pyfr.util.memoize object at 0x6fffe85c160>
--self
<pyfr.solvers.euler.elements.EulerElements object at 0x6fffe567630> M0
--args

--kwargs

res cache key bad
<pyfr.backends.openmp.types.OpenMPConstMatrix object at 0x6fffe108978>
new res cache key

<pyfr.util.memoize object at 0x6fffe7267b8>
--self
<pyfr.backends.openmp.cblas.OpenMPCBLASKernels object at 0x6fffe55f9e8> par_gemm

#include <omp.h>
#include <stdlib.h>
#include <tgmath.h>

// Alignment granularity in bytes; the kernels divide it by sizeof(fpdtype_t)
// to get the element multiple they hand to the loop schedulers
#define PYFR_ALIGN_BYTES 32
// GCC-style attribute that prevents the annotated function from being inlined
#define PYFR_NOINLINE __attribute__ ((noinline))

// Function-like min/max; the selected argument is evaluated twice, so avoid
// passing expressions with side effects
#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))

// Floating-point type used throughout the kernels (double here)
typedef double fpdtype_t;

// OpenMP static loop scheduling functions

// Greatest common divisor by Euclid's algorithm; gcd(0, b) yields b
static inline int
gcd(int a, int b)
{
    // Repeatedly replace (a, b) with (b % a, a) until a reaches zero
    while (a != 0)
    {
        int rem = b % a;
        b = a;
        a = rem;
    }

    return b;
}

// Computes the half-open range [*b, *e) of the indices 0..n-1 assigned to the
// calling OpenMP thread. All threads use the same chunk length: n divided by
// the team size, rounded up, and then rounded up to a multiple of align.
// A thread whose chunk would begin at or beyond n gets the empty range [0, 0).
static inline void
loop_sched_1d(int n, int align, int *b, int *e)
{
    const int nthreads = omp_get_num_threads();
    const int me = omp_get_thread_num();

    // Smallest multiple of nthreads that is not below n
    const int padded_n = n + nthreads - 1 - (n - 1) % nthreads;

    // Per-thread chunk length, padded up to the alignment multiple
    int chunk = padded_n / nthreads;
    chunk += align - 1 - (chunk - 1) % align;

    const int first = chunk * me;

    // Nothing left for this thread
    if (first >= n)
    {
        *b = 0;
        *e = 0;
        return;
    }

    // The final chunk is cut short at n
    const int last = first + chunk;
    *b = first;
    *e = (last < n) ? last : n;
}

// Computes the tile [*rowb, *rowe) x [*colb, *cole) of an nrow-by-ncol
// iteration space assigned to the calling OpenMP thread. The team is arranged
// as a grid with gcd(nrow, nthreads) thread rows; columns are split into
// equal chunks whose length is rounded up to a multiple of colalign. A thread
// whose column chunk would begin at or beyond ncol gets an empty column range.
static inline void
loop_sched_2d(int nrow, int ncol, int colalign,
              int *rowb, int *rowe, int *colb, int *cole)
{
    const int nthreads = omp_get_num_threads();
    const int me = omp_get_thread_num();

    // Shape of the thread grid
    const int row_groups = gcd(nrow, nthreads);
    const int col_groups = nthreads / row_groups;

    // Where this thread sits in the grid
    const int grid_row = me / col_groups;
    const int grid_col = me % col_groups;

    // Rows per tile
    const int tile_rows = nrow / row_groups;

    // Columns per tile: round ncol up to a multiple of col_groups, divide,
    // then pad up to the alignment multiple
    const int padded_cols = ncol + col_groups - 1 - (ncol - 1) % col_groups;
    int tile_cols = padded_cols / col_groups;
    tile_cols += colalign - 1 - (tile_cols - 1) % colalign;

    // Row extent of the tile
    const int row_first = tile_rows * grid_row;
    *rowb = row_first;
    *rowe = row_first + tile_rows;

    // Column extent of the tile, cut short at ncol or emptied entirely
    const int col_first = tile_cols * grid_col;
    if (col_first >= ncol)
    {
        *colb = 0;
        *cole = 0;
    }
    else
    {
        const int col_last = col_first + tile_cols;
        *colb = col_first;
        *cole = (col_last < ncol) ? col_last : ncol;
    }
}


// Layout and transpose selectors forwarded to the GEMM routine; the values
// correspond to CBLAS's CblasRowMajor (101) and CblasNoTrans (111)
#define ROW_MAJOR 101
#define NO_TRANS  111

// Type of the GEMM routine handed to par_gemm. par_gemm calls it as
// (layout, transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc)
typedef void (*cblas_gemm_t)(int, int, int,
                             int, int, int,
                             fpdtype_t, const fpdtype_t *, int,
                             const fpdtype_t *, int,
                             fpdtype_t, fpdtype_t *, int);

// Splits the N columns of B and C across the OpenMP team and has every thread
// invoke gemm on its own column range. A, M, K and all leading dimensions are
// passed through unchanged; threads with an empty range still call gemm with
// a column count of zero.
void
par_gemm(cblas_gemm_t gemm, int M, int N, int K,
         fpdtype_t alpha, const fpdtype_t *A, int lda,
         const fpdtype_t *B, int ldb,
         fpdtype_t beta, fpdtype_t *C, int ldc)
{
    #pragma omp parallel
    {
        // Columns [col_lo, col_hi) belong to this thread
        int col_lo, col_hi;
        const int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
        loop_sched_1d(N, align, &col_lo, &col_hi);

        // Shift B and C to the first column of the range
        const int ncols = col_hi - col_lo;
        const fpdtype_t *Bsub = B + col_lo;
        fpdtype_t *Csub = C + col_lo;

        gemm(ROW_MAJOR, NO_TRANS, NO_TRANS, M, ncols, K,
             alpha, A, lda, Bsub, ldb, beta, Csub, ldc);
    }
}

 [<class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.int32'>, <class 'numpy.int32'>, <class 'numpy.float64'>, <class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.float64'>, <class 'numpy.int64'>, <class 'numpy.int32'>]
--args

--kwargs

<_FuncPtr object at 0x6fffe5694f8>
res cache key good

<pyfr.util.memoize object at 0x6fffe85c1d0>
--self
<pyfr.solvers.euler.elements.EulerElements object at 0x6fffe567630> upts
--args

--kwargs


<pyfr.util.memoize object at 0x6fffe85c198>
--self
<pyfr.solvers.euler.elements.EulerElements object at 0x6fffe567630> upts
--args

--kwargs

res cache key bad
[[[[  5.00000000e-01   5.00000000e-01   5.00000000e-01 ...,
      5.00000000e-01   5.00000000e-01   5.00000000e-01]
   [ -6.16678657e-17  -8.26510554e-16   3.91590947e-15 ...,
      5.16569096e-14   0.00000000e+00   0.00000000e+00]]

  [[  5.00000000e-01   5.00000000e-01   5.00000000e-01 ...,
      5.00000000e-01   5.00000000e-01   5.00000000e-01]
   [ -2.93107297e-16  -5.95071123e-16   1.86123133e-14 ...,
      3.71919452e-14   0.00000000e+00   0.00000000e+00]]

  [[  5.00000000e-01   5.00000000e-01   5.00000000e-01 ...,
      5.00000000e-01   5.00000000e-01   5.00000000e-01]
   [ -5.95071123e-16  -2.93107297e-16   3.77870163e-14 ...,
      1.83192061e-14   0.00000000e+00   0.00000000e+00]]

  ...,
  [[  5.00000000e-01   5.00000000e-01   5.00000000e-01 ...,
      5.00000000e-01   5.00000000e-01   5.00000000e-01]
   [ -2.93107297e-16  -5.95071123e-16   1.86123133e-14 ...,
      3.71919452e-14   0.00000000e+00   0.00000000e+00]]

  [[  5.00000000e-01   5.00000000e-01   5.00000000e-01 ...,
      5.00000000e-01   5.00000000e-01   5.00000000e-01]
   [ -5.95071123e-16  -2.93107297e-16   3.77870163e-14 ...,
      1.83192061e-14   0.00000000e+00   0.00000000e+00]]

  [[  5.00000000e-01   5.00000000e-01   5.00000000e-01 ...,
      5.00000000e-01   5.00000000e-01   5.00000000e-01]
   [ -8.26510554e-16  -6.16678657e-17   5.24834202e-14 ...,
      3.85424160e-15   0.00000000e+00   0.00000000e+00]]]


 [[[ -6.16678657e-17   6.16678657e-17  -1.23335731e-16 ...,
      0.00000000e+00  -8.26510554e-16   8.26510554e-16]
   [  5.00000000e-01   5.00000000e-01   5.00000000e-01 ...,
      5.00000000e-01   5.00000000e-01   5.00000000e-01]]

  [[ -6.16678657e-17   6.16678657e-17  -1.23335731e-16 ...,
      0.00000000e+00  -8.26510554e-16   8.26510554e-16]
   [  5.00000000e-01   5.00000000e-01   5.00000000e-01 ...,
      5.00000000e-01   5.00000000e-01   5.00000000e-01]]

  [[ -6.16678657e-17   6.16678657e-17  -1.23335731e-16 ...,
      0.00000000e+00  -8.26510554e-16   8.26510554e-16]
   [  5.00000000e-01   5.00000000e-01   5.00000000e-01 ...,
      5.00000000e-01   5.00000000e-01   5.00000000e-01]]

  ...,
  [[ -8.26510554e-16   8.26510554e-16  -1.65302111e-15 ...,
      0.00000000e+00  -6.16678657e-17   6.16678657e-17]
   [  5.00000000e-01   5.00000000e-01   5.00000000e-01 ...,
      5.00000000e-01   5.00000000e-01   5.00000000e-01]]

  [[ -8.26510554e-16   8.26510554e-16  -1.65302111e-15 ...,
      0.00000000e+00  -6.16678657e-17   6.16678657e-17]
   [  5.00000000e-01   5.00000000e-01   5.00000000e-01 ...,
      5.00000000e-01   5.00000000e-01   5.00000000e-01]]

  [[ -8.26510554e-16   8.26510554e-16  -1.65302111e-15 ...,
      0.00000000e+00  -6.16678657e-17   6.16678657e-17]
   [  5.00000000e-01   5.00000000e-01   5.00000000e-01 ...,
      5.00000000e-01   5.00000000e-01   5.00000000e-01]]]]
new res cache key
res cache key bad
<pyfr.backends.openmp.types.OpenMPConstMatrix object at 0x6fffe159b70>
new res cache key

<pyfr.util.memoize object at 0x6fffebbf470>
--self
<pyfr.backends.openmp.provider.OpenMPPointwiseKernelProvider object at 0x6fffe826f60> tflux pyfr.solvers.euler.kernels.tflux {'ndims': 2, 'c': OrderedDict([('gamma', 1.4), ('S', 13.5), ('M', 0.4), ('R', 1.5)]), 'nvars': 4}
--args

--kwargs

res cache key bad
('\n\n#include <omp.h>\n#include <stdlib.h>\n#include <tgmath.h>\n\n#define PYFR_ALIGN_BYTES 32\n#define PYFR_NOINLINE __attribute__ ((noinline))\n\n#define min(a, b) ((a) < (b) ? (a) : (b))\n#define max(a, b) ((a) > (b) ? (a) : (b))\n\n// Typedefs\ntypedef double fpdtype_t;\n\n// OpenMP static loop scheduling functions\n\nstatic inline int\ngcd(int a, int b)\n{\n    return (a == 0) ? b : gcd(b % a, a);\n}\n\nstatic inline void\nloop_sched_1d(int n, int align, int *b, int *e)\n{\n    int tid = omp_get_thread_num();\n    int nth = omp_get_num_threads();\n\n    // Round up n to be a multiple of nth\n    int rn = n + nth - 1 - (n - 1) % nth;\n\n    // Nominal tile size\n    int sz = rn / nth;\n\n    // Handle alignment\n    sz += align - 1 - (sz - 1) % align;\n\n    // Assign the starting and ending index\n    *b = sz * tid;\n    *e = min(*b + sz, n);\n\n    // Clamp\n    if (*b >= n)\n        *b = *e = 0;\n}\n\nstatic inline void\nloop_sched_2d(int nrow, int ncol, int colalign,\n              int *rowb, int *rowe, int *colb, int *cole)\n{\n    int tid = omp_get_thread_num();\n    int nth = omp_get_num_threads();\n\n    // Distribute threads\n    int nrowth = gcd(nrow, nth);\n    int ncolth = nth / nrowth;\n\n    // Row and column indices for our thread\n    int rowix = tid / ncolth;\n    int colix = tid % ncolth;\n\n    // Round up ncol to be a multiple of ncolth\n    int rncol = ncol + ncolth - 1 - (ncol - 1) % ncolth;\n\n    // Nominal tile size\n    int ntilerow = nrow / nrowth;\n    int ntilecol = rncol / ncolth;\n\n    // Handle column alignment\n    ntilecol += colalign - 1 - (ntilecol - 1) % colalign;\n\n    // Assign the starting and ending row to each thread\n    *rowb = ntilerow * rowix;\n    *rowe = *rowb + ntilerow;\n\n    // Assign the starting and ending column to each thread\n    *colb = ntilecol * colix;\n    *cole = min(*colb + ntilecol, ncol);\n\n    // Clamp\n    if (*colb >= ncol)\n        *colb = *cole = 0;\n}\n\n\n\n\n\n\n\n\n\nstatic 
PYFR_NOINLINE void tflux_inner(int _nx, fpdtype_t *__restrict__ f_v0v0, fpdtype_t *__restrict__ f_v0v1, fpdtype_t *__restrict__ f_v0v2, fpdtype_t *__restrict__ f_v0v3, fpdtype_t *__restrict__ f_v1v0, fpdtype_t *__restrict__ f_v1v1, fpdtype_t *__restrict__ f_v1v2, fpdtype_t *__restrict__ f_v1v3, const fpdtype_t *__restrict__ smats_v0v0, const fpdtype_t *__restrict__ smats_v0v1, const fpdtype_t *__restrict__ smats_v1v0, const fpdtype_t *__restrict__ smats_v1v1, const fpdtype_t *__restrict__ u_v0, const fpdtype_t *__restrict__ u_v1, const fpdtype_t *__restrict__ u_v2, const fpdtype_t *__restrict__ u_v3)\n               {\n                   for (int _x = 0; _x < _nx; _x++)\n                   {\n                       \n    // Compute the flux\n    fpdtype_t ftemp[2][4];\n    fpdtype_t p, v[2];\n    {\n\n    fpdtype_t invrho_ = 1.0/u_v0[_x], E_ = u_v3[_x];\n\n    // Compute the velocities\n    fpdtype_t rhov_[2];\n    rhov_[0] = u_v1[_x];\n    v[0] = invrho_*rhov_[0];\n    rhov_[1] = u_v2[_x];\n    v[1] = invrho_*rhov_[1];\n\n    // Compute the pressure\n    p = 0.3999999999999999*(E_ - 0.5*invrho_*((rhov_[0])*(rhov_[0]) + (rhov_[1])*(rhov_[1])));\n\n    // Density and energy fluxes\n    ftemp[0][0] = rhov_[0];\n    ftemp[0][3] = (E_ + p)*v[0];\n    ftemp[1][0] = rhov_[1];\n    ftemp[1][3] = (E_ + p)*v[1];\n\n    // Momentum fluxes\n    ftemp[0][1] = rhov_[0]*v[0] + p;\n    ftemp[0][2] = rhov_[0]*v[1];\n    ftemp[1][1] = rhov_[1]*v[0];\n    ftemp[1][2] = rhov_[1]*v[1] + p;\n\n};\n\n    // Transform the fluxes\n    f_v0v0[_x] = smats_v0v0[_x]*ftemp[0][0] + smats_v0v1[_x]*ftemp[1][0];\n    f_v0v1[_x] = smats_v0v0[_x]*ftemp[0][1] + smats_v0v1[_x]*ftemp[1][1];\n    f_v0v2[_x] = smats_v0v0[_x]*ftemp[0][2] + smats_v0v1[_x]*ftemp[1][2];\n    f_v0v3[_x] = smats_v0v0[_x]*ftemp[0][3] + smats_v0v1[_x]*ftemp[1][3];\n    f_v1v0[_x] = smats_v1v0[_x]*ftemp[0][0] + smats_v1v1[_x]*ftemp[1][0];\n    f_v1v1[_x] = smats_v1v0[_x]*ftemp[0][1] + smats_v1v1[_x]*ftemp[1][1];\n    f_v1v2[_x] = 
smats_v1v0[_x]*ftemp[0][2] + smats_v1v1[_x]*ftemp[1][2];\n    f_v1v3[_x] = smats_v1v0[_x]*ftemp[0][3] + smats_v1v1[_x]*ftemp[1][3];\n\n                   }\n               }\n                   void tflux(int _ny, int _nx, fpdtype_t* __restrict__ f_v, int lsdf, const fpdtype_t* __restrict__ smats_v, int lsdsmats, const fpdtype_t* __restrict__ u_v, int lsdu)\n                   {\n                       #pragma omp parallel\n                       {\n                           int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);\n                           int rb, re, cb, ce;\n                           loop_sched_2d(_ny, _nx, align, &rb, &re, &cb, &ce);\n                           for (int _y = rb; _y < re; _y++)\n                           {\n                               tflux_inner(ce - cb, f_v + ((0*_ny + _y)*4 + 0)*lsdf + cb, f_v + ((0*_ny + _y)*4 + 1)*lsdf + cb, f_v + ((0*_ny + _y)*4 + 2)*lsdf + cb, f_v + ((0*_ny + _y)*4 + 3)*lsdf + cb, f_v + ((1*_ny + _y)*4 + 0)*lsdf + cb, f_v + ((1*_ny + _y)*4 + 1)*lsdf + cb, f_v + ((1*_ny + _y)*4 + 2)*lsdf + cb, f_v + ((1*_ny + _y)*4 + 3)*lsdf + cb, smats_v + ((0*_ny + _y)*2 + 0)*lsdsmats + cb, smats_v + ((0*_ny + _y)*2 + 1)*lsdsmats + cb, smats_v + ((1*_ny + _y)*2 + 0)*lsdsmats + cb, smats_v + ((1*_ny + _y)*2 + 1)*lsdsmats + cb, u_v + (_y*4 + 0)*lsdu + cb, u_v + (_y*4 + 1)*lsdu + cb, u_v + (_y*4 + 2)*lsdu + cb, u_v + (_y*4 + 3)*lsdu + cb);\n                           }\n                       }\n                   }\n\n', 2, ['_ny', '_nx', 'f', 'smats', 'u'], [[<class 'numpy.int32'>], [<class 'numpy.int32'>], [<class 'numpy.int64'>, <class 'numpy.int32'>], [<class 'numpy.int64'>, <class 'numpy.int32'>], [<class 'numpy.int64'>, <class 'numpy.int32'>]])
new res cache key

<pyfr.util.memoize object at 0x6fffe7267b8>
--self
<pyfr.backends.openmp.provider.OpenMPPointwiseKernelProvider object at 0x6fffe826f60> tflux

#include <omp.h>
#include <stdlib.h>
#include <tgmath.h>

// Alignment granularity in bytes; the kernels divide it by sizeof(fpdtype_t)
// to get the element multiple they hand to the loop schedulers
#define PYFR_ALIGN_BYTES 32
// GCC-style attribute that prevents the annotated function from being inlined
#define PYFR_NOINLINE __attribute__ ((noinline))

// Function-like min/max; the selected argument is evaluated twice, so avoid
// passing expressions with side effects
#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))

// Floating-point type used throughout the kernels (double here)
typedef double fpdtype_t;

// OpenMP static loop scheduling functions

// Greatest common divisor by Euclid's algorithm; gcd(0, b) yields b
static inline int
gcd(int a, int b)
{
    // Repeatedly replace (a, b) with (b % a, a) until a reaches zero
    while (a != 0)
    {
        int rem = b % a;
        b = a;
        a = rem;
    }

    return b;
}

// Works out the half-open range [*b, *e) of a loop of n iterations that the
// calling OpenMP thread should process. The per-thread chunk is rounded up
// to a multiple of align; a thread whose chunk would start at or beyond n
// is handed the empty range [0, 0).
static inline void
loop_sched_1d(int n, int align, int *b, int *e)
{
    const int me = omp_get_thread_num();
    const int nthreads = omp_get_num_threads();

    // Round n up to a multiple of the team size and share it out evenly
    int chunk = (n + nthreads - 1 - (n - 1) % nthreads) / nthreads;

    // Round the chunk up to a multiple of align
    chunk += align - 1 - (chunk - 1) % align;

    // Range owned by this thread, cut off at the end of the loop
    int first = chunk * me;
    int last = first + chunk;
    if (last >= n)
        last = n;

    // Nothing left for this thread
    if (first >= n)
        first = last = 0;

    *b = first;
    *e = last;
}

// Works out the tile of an nrow x ncol iteration space that the calling
// OpenMP thread should process: rows [*rowb, *rowe) and columns
// [*colb, *cole). The team is laid out as a grid with gcd(nrow, nthreads)
// threads along the rows; column tiles are rounded up to a multiple of
// colalign. A thread whose column tile would start at or beyond ncol gets
// the empty column range [0, 0).
static inline void
loop_sched_2d(int nrow, int ncol, int colalign,
              int *rowb, int *rowe, int *colb, int *cole)
{
    const int me = omp_get_thread_num();
    const int nthreads = omp_get_num_threads();

    // Shape of the thread grid
    const int row_threads = gcd(nrow, nthreads);
    const int col_threads = nthreads / row_threads;

    // Where this thread sits in the grid
    const int grid_row = me / col_threads;
    const int grid_col = me % col_threads;

    // Rows per tile (row_threads is a divisor of nrow by construction)
    const int tile_rows = nrow / row_threads;

    // Columns per tile: round ncol up to a multiple of col_threads, share
    // it out, then round the result up to a multiple of colalign
    int tile_cols = (ncol + col_threads - 1 - (ncol - 1) % col_threads) / col_threads;
    tile_cols += colalign - 1 - (tile_cols - 1) % colalign;

    // Row range of this thread
    const int row_first = tile_rows * grid_row;

    // Column range of this thread, cut off at the last column
    int col_first = tile_cols * grid_col;
    int col_last = col_first + tile_cols;
    if (col_last >= ncol)
        col_last = ncol;

    // No columns left for this thread
    if (col_first >= ncol)
        col_first = col_last = 0;

    *rowb = row_first;
    *rowe = row_first + tile_rows;
    *colb = col_first;
    *cole = col_last;
}


// Point-wise flux kernel for a single row of _nx points.
//
// For every point it reads the four state components (u_v0 .. u_v3), forms
// the flux in both directions and stores the flux combined with the 2x2
// smats coefficients: f_v{i}v{k} receives
//     smats_v{i}v0*flux[0][k] + smats_v{i}v1*flux[1][k].
static PYFR_NOINLINE void tflux_inner(int _nx, fpdtype_t *__restrict__ f_v0v0, fpdtype_t *__restrict__ f_v0v1, fpdtype_t *__restrict__ f_v0v2, fpdtype_t *__restrict__ f_v0v3, fpdtype_t *__restrict__ f_v1v0, fpdtype_t *__restrict__ f_v1v1, fpdtype_t *__restrict__ f_v1v2, fpdtype_t *__restrict__ f_v1v3, const fpdtype_t *__restrict__ smats_v0v0, const fpdtype_t *__restrict__ smats_v0v1, const fpdtype_t *__restrict__ smats_v1v0, const fpdtype_t *__restrict__ smats_v1v1, const fpdtype_t *__restrict__ u_v0, const fpdtype_t *__restrict__ u_v1, const fpdtype_t *__restrict__ u_v2, const fpdtype_t *__restrict__ u_v3)
{
    for (int _x = 0; _x < _nx; _x++)
    {
        // Unpack the state: reciprocal density, momenta and energy
        const fpdtype_t inv_rho = 1.0/u_v0[_x];
        const fpdtype_t mom[2] = { u_v1[_x], u_v2[_x] };
        const fpdtype_t E = u_v3[_x];

        // Velocities
        const fpdtype_t vel[2] = { inv_rho*mom[0], inv_rho*mom[1] };

        // Pressure (the leading constant is presumably gamma - 1)
        const fpdtype_t pres = 0.3999999999999999*(E - 0.5*inv_rho*((mom[0])*(mom[0]) + (mom[1])*(mom[1])));

        // Untransformed flux: first index is the direction, second the
        // variable (density, two momenta, energy)
        const fpdtype_t flux[2][4] = {
            { mom[0], mom[0]*vel[0] + pres, mom[0]*vel[1], (E + pres)*vel[0] },
            { mom[1], mom[1]*vel[0], mom[1]*vel[1] + pres, (E + pres)*vel[1] }
        };

        // Transform coefficients at this point
        const fpdtype_t s00 = smats_v0v0[_x], s01 = smats_v0v1[_x];
        const fpdtype_t s10 = smats_v1v0[_x], s11 = smats_v1v1[_x];

        // Transformed flux, first smats row
        f_v0v0[_x] = s00*flux[0][0] + s01*flux[1][0];
        f_v0v1[_x] = s00*flux[0][1] + s01*flux[1][1];
        f_v0v2[_x] = s00*flux[0][2] + s01*flux[1][2];
        f_v0v3[_x] = s00*flux[0][3] + s01*flux[1][3];

        // Transformed flux, second smats row
        f_v1v0[_x] = s10*flux[0][0] + s11*flux[1][0];
        f_v1v1[_x] = s10*flux[0][1] + s11*flux[1][1];
        f_v1v2[_x] = s10*flux[0][2] + s11*flux[1][2];
        f_v1v3[_x] = s10*flux[0][3] + s11*flux[1][3];
    }
}
                   // OpenMP driver for tflux_inner. Every thread of the
                   // parallel region asks loop_sched_2d for its tile of the
                   // _ny x _nx iteration space and then runs the inner kernel
                   // once per row of that tile. lsdf, lsdsmats and lsdu are
                   // used as the row strides of f_v, smats_v and u_v.
                   void tflux(int _ny, int _nx, fpdtype_t* __restrict__ f_v, int lsdf, const fpdtype_t* __restrict__ smats_v, int lsdsmats, const fpdtype_t* __restrict__ u_v, int lsdu)
                   {
                       #pragma omp parallel
                       {
                           int colalign = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
                           int row_lo, row_hi, col_lo, col_hi;
                           loop_sched_2d(_ny, _nx, colalign, &row_lo, &row_hi, &col_lo, &col_hi);

                           const int ncols = col_hi - col_lo;

                           for (int row = row_lo; row < row_hi; row++)
                           {
                               // First row index of this point's block in each
                               // array; f and smats hold two stacked groups of
                               // _ny blocks, u holds a single group
                               const int f0 = (0*_ny + row)*4, f1 = (1*_ny + row)*4;
                               const int s0 = (0*_ny + row)*2, s1 = (1*_ny + row)*2;
                               const int u0 = row*4;

                               tflux_inner(ncols,
                                           f_v + (f0 + 0)*lsdf + col_lo,
                                           f_v + (f0 + 1)*lsdf + col_lo,
                                           f_v + (f0 + 2)*lsdf + col_lo,
                                           f_v + (f0 + 3)*lsdf + col_lo,
                                           f_v + (f1 + 0)*lsdf + col_lo,
                                           f_v + (f1 + 1)*lsdf + col_lo,
                                           f_v + (f1 + 2)*lsdf + col_lo,
                                           f_v + (f1 + 3)*lsdf + col_lo,
                                           smats_v + (s0 + 0)*lsdsmats + col_lo,
                                           smats_v + (s0 + 1)*lsdsmats + col_lo,
                                           smats_v + (s1 + 0)*lsdsmats + col_lo,
                                           smats_v + (s1 + 1)*lsdsmats + col_lo,
                                           u_v + (u0 + 0)*lsdu + col_lo,
                                           u_v + (u0 + 1)*lsdu + col_lo,
                                           u_v + (u0 + 2)*lsdu + col_lo,
                                           u_v + (u0 + 3)*lsdu + col_lo);
                           }
                       }
                   }

 [<class 'numpy.int32'>, <class 'numpy.int32'>, <class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.int64'>, <class 'numpy.int32'>]
--args

--kwargs

res cache key bad
<_FuncPtr object at 0x6fffe569688>
new res cache key

<pyfr.util.memoize object at 0x6fffebbf470>
--self
<pyfr.backends.openmp.provider.OpenMPPointwiseKernelProvider object at 0x6fffe826f60> intcflux pyfr.solvers.euler.kernels.intcflux {'ndims': 2, 'rsolver': 'rusanov', 'c': OrderedDict([('gamma', 1.4), ('S', 13.5), ('M', 0.4), ('R', 1.5)]), 'nvars': 4}
--args

--kwargs

res cache key bad
('\n\n#include <omp.h>\n#include <stdlib.h>\n#include <tgmath.h>\n\n#define PYFR_ALIGN_BYTES 32\n#define PYFR_NOINLINE __attribute__ ((noinline))\n\n#define min(a, b) ((a) < (b) ? (a) : (b))\n#define max(a, b) ((a) > (b) ? (a) : (b))\n\n// Typedefs\ntypedef double fpdtype_t;\n\n// OpenMP static loop scheduling functions\n\nstatic inline int\ngcd(int a, int b)\n{\n    return (a == 0) ? b : gcd(b % a, a);\n}\n\nstatic inline void\nloop_sched_1d(int n, int align, int *b, int *e)\n{\n    int tid = omp_get_thread_num();\n    int nth = omp_get_num_threads();\n\n    // Round up n to be a multiple of nth\n    int rn = n + nth - 1 - (n - 1) % nth;\n\n    // Nominal tile size\n    int sz = rn / nth;\n\n    // Handle alignment\n    sz += align - 1 - (sz - 1) % align;\n\n    // Assign the starting and ending index\n    *b = sz * tid;\n    *e = min(*b + sz, n);\n\n    // Clamp\n    if (*b >= n)\n        *b = *e = 0;\n}\n\nstatic inline void\nloop_sched_2d(int nrow, int ncol, int colalign,\n              int *rowb, int *rowe, int *colb, int *cole)\n{\n    int tid = omp_get_thread_num();\n    int nth = omp_get_num_threads();\n\n    // Distribute threads\n    int nrowth = gcd(nrow, nth);\n    int ncolth = nth / nrowth;\n\n    // Row and column indices for our thread\n    int rowix = tid / ncolth;\n    int colix = tid % ncolth;\n\n    // Round up ncol to be a multiple of ncolth\n    int rncol = ncol + ncolth - 1 - (ncol - 1) % ncolth;\n\n    // Nominal tile size\n    int ntilerow = nrow / nrowth;\n    int ntilecol = rncol / ncolth;\n\n    // Handle column alignment\n    ntilecol += colalign - 1 - (ntilecol - 1) % colalign;\n\n    // Assign the starting and ending row to each thread\n    *rowb = ntilerow * rowix;\n    *rowe = *rowb + ntilerow;\n\n    // Assign the starting and ending column to each thread\n    *colb = ntilecol * colix;\n    *cole = min(*colb + ntilecol, ncol);\n\n    // Clamp\n    if (*colb >= ncol)\n        *colb = *cole = 0;\n}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n      
             void intcflux(int _nx, const fpdtype_t* __restrict__ magnl_v, const fpdtype_t* __restrict__ nl_v, int lsdnl, fpdtype_t* __restrict__ ul_v, const int* __restrict__ ul_vix, const int* __restrict__ ul_vcstri, fpdtype_t* __restrict__ ur_v, const int* __restrict__ ur_vix, const int* __restrict__ ur_vcstri)\n                   {\n                       #pragma omp parallel\n                       {\n                           int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);\n                           int cb, ce;\n                           loop_sched_1d(_nx, align, &cb, &ce);\n                           for (int _x = cb; _x < ce; _x++)\n                           {\n                               \n    // Perform the Riemann solve\n    fpdtype_t fn[4];\n    {\n\n    // Compute the left and right fluxes + velocities and pressures\n    fpdtype_t fl_[2][4], fr_[2][4];\n    fpdtype_t vl_[2], vr_[2];\n    fpdtype_t pl_, pr_;\n\n    {\n\n    fpdtype_t invrho__ = 1.0/ul_v[ul_vix[_x] + ul_vcstri[_x]*0], E__ = ul_v[ul_vix[_x] + ul_vcstri[_x]*3];\n\n    // Compute the velocities\n    fpdtype_t rhov__[2];\n    rhov__[0] = ul_v[ul_vix[_x] + ul_vcstri[_x]*1];\n    vl_[0] = invrho__*rhov__[0];\n    rhov__[1] = ul_v[ul_vix[_x] + ul_vcstri[_x]*2];\n    vl_[1] = invrho__*rhov__[1];\n\n    // Compute the pressure\n    pl_ = 0.3999999999999999*(E__ - 0.5*invrho__*((rhov__[0])*(rhov__[0]) + (rhov__[1])*(rhov__[1])));\n\n    // Density and energy fluxes\n    fl_[0][0] = rhov__[0];\n    fl_[0][3] = (E__ + pl_)*vl_[0];\n    fl_[1][0] = rhov__[1];\n    fl_[1][3] = (E__ + pl_)*vl_[1];\n\n    // Momentum fluxes\n    fl_[0][1] = rhov__[0]*vl_[0] + pl_;\n    fl_[0][2] = rhov__[0]*vl_[1];\n    fl_[1][1] = rhov__[1]*vl_[0];\n    fl_[1][2] = rhov__[1]*vl_[1] + pl_;\n\n};\n    {\n\n    fpdtype_t invrho__ = 1.0/ur_v[ur_vix[_x] + ur_vcstri[_x]*0], E__ = ur_v[ur_vix[_x] + ur_vcstri[_x]*3];\n\n    // Compute the velocities\n    fpdtype_t rhov__[2];\n    rhov__[0] = ur_v[ur_vix[_x] + 
ur_vcstri[_x]*1];\n    vr_[0] = invrho__*rhov__[0];\n    rhov__[1] = ur_v[ur_vix[_x] + ur_vcstri[_x]*2];\n    vr_[1] = invrho__*rhov__[1];\n\n    // Compute the pressure\n    pr_ = 0.3999999999999999*(E__ - 0.5*invrho__*((rhov__[0])*(rhov__[0]) + (rhov__[1])*(rhov__[1])));\n\n    // Density and energy fluxes\n    fr_[0][0] = rhov__[0];\n    fr_[0][3] = (E__ + pr_)*vr_[0];\n    fr_[1][0] = rhov__[1];\n    fr_[1][3] = (E__ + pr_)*vr_[1];\n\n    // Momentum fluxes\n    fr_[0][1] = rhov__[0]*vr_[0] + pr_;\n    fr_[0][2] = rhov__[0]*vr_[1];\n    fr_[1][1] = rhov__[1]*vr_[0];\n    fr_[1][2] = rhov__[1]*vr_[1] + pr_;\n\n};\n\n    // Sum the left and right velocities and take the normal\n    fpdtype_t nv_ = ((nl_v[lsdnl*0 + _x])*(vl_[0] + vr_[0]) + (nl_v[lsdnl*1 + _x])*(vl_[1] + vr_[1]));\n\n    // Estimate the maximum wave speed / 2\n    fpdtype_t a_ = sqrt(0.35*(pl_ + pr_)/(ul_v[ul_vix[_x] + ul_vcstri[_x]*0] + ur_v[ur_vix[_x] + ur_vcstri[_x]*0]))\n                + 0.25*fabs(nv_);\n\n    // Output\n    fn[0] = 0.5*(nl_v[lsdnl*0 + _x]*(fl_[0][0] + fr_[0][0]) + nl_v[lsdnl*1 + _x]*(fl_[1][0] + fr_[1][0]))\n             + a_*(ul_v[ul_vix[_x] + ul_vcstri[_x]*0] - ur_v[ur_vix[_x] + ur_vcstri[_x]*0]);\n    fn[1] = 0.5*(nl_v[lsdnl*0 + _x]*(fl_[0][1] + fr_[0][1]) + nl_v[lsdnl*1 + _x]*(fl_[1][1] + fr_[1][1]))\n             + a_*(ul_v[ul_vix[_x] + ul_vcstri[_x]*1] - ur_v[ur_vix[_x] + ur_vcstri[_x]*1]);\n    fn[2] = 0.5*(nl_v[lsdnl*0 + _x]*(fl_[0][2] + fr_[0][2]) + nl_v[lsdnl*1 + _x]*(fl_[1][2] + fr_[1][2]))\n             + a_*(ul_v[ul_vix[_x] + ul_vcstri[_x]*2] - ur_v[ur_vix[_x] + ur_vcstri[_x]*2]);\n    fn[3] = 0.5*(nl_v[lsdnl*0 + _x]*(fl_[0][3] + fr_[0][3]) + nl_v[lsdnl*1 + _x]*(fl_[1][3] + fr_[1][3]))\n             + a_*(ul_v[ul_vix[_x] + ul_vcstri[_x]*3] - ur_v[ur_vix[_x] + ur_vcstri[_x]*3]);\n\n};\n\n    // Scale and write out the common normal fluxes\n    ul_v[ul_vix[_x] + ul_vcstri[_x]*0] =  magnl_v[_x]*fn[0];\n    ur_v[ur_vix[_x] + ur_vcstri[_x]*0] = -magnl_v[_x]*fn[0];\n   
 ul_v[ul_vix[_x] + ul_vcstri[_x]*1] =  magnl_v[_x]*fn[1];\n    ur_v[ur_vix[_x] + ur_vcstri[_x]*1] = -magnl_v[_x]*fn[1];\n    ul_v[ul_vix[_x] + ul_vcstri[_x]*2] =  magnl_v[_x]*fn[2];\n    ur_v[ur_vix[_x] + ur_vcstri[_x]*2] = -magnl_v[_x]*fn[2];\n    ul_v[ul_vix[_x] + ul_vcstri[_x]*3] =  magnl_v[_x]*fn[3];\n    ur_v[ur_vix[_x] + ur_vcstri[_x]*3] = -magnl_v[_x]*fn[3];\n\n                           }\n                       }\n                   }\n\n', 1, ['_nx', 'magnl', 'nl', 'ul', 'ur'], [[<class 'numpy.int32'>], [<class 'numpy.int64'>], [<class 'numpy.int64'>, <class 'numpy.int32'>], [<class 'numpy.int64'>, <class 'numpy.int64'>, <class 'numpy.int64'>], [<class 'numpy.int64'>, <class 'numpy.int64'>, <class 'numpy.int64'>]])
new res cache key

<pyfr.util.memoize object at 0x6fffe7267b8>
--self
<pyfr.backends.openmp.provider.OpenMPPointwiseKernelProvider object at 0x6fffe826f60> intcflux

#include <omp.h>
#include <stdlib.h>
#include <tgmath.h>

#define PYFR_ALIGN_BYTES 32
#define PYFR_NOINLINE __attribute__ ((noinline))

#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))

// Typedefs
typedef double fpdtype_t;

// OpenMP static loop scheduling functions

// Greatest common divisor of a and b, computed with the iterative form of
// Euclid's algorithm. gcd(0, b) is b.
static inline int
gcd(int a, int b)
{
    // Each pass replaces (a, b) by (b % a, a); once a reaches zero the
    // result is whatever is left in b
    while (a != 0)
    {
        int rem = b % a;
        b = a;
        a = rem;
    }

    return b;
}

// Works out the half-open range [*b, *e) of a loop of n iterations that the
// calling OpenMP thread should process. The per-thread chunk is rounded up
// to a multiple of align; a thread whose chunk would start at or beyond n
// is handed the empty range [0, 0).
static inline void
loop_sched_1d(int n, int align, int *b, int *e)
{
    const int me = omp_get_thread_num();
    const int nthreads = omp_get_num_threads();

    // Round n up to a multiple of the team size and share it out evenly
    int chunk = (n + nthreads - 1 - (n - 1) % nthreads) / nthreads;

    // Round the chunk up to a multiple of align
    chunk += align - 1 - (chunk - 1) % align;

    // Range owned by this thread, cut off at the end of the loop
    int first = chunk * me;
    int last = first + chunk;
    if (last >= n)
        last = n;

    // Nothing left for this thread
    if (first >= n)
        first = last = 0;

    *b = first;
    *e = last;
}

// Works out the tile of an nrow x ncol iteration space that the calling
// OpenMP thread should process: rows [*rowb, *rowe) and columns
// [*colb, *cole). The team is laid out as a grid with gcd(nrow, nthreads)
// threads along the rows; column tiles are rounded up to a multiple of
// colalign. A thread whose column tile would start at or beyond ncol gets
// the empty column range [0, 0).
static inline void
loop_sched_2d(int nrow, int ncol, int colalign,
              int *rowb, int *rowe, int *colb, int *cole)
{
    const int me = omp_get_thread_num();
    const int nthreads = omp_get_num_threads();

    // Shape of the thread grid
    const int row_threads = gcd(nrow, nthreads);
    const int col_threads = nthreads / row_threads;

    // Where this thread sits in the grid
    const int grid_row = me / col_threads;
    const int grid_col = me % col_threads;

    // Rows per tile (row_threads is a divisor of nrow by construction)
    const int tile_rows = nrow / row_threads;

    // Columns per tile: round ncol up to a multiple of col_threads, share
    // it out, then round the result up to a multiple of colalign
    int tile_cols = (ncol + col_threads - 1 - (ncol - 1) % col_threads) / col_threads;
    tile_cols += colalign - 1 - (tile_cols - 1) % colalign;

    // Row range of this thread
    const int row_first = tile_rows * grid_row;

    // Column range of this thread, cut off at the last column
    int col_first = tile_cols * grid_col;
    int col_last = col_first + tile_cols;
    if (col_last >= ncol)
        col_last = ncol;

    // No columns left for this thread
    if (col_first >= ncol)
        col_first = col_last = 0;

    *rowb = row_first;
    *rowe = row_first + tile_rows;
    *colb = col_first;
    *cole = col_last;
}


                   // Common interface flux kernel. Each OpenMP thread takes
                   // the range of interface points given by loop_sched_1d. For
                   // every point it gathers the left and right states through
                   // the (ul_vix, ul_vcstri) / (ur_vix, ur_vcstri) base+stride
                   // index arrays, performs the Riemann solve along the normal
                   // stored in nl_v, and overwrites the left state with
                   // +magnl_v*fn and the right state with -magnl_v*fn.
                   void intcflux(int _nx, const fpdtype_t* __restrict__ magnl_v, const fpdtype_t* __restrict__ nl_v, int lsdnl, fpdtype_t* __restrict__ ul_v, const int* __restrict__ ul_vix, const int* __restrict__ ul_vcstri, fpdtype_t* __restrict__ ur_v, const int* __restrict__ ur_vix, const int* __restrict__ ur_vcstri)
                   {
                       #pragma omp parallel
                       {
                           int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
                           int lo, hi;
                           loop_sched_1d(_nx, align, &lo, &hi);

                           for (int _x = lo; _x < hi; _x++)
                           {
                               // Base offset and per-variable stride of each side
                               const int lbase = ul_vix[_x], lstri = ul_vcstri[_x];
                               const int rbase = ur_vix[_x], rstri = ur_vcstri[_x];

                               // Normal components and scaling magnitude
                               const fpdtype_t n0 = nl_v[lsdnl*0 + _x];
                               const fpdtype_t n1 = nl_v[lsdnl*1 + _x];
                               const fpdtype_t mag = magnl_v[_x];

                               // Gather both states up front; side 0 is the
                               // left state and side 1 the right state
                               fpdtype_t u[2][4];
                               for (int k = 0; k < 4; k++)
                               {
                                   u[0][k] = ul_v[lbase + lstri*k];
                                   u[1][k] = ur_v[rbase + rstri*k];
                               }

                               // Fluxes f[side][direction][variable], velocities
                               // and pressures of the two sides
                               fpdtype_t f[2][2][4], v[2][2], p[2];
                               for (int s = 0; s < 2; s++)
                               {
                                   const fpdtype_t invrho = 1.0/u[s][0], E = u[s][3];
                                   const fpdtype_t rhov[2] = { u[s][1], u[s][2] };

                                   // Velocities
                                   v[s][0] = invrho*rhov[0];
                                   v[s][1] = invrho*rhov[1];

                                   // Pressure
                                   p[s] = 0.3999999999999999*(E - 0.5*invrho*((rhov[0])*(rhov[0]) + (rhov[1])*(rhov[1])));

                                   // Density and energy fluxes
                                   f[s][0][0] = rhov[0];
                                   f[s][0][3] = (E + p[s])*v[s][0];
                                   f[s][1][0] = rhov[1];
                                   f[s][1][3] = (E + p[s])*v[s][1];

                                   // Momentum fluxes
                                   f[s][0][1] = rhov[0]*v[s][0] + p[s];
                                   f[s][0][2] = rhov[0]*v[s][1];
                                   f[s][1][1] = rhov[1]*v[s][0];
                                   f[s][1][2] = rhov[1]*v[s][1] + p[s];
                               }

                               // Normal component of the summed velocities
                               const fpdtype_t nv = ((n0)*(v[0][0] + v[1][0]) + (n1)*(v[0][1] + v[1][1]));

                               // Estimate of the maximum wave speed / 2
                               const fpdtype_t a = sqrt(0.35*(p[0] + p[1])/(u[0][0] + u[1][0]))
                                                 + 0.25*fabs(nv);

                               // Common normal flux per variable, scaled and
                               // written back with opposite signs. Only the
                               // local copies are read here, so the stores
                               // cannot affect later variables.
                               for (int k = 0; k < 4; k++)
                               {
                                   const fpdtype_t fn = 0.5*(n0*(f[0][0][k] + f[1][0][k]) + n1*(f[0][1][k] + f[1][1][k]))
                                                      + a*(u[0][k] - u[1][k]);

                                   ul_v[lbase + lstri*k] =  mag*fn;
                                   ur_v[rbase + rstri*k] = -mag*fn;
                               }
                           }
                       }
                   }

 [<class 'numpy.int32'>, <class 'numpy.int64'>, <class 'numpy.int64'>, <class 'numpy.int32'>, <class 'numpy.int64'>, <class 'numpy.int64'>, <class 'numpy.int64'>, <class 'numpy.int64'>, <class 'numpy.int64'>, <class 'numpy.int64'>]
--args

--kwargs

      0 [main] python3 4796 child_info_fork::abort: unable to remap libtmp.so to same address as parent (0x1F0000) - try running rebaseall
Traceback (most recent call last):
  File "/home/User/PyFR/env4/src/pyfr/pyfr/util.py", line 39, in __call__
    res = cache[key]
KeyError: (<function OpenMPKernelProvider._build_kernel at 0x6fffe54a6a8>, b'\x80\x03X\x08\x00\x00\x00intcfluxq\x00XZ\x18\x00\x00\n\n#include <omp.h>\n#include <stdlib.h>\n#include <tgmath.h>\n\n#define PYFR_ALIGN_BYTES 32\n#define PYFR_NOINLINE __attribute__ ((noinline))\n\n#define min(a, b) ((a) < (b) ? (a) : (b))\n#define max(a, b) ((a) > (b) ? (a) : (b))\n\n// Typedefs\ntypedef double fpdtype_t;\n\n// OpenMP static loop scheduling functions\n\nstatic inline int\ngcd(int a, int b)\n{\n    return (a == 0) ? b : gcd(b % a, a);\n}\n\nstatic inline void\nloop_sched_1d(int n, int align, int *b, int *e)\n{\n    int tid = omp_get_thread_num();\n    int nth = omp_get_num_threads();\n\n    // Round up n to be a multiple of nth\n    int rn = n + nth - 1 - (n - 1) % nth;\n\n    // Nominal tile size\n    int sz = rn / nth;\n\n    // Handle alignment\n    sz += align - 1 - (sz - 1) % align;\n\n    // Assign the starting and ending index\n    *b = sz * tid;\n    *e = min(*b + sz, n);\n\n    // Clamp\n    if (*b >= n)\n        *b = *e = 0;\n}\n\nstatic inline void\nloop_sched_2d(int nrow, int ncol, int colalign,\n              int *rowb, int *rowe, int *colb, int *cole)\n{\n    int tid = omp_get_thread_num();\n    int nth = omp_get_num_threads();\n\n    // Distribute threads\n    int nrowth = gcd(nrow, nth);\n    int ncolth = nth / nrowth;\n\n    // Row and column indices for our thread\n    int rowix = tid / ncolth;\n    int colix = tid % ncolth;\n\n    // Round up ncol to be a multiple of ncolth\n    int rncol = ncol + ncolth - 1 - (ncol - 1) % ncolth;\n\n    // Nominal tile size\n    int ntilerow = nrow / nrowth;\n    int ntilecol = rncol / ncolth;\n\n    // Handle column alignment\n    ntilecol += colalign - 1 - (ntilecol - 1) % colalign;\n\n    // Assign the starting and ending row to each thread\n    *rowb = ntilerow * rowix;\n    *rowe = *rowb + ntilerow;\n\n    // Assign the starting and ending column to each thread\n    *colb = ntilecol * colix;\n    *cole = min(*colb 
+ ntilecol, ncol);\n\n    // Clamp\n    if (*colb >= ncol)\n        *colb = *cole = 0;\n}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                   void intcflux(int _nx, const fpdtype_t* __restrict__ magnl_v, const fpdtype_t* __restrict__ nl_v, int lsdnl, fpdtype_t* __restrict__ ul_v, const int* __restrict__ ul_vix, const int* __restrict__ ul_vcstri, fpdtype_t* __restrict__ ur_v, const int* __restrict__ ur_vix, const int* __restrict__ ur_vcstri)\n                   {\n                       #pragma omp parallel\n                       {\n                           int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);\n                           int cb, ce;\n                           loop_sched_1d(_nx, align, &cb, &ce);\n                           for (int _x = cb; _x < ce; _x++)\n                           {\n                               \n    // Perform the Riemann solve\n    fpdtype_t fn[4];\n    {\n\n    // Compute the left and right fluxes + velocities and pressures\n    fpdtype_t fl_[2][4], fr_[2][4];\n    fpdtype_t vl_[2], vr_[2];\n    fpdtype_t pl_, pr_;\n\n    {\n\n    fpdtype_t invrho__ = 1.0/ul_v[ul_vix[_x] + ul_vcstri[_x]*0], E__ = ul_v[ul_vix[_x] + ul_vcstri[_x]*3];\n\n    // Compute the velocities\n    fpdtype_t rhov__[2];\n    rhov__[0] = ul_v[ul_vix[_x] + ul_vcstri[_x]*1];\n    vl_[0] = invrho__*rhov__[0];\n    rhov__[1] = ul_v[ul_vix[_x] + ul_vcstri[_x]*2];\n    vl_[1] = invrho__*rhov__[1];\n\n    // Compute the pressure\n    pl_ = 0.3999999999999999*(E__ - 0.5*invrho__*((rhov__[0])*(rhov__[0]) + (rhov__[1])*(rhov__[1])));\n\n    // Density and energy fluxes\n    fl_[0][0] = rhov__[0];\n    fl_[0][3] = (E__ + pl_)*vl_[0];\n    fl_[1][0] = rhov__[1];\n    fl_[1][3] = (E__ + pl_)*vl_[1];\n\n    // Momentum fluxes\n    fl_[0][1] = rhov__[0]*vl_[0] + pl_;\n    fl_[0][2] = rhov__[0]*vl_[1];\n    fl_[1][1] = rhov__[1]*vl_[0];\n    fl_[1][2] = rhov__[1]*vl_[1] + pl_;\n\n};\n    {\n\n    fpdtype_t invrho__ = 1.0/ur_v[ur_vix[_x] + ur_vcstri[_x]*0], E__ = 
ur_v[ur_vix[_x] + ur_vcstri[_x]*3];\n\n    // Compute the velocities\n    fpdtype_t rhov__[2];\n    rhov__[0] = ur_v[ur_vix[_x] + ur_vcstri[_x]*1];\n    vr_[0] = invrho__*rhov__[0];\n    rhov__[1] = ur_v[ur_vix[_x] + ur_vcstri[_x]*2];\n    vr_[1] = invrho__*rhov__[1];\n\n    // Compute the pressure\n    pr_ = 0.3999999999999999*(E__ - 0.5*invrho__*((rhov__[0])*(rhov__[0]) + (rhov__[1])*(rhov__[1])));\n\n    // Density and energy fluxes\n    fr_[0][0] = rhov__[0];\n    fr_[0][3] = (E__ + pr_)*vr_[0];\n    fr_[1][0] = rhov__[1];\n    fr_[1][3] = (E__ + pr_)*vr_[1];\n\n    // Momentum fluxes\n    fr_[0][1] = rhov__[0]*vr_[0] + pr_;\n    fr_[0][2] = rhov__[0]*vr_[1];\n    fr_[1][1] = rhov__[1]*vr_[0];\n    fr_[1][2] = rhov__[1]*vr_[1] + pr_;\n\n};\n\n    // Sum the left and right velocities and take the normal\n    fpdtype_t nv_ = ((nl_v[lsdnl*0 + _x])*(vl_[0] + vr_[0]) + (nl_v[lsdnl*1 + _x])*(vl_[1] + vr_[1]));\n\n    // Estimate the maximum wave speed / 2\n    fpdtype_t a_ = sqrt(0.35*(pl_ + pr_)/(ul_v[ul_vix[_x] + ul_vcstri[_x]*0] + ur_v[ur_vix[_x] + ur_vcstri[_x]*0]))\n                + 0.25*fabs(nv_);\n\n    // Output\n    fn[0] = 0.5*(nl_v[lsdnl*0 + _x]*(fl_[0][0] + fr_[0][0]) + nl_v[lsdnl*1 + _x]*(fl_[1][0] + fr_[1][0]))\n             + a_*(ul_v[ul_vix[_x] + ul_vcstri[_x]*0] - ur_v[ur_vix[_x] + ur_vcstri[_x]*0]);\n    fn[1] = 0.5*(nl_v[lsdnl*0 + _x]*(fl_[0][1] + fr_[0][1]) + nl_v[lsdnl*1 + _x]*(fl_[1][1] + fr_[1][1]))\n             + a_*(ul_v[ul_vix[_x] + ul_vcstri[_x]*1] - ur_v[ur_vix[_x] + ur_vcstri[_x]*1]);\n    fn[2] = 0.5*(nl_v[lsdnl*0 + _x]*(fl_[0][2] + fr_[0][2]) + nl_v[lsdnl*1 + _x]*(fl_[1][2] + fr_[1][2]))\n             + a_*(ul_v[ul_vix[_x] + ul_vcstri[_x]*2] - ur_v[ur_vix[_x] + ur_vcstri[_x]*2]);\n    fn[3] = 0.5*(nl_v[lsdnl*0 + _x]*(fl_[0][3] + fr_[0][3]) + nl_v[lsdnl*1 + _x]*(fl_[1][3] + fr_[1][3]))\n             + a_*(ul_v[ul_vix[_x] + ul_vcstri[_x]*3] - ur_v[ur_vix[_x] + ur_vcstri[_x]*3]);\n\n};\n\n    // Scale and write out the common normal 
fluxes\n    ul_v[ul_vix[_x] + ul_vcstri[_x]*0] =  magnl_v[_x]*fn[0];\n    ur_v[ur_vix[_x] + ur_vcstri[_x]*0] = -magnl_v[_x]*fn[0];\n    ul_v[ul_vix[_x] + ul_vcstri[_x]*1] =  magnl_v[_x]*fn[1];\n    ur_v[ur_vix[_x] + ur_vcstri[_x]*1] = -magnl_v[_x]*fn[1];\n    ul_v[ul_vix[_x] + ul_vcstri[_x]*2] =  magnl_v[_x]*fn[2];\n    ur_v[ur_vix[_x] + ur_vcstri[_x]*2] = -magnl_v[_x]*fn[2];\n    ul_v[ul_vix[_x] + ul_vcstri[_x]*3] =  magnl_v[_x]*fn[3];\n    ur_v[ur_vix[_x] + ur_vcstri[_x]*3] = -magnl_v[_x]*fn[3];\n\n                           }\n                       }\n                   }\n\nq\x01]q\x02(cnumpy\nint32\nq\x03cnumpy\nint64\nq\x04h\x04h\x03h\x04h\x04h\x04h\x04h\x04h\x04e\x87q\x05.', b'\x80\x03}q\x00.')

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/User/PyFR/env4/lib/python3.4/site-packages/pytools/prefork.py", line 46, in call_capture_output
    popen = Popen(cmdline, cwd=cwd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
  File "/usr/lib/python3.4/subprocess.py", line 859, in __init__
    restore_signals, start_new_session)
  File "/usr/lib/python3.4/subprocess.py", line 1395, in _execute_child
    restore_signals, start_new_session, preexec_fn)
BlockingIOError: [Errno 11] Resource temporarily unavailable

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/User/PyFR/env4/bin/pyfr", line 9, in <module>
    load_entry_point('pyfr==1.3.0', 'console_scripts', 'pyfr')()
  File "/home/User/PyFR/env4/src/pyfr/pyfr/scripts/main.py", line 109, in main
    args.process(args)
  File "/home/User/PyFR/env4/src/pyfr/pyfr/scripts/main.py", line 230, in process_run
    args, NativeReader(args.mesh), None, Inifile.load(args.cfg)
  File "/home/User/PyFR/env4/src/pyfr/pyfr/scripts/main.py", line 214, in _process_common
    solver = get_solver(backend, rallocs, mesh, soln, cfg)
  File "/home/User/PyFR/env4/src/pyfr/pyfr/solvers/__init__.py", line 14, in get_solver
    return get_integrator(backend, systemcls, rallocs, mesh, initsoln, cfg)
  File "/home/User/PyFR/env4/src/pyfr/pyfr/integrators/__init__.py", line 26, in get_integrator
    return integrator(backend, systemcls, rallocs, mesh, initsoln, cfg)
  File "/home/User/PyFR/env4/src/pyfr/pyfr/integrators/controllers.py", line 14, in __init__
    super().__init__(*args, **kwargs)
  File "/home/User/PyFR/env4/src/pyfr/pyfr/integrators/steppers.py", line 9, in __init__
    super().__init__(*args, **kwargs)
  File "/home/User/PyFR/env4/src/pyfr/pyfr/integrators/base.py", line 41, in __init__
    self.system = systemcls(backend, rallocs, mesh, initsoln, nreg, cfg)
  File "/home/User/PyFR/env4/src/pyfr/pyfr/solvers/base/system.py", line 59, in __init__
    self._gen_kernels(eles, int_inters, mpi_inters, bc_inters)
  File "/home/User/PyFR/env4/src/pyfr/pyfr/solvers/base/system.py", line 167, in _gen_kernels
    kernels[pn, kn].append(kgetter())
  File "/home/User/PyFR/env4/src/pyfr/pyfr/solvers/euler/inters.py", line 21, in <lambda>
    magnl=self._mag_pnorm_lhs, nl=self._norm_pnorm_lhs
  File "/home/User/PyFR/env4/src/pyfr/pyfr/backends/base/backend.py", line 154, in kernel
    return kern(*args, **kwargs)
  File "/home/User/PyFR/env4/src/pyfr/pyfr/backends/base/kernels.py", line 162, in kernel_meth
    fun = self._build_kernel(name, src, list(it.chain(*argt)))
  File "/home/User/PyFR/env4/src/pyfr/pyfr/util.py", line 43, in __call__
    res = cache[key] = self.func(*args, **kwargs)
  File "/home/User/PyFR/env4/src/pyfr/pyfr/backends/openmp/provider.py", line 13, in _build_kernel
    mod = GccSourceModule(src, self.backend.cfg)
  File "/home/User/PyFR/env4/src/pyfr/pyfr/backends/openmp/compiler.py", line 61, in __init__
    super().__init__(src, cfg)
  File "/home/User/PyFR/env4/src/pyfr/pyfr/backends/openmp/compiler.py", line 30, in __init__
    lname = self._build(tmpdir)
  File "/home/User/PyFR/env4/src/pyfr/pyfr/backends/openmp/compiler.py", line 80, in _build
    call_capture_output(cmd + self._cflags, cwd=tmpdir)
  File "/home/User/PyFR/env4/lib/python3.4/site-packages/pytools/prefork.py", line 197, in call_capture_output
    return forker[0].call_capture_output(cmdline, cwd, error_on_nonzero)
  File "/home/User/PyFR/env4/lib/python3.4/site-packages/pytools/prefork.py", line 54, in call_capture_output
    % ( " ".join(cmdline), e))
pytools.prefork.ExecError: error invoking 'gcc -shared -std=c99 -Ofast -march=native -fopenmp -fPIC -o libtmp.so tmp.c': [Errno 11] Resource temporarily unavailable