/home/panyala/anaconda3/lib/python3.6/site-packages/torch/cuda/__init__.py:95: UserWarning:
    Found GPU0 Tesla V100-PCIE-16GB which requires CUDA_VERSION >= 9000 for
     optimal performance and fast startup time, but your PyTorch was compiled
     with CUDA_VERSION 8000. Please install the correct PyTorch binary
     using instructions from http://pytorch.org

  warnings.warn(incorrect_binary_warn % (d, name, 9000, CUDA_VERSION))
/home/panyala/anaconda3/lib/python3.6/site-packages/torch/cuda/__init__.py:95: UserWarning:
    Found GPU1 Tesla V100-PCIE-16GB which requires CUDA_VERSION >= 9000 for
     optimal performance and fast startup time, but your PyTorch was compiled
     with CUDA_VERSION 8000. Please install the correct PyTorch binary
     using instructions from http://pytorch.org

  warnings.warn(incorrect_binary_warn % (d, name, 9000, CUDA_VERSION))
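The two warnings above come from PyTorch's CUDA initialization: the Tesla V100 is a compute capability 7.0 (Volta) device, but this PyTorch binary was built against CUDA 8, which predates Volta. A quick way to confirm the mismatch from Python, using standard torch introspection calls (the exact warning text varies by release):

    import torch

    # CUDA toolkit version this PyTorch binary was compiled against, e.g. "8.0.61"
    print(torch.version.cuda)

    # Name and compute capability of each visible device; a V100 reports (7, 0),
    # which no CUDA 8 toolchain can target
    for d in range(torch.cuda.device_count()):
        print(torch.cuda.get_device_name(d), torch.cuda.get_device_capability(d))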
[WARNING]: No mapping options passed, 'naive' type mapping options will be used and will likely have bad performance. See help(your_layer.__call__) for setting mapping options.
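This warning is Tensor Comprehensions reporting that the layer was called without MappingOptions, so it falls back to an untuned naive schedule (visible as makeNaiveMappingOptions() in the dump below). A minimal sketch of passing non-naive options through the TC Python frontend of that era; tc.define, tc.Options, and the preset name "mlp" follow the 2018 TC docs and may differ in other versions:

    import torch
    import tensor_comprehensions as tc

    lang = """
    def matmul(float(M, K) A, float(K, N) B) -> (C) {
        C(m, n) +=! A(m, k) * B(k, n)
    }
    """
    matmul = tc.define(lang, name="matmul")
    A, B = torch.randn(3, 4).cuda(), torch.randn(4, 5).cuda()

    # Supplying a built-in preset (or autotuned options) silences the
    # naive-mapping warning and usually performs far better
    C = matmul(A, B, options=tc.Options("mlp"))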
WARNING: Logging before InitGoogleLogging() is written to STDERR
W1113 10:52:01.228185 40812 rtc.cc:103] Compilation failure for nvrtc(NVRTC_ERROR_INVALID_OPTION):
nvrtc: error: invalid value for --gpu-architecture (-arch)
 source:
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}
#define if_then_else(cond,a,b) (cond) ? (a) : (b);

// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;

#define inff __int_as_float(0x7f800000)
#define inf __longlong_as_double(0x7ff0000000000000LL)

extern "C" {
__global__ void matmul_4_3_5(int32 K, int32 M, int32 N, float32* pC, float32* pA, float32* pB) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*C)[5] = reinterpret_cast<float32 (*)[5]>(pC);
  float32 (*A)[4] = reinterpret_cast<float32 (*)[4]>(pA);
  float32 (*B)[5] = reinterpret_cast<float32 (*)[5]>(pB);
  C[t1][t0] = 0.000000f;
  for (int c5 = 0; c5 <= 3; c5 += 1) {
    C[t1][t0] = (C[t1][t0] + (A[t1][c5]*B[c5][t0]));
  }
}
}

/*
Mapping Options:
tc::MappingOptions::makeNaiveMappingOptions()
    .outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
    .outerScheduleAllowSkewing(false)
    .outerSchedulePositiveOrthant(true)
    .intraTileScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
    .intraTileScheduleAllowSkewing(false)
    .intraTileSchedulePositiveOrthant(true)
    .tile(32, 32, 32)
    .mapToThreads(32, 8)
    .mapToBlocks(256, 256)
    .unroll(1)
    .tileImperfectlyNested(false)
    .useSharedMemory(false)
    .usePrivateMemory(false)
    .unrollCopyShared(false)
    .matchLibraryCalls(false);
TC version: 8e112e9dccda62c30ef29208a827e783b9a7f156
*/
E1113 10:52:01.228243 40812 rtc.cc:106] Compilation failure for nvrtc(NVRTC_ERROR_INVALID_OPTION):
nvrtc: error: invalid value for --gpu-architecture (-arch)
 source:
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
}
#define if_then_else(cond,a,b) (cond) ? (a) : (b);

// Halide type handling
typedef int int32;
typedef long int64;
typedef float float32;
typedef double float64;

#define inff __int_as_float(0x7f800000)
#define inf __longlong_as_double(0x7ff0000000000000LL)

extern "C" {
__global__ void matmul_4_3_5(int32 K, int32 M, int32 N, float32* pC, float32* pA, float32* pB) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float32 (*C)[5] = reinterpret_cast<float32 (*)[5]>(pC);
  float32 (*A)[4] = reinterpret_cast<float32 (*)[4]>(pA);
  float32 (*B)[5] = reinterpret_cast<float32 (*)[5]>(pB);
  C[t1][t0] = 0.000000f;
  for (int c5 = 0; c5 <= 3; c5 += 1) {
    C[t1][t0] = (C[t1][t0] + (A[t1][c5]*B[c5][t0]));
  }
}
}

/*
Mapping Options:
tc::MappingOptions::makeNaiveMappingOptions()
    .outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
    .outerScheduleAllowSkewing(false)
    .outerSchedulePositiveOrthant(true)
    .intraTileScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
    .intraTileScheduleAllowSkewing(false)
    .intraTileSchedulePositiveOrthant(true)
    .tile(32, 32, 32)
    .mapToThreads(32, 8)
    .mapToBlocks(256, 256)
    .unroll(1)
    .tileImperfectlyNested(false)
    .useSharedMemory(false)
    .usePrivateMemory(false)
    .unrollCopyShared(false)
    .matchLibraryCalls(false);
TC version: 8e112e9dccda62c30ef29208a827e783b9a7f156
*/
[ERROR]: Caught Exception: Could not compile function
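The root cause ties all of these messages together: TC generates CUDA source and hands it to NVRTC with --gpu-architecture set for the attached device, but the V100's compute_70 target does not exist in the CUDA 8 NVRTC this stack was built against, hence NVRTC_ERROR_INVALID_OPTION ("invalid value for --gpu-architecture") and the final "Could not compile function". The fix is the one the first warning already suggests: install PyTorch and TC builds compiled against CUDA >= 9.0 (at the time, something like `conda install pytorch cuda90 -c pytorch`; exact package names varied by release). A small guard that fails fast with a clear message instead of an NVRTC dump, assuming only standard torch introspection:

    import torch

    compiled = tuple(int(x) for x in torch.version.cuda.split("."))
    capability = torch.cuda.get_device_capability(0)
    # Volta (7, 0) and newer require a CUDA 9+ toolchain
    if capability >= (7, 0) and compiled < (9, 0):
        raise RuntimeError(
            "GPU has compute capability %s but PyTorch was built with CUDA %s; "
            "install a CUDA >= 9.0 build" % (capability, torch.version.cuda)
        )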