tensorrt AUR build
damian101, Aug 27th, 2023

  1. [ 79%] Building CUDA object plugin/CMakeFiles/nvinfer_plugin.dir/embLayerNormPlugin/embLayerNormKernel.cu.o
  2. /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu(228): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  3.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  4.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  5.               threadData = pairSum(threadData, kvp<T>(rldval, rldval * val));
  6.                            ^
  7.           detected during:
  8.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247
  9.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253
  10.  
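
All of the errors in this log share one root cause, which the diagnostics themselves spell out: cub::Sum here dispatches to cuda::std::plus<void>::operator(), which simply evaluates a + b, and no matching operator+ for cub::KeyValuePair<float, float> is visible at the point of instantiation inside embLayerNormKernel.cu and the CUB reduction headers. The sketch below is only an illustrative reduction of the problem, not the TensorRT sources or their official patch; the kernel name, the 256-thread block size, and the choice to define operator+ inside namespace cub (so argument-dependent lookup can find it) are assumptions for demonstration.

// Minimal sketch (assumption, not TensorRT code): a block-wide reduction of
// cub::KeyValuePair<float, float> with cub::Sum. As the diagnostics above
// indicate, cub::Sum resolves to cuda::std::plus<>, so this only compiles if
// an operator+ for the pair type can be found; placing it in namespace cub
// makes it reachable via argument-dependent lookup. Removing that operator+
// reproduces the "no instance of function template" errors in this log.
#include <cub/cub.cuh>
#include <cstdio>

namespace cub
{
// Hypothetical operator+, modelled on the kind of helper the plugin needs.
__host__ __device__ __forceinline__ KeyValuePair<float, float> operator+(
    const KeyValuePair<float, float>& a, const KeyValuePair<float, float>& b)
{
    return KeyValuePair<float, float>(a.key + b.key, a.value + b.value);
}
} // namespace cub

__global__ void pairReduceKernel(float* out)
{
    using KV = cub::KeyValuePair<float, float>;
    using BlockReduce = cub::BlockReduce<KV, 256>;
    __shared__ typename BlockReduce::TempStorage tempStorage;

    // Each thread contributes (x, x*x), as a layer-norm style reduction would.
    float x = static_cast<float>(threadIdx.x);
    KV sum = BlockReduce(tempStorage).Reduce(KV(x, x * x), cub::Sum());

    if (threadIdx.x == 0)
    {
        out[0] = sum.key;   // sum of x over the block
        out[1] = sum.value; // sum of x*x over the block
    }
}

int main()
{
    float* out = nullptr;
    cudaMallocManaged(&out, 2 * sizeof(float));
    pairReduceKernel<<<1, 256>>>(out);
    cudaDeviceSynchronize();
    std::printf("sum=%f sumSq=%f\n", out[0], out[1]);
    cudaFree(out);
    return 0;
}
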
  11. /opt/cuda/include/cub/block/specializations/../../warp/specializations/warp_reduce_shfl.cuh(360): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  12.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  13.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  14.               output = reduction_op(input, temp);
  15.                        ^
  16.           detected during:
  17.             instantiation of "_T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>::ReduceStep(_T, ReductionOp, int, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, LOGICAL_WARP_THREADS=32, LEGACY_PTX_ARCH=0, _T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 388
  18.             instantiation of "_T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>::ReduceStep(_T, ReductionOp, int, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<0>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, LOGICAL_WARP_THREADS=32, LEGACY_PTX_ARCH=0, _T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 403
  19.             instantiation of "void cub::CUB_200101_700_720_750_800_860_870_890_900_NS::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>::ReduceStep(T &, ReductionOp, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<STEP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, LOGICAL_WARP_THREADS=32, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, STEP=0]" at line 449
  20.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>::ReduceImpl(cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<1>, T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, LOGICAL_WARP_THREADS=32, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 530
  21.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>::Reduce<ALL_LANES_VALID,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, LOGICAL_WARP_THREADS=32, LEGACY_PTX_ARCH=0, ALL_LANES_VALID=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 204 of /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh
  22.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  23.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  24.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  25.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  26.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  27.  
  28. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  29.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  30.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  31.               warp_aggregate = reduction_op(warp_aggregate, addend);
  32.                                ^
  33.           detected during:
  34.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=1]" at line 156
  35.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp>(ReductionOp, T, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 207
  36.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  37.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  38.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  39.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  40.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  41.  
  42. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  43.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  44.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  45.               warp_aggregate = reduction_op(warp_aggregate, addend);
  46.                                ^
  47.           detected during:
  48.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=2]" at line 121
  49.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=1]" at line 156
  50.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp>(ReductionOp, T, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 207
  51.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  52.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  53.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  54.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  55.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  56.  
  57. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  58.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  59.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  60.               warp_aggregate = reduction_op(warp_aggregate, addend);
  61.                                ^
  62.           detected during:
  63.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=3]" at line 121
  64.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=2]" at line 121
  65.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=1]" at line 156
  66.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp>(ReductionOp, T, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 207
  67.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  68.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  69.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  70.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  71.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  72.  
  73. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  74.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  75.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  76.               warp_aggregate = reduction_op(warp_aggregate, addend);
  77.                                ^
  78.           detected during:
  79.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=4]" at line 121
  80.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=3]" at line 121
  81.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=2]" at line 121
  82.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=1]" at line 156
  83.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp>(ReductionOp, T, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 207
  84.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  85.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  86.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  87.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  88.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  89.  
  90. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  91.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  92.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  93.               warp_aggregate = reduction_op(warp_aggregate, addend);
  94.                                ^
  95.           detected during:
  96.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=5]" at line 121
  97.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=4]" at line 121
  98.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=3]" at line 121
  99.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=2]" at line 121
  100.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=1]" at line 156
  101.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp>(ReductionOp, T, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 207
  102.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  103.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  104.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  105.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  106.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  107.  
  108. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  109.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  110.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  111.               warp_aggregate = reduction_op(warp_aggregate, addend);
  112.                                ^
  113.           detected during:
  114.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=6]" at line 121
  115.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=5]" at line 121
  116.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=4]" at line 121
  117.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=3]" at line 121
  118.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=2]" at line 121
  119.             [ 2 instantiation contexts not shown ]
  120.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  121.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  122.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  123.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  124.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  125.  
  126. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  127.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  128.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  129.               warp_aggregate = reduction_op(warp_aggregate, addend);
  130.                                ^
  131.           detected during:
  132.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=7]" at line 121
  133.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=6]" at line 121
  134.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=5]" at line 121
  135.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=4]" at line 121
  136.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=3]" at line 121
  137.             [ 3 instantiation contexts not shown ]
  138.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  139.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  140.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  141.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  142.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  143.  
  144. 9 errors detected in the compilation of "/home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu".
  145. make[2]: *** [plugin/CMakeFiles/nvinfer_plugin.dir/build.make:3075: plugin/CMakeFiles/nvinfer_plugin.dir/embLayerNormPlugin/embLayerNormKernel.cu.o] Error 1
  146. make[1]: *** [CMakeFiles/Makefile2:1040: plugin/CMakeFiles/nvinfer_plugin.dir/all] Error 2
  147. make[1]: *** Waiting for unfinished jobs....
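
The nvinfer_plugin_static target below recompiles the same embLayerNormKernel.cu and fails with the identical nine errors. A second, equally hedged sketch (an assumption, not the upstream fix): rather than making an operator+ visible to cub::Sum, the reduction can be handed an explicit functor that performs the pairwise addition itself, sidestepping cuda::std::plus<> entirely. KvpSum and the kernel name are hypothetical.

#include <cub/cub.cuh>
#include <cstdio>

// Hypothetical functor (not from the TensorRT sources): adds the key and
// value components directly, so no operator+ for KeyValuePair is required.
struct KvpSum
{
    __host__ __device__ __forceinline__ cub::KeyValuePair<float, float> operator()(
        const cub::KeyValuePair<float, float>& a,
        const cub::KeyValuePair<float, float>& b) const
    {
        return cub::KeyValuePair<float, float>(a.key + b.key, a.value + b.value);
    }
};

__global__ void pairReduceWithFunctor(float* out)
{
    using KV = cub::KeyValuePair<float, float>;
    using BlockReduce = cub::BlockReduce<KV, 256>;
    __shared__ typename BlockReduce::TempStorage tempStorage;

    float x = static_cast<float>(threadIdx.x);
    // Same block reduction as in the earlier sketch, with the explicit functor.
    KV sum = BlockReduce(tempStorage).Reduce(KV(x, x * x), KvpSum());

    if (threadIdx.x == 0)
    {
        out[0] = sum.key;
        out[1] = sum.value;
    }
}

int main()
{
    float* out = nullptr;
    cudaMallocManaged(&out, 2 * sizeof(float));
    pairReduceWithFunctor<<<1, 256>>>(out);
    cudaDeviceSynchronize();
    std::printf("sum=%f sumSq=%f\n", out[0], out[1]);
    cudaFree(out);
    return 0;
}

Both sketches only illustrate the failing pattern; whether the package is better served by patching the plugin headers or by building against a CUB/CUDA combination the TensorRT sources expect is left to the PKGBUILD.
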
  148. [ 79%] Building CUDA object plugin/CMakeFiles/nvinfer_plugin_static.dir/common/kernels/rproiInferenceFused.cu.o
  149. [ 79%] Building CUDA object plugin/CMakeFiles/nvinfer_plugin_static.dir/common/kernels/sortScoresPerClass.cu.o
  150. [ 80%] Building CUDA object plugin/CMakeFiles/nvinfer_plugin_static.dir/common/kernels/sortScoresPerImage.cu.o
  151. [ 80%] Building CUDA object plugin/CMakeFiles/nvinfer_plugin_static.dir/common/kernels/voxelGeneratorKernels.cu.o
  152. [ 80%] Building CUDA object plugin/CMakeFiles/nvinfer_plugin_static.dir/bertQKVToContextPlugin/qkvToContext.cu.o
  153. [ 80%] Building CUDA object plugin/CMakeFiles/nvinfer_plugin_static.dir/bertQKVToContextPlugin/zeroPadding2d.cu.o
  154. [ 80%] Building CUDA object plugin/CMakeFiles/nvinfer_plugin_static.dir/embLayerNormPlugin/embLayerNormKernel.cu.o
  155. /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu(228): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  156.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  157.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  158.               threadData = pairSum(threadData, kvp<T>(rldval, rldval * val));
  159.                            ^
  160.           detected during:
  161.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247
  162.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253
  163.  
  164. /opt/cuda/include/cub/block/specializations/../../warp/specializations/warp_reduce_shfl.cuh(360): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  165.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  166.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  167.               output = reduction_op(input, temp);
  168.                        ^
  169.           detected during:
  170.             instantiation of "_T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>::ReduceStep(_T, ReductionOp, int, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, LOGICAL_WARP_THREADS=32, LEGACY_PTX_ARCH=0, _T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 388
  171.             instantiation of "_T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>::ReduceStep(_T, ReductionOp, int, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<0>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, LOGICAL_WARP_THREADS=32, LEGACY_PTX_ARCH=0, _T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 403
  172.             instantiation of "void cub::CUB_200101_700_720_750_800_860_870_890_900_NS::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>::ReduceStep(T &, ReductionOp, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<STEP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, LOGICAL_WARP_THREADS=32, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, STEP=0]" at line 449
  173.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>::ReduceImpl(cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<1>, T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, LOGICAL_WARP_THREADS=32, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 530
  174.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>::Reduce<ALL_LANES_VALID,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, LOGICAL_WARP_THREADS=32, LEGACY_PTX_ARCH=0, ALL_LANES_VALID=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 204 of /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh
  175.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  176.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  177.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  178.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  179.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  180.  
  181. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  182.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  183.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  184.               warp_aggregate = reduction_op(warp_aggregate, addend);
  185.                                ^
  186.           detected during:
  187.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=1]" at line 156
  188.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp>(ReductionOp, T, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 207
  189.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  190.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  191.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  192.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  193.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  194.  
  195. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  196.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  197.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  198.               warp_aggregate = reduction_op(warp_aggregate, addend);
  199.                                ^
  200.           detected during:
  201.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=2]" at line 121
  202.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=1]" at line 156
  203.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp>(ReductionOp, T, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 207
  204.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  205.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  206.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  207.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  208.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  209.  
  210. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  211.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  212.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  213.               warp_aggregate = reduction_op(warp_aggregate, addend);
  214.                                ^
  215.           detected during:
  216.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=3]" at line 121
  217.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=2]" at line 121
  218.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=1]" at line 156
  219.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp>(ReductionOp, T, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 207
  220.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  221.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  222.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  223.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  224.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  225.  
  226. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  227.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  228.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  229.               warp_aggregate = reduction_op(warp_aggregate, addend);
  230.                                ^
  231.           detected during:
  232.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=4]" at line 121
  233.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=3]" at line 121
  234.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=2]" at line 121
  235.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=1]" at line 156
  236.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp>(ReductionOp, T, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 207
  237.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  238.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  239.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  240.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  241.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  242.  
  243. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  244.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  245.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  246.               warp_aggregate = reduction_op(warp_aggregate, addend);
  247.                                ^
  248.           detected during:
  249.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=5]" at line 121
  250.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=4]" at line 121
  251.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=3]" at line 121
  252.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=2]" at line 121
  253.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=1]" at line 156
  254.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp>(ReductionOp, T, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 207
  255.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  256.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  257.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  258.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  259.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  260.  
  261. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  262.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  263.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  264.               warp_aggregate = reduction_op(warp_aggregate, addend);
  265.                                ^
  266.           detected during:
  267.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=6]" at line 121
  268.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=5]" at line 121
  269.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=4]" at line 121
  270.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=3]" at line 121
  271.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=2]" at line 121
  272.             [ 2 instantiation contexts not shown ]
  273.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  274.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  275.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  276.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  277.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  278.  
  279. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  280.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  281.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  282.               warp_aggregate = reduction_op(warp_aggregate, addend);
  283.                                ^
  284.           detected during:
  285.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=7]" at line 121
  286.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=6]" at line 121
  287.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=5]" at line 121
  288.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=4]" at line 121
  289.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=3]" at line 121
  290.             [ 3 instantiation contexts not shown ]
  291.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  292.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  293.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  294.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  295.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  296.  
  297. 9 errors detected in the compilation of "/home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu".
  298. make[2]: *** [plugin/CMakeFiles/nvinfer_plugin_static.dir/build.make:3075: plugin/CMakeFiles/nvinfer_plugin_static.dir/embLayerNormPlugin/embLayerNormKernel.cu.o] Error 1
  299. make[2]: *** Waiting for unfinished jobs....
  300. make[1]: *** [CMakeFiles/Makefile2:1066: plugin/CMakeFiles/nvinfer_plugin_static.dir/all] Error 2
  301. make: *** [Makefile:156: all] Error 2
  302. ==> ERROR: A failure occurred in build().
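
For context, all nine reported errors come from the same construct: the layerNorm helper reached through plugin/common/common.cuh reduces a cub::KeyValuePair<float, float> (running sum and sum of squares) across the thread block using cub::Sum, and the CUB shipped with this CUDA toolkit implements cub::Sum via cuda::std::plus<> (visible in the "object type is: cub::...::Sum" lines above), which fails to find a usable operator+ for that pair type. Below is a minimal sketch of the same block-wide reduction pattern, written with an explicit pair-adding functor so no operator+ lookup is involved; AddPairs, blockPairReduceKernel and TPB are illustrative names, not identifiers from the TensorRT sources, and this is only an illustration of the failing pattern, not the project's fix.

#include <cub/cub.cuh>

using KvpF = cub::KeyValuePair<float, float>;

// Explicit reduction functor: adds the pairs component-wise, so no
// operator+ overload for cub::KeyValuePair needs to be visible to CUB.
struct AddPairs
{
    __host__ __device__ KvpF operator()(const KvpF& a, const KvpF& b) const
    {
        return KvpF(a.key + b.key, a.value + b.value);
    }
};

template <int TPB>
__global__ void blockPairReduceKernel(const float* in, KvpF* out)
{
    using BlockReduce = cub::BlockReduce<KvpF, TPB>;
    __shared__ typename BlockReduce::TempStorage tmp;

    const float v = in[blockIdx.x * TPB + threadIdx.x];
    KvpF threadData(v, v * v); // running sum and sum of squares, as in layerNorm

    // The Reduce() calls that fail in the log above are the equivalent of this
    // one, but with cub::Sum (i.e. cuda::std::plus<>) as the reduction operator.
    const KvpF blockSum = BlockReduce(tmp).Reduce(threadData, AddPairs());

    if (threadIdx.x == 0)
    {
        out[blockIdx.x] = blockSum;
    }
}

Passing the functor explicitly keeps the reduction self-contained; the TensorRT kernel instead relies on cub::Sum together with an operator+ for the pair type being visible, which is exactly the resolution the newer CUB no longer performs in the errors above.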