tensorrt AUR build
damian101, Aug 27th, 2023

  1. [ 79%] Building CUDA object plugin/CMakeFiles/nvinfer_plugin.dir/embLayerNormPlugin/embLayerNormKernel.cu.o
  2. /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu(228): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  3.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  4.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  5.               threadData = pairSum(threadData, kvp<T>(rldval, rldval * val));
  6.                            ^
  7.           detected during:
  8.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247
  9.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253
  10.  
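
All of the errors in this log share one root cause, which the diagnostics themselves spell out: cub::Sum here dispatches to cuda::std::plus<void>::operator(), which simply evaluates a + b, and no matching operator+ for cub::KeyValuePair<float, float> is visible at the point of instantiation inside embLayerNormKernel.cu and the CUB reduction headers. The sketch below is only an illustrative reduction of the problem, not the TensorRT sources or their official patch; the kernel name, the 256-thread block size, and the choice to define operator+ inside namespace cub (so argument-dependent lookup can find it) are assumptions for demonstration.

// Minimal sketch (assumption, not TensorRT code): a block-wide reduction of
// cub::KeyValuePair<float, float> with cub::Sum. As the diagnostics above
// indicate, cub::Sum resolves to cuda::std::plus<>, so this only compiles if
// an operator+ for the pair type can be found; placing it in namespace cub
// makes it reachable via argument-dependent lookup. Removing that operator+
// reproduces the "no instance of function template" errors in this log.
#include <cub/cub.cuh>
#include <cstdio>

namespace cub
{
// Hypothetical operator+, modelled on the kind of helper the plugin needs.
__host__ __device__ __forceinline__ KeyValuePair<float, float> operator+(
    const KeyValuePair<float, float>& a, const KeyValuePair<float, float>& b)
{
    return KeyValuePair<float, float>(a.key + b.key, a.value + b.value);
}
} // namespace cub

__global__ void pairReduceKernel(float* out)
{
    using KV = cub::KeyValuePair<float, float>;
    using BlockReduce = cub::BlockReduce<KV, 256>;
    __shared__ typename BlockReduce::TempStorage tempStorage;

    // Each thread contributes (x, x*x), as a layer-norm style reduction would.
    float x = static_cast<float>(threadIdx.x);
    KV sum = BlockReduce(tempStorage).Reduce(KV(x, x * x), cub::Sum());

    if (threadIdx.x == 0)
    {
        out[0] = sum.key;   // sum of x over the block
        out[1] = sum.value; // sum of x*x over the block
    }
}

int main()
{
    float* out = nullptr;
    cudaMallocManaged(&out, 2 * sizeof(float));
    pairReduceKernel<<<1, 256>>>(out);
    cudaDeviceSynchronize();
    std::printf("sum=%f sumSq=%f\n", out[0], out[1]);
    cudaFree(out);
    return 0;
}
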
  11. /opt/cuda/include/cub/block/specializations/../../warp/specializations/warp_reduce_shfl.cuh(360): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  12.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  13.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  14.               output = reduction_op(input, temp);
  15.                        ^
  16.           detected during:
  17.             instantiation of "_T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>::ReduceStep(_T, ReductionOp, int, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, LOGICAL_WARP_THREADS=32, LEGACY_PTX_ARCH=0, _T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 388
  18.             instantiation of "_T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>::ReduceStep(_T, ReductionOp, int, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<0>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, LOGICAL_WARP_THREADS=32, LEGACY_PTX_ARCH=0, _T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 403
  19.             instantiation of "void cub::CUB_200101_700_720_750_800_860_870_890_900_NS::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>::ReduceStep(T &, ReductionOp, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<STEP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, LOGICAL_WARP_THREADS=32, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, STEP=0]" at line 449
  20.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>::ReduceImpl(cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<1>, T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, LOGICAL_WARP_THREADS=32, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 530
  21.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>::Reduce<ALL_LANES_VALID,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, LOGICAL_WARP_THREADS=32, LEGACY_PTX_ARCH=0, ALL_LANES_VALID=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 204 of /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh
  22.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  23.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  24.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  25.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  26.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  27.  
  28. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  29.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  30.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  31.               warp_aggregate = reduction_op(warp_aggregate, addend);
  32.                                ^
  33.           detected during:
  34.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=1]" at line 156
  35.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp>(ReductionOp, T, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 207
  36.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  37.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  38.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  39.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  40.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  41.  
  42. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  43.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  44.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  45.               warp_aggregate = reduction_op(warp_aggregate, addend);
  46.                                ^
  47.           detected during:
  48.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=2]" at line 121
  49.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=1]" at line 156
  50.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp>(ReductionOp, T, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 207
  51.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  52.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  53.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  54.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  55.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  56.  
  57. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  58.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  59.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  60.               warp_aggregate = reduction_op(warp_aggregate, addend);
  61.                                ^
  62.           detected during:
  63.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=3]" at line 121
  64.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=2]" at line 121
  65.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=1]" at line 156
  66.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp>(ReductionOp, T, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 207
  67.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  68.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  69.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  70.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  71.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  72.  
  73. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  74.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  75.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  76.               warp_aggregate = reduction_op(warp_aggregate, addend);
  77.                                ^
  78.           detected during:
  79.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=4]" at line 121
  80.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=3]" at line 121
  81.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=2]" at line 121
  82.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=1]" at line 156
  83.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp>(ReductionOp, T, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 207
  84.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  85.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  86.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  87.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  88.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  89.  
  90. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  91.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  92.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  93.               warp_aggregate = reduction_op(warp_aggregate, addend);
  94.                                ^
  95.           detected during:
  96.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=5]" at line 121
  97.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=4]" at line 121
  98.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=3]" at line 121
  99.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=2]" at line 121
  100.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=1]" at line 156
  101.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp>(ReductionOp, T, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 207
  102.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  103.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  104.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  105.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  106.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  107.  
  108. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  109.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  110.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  111.               warp_aggregate = reduction_op(warp_aggregate, addend);
  112.                                ^
  113.           detected during:
  114.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=6]" at line 121
  115.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=5]" at line 121
  116.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=4]" at line 121
  117.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=3]" at line 121
  118.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=2]" at line 121
  119.             [ 2 instantiation contexts not shown ]
  120.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  121.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  122.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  123.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  124.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  125.  
  126. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  127.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  128.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  129.               warp_aggregate = reduction_op(warp_aggregate, addend);
  130.                                ^
  131.           detected during:
  132.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=7]" at line 121
  133.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=6]" at line 121
  134.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=5]" at line 121
  135.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=4]" at line 121
  136.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=3]" at line 121
  137.             [ 3 instantiation contexts not shown ]
  138.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  139.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  140.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  141.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  142.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  143.  
  144. 9 errors detected in the compilation of "/home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu".
  145. make[2]: *** [plugin/CMakeFiles/nvinfer_plugin.dir/build.make:3075: plugin/CMakeFiles/nvinfer_plugin.dir/embLayerNormPlugin/embLayerNormKernel.cu.o] Error 1
  146. make[1]: *** [CMakeFiles/Makefile2:1040: plugin/CMakeFiles/nvinfer_plugin.dir/all] Error 2
  147. make[1]: *** Waiting for unfinished jobs....
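
The nvinfer_plugin_static target below recompiles the same embLayerNormKernel.cu and fails with the identical nine errors. A second, equally hedged sketch (an assumption, not the upstream fix): rather than making an operator+ visible to cub::Sum, the reduction can be handed an explicit functor that performs the pairwise addition itself, sidestepping cuda::std::plus<> entirely. KvpSum and the kernel name are hypothetical.

#include <cub/cub.cuh>
#include <cstdio>

// Hypothetical functor (not from the TensorRT sources): adds the key and
// value components directly, so no operator+ for KeyValuePair is required.
struct KvpSum
{
    __host__ __device__ __forceinline__ cub::KeyValuePair<float, float> operator()(
        const cub::KeyValuePair<float, float>& a,
        const cub::KeyValuePair<float, float>& b) const
    {
        return cub::KeyValuePair<float, float>(a.key + b.key, a.value + b.value);
    }
};

__global__ void pairReduceWithFunctor(float* out)
{
    using KV = cub::KeyValuePair<float, float>;
    using BlockReduce = cub::BlockReduce<KV, 256>;
    __shared__ typename BlockReduce::TempStorage tempStorage;

    float x = static_cast<float>(threadIdx.x);
    // Same block reduction as in the earlier sketch, with the explicit functor.
    KV sum = BlockReduce(tempStorage).Reduce(KV(x, x * x), KvpSum());

    if (threadIdx.x == 0)
    {
        out[0] = sum.key;
        out[1] = sum.value;
    }
}

int main()
{
    float* out = nullptr;
    cudaMallocManaged(&out, 2 * sizeof(float));
    pairReduceWithFunctor<<<1, 256>>>(out);
    cudaDeviceSynchronize();
    std::printf("sum=%f sumSq=%f\n", out[0], out[1]);
    cudaFree(out);
    return 0;
}

Both sketches only illustrate the failing pattern; whether the package is better served by patching the plugin headers or by building against a CUB/CUDA combination the TensorRT sources expect is left to the PKGBUILD.
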
  148. [ 79%] Building CUDA object plugin/CMakeFiles/nvinfer_plugin_static.dir/common/kernels/rproiInferenceFused.cu.o
  149. [ 79%] Building CUDA object plugin/CMakeFiles/nvinfer_plugin_static.dir/common/kernels/sortScoresPerClass.cu.o
  150. [ 80%] Building CUDA object plugin/CMakeFiles/nvinfer_plugin_static.dir/common/kernels/sortScoresPerImage.cu.o
  151. [ 80%] Building CUDA object plugin/CMakeFiles/nvinfer_plugin_static.dir/common/kernels/voxelGeneratorKernels.cu.o
  152. [ 80%] Building CUDA object plugin/CMakeFiles/nvinfer_plugin_static.dir/bertQKVToContextPlugin/qkvToContext.cu.o
  153. [ 80%] Building CUDA object plugin/CMakeFiles/nvinfer_plugin_static.dir/bertQKVToContextPlugin/zeroPadding2d.cu.o
  154. [ 80%] Building CUDA object plugin/CMakeFiles/nvinfer_plugin_static.dir/embLayerNormPlugin/embLayerNormKernel.cu.o
  155. /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu(228): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  156.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  157.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  158.               threadData = pairSum(threadData, kvp<T>(rldval, rldval * val));
  159.                            ^
  160.           detected during:
  161.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247
  162.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253
  163.  
  164. /opt/cuda/include/cub/block/specializations/../../warp/specializations/warp_reduce_shfl.cuh(360): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  165.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  166.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  167.               output = reduction_op(input, temp);
  168.                        ^
  169.           detected during:
  170.             instantiation of "_T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>::ReduceStep(_T, ReductionOp, int, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, LOGICAL_WARP_THREADS=32, LEGACY_PTX_ARCH=0, _T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 388
  171.             instantiation of "_T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>::ReduceStep(_T, ReductionOp, int, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<0>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, LOGICAL_WARP_THREADS=32, LEGACY_PTX_ARCH=0, _T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 403
  172.             instantiation of "void cub::CUB_200101_700_720_750_800_860_870_890_900_NS::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>::ReduceStep(T &, ReductionOp, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<STEP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, LOGICAL_WARP_THREADS=32, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, STEP=0]" at line 449
  173.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>::ReduceImpl(cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<1>, T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, LOGICAL_WARP_THREADS=32, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 530
  174.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::WarpReduceShfl<T, LOGICAL_WARP_THREADS, LEGACY_PTX_ARCH>::Reduce<ALL_LANES_VALID,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, LOGICAL_WARP_THREADS=32, LEGACY_PTX_ARCH=0, ALL_LANES_VALID=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 204 of /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh
  175.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  176.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  177.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  178.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  179.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  180.  
  181. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  182.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  183.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  184.               warp_aggregate = reduction_op(warp_aggregate, addend);
  185.                                ^
  186.           detected during:
  187.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=1]" at line 156
  188.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp>(ReductionOp, T, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 207
  189.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  190.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  191.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  192.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  193.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  194.  
  195. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  196.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  197.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  198.               warp_aggregate = reduction_op(warp_aggregate, addend);
  199.                                ^
  200.           detected during:
  201.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=2]" at line 121
  202.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=1]" at line 156
  203.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp>(ReductionOp, T, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 207
  204.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  205.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  206.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  207.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  208.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  209.  
  210. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  211.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  212.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  213.               warp_aggregate = reduction_op(warp_aggregate, addend);
  214.                                ^
  215.           detected during:
  216.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=3]" at line 121
  217.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=2]" at line 121
  218.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=1]" at line 156
  219.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp>(ReductionOp, T, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 207
  220.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  221.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  222.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  223.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  224.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  225.  
  226. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  227.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  228.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  229.               warp_aggregate = reduction_op(warp_aggregate, addend);
  230.                                ^
  231.           detected during:
  232.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=4]" at line 121
  233.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=3]" at line 121
  234.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=2]" at line 121
  235.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=1]" at line 156
  236.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp>(ReductionOp, T, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 207
  237.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  238.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  239.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  240.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  241.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  242.  
  243. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  244.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  245.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  246.               warp_aggregate = reduction_op(warp_aggregate, addend);
  247.                                ^
  248.           detected during:
  249.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=5]" at line 121
  250.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=4]" at line 121
  251.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=3]" at line 121
  252.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=2]" at line 121
  253.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=1]" at line 156
  254.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp>(ReductionOp, T, int) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 207
  255.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  256.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  257.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  258.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  259.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  260.  
  261. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  262.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  263.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  264.               warp_aggregate = reduction_op(warp_aggregate, addend);
  265.                                ^
  266.           detected during:
  267.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=6]" at line 121
  268.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=5]" at line 121
  269.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=4]" at line 121
  270.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=3]" at line 121
  271.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=2]" at line 121
  272.             [ 2 instantiation contexts not shown ]
  273.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  274.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  275.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  276.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  277.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  278.  
  279. /opt/cuda/include/cub/block/specializations/block_reduce_warp_reductions.cuh(119): error: no instance of function template "cuda::std::__4::plus<void>::operator()" matches the argument list
  280.             argument types are: (cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>)
  281.             object type is: cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum
  282.               warp_aggregate = reduction_op(warp_aggregate, addend);
  283.                                ^
  284.           detected during:
  285.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=7]" at line 121
  286.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=6]" at line 121
  287.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=5]" at line 121
  288.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=4]" at line 121
  289.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::ApplyWarpAggregates<FULL_TILE,ReductionOp,SUCCESSOR_WARP>(ReductionOp, T, int, cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Int2Type<SUCCESSOR_WARP>) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum, SUCCESSOR_WARP=3]" at line 121
  290.             [ 3 instantiation contexts not shown ]
  291.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce<FULL_TILE,ReductionOp>(T, int, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, FULL_TILE=true, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 354 of /opt/cuda/include/cub/block/block_reduce.cuh
  292.             instantiation of "T cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, LEGACY_PTX_ARCH>::Reduce(T, ReductionOp) [with T=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::KeyValuePair<float, float>, BLOCK_DIM_X=256, ALGORITHM=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y=1, BLOCK_DIM_Z=1, LEGACY_PTX_ARCH=0, ReductionOp=cub::CUB_200101_700_720_750_800_860_870_890_900_NS::Sum]" at line 257 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/common/common.cuh
  293.             instantiation of "void layerNorm<T,R,P,TPB>(const kvp<R> &, int32_t, int32_t, const P *, const P *, T *) [with T=float, R=float, P=float, TPB=256]" at line 233 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  294.             instantiation of "void nvinfer1::plugin::bert::embLayerNormKernel<T,TPB>(int, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float, TPB=256U]" at line 247 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  295.             instantiation of "int32_t nvinfer1::plugin::bert::embSkipLayerNorm(cudaStream_t, int32_t, int32_t, int32_t, const int32_t *, const int32_t *, const float *, const float *, const T *, const T *, const T *, int32_t, int32_t, T *) [with T=float]" at line 253 of /home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu
  296.  
  297. 9 errors detected in the compilation of "/home/damian101/.cache/yay/tensorrt/src/TensorRT/plugin/embLayerNormPlugin/embLayerNormKernel.cu".
  298. make[2]: *** [plugin/CMakeFiles/nvinfer_plugin_static.dir/build.make:3075: plugin/CMakeFiles/nvinfer_plugin_static.dir/embLayerNormPlugin/embLayerNormKernel.cu.o] Error 1
  299. make[2]: *** Waiting for unfinished jobs....
  300. make[1]: *** [CMakeFiles/Makefile2:1066: plugin/CMakeFiles/nvinfer_plugin_static.dir/all] Error 2
  301. make: *** [Makefile:156: all] Error 2
  302. ==> ERROR: A failure occurred in build().
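
For context, all nine reported errors come from the same construct: the layerNorm helper reached through plugin/common/common.cuh reduces a cub::KeyValuePair<float, float> (running sum and sum of squares) across the thread block using cub::Sum, and the CUB shipped with this CUDA toolkit implements cub::Sum via cuda::std::plus<> (visible in the "object type is: cub::...::Sum" lines above), which fails to find a usable operator+ for that pair type. Below is a minimal sketch of the same block-wide reduction pattern, written with an explicit pair-adding functor so no operator+ lookup is involved; AddPairs, blockPairReduceKernel and TPB are illustrative names, not identifiers from the TensorRT sources, and this is only an illustration of the failing pattern, not the project's fix.

#include <cub/cub.cuh>

using KvpF = cub::KeyValuePair<float, float>;

// Explicit reduction functor: adds the pairs component-wise, so no
// operator+ overload for cub::KeyValuePair needs to be visible to CUB.
struct AddPairs
{
    __host__ __device__ KvpF operator()(const KvpF& a, const KvpF& b) const
    {
        return KvpF(a.key + b.key, a.value + b.value);
    }
};

template <int TPB>
__global__ void blockPairReduceKernel(const float* in, KvpF* out)
{
    using BlockReduce = cub::BlockReduce<KvpF, TPB>;
    __shared__ typename BlockReduce::TempStorage tmp;

    const float v = in[blockIdx.x * TPB + threadIdx.x];
    KvpF threadData(v, v * v); // running sum and sum of squares, as in layerNorm

    // The Reduce() calls that fail in the log above are the equivalent of this
    // one, but with cub::Sum (i.e. cuda::std::plus<>) as the reduction operator.
    const KvpF blockSum = BlockReduce(tmp).Reduce(threadData, AddPairs());

    if (threadIdx.x == 0)
    {
        out[blockIdx.x] = blockSum;
    }
}

Passing the functor explicitly keeps the reduction self-contained; the TensorRT kernel instead relies on cub::Sum together with an operator+ for the pair type being visible, which is exactly the resolution the newer CUB no longer performs in the errors above.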