Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- diff --git a/lib/DllAvUtil.h b/lib/DllAvUtil.h
- index e882cac..7afc9af 100644
- --- a/lib/DllAvUtil.h
- +++ b/lib/DllAvUtil.h
- @@ -96,6 +96,7 @@ public:
- virtual int av_fifo_size(AVFifoBuffer *f) = 0;
- virtual int av_fifo_generic_read(AVFifoBuffer *f, void *dest, int buf_size, void (*func)(void*, void*, int)) = 0;
- virtual int av_fifo_generic_write(AVFifoBuffer *f, void *src, int size, int (*func)(void*, void*, int)) = 0;
- + virtual int av_reduce(int *dst_num, int *dst_den, int64_t num, int64_t den, int64_t max) = 0;
- virtual char *av_strdup(const char *s)=0;
- };
- @@ -167,6 +168,7 @@ class DllAvUtilBase : public DllDynamic, DllAvUtilInterface
- DEFINE_METHOD1(int, av_fifo_size, (AVFifoBuffer *p1))
- DEFINE_METHOD4(int, av_fifo_generic_read, (AVFifoBuffer *p1, void *p2, int p3, void (*p4)(void*, void*, int)))
- DEFINE_METHOD4(int, av_fifo_generic_write, (AVFifoBuffer *p1, void *p2, int p3, int (*p4)(void*, void*, int)))
- + DEFINE_METHOD5(int, av_reduce, (int *p1, int *p2, int64_t p3, int64_t p4, int64_t p5))
- DEFINE_METHOD1(char*, av_strdup, (const char *p1))
- public:
- @@ -188,6 +190,7 @@ class DllAvUtilBase : public DllDynamic, DllAvUtilInterface
- RESOLVE_METHOD(av_fifo_size)
- RESOLVE_METHOD(av_fifo_generic_read)
- RESOLVE_METHOD(av_fifo_generic_write)
- + RESOLVE_METHOD(av_reduce)
- RESOLVE_METHOD(av_strdup)
- END_METHOD_RESOLVE()
- };
- diff --git a/project/VS2010Express/XBMC.vcxproj b/project/VS2010Express/XBMC.vcxproj
- index 24f3ea6..e269339 100644
- --- a/project/VS2010Express/XBMC.vcxproj
- +++ b/project/VS2010Express/XBMC.vcxproj
- @@ -302,6 +302,13 @@
- <ClCompile Include="..\..\xbmc\AutoSwitch.cpp" />
- <ClCompile Include="..\..\xbmc\BackgroundInfoLoader.cpp" />
- <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\CrystalHD.cpp" />
- + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\CUDA.cpp" />
- + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\AVC1AnnexBConverter.cpp" />
- + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\ByteParser.cpp" />
- + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\H264Nalu.cpp" />
- + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\H264SequenceParser.cpp" />
- + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\MPEG2HeaderParser.cpp" />
- + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\VC1HeaderParser.cpp" />
- <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDInputStreams\DVDInputStreamBluray.cpp" />
- <ClCompile Include="..\..\xbmc\cores\VideoRenderers\RenderCapture.cpp" />
- <ClCompile Include="..\..\xbmc\cores\VideoRenderers\VideoShaders\WinVideoFilter.cpp" />
- @@ -1196,6 +1203,17 @@
- <ClInclude Include="..\..\xbmc\AutoSwitch.h" />
- <ClInclude Include="..\..\xbmc\BackgroundInfoLoader.h" />
- <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\CrystalHD.h" />
- + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\CUDA.h" />
- + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\AVC1AnnexBConverter.h" />
- + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\ByteParser.h" />
- + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\cuda.h" />
- + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\cuda_dynlink.h" />
- + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\cuviddec.h" />
- + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\H264Nalu.h" />
- + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\H264SequenceParser.h" />
- + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\MPEG2HeaderParser.h" />
- + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\nvcuvid.h" />
- + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\VC1HeaderParser.h" />
- <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDInputStreams\DVDInputStreamBluray.h" />
- <ClInclude Include="..\..\xbmc\cores\VideoRenderers\RenderCapture.h" />
- <ClInclude Include="..\..\xbmc\cores\VideoRenderers\VideoShaders\WinVideoFilter.h" />
- diff --git a/project/VS2010Express/XBMC.vcxproj.filters b/project/VS2010Express/XBMC.vcxproj.filters
- index 3a37750..770e509 100644
- --- a/project/VS2010Express/XBMC.vcxproj.filters
- +++ b/project/VS2010Express/XBMC.vcxproj.filters
- @@ -238,6 +238,9 @@
- <Filter Include="interfaces\info">
- <UniqueIdentifier>{cea579fc-bdd7-499e-a6a6-07d681d1ab24}</UniqueIdentifier>
- </Filter>
- + <Filter Include="cores\dvdplayer\DVDCodecs\Video\Cuda">
- + <UniqueIdentifier>{2affa4cc-9f39-42d9-97cc-4f595a6c2aa9}</UniqueIdentifier>
- + </Filter>
- </ItemGroup>
- <ItemGroup>
- <ClCompile Include="..\..\xbmc\win32\pch.cpp">
- @@ -2493,6 +2496,27 @@
- <ClCompile Include="..\..\xbmc\guilib\GUIAction.cpp">
- <Filter>guilib</Filter>
- </ClCompile>
- + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\CUDA.cpp">
- + <Filter>cores\dvdplayer\DVDCodecs\Video</Filter>
- + </ClCompile>
- + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\AVC1AnnexBConverter.cpp">
- + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
- + </ClCompile>
- + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\VC1HeaderParser.cpp">
- + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
- + </ClCompile>
- + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\MPEG2HeaderParser.cpp">
- + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
- + </ClCompile>
- + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\H264SequenceParser.cpp">
- + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
- + </ClCompile>
- + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\H264Nalu.cpp">
- + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
- + </ClCompile>
- + <ClCompile Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\ByteParser.cpp">
- + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
- + </ClCompile>
- </ItemGroup>
- <ItemGroup>
- <ClInclude Include="..\..\xbmc\win32\pch.h">
- @@ -4976,8 +5000,8 @@
- <ClInclude Include="..\..\xbmc\threads\ThreadLocal.h">
- <Filter>threads</Filter>
- </ClInclude>
- - <ClInclude Include="..\..\xbmc\input\InertialScrollingHandler.h" >
- - <Filter>input</Filter>
- + <ClInclude Include="..\..\xbmc\input\InertialScrollingHandler.h">
- + <Filter>input</Filter>
- </ClInclude>
- <ClInclude Include="..\..\xbmc\threads\platform\Condition.h">
- <Filter>threads\platform</Filter>
- @@ -5006,6 +5030,39 @@
- <ClInclude Include="..\..\xbmc\guilib\GUIAction.h">
- <Filter>guilib</Filter>
- </ClInclude>
- + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\CUDA.h">
- + <Filter>cores\dvdplayer\DVDCodecs\Video</Filter>
- + </ClInclude>
- + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\nvcuvid.h">
- + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
- + </ClInclude>
- + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\cuviddec.h">
- + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
- + </ClInclude>
- + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\cuda_dynlink.h">
- + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
- + </ClInclude>
- + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\cuda.h">
- + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
- + </ClInclude>
- + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\AVC1AnnexBConverter.h">
- + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
- + </ClInclude>
- + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\MPEG2HeaderParser.h">
- + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
- + </ClInclude>
- + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\VC1HeaderParser.h">
- + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
- + </ClInclude>
- + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\H264SequenceParser.h">
- + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
- + </ClInclude>
- + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\H264Nalu.h">
- + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
- + </ClInclude>
- + <ClInclude Include="..\..\xbmc\cores\dvdplayer\DVDCodecs\Video\Cuda\ByteParser.h">
- + <Filter>cores\dvdplayer\DVDCodecs\Video\Cuda</Filter>
- + </ClInclude>
- </ItemGroup>
- <ItemGroup>
- <ResourceCompile Include="..\..\xbmc\win32\XBMC_PC.rc">
- diff --git a/xbmc/cores/dvdplayer/DVDCodecs/DVDFactoryCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/DVDFactoryCodec.cpp
- index 03f6dcc..50ac74c 100644
- --- a/xbmc/cores/dvdplayer/DVDCodecs/DVDFactoryCodec.cpp
- +++ b/xbmc/cores/dvdplayer/DVDCodecs/DVDFactoryCodec.cpp
- @@ -37,6 +37,9 @@
- #if defined(HAVE_LIBCRYSTALHD)
- #include "Video/DVDVideoCodecCrystalHD.h"
- #endif
- +#if defined(HAS_DX)
- +#include "Video/CUDA.h"
- +#endif
- #include "Audio/DVDAudioCodecFFmpeg.h"
- #include "Audio/DVDAudioCodecLibMad.h"
- #include "Audio/DVDAudioCodecPcm.h"
- @@ -236,7 +239,10 @@ CDVDVideoCodec* CDVDFactoryCodec::CreateVideoCodec( CDVDStreamInfo &hint )
- }
- }
- #endif
- -
- + //Cuda
- +#if defined(HAS_DX)
- + if( (pCodec = OpenCodec(new CUDA::CDVDVideoCodecCuda(), hint, options)) ) return pCodec;
- +#endif
- // try to decide if we want to try halfres decoding
- #if !defined(_LINUX) && !defined(_WIN32)
- float pixelrate = (float)hint.width*hint.height*hint.fpsrate/hint.fpsscale;
- diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodec.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodec.h
- index 25ebcd7..3d1e9c7 100644
- --- a/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodec.h
- +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodec.h
- @@ -32,6 +32,7 @@
- #define FRAME_TYPE_B 3
- #define FRAME_TYPE_D 4
- +namespace CUDA { class CCuda; }
- namespace DXVA { class CProcessor; }
- namespace VAAPI { struct CHolder; }
- class CVDPAU;
- @@ -55,6 +56,9 @@ struct DVDVideoPicture
- BYTE* data[4]; // [4] = alpha channel, currently not used
- int iLineSize[4]; // [4] = alpha channel, currently not used
- };
- + struct {
- + CUDA::CCuda* cuda;
- + };
- struct {
- DXVA::CProcessor* proc;
- int64_t proc_id;
- diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/CUDA.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/CUDA.cpp
- new file mode 100644
- index 0000000..cc9ec35
- --- /dev/null
- +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/CUDA.cpp
- @@ -0,0 +1,1256 @@
- +/*
- + * Copyright (C) 2005-2009 Team XBMC
- + * http://www.xbmc.org
- + *
- + * This Program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License as published by
- + * the Free Software Foundation; either version 2, or (at your option)
- + * any later version.
- + *
- + * This Program is distributed in the hope that it will be useful,
- + * but WITHOUT ANY WARRANTY; without even the implied warranty of
- + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- + * GNU General Public License for more details.
- + *
- + * You should have received a copy of the GNU General Public License
- + * along with XBMC; see the file COPYING. If not, write to
- + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
- + * http://www.gnu.org/copyleft/gpl.html
- + *
- + */
- +
- +#ifdef HAS_DX
- +
- +#if (defined HAVE_CONFIG_H) && (!defined WIN32)
- + #include "config.h"
- +#elif defined(_WIN32)
- +#include "system.h"
- +#endif
- +
- +// setting that here because otherwise SampleFormat is defined to AVSampleFormat
- +// which we don't use here
- +#define FF_API_OLD_SAMPLE_FMT 0
- +
- +#define RINT(x) ((x) >= 0 ? ((int)((x) + 0.5)) : ((int)((x) - 0.5)))
- +
- +#include <windows.h>
- +#include "CUDA.h"
- +#include "../../../../windowing/WindowingFactory.h"
- +#include "DVDStreamInfo.h"
- +#include "Cuda/MPEG2HeaderParser.h"
- +#include "Cuda/H264SequenceParser.h"
- +#include "Cuda/VC1HeaderParser.h"
- +#include "utils/SystemInfo.h"
- +#include "DllAvCodec.h"
- +
- +using namespace CUDA;
- +
- +// Translation table from FFmpeg codec ids to the CUVID codec enum. Open()
- +// walks it to map hints.codec and rejects any codec id that has no row.
- +// NOTE(review): Open() also picks a format name for CODEC_ID_WMV3, but there
- +// is no WMV3 row here, so that codec id fails the lookup - confirm intent.
- +static struct {
- + CodecID ffcodec;
- + cudaVideoCodec cudaCodec;
- +} cuda_codecs[] = {
- + { CODEC_ID_MPEG1VIDEO, cudaVideoCodec_MPEG1 },
- + { CODEC_ID_MPEG2VIDEO, cudaVideoCodec_MPEG2 },
- + { CODEC_ID_VC1, cudaVideoCodec_VC1 },
- + { CODEC_ID_H264, cudaVideoCodec_H264 },
- + { CODEC_ID_MPEG4, cudaVideoCodec_MPEG4 },
- +};
- +
- +////////////////////////////////////////////////////////////////////////////////
- +// Compatibility tables
- +////////////////////////////////////////////////////////////////////////////////
- +
- +#define LEVEL_C_LOW_LIMIT 0x0A20
- +
- +// Device ids at or above LEVEL_C_LOW_LIMIT that IsLevelC() nevertheless
- +// reports as NOT Level C; the per-entry notes give the reason for each id.
- +static DWORD LevelCBlacklist[] = {
- + 0x0A22, 0x0A67, // Geforce 315, no VDPAU at all
- + 0x0A68, 0x0A69, // Geforce G105M, only B
- + 0x0CA0, 0x0CA7, // Geforce GT 330, only A
- + 0x0CAC, // Geforce GT 220, no VDPAU
- + 0x10C3 // Geforce 8400GS, only A
- +};
- +
- +// Device ids below LEVEL_C_LOW_LIMIT that IsLevelC() nevertheless reports as
- +// Level C; every other id below the limit is treated as not Level C.
- +static DWORD LevelCWhitelist[] = {
- + 0x06C0, // Geforce GTX 480
- + 0x06C4, // Geforce GTX 465
- + 0x06CA, // Geforce GTX 480M
- + 0x06CD, // Geforce GTX 470
- + 0x08A5, // Geforce 320M
- +
- + 0x06D8, 0x06DC, // Quadro 6000
- + 0x06D9, // Quadro 5000
- + 0x06DA, // Quadro 5000M
- + 0x06DD, // Quadro 4000
- +
- + 0x06D1, // Tesla C2050 / C2070
- + 0x06D2, // Tesla M2070
- + 0x06DE, // Tesla T20 Processor
- + 0x06DF, // Tesla M2070-Q
- +};
- +
- +// Reports whether the GPU with the given device id is treated as "Level C".
- +//
- +// Ids at or above LEVEL_C_LOW_LIMIT are Level C unless listed in
- +// LevelCBlacklist; ids below the limit are Level C only when listed in
- +// LevelCWhitelist.
- +//
- +// deviceId: adapter device id (Init() passes D3DADAPTER_IDENTIFIER9::DeviceId).
- +// Returns TRUE for Level C devices, FALSE otherwise.
- +static BOOL IsLevelC(DWORD deviceId)
- +{
- +  // sizeof(array) is a byte count; divide by the element size so the scans
- +  // stop at the last entry instead of reading past the end of the tables.
- +  const size_t blacklistCount = sizeof(LevelCBlacklist) / sizeof(LevelCBlacklist[0]);
- +  const size_t whitelistCount = sizeof(LevelCWhitelist) / sizeof(LevelCWhitelist[0]);
- +
- +  if (deviceId >= LEVEL_C_LOW_LIMIT) {
- +    for (size_t idx = 0; idx < blacklistCount; idx++) {
- +      if (LevelCBlacklist[idx] == deviceId)
- +        return FALSE;
- +    }
- +    return TRUE;
- +  }
- +
- +  for (size_t idx = 0; idx < whitelistCount; idx++) {
- +    if (LevelCWhitelist[idx] == deviceId)
- +      return TRUE;
- +  }
- +  return FALSE;
- +}
- +
- +
- +// Constructs an idle decoder: clears the CUDA function table and format
- +// structures, resets every handle/flag to its "nothing allocated" value and
- +// loads the avutil wrapper. No CUDA or D3D resource is acquired here; that
- +// happens in Init()/Open().
- +CDVDVideoCodecCuda::CDVDVideoCodecCuda() : CDVDVideoCodec()
- +{
- +  ZeroMemory(&cuda, sizeof(cuda));
- +  ZeroMemory(&m_VideoFormat, sizeof(m_VideoFormat));
- +  ZeroMemory(&m_DXVAExtendedFormat, sizeof(m_DXVAExtendedFormat));
- +
- +  // Deinterlacing options
- +  m_AccelDeintOutput = 0;
- +  m_DeintTreatAsProgressive = 0;
- +  m_DeintAggressive = 0;
- +
- +  // Device / context state
- +  m_bVDPAULevelC = FALSE;
- +  // Init() fills m_pD3D; start from NULL so it never holds an indeterminate
- +  // value if Init() bails out before reaching GetDirect3D().
- +  m_pD3D = NULL;
- +  m_cudaContext = 0;
- +  m_cudaCtxLock = 0;
- +
- +  // Decoder objects
- +  m_hParser = 0;
- +  m_hDecoder = 0;
- +  m_hStream = 0;
- +
- +  // Stream state
- +  m_bForceSequenceUpdate = FALSE;
- +  m_bInterlaced = FALSE;
- +  m_bFlushing = FALSE;
- +
- +  // Host-side output buffer
- +  m_pbRawNV12 = NULL;
- +  m_cRawNV12 = 0;
- +
- +  m_AVC1Converter = NULL;
- +  m_dllAvUtil.Load();
- +}
- +
- +// Destructor: all cleanup is delegated to Dispose().
- +CDVDVideoCodecCuda::~CDVDVideoCodecCuda()
- +{
- + Dispose();
- +}
- +
- +// Releases decoder resources.
- +//
- +// Always destroys the AVC1 converter, the CUVID decoder and parser, the CUDA
- +// stream and the host NV12 buffer. When bFull is true it additionally
- +// destroys the context lock and CUDA context and unloads nvcuda.dll and
- +// nvcuvid.dll.
- +//
- +// Every handle is reset after release, so the function may be called
- +// repeatedly (including a second full teardown) without double-freeing.
- +// Always returns true.
- +bool CDVDVideoCodecCuda::DestroyDecoder(bool bFull)
- +{
- +  SAFE_DELETE(m_AVC1Converter);
- +
- +  if (m_hDecoder) {
- +    cuda.cuvidDestroyDecoder(m_hDecoder);
- +    m_hDecoder = 0;
- +  }
- +
- +  if (m_hParser) {
- +    cuda.cuvidDestroyVideoParser(m_hParser);
- +    m_hParser = 0;
- +  }
- +
- +  if (m_hStream) {
- +    cuda.cuStreamDestroy(m_hStream);
- +    m_hStream = 0;
- +  }
- +
- +  if (m_pbRawNV12) {
- +    cuda.cuMemFreeHost(m_pbRawNV12);
- +    m_pbRawNV12 = NULL;
- +    m_cRawNV12 = 0;
- +  }
- +
- +  if (bFull) {
- +    if (m_cudaCtxLock) {
- +      cuda.cuvidCtxLockDestroy(m_cudaCtxLock);
- +      m_cudaCtxLock = 0;
- +    }
- +
- +    if (m_cudaContext) {
- +      cuda.cuCtxDestroy(m_cudaContext);
- +      m_cudaContext = 0;
- +    }
- +
- +    // Only unload what was actually loaded, and forget the handle afterwards
- +    // so a later full teardown does not call FreeLibrary on a stale module.
- +    if (cuda.cudaLib) {
- +      FreeLibrary(cuda.cudaLib);
- +      cuda.cudaLib = NULL;
- +    }
- +    if (cuda.cuvidLib) {
- +      FreeLibrary(cuda.cuvidLib);
- +      cuda.cuvidLib = NULL;
- +    }
- +  }
- +
- +  return true;
- +}
- +
- +#define GET_PROC_EX(name, lib) \
- + cuda.name = (t##name *)GetProcAddress(lib, #name); \
- + if (cuda.name == NULL) { \
- + CLog::Log(LOGERROR,"Failed to load function \"%s\"", TEXT(#name)); \
- + return E_FAIL; \
- + }
- +
- +#define GET_PROC_CUDA(name) GET_PROC_EX(name, cuda.cudaLib)
- +#define GET_PROC_CUVID(name) GET_PROC_EX(name, cuda.cuvidLib)
- +
- +
- +// Loads nvcuda.dll and nvcuvid.dll and fills the `cuda` function table with
- +// the driver-API and CUVID entry points used by this decoder.
- +// Returns false (after logging) when either DLL cannot be loaded. The
- +// GET_PROC_* macros log and return early from this function when a symbol
- +// cannot be resolved. Returns true once every symbol has been resolved.
- +bool CDVDVideoCodecCuda::LoadCUDAFuncRefs()
- +{
- + // Load CUDA functions
- + cuda.cudaLib = LoadLibrary("nvcuda.dll");
- + if (cuda.cudaLib == NULL)
- + {
- + CLog::Log(LOGERROR,"Loading nvcuda.dll failed");
- + return false;
- + }
- +
- + // Driver API: init, context, host memory, copies, streams, device queries
- + GET_PROC_CUDA(cuInit);
- + GET_PROC_CUDA(cuCtxCreate);
- + GET_PROC_CUDA(cuCtxDestroy);
- + GET_PROC_CUDA(cuCtxPushCurrent);
- + GET_PROC_CUDA(cuCtxPopCurrent);
- + GET_PROC_CUDA(cuD3D9CtxCreate);
- + GET_PROC_CUDA(cuMemAllocHost);
- + GET_PROC_CUDA(cuMemFreeHost);
- + GET_PROC_CUDA(cuMemcpyDtoH);
- + GET_PROC_CUDA(cuMemcpyDtoHAsync);
- + GET_PROC_CUDA(cuStreamCreate);
- + GET_PROC_CUDA(cuStreamDestroy);
- + GET_PROC_CUDA(cuStreamQuery);
- + GET_PROC_CUDA(cuDeviceGetCount);
- + GET_PROC_CUDA(cuDriverGetVersion);
- + GET_PROC_CUDA(cuDeviceGetName);
- + GET_PROC_CUDA(cuDeviceComputeCapability);
- + GET_PROC_CUDA(cuDeviceGetAttribute);
- +
- + // Load CUVID function
- + cuda.cuvidLib = LoadLibrary("nvcuvid.dll");
- + if (cuda.cuvidLib == NULL)
- + {
- + CLog::Log(LOGERROR,"Loading nvcuvid.dll failed");
- + return false;
- + }
- +
- + // CUVID: context lock, parser, decoder, frame mapping
- + GET_PROC_CUVID(cuvidCtxLockCreate);
- + GET_PROC_CUVID(cuvidCtxLockDestroy);
- + GET_PROC_CUVID(cuvidCtxLock);
- + GET_PROC_CUVID(cuvidCtxUnlock);
- + GET_PROC_CUVID(cuvidCreateVideoParser);
- + GET_PROC_CUVID(cuvidParseVideoData);
- + GET_PROC_CUVID(cuvidDestroyVideoParser);
- + GET_PROC_CUVID(cuvidCreateDecoder);
- + GET_PROC_CUVID(cuvidDecodePicture);
- + GET_PROC_CUVID(cuvidDestroyDecoder);
- + GET_PROC_CUVID(cuvidMapVideoFrame);
- + GET_PROC_CUVID(cuvidUnmapVideoFrame);
- +
- + return true;
- +}
- +
- +// Beginning of GPU Architecture definitions
- +// Looks up the number of CUDA cores per multiprocessor for the compute
- +// capability major.minor. Returns -1, after printing a diagnostic to stdout,
- +// when the SM version is not in the table.
- +static int _ConvertSMVer2CoresDrvApi(int major, int minor)
- +{
- +  // Each row pairs an SM version encoded as 0xMm (hexadecimal; M = major,
- +  // m = minor) with the core count of one multiprocessor of that generation.
- +  struct SMCoreEntry {
- +    int smVersion;
- +    int coreCount;
- +  };
- +
- +  static const SMCoreEntry kCoresPerSM[] = {
- +    { 0x10, 8 },
- +    { 0x11, 8 },
- +    { 0x12, 8 },
- +    { 0x13, 8 },
- +    { 0x20, 32 },
- +    { 0x21, 48 },
- +    { 0x30, 192 }
- +  };
- +
- +  const int wantedSM = (major << 4) + minor;
- +  const int entryCount = (int)(sizeof(kCoresPerSM) / sizeof(kCoresPerSM[0]));
- +
- +  for (int i = 0; i < entryCount; ++i) {
- +    if (kCoresPerSM[i].smVersion == wantedSM)
- +      return kCoresPerSM[i].coreCount;
- +  }
- +
- +  printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
- +  return -1;
- +}
- +
- +// Chooses the CUDA device ordinal with the highest estimated throughput
- +// (multiprocessor count * cores per SM * clock rate), ignoring devices that
- +// run the TCC driver.
- +//
- +// Pass 1 finds the highest SM major version among non-TCC devices; pass 2
- +// picks the fastest device, restricted to that major version when it is > 2.
- +//
- +// Returns -1 when the driver reports no devices. Otherwise returns the
- +// selected ordinal, which stays 0 if no device qualified.
- +int CDVDVideoCodecCuda::GetMaxGflopsGraphicsDeviceId()
- +{
- +  CUdevice current_device = 0, max_perf_device = 0;
- +  int device_count = 0, sm_per_multiproc = 0;
- +  int max_compute_perf = 0, best_SM_arch = 0;
- +  int major = 0, minor = 0, multiProcessorCount = 0, clockRate = 0;
- +  int bTCC = 0, version = 0;
- +  char deviceName[256];
- +
- +  cuda.cuDeviceGetCount(&device_count);
- +  if (device_count <= 0)
- +    return -1;
- +
- +  cuda.cuDriverGetVersion(&version);
- +
- +  // Find the best major SM Architecture GPU device that are graphics devices
- +  while ( current_device < device_count ) {
- +    deviceName[0] = '\0';
- +    cuda.cuDeviceGetName(deviceName, 256, current_device);
- +    cuda.cuDeviceComputeCapability(&major, &minor, current_device);
- +
- +    // Start from "not TCC" for every device so a previous device's result
- +    // cannot leak into this one.
- +    bTCC = 0;
- +    if (version >= 3020) {
- +      cuda.cuDeviceGetAttribute(&bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device);
- +    } else {
- +      // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
- +      if (deviceName[0] == 'T') bTCC = 1;
- +    }
- +    if (!bTCC) {
- +      if (major > 0 && major < 9999) {
- +        best_SM_arch = std::max(best_SM_arch, major);
- +      }
- +    }
- +    current_device++;
- +  }
- +
- +  // Find the best CUDA capable GPU device
- +  current_device = 0;
- +  while( current_device < device_count ) {
- +    cuda.cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, current_device);
- +    cuda.cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device);
- +    cuda.cuDeviceComputeCapability(&major, &minor, current_device);
- +
- +    // Fetch THIS device's name before the legacy TCC heuristic; otherwise the
- +    // check would look at the name left over from the last device of pass 1.
- +    deviceName[0] = '\0';
- +    cuda.cuDeviceGetName(deviceName, 256, current_device);
- +
- +    bTCC = 0;
- +    if (version >= 3020) {
- +      cuda.cuDeviceGetAttribute(&bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device);
- +    } else {
- +      // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
- +      if (deviceName[0] == 'T') bTCC = 1;
- +    }
- +
- +    if (major == 9999 && minor == 9999) {
- +      sm_per_multiproc = 1;
- +    } else {
- +      sm_per_multiproc = _ConvertSMVer2CoresDrvApi(major, minor);
- +    }
- +
- +    // If this is a Tesla based GPU and SM 2.0, and TCC is disabled, this is a contender
- +    if (!bTCC) // Is this GPU running the TCC driver? If so we pass on this
- +    {
- +      int compute_perf = multiProcessorCount * sm_per_multiproc * clockRate;
- +      if(compute_perf > max_compute_perf) {
- +        // If we find GPU with SM major > 2, search only these
- +        if (best_SM_arch > 2) {
- +          // If our device == best_SM_arch, then we pick this one
- +          if (major == best_SM_arch) {
- +            max_compute_perf = compute_perf;
- +            max_perf_device = current_device;
- +          }
- +        } else {
- +          max_compute_perf = compute_perf;
- +          max_perf_device = current_device;
- +        }
- +      }
- +
- +#ifdef _DEBUG
- +      // deviceName is a narrow string, so it needs %s (not %S).
- +      CLog::Log(LOGINFO,"CUDA Device: %s, Compute: %d.%d, CUDA Cores: %d, Clock: %d MHz", deviceName, major, minor, multiProcessorCount * sm_per_multiproc, clockRate / 1000);
- +#endif
- +    }
- +    ++current_device;
- +  }
- +  return max_perf_device;
- +}
- +
- +// Loads the CUDA/CUVID entry points, initialises the driver and creates a
- +// D3D9-interop CUDA context on the application's existing D3D device, then
- +// turns it into a floating context guarded by a CUVID context lock.
- +//
- +// On success m_cudaContext, m_cudaCtxLock, m_pD3D and m_bVDPAULevelC are set.
- +// Returns false (after logging) if any step fails or no context was created.
- +bool CDVDVideoCodecCuda::Init()
- +{
- +  CLog::Log(LOGINFO,"%s: Trying to open CUVID device",__FUNCTION__);
- +
- +  CUresult cuStatus = CUDA_SUCCESS;
- +
- +  if (!LoadCUDAFuncRefs())
- +  {
- +    CLog::Log(LOGERROR,"Loading CUDA interfaces failed");
- +    return false;
- +  }
- +
- +  cuStatus = cuda.cuInit(0);
- +  if (cuStatus != CUDA_SUCCESS)
- +  {
- +    CLog::Log(LOGERROR,"cuInit failed (status: %d)", cuStatus);
- +    return false;
- +  }
- +
- +  // TODO: select best device
- +  int best_device = GetMaxGflopsGraphicsDeviceId();
- +  if (best_device < 0)
- +  {
- +    CLog::Log(LOGERROR,"No CUDA capable device found");
- +    return false;
- +  }
- +  int device = best_device;
- +
- +  // The context is created on the application's own D3D9 device; no separate
- +  // device is created per adapter.
- +  IDirect3DDevice9 *pDev = g_Windowing.Get3DDevice();
- +  if (!pDev)
- +  {
- +    CLog::Log(LOGERROR,"No D3D device available");
- +    return false;
- +  }
- +
- +  HRESULT hr = pDev->GetDirect3D(&m_pD3D);
- +  if (FAILED(hr) || !m_pD3D)
- +  {
- +    CLog::Log(LOGERROR,"Retrieving the Direct3D interface failed");
- +    return false;
- +  }
- +
- +  D3DADAPTER_IDENTIFIER9 d3dId;
- +  unsigned uAdapterCount = m_pD3D->GetAdapterCount();
- +  for (unsigned lAdapter=0; lAdapter<uAdapterCount; lAdapter++) {
- +    CLog::Log(LOGINFO,"Trying D3D Adapter %d..", lAdapter);
- +
- +    CUcontext cudaCtx = 0;
- +    ZeroMemory(&d3dId, sizeof(d3dId));
- +    m_pD3D->GetAdapterIdentifier(lAdapter, 0, &d3dId);
- +
- +    cuStatus = cuda.cuD3D9CtxCreate(&cudaCtx, &device, CU_CTX_SCHED_BLOCKING_SYNC, pDev);
- +    if (cuStatus != CUDA_SUCCESS) {
- +      CLog::Log(LOGINFO,"D3D Device on adapter %d is not CUDA capable", lAdapter);
- +      continue;
- +    }
- +
- +    // Description is a narrow string, so it needs %s (not %S).
- +    CLog::Log(LOGINFO, "-> Created D3D Device on adapter %s (%d), using CUDA device %d", d3dId.Description, lAdapter, device);
- +
- +    BOOL isLevelC = IsLevelC(d3dId.DeviceId);
- +    CLog::Log(LOGINFO,"InitCUDA(): D3D Device with Id 0x%x is level C: %d", d3dId.DeviceId, isLevelC);
- +
- +    if (m_bVDPAULevelC && !isLevelC) {
- +      CLog::Log(LOGINFO, "InitCUDA(): We already had a Level C+ device, this one is not, skipping");
- +      // Drop the context we just created; otherwise it leaks and stays on
- +      // top of this thread's context stack.
- +      cuda.cuCtxDestroy(cudaCtx);
- +      continue;
- +    }
- +
- +    if (m_cudaContext)
- +      cuda.cuCtxDestroy(m_cudaContext);
- +
- +    // Store resources
- +    m_cudaContext = cudaCtx;
- +    m_bVDPAULevelC = isLevelC;
- +    // Is this the one we want?
- +    if (device == best_device)
- +      break;
- +  }
- +
- +  // A non-D3D fallback context (plain cuCtxCreate on best_device) is not
- +  // implemented, so having no context at this point is a hard failure.
- +  if (!m_cudaContext)
- +  {
- +    CLog::Log(LOGERROR, "Creation of CUDA context failed with error %d", cuStatus);
- +    return false;
- +  }
- +
- +  // Switch to a floating context
- +  CUcontext curr_ctx = NULL;
- +  cuStatus = cuda.cuCtxPopCurrent(&curr_ctx);
- +  if (cuStatus != CUDA_SUCCESS)
- +  {
- +    CLog::Log(LOGERROR, "Storing context on the stack failed with error %d", cuStatus);
- +    return false;
- +  }
- +
- +  cuStatus = cuda.cuvidCtxLockCreate(&m_cudaCtxLock, m_cudaContext);
- +  if (cuStatus != CUDA_SUCCESS) {
- +    CLog::Log(LOGERROR, "Creation of floating context failed with error %d", cuStatus);
- +    return false;
- +  }
- +
- +  return true;
- +}
- +
- +// Copies the length-prefixed parameter sets out of an avcC-style extradata
- +// blob.
- +//
- +// The first 5 bytes of src are skipped. Then, twice (SPS run, PPS run), a
- +// count byte (low 5 bits) is read, followed by that many records consisting
- +// of a 2-byte big-endian length plus payload. Each record is copied to dst
- +// including its 2-byte length prefix.
- +//
- +// src:      extradata, extralen bytes long.
- +// dst:      output buffer, must also hold at least extralen bytes.
- +// extralen: size of both buffers in bytes.
- +// Returns the number of bytes written to dst. Parsing stops at the first
- +// truncated or oversized record, and whatever was copied so far is returned.
- +DWORD avc_quant(BYTE *src, BYTE *dst, int extralen)
- +{
- +  DWORD cb = 0;
- +
- +  // Need the 5 skipped header bytes plus at least the first count byte.
- +  if (src == NULL || dst == NULL || extralen < 6)
- +    return 0;
- +
- +  BYTE* src_end = src + extralen;
- +  BYTE* dst_end = dst + extralen;
- +  src += 5;
- +
- +  // Two runs, for sps and pps
- +  for (int i = 0; i < 2; i++)
- +  {
- +    // The count byte itself must be inside the buffer.
- +    if (src >= src_end) { ASSERT(0); return cb; }
- +
- +    for (int n = *(src++) & 0x1f; n > 0; n--)
- +    {
- +      // The 2-byte length prefix must be readable before it is decoded.
- +      if (src_end - src < 2) { ASSERT(0); return cb; }
- +
- +      unsigned len = (((unsigned)src[0] << 8) | src[1]) + 2;
- +      if (len > (unsigned)(src_end - src) || len > (unsigned)(dst_end - dst)) {
- +        // Stop entirely: continuing with the next run would interpret
- +        // payload bytes as a count.
- +        ASSERT(0);
- +        return cb;
- +      }
- +
- +      memcpy(dst, src, len);
- +      src += len;
- +      dst += len;
- +      cb += len;
- +    }
- +  }
- +  return cb;
- +}
- +
- +
- +// Opens the CUVID decoder for the stream described by `hints`.
- +//
- +// Steps: Init() the CUDA context, map hints.codec to a CUVID codec, prepare
- +// the sequence header from hints.extradata, create the CUVID parser, the CUDA
- +// stream and the decoder, then feed the sequence data.
- +//
- +// hints:   stream description (codec, extradata, width, height).
- +// options: not used by this function.
- +// Returns true on success, false (after logging) on any failure. Resources
- +// created before a failure are left for DestroyDecoder() to release.
- +bool CDVDVideoCodecCuda::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options)
- +{
- +  if (!Init())
- +    return false;
- +  if (hints.codec == CODEC_ID_MPEG4)
- +    m_pFormatName.Format("cuda_mpeg4");
- +  else if (hints.codec == CODEC_ID_MPEG2VIDEO)
- +    m_pFormatName.Format("cuda_mpeg2");
- +  else if (hints.codec == CODEC_ID_H264)
- +    m_pFormatName.Format("cuda_h264");
- +  else if (hints.codec == CODEC_ID_VC1 || hints.codec == CODEC_ID_WMV3)
- +    m_pFormatName.Format("cuda_vc1");
- +  CLog::Log(LOGINFO, "CDecCuvid::InitDecoder(): Initializing CUVID decoder");
- +  HRESULT hr = S_OK;
- +
- +  if (!m_cudaContext) {
- +    CLog::Log(LOGERROR, " InitDecoder called without a cuda context");
- +    return false;
- +  }
- +
- +  // Free old device
- +  DestroyDecoder(false);
- +
- +  // Flush Display Queue
- +  memset(&m_DisplayQueue, 0, sizeof(m_DisplayQueue));
- +  for (int i=0; i<DISPLAY_DELAY; i++)
- +    m_DisplayQueue[i].picture_index = -1;
- +  m_DisplayPos = 0;
- +
- +  cudaVideoCodec cudaCodec = (cudaVideoCodec)-1;
- +  for (int i = 0; i < countof(cuda_codecs); i++) {
- +    if (cuda_codecs[i].ffcodec == hints.codec) {
- +      cudaCodec = cuda_codecs[i].cudaCodec;
- +      break;
- +    }
- +  }
- +
- +  if (cudaCodec == -1) {
- +    CLog::Log(LOGERROR, "Codec id %d does not map to a CUVID codec", hints.codec);
- +    return false;
- +  }
- +
- +  if (cudaCodec == cudaVideoCodec_MPEG4 && !m_bVDPAULevelC) {
- +    CLog::Log(LOGERROR, "Device is not capable to decode this format (not >= Level C)");
- +    return false;
- +  }
- +
- +  /*m_bUseTimestampQueue = (cudaCodec == cudaVideoCodec_H264 && m_pCallback->H264IsAVI())
- +  || (cudaCodec == cudaVideoCodec_MPEG4 && pmt->formattype != FORMAT_MPEG2Video)
- +  || (cudaCodec == cudaVideoCodec_VC1 && m_pCallback->VC1IsDTS());*/
- +  m_bUseTimestampQueue = (CODEC_ID_MPEG4 == hints.codec);
- +  m_bWaitForKeyframe = m_bUseTimestampQueue;
- +  m_bInterlaced = TRUE;
- +  m_bFormatIncompatible = FALSE;
- +  m_bTFF = TRUE;
- +  m_rtPrevDiff = AV_NOPTS_VALUE;
- +  m_bARPresent = TRUE;
- +
- +  // Create the CUDA Video Parser
- +  CUVIDPARSERPARAMS oVideoParserParameters;
- +  ZeroMemory(&oVideoParserParameters, sizeof(CUVIDPARSERPARAMS));
- +  oVideoParserParameters.CodecType = cudaCodec;
- +  oVideoParserParameters.ulMaxNumDecodeSurfaces = MAX_DECODE_FRAMES;
- +  oVideoParserParameters.ulMaxDisplayDelay = DISPLAY_DELAY;
- +  oVideoParserParameters.pUserData = this;
- +  oVideoParserParameters.pfnSequenceCallback = CDVDVideoCodecCuda::HandleVideoSequence; // Called before decoding frames and/or whenever there is a format change
- +  oVideoParserParameters.pfnDecodePicture = CDVDVideoCodecCuda::HandlePictureDecode; // Called when a picture is ready to be decoded (decode order)
- +  oVideoParserParameters.pfnDisplayPicture = CDVDVideoCodecCuda::HandlePictureDisplay; // Called whenever a picture is ready to be displayed (display order)
- +  oVideoParserParameters.ulErrorThreshold = m_bUseTimestampQueue ? 100 : 0;
- +
- +  memset(&m_VideoParserExInfo, 0, sizeof(CUVIDEOFORMATEX));
- +
- +  // Capacity of the fixed-size sequence header buffer handed to the parser;
- +  // nothing larger may be copied into it.
- +  const size_t seqHdrCapacity = sizeof(m_VideoParserExInfo.raw_seqhdr_data);
- +  BYTE* extradata = (BYTE*)hints.extradata;
- +
- +  //TODO
- +  //pmt->formattype == FORMAT_MPEG2Video && (pmt->subtype == MEDIASUBTYPE_AVC1 || pmt->subtype == MEDIASUBTYPE_avc1 || pmt->subtype == MEDIASUBTYPE_CCV1)) {
- +  // NOTE(review): this branch treats the extradata of every non-MPEG4 codec
- +  // (including MPEG2 and VC1) as AVC1/avcC data - confirm whether it should be
- +  // limited to H264 streams that carry avcC extradata.
- +  if (hints.codec != CODEC_ID_MPEG4)
- +  {
- +    m_AVC1Converter = new CAVC1AnnexBConverter();
- +    // Parameter sets inside the extradata use 2-byte length prefixes.
- +    m_AVC1Converter->SetNALUSize(2);
- +
- +    // avc_quant() skips 5 header bytes and then reads a count byte, so it
- +    // needs at least 6 bytes of extradata to do anything useful.
- +    if (extradata != NULL && hints.extrasize > 5)
- +    {
- +      BYTE *annexBextra = NULL;
- +      int size = 0;
- +
- +      BYTE* dwSequenceHeader = (BYTE*)malloc(hints.extrasize);
- +      if (dwSequenceHeader == NULL) {
- +        CLog::Log(LOGERROR, "Allocating the sequence header buffer failed");
- +        return false;
- +      }
- +
- +      int cbSequenceHeader = (int)avc_quant(extradata, dwSequenceHeader, (int)hints.extrasize);
- +      if (cbSequenceHeader > 0)
- +        m_AVC1Converter->Convert(&annexBextra, &size, dwSequenceHeader, cbSequenceHeader);
- +      // The temporary copy is only an input to Convert(); release it here.
- +      free(dwSequenceHeader);
- +
- +      if (annexBextra != NULL) {
- +        if (size > 0 && (size_t)size <= seqHdrCapacity) {
- +          memcpy(m_VideoParserExInfo.raw_seqhdr_data, annexBextra, size);
- +          m_VideoParserExInfo.format.seqhdr_data_length = size;
- +        } else if (size > 0) {
- +          CLog::Log(LOGWARNING, "Converted sequence header (%d bytes) does not fit the parser buffer, ignoring it", size);
- +        }
- +        m_dllAvUtil.av_freep(&annexBextra);
- +      }
- +    }
- +
- +    //m_AVC1Converter->SetNALUSize(smp2vi->dwFlags);
- +    // NOTE(review): the NAL length size is hard-coded to 4 rather than taken
- +    // from the extradata - confirm this is intended.
- +    m_AVC1Converter->SetNALUSize(4);
- +  }
- +  else if (extradata != NULL && hints.extrasize > 0)
- +  {
- +    //getExtraData(*pmt, m_VideoParserExInfo.raw_seqhdr_data, &hdr_len);
- +    if ((size_t)hints.extrasize <= seqHdrCapacity) {
- +      memcpy(m_VideoParserExInfo.raw_seqhdr_data, extradata, hints.extrasize);
- +      m_VideoParserExInfo.format.seqhdr_data_length = (unsigned int)hints.extrasize;
- +    } else {
- +      CLog::Log(LOGWARNING, "Extradata (%d bytes) does not fit the parser buffer, ignoring it", (int)hints.extrasize);
- +    }
- +  }
- +
- +  m_bNeedSequenceCheck = FALSE;
- +  if (m_VideoParserExInfo.format.seqhdr_data_length) {
- +    if (cudaCodec == cudaVideoCodec_H264) {
- +      // NOTE(review): CheckH264Sequence() returns bool, so hr is only ever
- +      // 0 (S_OK) or 1 (S_FALSE) here: FAILED(hr) can never be true and a
- +      // "true" result selects the S_FALSE branch. Confirm the intended
- +      // contract before changing this logic.
- +      hr = CheckH264Sequence(m_VideoParserExInfo.raw_seqhdr_data, m_VideoParserExInfo.format.seqhdr_data_length);
- +      if (FAILED(hr)) {
- +        return false;
- +      } else if (hr == S_FALSE) {
- +        m_bNeedSequenceCheck = TRUE;
- +      }
- +    } else if (cudaCodec == cudaVideoCodec_MPEG2) {
- +      CLog::Log(LOGINFO, "-> Scanning extradata for MPEG2 sequence header");
- +      CMPEG2HeaderParser mpeg2parser(m_VideoParserExInfo.raw_seqhdr_data, m_VideoParserExInfo.format.seqhdr_data_length);
- +      if (mpeg2parser.hdr.valid) {
- +        if (mpeg2parser.hdr.chroma >= 2) {
- +          CLog::Log(LOGERROR, "Sequence header indicates incompatible chroma sampling (chroma: %d)", mpeg2parser.hdr.chroma);
- +          return false;
- +        }
- +        m_bInterlaced = mpeg2parser.hdr.interlaced;
- +      }
- +    } else if (cudaCodec == cudaVideoCodec_VC1) {
- +      CVC1HeaderParser vc1Parser(m_VideoParserExInfo.raw_seqhdr_data, m_VideoParserExInfo.format.seqhdr_data_length);
- +      m_bInterlaced = vc1Parser.hdr.interlaced;
- +    }
- +  } else {
- +    m_bNeedSequenceCheck = (cudaCodec == cudaVideoCodec_H264);
- +  }
- +
- +  oVideoParserParameters.pExtVideoInfo = &m_VideoParserExInfo;
- +  CUresult oResult = cuda.cuvidCreateVideoParser(&m_hParser, &oVideoParserParameters);
- +  if (oResult != CUDA_SUCCESS) {
- +    CLog::Log(LOGERROR, "Creating parser for type %d failed with code %d", cudaCodec, oResult);
- +    // This function returns bool; E_FAIL would convert to true.
- +    return false;
- +  }
- +
- +  {
- +    cuda.cuvidCtxLock(m_cudaCtxLock, 0);
- +    oResult = cuda.cuStreamCreate(&m_hStream, 0);
- +    cuda.cuvidCtxUnlock(m_cudaCtxLock, 0);
- +    if (oResult != CUDA_SUCCESS) {
- +      CLog::Log(LOGERROR, "::InitCodec(): Creating stream failed");
- +      return false;
- +    }
- +  }
- +
- +  //BITMAPINFOHEADER *bmi = NULL;
- +  //videoFormatTypeHandler(pmt->Format(), pmt->FormatType(), &bmi);
- +
- +  {
- +    RECT rcDisplayArea = {0, 0, hints.width, hints.height};
- +    hr = CreateCUVIDDecoder(cudaCodec, hints.width, hints.height, hints.width, hints.height, rcDisplayArea);
- +    if (FAILED(hr)) {
- +      CLog::Log(LOGERROR,"Creating CUVID decoder failed");
- +      return false;
- +    }
- +  }
- +
- +  m_bForceSequenceUpdate = TRUE;
- +
- +  DecodeSequenceData();
- +
- +  return true;
- +}
- +
- +bool CDVDVideoCodecCuda::CheckH264Sequence(const BYTE *buffer, int buflen)
- +{
- +  // Parses the given data for an H.264 sequence parameter set (SPS).
- +  // Returns true only when a valid SPS was found AND it passes the
- +  // compatibility test below. false covers both "no valid SPS in this
- +  // buffer" and "SPS found but incompatible".
- +  CLog::Log(LOGINFO, "CDecCuvid::CheckH264Sequence(): Checking H264 frame for SPS");
- +  CH264SequenceParser h264parser;
- +  h264parser.ParseNALs(buffer, buflen, 0);
- +  if (!h264parser.sps.valid)
- +    return false;
- +
- +  // Stream properties are cached as soon as an SPS is seen, even when the
- +  // compatibility test below rejects it.
- +  m_bInterlaced = h264parser.sps.interlaced;
- +  m_iFullRange = h264parser.sps.full_range;
- +  m_bARPresent = h264parser.sps.ar_present;
- +  CLog::Log(LOGINFO, "SPS found");
- +
- +  // Accepted: profile value up to 100, chroma value 1, 8-bit luma and chroma.
- +  const bool bSupported = h264parser.sps.profile <= 100
- +                       && h264parser.sps.chroma == 1
- +                       && h264parser.sps.luma_bitdepth == 8
- +                       && h264parser.sps.chroma_bitdepth == 8;
- +  if (!bSupported) {
- +    CLog::Log(LOGERROR, "SPS indicates video incompatible with CUVID, aborting (profile: %d, chroma: %d, bitdepth: %d/%d)", h264parser.sps.profile, h264parser.sps.chroma, h264parser.sps.luma_bitdepth, h264parser.sps.chroma_bitdepth);
- +    return false;
- +  }
- +
- +  CLog::Log(LOGINFO, "Video seems compatible with CUVID");
- +  return true;
- +}
- +
- +void fillDXVAExtFormat(DXVA2_ExtendedFormat &fmt, int range, int primaries, int matrix, int transfer)
- +{
- +  // Translates the colour description (AVCOL_* values) into a
- +  // DXVA2_ExtendedFormat. The descriptor is zeroed first, so every field
- +  // whose input has no mapping below keeps the value 0.
- +  fmt.value = 0;
- +
- +  // Nominal range: -1 leaves the field at 0, any other non-zero value
- +  // selects 0-255 and zero selects 16-235.
- +  if (range != -1) {
- +    if (range)
- +      fmt.NominalRange = DXVA2_NominalRange_0_255;
- +    else
- +      fmt.NominalRange = DXVA2_NominalRange_16_235;
- +  }
- +
- +  // Color Primaries
- +  switch (primaries) {
- +  case AVCOL_PRI_BT709:     fmt.VideoPrimaries = DXVA2_VideoPrimaries_BT709;          break;
- +  case AVCOL_PRI_BT470M:    fmt.VideoPrimaries = DXVA2_VideoPrimaries_BT470_2_SysM;   break;
- +  case AVCOL_PRI_BT470BG:   fmt.VideoPrimaries = DXVA2_VideoPrimaries_BT470_2_SysBG;  break;
- +  case AVCOL_PRI_SMPTE170M: fmt.VideoPrimaries = DXVA2_VideoPrimaries_SMPTE170M;      break;
- +  case AVCOL_PRI_SMPTE240M: fmt.VideoPrimaries = DXVA2_VideoPrimaries_SMPTE240M;      break;
- +  }
- +
- +  // Color Space / Transfer Matrix
- +  // FCC and YCgCo are written as the raw values 6 and 7 via a cast.
- +  switch (matrix) {
- +  case AVCOL_SPC_BT709:     fmt.VideoTransferMatrix = DXVA2_VideoTransferMatrix_BT709;     break;
- +  case AVCOL_SPC_FCC:       fmt.VideoTransferMatrix = (DXVA2_VideoTransferMatrix)6;        break;
- +  case AVCOL_SPC_BT470BG:   // shares the BT.601 mapping with SMPTE170M
- +  case AVCOL_SPC_SMPTE170M: fmt.VideoTransferMatrix = DXVA2_VideoTransferMatrix_BT601;     break;
- +  case AVCOL_SPC_SMPTE240M: fmt.VideoTransferMatrix = DXVA2_VideoTransferMatrix_SMPTE240M; break;
- +  case 8: /* AVCOL_SPC_YCGCO */
- +                            fmt.VideoTransferMatrix = (DXVA2_VideoTransferMatrix)7;        break;
- +  }
- +
- +  // Color Transfer Function
- +  // NOTE(review): the last label is a colour-space (SPC) constant used in
- +  // the transfer switch; presumably it is relied on for its numeric value
- +  // because no AVCOL_TRC_SMPTE240M was available - confirm.
- +  switch (transfer) {
- +  case AVCOL_TRC_BT709:     fmt.VideoTransferFunction = DXVA2_VideoTransFunc_709;  break;
- +  case AVCOL_TRC_GAMMA22:   fmt.VideoTransferFunction = DXVA2_VideoTransFunc_22;   break;
- +  case AVCOL_TRC_GAMMA28:   fmt.VideoTransferFunction = DXVA2_VideoTransFunc_28;   break;
- +  case AVCOL_SPC_SMPTE240M: fmt.VideoTransferFunction = DXVA2_VideoTransFunc_240M; break;
- +  }
- +}
- +
- +CUVIDPARSERDISPINFO* CDVDVideoCodecCuda::GetNextFrame()
- +{
- +  // Address of the display-queue slot that follows the current position,
- +  // wrapping around after DISPLAY_DELAY entries.
- +  const int nextSlot = (m_DisplayPos + 1) % DISPLAY_DELAY;
- +  return m_DisplayQueue + nextSlot;
- +}
- +
- +int CUDAAPI CDVDVideoCodecCuda::HandleVideoSequence(void *obj, CUVIDEOFORMAT *cuvidfmt)
- +{
- +  // Parser callback for a (new) video sequence header.
- +  //   obj      - the CDVDVideoCodecCuda instance registered with the parser
- +  //   cuvidfmt - format description of the sequence
- +  // Re-creates the decoder when the format differs from the one the current
- +  // decoder was created with (or when an update is forced), then refreshes
- +  // the interlacing / frame-duration state and the DXVA extended format.
- +  // Returns TRUE on success, FALSE when the decoder could not be re-created.
- +  CLog::Log(LOGINFO, "%s: New Video Sequence",__FUNCTION__);
- +  CDVDVideoCodecCuda *filter = static_cast<CDVDVideoCodecCuda *>(obj);
- +
- +  CUVIDDECODECREATEINFO *dci = &filter->m_VideoDecoderInfo;
- +
- +  if ((cuvidfmt->codec != dci->CodecType)
- +    || (cuvidfmt->coded_width != dci->ulWidth)
- +    || (cuvidfmt->coded_height != dci->ulHeight)
- +    || (cuvidfmt->display_area.right != dci->ulTargetWidth)
- +    || (cuvidfmt->display_area.bottom != dci->ulTargetHeight)
- +    || (cuvidfmt->chroma_format != dci->ChromaFormat)
- +    || filter->m_bForceSequenceUpdate)
- +  {
- +    filter->m_bForceSequenceUpdate = FALSE;
- +    RECT rcDisplayArea = {cuvidfmt->display_area.left, cuvidfmt->display_area.top, cuvidfmt->display_area.right, cuvidfmt->display_area.bottom};
- +    // The previous decoder is destroyed inside CreateCUVIDDecoder(), so a
- +    // failure here leaves no usable decoder. Flag the stream so Decode()
- +    // reports an error instead of silently producing nothing.
- +    if (!filter->CreateCUVIDDecoder(cuvidfmt->codec, cuvidfmt->coded_width, cuvidfmt->coded_height, cuvidfmt->display_area.right, cuvidfmt->display_area.bottom, rcDisplayArea)) {
- +      CLog::Log(LOGERROR, "%s: Creating the CUVID decoder for the new sequence failed", __FUNCTION__);
- +      filter->m_bFormatIncompatible = TRUE;
- +      return FALSE;
- +    }
- +  }
- +
- +  filter->m_bInterlaced = !cuvidfmt->progressive_sequence;
- +  filter->m_bDoubleRateDeint = FALSE;
- +  if (filter->m_bInterlaced && cuvidfmt->frame_rate.numerator && cuvidfmt->frame_rate.denominator) {
- +    // Frame duration in 100ns units (10,000,000 per second).
- +    double dFrameTime = 10000000.0 / ((double)cuvidfmt->frame_rate.numerator / cuvidfmt->frame_rate.denominator);
- +    // Double-rate output is skipped when the frame time truncates to 41ms.
- +    if (filter->m_AccelDeintOutput == 0/*DeintOutput_FramePerField*/ && filter->m_VideoDecoderInfo.DeinterlaceMode != cudaVideoDeinterlaceMode_Weave && !filter->m_DeintTreatAsProgressive && (int)(dFrameTime / 10000.0) != 41) {
- +      filter->m_bDoubleRateDeint = TRUE;
- +      dFrameTime /= 2.0;
- +    }
- +    if (cuvidfmt->codec != cudaVideoCodec_MPEG4)
- +      filter->m_rtAvgTimePerFrame = REFERENCE_TIME(dFrameTime + 0.5);
- +    else
- +      filter->m_rtAvgTimePerFrame = AV_NOPTS_VALUE; //TODO: base on media type
- +  } else {
- +    filter->m_rtAvgTimePerFrame = AV_NOPTS_VALUE;
- +  }
- +  filter->m_VideoFormat = *cuvidfmt;
- +
- +  if (cuvidfmt->chroma_format != cudaVideoChromaFormat_420) {
- +    CLog::Log(LOGERROR, "CDecCuvid::HandleVideoSequence(): Incompatible Chroma Format detected");
- +    filter->m_bFormatIncompatible = TRUE;
- +  }
- +
- +  fillDXVAExtFormat(filter->m_DXVAExtendedFormat, filter->m_iFullRange, cuvidfmt->video_signal_description.color_primaries, cuvidfmt->video_signal_description.matrix_coefficients, cuvidfmt->video_signal_description.transfer_characteristics);
- +
- +  return TRUE;
- +}
- +
- +int CUDAAPI CDVDVideoCodecCuda::HandlePictureDecode(void *obj, CUVIDPICPARAMS *cuvidpic)
- +{
- +  // Parser callback: a picture is ready to be decoded.
- +  //   obj      - the CDVDVideoCodecCuda instance registered with the parser
- +  //   cuvidpic - picture parameters for the decoder
- +  // Returns FALSE when the picture is dropped (flushing, waiting for a
- +  // keyframe, invalid picture index), TRUE otherwise.
- +  CDVDVideoCodecCuda *filter = reinterpret_cast<CDVDVideoCodecCuda *>(obj);
- +
- +  if (filter->m_bFlushing)
- +    return FALSE;
- +
- +  if (filter->m_bWaitForKeyframe) {
- +    if (cuvidpic->intra_pic_flag)
- +      filter->m_bWaitForKeyframe = FALSE;
- +    else {
- +      // Pop timestamp from the queue, drop frame
- +      if (!filter->m_timestampQueue.empty()) {
- +        filter->m_timestampQueue.pop();
- +      }
- +      return FALSE;
- +    }
- +  }
- +
- +  // The index selects a slot in m_PicParams[MAX_PIC_INDEX]. A negative index
- +  // would also match the free (-1) display-queue slots and keep the flush
- +  // loop below spinning forever, so reject anything out of range up front.
- +  if (cuvidpic->CurrPicIdx < 0 || cuvidpic->CurrPicIdx >= MAX_PIC_INDEX) {
- +    CLog::Log(LOGERROR, "CDVDVideoCodecCuda::HandlePictureDecode(): picture index %d out of range", cuvidpic->CurrPicIdx);
- +    return FALSE;
- +  }
- +
- +  int flush_pos = filter->m_DisplayPos;
- +  for (;;) {
- +    bool frame_in_use = false;
- +    for (int i=0; i<DISPLAY_DELAY; i++) {
- +      if (filter->m_DisplayQueue[i].picture_index == cuvidpic->CurrPicIdx) {
- +        frame_in_use = true;
- +        break;
- +      }
- +    }
- +    if (!frame_in_use) {
- +      // No problem: we're safe to use this frame
- +      break;
- +    }
- +    // The target frame is still pending in the display queue:
- +    // Flush the oldest entry from the display queue and repeat
- +    if (filter->m_DisplayQueue[flush_pos].picture_index >= 0) {
- +      //TODO
- +      //filter->Display(&filter->m_DisplayQueue[flush_pos]);
- +      filter->m_DisplayQueue[flush_pos].picture_index = -1;
- +    }
- +    flush_pos = (flush_pos + 1) % DISPLAY_DELAY;
- +  }
- +
- +  filter->cuda.cuvidCtxLock(filter->m_cudaCtxLock, 0);
- +  // Keep a copy of the parameters; Deliver() reads the frame type from it.
- +  filter->m_PicParams[cuvidpic->CurrPicIdx] = *cuvidpic;
- +  __try {
- +    CUresult cuStatus = filter->cuda.cuvidDecodePicture(filter->m_hDecoder, cuvidpic);
- +  #ifdef _DEBUG
- +    if (cuStatus != CUDA_SUCCESS) {
- +      CLog::Log(LOGERROR, "CDVDVideoCodecCuda::HandlePictureDecode(): cuvidDecodePicture returned error code %d", cuStatus);
- +    }
- +  #endif
- +  } __except(1) {
- +    CLog::Log(LOGERROR, "CDVDVideoCodecCuda::HandlePictureDecode(): cuvidDecodePicture threw an exception");
- +  }
- +  filter->cuda.cuvidCtxUnlock(filter->m_cudaCtxLock, 0);
- +
- +  return TRUE;
- +}
- +
- +int CUDAAPI CDVDVideoCodecCuda::HandlePictureDisplay(void *obj, CUVIDPARSERDISPINFO *cuviddisp)
- +{
- +  // Parser callback: a decoded picture is ready for display. The display
- +  // info is stored in the display-queue slot at the current position.
- +  // Returns FALSE while flushing, TRUE otherwise (including when the sample
- +  // is dropped because of a negative timestamp).
- +  CDVDVideoCodecCuda *self = reinterpret_cast<CDVDVideoCodecCuda *>(obj);
- +
- +  if (self->m_bFlushing)
- +    return FALSE;
- +
- +  // Optionally replace the parser's timestamp with the oldest queued one.
- +  if (self->m_bUseTimestampQueue) {
- +    if (!self->m_timestampQueue.empty()) {
- +      cuviddisp->timestamp = self->m_timestampQueue.front();
- +      self->m_timestampQueue.pop();
- +    } else {
- +      cuviddisp->timestamp = AV_NOPTS_VALUE;
- +    }
- +  }
- +
- +  // Drop samples with negative timestamps (preroll)
- +  const bool bHasTimestamp = (cuviddisp->timestamp != AV_NOPTS_VALUE);
- +  if (bHasTimestamp && cuviddisp->timestamp < 0)
- +    return TRUE;
- +
- +  // Disabled in this port: displaying the previous occupant of the slot and
- +  // advancing m_DisplayPos here. Only the store into the slot is performed.
- +  self->m_DisplayQueue[self->m_DisplayPos] = *cuviddisp;
- +
- +  return TRUE;
- +}
- +
- +bool CDVDVideoCodecCuda::CreateCUVIDDecoder(cudaVideoCodec codec, DWORD dwWidth, DWORD dwHeight, DWORD dwDisplayWidth, DWORD dwDisplayHeight, RECT rcDisplayArea)
- +{
- +  // (Re-)creates the CUVID decoder for the given codec and dimensions.
- +  // Any existing decoder is destroyed first. Creation is attempted with the
- +  // DXVA-preferring flag when a D3D device is present on Vista or newer,
- +  // and retried once with the CUVID-preferring flag if that fails.
- +  // Returns true when a decoder was created.
- +  BOOL bPreferDXVA = (g_Windowing.Get3DDevice() && /*m_pSettings->GetHWAccelDeintHQ() &&*/ g_sysinfo.IsVistaOrHigher());
- +  bool bCreated = false;
- +
- +  cuda.cuvidCtxLock(m_cudaCtxLock, 0);
- +  CUVIDDECODECREATEINFO *pInfo = &m_VideoDecoderInfo;
- +
- +  for (;;) {
- +    if (m_hDecoder) {
- +      cuda.cuvidDestroyDecoder(m_hDecoder);
- +      m_hDecoder = 0;
- +    }
- +
- +    ZeroMemory(pInfo, sizeof(*pInfo));
- +    pInfo->ulWidth = dwWidth;
- +    pInfo->ulHeight = dwHeight;
- +    pInfo->ulNumDecodeSurfaces = MAX_DECODE_FRAMES;
- +    pInfo->CodecType = codec;
- +    pInfo->ChromaFormat = cudaVideoChromaFormat_420;
- +    pInfo->OutputFormat = cudaVideoSurfaceFormat_NV12;
- +    //TODO
- +    pInfo->DeinterlaceMode = (cudaVideoDeinterlaceMode)0;//m_pSettings->GetHWAccelDeintMode();
- +    pInfo->ulNumOutputSurfaces = 1;
- +
- +    pInfo->ulTargetWidth = dwDisplayWidth;
- +    pInfo->ulTargetHeight = dwDisplayHeight;
- +
- +    pInfo->display_area.left = (short)rcDisplayArea.left;
- +    pInfo->display_area.right = (short)rcDisplayArea.right;
- +    pInfo->display_area.top = (short)rcDisplayArea.top;
- +    pInfo->display_area.bottom = (short)rcDisplayArea.bottom;
- +
- +    pInfo->ulCreationFlags = bPreferDXVA ? cudaVideoCreate_PreferDXVA : cudaVideoCreate_PreferCUVID;
- +    pInfo->vidLock = m_cudaCtxLock;
- +
- +    // create the decoder
- +    if (cuda.cuvidCreateDecoder(&m_hDecoder, pInfo) == CUDA_SUCCESS) {
- +      bCreated = true;
- +      break;
- +    }
- +
- +    // Creation failed: give up unless the DXVA attempt can still be
- +    // replaced by a pure CUVID attempt.
- +    if (!bPreferDXVA)
- +      break;
- +    bPreferDXVA = FALSE;
- +  }
- +  cuda.cuvidCtxUnlock(m_cudaCtxLock, 0);
- +
- +  return bCreated;
- +}
- +bool CDVDVideoCodecCuda::DecodeSequenceData()
- +{
- +  // Feeds the stored sequence header (m_VideoParserExInfo) to the parser.
- +  // Returns true when there is nothing to feed or the parser accepted the
- +  // data, false when cuvidParseVideoData reported an error.
- +  CUVIDSOURCEDATAPACKET pCuvidPacket;
- +  ZeroMemory(&pCuvidPacket, sizeof(pCuvidPacket));
- +
- +  pCuvidPacket.payload = m_VideoParserExInfo.raw_seqhdr_data;
- +  pCuvidPacket.payload_size = m_VideoParserExInfo.format.seqhdr_data_length;
- +
- +  // No sequence header available: nothing to do, not an error.
- +  if (!pCuvidPacket.payload || !pCuvidPacket.payload_size)
- +    return true;
- +
- +  cuda.cuvidCtxLock(m_cudaCtxLock, 0);
- +  CUresult oResult = cuda.cuvidParseVideoData(m_hParser, &pCuvidPacket);
- +  cuda.cuvidCtxUnlock(m_cudaCtxLock, 0);
- +
- +  if (oResult != CUDA_SUCCESS) {
- +    CLog::Log(LOGERROR, "CDVDVideoCodecCuda::DecodeSequenceData(): cuvidParseVideoData failed with code %d", oResult);
- +    return false;
- +  }
- +
- +  return true;
- +}
- +
- +bool CDVDVideoCodecCuda::Display(CUVIDPARSERDISPINFO *cuviddisp, DVDVideoPicture* pDvdVideoPicture)
- +{
- +  // Applies the deinterlacing settings to the display info and hands the
- +  // frame to Deliver(): once for single-rate output, or once per field for
- +  // double-rate output of interlaced content.
- +  // Returns true when every Deliver() call succeeded.
- +  BOOL bTreatAsProgressive = m_DeintTreatAsProgressive;
- +
- +  if (bTreatAsProgressive) {
- +    cuviddisp->progressive_frame = TRUE;
- +    m_nSoftTelecine = FALSE;
- +  } else {
- +    if (m_VideoFormat.codec == cudaVideoCodec_MPEG2 || m_VideoFormat.codec == cudaVideoCodec_H264) {
- +      // A repeated first field (re)arms a two-frame countdown; the field
- +      // order is only re-read from the stream once it has run out.
- +      if (cuviddisp->repeat_first_field) {
- +        m_nSoftTelecine = 2;
- +      } else if (m_nSoftTelecine) {
- +        m_nSoftTelecine--;
- +      }
- +      if (!m_nSoftTelecine)
- +        m_bTFF = cuviddisp->top_field_first;
- +    }
- +
- +    cuviddisp->progressive_frame = (cuviddisp->progressive_frame && !(m_bInterlaced && m_DeintAggressive && m_VideoFormat.codec != cudaVideoCodec_VC1) && !m_DeintForce);
- +  }
- +
- +  DeintFieldOrder fo = m_DeIntFieldOrder;
- +  cuviddisp->top_field_first = (fo == DeintFieldOrder_Auto) ? (m_nSoftTelecine ? m_bTFF : cuviddisp->top_field_first) : (fo == DeintFieldOrder_TopFieldFirst);
- +
- +  bool bDelivered;
- +  if (m_bDoubleRateDeint) {
- +    if (cuviddisp->progressive_frame || m_nSoftTelecine) {
- +      bDelivered = Deliver(cuviddisp, pDvdVideoPicture, 2);
- +    } else {
- +      // Both fields are always attempted, as before; both write into the
- +      // same pDvdVideoPicture.
- +      const bool bFirstField = Deliver(cuviddisp, pDvdVideoPicture, 0);
- +      const bool bSecondField = Deliver(cuviddisp, pDvdVideoPicture, 1);
- +      bDelivered = bFirstField && bSecondField;
- +    }
- +  } else {
- +    bDelivered = Deliver(cuviddisp, pDvdVideoPicture);
- +  }
- +  // Previously returned S_OK, which is 0 and therefore always "false" for a
- +  // bool function.
- +  return bDelivered;
- +}
- +
- +bool CDVDVideoCodecCuda::Deliver(CUVIDPARSERDISPINFO *cuviddisp, DVDVideoPicture* pDvdVideoPicture, int field)
- +{
- +  // Maps the decoded frame, copies it into the host staging buffer
- +  // (m_pbRawNV12, re-allocated when too small) and fills pDvdVideoPicture
- +  // with size, timing, flags and plane pointers into that buffer.
- +  //   field - 1 marks the second field of a double-rate pair, 2 marks a
- +  //           full-duration frame in double-rate mode, 0 is the default.
- +  // Returns false when mapping, allocation or the copy fails.
- +  CUdeviceptr devPtr = 0;
- +  unsigned int pitch = 0, width = 0, height = 0;
- +  CUVIDPROCPARAMS vpp;
- +  CUresult cuStatus = CUDA_SUCCESS;
- +  // Declared before the first "goto cuda_fail" so that no jump crosses an
- +  // initialisation.
- +  int size = 0;
- +  int Ysize = 0;
- +  bool interlaced = false;
- +  REFERENCE_TIME rtStart = AV_NOPTS_VALUE, rtStop = AV_NOPTS_VALUE;
- +
- +  memset(&vpp, 0, sizeof(vpp));
- +  vpp.progressive_frame = !m_nSoftTelecine && cuviddisp->progressive_frame;
- +  vpp.top_field_first = cuviddisp->top_field_first;
- +  vpp.second_field = (field == 1);
- +
- +  cuda.cuvidCtxLock(m_cudaCtxLock, 0);
- +  cuStatus = cuda.cuvidMapVideoFrame(m_hDecoder, cuviddisp->picture_index, &devPtr, &pitch, &vpp);
- +  if (cuStatus != CUDA_SUCCESS) {
- +    CLog::Log(LOGERROR, "CDecCuvid::Deliver(): cuvidMapVideoFrame failed on index %d", cuviddisp->picture_index);
- +    goto cuda_fail;
- +  }
- +
- +  width = m_VideoDecoderInfo.display_area.right;
- +  height = m_VideoDecoderInfo.display_area.bottom;
- +  // NV12: a full-height luma plane followed by a half-height chroma plane,
- +  // both using the mapped pitch.
- +  size = pitch * height * 3 / 2;
- +
- +  if(!m_pbRawNV12 || size > m_cRawNV12) {
- +    if (m_pbRawNV12) {
- +      cuda.cuMemFreeHost(m_pbRawNV12);
- +      m_pbRawNV12 = NULL;
- +      m_cRawNV12 = 0;
- +    }
- +    cuStatus = cuda.cuMemAllocHost((void **)&m_pbRawNV12, size);
- +    if (cuStatus != CUDA_SUCCESS) {
- +      CLog::Log(LOGERROR, "CDecCuvid::Deliver(): cuMemAllocHost failed to allocate %d bytes (%d)", size, cuStatus);
- +      goto cuda_fail;
- +    }
- +    m_cRawNV12 = size;
- +  }
- +  // Copy memory from the device into the staging area
- +  if (m_pbRawNV12) {
- +#if USE_ASYNC_COPY
- +    cuStatus = cuda.cuMemcpyDtoHAsync(m_pbRawNV12, devPtr, size, m_hStream);
- +    if (cuStatus != CUDA_SUCCESS) {
- +      CLog::Log(LOGERROR, "Async Memory Transfer failed (%d)", cuStatus);
- +      goto cuda_fail;
- +    }
- +    // Poll until the copy queued on m_hStream has completed.
- +    while (CUDA_ERROR_NOT_READY == cuda.cuStreamQuery(m_hStream)) {
- +      Sleep(1);
- +    }
- +#else
- +    cuStatus = cuda.cuMemcpyDtoH(m_pbRawNV12, devPtr, size);
- +    if (cuStatus != CUDA_SUCCESS) {
- +      CLog::Log(LOGERROR, "Memory Transfer failed (%d)", cuStatus);
- +      goto cuda_fail;
- +    }
- +#endif
- +  } else {
- +    // If we don't have our memory, this is bad.
- +    CLog::Log(LOGERROR, "No Valid Staging Memory - failing");
- +    goto cuda_fail;
- +  }
- +  cuda.cuvidUnmapVideoFrame(m_hDecoder, devPtr);
- +  cuda.cuvidCtxUnlock(m_cudaCtxLock, 0);
- +
- +  if (m_rtAvgTimePerFrame != AV_NOPTS_VALUE) {
- +    pDvdVideoPicture->iDuration = m_rtAvgTimePerFrame;//supposed to be avg frame duration
- +  }
- +
- +  // Derive start/stop times from the distance to the next queued frame.
- +  rtStart = cuviddisp->timestamp;
- +  rtStop = AV_NOPTS_VALUE;
- +  if (rtStart != AV_NOPTS_VALUE) {
- +    CUVIDPARSERDISPINFO *next = GetNextFrame();
- +    if (next->picture_index != -1 && next->timestamp != AV_NOPTS_VALUE) {
- +      m_rtPrevDiff = next->timestamp - cuviddisp->timestamp;
- +    }
- +
- +    if (m_rtPrevDiff != AV_NOPTS_VALUE) {
- +      REFERENCE_TIME rtHalfDiff = m_rtPrevDiff >> 1;
- +      // The second field starts half a frame later.
- +      if (field == 1)
- +        rtStart += rtHalfDiff;
- +
- +      rtStop = rtStart + rtHalfDiff;
- +
- +      // Full frames last for the whole difference.
- +      if (field == 2 || !m_bDoubleRateDeint)
- +        rtStop += rtHalfDiff;
- +    }
- +
- +    // Sanity check in case the duration is null
- +    if (rtStop <= rtStart)
- +      rtStop = AV_NOPTS_VALUE;
- +  }
- +
- +  pDvdVideoPicture->format = DVDVideoPicture::FMT_NV12;
- +  pDvdVideoPicture->iWidth = width;
- +  pDvdVideoPicture->iHeight = height;
- +  pDvdVideoPicture->pts = rtStart;
- +  if (rtStop>0)
- +    pDvdVideoPicture->iDuration = rtStop - rtStart;
- +  pDvdVideoPicture->iRepeatPicture = cuviddisp->repeat_first_field;
- +  {
- +    // NOTE(review): the display size computed in this block is overwritten
- +    // unconditionally right after it, so it currently has no effect. It
- +    // also scales a *display* aspect ratio by width/height - confirm the
- +    // intended formula before enabling it.
- +    AVRational ar = { m_VideoFormat.display_aspect_ratio.x, m_VideoFormat.display_aspect_ratio.y };
- +    AVRational arDim = { width, height };
- +    double aspect_ratio;
- +    if (m_bARPresent || av_cmp_q(ar, arDim) != 0) {
- +      if (ar.num == 0)
- +        aspect_ratio = 0;
- +      else
- +        aspect_ratio = av_q2d(ar) * pDvdVideoPicture->iWidth / pDvdVideoPicture->iHeight;
- +
- +      if (aspect_ratio <= 0.0)
- +        aspect_ratio = (float)pDvdVideoPicture->iWidth / (float)pDvdVideoPicture->iHeight;
- +      pDvdVideoPicture->iDisplayHeight = pDvdVideoPicture->iHeight;
- +      pDvdVideoPicture->iDisplayWidth = ((int)RINT(pDvdVideoPicture->iHeight * aspect_ratio)) & -3;
- +      if (pDvdVideoPicture->iDisplayWidth > pDvdVideoPicture->iWidth)
- +      {
- +        pDvdVideoPicture->iDisplayWidth = pDvdVideoPicture->iWidth;
- +        pDvdVideoPicture->iDisplayHeight = ((int)RINT(pDvdVideoPicture->iWidth / aspect_ratio)) & -3;
- +      }
- +    }
- +  }
- +  pDvdVideoPicture->iDisplayHeight = pDvdVideoPicture->iHeight;
- +  pDvdVideoPicture->iDisplayWidth = pDvdVideoPicture->iWidth;
- +
- +  // Set the interlace / field-order flags from scratch: the picture struct
- +  // is supplied by the caller, and OR-ing alone would keep bits set for an
- +  // earlier frame.
- +  interlaced = !cuviddisp->progressive_frame && m_VideoDecoderInfo.DeinterlaceMode == cudaVideoDeinterlaceMode_Weave;
- +  pDvdVideoPicture->iFlags &= ~(DVP_FLAG_INTERLACED | DVP_FLAG_TOP_FIELD_FIRST);
- +  pDvdVideoPicture->iFlags |= interlaced ? DVP_FLAG_INTERLACED : 0;
- +  pDvdVideoPicture->iFlags |= cuviddisp->top_field_first ? DVP_FLAG_TOP_FIELD_FIRST: 0;
- +
- +  // TODO: This may be wrong for H264 where B-Frames can be references
- +  pDvdVideoPicture->iFrameType = m_PicParams[cuviddisp->picture_index].intra_pic_flag ? 'I' : (m_PicParams[cuviddisp->picture_index].ref_pic_flag ? 'P' : 'B');
- +
- +  // Point the picture planes into the staging buffer: luma first, the
- +  // interleaved chroma plane directly behind it.
- +  Ysize = height * pitch;
- +  pDvdVideoPicture->data[0] = m_pbRawNV12;
- +  pDvdVideoPicture->data[1] = m_pbRawNV12+Ysize;
- +  pDvdVideoPicture->iLineSize[0] = pDvdVideoPicture->iLineSize[1] = pitch;
- +
- +  return true;
- +
- +cuda_fail:
- +  // Only unmap when the frame was actually mapped.
- +  if (devPtr)
- +    cuda.cuvidUnmapVideoFrame(m_hDecoder, devPtr);
- +  cuda.cuvidCtxUnlock(m_cudaCtxLock, 0);
- +  return false;
- +}
- +
- +
- +void CDVDVideoCodecCuda::Dispose()
- +{
- +  // Intentionally empty in this revision: nothing is released here.
- +  // NOTE(review): parser, decoder, stream and staging buffer are presumably
- +  // torn down elsewhere (e.g. DestroyDecoder / destructor) - confirm.
- +}
- +
- +void CDVDVideoCodecCuda::SetDropState(bool bDrop)
- +{
- +  // Not implemented: the drop request (bDrop) is ignored.
- +}
- +
- +int CDVDVideoCodecCuda::Decode(BYTE* pData, int iSize, double dts, double pts)
- +{
- +  // Feeds one packet to the CUVID parser.
- +  //   pData/iSize - packet data; converted to Annex B first when an AVC1
- +  //                 converter is installed
- +  //   dts         - passed to the parser as the packet timestamp and, when
- +  //                 the timestamp queue is in use, pushed onto that queue
- +  //   pts         - unused
- +  // Returns VC_ERROR when the format was flagged incompatible, VC_BUFFER |
- +  // VC_PICTURE when the current display-queue slot holds a picture, and
- +  // VC_BUFFER otherwise.
- +  CUresult result = CUDA_SUCCESS;
- +
- +  CUVIDSOURCEDATAPACKET pCuvidPacket;
- +  ZeroMemory(&pCuvidPacket, sizeof(pCuvidPacket));
- +
- +  BYTE *pBuffer = NULL;
- +  if (m_AVC1Converter) {
- +    int size = 0;
- +    HRESULT hr = m_AVC1Converter->Convert(&pBuffer, &size, pData, iSize);
- +    if (SUCCEEDED(hr)) {
- +      pCuvidPacket.payload = pBuffer;
- +      pCuvidPacket.payload_size = size;
- +    }
- +  } else {
- +    pCuvidPacket.payload = pData;
- +    pCuvidPacket.payload_size = iSize;
- +  }
- +
- +  // CheckH264Sequence() returns a bool (true = SPS found and compatible).
- +  // It used to be stored in an HRESULT and tested with FAILED()/S_OK, which
- +  // inverted its meaning: the check was switched off by the first packet
- +  // WITHOUT a usable SPS. Keep checking until an acceptable SPS is seen.
- +  if (m_bNeedSequenceCheck && m_VideoDecoderInfo.CodecType == cudaVideoCodec_H264 && pCuvidPacket.payload) {
- +    if (CheckH264Sequence(pCuvidPacket.payload, pCuvidPacket.payload_size))
- +      m_bNeedSequenceCheck = FALSE;
- +  }
- +
- +  // NOTE(review): dts is a double compared against AV_NOPTS_VALUE; confirm
- +  // the caller uses that value (and not a player-specific one) for "none".
- +  if (dts != AV_NOPTS_VALUE) {
- +    pCuvidPacket.flags |= CUVID_PKT_TIMESTAMP;
- +    pCuvidPacket.timestamp = dts;
- +  }
- +
- +  //if (bDiscontinuity)
- +  //  pCuvidPacket.flags |= CUVID_PKT_DISCONTINUITY;
- +
- +  if (m_bUseTimestampQueue)
- +    m_timestampQueue.push(dts);
- +
- +  cuda.cuvidCtxLock(m_cudaCtxLock, 0);
- +  __try {
- +    result = cuda.cuvidParseVideoData(m_hParser, &pCuvidPacket);
- +  } __except(1) {
- +    CLog::Log(LOGERROR, "CDecCuvid::Decode(): cuvidParseVideoData threw an exception");
- +  }
- +  cuda.cuvidCtxUnlock(m_cudaCtxLock, 0);
- +
- +  if (result != CUDA_SUCCESS) {
- +    CLog::Log(LOGERROR, "CDecCuvid::Decode(): cuvidParseVideoData failed with code %d", result);
- +  }
- +
- +  // Releases the converter's output buffer (pBuffer stays NULL otherwise).
- +  m_dllAvUtil.av_freep(&pBuffer);
- +
- +  if (m_bFormatIncompatible) {
- +    CLog::Log(LOGERROR, "CDecCuvid::Decode(): Incompatible format detected, indicating failure...");
- +    return VC_ERROR;
- +  }
- +
- +  if (m_DisplayQueue[m_DisplayPos].picture_index >= 0)
- +    return VC_BUFFER | VC_PICTURE;
- +  return VC_BUFFER;
- +}
- +
- +void CDVDVideoCodecCuda::Reset(void)
- +{
- +  // Not implemented: no decoder, queue or timestamp state is cleared here.
- +}
- +
- +bool CDVDVideoCodecCuda::GetPicture(DVDVideoPicture* pDvdVideoPicture)
- +{
- +  // Hands out the picture waiting in the current display-queue slot, marks
- +  // the slot free and advances to the next slot.
- +  // Returns true when a picture was pending, false when the slot was empty.
- +  // The VC_* flag values returned before are all non-zero, so as a bool
- +  // they reported "true" even when no picture was available.
- +  CUVIDPARSERDISPINFO *pending = &m_DisplayQueue[m_DisplayPos];
- +  if (pending->picture_index < 0)
- +    return false;
- +
- +  // As before, the result of Display() is not consulted here.
- +  Display(pending, pDvdVideoPicture);
- +  pending->picture_index = -1;
- +  m_DisplayPos = (m_DisplayPos + 1) % DISPLAY_DELAY;
- +
- +  return true;
- +}
- +
- +
- +
- +
- +
- +
- +#endif
- \ No newline at end of file
- diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/CUDA.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/CUDA.h
- new file mode 100644
- index 0000000..245c916
- --- /dev/null
- +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/CUDA.h
- @@ -0,0 +1,184 @@
- +/*
- + * Copyright (C) 2012 Team XBMC
- + * http://www.xbmc.org
- + *
- + * This Program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License as published by
- + * the Free Software Foundation; either version 2, or (at your option)
- + * any later version.
- + *
- + * This Program is distributed in the hope that it will be useful,
- + * but WITHOUT ANY WARRANTY; without even the implied warranty of
- + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- + * GNU General Public License for more details.
- + *
- + * You should have received a copy of the GNU General Public License
- + * along with XBMC; see the file COPYING. If not, write to
- + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
- + * http://www.gnu.org/copyleft/gpl.html
- + *
- + */
- +#ifdef HAS_DX
- +#pragma once
- +#include "DVDCodecs/Video/DVDVideoCodecFFmpeg.h"
- +#include "Cuda/AVC1AnnexBConverter.h"
- +#include <dxva2api.h>
- +
- +#define MAX_DECODE_FRAMES 20
- +#define DISPLAY_DELAY 4
- +#define USE_ASYNC_COPY 1
- +#define MAX_PIC_INDEX 64
- +
- +#define CUDA_FORCE_API_VERSION 3010
- +#include "cuda/cuda.h"
- +#include "cuda/nvcuvid.h"
- +#include "cuda/cuda_dynlink.h"
- +#include <queue>
- +#include "dllavutil.h"
- +
- +#define CUMETHOD(name) t##name *name /* declares a pointer named <name> of type t<name>; '*' must not be token-pasted */
- +#define countof( array ) ( sizeof( array )/sizeof( array[0] ) )
- +
- +namespace CUDA {
- +
- + // Control the field order of the deinterlacer
- +enum DeintFieldOrder {
- +  DeintFieldOrder_Auto,             // take the field order from the stream / soft-telecine tracking
- +  DeintFieldOrder_TopFieldFirst,    // force top field first
- +  DeintFieldOrder_BottomFieldFirst  // force bottom field first
- +};
- +
- +class CDVDVideoCodecCuda : public CDVDVideoCodec // video codec that decodes through the CUVID parser/decoder API
- +{
- +public:
- + CDVDVideoCodecCuda();
- + virtual ~CDVDVideoCodecCuda();
- +
- + // Required overrides
- + virtual bool Open(CDVDStreamInfo &hints, CDVDCodecOptions &options);
- + virtual void Dispose(void);
- + virtual int Decode(BYTE *pData, int iSize, double dts, double pts); // returns a combination of VC_* flags
- + virtual void Reset(void);
- + virtual bool GetPicture(DVDVideoPicture *pDvdVideoPicture); // consumes the slot at m_DisplayPos
- + virtual void SetDropState(bool bDrop);
- + virtual const char* GetName(void) { return (const char*)m_pFormatName; }
- +
- +protected:
- + bool DestroyDecoder(bool full);
- + bool LoadCUDAFuncRefs();
- + int GetMaxGflopsGraphicsDeviceId();
- + bool Init();
- +
- + CStdString m_pFormatName; // returned by GetName()
- + DVDVideoPicture m_videobuffer;
- +private:
- + struct { // function pointers into the CUDA / CUVID libraries; presumably filled in by LoadCUDAFuncRefs() - confirm
- + HMODULE cudaLib;
- + CUMETHOD(cuInit);
- + CUMETHOD(cuCtxCreate);
- + CUMETHOD(cuCtxDestroy);
- + CUMETHOD(cuCtxPushCurrent);
- + CUMETHOD(cuCtxPopCurrent);
- + CUMETHOD(cuD3D9CtxCreate);
- + CUMETHOD(cuMemAllocHost);
- + CUMETHOD(cuMemFreeHost);
- + CUMETHOD(cuMemcpyDtoH);
- + CUMETHOD(cuMemcpyDtoHAsync);
- + CUMETHOD(cuStreamCreate);
- + CUMETHOD(cuStreamDestroy);
- + CUMETHOD(cuStreamQuery);
- + CUMETHOD(cuDeviceGetCount);
- + CUMETHOD(cuDriverGetVersion);
- + CUMETHOD(cuDeviceGetName);
- + CUMETHOD(cuDeviceComputeCapability);
- + CUMETHOD(cuDeviceGetAttribute);
- +
- + HMODULE cuvidLib;
- + CUMETHOD(cuvidCtxLockCreate);
- + CUMETHOD(cuvidCtxLockDestroy);
- + CUMETHOD(cuvidCtxLock);
- + CUMETHOD(cuvidCtxUnlock);
- + CUMETHOD(cuvidCreateVideoParser);
- + CUMETHOD(cuvidParseVideoData);
- + CUMETHOD(cuvidDestroyVideoParser);
- + CUMETHOD(cuvidCreateDecoder);
- + CUMETHOD(cuvidDecodePicture);
- + CUMETHOD(cuvidDestroyDecoder);
- + CUMETHOD(cuvidMapVideoFrame);
- + CUMETHOD(cuvidUnmapVideoFrame);
- + } cuda;
- +
- + IDirect3D9 *m_pD3D;
- +
- + CUcontext m_cudaContext;
- + CUvideoctxlock m_cudaCtxLock; // taken around parser, decoder and memory calls
- +
- + CUvideoparser m_hParser;
- + CUVIDEOFORMATEX m_VideoParserExInfo; // holds the raw sequence header passed to the parser
- +
- + CUvideodecoder m_hDecoder;
- + CUVIDDECODECREATEINFO m_VideoDecoderInfo; // parameters the current decoder was created with
- +
- + CUVIDEOFORMAT m_VideoFormat; // copy of the format from the last sequence callback
- +
- + CUVIDPARSERDISPINFO m_DisplayQueue[DISPLAY_DELAY]; // ring of pictures ready for display; picture_index -1 marks a free slot
- + int m_DisplayPos; // current slot in m_DisplayQueue
- +
- + CUVIDPICPARAMS m_PicParams[MAX_PIC_INDEX]; // picture parameters saved per picture index in HandlePictureDecode
- +
- + CUstream m_hStream; // stream used for the asynchronous device-to-host copy
- +
- + BOOL m_bVDPAULevelC;
- +
- + BOOL m_bForceSequenceUpdate; // forces decoder re-creation on the next sequence callback
- + BOOL m_bInterlaced;
- + BOOL m_bDoubleRateDeint; // deliver one picture per field
- + BOOL m_bFlushing; // callbacks drop pictures while set
- + REFERENCE_TIME m_rtAvgTimePerFrame; // frame duration in 100ns units, or AV_NOPTS_VALUE
- + REFERENCE_TIME m_rtPrevDiff; // last known timestamp distance to the next queued frame
- + BOOL m_bWaitForKeyframe; // drop pictures until an intra picture arrives
- + int m_iFullRange; // full_range value from the H.264 SPS
- +
- + DXVA2_ExtendedFormat m_DXVAExtendedFormat;
- +
- + BYTE *m_pbRawNV12; // host staging buffer allocated with cuMemAllocHost
- + int m_cRawNV12; // size of m_pbRawNV12 in bytes
- +
- + CAVC1AnnexBConverter *m_AVC1Converter; // when set, Decode() converts packets before parsing
- +
- + BOOL m_bFormatIncompatible; // makes Decode() return VC_ERROR
- + BOOL m_bNeedSequenceCheck; // H.264 packets are still scanned for an SPS
- +
- + BOOL m_bUseTimestampQueue;
- + std::queue<REFERENCE_TIME> m_timestampQueue; // pushed in Decode(), popped in the parser callbacks
- +
- + int m_nSoftTelecine; // countdown re-armed to 2 by repeat_first_field
- + BOOL m_bTFF; // field order remembered while m_nSoftTelecine is active
- + BOOL m_bARPresent; // ar_present value from the H.264 SPS
- +
- + int m_AccelDeintOutput; // 0 selects frame-per-field output
- + BOOL m_DeintTreatAsProgressive;
- + BOOL m_DeintAggressive;
- + BOOL m_DeintForce;
- + DeintFieldOrder m_DeIntFieldOrder;
- +
- + // CUDA Callbacks
- + static int CUDAAPI HandleVideoSequence(void *obj, CUVIDEOFORMAT *cuvidfmt);
- + static int CUDAAPI HandlePictureDecode(void *obj, CUVIDPICPARAMS *cuvidpic);
- + static int CUDAAPI HandlePictureDisplay(void *obj, CUVIDPARSERDISPINFO *cuviddisp);
- +
- +
- + bool Display(CUVIDPARSERDISPINFO *cuviddisp, DVDVideoPicture* pDvdVideoPicture);
- + bool Deliver(CUVIDPARSERDISPINFO *cuviddisp, DVDVideoPicture* pDvdVideoPicture ,int field = 0); // field: 1 = second field, 2 = full-duration frame in double-rate mode
- +
- + bool CreateCUVIDDecoder(cudaVideoCodec codec, DWORD dwWidth, DWORD dwHeight, DWORD dwDisplayWidth, DWORD dwDisplayHeight, RECT rcDisplayArea);
- + bool DecodeSequenceData();
- + bool CheckH264Sequence(const BYTE *buffer, int buflen); // true only for a valid, compatible SPS
- + //STDMETHODIMP FlushParser();
- + CUVIDPARSERDISPINFO* GetNextFrame(); // slot after m_DisplayPos
- + DllAvUtil m_dllAvUtil;
- +};
- +
- +
- +};
- +#endif
- \ No newline at end of file
- diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/AVC1AnnexBConverter.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/AVC1AnnexBConverter.cpp
- new file mode 100644
- index 0000000..71de694
- --- /dev/null
- +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/AVC1AnnexBConverter.cpp
- @@ -0,0 +1,96 @@
- +/*
- + * Copyright (C) 2010-2012 Hendrik Leppkes
- + * http://www.1f0.de
- + *
- + * This program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License as published by
- + * the Free Software Foundation; either version 2 of the License, or
- + * (at your option) any later version.
- + *
- + * This program is distributed in the hope that it will be useful,
- + * but WITHOUT ANY WARRANTY; without even the implied warranty of
- + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- + * GNU General Public License for more details.
- + *
- + * You should have received a copy of the GNU General Public License along
- + * with this program; if not, write to the Free Software Foundation, Inc.,
- + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- + */
- +
- +
- +#include "AVC1AnnexBConverter.h"
- +
- +
- +
- +#include "libavutil/intreadwrite.h"
- +
- +CAVC1AnnexBConverter::CAVC1AnnexBConverter(void)
- +  : m_NaluSize(0) // defined starting value; the real length-prefix size arrives via SetNALUSize()
- +{
- +  // The avutil wrapper provides the allocation helpers used by Convert().
- +  m_dllAvUtil.Load();
- +}
- +
- +CAVC1AnnexBConverter::~CAVC1AnnexBConverter(void)
- +{
- +  // Nothing to release explicitly: the converter owns no buffers itself.
- +}
- +
- +HRESULT CAVC1AnnexBConverter::alloc_and_copy(uint8_t **poutbuf, int *poutbuf_size, const uint8_t *in, uint32_t in_size)
- +{
- +  // Appends one NAL unit, preceded by a start code, to the output buffer.
- +  //   poutbuf / poutbuf_size - growing output buffer and its current size;
- +  //                            both are updated only on success
- +  //   in / in_size           - NAL unit payload to append
- +  // The first unit gets a 4-byte start code (00 00 00 01), every following
- +  // unit a 3-byte one (00 00 01).
- +  // Returns S_OK, or E_OUTOFMEMORY when the buffer cannot be grown.
- +  const uint32_t offset = *poutbuf_size;
- +  const uint8_t nal_header_size = offset ? 3 : 4;
- +
- +  // The resulting size must still fit the int the caller keeps it in.
- +  if (offset > 0x7fffffffu - nal_header_size || in_size > 0x7fffffffu - nal_header_size - offset)
- +    return E_OUTOFMEMORY;
- +  const uint32_t new_size = offset + nal_header_size + in_size;
- +
- +  void *tmp = m_dllAvUtil.av_realloc(*poutbuf, new_size);
- +  if (!tmp)
- +    return E_OUTOFMEMORY;
- +
- +  // Commit the new buffer and size together, only after the realloc worked.
- +  *poutbuf = (uint8_t *)tmp;
- +  *poutbuf_size = (int)new_size;
- +
- +  uint8_t *dst = *poutbuf + offset;
- +  if (!offset) {
- +    AV_WB32(dst, 1);
- +  } else {
- +    dst[0] = dst[1] = 0;
- +    dst[2] = 1;
- +  }
- +  memcpy(dst + nal_header_size, in, in_size);
- +
- +  return S_OK;
- +}
- +
- +HRESULT CAVC1AnnexBConverter::Convert(BYTE **poutbuf, int *poutbuf_size, const BYTE *buf, int buf_size)
- +{
- +  // Converts a packet of length-prefixed NAL units into start-code
- +  // separated form.
- +  //   poutbuf / poutbuf_size - receive the converted buffer and its size;
- +  //                            on failure the buffer is freed and the size
- +  //                            is reset to 0
- +  //   buf / buf_size         - input packet; every NAL unit is preceded by a
- +  //                            big-endian length of m_NaluSize bytes
- +  // Returns S_OK on success, E_FAIL on malformed input, an unsupported
- +  // length-prefix size (only 1..4 bytes are accepted) or allocation failure.
- +  int32_t nal_size;
- +  const uint8_t *buf_end;
- +
- +  *poutbuf_size = 0;
- +
- +  if (!buf || buf_size <= 0 || m_NaluSize < 1 || m_NaluSize > 4)
- +    goto fail;
- +
- +  buf_end = buf + buf_size;
- +
- +  do {
- +    // The length prefix itself must be fully inside the packet.
- +    if (buf_end - buf < m_NaluSize)
- +      goto fail;
- +
- +    switch (m_NaluSize) {
- +    case 1:
- +      nal_size = buf[0];
- +      break;
- +    case 2:
- +      nal_size = AV_RB16(buf);
- +      break;
- +    case 3:
- +      // Read exactly three bytes; a 32-bit load would touch one byte past
- +      // the prefix and sign-extend when shifted down.
- +      nal_size = (buf[0] << 16) | (buf[1] << 8) | buf[2];
- +      break;
- +    default:
- +      nal_size = AV_RB32(buf);
- +      break;
- +    }
- +
- +    buf += m_NaluSize;
- +
- +    // Validate the length before it is used for any pointer arithmetic.
- +    if (nal_size < 0 || nal_size > buf_end - buf)
- +      goto fail;
- +
- +    if (FAILED(alloc_and_copy(poutbuf, poutbuf_size, buf, nal_size)))
- +      goto fail;
- +
- +    buf += nal_size;
- +  } while (buf < buf_end);
- +
- +  return S_OK;
- +fail:
- +  m_dllAvUtil.av_freep(poutbuf);
- +  *poutbuf_size = 0;
- +  return E_FAIL;
- +}
- diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/AVC1AnnexBConverter.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/AVC1AnnexBConverter.h
- new file mode 100644
- index 0000000..dd45b17
- --- /dev/null
- +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/AVC1AnnexBConverter.h
- @@ -0,0 +1,34 @@
- +/*
- + * Copyright (C) 2010-2012 Hendrik Leppkes
- + * http://www.1f0.de
- + *
- + * This program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License as published by
- + * the Free Software Foundation; either version 2 of the License, or
- + * (at your option) any later version.
- + *
- + * This program is distributed in the hope that it will be useful,
- + * but WITHOUT ANY WARRANTY; without even the implied warranty of
- + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- + * GNU General Public License for more details.
- + *
- + * You should have received a copy of the GNU General Public License along
- + * with this program; if not, write to the Free Software Foundation, Inc.,
- + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- + */
- +
- +#pragma once
- +#include "DllAvUtil.h"
- +class CAVC1AnnexBConverter // rewrites length-prefixed NAL units into start-code separated form
- +{
- +public:
- + CAVC1AnnexBConverter(void);
- + ~CAVC1AnnexBConverter(void);
- +
- + HRESULT SetNALUSize(int nalusize) { m_NaluSize = nalusize; return S_OK; } // stores the value unchecked; always returns S_OK
- + HRESULT Convert(BYTE **poutbuf, int *poutbuf_size, const BYTE *buf, int buf_size); // on failure *poutbuf is freed and E_FAIL returned
- + HRESULT alloc_and_copy(uint8_t **poutbuf, int *poutbuf_size, const uint8_t *in, uint32_t in_size); // appends one NAL unit plus start code
- +private:
- + int m_NaluSize; // size in bytes of the length prefix in front of each NAL unit
- + DllAvUtil m_dllAvUtil; // provides av_realloc / av_freep
- +};
- diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/ByteParser.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/ByteParser.cpp
- new file mode 100644
- index 0000000..bd8b2b8
- --- /dev/null
- +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/ByteParser.cpp
- @@ -0,0 +1,101 @@
- +/*
- + * Copyright (C) 2010-2012 Hendrik Leppkes
- + * http://www.1f0.de
- + *
- + * This program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License as published by
- + * the Free Software Foundation; either version 2 of the License, or
- + * (at your option) any later version.
- + *
- + * This program is distributed in the hope that it will be useful,
- + * but WITHOUT ANY WARRANTY; without even the implied warranty of
- + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- + * GNU General Public License for more details.
- + *
- + * You should have received a copy of the GNU General Public License along
- + * with this program; if not, write to the Free Software Foundation, Inc.,
- + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- + *
- + * Initial design and concept by Gabest and the MPC-HC Team, copyright under GPLv2
- + */
- +
- +#include "ByteParser.h"
- +
- +#pragma warning( push )
- +#pragma warning( disable : 4018 )
- +#pragma warning( disable : 4244 )
- +extern "C" {
- +#define AVCODEC_X86_MATHOPS_H
- +#define __STDC_CONSTANT_MACROS
- +
- +#include "libavcodec/get_bits.h"
- +};
- +#pragma warning( pop )
- +
- +/**
- + * Build a parser over the bytes [pData, pData + length).
- + * Loads avutil, allocates a zero-filled GetBitContext through it and
- + * initialises that context with the buffer and its size expressed in bits.
- + */
- +CByteParser::CByteParser(const BYTE *pData, size_t length)
- +  : m_pData(pData)
- +  , m_pEnd(pData + length)
- +{
- +  m_dllAvUtil.Load();
- +
- +  void *pCtxMem = m_dllAvUtil.av_mallocz(sizeof(GetBitContext));
- +  m_gbCtx = (GetBitContext *)pCtxMem;
- +
- +  const int nBits = (int)(length << 3);
- +  init_get_bits(m_gbCtx, pData, nBits);
- +}
- +
- +// Releases the bit-reader context allocated in the constructor via av_freep.
- +CByteParser::~CByteParser()
- +{
- + m_dllAvUtil.av_freep(&m_gbCtx);
- +}
- +
- +/**
- + * Return numBits bits starting at the current read position.
- + *
- + * A request for zero bits yields 0 without touching the bit context.
- + * With peek set the bits are only shown (show_bits_long) and the position
- + * is left alone; otherwise they are consumed (get_bits_long).
- + */
- +unsigned int CByteParser::BitRead(unsigned int numBits, bool peek)
- +{
- +  if (!numBits)
- +    return 0;
- +
- +  return peek ? show_bits_long(m_gbCtx, numBits)
- +              : get_bits_long(m_gbCtx, numBits);
- +}
- +
- +/**
- + * Number of unread bits left in the buffer.
- + *
- + * get_bits_left() returns a signed int. If the reader has moved past the end
- + * of the data that value is negative, and converting it straight to size_t
- + * would report an enormous amount of remaining data. Clamp it to zero so the
- + * "is there anything left" checks in this class stay meaningful.
- + */
- +size_t CByteParser::RemainingBits() const
- +{
- +  const int nLeft = get_bits_left(m_gbCtx);
- +  return nLeft > 0 ? (size_t)nLeft : 0;
- +}
- +
- +/**
- + * Byte offset of the read position: the total buffer length minus the number
- + * of whole bytes reported by Remaining().
- + */
- +size_t CByteParser::Pos() const
- +{
- +  const size_t nTotalBytes = (size_t)(m_pEnd - m_pData);
- +  return nTotalBytes - Remaining();
- +}
- +
- +// Exponential Golomb Coding (with k = 0)
- +// As used in H.264/MPEG-4 AVC
- +// http://en.wikipedia.org/wiki/Exponential-Golomb_coding
- +
- +/**
- + * Read one unsigned Exp-Golomb (k = 0) code.
- + *
- + * Bits are consumed one at a time until a 1 bit is seen; n ends up as the
- + * number of zero bits that preceded it. The value is then
- + * (1 << n | next n bits) - 1.
- + *
- + * @return the decoded value, or 0 when the data runs out or the code is
- + *         too long to be represented in 32 bits.
- + */
- +unsigned CByteParser::UExpGolombRead()
- +{
- +  int n = -1;
- +  for (BYTE b = 0; !b && RemainingBits(); n++) {
- +    b = get_bits1(m_gbCtx);
- +  }
- +  if (!RemainingBits())
- +    return 0;
- +  // More than 31 leading zeros would make the shift below undefined, and a
- +  // suffix longer than the remaining data would read past the buffer end.
- +  // Both can only come from damaged input, so treat them as "no value".
- +  if (n > 31 || RemainingBits() < (size_t)n)
- +    return 0;
- +  return ((1u << n) | BitRead(n)) - 1;
- +}
- +
- +/**
- + * Read one signed Exp-Golomb (k = 0) code.
- + *
- + * The unsigned series 0, 1, 2, 3, 4, ... maps to 0, 1, -1, 2, -2, ...
- + * Working on (unsigned value + 1) puts the magnitude in the upper bits and
- + * the sign in the lowest bit: an odd k is negative (or zero), an even k is
- + * positive.
- + */
- +int CByteParser::SExpGolombRead()
- +{
- +  const int k = UExpGolombRead() + 1;
- +  const int nMagnitude = k >> 1;
- +  return (k & 1) ? -nMagnitude : nMagnitude;
- +}
- +
- +// Forwards to align_get_bits() on the parser's bit context.
- +void CByteParser::BitByteAlign()
- +{
- + align_get_bits(m_gbCtx);
- +}
- \ No newline at end of file
- diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/ByteParser.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/ByteParser.h
- new file mode 100644
- index 0000000..f1c927a
- --- /dev/null
- +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/ByteParser.h
- @@ -0,0 +1,69 @@
- +/*
- + * Copyright (C) 2010-2012 Hendrik Leppkes
- + * http://www.1f0.de
- + *
- + * This program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License as published by
- + * the Free Software Foundation; either version 2 of the License, or
- + * (at your option) any later version.
- + *
- + * This program is distributed in the hope that it will be useful,
- + * but WITHOUT ANY WARRANTY; without even the implied warranty of
- + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- + * GNU General Public License for more details.
- + *
- + * You should have received a copy of the GNU General Public License along
- + * with this program; if not, write to the Free Software Foundation, Inc.,
- + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- + *
- + * Initial design and concept by Gabest and the MPC-HC Team, copyright under GPLv2
- + */
- +
- +#pragma once
- +
- +#include "DllAvUtil.h"
- +
- +struct GetBitContext;
- +
- +/**
- +* Byte Parser Utility Class
- +*/
- +class CByteParser
- +{
- +public:
- +  /** Construct a Byte Parser to parse the given BYTE array with the given length */
- +  CByteParser(const BYTE *pData, size_t length);
- +  virtual ~CByteParser();
- +
- +  /** Read 1 to 32 Bits from the Byte Array. If peek is set, the data will just be returned, and the buffer not advanced. */
- +  unsigned int BitRead(unsigned int numBits, bool peek = false);
- +
- +  /** Read an unsigned number in Exponential Golomb encoding (with k = 0) */
- +  unsigned int UExpGolombRead();
- +  /** Read a signed number in Exponential Golomb encoding (with k = 0) */
- +  int SExpGolombRead();
- +
- +  /** Pointer to the start of the byte array */
- +  const BYTE *Start() const { return m_pData; }
- +  /** Pointer to the end of the byte array */
- +  const BYTE *End() const { return m_pEnd; }
- +
- +  /**
- +   * Overall length (in bytes) of the byte array.
- +   * Defined inline because the accompanying .cpp provides no definition.
- +   */
- +  size_t Length() const { return (size_t)(m_pEnd - m_pData); }
- +
- +  /** Byte offset of the current read position (Length() minus Remaining()) */
- +  size_t Pos() const;
- +
- +  /** Number of whole bytes remaining in the array */
- +  size_t Remaining() const { return RemainingBits() >> 3; }
- +
- +  /** Number of bits remaining */
- +  size_t RemainingBits() const;
- +
- +  /** Advance the read position to the next byte boundary */
- +  void BitByteAlign();
- +
- +private:
- +  // The destructor frees m_gbCtx, so a copied parser would free it twice.
- +  // Declared but intentionally not defined to forbid copying.
- +  CByteParser(const CByteParser &);
- +  CByteParser &operator=(const CByteParser &);
- +
- +  GetBitContext *m_gbCtx;   // bit reader state, allocated in the constructor
- +  DllAvUtil m_dllAvUtil;    // avutil wrapper used for allocation and release
- +  const BYTE *m_pData;      // first byte of the parsed array
- +  const BYTE *m_pEnd;       // one past the last byte of the parsed array
- +};
- diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264Nalu.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264Nalu.cpp
- new file mode 100644
- index 0000000..7524d92
- --- /dev/null
- +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264Nalu.cpp
- @@ -0,0 +1,106 @@
- +/*
- + * Copyright (C) 2010-2012 Hendrik Leppkes
- + * http://www.1f0.de
- + *
- + * This program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License as published by
- + * the Free Software Foundation; either version 2 of the License, or
- + * (at your option) any later version.
- + *
- + * This program is distributed in the hope that it will be useful,
- + * but WITHOUT ANY WARRANTY; without even the implied warranty of
- + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- + * GNU General Public License for more details.
- + *
- + * You should have received a copy of the GNU General Public License along
- + * with this program; if not, write to the Free Software Foundation, Inc.,
- + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- + *
- + * Initial design and concept by Gabest and the MPC-HC Team, copyright under GPLv2
- + */
- +
- +
- +#include "H264Nalu.h"
- +
- +/**
- + * Attach a new buffer and rewind all positions to its start.
- + *
- + * nNALSize is the width in bytes of the length prefix in front of each NAL.
- + * A value of 0 selects start-code scanning instead, in which case a non-empty
- + * buffer is immediately advanced to its first start code.
- + */
- +void CH264Nalu::SetBuffer(const BYTE* pBuffer, size_t nSize, int nNALSize)
- +{
- +  // Source data and framing mode
- +  m_pBuffer  = pBuffer;
- +  m_nSize    = nSize;
- +  m_nNALSize = nNALSize;
- +
- +  // Rewind every cursor
- +  m_nCurPos = m_nNextRTP = 0;
- +  m_nNALStartPos = m_nNALDataPos = 0;
- +
- +  const bool bStartCodeMode = (nNALSize == 0);
- +  if (bStartCodeMode && nSize > 0)
- +    MoveToNextAnnexBStartcode();
- +}
- +
- +/**
- + * Advance m_nCurPos to the next 00 00 01 start code at or after the current
- + * position.
- + *
- + * The bytes are compared individually, so the search does not depend on the
- + * host byte order and performs no unaligned 32-bit loads. The last offset
- + * examined is m_nSize - 4: a start code there is still followed by one byte
- + * of NAL data, so a trailing one-byte NAL is found as well.
- + *
- + * @return true when a start code was found; otherwise false with m_nCurPos
- + *         set to m_nSize (buffers shorter than 4 bytes are left untouched).
- + */
- +bool CH264Nalu::MoveToNextAnnexBStartcode()
- +{
- +  if (m_nSize < 4)
- +    return false;
- +
- +  // Highest offset where a 3-byte start code plus one data byte still fits
- +  const size_t nLast = m_nSize - 4;
- +
- +  for (size_t i = m_nCurPos; i <= nLast; i++) {
- +    if (m_pBuffer[i] == 0x00 && m_pBuffer[i+1] == 0x00 && m_pBuffer[i+2] == 0x01) {
- +      // Find next AnnexB Nal
- +      m_nCurPos = i;
- +      return true;
- +    }
- +  }
- +
- +  m_nCurPos = m_nSize;
- +  return false;
- +}
- +
- +/**
- + * Move the cursor to the start of the next length-prefixed NAL.
- + *
- + * @return true when m_nNextRTP lies inside the buffer and the cursor was
- + *         moved there; false when it does not, in which case the cursor is
- + *         parked at the end of the buffer.
- + */
- +bool CH264Nalu::MoveToNextRTPStartcode()
- +{
- +  const bool bInside = (m_nNextRTP < m_nSize);
- +  m_nCurPos = bInside ? m_nNextRTP : m_nSize;
- +  return bInside;
- +}
- +
- +/**
- + * Step to the next NAL unit in the buffer and decode its header byte.
- + *
- + * Length-prefixed mode (m_nNALSize != 0 and the cursor sits on a prefix):
- + * the big-endian prefix of m_nNALSize bytes is read and the cursor moves to
- + * the following prefix. Otherwise the data is treated as Annex B: zero bytes
- + * in front of the start code are skipped, the 3-byte start code is stepped
- + * over and the cursor moves to the next start code.
- + *
- + * On success m_nNALStartPos / m_nNALDataPos describe the unit and
- + * forbidden_bit, nal_reference_idc and nal_unit_type are filled from its
- + * first data byte.
- + *
- + * @return false when no further complete unit is available.
- + */
- +bool CH264Nalu::ReadNext()
- +{
- +  if (m_nCurPos >= m_nSize) return false;
- +
- +  if ((m_nNALSize != 0) && (m_nCurPos == m_nNextRTP))
- +  {
- +    if (m_nCurPos+m_nNALSize >= m_nSize) return false;
- +    // RTP Nalu type : (XX XX) XX XX NAL..., with XX XX XX XX or XX XX equal to NAL size
- +    m_nNALStartPos = m_nCurPos;
- +    m_nNALDataPos = m_nCurPos + m_nNALSize;
- +    unsigned nTemp = 0;
- +    for (int i=0; i<m_nNALSize; i++)
- +    {
- +      nTemp = (nTemp << 8) + m_pBuffer[m_nCurPos++];
- +    }
- +    m_nNextRTP += nTemp + m_nNALSize;
- +    MoveToNextRTPStartcode();
- +  }
- +  else
- +  {
- +    // Remove trailing bits: skip zero bytes until a 00 00 01 start code
- +    // begins at the cursor. Every access is bounds-checked so a buffer that
- +    // ends in zeros cannot be over-read.
- +    while (m_nCurPos < m_nSize && m_pBuffer[m_nCurPos] == 0x00) {
- +      const bool bStartCode = (m_nSize - m_nCurPos >= 3)
- +                           && m_pBuffer[m_nCurPos+1] == 0x00
- +                           && m_pBuffer[m_nCurPos+2] == 0x01;
- +      if (bStartCode)
- +        break;
- +      m_nCurPos++;
- +    }
- +
- +    // A unit needs the 3 start-code bytes plus at least its header byte.
- +    // Anything shorter is unusable tail data: report end of buffer.
- +    if (m_nSize - m_nCurPos < 4) {
- +      m_nCurPos = m_nSize;
- +      return false;
- +    }
- +
- +    // AnnexB Nalu : 00 00 01 NAL...
- +    m_nNALStartPos = m_nCurPos;
- +    m_nCurPos += 3;
- +    m_nNALDataPos = m_nCurPos;
- +    MoveToNextAnnexBStartcode();
- +  }
- +
- +  // Header byte layout: 1 bit forbidden, 2 bits reference idc, 5 bits type
- +  forbidden_bit = (m_pBuffer[m_nNALDataPos]>>7) & 1;
- +  nal_reference_idc = (m_pBuffer[m_nNALDataPos]>>5) & 3;
- +  nal_unit_type = (NALU_TYPE) (m_pBuffer[m_nNALDataPos] & 0x1f);
- +
- +  return true;
- +}
- diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264Nalu.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264Nalu.h
- new file mode 100644
- index 0000000..772c852
- --- /dev/null
- +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264Nalu.h
- @@ -0,0 +1,80 @@
- +/*
- + * Copyright (C) 2010-2012 Hendrik Leppkes
- + * http://www.1f0.de
- + *
- + * This program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License as published by
- + * the Free Software Foundation; either version 2 of the License, or
- + * (at your option) any later version.
- + *
- + * This program is distributed in the hope that it will be useful,
- + * but WITHOUT ANY WARRANTY; without even the implied warranty of
- + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- + * GNU General Public License for more details.
- + *
- + * You should have received a copy of the GNU General Public License along
- + * with this program; if not, write to the Free Software Foundation, Inc.,
- + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- + *
- + * Initial design and concept by Gabest and the MPC-HC Team, copyright under GPLv2
- + */
- +
- +#pragma once
- +
- +
- +// NAL unit type codes. CH264Nalu::ReadNext() casts the low 5 bits of the
- +// first NAL data byte to this type, so values outside this list can occur.
- +typedef enum
- +{
- + NALU_TYPE_SLICE = 1,
- + NALU_TYPE_DPA = 2,
- + NALU_TYPE_DPB = 3,
- + NALU_TYPE_DPC = 4,
- + NALU_TYPE_IDR = 5,
- + NALU_TYPE_SEI = 6,
- + NALU_TYPE_SPS = 7, // the only type CH264SequenceParser::ParseNALs acts on
- + NALU_TYPE_PPS = 8,
- + NALU_TYPE_AUD = 9,
- + NALU_TYPE_EOSEQ = 10,
- + NALU_TYPE_EOSTREAM = 11,
- + NALU_TYPE_FILL = 12
- +} NALU_TYPE;
- +
- +
- +/**
- + * Iterator over the NAL units of a byte buffer, in either start-code
- + * (Annex B) or length-prefixed framing. Call SetBuffer() and then ReadNext()
- + * repeatedly; the accessors describe the unit found by the last successful
- + * ReadNext().
- + */
- +class CH264Nalu
- +{
- +private :
- +  int forbidden_bit; //! should be always FALSE
- +  int nal_reference_idc; //! NALU_PRIORITY_xxxx
- +  NALU_TYPE nal_unit_type; //! NALU_TYPE_xxxx
- +
- +  size_t m_nNALStartPos; //! NALU start (including startcode / size)
- +  size_t m_nNALDataPos; //! Useful part
- +
- +  const BYTE *m_pBuffer;
- +  size_t m_nCurPos;
- +  size_t m_nNextRTP;
- +  size_t m_nSize;
- +  int m_nNALSize;
- +
- +  bool MoveToNextAnnexBStartcode();
- +  bool MoveToNextRTPStartcode();
- +
- +public :
- +  // The header fields are only written by ReadNext(); give them defined
- +  // values so GetType() / IsRefFrame() are safe to call before the first read.
- +  CH264Nalu()
- +    : forbidden_bit(0)
- +    , nal_reference_idc(0)
- +    , nal_unit_type((NALU_TYPE)0)
- +  {
- +    SetBuffer(NULL, 0, 0);
- +  }
- +
- +  NALU_TYPE GetType() const { return nal_unit_type; }
- +  bool IsRefFrame() const { return (nal_reference_idc != 0); }
- +
- +  /** Bytes from the NAL header byte up to the cursor */
- +  size_t GetDataLength() const { return m_nCurPos - m_nNALDataPos; }
- +  const BYTE *GetDataBuffer() { return m_pBuffer + m_nNALDataPos; }
- +  /**
- +   * Data length raised to the next multiple of 128. A length that already is
- +   * a multiple of 128 still grows by a full 128.
- +   */
- +  size_t GetRoundedDataLength() const
- +  {
- +    size_t nSize = m_nCurPos - m_nNALDataPos;
- +    return nSize + 128 - (nSize %128);
- +  }
- +
- +  /** Bytes from the start code / length prefix up to the cursor */
- +  size_t GetLength() const { return m_nCurPos - m_nNALStartPos; }
- +  const BYTE *GetNALBuffer() { return m_pBuffer + m_nNALStartPos; }
- +  bool IsEOF() const { return m_nCurPos >= m_nSize; }
- +
- +  void SetBuffer (const BYTE *pBuffer, size_t nSize, int nNALSize);
- +  bool ReadNext();
- +};
- diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264SequenceParser.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264SequenceParser.cpp
- new file mode 100644
- index 0000000..f25700e
- --- /dev/null
- +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264SequenceParser.cpp
- @@ -0,0 +1,186 @@
- +/*
- + * Copyright (C) 2010-2012 Hendrik Leppkes
- + * http://www.1f0.de
- + *
- + * This program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License as published by
- + * the Free Software Foundation; either version 2 of the License, or
- + * (at your option) any later version.
- + *
- + * This program is distributed in the hope that it will be useful,
- + * but WITHOUT ANY WARRANTY; without even the implied warranty of
- + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- + * GNU General Public License for more details.
- + *
- + * You should have received a copy of the GNU General Public License along
- + * with this program; if not, write to the Free Software Foundation, Inc.,
- + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- + */
- +
- +
- +#include "H264SequenceParser.h"
- +
- +#include "ByteParser.h"
- +#include "H264Nalu.h"
- +#include "DllAvCodec.h"
- +
- +// Starts with both result structs zeroed, so sps.valid and pps.valid are 0
- +// until a parse sets them.
- +CH264SequenceParser::CH264SequenceParser(void)
- +{
- + ZeroMemory(&sps, sizeof(sps));
- + ZeroMemory(&pps, sizeof(pps));
- +}
- +
- +
- +// Nothing to release: the destructor body is empty.
- +CH264SequenceParser::~CH264SequenceParser(void)
- +{
- +}
- +
- +/**
- + * Walk the NAL units in buffer and parse the first usable SPS into `sps`.
- + *
- + * @param buffer   data to scan
- + * @param buflen   number of bytes in buffer
- + * @param nal_size width of the NAL length prefix; 0 selects start-code scanning
- + * @return always S_OK; check sps.valid to learn whether an SPS was parsed.
- + */
- +HRESULT CH264SequenceParser::ParseNALs(const BYTE *buffer, size_t buflen, int nal_size)
- +{
- +  CH264Nalu nalu;
- +  nalu.SetBuffer(buffer, buflen, nal_size);
- +
- +  while (nalu.ReadNext()) {
- +    if (nalu.GetType() != NALU_TYPE_SPS)
- +      continue;
- +
- +    // The first data byte is the NAL header. A unit with nothing behind it
- +    // has no payload to parse, and subtracting 1 from a zero length would
- +    // wrap around to a huge size.
- +    const size_t total = nalu.GetDataLength();
- +    if (total < 2)
- +      continue;
- +
- +    ParseSPS(nalu.GetDataBuffer() + 1, total - 1);
- +    break;
- +  }
- +
- +  return S_OK;
- +}
- +
- +/**
- + * Consume one scaling list of `size` entries from the parser.
- + * Nothing is stored; the function only advances the read position by the
- + * right number of bits. A leading flag bit says whether the list is present.
- + */
- +static void SPSDecodeScalingList(CByteParser &parser, int size) {
- +  if (!parser.BitRead(1))
- +    return;
- +
- +  int last = 8;
- +  int next = 8;
- +  for (int idx = 0; idx < size; idx++) {
- +    // Once next has dropped to zero no further deltas are read
- +    if (next != 0)
- +      next = (last + parser.SExpGolombRead()) & 0xff;
- +    if (idx == 0 && next == 0) /* matrix not written */
- +      break;
- +    if (next != 0)
- +      last = next;
- +  }
- +}
- +
- +/**
- + * Parse a sequence parameter set and fill the public `sps` struct.
- + *
- + * @param buffer SPS payload, starting right after the NAL header byte
- + * @param buflen number of bytes in buffer
- + * @return S_OK on success; E_FAIL (with sps.valid reset to 0) when the
- + *         picture-order-count cycle length is outside the range the
- + *         standard allows.
- + *
- + * NOTE(review): the bytes are parsed as given; no 00 00 03 emulation
- + * prevention bytes are removed here - confirm this is acceptable for the
- + * fields read.
- + */
- +HRESULT CH264SequenceParser::ParseSPS(const BYTE *buffer, size_t buflen)
- +{
- +  CByteParser parser(buffer, buflen);
- +  int i;
- +
- +  ZeroMemory(&sps, sizeof(sps));
- +  // Defaults
- +  sps.valid = 1;
- +  sps.primaries = AVCOL_PRI_UNSPECIFIED;
- +  sps.trc = AVCOL_TRC_UNSPECIFIED;
- +  sps.colorspace = AVCOL_SPC_UNSPECIFIED;
- +  sps.full_range = -1;
- +
- +  // Parse
- +  sps.profile = parser.BitRead(8);
- +  parser.BitRead(4); // constraint flags
- +  parser.BitRead(4); // reserved
- +  sps.level = parser.BitRead(8);
- +  parser.UExpGolombRead(); // sps id
- +
- +  // The chroma / bit depth / scaling matrix fields are present for the High
- +  // family (profile_idc >= 100) and also for profile_idc 44, 83 and 86, which
- +  // a plain ">= 100" test misses.
- +  const bool has_high_fields = sps.profile >= 100
- +                            || sps.profile == 44
- +                            || sps.profile == 83
- +                            || sps.profile == 86;
- +
- +  if (has_high_fields) {
- +    sps.chroma = (int)parser.UExpGolombRead();
- +    if (sps.chroma == 3)
- +      parser.BitRead(1);
- +    sps.luma_bitdepth = (int)parser.UExpGolombRead() + 8;
- +    sps.chroma_bitdepth = (int)parser.UExpGolombRead() + 8;
- +    parser.BitRead(1); // transform_bypass
- +
- +    // decode scaling matrices
- +    int scaling = parser.BitRead(1);
- +    if (scaling) {
- +      // Decode scaling lists
- +      SPSDecodeScalingList(parser, 16); // Intra, Y
- +      SPSDecodeScalingList(parser, 16); // Intra, Cr
- +      SPSDecodeScalingList(parser, 16); // Intra, Cb
- +      SPSDecodeScalingList(parser, 16); // Inter, Y
- +      SPSDecodeScalingList(parser, 16); // Inter, Cr
- +      SPSDecodeScalingList(parser, 16); // Inter, Cb
- +
- +      SPSDecodeScalingList(parser, 64); // Intra, Y
- +      if (sps.chroma == 3) {
- +        SPSDecodeScalingList(parser, 64); // Intra, Cr
- +        SPSDecodeScalingList(parser, 64); // Intra, Cb
- +      }
- +      SPSDecodeScalingList(parser, 64); // Inter, Y
- +      if (sps.chroma == 3) {
- +        SPSDecodeScalingList(parser, 64); // Inter, Cr
- +        SPSDecodeScalingList(parser, 64); // Inter, Cb
- +      }
- +    }
- +  } else {
- +    sps.chroma = 1;
- +    sps.luma_bitdepth = 8;
- +    sps.chroma_bitdepth = 8;
- +  }
- +
- +  parser.UExpGolombRead(); // log2_max_frame_num
- +  int poc_type = (int)parser.UExpGolombRead(); // poc_type
- +  if (poc_type == 0)
- +    parser.UExpGolombRead(); // log2_max_poc_lsb
- +  else if (poc_type == 1) {
- +    parser.BitRead(1); // delta_pic_order_always_zero_flag
- +    parser.SExpGolombRead(); // offset_for_non_ref_pic
- +    parser.SExpGolombRead(); // offset_for_top_to_bottom_field
- +    int cyclen = (int)parser.UExpGolombRead(); // poc_cycle_length
- +    // The standard limits this count to 0..255. A larger (or wrapped
- +    // negative) value means damaged data and would otherwise drive the loop
- +    // below for up to billions of iterations.
- +    if (cyclen < 0 || cyclen > 255) {
- +      sps.valid = 0;
- +      return E_FAIL;
- +    }
- +    for (i = 0; i < cyclen; i++)
- +      parser.SExpGolombRead(); // offset_for_ref_frame[i]
- +  }
- +
- +  sps.ref_frames = parser.UExpGolombRead(); // ref_frame_count
- +  parser.BitRead(1); // gaps_in_frame_num_allowed_flag
- +  parser.UExpGolombRead(); // mb_width
- +  parser.UExpGolombRead(); // mb_height
- +  sps.interlaced = !parser.BitRead(1); // frame_mbs_only_flag
- +  if (sps.interlaced)
- +    parser.BitRead(1); // mb_aff
- +
- +  parser.BitRead(1); // direct_8x8_inference_flag
- +  int crop = parser.BitRead(1); // crop
- +  if (crop) {
- +    parser.UExpGolombRead(); // crop_left
- +    parser.UExpGolombRead(); // crop_right
- +    parser.UExpGolombRead(); // crop_top
- +    parser.UExpGolombRead(); // crop_bottom
- +  }
- +
- +  int vui_present = parser.BitRead(1); // vui_parameters_present_flag
- +  if (vui_present) {
- +    sps.ar_present = parser.BitRead(1); // aspect_ratio_info_present_flag
- +    if (sps.ar_present) {
- +      int ar_idc = parser.BitRead(8); // aspect_ratio_idc
- +      if (ar_idc == 255) {
- +        parser.BitRead(16); // sar.num
- +        parser.BitRead(16); // sar.den
- +      }
- +    }
- +
- +    int overscan = parser.BitRead(1); // overscan_info_present_flag
- +    if (overscan)
- +      parser.BitRead(1); // overscan_appropriate_flag
- +
- +    int vid_sig_type = parser.BitRead(1); // video_signal_type_present_flag
- +    if (vid_sig_type) {
- +      parser.BitRead(3); // video_format
- +      sps.full_range = parser.BitRead(1); // video_full_range_flag
- +
- +      int colorinfo = parser.BitRead(1); // colour_description_present_flag
- +      if (colorinfo) {
- +        sps.primaries = parser.BitRead(8);
- +        sps.trc = parser.BitRead(8);
- +        sps.colorspace = parser.BitRead(8);
- +      }
- +    }
- +  }
- +
- +  return S_OK;
- +}
- diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264SequenceParser.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264SequenceParser.h
- new file mode 100644
- index 0000000..2827bfd
- --- /dev/null
- +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/H264SequenceParser.h
- @@ -0,0 +1,55 @@
- +/*
- + * Copyright (C) 2010-2012 Hendrik Leppkes
- + * http://www.1f0.de
- + *
- + * This program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License as published by
- + * the Free Software Foundation; either version 2 of the License, or
- + * (at your option) any later version.
- + *
- + * This program is distributed in the hope that it will be useful,
- + * but WITHOUT ANY WARRANTY; without even the implied warranty of
- + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- + * GNU General Public License for more details.
- + *
- + * You should have received a copy of the GNU General Public License along
- + * with this program; if not, write to the Free Software Foundation, Inc.,
- + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- + */
- +
- +#pragma once
- +
- +// Extracts a few stream properties from an H.264 sequence parameter set.
- +// Call ParseNALs() and then read the public `sps` struct.
- +class CH264SequenceParser
- +{
- +public:
- + CH264SequenceParser(void);
- + virtual ~CH264SequenceParser(void);
- +
- + // Scans the buffer for the first SPS NAL and parses it; nal_size 0 means start-code framing.
- + HRESULT ParseNALs(const BYTE *buffer, size_t buflen, int nal_size);
- +
- +public:
- + struct {
- + int valid; // set to 1 by ParseSPS, 0 after construction
- +
- + int profile; // 8-bit profile value read from the SPS
- + int level; // 8-bit level value read from the SPS
- + int chroma; // chroma format value; 1 when the profile carries no such field
- + int luma_bitdepth; // coded value + 8; 8 when not present
- + int chroma_bitdepth; // coded value + 8; 8 when not present
- + int ref_frames; // ref_frame_count
- + int interlaced; // inverse of frame_mbs_only_flag
- + int ar_present; // aspect_ratio_info_present_flag
- +
- + int full_range; // video_full_range_flag, or -1 when not signalled
- + int primaries; // AVCOL_PRI_UNSPECIFIED unless a colour description is present
- + int trc; // AVCOL_TRC_UNSPECIFIED unless a colour description is present
- + int colorspace; // AVCOL_SPC_UNSPECIFIED unless a colour description is present
- + } sps;
- +
- + struct {
- + int valid; // zeroed in the constructor; no visible code sets it
- + } pps;
- +
- +private:
- + HRESULT ParseSPS(const BYTE *buffer, size_t buflen);
- +};
- diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/MPEG2HeaderParser.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/MPEG2HeaderParser.cpp
- new file mode 100644
- index 0000000..6889638
- --- /dev/null
- +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/MPEG2HeaderParser.cpp
- @@ -0,0 +1,109 @@
- +/*
- + * Copyright (C) 2010-2012 Hendrik Leppkes
- + * http://www.1f0.de
- + *
- + * This program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License as published by
- + * the Free Software Foundation; either version 2 of the License, or
- + * (at your option) any later version.
- + *
- + * This program is distributed in the hope that it will be useful,
- + * but WITHOUT ANY WARRANTY; without even the implied warranty of
- + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- + * GNU General Public License for more details.
- + *
- + * You should have received a copy of the GNU General Public License along
- + * with this program; if not, write to the Free Software Foundation, Inc.,
- + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- + */
- +
- +
- +#include "MPEG2HeaderParser.h"
- +
- +#pragma warning( push )
- +#pragma warning( disable : 4018 )
- +#pragma warning( disable : 4244 )
- +#define AVCODEC_X86_MATHOPS_H
- +#include "libavcodec/get_bits.h"
- +#pragma warning( pop )
- +
- +#define SEQ_START_CODE 0x000001b3
- +#define EXT_START_CODE 0x000001b5
- +
- +/**
- + * Locate the next 00 00 01 xx marker in [src, end).
- + *
- + * @return pointer to the first byte of the marker, or end when the range is
- + *         shorter than four bytes or holds no marker.
- + */
- +static inline const uint8_t* find_next_marker(const uint8_t *src, const uint8_t *end)
- +{
- +  if (end - src < 4)
- +    return end;
- +
- +  // Sliding window over the last four bytes seen; it starts out as all ones
- +  // so that nothing can match before four real bytes were shifted in.
- +  uint32_t window = 0xFFFFFFFF;
- +  for (const uint8_t *p = src; p < end; ++p) {
- +    window = (window << 8) | *p;
- +    if ((window >> 8) == 0x000001)
- +      return p - 3;
- +  }
- +  return end;
- +}
- +
- +// Zeroes hdr and parses the given data right away; read hdr afterwards
- +// (hdr.valid stays 0 when nothing usable was found).
- +CMPEG2HeaderParser::CMPEG2HeaderParser(const BYTE *pData, size_t length)
- +{
- + memset(&hdr, 0, sizeof(hdr));
- + ParseMPEG2Header(pData, length);
- +}
- +
- +// Nothing to release: the destructor body is empty.
- +CMPEG2HeaderParser::~CMPEG2HeaderParser(void)
- +{
- +}
- +
- +/**
- + * Walk the 00 00 01 xx markers in the data and hand the payload of sequence
- + * and extension headers to their parsers. Input shorter than 16 bytes is
- + * ignored.
- + *
- + * @param pData  data to scan
- + * @param length number of bytes in pData
- + */
- +void CMPEG2HeaderParser::ParseMPEG2Header(const BYTE *pData, size_t length)
- +{
- +  if (length < 16)
- +    return;
- +
- +  GetBitContext gb;
- +
- +  const uint8_t *start = pData;
- +  const uint8_t *end = start + length;
- +  const uint8_t *next = NULL;
- +
- +  int size;
- +
- +  start = find_next_marker(start, end);
- +  next = start;
- +
- +  for(; next < end; start = next) {
- +    next = find_next_marker(start + 4, end);
- +    // Payload length: everything between this 4-byte marker and the next one
- +    size = (int)(next - start - 4);
- +    // The reader is given (size - 4) * 8 bits. For payloads of four bytes or
- +    // fewer that is zero or negative, which is not a valid bit count, so
- +    // such units are skipped.
- +    if(size <= 4) continue;
- +
- +    init_get_bits(&gb, start + 4, (size - 4) * 8);
- +
- +    switch(AV_RB32(start)) {
- +    case SEQ_START_CODE:
- +      MPEG2ParseSequenceHeader(&gb);
- +      break;
- +    case EXT_START_CODE:
- +      MPEG2ParseExtHeader(&gb);
- +      break;
- +    }
- +  }
- +}
- +
- +// Placeholder: called for SEQ_START_CODE units but currently reads nothing.
- +void CMPEG2HeaderParser::MPEG2ParseSequenceHeader(GetBitContext *gb)
- +{
- +}
- +
- +/**
- + * Parse an extension header. Only extensions whose leading 4-bit identifier
- + * equals 1 are used; for those hdr is marked valid and profile, level,
- + * interlaced and chroma are filled in. Any other identifier is ignored.
- + */
- +void CMPEG2HeaderParser::MPEG2ParseExtHeader(GetBitContext *gb)
- +{
- +  const int ext_id = get_bits(gb, 4); // Start Code
- +  if (ext_id != 1)
- +    return;
- +
- +  hdr.valid = 1;
- +
- +  skip_bits(gb, 1); // profile and level esc
- +  hdr.profile = get_bits(gb, 3);
- +  hdr.level = get_bits(gb, 4);
- +
- +  // The stream carries a "progressive" bit; hdr stores its inverse
- +  const int progressive = get_bits1(gb);
- +  hdr.interlaced = !progressive;
- +  hdr.chroma = get_bits(gb, 2);
- +
- +  // TODO: Fill in other fields, if needed
- +}
- diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/MPEG2HeaderParser.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/MPEG2HeaderParser.h
- new file mode 100644
- index 0000000..b45837a
- --- /dev/null
- +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/MPEG2HeaderParser.h
- @@ -0,0 +1,45 @@
- +/*
- + * Copyright (C) 2010-2012 Hendrik Leppkes
- + * http://www.1f0.de
- + *
- + * This program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License as published by
- + * the Free Software Foundation; either version 2 of the License, or
- + * (at your option) any later version.
- + *
- + * This program is distributed in the hope that it will be useful,
- + * but WITHOUT ANY WARRANTY; without even the implied warranty of
- + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- + * GNU General Public License for more details.
- + *
- + * You should have received a copy of the GNU General Public License along
- + * with this program; if not, write to the Free Software Foundation, Inc.,
- + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- + */
- +
- +#pragma once
- +
- +struct GetBitContext;
- +
- +// Parses MPEG-2 headers out of a byte buffer at construction time and exposes
- +// the result through the public `hdr` struct.
- +class CMPEG2HeaderParser
- +{
- +public:
- + // Zeroes hdr, then parses pData (length bytes) immediately.
- + CMPEG2HeaderParser(const BYTE *pData, size_t length);
- + ~CMPEG2HeaderParser(void);
- +
- +public:
- + struct {
- + int valid; // 1 once an extension header with identifier 1 was parsed
- +
- + int profile; // 3-bit value from the extension header
- + int level; // 4-bit value from the extension header
- +
- + int interlaced; // inverse of the bit that follows the level
- + int chroma; // 2-bit value from the extension header
- + } hdr;
- +
- +private:
- + void ParseMPEG2Header(const BYTE *pData, size_t length);
- + void MPEG2ParseSequenceHeader(GetBitContext *gb);
- + void MPEG2ParseExtHeader(GetBitContext *gb);
- +};
- diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/VC1HeaderParser.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/VC1HeaderParser.cpp
- new file mode 100644
- index 0000000..e731eab
- --- /dev/null
- +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/VC1HeaderParser.cpp
- @@ -0,0 +1,203 @@
- +/*
- + * Copyright (C) 2010-2012 Hendrik Leppkes
- + * http://www.1f0.de
- + *
- + * This program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License as published by
- + * the Free Software Foundation; either version 2 of the License, or
- + * (at your option) any later version.
- + *
- + * This program is distributed in the hope that it will be useful,
- + * but WITHOUT ANY WARRANTY; without even the implied warranty of
- + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- + * GNU General Public License for more details.
- + *
- + * You should have received a copy of the GNU General Public License along
- + * with this program; if not, write to the Free Software Foundation, Inc.,
- + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- + */
- +
- +
- +#include "VC1HeaderParser.h"
- +
- +#pragma warning( push )
- +#pragma warning( disable : 4018 )
- +#pragma warning( disable : 4244 )
- +extern "C" {
- +#define AVCODEC_X86_MATHOPS_H
- +#include "libavcodec/get_bits.h"
- +//extern __declspec(dllimport) const AVRational ff_vc1_pixel_aspect[16];
- +};
- +#pragma warning( pop )
- +
- +// Pixel aspect ratios as { num, den }, indexed by the 4-bit aspect ratio code
- +// read in VC1ParseSequenceHeader. That function only looks up codes 1..13;
- +// codes 0, 14 and 15 take other paths there.
- +const AVRational ff_vc1_pixel_aspect[16] = {
- + { 0, 1 },
- + { 1, 1 },
- + { 12, 11 },
- + { 10, 11 },
- + { 16, 11 },
- + { 40, 33 },
- + { 24, 11 },
- + { 20, 11 },
- + { 32, 11 },
- + { 80, 33 },
- + { 18, 11 },
- + { 15, 11 },
- + { 64, 33 },
- + { 160, 99 },
- + { 0, 1 },
- + { 0, 1 }
- +};
- +
- +/** Markers used in VC-1 AP frame data */
- +//@{
- +// Full 32-bit marker values (00 00 01 xx). Enumerators without an explicit
- +// value continue counting from VC1_CODE_ENDOFSEQ.
- +enum VC1Code{
- + VC1_CODE_RES0 = 0x00000100, // also the base value tested by IS_MARKER
- + VC1_CODE_ENDOFSEQ = 0x0000010A,
- + VC1_CODE_SLICE, // 0x0000010B
- + VC1_CODE_FIELD, // 0x0000010C
- + VC1_CODE_FRAME, // 0x0000010D
- + VC1_CODE_ENTRYPOINT, // 0x0000010E
- + VC1_CODE_SEQHDR, // 0x0000010F, the only marker ParseVC1Header acts on
- +};
- +//@}
- +
- +/** Available Profiles */
- +//@{
- +// Values 0..3, compared against the 2-bit profile field read at the start of
- +// VC1ParseSequenceHeader.
- +enum Profile {
- + PROFILE_SIMPLE, // 0
- + PROFILE_MAIN, // 1
- + PROFILE_COMPLEX, ///< TODO: WMV9 specific
- + PROFILE_ADVANCED // 3, the only profile VC1ParseSequenceHeader parses further
- +};
- +//@}
- +
- +#define IS_MARKER(x) (((x) & ~0xFF) == VC1_CODE_RES0)
- +
- +/** Find VC-1 marker in buffer
- +* @return position where next marker starts or end of buffer if no marker found
- +*/
- +static inline const uint8_t* find_next_marker(const uint8_t *src, const uint8_t *end)
- +{
- +  // Fewer than four bytes cannot hold a marker
- +  if (end - src < 4)
- +    return end;
- +
- +  // Sliding window over the last four bytes seen; it starts out as all ones
- +  // so that nothing can match before four real bytes were shifted in.
- +  uint32_t window = 0xFFFFFFFF;
- +  for (const uint8_t *p = src; p < end; ++p) {
- +    window = (window << 8) | *p;
- +    if (IS_MARKER(window))
- +      return p - 3; // first byte of the four-byte marker
- +  }
- +  return end;
- +}
- +
- +/**
- + * Copy size bytes from src to dst, dropping escape bytes.
- + *
- + * A byte 0x03 is dropped when it follows two zero bytes and is itself
- + * followed by a byte smaller than 4; that following byte is copied in its
- + * place. Inputs shorter than four bytes are copied unchanged.
- + *
- + * @return number of bytes written to dst (size itself for short inputs)
- + */
- +static inline int vc1_unescape_buffer(const uint8_t *src, int size, uint8_t *dst)
- +{
- +  if (size < 4) {
- +    for (int k = 0; k < size; k++)
- +      dst[k] = src[k];
- +    return size;
- +  }
- +
- +  int out = 0;
- +  int pos = 0;
- +  while (pos < size) {
- +    const bool is_escape = src[pos] == 3
- +                        && pos >= 2
- +                        && src[pos - 1] == 0
- +                        && src[pos - 2] == 0
- +                        && pos < size - 1
- +                        && src[pos + 1] < 4;
- +    if (is_escape) {
- +      // Skip the 0x03 and emit the byte it was protecting
- +      dst[out++] = src[pos + 1];
- +      pos += 2;
- +    } else {
- +      dst[out++] = src[pos];
- +      pos += 1;
- +    }
- +  }
- +  return out;
- +}
- +
- +// Loads avutil, zeroes hdr and parses the given data right away; read hdr
- +// afterwards (hdr.valid stays 0 when no advanced-profile sequence header was found).
- +CVC1HeaderParser::CVC1HeaderParser(const BYTE *pData, size_t length)
- +{
- + m_dllAvUtil.Load();
- + memset(&hdr, 0, sizeof(hdr));
- + ParseVC1Header(pData, length);
- +}
- +
- +// Nothing to release explicitly: the destructor body is empty.
- +CVC1HeaderParser::~CVC1HeaderParser(void)
- +{
- +}
- +
- +/**
- + * Walk the 00 00 01 xx markers in the data, unescape each unit into a
- + * scratch buffer and hand sequence headers to VC1ParseSequenceHeader().
- + * Input shorter than 16 bytes is ignored.
- + *
- + * @param pData  data to scan
- + * @param length number of bytes in pData
- + */
- +void CVC1HeaderParser::ParseVC1Header(const BYTE *pData, size_t length)
- +{
- +  if (length < 16)
- +    return;
- +
- +  GetBitContext gb;
- +
- +  const uint8_t *start = pData;
- +  const uint8_t *end = start + length;
- +  const uint8_t *next = NULL;
- +
- +  int size, buf2_size;
- +  uint8_t *buf2;
- +
- +  // Scratch space for the unescaped payload; unescaping never grows the data,
- +  // so the input length plus padding is enough.
- +  buf2 = (uint8_t *)m_dllAvUtil.av_mallocz(length + 16);//FF_INPUT_BUFFER_PADDING_SIZE);
- +  // Without the scratch buffer nothing can be parsed; hdr keeps its zeroed state
- +  if (!buf2)
- +    return;
- +
- +  start = find_next_marker(start, end);
- +  next = start;
- +
- +  for(; next < end; start = next) {
- +    next = find_next_marker(start + 4, end);
- +    size = (int)(next - start - 4);
- +    if(size <= 0) continue;
- +    buf2_size = vc1_unescape_buffer(start + 4, size, buf2);
- +
- +    init_get_bits(&gb, buf2, buf2_size * 8);
- +
- +    switch(AV_RB32(start)) {
- +    case VC1_CODE_SEQHDR:
- +      VC1ParseSequenceHeader(&gb);
- +      break;
- +    }
- +  }
- +  m_dllAvUtil.av_freep(&buf2);
- +}
- +
- +/**
- + * Parse a sequence header. hdr.profile is always stored; everything else is
- + * only read for the advanced profile, in which case hdr is marked valid and
- + * level, width, height, broadcast, interlaced and (when display info is
- + * present) the aspect ratio are filled in.
- + */
- +void CVC1HeaderParser::VC1ParseSequenceHeader(GetBitContext *gb)
- +{
- +  hdr.profile = get_bits(gb, 2);
- +  if (hdr.profile != PROFILE_ADVANCED)
- +    return;
- +
- +  hdr.valid = 1;
- +
- +  hdr.level = get_bits(gb, 3);
- +  skip_bits(gb, 2); // Chroma Format, only 1 should be set for 4:2:0
- +  skip_bits(gb, 3); // frmrtq_postproc
- +  skip_bits(gb, 5); // bitrtq_postproc
- +  skip_bits1(gb); // postprocflag
- +
- +  // Both dimensions are coded as (value / 2) - 1
- +  hdr.width = (get_bits(gb, 12) + 1) << 1;
- +  hdr.height = (get_bits(gb, 12) + 1) << 1;
- +
- +  hdr.broadcast = get_bits1(gb); // broadcast
- +  hdr.interlaced = get_bits1(gb); // interlaced
- +
- +  skip_bits1(gb); // tfcntrflag
- +  skip_bits1(gb); // finterpflag
- +  skip_bits1(gb); // reserved
- +  skip_bits1(gb); // psf
- +
- +  // TODO: add other fields
- +
- +  const bool has_display_info = get_bits1(gb) != 0; // Display Info
- +  if (!has_display_info)
- +    return;
- +
- +  int disp_w = get_bits(gb, 14) + 1;
- +  int disp_h = get_bits(gb, 14) + 1;
- +
- +  int ar_code = 0;
- +  if (get_bits1(gb))
- +    ar_code = get_bits(gb, 4);
- +
- +  if (ar_code == 15) {
- +    // Explicit ratio follows as two 8-bit values, each stored minus one
- +    disp_w = get_bits(gb, 8) + 1;
- +    disp_h = get_bits(gb, 8) + 1;
- +    hdr.ar.num = disp_w;
- +    hdr.ar.den = disp_h;
- +  } else if (ar_code != 0 && ar_code < 14) {
- +    hdr.ar = ff_vc1_pixel_aspect[ar_code];
- +  } else {
- +    // No usable code: derive the ratio from coded and display dimensions
- +    m_dllAvUtil.av_reduce(&hdr.ar.num, &hdr.ar.den, hdr.height * disp_w, hdr.width * disp_h, 1 << 30);
- +  }
- +}
- \ No newline at end of file
- diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/VC1HeaderParser.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/VC1HeaderParser.h
- new file mode 100644
- index 0000000..ef999c7
- --- /dev/null
- +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/VC1HeaderParser.h
- @@ -0,0 +1,52 @@
- +/*
- + * Copyright (C) 2010-2012 Hendrik Leppkes
- + * http://www.1f0.de
- + *
- + * This program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License as published by
- + * the Free Software Foundation; either version 2 of the License, or
- + * (at your option) any later version.
- + *
- + * This program is distributed in the hope that it will be useful,
- + * but WITHOUT ANY WARRANTY; without even the implied warranty of
- + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- + * GNU General Public License for more details.
- + *
- + * You should have received a copy of the GNU General Public License along
- + * with this program; if not, write to the Free Software Foundation, Inc.,
- + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- + */
- +
- +#pragma once
- +
- +#include "DllAvUtil.h"
- +
- +struct GetBitContext;
- +
- +// Parses a VC-1 sequence header out of a byte buffer at construction time and
- +// exposes the result through the public `hdr` struct.
- +class CVC1HeaderParser
- +{
- +public:
- + // Zeroes hdr, then parses pData (length bytes) immediately.
- + CVC1HeaderParser(const BYTE *pData, size_t length);
- + ~CVC1HeaderParser(void);
- +
- +public:
- + struct {
- + int valid; // 1 once an advanced-profile sequence header was parsed
- +
- + int profile; // 2-bit profile value, stored even when valid stays 0
- + int level; // 3-bit level value
- +
- + int width; // (12-bit coded value + 1) * 2
- + int height; // (12-bit coded value + 1) * 2
- +
- + int broadcast; // single flag bit from the header
- + int interlaced; // single flag bit from the header
- +
- + AVRational ar; // only written when the header carries display info
- + } hdr;
- +
- +private:
- + void ParseVC1Header(const BYTE *pData, size_t length);
- + void VC1ParseSequenceHeader(GetBitContext *gb);
- + // avutil wrapper used for the scratch buffer and av_reduce
- + DllAvUtil m_dllAvUtil;
- +};
- diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/cuda.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/cuda.h
- new file mode 100644
- index 0000000..4c6505a
- --- /dev/null
- +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/cuda.h
- @@ -0,0 +1,8127 @@
- +/*
- + * Copyright 1993-2011 NVIDIA Corporation. All rights reserved.
- + *
- + * NOTICE TO LICENSEE:
- + *
- + * This source code and/or documentation ("Licensed Deliverables") are
- + * subject to NVIDIA intellectual property rights under U.S. and
- + * international Copyright laws.
- + *
- + * These Licensed Deliverables contained herein is PROPRIETARY and
- + * CONFIDENTIAL to NVIDIA and is being provided under the terms and
- + * conditions of a form of NVIDIA software license agreement by and
- + * between NVIDIA and Licensee ("License Agreement") or electronically
- + * accepted by Licensee. Notwithstanding any terms or conditions to
- + * the contrary in the License Agreement, reproduction or disclosure
- + * of the Licensed Deliverables to any third party without the express
- + * written consent of NVIDIA is prohibited.
- + *
- + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
- + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
- + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
- + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
- + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
- + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
- + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
- + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
- + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
- + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
- + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
- + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
- + * OF THESE LICENSED DELIVERABLES.
- + *
- + * U.S. Government End Users. These Licensed Deliverables are a
- + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
- + * 1995), consisting of "commercial computer software" and "commercial
- + * computer software documentation" as such terms are used in 48
- + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
- + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
- + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
- + * U.S. Government End Users acquire the Licensed Deliverables with
- + * only those rights set forth herein.
- + *
- + * Any use of the Licensed Deliverables in individual and commercial
- + * software must include, in the user documentation and internal
- + * comments to the code, the above Disclaimer and U.S. Government End
- + * Users Notice.
- + */
- +
- +#ifndef __cuda_cuda_h__
- +#define __cuda_cuda_h__
- +
- +#include <stdlib.h>
- +
- +/**
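- + * CUDA API versioning support: __CUDA_API_VERSION is 3010 when forced via CUDA_FORCE_API_VERSION (any other forced value is a compile error), otherwise 4020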
- + */
- +#if defined(CUDA_FORCE_API_VERSION)
- + #if (CUDA_FORCE_API_VERSION == 3010)
- + #define __CUDA_API_VERSION 3010
- + #else
- + #error "Unsupported value of CUDA_FORCE_API_VERSION"
- + #endif
- +#else
- + #define __CUDA_API_VERSION 4020
- +#endif /* CUDA_FORCE_API_VERSION */
- +
- +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 3020
- + #define cuDeviceTotalMem cuDeviceTotalMem_v2
- + #define cuCtxCreate cuCtxCreate_v2
- + #define cuModuleGetGlobal cuModuleGetGlobal_v2
- + #define cuMemGetInfo cuMemGetInfo_v2
- + #define cuMemAlloc cuMemAlloc_v2
- + #define cuMemAllocPitch cuMemAllocPitch_v2
- + #define cuMemFree cuMemFree_v2
- + #define cuMemGetAddressRange cuMemGetAddressRange_v2
- + #define cuMemAllocHost cuMemAllocHost_v2
- + #define cuMemHostGetDevicePointer cuMemHostGetDevicePointer_v2
- + #define cuMemcpyHtoD cuMemcpyHtoD_v2
- + #define cuMemcpyDtoH cuMemcpyDtoH_v2
- + #define cuMemcpyDtoD cuMemcpyDtoD_v2
- + #define cuMemcpyDtoA cuMemcpyDtoA_v2
- + #define cuMemcpyAtoD cuMemcpyAtoD_v2
- + #define cuMemcpyHtoA cuMemcpyHtoA_v2
- + #define cuMemcpyAtoH cuMemcpyAtoH_v2
- + #define cuMemcpyAtoA cuMemcpyAtoA_v2
- + #define cuMemcpyHtoAAsync cuMemcpyHtoAAsync_v2
- + #define cuMemcpyAtoHAsync cuMemcpyAtoHAsync_v2
- + #define cuMemcpy2D cuMemcpy2D_v2
- + #define cuMemcpy2DUnaligned cuMemcpy2DUnaligned_v2
- + #define cuMemcpy3D cuMemcpy3D_v2
- + #define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2
- + #define cuMemcpyDtoHAsync cuMemcpyDtoHAsync_v2
- + #define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
- + #define cuMemcpy2DAsync cuMemcpy2DAsync_v2
- + #define cuMemcpy3DAsync cuMemcpy3DAsync_v2
- + #define cuMemsetD8 cuMemsetD8_v2
- + #define cuMemsetD16 cuMemsetD16_v2
- + #define cuMemsetD32 cuMemsetD32_v2
- + #define cuMemsetD2D8 cuMemsetD2D8_v2
- + #define cuMemsetD2D16 cuMemsetD2D16_v2
- + #define cuMemsetD2D32 cuMemsetD2D32_v2
- + #define cuArrayCreate cuArrayCreate_v2
- + #define cuArrayGetDescriptor cuArrayGetDescriptor_v2
- + #define cuArray3DCreate cuArray3DCreate_v2
- + #define cuArray3DGetDescriptor cuArray3DGetDescriptor_v2
- + #define cuTexRefSetAddress cuTexRefSetAddress_v2
- + #define cuTexRefGetAddress cuTexRefGetAddress_v2
- + #define cuGraphicsResourceGetMappedPointer cuGraphicsResourceGetMappedPointer_v2
- +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 3020 */
- +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 4000
- + #define cuCtxDestroy cuCtxDestroy_v2
- + #define cuCtxPopCurrent cuCtxPopCurrent_v2
- + #define cuCtxPushCurrent cuCtxPushCurrent_v2
- + #define cuStreamDestroy cuStreamDestroy_v2
- + #define cuEventDestroy cuEventDestroy_v2
- +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 4000 */
- +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 4010
- + #define cuTexRefSetAddress2D cuTexRefSetAddress2D_v3
- +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 4010 */
- +
- +#if !defined(__CUDA_API_VERSION_INTERNAL)
- +#if defined(__CUDA_API_VERSION) && __CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010
- + #define cuTexRefSetAddress2D cuTexRefSetAddress2D_v2
- +#endif /* __CUDA_API_VERSION && __CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010 */
- +#endif /* __CUDA_API_VERSION_INTERNAL */
- +
- +/**
- + * \defgroup CUDA_DRIVER CUDA Driver API
- + *
- + * This section describes the low-level CUDA driver application programming
- + * interface.
- + *
- + * @{
- + */
- +
- +/**
- + * \defgroup CUDA_TYPES Data types used by CUDA driver
- + * @{
- + */
- +
- +/**
- + * CUDA API version number
- + */
- +#define CUDA_VERSION 4020
- +
- +#ifdef __cplusplus
- +extern "C" {
- +#endif
- +
- +/**
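- + * CUDA device pointer: unsigned long long on x86-64 builds, unsigned int otherwise (only declared when __CUDA_API_VERSION >= 3020)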
- + */
- +#if __CUDA_API_VERSION >= 3020
- +
- +#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
- +typedef unsigned long long CUdeviceptr;
- +#else
- +typedef unsigned int CUdeviceptr;
- +#endif
- +
- +#endif /* __CUDA_API_VERSION >= 3020 */
- +
- +typedef int CUdevice; /**< CUDA device */
- +typedef struct CUctx_st *CUcontext; /**< CUDA context */
- +typedef struct CUmod_st *CUmodule; /**< CUDA module */
- +typedef struct CUfunc_st *CUfunction; /**< CUDA function */
- +typedef struct CUarray_st *CUarray; /**< CUDA array */
- +typedef struct CUtexref_st *CUtexref; /**< CUDA texture reference */
- +typedef struct CUsurfref_st *CUsurfref; /**< CUDA surface reference */
- +typedef struct CUevent_st *CUevent; /**< CUDA event */
- +typedef struct CUstream_st *CUstream; /**< CUDA stream */
- +typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */
- +
- +typedef struct CUuuid_st { /**< CUDA definition of UUID */
- + char bytes[16];
- +} CUuuid;
- +
- +
- +#if __CUDA_API_VERSION >= 4010
- +
- +/**
- + * Interprocess Handles: each handle struct below wraps a single reserved char array of CU_IPC_HANDLE_SIZE bytes
- + */
- +#define CU_IPC_HANDLE_SIZE 64
- +
- +typedef struct CUipcEventHandle_st {
- + char reserved[CU_IPC_HANDLE_SIZE];
- +} CUipcEventHandle;
- +
- +typedef struct CUipcMemHandle_st {
- + char reserved[CU_IPC_HANDLE_SIZE];
- +} CUipcMemHandle;
- +
- +typedef enum CUipcMem_flags_enum {
- + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */
- +} CUipcMem_flags;
- +
- +#endif /* __CUDA_API_VERSION >= 4010 */
- +
- +/**
- + * Context creation flags
- + */
- +typedef enum CUctx_flags_enum {
- + CU_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */
- + CU_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */
- + CU_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */
- + CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
- + CU_CTX_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling
- + * \deprecated This flag was deprecated as of CUDA 4.0
- + * and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. */
- + CU_CTX_SCHED_MASK = 0x07, /**< Mask of the scheduling bits above (0x01 | 0x02 | 0x04) */
- + CU_CTX_MAP_HOST = 0x08, /**< Support mapped pinned allocations */
- + CU_CTX_LMEM_RESIZE_TO_MAX = 0x10, /**< Keep local memory allocation after launch */
- + CU_CTX_FLAGS_MASK = 0x1f /**< Mask of every flag bit defined in this enum (0x01 through 0x10) */
- +} CUctx_flags;
- +
- +/**
- + * Event creation flags
- + */
- +typedef enum CUevent_flags_enum {
- + CU_EVENT_DEFAULT = 0x0, /**< Default event flag */
- + CU_EVENT_BLOCKING_SYNC = 0x1, /**< Event uses blocking synchronization */
- + CU_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */
- + CU_EVENT_INTERPROCESS = 0x4 /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */
- +} CUevent_flags;
- +
- +/**
- + * Array formats
- + */
- +typedef enum CUarray_format_enum {
- + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */
- + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
- + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
- + CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */
- + CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */
- + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */
- + CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */
- + CU_AD_FORMAT_FLOAT = 0x20 /**< 32-bit floating point */
- +} CUarray_format;
- +
- +/**
- + * Texture reference addressing modes
- + */
- +typedef enum CUaddress_mode_enum {
- + CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */
- + CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address mode */
- + CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */
- + CU_TR_ADDRESS_MODE_BORDER = 3 /**< Border address mode */
- +} CUaddress_mode;
- +
- +/**
- + * Texture reference filtering modes
- + */
- +typedef enum CUfilter_mode_enum {
- + CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */
- + CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */
- +} CUfilter_mode;
- +
- +/**
- + * Device properties
- + */
- +typedef enum CUdevice_attribute_enum {
- + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */
- + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */
- + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */
- + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */
- + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */
- + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */
- + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */
- + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */
- + CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
- + CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
- + CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */
- + CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */
- + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */
- + CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
- + CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Peak clock frequency in kilohertz */
- + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */
- + CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */
- + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */
- + CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */
- + CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */
- + CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */
- + CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, /**< Maximum 2D layered texture width */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, /**< Maximum 2D layered texture height */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, /**< Maximum layers in a 2D layered texture */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */
- + CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */
- + CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */
- + CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */
- + CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */
- + CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */
- + CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, /**< Device is using TCC driver model */
- + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */
- + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */
- + CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */
- + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */
- + CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */
- + CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, /**< Maximum layers in a 1D layered texture */
- + CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, /**< Deprecated, do not use. */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, /**< Alternate maximum 3D texture width */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,/**< Alternate maximum 3D texture height */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, /**< Alternate maximum 3D texture depth */
- + CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, /**< PCI domain ID of the device */
- + CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, /**< Pitch alignment requirement for textures */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, /**< Maximum cubemap texture width/height */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, /**< Maximum cubemap layered texture width/height */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, /**< Maximum layers in a cubemap layered texture */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, /**< Maximum 1D surface width */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, /**< Maximum 2D surface width */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, /**< Maximum 2D surface height */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, /**< Maximum 3D surface width */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, /**< Maximum 3D surface height */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, /**< Maximum 3D surface depth */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, /**< Maximum 1D layered surface width */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, /**< Maximum layers in a 1D layered surface */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, /**< Maximum 2D layered surface width */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, /**< Maximum 2D layered surface height */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, /**< Maximum layers in a 2D layered surface */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, /**< Maximum cubemap surface width */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, /**< Maximum cubemap layered surface width */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, /**< Maximum layers in a cubemap layered surface */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, /**< Maximum 1D linear texture width */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, /**< Maximum 2D linear texture width */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, /**< Maximum 2D linear texture height */
- + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72 /**< Maximum 2D linear texture pitch in bytes */
- +} CUdevice_attribute;
- +
- +/**
- + * Legacy device properties
- + */
- +typedef struct CUdevprop_st {
- + int maxThreadsPerBlock; /**< Maximum number of threads per block */
- + int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */
- + int maxGridSize[3]; /**< Maximum size of each dimension of a grid */
- + int sharedMemPerBlock; /**< Shared memory available per block in bytes */
- + int totalConstantMemory; /**< Constant memory available on device in bytes */
- + int SIMDWidth; /**< Warp size in threads */
- + int memPitch; /**< Maximum pitch in bytes allowed by memory copies */
- + int regsPerBlock; /**< 32-bit registers available per block */
- + int clockRate; /**< Clock frequency in kilohertz */
- + int textureAlign; /**< Alignment requirement for textures */
- +} CUdevprop;
- +
- +/**
- + * Pointer information
- + */
- +typedef enum CUpointer_attribute_enum {
- + CU_POINTER_ATTRIBUTE_CONTEXT = 1, /**< The ::CUcontext on which a pointer was allocated or registered */
- + CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2, /**< The ::CUmemorytype describing the physical location of a pointer */
- + CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3, /**< The address at which a pointer's memory may be accessed on the device */
- + CU_POINTER_ATTRIBUTE_HOST_POINTER = 4 /**< The address at which a pointer's memory may be accessed on the host */
- +} CUpointer_attribute;
- +
- +/**
- + * Function properties
- + */
- +typedef enum CUfunction_attribute_enum {
- + /**
- + * The maximum number of threads per block, beyond which a launch of the
- + * function would fail. This number depends on both the function and the
- + * device on which the function is currently loaded.
- + */
- + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
- +
- + /**
- + * The size in bytes of statically-allocated shared memory required by
- + * this function. This does not include dynamically-allocated shared
- + * memory requested by the user at runtime.
- + */
- + CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
- +
- + /**
- + * The size in bytes of user-allocated constant memory required by this
- + * function.
- + */
- + CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,
- +
- + /**
- + * The size in bytes of local memory used by each thread of this function.
- + */
- + CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,
- +
- + /**
- + * The number of registers used by each thread of this function.
- + */
- + CU_FUNC_ATTRIBUTE_NUM_REGS = 4,
- +
- + /**
- + * The PTX virtual architecture version for which the function was
- + * compiled. This value is the major PTX version * 10 + the minor PTX
- + * version, so a PTX version 1.3 function would return the value 13.
- + * Note that this may return the undefined value of 0 for cubins
- + * compiled prior to CUDA 3.0.
- + */
- + CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,
- +
- + /**
- + * The binary architecture version for which the function was compiled.
- + * This value is the major binary version * 10 + the minor binary version,
- + * so a binary version 1.3 function would return the value 13. Note that
- + * this will return a value of 10 for legacy cubins that do not have a
- + * properly-encoded binary architecture version.
- + */
- + CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,
- +
- + CU_FUNC_ATTRIBUTE_MAX /**< Implicitly 7, one past ::CU_FUNC_ATTRIBUTE_BINARY_VERSION. NOTE(review): looks like a count/sentinel rather than a queryable attribute — confirm */
- +} CUfunction_attribute;
- +
- +/**
- + * Function cache configurations
- + */
- +typedef enum CUfunc_cache_enum {
- + CU_FUNC_CACHE_PREFER_NONE = 0x00, /**< no preference for shared memory or L1 (default) */
- + CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */
- + CU_FUNC_CACHE_PREFER_L1 = 0x02, /**< prefer larger L1 cache and smaller shared memory */
- + CU_FUNC_CACHE_PREFER_EQUAL = 0x03 /**< prefer equal sized L1 cache and shared memory */
- +} CUfunc_cache;
- +
- +/**
- + * Shared memory configurations
- + */
- +typedef enum CUsharedconfig_enum {
- + CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0x00, /**< set default shared memory bank size */
- + CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 0x01, /**< set shared memory bank width to four bytes */
- + CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02 /**< set shared memory bank width to eight bytes */
- +} CUsharedconfig;
- +
- +/**
- + * Memory types
- + */
- +typedef enum CUmemorytype_enum {
- + CU_MEMORYTYPE_HOST = 0x01, /**< Host memory */
- + CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */
- + CU_MEMORYTYPE_ARRAY = 0x03, /**< Array memory */
- + CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */
- +} CUmemorytype;
- +
- +/**
- + * Compute Modes
- + */
- +typedef enum CUcomputemode_enum {
- + CU_COMPUTEMODE_DEFAULT = 0, /**< Default compute mode (Multiple contexts allowed per device) */
- + CU_COMPUTEMODE_EXCLUSIVE = 1, /**< Compute-exclusive-thread mode (Only one context used by a single thread can be present on this device at a time) */
- + CU_COMPUTEMODE_PROHIBITED = 2, /**< Compute-prohibited mode (No contexts can be created on this device at this time) */
- + CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */
- +} CUcomputemode;
- +
- +/**
- + * Online compiler options
- + */
- +typedef enum CUjit_option_enum
- +{
- + /**
- + * Max number of registers that a thread may use.\n
- + * Option type: unsigned int
- + */
- + CU_JIT_MAX_REGISTERS = 0,
- +
- + /**
- + * IN: Specifies minimum number of threads per block to target compilation
- + * for\n
- + * OUT: Returns the number of threads the compiler actually targeted.
- + * This restricts the resource utilization of the compiler (e.g. max
- + * registers) such that a block with the given number of threads should be
- + * able to launch based on register limitations. Note, this option does not
- + * currently take into account any other resource limitations, such as
- + * shared memory utilization.\n
- + * Option type: unsigned int
- + */
- + CU_JIT_THREADS_PER_BLOCK,
- +
- + /**
- + * Returns a float value in the option of the wall clock time, in
- + * milliseconds, spent creating the cubin\n
- + * Option type: float
- + */
- + CU_JIT_WALL_TIME,
- +
- + /**
- + * Pointer to a buffer in which to print any log messages from PTXAS
- + * that are informational in nature (the buffer size is specified via
- + * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES) \n
- + * Option type: char*
- + */
- + CU_JIT_INFO_LOG_BUFFER,
- +
- + /**
- + * IN: Log buffer size in bytes. Log messages will be capped at this size
- + * (including null terminator)\n
- + * OUT: Amount of log buffer filled with messages\n
- + * Option type: unsigned int
- + */
- + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
- +
- + /**
- + * Pointer to a buffer in which to print any log messages from PTXAS that
- + * reflect errors (the buffer size is specified via option
- + * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n
- + * Option type: char*
- + */
- + CU_JIT_ERROR_LOG_BUFFER,
- +
- + /**
- + * IN: Log buffer size in bytes. Log messages will be capped at this size
- + * (including null terminator)\n
- + * OUT: Amount of log buffer filled with messages\n
- + * Option type: unsigned int
- + */
- + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
- +
- + /**
- + * Level of optimizations to apply to generated code (0 - 4), with 4
- + * being the default and highest level of optimizations.\n
- + * Option type: unsigned int
- + */
- + CU_JIT_OPTIMIZATION_LEVEL,
- +
- + /**
- + * No option value required. Determines the target based on the current
- + * attached context (default)\n
- + * Option type: No option value needed
- + */
- + CU_JIT_TARGET_FROM_CUCONTEXT,
- +
- + /**
- + * Target is chosen based on supplied ::CUjit_target_enum.\n
- + * Option type: unsigned int for enumerated type ::CUjit_target_enum
- + */
- + CU_JIT_TARGET,
- +
- + /**
- + * Specifies choice of fallback strategy if matching cubin is not found.
- + * Choice is based on supplied ::CUjit_fallback_enum.\n
- + * Option type: unsigned int for enumerated type ::CUjit_fallback_enum
- + */
- + CU_JIT_FALLBACK_STRATEGY
- +
- +} CUjit_option;
- +
- +/**
- + * Online compilation targets
- + */
- +typedef enum CUjit_target_enum
- +{
- + CU_TARGET_COMPUTE_10 = 0, /**< Compute device class 1.0 */
- + CU_TARGET_COMPUTE_11, /**< Compute device class 1.1 */
- + CU_TARGET_COMPUTE_12, /**< Compute device class 1.2 */
- + CU_TARGET_COMPUTE_13, /**< Compute device class 1.3 */
- + CU_TARGET_COMPUTE_20, /**< Compute device class 2.0 */
- + CU_TARGET_COMPUTE_21, /**< Compute device class 2.1 */
- + CU_TARGET_COMPUTE_30 /**< Compute device class 3.0 */
- +} CUjit_target;
- +
- +/**
- + * Cubin matching fallback strategies
- + */
- +typedef enum CUjit_fallback_enum
- +{
- + CU_PREFER_PTX = 0, /**< Prefer to compile ptx */
- +
- + CU_PREFER_BINARY /**< Prefer to fall back to compatible binary code */
- +
- +} CUjit_fallback;
- +
- +/**
- + * Flags to register a graphics resource
- + */
- +typedef enum CUgraphicsRegisterFlags_enum {
- + CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00,
- + CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01,
- + CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02,
- + CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 0x04,
- + CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08
- +} CUgraphicsRegisterFlags;
- +
- +/**
- + * Flags for mapping and unmapping interop resources
- + */
- +typedef enum CUgraphicsMapResourceFlags_enum {
- + CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00,
- + CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01,
- + CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02
- +} CUgraphicsMapResourceFlags;
- +
- +/**
- + * Array indices for cube faces
- + */
- +typedef enum CUarray_cubemap_face_enum {
- + CU_CUBEMAP_FACE_POSITIVE_X = 0x00, /**< Positive X face of cubemap */
- + CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, /**< Negative X face of cubemap */
- + CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, /**< Positive Y face of cubemap */
- + CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, /**< Negative Y face of cubemap */
- + CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, /**< Positive Z face of cubemap */
- + CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 /**< Negative Z face of cubemap */
- +} CUarray_cubemap_face;
- +
- +/**
- + * Limits
- + */
- +typedef enum CUlimit_enum {
- + CU_LIMIT_STACK_SIZE = 0x00, /**< GPU thread stack size */
- + CU_LIMIT_PRINTF_FIFO_SIZE = 0x01, /**< GPU printf FIFO size */
- + CU_LIMIT_MALLOC_HEAP_SIZE = 0x02 /**< GPU malloc heap size */
- +} CUlimit;
- +
- +/**
- + * Error codes
- + */
- +typedef enum cudaError_enum {
- + /**
- + * The API call returned with no errors. In the case of query calls, this
- + * can also mean that the operation being queried is complete (see
- + * ::cuEventQuery() and ::cuStreamQuery()).
- + */
- + CUDA_SUCCESS = 0,
- +
- + /**
- + * This indicates that one or more of the parameters passed to the API call
- + * is not within an acceptable range of values.
- + */
- + CUDA_ERROR_INVALID_VALUE = 1,
- +
- + /**
- + * The API call failed because it was unable to allocate enough memory to
- + * perform the requested operation.
- + */
- + CUDA_ERROR_OUT_OF_MEMORY = 2,
- +
- + /**
- + * This indicates that the CUDA driver has not been initialized with
- + * ::cuInit() or that initialization has failed.
- + */
- + CUDA_ERROR_NOT_INITIALIZED = 3,
- +
- + /**
- + * This indicates that the CUDA driver is in the process of shutting down.
- + */
- + CUDA_ERROR_DEINITIALIZED = 4,
- +
- + /**
- + * This indicates profiling APIs are called while application is running
- + * in visual profiler mode.
- + */
- + CUDA_ERROR_PROFILER_DISABLED = 5,
- + /**
- + * This indicates profiling has not been initialized for this context.
- + * Call cuProfilerInitialize() to resolve this.
- + */
- + CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6,
- + /**
- + * This indicates profiler has already been started and probably
- + * cuProfilerStart() is incorrectly called.
- + */
- + CUDA_ERROR_PROFILER_ALREADY_STARTED = 7,
- + /**
- + * This indicates profiler has already been stopped and probably
- + * cuProfilerStop() is incorrectly called.
- + */
- + CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8,
- + /**
- + * This indicates that no CUDA-capable devices were detected by the installed
- + * CUDA driver.
- + */
- + CUDA_ERROR_NO_DEVICE = 100,
- +
- + /**
- + * This indicates that the device ordinal supplied by the user does not
- + * correspond to a valid CUDA device.
- + */
- + CUDA_ERROR_INVALID_DEVICE = 101,
- +
- +
- + /**
- + * This indicates that the device kernel image is invalid. This can also
- + * indicate an invalid CUDA module.
- + */
- + CUDA_ERROR_INVALID_IMAGE = 200,
- +
- + /**
- + * This most frequently indicates that there is no context bound to the
- + * current thread. This can also be returned if the context passed to an
- + * API call is not a valid handle (such as a context that has had
- + * ::cuCtxDestroy() invoked on it). This can also be returned if a user
- + * mixes different API versions (i.e. 3010 context with 3020 API calls).
- + * See ::cuCtxGetApiVersion() for more details.
- + */
- + CUDA_ERROR_INVALID_CONTEXT = 201,
- +
- + /**
- + * This indicated that the context being supplied as a parameter to the
- + * API call was already the active context.
- + * \deprecated
- + * This error return is deprecated as of CUDA 3.2. It is no longer an
- + * error to attempt to push the active context via ::cuCtxPushCurrent().
- + */
- + CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202,
- +
- + /**
- + * This indicates that a map or register operation has failed.
- + */
- + CUDA_ERROR_MAP_FAILED = 205,
- +
- + /**
- + * This indicates that an unmap or unregister operation has failed.
- + */
- + CUDA_ERROR_UNMAP_FAILED = 206,
- +
- + /**
- + * This indicates that the specified array is currently mapped and thus
- + * cannot be destroyed.
- + */
- + CUDA_ERROR_ARRAY_IS_MAPPED = 207,
- +
- + /**
- + * This indicates that the resource is already mapped.
- + */
- + CUDA_ERROR_ALREADY_MAPPED = 208,
- +
- + /**
- + * This indicates that there is no kernel image available that is suitable
- + * for the device. This can occur when a user specifies code generation
- + * options for a particular CUDA source file that do not include the
- + * corresponding device configuration.
- + */
- + CUDA_ERROR_NO_BINARY_FOR_GPU = 209,
- +
- + /**
- + * This indicates that a resource has already been acquired.
- + */
- + CUDA_ERROR_ALREADY_ACQUIRED = 210,
- +
- + /**
- + * This indicates that a resource is not mapped.
- + */
- + CUDA_ERROR_NOT_MAPPED = 211,
- +
- + /**
- + * This indicates that a mapped resource is not available for access as an
- + * array.
- + */
- + CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212,
- +
- + /**
- + * This indicates that a mapped resource is not available for access as a
- + * pointer.
- + */
- + CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213,
- +
- + /**
- + * This indicates that an uncorrectable ECC error was detected during
- + * execution.
- + */
- + CUDA_ERROR_ECC_UNCORRECTABLE = 214,
- +
- + /**
- + * This indicates that the ::CUlimit passed to the API call is not
- + * supported by the active device.
- + */
- + CUDA_ERROR_UNSUPPORTED_LIMIT = 215,
- +
- + /**
- + * This indicates that the ::CUcontext passed to the API call can
- + * only be bound to a single CPU thread at a time but is already
- + * bound to a CPU thread.
- + */
- + CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216,
- +
- + /**
- + * This indicates that the device kernel source is invalid.
- + */
- + CUDA_ERROR_INVALID_SOURCE = 300,
- +
- + /**
- + * This indicates that the file specified was not found.
- + */
- + CUDA_ERROR_FILE_NOT_FOUND = 301,
- +
- + /**
- + * This indicates that a link to a shared object failed to resolve.
- + */
- + CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,
- +
- + /**
- + * This indicates that initialization of a shared object failed.
- + */
- + CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303,
- +
- + /**
- + * This indicates that an OS call failed.
- + */
- + CUDA_ERROR_OPERATING_SYSTEM = 304,
- +
- +
- + /**
- + * This indicates that a resource handle passed to the API call was not
- + * valid. Resource handles are opaque types like ::CUstream and ::CUevent.
- + */
- + CUDA_ERROR_INVALID_HANDLE = 400,
- +
- +
- + /**
- + * This indicates that a named symbol was not found. Examples of symbols
- + * are global/constant variable names, texture names, and surface names.
- + */
- + CUDA_ERROR_NOT_FOUND = 500,
- +
- +
- + /**
- + * This indicates that asynchronous operations issued previously have not
- + * completed yet. This result is not actually an error, but must be indicated
- + * differently than ::CUDA_SUCCESS (which indicates completion). Calls that
- + * may return this value include ::cuEventQuery() and ::cuStreamQuery().
- + */
- + CUDA_ERROR_NOT_READY = 600,
- +
- +
- + /**
- + * An exception occurred on the device while executing a kernel. Common
- + * causes include dereferencing an invalid device pointer and accessing
- + * out of bounds shared memory. The context cannot be used, so it must
- + * be destroyed (and a new one should be created). All existing device
- + * memory allocations from this context are invalid and must be
- + * reconstructed if the program is to continue using CUDA.
- + */
- + CUDA_ERROR_LAUNCH_FAILED = 700,
- +
- + /**
- + * This indicates that a launch did not occur because it did not have
- + * appropriate resources. This error usually indicates that the user has
- + * attempted to pass too many arguments to the device kernel, or the
- + * kernel launch specifies too many threads for the kernel's register
- + * count. Passing arguments of the wrong size (i.e. a 64-bit pointer
- + * when a 32-bit int is expected) is equivalent to passing too many
- + * arguments and can also result in this error.
- + */
- + CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701,
- +
- + /**
- + * This indicates that the device kernel took too long to execute. This can
- + * only occur if timeouts are enabled - see the device attribute
- + * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The
- + * context cannot be used (and must be destroyed similar to
- + * ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from
- + * this context are invalid and must be reconstructed if the program is to
- + * continue using CUDA.
- + */
- + CUDA_ERROR_LAUNCH_TIMEOUT = 702,
- +
- + /**
- + * This error indicates a kernel launch that uses an incompatible texturing
- + * mode.
- + */
- + CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703,
- +
- + /**
- + * This error indicates that a call to ::cuCtxEnablePeerAccess() is
- + * trying to re-enable peer access to a context which has already
- + * had peer access to it enabled.
- + */
- + CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704,
- +
- + /**
- + * This error indicates that ::cuCtxDisablePeerAccess() is
- + * trying to disable peer access which has not been enabled yet
- + * via ::cuCtxEnablePeerAccess().
- + */
- + CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705,
- +
- + /**
- + * This error indicates that the primary context for the specified device
- + * has already been initialized.
- + */
- + CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708,
- +
- + /**
- + * This error indicates that the context current to the calling thread
- + * has been destroyed using ::cuCtxDestroy, or is a primary context which
- + * has not yet been initialized.
- + */
- + CUDA_ERROR_CONTEXT_IS_DESTROYED = 709,
- +
- + /**
- + * A device-side assert triggered during kernel execution. The context
- + * cannot be used anymore, and must be destroyed. All existing device
- + * memory allocations from this context are invalid and must be
- + * reconstructed if the program is to continue using CUDA.
- + */
- + CUDA_ERROR_ASSERT = 710,
- +
- + /**
- + * This error indicates that the hardware resources required to enable
- + * peer access have been exhausted for one or more of the devices
- + * passed to ::cuCtxEnablePeerAccess().
- + */
- + CUDA_ERROR_TOO_MANY_PEERS = 711,
- +
- + /**
- + * This error indicates that the memory range passed to ::cuMemHostRegister()
- + * has already been registered.
- + */
- + CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712,
- +
- + /**
- + * This error indicates that the pointer passed to ::cuMemHostUnregister()
- + * does not correspond to any currently registered memory region.
- + */
- + CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713,
- +
- + /**
- + * This indicates that an unknown internal error has occurred.
- + */
- + CUDA_ERROR_UNKNOWN = 999
- +} CUresult;
- +
- +/**
- + * If set, host memory is portable between CUDA contexts.
- + * Flag for ::cuMemHostAlloc()
- + */
- +#define CU_MEMHOSTALLOC_PORTABLE 0x01
- +
- +/**
- + * If set, host memory is mapped into CUDA address space and
- + * ::cuMemHostGetDevicePointer() may be called on the host pointer.
- + * Flag for ::cuMemHostAlloc()
- + */
- +#define CU_MEMHOSTALLOC_DEVICEMAP 0x02
- +
- +/**
- + * If set, host memory is allocated as write-combined - fast to write,
- + * faster to DMA, slow to read except via SSE4 streaming load instruction
- + * (MOVNTDQA).
- + * Flag for ::cuMemHostAlloc()
- + */
- +#define CU_MEMHOSTALLOC_WRITECOMBINED 0x04
- +
- +/**
- + * If set, host memory is portable between CUDA contexts.
- + * Flag for ::cuMemHostRegister()
- + */
- +#define CU_MEMHOSTREGISTER_PORTABLE 0x01
- +
- +/**
- + * If set, host memory is mapped into CUDA address space and
- + * ::cuMemHostGetDevicePointer() may be called on the host pointer.
- + * Flag for ::cuMemHostRegister()
- + */
- +#define CU_MEMHOSTREGISTER_DEVICEMAP 0x02
- +
- +#if __CUDA_API_VERSION >= 3020
- +
- +/**
- + * 2D memory copy parameters
- + */
- +typedef struct CUDA_MEMCPY2D_st {
- + size_t srcXInBytes; /**< Source X in bytes */
- + size_t srcY; /**< Source Y */
- +
- + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
- + const void *srcHost; /**< Source host pointer */
- + CUdeviceptr srcDevice; /**< Source device pointer */
- + CUarray srcArray; /**< Source array reference */
- + size_t srcPitch; /**< Source pitch (ignored when src is array) */
- +
- + size_t dstXInBytes; /**< Destination X in bytes */
- + size_t dstY; /**< Destination Y */
- +
- + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
- + void *dstHost; /**< Destination host pointer */
- + CUdeviceptr dstDevice; /**< Destination device pointer */
- + CUarray dstArray; /**< Destination array reference */
- + size_t dstPitch; /**< Destination pitch (ignored when dst is array) */
- +
- + size_t WidthInBytes; /**< Width of 2D memory copy in bytes */
- + size_t Height; /**< Height of 2D memory copy */
- +} CUDA_MEMCPY2D;
- +
- +/**
- + * 3D memory copy parameters
- + */
- +typedef struct CUDA_MEMCPY3D_st {
- + size_t srcXInBytes; /**< Source X in bytes */
- + size_t srcY; /**< Source Y */
- + size_t srcZ; /**< Source Z */
- + size_t srcLOD; /**< Source LOD */
- + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
- + const void *srcHost; /**< Source host pointer */
- + CUdeviceptr srcDevice; /**< Source device pointer */
- + CUarray srcArray; /**< Source array reference */
- + void *reserved0; /**< Must be NULL */
- + size_t srcPitch; /**< Source pitch (ignored when src is array) */
- + size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */
- +
- + size_t dstXInBytes; /**< Destination X in bytes */
- + size_t dstY; /**< Destination Y */
- + size_t dstZ; /**< Destination Z */
- + size_t dstLOD; /**< Destination LOD */
- + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
- + void *dstHost; /**< Destination host pointer */
- + CUdeviceptr dstDevice; /**< Destination device pointer */
- + CUarray dstArray; /**< Destination array reference */
- + void *reserved1; /**< Must be NULL */
- + size_t dstPitch; /**< Destination pitch (ignored when dst is array) */
- + size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
- +
- + size_t WidthInBytes; /**< Width of 3D memory copy in bytes */
- + size_t Height; /**< Height of 3D memory copy */
- + size_t Depth; /**< Depth of 3D memory copy */
- +} CUDA_MEMCPY3D;
- +
- +/**
- + * 3D memory cross-context copy parameters
- + */
- +typedef struct CUDA_MEMCPY3D_PEER_st {
- + size_t srcXInBytes; /**< Source X in bytes */
- + size_t srcY; /**< Source Y */
- + size_t srcZ; /**< Source Z */
- + size_t srcLOD; /**< Source LOD */
- + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
- + const void *srcHost; /**< Source host pointer */
- + CUdeviceptr srcDevice; /**< Source device pointer */
- + CUarray srcArray; /**< Source array reference */
- + CUcontext srcContext; /**< Source context (ignored when srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */
- + size_t srcPitch; /**< Source pitch (ignored when src is array) */
- + size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */
- +
- + size_t dstXInBytes; /**< Destination X in bytes */
- + size_t dstY; /**< Destination Y */
- + size_t dstZ; /**< Destination Z */
- + size_t dstLOD; /**< Destination LOD */
- + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
- + void *dstHost; /**< Destination host pointer */
- + CUdeviceptr dstDevice; /**< Destination device pointer */
- + CUarray dstArray; /**< Destination array reference */
- + CUcontext dstContext; /**< Destination context (ignored when dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */
- + size_t dstPitch; /**< Destination pitch (ignored when dst is array) */
- + size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
- +
- + size_t WidthInBytes; /**< Width of 3D memory copy in bytes */
- + size_t Height; /**< Height of 3D memory copy */
- + size_t Depth; /**< Depth of 3D memory copy */
- +} CUDA_MEMCPY3D_PEER;
- +
- +/**
- + * Array descriptor
- + */
- +typedef struct CUDA_ARRAY_DESCRIPTOR_st
- +{
- + size_t Width; /**< Width of array */
- + size_t Height; /**< Height of array */
- +
- + CUarray_format Format; /**< Array format */
- + unsigned int NumChannels; /**< Channels per array element */
- +} CUDA_ARRAY_DESCRIPTOR;
- +
- +/**
- + * 3D array descriptor
- + */
- +typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
- +{
- + size_t Width; /**< Width of 3D array */
- + size_t Height; /**< Height of 3D array */
- + size_t Depth; /**< Depth of 3D array (number of layers when ::CUDA_ARRAY3D_LAYERED is set) */
- +
- + CUarray_format Format; /**< Array format */
- + unsigned int NumChannels; /**< Channels per array element */
- + unsigned int Flags; /**< Flags */
- +} CUDA_ARRAY3D_DESCRIPTOR;
- +
- +#endif /* __CUDA_API_VERSION >= 3020 */
- +
- +/**
- + * If set, the CUDA array is a collection of layers, where each layer is either a 1D
- + * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number
- + * of layers, not the depth of a 3D array.
- + */
- +#define CUDA_ARRAY3D_LAYERED 0x01
- +
- +/**
- + * Deprecated, use CUDA_ARRAY3D_LAYERED
- + */
- +#define CUDA_ARRAY3D_2DARRAY 0x01
- +
- +/**
- + * This flag must be set in order to bind a surface reference
- + * to the CUDA array
- + */
- +#define CUDA_ARRAY3D_SURFACE_LDST 0x02
- +
- +/**
- + * If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The
- + * width of such a CUDA array must be equal to its height, and Depth must be six.
- + * If ::CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps
- + * and Depth must be a multiple of six.
- + */
- +#define CUDA_ARRAY3D_CUBEMAP 0x04
- +
- +/**
- + * This flag must be set in order to perform texture gather operations
- + * on a CUDA array.
- + */
- +#define CUDA_ARRAY3D_TEXTURE_GATHER 0x08
- +
- +/**
- + * Override the texref format with a format inferred from the array.
- + * Flag for ::cuTexRefSetArray()
- + */
- +#define CU_TRSA_OVERRIDE_FORMAT 0x01
- +
- +/**
- + * Read the texture as integers rather than promoting the values to floats
- + * in the range [0,1].
- + * Flag for ::cuTexRefSetFlags()
- + */
- +#define CU_TRSF_READ_AS_INTEGER 0x01
- +
- +/**
- + * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
- + * Flag for ::cuTexRefSetFlags()
- + */
- +#define CU_TRSF_NORMALIZED_COORDINATES 0x02
- +
- +/**
- + * Perform sRGB->linear conversion during texture read.
- + * Flag for ::cuTexRefSetFlags()
- + */
- +#define CU_TRSF_SRGB 0x10
- +
- +/**
- + * End of array terminator for the \p extra parameter to
- + * ::cuLaunchKernel
- + */
- +#define CU_LAUNCH_PARAM_END ((void*)0x00)
- +
- +/**
- + * Indicator that the next value in the \p extra parameter to
- + * ::cuLaunchKernel will be a pointer to a buffer containing all kernel
- + * parameters used for launching kernel \p f. This buffer needs to
- + * honor all alignment/padding requirements of the individual parameters.
- + * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the
- + * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no
- + * effect.
- + */
- +#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)
- +
- +/**
- + * Indicator that the next value in the \p extra parameter to
- + * ::cuLaunchKernel will be a pointer to a size_t which contains the
- + * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER.
- + * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified
- + * in the \p extra array if the value associated with
- + * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero.
- + */
- +#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02)
- +
- +/**
- + * For texture references loaded into the module, use default texunit from
- + * texture reference.
- + */
- +#define CU_PARAM_TR_DEFAULT -1
- +
- +/** @} */ /* END CUDA_TYPES */
- +
- +#ifdef _WIN32
- +#define CUDAAPI __stdcall
- +#else
- +#define CUDAAPI
- +#endif
- +
- +/**
- + * \defgroup CUDA_INITIALIZE Initialization
- + *
- + * This section describes the initialization functions of the low-level CUDA
- + * driver application programming interface.
- + *
- + * @{
- + */
- +
- +/**
- + * \brief Initialize the CUDA driver API
- + *
- + * Initializes the driver API and must be called before any other function from
- + * the driver API. Currently, the \p Flags parameter must be 0. If ::cuInit()
- + * has not been called, any function from the driver API will return
- + * ::CUDA_ERROR_NOT_INITIALIZED.
- + *
- + * \param Flags - Initialization flag for CUDA.
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_INVALID_DEVICE
- + * \notefnerr
- + */
- +CUresult CUDAAPI cuInit(unsigned int Flags);
- +
- +/** @} */ /* END CUDA_INITIALIZE */
- +
- +/**
- + * \defgroup CUDA_VERSION Version Management
- + *
- + * This section describes the version management functions of the low-level
- + * CUDA driver application programming interface.
- + *
- + * @{
- + */
- +
- +/**
- + * \brief Returns the CUDA driver version
- + *
- + * Returns in \p *driverVersion the version number of the installed CUDA
- + * driver. This function automatically returns ::CUDA_ERROR_INVALID_VALUE if
- + * the \p driverVersion argument is NULL.
- + *
- + * \param driverVersion - Returns the CUDA driver version
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + */
- +CUresult CUDAAPI cuDriverGetVersion(int *driverVersion);
- +
- +/** @} */ /* END CUDA_VERSION */
- +
- +/**
- + * \defgroup CUDA_DEVICE Device Management
- + *
- + * This section describes the device management functions of the low-level
- + * CUDA driver application programming interface.
- + *
- + * @{
- + */
- +
- +/**
- + * \brief Returns a handle to a compute device
- + *
- + * Returns in \p *device a device handle given an ordinal in the range <b>[0,
- + * ::cuDeviceGetCount()-1]</b>.
- + *
- + * \param device - Returned device handle
- + * \param ordinal - Device number to get handle for
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_INVALID_DEVICE
- + * \notefnerr
- + *
- + * \sa ::cuDeviceComputeCapability,
- + * ::cuDeviceGetAttribute,
- + * ::cuDeviceGetCount,
- + * ::cuDeviceGetName,
- + * ::cuDeviceGetProperties,
- + * ::cuDeviceTotalMem
- + */
- +CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal);
- +
- +/**
- + * \brief Returns the number of compute-capable devices
- + *
- + * Returns in \p *count the number of devices with compute capability greater
- + * than or equal to 1.0 that are available for execution. If there is no such
- + * device, ::cuDeviceGetCount() returns 0.
- + *
- + * \param count - Returned number of compute-capable devices
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuDeviceComputeCapability,
- + * ::cuDeviceGetAttribute,
- + * ::cuDeviceGetName,
- + * ::cuDeviceGet,
- + * ::cuDeviceGetProperties,
- + * ::cuDeviceTotalMem
- + */
- +CUresult CUDAAPI cuDeviceGetCount(int *count);
- +
- +/**
- + * \brief Returns an identifier string for the device
- + *
- + * Returns an ASCII string identifying the device \p dev in the NULL-terminated
- + * string pointed to by \p name. \p len specifies the maximum length of the
- + * string that may be returned.
- + *
- + * \param name - Returned identifier string for the device
- + * \param len - Maximum length of string to store in \p name
- + * \param dev - Device to get identifier string for
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_INVALID_DEVICE
- + * \notefnerr
- + *
- + * \sa ::cuDeviceComputeCapability,
- + * ::cuDeviceGetAttribute,
- + * ::cuDeviceGetCount,
- + * ::cuDeviceGet,
- + * ::cuDeviceGetProperties,
- + * ::cuDeviceTotalMem
- + */
- +CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);
- +
- +/**
- + * \brief Returns the compute capability of the device
- + *
- + * Returns in \p *major and \p *minor the major and minor revision numbers that
- + * define the compute capability of the device \p dev.
- + *
- + * \param major - Major revision number
- + * \param minor - Minor revision number
- + * \param dev - Device handle
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_INVALID_DEVICE
- + * \notefnerr
- + *
- + * \sa
- + * ::cuDeviceGetAttribute,
- + * ::cuDeviceGetCount,
- + * ::cuDeviceGetName,
- + * ::cuDeviceGet,
- + * ::cuDeviceGetProperties,
- + * ::cuDeviceTotalMem
- + */
- +CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
- +
- +#if __CUDA_API_VERSION >= 3020
- +/**
- + * \brief Returns the total amount of memory on the device
- + *
- + * Returns in \p *bytes the total amount of memory available on the device
- + * \p dev in bytes.
- + *
- + * \param bytes - Returned memory available on device in bytes
- + * \param dev - Device handle
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_INVALID_DEVICE
- + * \notefnerr
- + *
- + * \sa ::cuDeviceComputeCapability,
- + * ::cuDeviceGetAttribute,
- + * ::cuDeviceGetCount,
- + * ::cuDeviceGetName,
- + * ::cuDeviceGet,
- + * ::cuDeviceGetProperties,
- + */
- +CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev);
- +#endif /* __CUDA_API_VERSION >= 3020 */
- +
- +/**
- + * \brief Returns properties for a selected device
- + *
- + * Returns in \p *prop the properties of device \p dev. The ::CUdevprop
- + * structure is defined as:
- + *
- + * \code
- + typedef struct CUdevprop_st {
- + int maxThreadsPerBlock;
- + int maxThreadsDim[3];
- + int maxGridSize[3];
- + int sharedMemPerBlock;
- + int totalConstantMemory;
- + int SIMDWidth;
- + int memPitch;
- + int regsPerBlock;
- + int clockRate;
- + int textureAlign;
- + } CUdevprop;
- + * \endcode
- + * where:
- + *
- + * - ::maxThreadsPerBlock is the maximum number of threads per block;
- + * - ::maxThreadsDim[3] is the maximum sizes of each dimension of a block;
- + * - ::maxGridSize[3] is the maximum sizes of each dimension of a grid;
- + * - ::sharedMemPerBlock is the total amount of shared memory available per
- + * block in bytes;
- + * - ::totalConstantMemory is the total amount of constant memory available on
- + * the device in bytes;
- + * - ::SIMDWidth is the warp size;
- + * - ::memPitch is the maximum pitch allowed by the memory copy functions that
- + * involve memory regions allocated through ::cuMemAllocPitch();
- + * - ::regsPerBlock is the total number of registers available per block;
- + * - ::clockRate is the clock frequency in kilohertz;
- + * - ::textureAlign is the alignment requirement; texture base addresses that
- + * are aligned to ::textureAlign bytes do not need an offset applied to
- + * texture fetches.
- + *
- + * \param prop - Returned properties of device
- + * \param dev - Device to get properties for
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_INVALID_DEVICE
- + * \notefnerr
- + *
- + * \sa ::cuDeviceComputeCapability,
- + * ::cuDeviceGetAttribute,
- + * ::cuDeviceGetCount,
- + * ::cuDeviceGetName,
- + * ::cuDeviceGet,
- + * ::cuDeviceTotalMem
- + */
- +CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev);
- +
- +/**
- + * \brief Returns information about the device
- + *
- + * Returns in \p *pi the integer value of the attribute \p attrib on device
- + * \p dev. The supported attributes are:
- + * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per
- + * block;
- + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block;
- + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block;
- + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block;
- + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid;
- + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid;
- + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid;
- + * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of
- + * shared memory available to a thread block in bytes; this amount is shared
- + * by all thread blocks simultaneously resident on a multiprocessor;
- + * - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for
- + * __constant__ variables in a CUDA C kernel in bytes;
- + * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads;
- + * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the
- + * memory copy functions that involve memory regions allocated through
- + * ::cuMemAllocPitch();
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D
- + * texture width;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width
- + * for a 1D texture bound to linear memory;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D
- + * texture width;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D
- + * texture height;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width
- + * for a 2D texture bound to linear memory;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height
- + * for a 2D texture bound to linear memory;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch
- + * in bytes for a 2D texture bound to linear memory;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D
- + * texture width;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D
- + * texture height;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D
- + * texture depth;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE:
- + * Alternate maximum 3D texture width, 0 if no alternate
- + * maximum 3D texture size is supported;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE:
- + * Alternate maximum 3D texture height, 0 if no alternate
- + * maximum 3D texture size is supported;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE:
- + * Alternate maximum 3D texture depth, 0 if no alternate
- + * maximum 3D texture size is supported;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH:
- + * Maximum cubemap texture width or height;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH:
- + * Maximum 1D layered texture width;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS:
- + * Maximum layers in a 1D layered texture;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH:
- + * Maximum 2D layered texture width;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT:
- + * Maximum 2D layered texture height;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS:
- + * Maximum layers in a 2D layered texture;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH:
- + * Maximum cubemap layered texture width or height;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS:
- + * Maximum layers in a cubemap layered texture;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH:
- + * Maximum 1D surface width;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH:
- + * Maximum 2D surface width;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT:
- + * Maximum 2D surface height;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH:
- + * Maximum 3D surface width;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT:
- + * Maximum 3D surface height;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH:
- + * Maximum 3D surface depth;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH:
- + * Maximum 1D layered surface width;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS:
- + * Maximum layers in a 1D layered surface;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH:
- + * Maximum 2D layered surface width;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT:
- + * Maximum 2D layered surface height;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS:
- + * Maximum layers in a 2D layered surface;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH:
- + * Maximum cubemap surface width;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH:
- + * Maximum cubemap layered surface width;
- + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS:
- + * Maximum layers in a cubemap layered surface;
- + * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit
- + * registers available to a thread block; this number is shared by all thread
- + * blocks simultaneously resident on a multiprocessor;
- + * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: Peak clock frequency in kilohertz;
- + * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture
- + * base addresses aligned to ::textureAlign bytes do not need an offset
- + * applied to texture fetches;
- + * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement
- + * for 2D texture references bound to pitched memory;
- + * - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy
- + * memory between host and device while executing a kernel, or 0 if not;
- + * - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on
- + * the device;
- + * - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit
- + * for kernels executed on the device, or 0 if not;
- + * - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the
- + * memory subsystem, or 0 if not;
- + * - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host
- + * memory into the CUDA address space, or 0 if not;
- + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently
- + * in. Available modes are as follows:
- + * - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and
- + * can have multiple CUDA contexts present at a single time.
- + * - ::CU_COMPUTEMODE_EXCLUSIVE: Compute-exclusive mode - Device can have
- + * only one CUDA context present on it at a time.
- + * - ::CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is
- + * prohibited from creating new CUDA contexts.
- + * - ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS: Compute-exclusive-process mode - Device
- + * can have only one context used by a single process at a time.
- + * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: 1 if the device supports
- + * executing multiple kernels within the same context simultaneously, or 0 if
- + * not. It is not guaranteed that multiple kernels will be resident
- + * on the device concurrently so this feature should not be relied upon for
- + * correctness;
- + * - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the
- + * device, 0 if error correction is disabled or not supported by the device;
- + * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device;
- + * - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier
- + * of the device;
- + * - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC
- + * is only available on Tesla hardware running Windows Vista or later;
- + * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz;
- + * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits;
- + * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache;
- + * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor;
- + * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with
- + * the host, or 0 if not;
- + *
- + * \param pi - Returned device attribute value
- + * \param attrib - Device attribute to query
- + * \param dev - Device handle
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_INVALID_DEVICE
- + * \notefnerr
- + *
- + * \sa ::cuDeviceComputeCapability,
- + * ::cuDeviceGetCount,
- + * ::cuDeviceGetName,
- + * ::cuDeviceGet,
- + * ::cuDeviceGetProperties,
- + * ::cuDeviceTotalMem
- + */
- +CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
- +
- +/** @} */ /* END CUDA_DEVICE */
- +
- +
- +/**
- + * \defgroup CUDA_CTX Context Management
- + *
- + * This section describes the context management functions of the low-level
- + * CUDA driver application programming interface.
- + *
- + * @{
- + */
- +
- +#if __CUDA_API_VERSION >= 3020
- +/**
- + * \brief Create a CUDA context
- + *
- + * Creates a new CUDA context and associates it with the calling thread. The
- + * \p flags parameter is described below. The context is created with a usage
- + * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy()
- + * when done using the context. If a context is already current to the thread,
- + * it is supplanted by the newly created context and may be restored by a subsequent
- + * call to ::cuCtxPopCurrent().
- + *
- + * The three LSBs of the \p flags parameter can be used to control how the OS
- + * thread, which owns the CUDA context at the time of an API call, interacts
- + * with the OS scheduler when waiting for results from the GPU. Only one of
- + * the scheduling flags can be set when creating a context.
- + *
- + * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
- + * uses a heuristic based on the number of active CUDA contexts in the
- + * process \e C and the number of logical processors in the system \e P. If
- + * \e C > \e P, then CUDA will yield to other OS threads when waiting for
- + * the GPU, otherwise CUDA will not yield while waiting for results and
- + * actively spin on the processor.
- + *
- + * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
- + * results from the GPU. This can decrease latency when waiting for the GPU,
- + * but may lower the performance of CPU threads if they are performing work in
- + * parallel with the CUDA thread.
- + *
- + * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
- + * results from the GPU. This can increase latency when waiting for the GPU,
- + * but can increase the performance of CPU threads performing work in parallel
- + * with the GPU.
- + *
- + * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
- + * synchronization primitive when waiting for the GPU to finish work.
- + *
- + * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
- + * synchronization primitive when waiting for the GPU to finish work. <br>
- + * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
- + * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
- + *
- + * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
- + * This flag must be set in order to allocate pinned host memory that is
- + * accessible to the GPU.
- + *
- + * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
- + * after resizing local memory for a kernel. This can prevent thrashing by
- + * local memory allocations when launching many kernels with high local
- + * memory usage at the cost of potentially increased memory usage.
- + *
- + * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
- + * the device is ::CU_COMPUTEMODE_PROHIBITED. Similarly, context creation will
- + * also fail with ::CUDA_ERROR_UNKNOWN if the compute mode for the device is
- + * set to ::CU_COMPUTEMODE_EXCLUSIVE and there is already an active context on
- + * the device. The function ::cuDeviceGetAttribute() can be used with
- + * ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the compute mode of the
- + * device. The <i>nvidia-smi</i> tool can be used to set the compute mode for
- + * devices. Documentation for <i>nvidia-smi</i> can be obtained by passing a
- + * -h option to it.
- + *
- + * \param pctx - Returned context handle of the new context
- + * \param flags - Context creation flags
- + * \param dev - Device to create context on
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_DEVICE,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_OUT_OF_MEMORY,
- + * ::CUDA_ERROR_UNKNOWN
- + * \notefnerr
- + *
- + * \sa ::cuCtxDestroy,
- + * ::cuCtxGetApiVersion,
- + * ::cuCtxGetCacheConfig,
- + * ::cuCtxGetDevice,
- + * ::cuCtxGetLimit,
- + * ::cuCtxPopCurrent,
- + * ::cuCtxPushCurrent,
- + * ::cuCtxSetCacheConfig,
- + * ::cuCtxSetLimit,
- + * ::cuCtxSynchronize
- + */
- +CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
- +#endif /* __CUDA_API_VERSION >= 3020 */
- +
- +#if __CUDA_API_VERSION >= 4000
- +/**
- + * \brief Destroy a CUDA context
- + *
- + * Destroys the CUDA context specified by \p ctx. The context \p ctx will be
- + * destroyed regardless of how many threads it is current to.
- + * It is the responsibility of the calling function to ensure that no API
- + * call is issued using \p ctx while ::cuCtxDestroy() is executing.
- + *
- + * If \p ctx is current to the calling thread then \p ctx will also be
- + * popped from the current thread's context stack (as though ::cuCtxPopCurrent()
- + * were called). If \p ctx is current to other threads, then \p ctx will
- + * remain current to those threads, and attempting to access \p ctx from
- + * those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED.
- + *
- + * \param ctx - Context to destroy
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuCtxCreate,
- + * ::cuCtxGetApiVersion,
- + * ::cuCtxGetCacheConfig,
- + * ::cuCtxGetDevice,
- + * ::cuCtxGetLimit,
- + * ::cuCtxPopCurrent,
- + * ::cuCtxPushCurrent,
- + * ::cuCtxSetCacheConfig,
- + * ::cuCtxSetLimit,
- + * ::cuCtxSynchronize
- + */
- +CUresult CUDAAPI cuCtxDestroy(CUcontext ctx);
- +#endif /* __CUDA_API_VERSION >= 4000 */
- +
- +/**
- + * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED]
- + *
- + * This section describes the deprecated context management functions of the low-level
- + * CUDA driver application programming interface.
- + *
- + * @{
- + */
- +
- +/**
- + * \brief Increment a context's usage-count
- + *
- + * \deprecated
- + *
- + * Note that this function is deprecated and should not be used.
- + *
- + * Increments the usage count of the context and passes back a context handle
- + * in \p *pctx that must be passed to ::cuCtxDetach() when the application is
- + * done with the context. ::cuCtxAttach() fails if there is no context current
- + * to the thread.
- + *
- + * Currently, the \p flags parameter must be 0.
- + *
- + * \param pctx - Returned context handle of the current context
- + * \param flags - Context attach flags (must be 0)
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuCtxCreate,
- + * ::cuCtxDestroy,
- + * ::cuCtxDetach,
- + * ::cuCtxGetApiVersion,
- + * ::cuCtxGetCacheConfig,
- + * ::cuCtxGetDevice,
- + * ::cuCtxGetLimit,
- + * ::cuCtxPopCurrent,
- + * ::cuCtxPushCurrent,
- + * ::cuCtxSetCacheConfig,
- + * ::cuCtxSetLimit,
- + * ::cuCtxSynchronize
- + */
- +CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags);
- +
- +/**
- + * \brief Decrement a context's usage-count
- + *
- + * \deprecated
- + *
- + * Note that this function is deprecated and should not be used.
- + *
- + * Decrements the usage count of the context \p ctx, and destroys the context
- + * if the usage count goes to 0. The context must be a handle that was passed
- + * back by ::cuCtxCreate() or ::cuCtxAttach(), and must be current to the
- + * calling thread.
- + *
- + * \param ctx - Context to destroy
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT
- + * \notefnerr
- + *
- + * \sa ::cuCtxCreate,
- + * ::cuCtxDestroy,
- + * ::cuCtxGetApiVersion,
- + * ::cuCtxGetCacheConfig,
- + * ::cuCtxGetDevice,
- + * ::cuCtxGetLimit,
- + * ::cuCtxPopCurrent,
- + * ::cuCtxPushCurrent,
- + * ::cuCtxSetCacheConfig,
- + * ::cuCtxSetLimit,
- + * ::cuCtxSynchronize
- + */
- +CUresult CUDAAPI cuCtxDetach(CUcontext ctx);
- +
- +/** @} */ /* END CUDA_CTX_DEPRECATED */
- +
- +#if __CUDA_API_VERSION >= 4000
- +/**
- + * \brief Pushes a context on the current CPU thread
- + *
- + * Pushes the given context \p ctx onto the CPU thread's stack of current
- + * contexts. The specified context becomes the CPU thread's current context, so
- + * all CUDA functions that operate on the current context are affected.
- + *
- + * The previous current context may be made current again by calling
- + * ::cuCtxDestroy() or ::cuCtxPopCurrent().
- + *
- + * \param ctx - Context to push
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuCtxCreate,
- + * ::cuCtxDestroy,
- + * ::cuCtxGetApiVersion,
- + * ::cuCtxGetCacheConfig,
- + * ::cuCtxGetDevice,
- + * ::cuCtxGetLimit,
- + * ::cuCtxPopCurrent,
- + * ::cuCtxSetCacheConfig,
- + * ::cuCtxSetLimit,
- + * ::cuCtxSynchronize
- + */
- +CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx);
- +
- +/**
- + * \brief Pops the current CUDA context from the current CPU thread.
- + *
- + * Pops the current CUDA context from the CPU thread and passes back the
- + * old context handle in \p *pctx. That context may then be made current
- + * to a different CPU thread by calling ::cuCtxPushCurrent().
- + *
- + * If a context was current to the CPU thread before ::cuCtxCreate() or
- + * ::cuCtxPushCurrent() was called, this function makes that context current to
- + * the CPU thread again.
- + *
- + * \param pctx - Returned popped context handle
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT
- + * \notefnerr
- + *
- + * \sa ::cuCtxCreate,
- + * ::cuCtxDestroy,
- + * ::cuCtxGetApiVersion,
- + * ::cuCtxGetCacheConfig,
- + * ::cuCtxGetDevice,
- + * ::cuCtxGetLimit,
- + * ::cuCtxPushCurrent,
- + * ::cuCtxSetCacheConfig,
- + * ::cuCtxSetLimit,
- + * ::cuCtxSynchronize
- + */
- +CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx);
- +
- +/**
- + * \brief Binds the specified CUDA context to the calling CPU thread
- + *
- + * Binds the specified CUDA context to the calling CPU thread.
- + * If \p ctx is NULL then the CUDA context previously bound to the
- + * calling CPU thread is unbound and ::CUDA_SUCCESS is returned.
- + *
- + * If there exists a CUDA context stack on the calling CPU thread, this
- + * will replace the top of that stack with \p ctx.
- + * If \p ctx is NULL then this will be equivalent to popping the top
- + * of the calling CPU thread's CUDA context stack (or a no-op if the
- + * calling CPU thread's CUDA context stack is empty).
- + *
- + * \param ctx - Context to bind to the calling CPU thread
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT
- + * \notefnerr
- + *
- + * \sa ::cuCtxGetCurrent, ::cuCtxCreate, ::cuCtxDestroy
- + */
- +CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx);
- +
- +/**
- + * \brief Returns the CUDA context bound to the calling CPU thread.
- + *
- + * Returns in \p *pctx the CUDA context bound to the calling CPU thread.
- + * If no context is bound to the calling CPU thread then \p *pctx is
- + * set to NULL and ::CUDA_SUCCESS is returned.
- + *
- + * \param pctx - Returned context handle
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED
- + * \notefnerr
- + *
- + * \sa ::cuCtxSetCurrent, ::cuCtxCreate, ::cuCtxDestroy
- + */
- +CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx);
- +#endif /* __CUDA_API_VERSION >= 4000 */
- +
- +/**
- + * \brief Returns the device ID for the current context
- + *
- + * Returns in \p *device the ordinal of the current context's device.
- + *
- + * \param device - Returned device ID for the current context
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuCtxCreate,
- + * ::cuCtxDestroy,
- + * ::cuCtxGetApiVersion,
- + * ::cuCtxGetCacheConfig,
- + * ::cuCtxGetLimit,
- + * ::cuCtxPopCurrent,
- + * ::cuCtxPushCurrent,
- + * ::cuCtxSetCacheConfig,
- + * ::cuCtxSetLimit,
- + * ::cuCtxSynchronize
- + */
- +CUresult CUDAAPI cuCtxGetDevice(CUdevice *device);
- +
- +/**
- + * \brief Block for a context's tasks to complete
- + *
- + * Blocks until the device has completed all preceding requested tasks.
- + * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed.
- + * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the
- + * CPU thread will block until the GPU context has finished its work.
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT
- + * \notefnerr
- + *
- + * \sa ::cuCtxCreate,
- + * ::cuCtxDestroy,
- + * ::cuCtxGetApiVersion,
- + * ::cuCtxGetCacheConfig,
- + * ::cuCtxGetDevice,
- + * ::cuCtxGetLimit,
- + * ::cuCtxPopCurrent,
- + * ::cuCtxPushCurrent,
- + * ::cuCtxSetCacheConfig,
- + * ::cuCtxSetLimit
- + */
- +CUresult CUDAAPI cuCtxSynchronize(void);
- +
- +/**
- + * \brief Set resource limits
- + *
- + * Setting \p limit to \p value is a request by the application to update
- + * the current limit maintained by the context. The driver is free to
- + * modify the requested value to meet h/w requirements (this could be
- + * clamping to minimum or maximum values, rounding up to nearest element
- + * size, etc). The application can use ::cuCtxGetLimit() to find out exactly
- + * what the limit has been set to.
- + *
- + * Setting each ::CUlimit has its own specific restrictions, so each is
- + * discussed here.
- + *
- + * - ::CU_LIMIT_STACK_SIZE controls the stack size of each GPU thread.
- + * This limit is only applicable to devices of compute capability
- + * 2.0 and higher. Attempting to set this limit on devices of
- + * compute capability less than 2.0 will result in the error
- + * ::CUDA_ERROR_UNSUPPORTED_LIMIT being returned.
- + *
- + * - ::CU_LIMIT_PRINTF_FIFO_SIZE controls the size of the FIFO used
- + * by the ::printf() device system call. Setting
- + * ::CU_LIMIT_PRINTF_FIFO_SIZE must be performed before launching any
- + * kernel that uses the ::printf() device system call, otherwise
- + * ::CUDA_ERROR_INVALID_VALUE will be returned.
- + * This limit is only applicable to devices of compute capability
- + * 2.0 and higher. Attempting to set this limit on devices of
- + * compute capability less than 2.0 will result in the error
- + * ::CUDA_ERROR_UNSUPPORTED_LIMIT being returned.
- + *
- + * - ::CU_LIMIT_MALLOC_HEAP_SIZE controls the size of the heap used
- + * by the ::malloc() and ::free() device system calls. Setting
- + * ::CU_LIMIT_MALLOC_HEAP_SIZE must be performed before launching
- + * any kernel that uses the ::malloc() or ::free() device system calls,
- + * otherwise ::CUDA_ERROR_INVALID_VALUE will be returned.
- + * This limit is only applicable to devices of compute capability
- + * 2.0 and higher. Attempting to set this limit on devices of
- + * compute capability less than 2.0 will result in the error
- + * ::CUDA_ERROR_UNSUPPORTED_LIMIT being returned.
- + *
- + * \param limit - Limit to set
- + * \param value - Size in bytes of limit
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_UNSUPPORTED_LIMIT
- + * \notefnerr
- + *
- + * \sa ::cuCtxCreate,
- + * ::cuCtxDestroy,
- + * ::cuCtxGetApiVersion,
- + * ::cuCtxGetCacheConfig,
- + * ::cuCtxGetDevice,
- + * ::cuCtxGetLimit,
- + * ::cuCtxPopCurrent,
- + * ::cuCtxPushCurrent,
- + * ::cuCtxSetCacheConfig,
- + * ::cuCtxSynchronize
- + */
- +CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value);
- +
- +/**
- + * \brief Returns resource limits
- + *
- + * Returns in \p *pvalue the current size of \p limit. The supported
- + * ::CUlimit values are:
- + * - ::CU_LIMIT_STACK_SIZE: stack size of each GPU thread;
- + * - ::CU_LIMIT_PRINTF_FIFO_SIZE: size of the FIFO used by the
- + * ::printf() device system call;
- + * - ::CU_LIMIT_MALLOC_HEAP_SIZE: size of the heap used by the
- + * ::malloc() and ::free() device system calls.
- + *
- + * \param limit - Limit to query
- + * \param pvalue - Returned size in bytes of limit
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_UNSUPPORTED_LIMIT
- + * \notefnerr
- + *
- + * \sa ::cuCtxCreate,
- + * ::cuCtxDestroy,
- + * ::cuCtxGetApiVersion,
- + * ::cuCtxGetCacheConfig,
- + * ::cuCtxGetDevice,
- + * ::cuCtxPopCurrent,
- + * ::cuCtxPushCurrent,
- + * ::cuCtxSetCacheConfig,
- + * ::cuCtxSetLimit,
- + * ::cuCtxSynchronize
- + */
- +CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit);
- +
- +/**
- + * \brief Returns the preferred cache configuration for the current context.
- + *
- + * On devices where the L1 cache and shared memory use the same hardware
- + * resources, this function returns through \p pconfig the preferred cache configuration
- + * for the current context. This is only a preference. The driver will use
- + * the requested configuration if possible, but it is free to choose a different
- + * configuration if required to execute functions.
- + *
- + * This will return a \p pconfig of ::CU_FUNC_CACHE_PREFER_NONE on devices
- + * where the size of the L1 cache and shared memory are fixed.
- + *
- + * The supported cache configurations are:
- + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
- + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
- + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
- + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
- + *
- + * \param pconfig - Returned cache configuration
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuCtxCreate,
- + * ::cuCtxDestroy,
- + * ::cuCtxGetApiVersion,
- + * ::cuCtxGetDevice,
- + * ::cuCtxGetLimit,
- + * ::cuCtxPopCurrent,
- + * ::cuCtxPushCurrent,
- + * ::cuCtxSetCacheConfig,
- + * ::cuCtxSetLimit,
- + * ::cuCtxSynchronize,
- + * ::cuFuncSetCacheConfig
- + */
- +CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig);
- +
- +/**
- + * \brief Sets the preferred cache configuration for the current context.
- + *
- + * On devices where the L1 cache and shared memory use the same hardware
- + * resources, this sets through \p config the preferred cache configuration for
- + * the current context. This is only a preference. The driver will use
- + * the requested configuration if possible, but it is free to choose a different
- + * configuration if required to execute the function. Any function preference
- + * set via ::cuFuncSetCacheConfig() will be preferred over this context-wide
- + * setting. Setting the context-wide cache configuration to
- + * ::CU_FUNC_CACHE_PREFER_NONE will cause subsequent kernel launches to prefer
- + * to not change the cache configuration unless required to launch the kernel.
- + *
- + * This setting does nothing on devices where the size of the L1 cache and
- + * shared memory are fixed.
- + *
- + * Launching a kernel with a different preference than the most recent
- + * preference setting may insert a device-side synchronization point.
- + *
- + * The supported cache configurations are:
- + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
- + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
- + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
- + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
- + *
- + * \param config - Requested cache configuration
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuCtxCreate,
- + * ::cuCtxDestroy,
- + * ::cuCtxGetApiVersion,
- + * ::cuCtxGetCacheConfig,
- + * ::cuCtxGetDevice,
- + * ::cuCtxGetLimit,
- + * ::cuCtxPopCurrent,
- + * ::cuCtxPushCurrent,
- + * ::cuCtxSetLimit,
- + * ::cuCtxSynchronize,
- + * ::cuFuncSetCacheConfig
- + */
- +CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config);
- +
- +#if __CUDA_API_VERSION >= 4020
- +/**
- + * \brief Returns the current shared memory configuration for the current context.
- + *
- + * This function will return in \p pConfig the current size of shared memory banks
- + * in the current context. On devices with configurable shared memory banks,
- + * ::cuCtxSetSharedMemConfig can be used to change this setting, so that all
- + * subsequent kernel launches will by default use the new bank size. When
- + * ::cuCtxGetSharedMemConfig is called on devices without configurable shared
- + * memory, it will return the fixed bank size of the hardware.
- + *
- + * The returned bank configurations can be either:
- + * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: shared memory bank width is
- + * four bytes.
- + * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: shared memory bank width is
- + * eight bytes.
- + *
- + * \param pConfig - returned shared memory configuration
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuCtxCreate,
- + * ::cuCtxDestroy,
- + * ::cuCtxGetApiVersion,
- + * ::cuCtxGetCacheConfig,
- + * ::cuCtxGetDevice,
- + * ::cuCtxGetLimit,
- + * ::cuCtxPopCurrent,
- + * ::cuCtxPushCurrent,
- + * ::cuCtxSetLimit,
- + * ::cuCtxSynchronize,
- + * ::cuCtxSetSharedMemConfig,
- + * ::cuFuncSetCacheConfig
- + */
- +CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig);
- +
- +/**
- + * \brief Sets the shared memory configuration for the current context.
- + *
- + * On devices with configurable shared memory banks, this function will set
- + * the context's shared memory bank size which is used for subsequent kernel
- + * launches.
- + *
- + * Changing the shared memory configuration between launches may insert a device
- + * side synchronization point between those launches.
- + *
- + * Changing the shared memory bank size will not increase shared memory usage
- + * or affect occupancy of kernels, but may have major effects on performance.
- + * Larger bank sizes will allow for greater potential bandwidth to shared memory,
- + * but will change what kinds of accesses to shared memory will result in bank
- + * conflicts.
- + *
- + * This function will do nothing on devices with fixed shared memory bank size.
- + *
- + * The supported bank configurations are:
- + * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: set bank width to the default initial
- + * setting (currently, four bytes).
- + * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to
- + * be natively four bytes.
- + * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to
- + * be natively eight bytes.
- + *
- + * \param config - requested shared memory configuration
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuCtxCreate,
- + * ::cuCtxDestroy,
- + * ::cuCtxGetApiVersion,
- + * ::cuCtxGetCacheConfig,
- + * ::cuCtxGetDevice,
- + * ::cuCtxGetLimit,
- + * ::cuCtxPopCurrent,
- + * ::cuCtxPushCurrent,
- + * ::cuCtxSetLimit,
- + * ::cuCtxSynchronize,
- + * ::cuCtxGetSharedMemConfig,
- + * ::cuFuncSetCacheConfig
- + */
- +CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config);
- +#endif
- +
- +/**
- + * \brief Gets the context's API version.
- + *
- + * Returns a version number in \p version corresponding to the capabilities of
- + * the context (e.g. 3010 or 3020), which library developers can use to direct
- + * callers to a specific API version. If \p ctx is NULL, returns the API version
- + * used to create the currently bound context.
- + *
- + * Note that new API versions are only introduced when context capabilities are
- + * changed that break binary compatibility, so the API version and driver version
- + * may be different. For example, it is valid for the API version to be 3020 while
- + * the driver version is 4010.
- + *
- + * \param ctx - Context to check
- + * \param version - Pointer to version
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_UNKNOWN
- + * \notefnerr
- + *
- + * \sa ::cuCtxCreate,
- + * ::cuCtxDestroy,
- + * ::cuCtxGetDevice,
- + * ::cuCtxGetLimit,
- + * ::cuCtxPopCurrent,
- + * ::cuCtxPushCurrent,
- + * ::cuCtxSetCacheConfig,
- + * ::cuCtxSetLimit,
- + * ::cuCtxSynchronize
- + */
- +CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version);
- +
- +/** @} */ /* END CUDA_CTX */
- +
- +
- +/**
- + * \defgroup CUDA_MODULE Module Management
- + *
- + * This section describes the module management functions of the low-level CUDA
- + * driver application programming interface.
- + *
- + * @{
- + */
- +
- +/**
- + * \brief Loads a compute module
- + *
- + * Takes a filename \p fname and loads the corresponding module \p module into
- + * the current context. The CUDA driver API does not attempt to lazily
- + * allocate the resources needed by a module; if the memory for functions and
- + * data (constant and global) needed by the module cannot be allocated,
- + * ::cuModuleLoad() fails. The file should be a \e cubin file as output by
- + * \b nvcc, or a \e PTX file either as output by \b nvcc or handwritten, or
- + * a \e fatbin file as output by \b nvcc from toolchain 4.0 or later.
- + *
- + * \param module - Returned module
- + * \param fname - Filename of module to load
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_NOT_FOUND,
- + * ::CUDA_ERROR_OUT_OF_MEMORY,
- + * ::CUDA_ERROR_FILE_NOT_FOUND,
- + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
- + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
- + * \notefnerr
- + *
- + * \sa ::cuModuleGetFunction,
- + * ::cuModuleGetGlobal,
- + * ::cuModuleGetTexRef,
- + * ::cuModuleLoadData,
- + * ::cuModuleLoadDataEx,
- + * ::cuModuleLoadFatBinary,
- + * ::cuModuleUnload
- + */
- +CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname);
- +
- +/**
- + * \brief Load a module's data
- + *
- + * Takes a pointer \p image and loads the corresponding module \p module into
- + * the current context. The pointer may be obtained by mapping a \e cubin or
- + * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file
- + * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin
- + * object into the executable resources and using operating system calls such
- + * as Windows \c FindResource() to obtain the pointer.
- + *
- + * \param module - Returned module
- + * \param image - Module data to load
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_OUT_OF_MEMORY,
- + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
- + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
- + * \notefnerr
- + *
- + * \sa ::cuModuleGetFunction,
- + * ::cuModuleGetGlobal,
- + * ::cuModuleGetTexRef,
- + * ::cuModuleLoad,
- + * ::cuModuleLoadDataEx,
- + * ::cuModuleLoadFatBinary,
- + * ::cuModuleUnload
- + */
- +CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image);
- +
- +/**
- + * \brief Load a module's data with options
- + *
- + * Takes a pointer \p image and loads the corresponding module \p module into
- + * the current context. The pointer may be obtained by mapping a \e cubin or
- + * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file
- + * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin
- + * object into the executable resources and using operating system calls such
- + * as Windows \c FindResource() to obtain the pointer. Options are passed as
- + * an array via \p options and any corresponding parameters are passed in
- + * \p optionValues. The number of total options is supplied via \p numOptions.
- + * Any outputs will be returned via \p optionValues. Supported options are
- + * (types for the option values are specified in parentheses after the option
- + * name):
- + *
- + * - ::CU_JIT_MAX_REGISTERS: (unsigned int) input specifies the maximum number
- + * of registers per thread;
- + * - ::CU_JIT_THREADS_PER_BLOCK: (unsigned int) input specifies number of
- + * threads per block to target compilation for; output returns the number of
- + * threads the compiler actually targeted;
- + * - ::CU_JIT_WALL_TIME: (float) output returns the float value of wall clock
- + * time, in milliseconds, spent compiling the \e PTX code;
- + * - ::CU_JIT_INFO_LOG_BUFFER: (char*) input is a pointer to a buffer in
- + * which to print any informational log messages from \e PTX assembly (the
- + * buffer size is specified via option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES);
- + * - ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: (unsigned int) input is the size in
- + * bytes of the buffer; output is the number of bytes filled with messages;
- + * - ::CU_JIT_ERROR_LOG_BUFFER: (char*) input is a pointer to a buffer in
- + * which to print any error log messages from \e PTX assembly (the buffer size
- + * is specified via option ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES);
- + * - ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: (unsigned int) input is the size in
- + * bytes of the buffer; output is the number of bytes filled with messages;
- + * - ::CU_JIT_OPTIMIZATION_LEVEL: (unsigned int) input is the level of
- + * optimization to apply to generated code (0 - 4), with 4 being the default
- + * and highest level;
- + * - ::CU_JIT_TARGET_FROM_CUCONTEXT: (No option value) causes compilation
- + * target to be determined based on current attached context (default);
- + * - ::CU_JIT_TARGET: (unsigned int for enumerated type ::CUjit_target_enum)
- + * input is the compilation target based on supplied ::CUjit_target_enum;
- + * possible values are:
- + * - ::CU_TARGET_COMPUTE_10
- + * - ::CU_TARGET_COMPUTE_11
- + * - ::CU_TARGET_COMPUTE_12
- + * - ::CU_TARGET_COMPUTE_13
- + * - ::CU_TARGET_COMPUTE_20
- + * - ::CU_JIT_FALLBACK_STRATEGY: (unsigned int for enumerated type
- + * ::CUjit_fallback_enum) chooses fallback strategy if matching cubin is not
- + * found; possible values are:
- + * - ::CU_PREFER_PTX
- + * - ::CU_PREFER_BINARY
- + *
- + * \param module - Returned module
- + * \param image - Module data to load
- + * \param numOptions - Number of options
- + * \param options - Options for JIT
- + * \param optionValues - Option values for JIT
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_OUT_OF_MEMORY,
- + * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
- + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
- + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
- + * \notefnerr
- + *
- + * \sa ::cuModuleGetFunction,
- + * ::cuModuleGetGlobal,
- + * ::cuModuleGetTexRef,
- + * ::cuModuleLoad,
- + * ::cuModuleLoadData,
- + * ::cuModuleLoadFatBinary,
- + * ::cuModuleUnload
- + */
- +CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
- +
- +/**
- + * \brief Load a module's data
- + *
- + * Takes a pointer \p fatCubin and loads the corresponding module \p module
- + * into the current context. The pointer represents a <i>fat binary</i> object,
- + * which is a collection of different \e cubin and/or \e PTX files, all
- + * representing the same device code, but compiled and optimized for different
- + * architectures.
- + *
- + * Prior to CUDA 4.0, there was no documented API for constructing and using
- + * fat binary objects by programmers. Starting with CUDA 4.0, fat binary
- + * objects can be constructed by providing the <i>-fatbin option</i> to \b nvcc.
- + * More information can be found in the \b nvcc document.
- + *
- + * \param module - Returned module
- + * \param fatCubin - Fat binary to load
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_NOT_FOUND,
- + * ::CUDA_ERROR_OUT_OF_MEMORY,
- + * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
- + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
- + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
- + * \notefnerr
- + *
- + * \sa ::cuModuleGetFunction,
- + * ::cuModuleGetGlobal,
- + * ::cuModuleGetTexRef,
- + * ::cuModuleLoad,
- + * ::cuModuleLoadData,
- + * ::cuModuleLoadDataEx,
- + * ::cuModuleUnload
- + */
- +CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
- +
- +/**
- + * \brief Unloads a module
- + *
- + * Unloads a module \p hmod from the current context.
- + *
- + * \param hmod - Module to unload
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuModuleGetFunction,
- + * ::cuModuleGetGlobal,
- + * ::cuModuleGetTexRef,
- + * ::cuModuleLoad,
- + * ::cuModuleLoadData,
- + * ::cuModuleLoadDataEx,
- + * ::cuModuleLoadFatBinary
- + */
- +CUresult CUDAAPI cuModuleUnload(CUmodule hmod);
- +
- +/**
- + * \brief Returns a function handle
- + *
- + * Returns in \p *hfunc the handle of the function of name \p name located in
- + * module \p hmod. If no function of that name exists, ::cuModuleGetFunction()
- + * returns ::CUDA_ERROR_NOT_FOUND.
- + *
- + * \param hfunc - Returned function handle
- + * \param hmod - Module to retrieve function from
- + * \param name - Name of function to retrieve
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_NOT_FOUND
- + * \notefnerr
- + *
- + * \sa ::cuModuleGetGlobal,
- + * ::cuModuleGetTexRef,
- + * ::cuModuleLoad,
- + * ::cuModuleLoadData,
- + * ::cuModuleLoadDataEx,
- + * ::cuModuleLoadFatBinary,
- + * ::cuModuleUnload
- + */
- +CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
- +
- +#if __CUDA_API_VERSION >= 3020
- +/**
- + * \brief Returns a global pointer from a module
- + *
- + * Returns in \p *dptr and \p *bytes the base pointer and size of the
- + * global of name \p name located in module \p hmod. If no variable of that name
- + * exists, ::cuModuleGetGlobal() returns ::CUDA_ERROR_NOT_FOUND. Both
- + * parameters \p dptr and \p bytes are optional. If one of them is
- + * NULL, it is ignored.
- + *
- + * \param dptr - Returned global device pointer
- + * \param bytes - Returned global size in bytes
- + * \param hmod - Module to retrieve global from
- + * \param name - Name of global to retrieve
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_NOT_FOUND
- + * \notefnerr
- + *
- + * \sa ::cuModuleGetFunction,
- + * ::cuModuleGetTexRef,
- + * ::cuModuleLoad,
- + * ::cuModuleLoadData,
- + * ::cuModuleLoadDataEx,
- + * ::cuModuleLoadFatBinary,
- + * ::cuModuleUnload
- + */
- +CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name);
- +#endif /* __CUDA_API_VERSION >= 3020 */
- +
- +/**
- + * \brief Returns a handle to a texture reference
- + *
- + * Returns in \p *pTexRef the handle of the texture reference of name \p name
- + * in the module \p hmod. If no texture reference of that name exists,
- + * ::cuModuleGetTexRef() returns ::CUDA_ERROR_NOT_FOUND. This texture reference
- + * handle should not be destroyed, since it will be destroyed when the module
- + * is unloaded.
- + *
- + * \param pTexRef - Returned texture reference
- + * \param hmod - Module to retrieve texture reference from
- + * \param name - Name of texture reference to retrieve
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_NOT_FOUND
- + * \notefnerr
- + *
- + * \sa ::cuModuleGetFunction,
- + * ::cuModuleGetGlobal,
- + * ::cuModuleGetSurfRef,
- + * ::cuModuleLoad,
- + * ::cuModuleLoadData,
- + * ::cuModuleLoadDataEx,
- + * ::cuModuleLoadFatBinary,
- + * ::cuModuleUnload
- + */
- +CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name);
- +
- +/**
- + * \brief Returns a handle to a surface reference
- + *
- + * Returns in \p *pSurfRef the handle of the surface reference of name \p name
- + * in the module \p hmod. If no surface reference of that name exists,
- + * ::cuModuleGetSurfRef() returns ::CUDA_ERROR_NOT_FOUND.
- + *
- + * \param pSurfRef - Returned surface reference
- + * \param hmod - Module to retrieve surface reference from
- + * \param name - Name of surface reference to retrieve
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_NOT_FOUND
- + * \notefnerr
- + *
- + * \sa ::cuModuleGetFunction,
- + * ::cuModuleGetGlobal,
- + * ::cuModuleGetTexRef,
- + * ::cuModuleLoad,
- + * ::cuModuleLoadData,
- + * ::cuModuleLoadDataEx,
- + * ::cuModuleLoadFatBinary,
- + * ::cuModuleUnload
- + */
- +CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
- +
- +/** @} */ /* END CUDA_MODULE */
- +
- +
- +/**
- + * \defgroup CUDA_MEM Memory Management
- + *
- + * This section describes the memory management functions of the low-level CUDA
- + * driver application programming interface.
- + *
- + * @{
- + */
- +
- +#if __CUDA_API_VERSION >= 3020
- +/**
- + * \brief Gets free and total memory
- + *
- + * Returns in \p *free and \p *total respectively, the free and total amount of
- + * memory available for allocation by the CUDA context, in bytes.
- + *
- + * \param free - Returned free memory in bytes
- + * \param total - Returned total memory in bytes
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total);
- +
- +/**
- + * \brief Allocates device memory
- + *
- + * Allocates \p bytesize bytes of linear memory on the device and returns in
- + * \p *dptr a pointer to the allocated memory. The allocated memory is suitably
- + * aligned for any kind of variable. The memory is not cleared. If \p bytesize
- + * is 0, ::cuMemAlloc() returns ::CUDA_ERROR_INVALID_VALUE.
- + *
- + * \param dptr - Returned device pointer
- + * \param bytesize - Requested allocation size in bytes
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_OUT_OF_MEMORY
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
- +
- +/**
- + * \brief Allocates pitched device memory
- + *
- + * Allocates at least \p WidthInBytes * \p Height bytes of linear memory on
- + * the device and returns in \p *dptr a pointer to the allocated memory. The
- + * function may pad the allocation to ensure that corresponding pointers in
- + * any given row will continue to meet the alignment requirements for
- + * coalescing as the address is updated from row to row. \p ElementSizeBytes
- + * specifies the size of the largest reads and writes that will be performed
- + * on the memory range. \p ElementSizeBytes may be 4, 8 or 16 (since coalesced
- + * memory transactions are not possible on other data sizes). If
- + * \p ElementSizeBytes is smaller than the actual read/write size of a kernel,
- + * the kernel will run correctly, but possibly at reduced speed. The pitch
- + * returned in \p *pPitch by ::cuMemAllocPitch() is the width in bytes of the
- + * allocation. The intended usage of pitch is as a separate parameter of the
- + * allocation, used to compute addresses within the 2D array. Given the row
- + * and column of an array element of type \b T, the address is computed as:
- + * \code
- + T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column;
- + * \endcode
- + *
- + * The pitch returned by ::cuMemAllocPitch() is guaranteed to work with
- + * ::cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it is
- + * recommended that programmers consider performing pitch allocations using
- + * ::cuMemAllocPitch(). Due to alignment restrictions in the hardware, this is
- + * especially true if the application will be performing 2D memory copies
- + * between different regions of device memory (whether linear memory or CUDA
- + * arrays).
- + *
- + * The byte alignment of the pitch returned by ::cuMemAllocPitch() is guaranteed
- + * to match or exceed the alignment requirement for texture binding with
- + * ::cuTexRefSetAddress2D().
- + *
- + * \param dptr - Returned device pointer
- + * \param pPitch - Returned pitch of allocation in bytes
- + * \param WidthInBytes - Requested allocation width in bytes
- + * \param Height - Requested allocation height in rows
- + * \param ElementSizeBytes - Size of largest reads/writes for range
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_OUT_OF_MEMORY
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
- +
- +/**
- + * \brief Frees device memory
- + *
- + * Frees the memory space pointed to by \p dptr, which must have been returned
- + * by a previous call to ::cuMemAlloc() or ::cuMemAllocPitch().
- + *
- + * \param dptr - Pointer to memory to free
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuMemFree(CUdeviceptr dptr);
- +
- +/**
- + * \brief Get information on memory allocations
- + *
- + * Returns the base address in \p *pbase and size in \p *psize of the
- + * allocation by ::cuMemAlloc() or ::cuMemAllocPitch() that contains the input
- + * pointer \p dptr. Both parameters \p pbase and \p psize are optional. If one
- + * of them is NULL, it is ignored.
- + *
- + * \param pbase - Returned base address
- + * \param psize - Returned size of device memory allocation
- + * \param dptr - Device pointer to query
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr);
- +
- +/**
- + * \brief Allocates page-locked host memory
- + *
- + * Allocates \p bytesize bytes of host memory that is page-locked and
- + * accessible to the device. The driver tracks the virtual memory ranges
- + * allocated with this function and automatically accelerates calls to
- + * functions such as ::cuMemcpy(). Since the memory can be accessed directly by
- + * the device, it can be read or written with much higher bandwidth than
- + * pageable memory obtained with functions such as ::malloc(). Allocating
- + * excessive amounts of memory with ::cuMemAllocHost() may degrade system
- + * performance, since it reduces the amount of memory available to the system
- + * for paging. As a result, this function is best used sparingly to allocate
- + * staging areas for data exchange between host and device.
- + *
- + * Note all host memory allocated using ::cuMemAllocHost() will automatically
- + * be immediately accessible to all contexts on all devices which support unified
- + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING).
- + * The device pointer that may be used to access this host memory from those
- + * contexts is always equal to the returned host pointer \p *pp.
- + * See \ref CUDA_UNIFIED for additional details.
- + *
- + * \param pp - Returned host pointer to page-locked memory
- + * \param bytesize - Requested allocation size in bytes
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_OUT_OF_MEMORY
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize);
- +#endif /* __CUDA_API_VERSION >= 3020 */
- +
- +/**
- + * \brief Frees page-locked host memory
- + *
- + * Frees the memory space pointed to by \p p, which must have been returned by
- + * a previous call to ::cuMemAllocHost().
- + *
- + * \param p - Pointer to memory to free
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuMemFreeHost(void *p);
- +
- +/**
- + * \brief Allocates page-locked host memory
- + *
- + * Allocates \p bytesize bytes of host memory that is page-locked and accessible
- + * to the device. The driver tracks the virtual memory ranges allocated with
- + * this function and automatically accelerates calls to functions such as
- + * ::cuMemcpyHtoD(). Since the memory can be accessed directly by the device,
- + * it can be read or written with much higher bandwidth than pageable memory
- + * obtained with functions such as ::malloc(). Allocating excessive amounts of
- + * pinned memory may degrade system performance, since it reduces the amount
- + * of memory available to the system for paging. As a result, this function is
- + * best used sparingly to allocate staging areas for data exchange between
- + * host and device.
- + *
- + * The \p Flags parameter enables different options to be specified that
- + * affect the allocation, as follows.
- + *
- + * - ::CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be
- + * considered as pinned memory by all CUDA contexts, not just the one that
- + * performed the allocation.
- + *
- + * - ::CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA address
- + * space. The device pointer to the memory may be obtained by calling
- + * ::cuMemHostGetDevicePointer(). This feature is available only on GPUs
- + * with compute capability greater than or equal to 1.1.
- + *
- + * - ::CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined
- + * (WC). WC memory can be transferred across the PCI Express bus more
- + * quickly on some system configurations, but cannot be read efficiently by
- + * most CPUs. WC memory is a good option for buffers that will be written by
- + * the CPU and read by the GPU via mapped pinned memory or host->device
- + * transfers.
- + *
- + * All of these flags are orthogonal to one another: a developer may allocate
- + * memory that is portable, mapped and/or write-combined with no restrictions.
- + *
- + * The CUDA context must have been created with the ::CU_CTX_MAP_HOST flag in
- + * order for the ::CU_MEMHOSTALLOC_DEVICEMAP flag to have any effect.
- + *
- + * The ::CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for
- + * devices that do not support mapped pinned memory. The failure is deferred
- + * to ::cuMemHostGetDevicePointer() because the memory may be mapped into
- + * other CUDA contexts via the ::CU_MEMHOSTALLOC_PORTABLE flag.
- + *
- + * The memory allocated by this function must be freed with ::cuMemFreeHost().
- + *
- + * Note all host memory allocated using ::cuMemHostAlloc() will automatically
- + * be immediately accessible to all contexts on all devices which support unified
- + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING).
- + * Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer
- + * that may be used to access this host memory from those contexts is always equal
- + * to the returned host pointer \p *pp. If the flag ::CU_MEMHOSTALLOC_WRITECOMBINED
- + * is specified, then the function ::cuMemHostGetDevicePointer() must be used
- + * to query the device pointer, even if the context supports unified addressing.
- + * See \ref CUDA_UNIFIED for additional details.
- + *
- + * \param pp - Returned host pointer to page-locked memory
- + * \param bytesize - Requested allocation size in bytes
- + * \param Flags - Flags for allocation request
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_OUT_OF_MEMORY
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags);
- +
- +#if __CUDA_API_VERSION >= 3020
- +/**
- + * \brief Passes back device pointer of mapped pinned memory
- + *
- + * Passes back the device pointer \p pdptr corresponding to the mapped, pinned
- + * host buffer \p p allocated by ::cuMemHostAlloc.
- + *
- + * ::cuMemHostGetDevicePointer() will fail if the ::CU_MEMHOSTALLOC_DEVICEMAP
- + * flag was not specified at the time the memory was allocated, or if the
- + * function is called on a GPU that does not support mapped pinned memory.
- + *
- + * \p Flags provides for future releases. For now, it must be set to 0.
- + *
- + * \param pdptr - Returned device pointer
- + * \param p - Host pointer
- + * \param Flags - Options (must be 0)
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags);
- +#endif /* __CUDA_API_VERSION >= 3020 */
- +
- +/**
- + * \brief Passes back flags that were used for a pinned allocation
- + *
- + * Passes back the flags \p pFlags that were specified when allocating
- + * the pinned host buffer \p p allocated by ::cuMemHostAlloc.
- + *
- + * ::cuMemHostGetFlags() will fail if the pointer does not reside in
- + * an allocation performed by ::cuMemAllocHost() or ::cuMemHostAlloc().
- + *
- + * \param pFlags - Returned flags word
- + * \param p - Host pointer
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuMemAllocHost, ::cuMemHostAlloc
- + */
- +CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p);
- +
- +#if __CUDA_API_VERSION >= 4010
- +
- +/**
- + * \brief Returns a handle to a compute device
- + *
- + * Returns in \p *dev a device handle given a PCI bus ID string.
- + *
- + * \param dev - Returned device handle
- + *
- + * \param pciBusId - String in one of the following forms:
- + * [domain]:[bus]:[device].[function]
- + * [domain]:[bus]:[device]
- + * [bus]:[device].[function]
- + * where \p domain, \p bus, \p device, and \p function are all hexadecimal values
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_INVALID_DEVICE
- + * \notefnerr
- + *
- + * \sa ::cuDeviceGet, ::cuDeviceGetAttribute, ::cuDeviceGetPCIBusId
- + */
- +CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, char *pciBusId);
- +
- +/**
- + * \brief Returns a PCI Bus Id string for the device
- + *
- + * Returns an ASCII string identifying the device \p dev in the NULL-terminated
- + * string pointed to by \p pciBusId. \p len specifies the maximum length of the
- + * string that may be returned.
- + *
- + * \param pciBusId - Returned identifier string for the device in the following format
- + * [domain]:[bus]:[device].[function]
- + * where \p domain, \p bus, \p device, and \p function are all hexadecimal values.
- + * pciBusId should be large enough to store 13 characters including the NULL-terminator.
- + *
- + * \param len - Maximum length of string to store in \p pciBusId
- + *
- + * \param dev - Device to get identifier string for
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_INVALID_DEVICE
- + * \notefnerr
- + *
- + * \sa ::cuDeviceGet, ::cuDeviceGetAttribute, ::cuDeviceGetByPCIBusId
- + */
- +CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev);
- +
- +/**
- + * \brief Gets an interprocess handle for a previously allocated event
- + *
- + * Takes as input a previously allocated event. This event must have been
- + * created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING
- + * flags set. This opaque handle may be copied into other processes and
- + * opened with ::cuIpcOpenEventHandle to allow efficient hardware
- + * synchronization between GPU work in different processes.
- + *
- + * After the event has been opened in the importing process,
- + * ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and
- + * ::cuEventQuery may be used in either process. Performing operations
- + * on the imported event after the exported event has been freed
- + * with ::cuEventDestroy will result in undefined behavior.
- + *
- + * IPC functionality is restricted to devices with support for unified
- + * addressing on Linux operating systems.
- + *
- + * \param pHandle - Pointer to a user allocated CUipcEventHandle
- + * in which to return the opaque event handle
- + * \param event - Event allocated with ::CU_EVENT_INTERPROCESS and
- + * ::CU_EVENT_DISABLE_TIMING flags.
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_INVALID_HANDLE,
- + * ::CUDA_ERROR_OUT_OF_MEMORY,
- + * ::CUDA_ERROR_MAP_FAILED
- + *
- + * \sa
- + * ::cuEventCreate,
- + * ::cuEventDestroy,
- + * ::cuEventSynchronize,
- + * ::cuEventQuery,
- + * ::cuStreamWaitEvent,
- + * ::cuIpcOpenEventHandle,
- + * ::cuIpcGetMemHandle,
- + * ::cuIpcOpenMemHandle,
- + * ::cuIpcCloseMemHandle
- + */
- +CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event);
- +
- +/**
- + * \brief Opens an interprocess event handle for use in the current process
- + *
- + * Opens an interprocess event handle exported from another process with
- + * ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like
- + * a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified.
- + * This event must be freed with ::cuEventDestroy.
- + *
- + * Performing operations on the imported event after the exported event has
- + * been freed with ::cuEventDestroy will result in undefined behavior.
- + *
- + * IPC functionality is restricted to devices with support for unified
- + * addressing on Linux operating systems.
- + *
- + * \param phEvent - Returns the imported event
- + * \param handle - Interprocess handle to open
- + *
- + * \returns
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_MAP_FAILED,
- + * ::CUDA_ERROR_INVALID_HANDLE
- + *
- + * \sa
- + * ::cuEventCreate,
- + * ::cuEventDestroy,
- + * ::cuEventSynchronize,
- + * ::cuEventQuery,
- + * ::cuStreamWaitEvent,
- + * ::cuIpcGetEventHandle,
- + * ::cuIpcGetMemHandle,
- + * ::cuIpcOpenMemHandle,
- + * ::cuIpcCloseMemHandle
- + */
- +CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle);
- +
- +/**
- + * \brief Gets an interprocess memory handle for an existing device memory
- + * allocation
- + *
- + * Takes a pointer to the base of an existing device memory allocation created
- + * with ::cuMemAlloc and exports it for use in another process. This is a
- + * lightweight operation and may be called multiple times on an allocation
- + * without adverse effects.
- + *
- + * If a region of memory is freed with ::cuMemFree and a subsequent call
- + * to ::cuMemAlloc returns memory with the same device address,
- + * ::cuIpcGetMemHandle will return a unique handle for the
- + * new memory.
- + *
- + * IPC functionality is restricted to devices with support for unified
- + * addressing on Linux operating systems.
- + *
- + * \param pHandle - Pointer to user allocated ::CUipcMemHandle to return
- + * the handle in.
- + * \param dptr - Base pointer to previously allocated device memory
- + *
- + * \returns
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_INVALID_HANDLE,
- + * ::CUDA_ERROR_OUT_OF_MEMORY,
- + * ::CUDA_ERROR_MAP_FAILED
- + *
- + * \sa
- + * ::cuMemAlloc,
- + * ::cuMemFree,
- + * ::cuIpcGetEventHandle,
- + * ::cuIpcOpenEventHandle,
- + * ::cuIpcOpenMemHandle,
- + * ::cuIpcCloseMemHandle
- + */
- +CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr);
- +
- +/**
- + * \brief Opens an interprocess memory handle exported from another process
- + * and returns a device pointer usable in the local process.
- + *
- + * Maps memory exported from another process with ::cuIpcGetMemHandle into
- + * the current device address space. For contexts on different devices
- + * ::cuIpcOpenMemHandle can attempt to enable peer access between the
- + * devices as if the user called ::cuCtxEnablePeerAccess. This behavior is
- + * controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag.
- + * ::cuDeviceCanAccessPeer can determine if a mapping is possible.
- + *
- + * Contexts that may open ::CUipcMemHandles are restricted in the following way.
- + * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened
- + * by one ::CUcontext per ::CUdevice per other process.
- + *
- + * Memory returned from ::cuIpcOpenMemHandle must be freed with
- + * ::cuIpcCloseMemHandle.
- + *
- + * Calling ::cuMemFree on an exported memory region before calling
- + * ::cuIpcCloseMemHandle in the importing context will result in undefined
- + * behavior.
- + *
- + * IPC functionality is restricted to devices with support for unified
- + * addressing on Linux operating systems.
- + *
- + * \param pdptr - Returned device pointer
- + * \param handle - ::CUipcMemHandle to open
- + * \param Flags - Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS
- + *
- + * \returns
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_MAP_FAILED,
- + * ::CUDA_ERROR_INVALID_HANDLE,
- + * ::CUDA_ERROR_TOO_MANY_PEERS
- + *
- + * \sa
- + * ::cuMemAlloc,
- + * ::cuMemFree,
- + * ::cuIpcGetEventHandle,
- + * ::cuIpcOpenEventHandle,
- + * ::cuIpcGetMemHandle,
- + * ::cuIpcCloseMemHandle,
- + * ::cuCtxEnablePeerAccess,
- + * ::cuDeviceCanAccessPeer,
- + */
- +CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags);
- +
- +/**
- + * \brief Close memory mapped with ::cuIpcOpenMemHandle
- + *
- + * Unmaps memory returned by ::cuIpcOpenMemHandle. The original allocation
- + * in the exporting process as well as imported mappings in other processes
- + * will be unaffected.
- + *
- + * Any resources used to enable peer access will be freed if this is the
- + * last mapping using them.
- + *
- + * IPC functionality is restricted to devices with support for unified
- + * addressing on Linux operating systems.
- + *
- + * \param dptr - Device pointer returned by ::cuIpcOpenMemHandle
- + *
- + * \returns
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_MAP_FAILED,
- + * ::CUDA_ERROR_INVALID_HANDLE
- + *
- + * \sa
- + * ::cuMemAlloc,
- + * ::cuMemFree,
- + * ::cuIpcGetEventHandle,
- + * ::cuIpcOpenEventHandle,
- + * ::cuIpcGetMemHandle,
- + * ::cuIpcOpenMemHandle,
- + */
- +CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr);
- +
- +#endif /* __CUDA_API_VERSION >= 4010 */
- +
- +#if __CUDA_API_VERSION >= 4000
- +/**
- + * \brief Registers an existing host memory range for use by CUDA
- + *
- + * Page-locks the memory range specified by \p p and \p bytesize and maps it
- + * for the device(s) as specified by \p Flags. This memory range also is added
- + * to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate
- + * calls to functions such as ::cuMemcpyHtoD(). Since the memory can be accessed
- + * directly by the device, it can be read or written with much higher bandwidth
- + * than pageable memory that has not been registered. Page-locking excessive
- + * amounts of memory may degrade system performance, since it reduces the amount
- + * of memory available to the system for paging. As a result, this function is
- + * best used sparingly to register staging areas for data exchange between
- + * host and device.
- + *
- + * This function has limited support on Mac OS X. OS 10.7 or higher is required.
- + *
- + * The \p Flags parameter enables different options to be specified that
- + * affect the allocation, as follows.
- + *
- + * - ::CU_MEMHOSTREGISTER_PORTABLE: The memory returned by this call will be
- + * considered as pinned memory by all CUDA contexts, not just the one that
- + * performed the allocation.
- + *
- + * - ::CU_MEMHOSTREGISTER_DEVICEMAP: Maps the allocation into the CUDA address
- + * space. The device pointer to the memory may be obtained by calling
- + * ::cuMemHostGetDevicePointer(). This feature is available only on GPUs
- + * with compute capability greater than or equal to 1.1.
- + *
- + * All of these flags are orthogonal to one another: a developer may page-lock
- + * memory that is portable or mapped with no restrictions.
- + *
- + * The CUDA context must have been created with the ::CU_CTX_MAP_HOST flag in
- + * order for the ::CU_MEMHOSTREGISTER_DEVICEMAP flag to have any effect.
- + *
- + * The ::CU_MEMHOSTREGISTER_DEVICEMAP flag may be specified on CUDA contexts for
- + * devices that do not support mapped pinned memory. The failure is deferred
- + * to ::cuMemHostGetDevicePointer() because the memory may be mapped into
- + * other CUDA contexts via the ::CU_MEMHOSTREGISTER_PORTABLE flag.
- + *
- + * The memory page-locked by this function must be unregistered with
- + * ::cuMemHostUnregister().
- + *
- + * \param p - Host pointer to memory to page-lock
- + * \param bytesize - Size in bytes of the address range to page-lock
- + * \param Flags - Flags for allocation request
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_OUT_OF_MEMORY
- + * \notefnerr
- + *
- + * \sa ::cuMemHostUnregister, ::cuMemHostGetFlags, ::cuMemHostGetDevicePointer
- + */
- +CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
- +
- +/**
- + * \brief Unregisters a memory range that was registered with ::cuMemHostRegister().
- + *
- + * Unmaps the memory range whose base address is specified by \p p, and makes
- + * it pageable again.
- + *
- + * The base address must be the same one specified to ::cuMemHostRegister().
- + *
- + * \param p - Host pointer to memory to unregister
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_OUT_OF_MEMORY
- + * \notefnerr
- + *
- + * \sa ::cuMemHostRegister
- + */
- +CUresult CUDAAPI cuMemHostUnregister(void *p);
- +
- +/**
- + * \brief Copies memory
- + *
- + * Copies data between two pointers.
- + * \p dst and \p src are base pointers of the destination and source, respectively.
- + * \p ByteCount specifies the number of bytes to copy.
- + * Note that this function infers the type of the transfer (host to host, host to
- + * device, device to device, or device to host) from the pointer values. This
- + * function is only allowed in contexts which support unified addressing.
- + * Note that this function is synchronous.
- + *
- + * \param dst - Destination unified virtual address space pointer
- + * \param src - Source unified virtual address space pointer
- + * \param ByteCount - Size of memory copy in bytes
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
- +
- +/**
- + * \brief Copies device memory between two contexts
- + *
- + * Copies from device memory in one context to device memory in another
- + * context. \p dstDevice is the base device pointer of the destination memory
- + * and \p dstContext is the destination context. \p srcDevice is the base
- + * device pointer of the source memory and \p srcContext is the source context.
- + * \p ByteCount specifies the number of bytes to copy.
- + *
- + * Note that this function is asynchronous with respect to the host, but
- + * serialized with respect to all pending and future asynchronous work in the
- + * current context, \p srcContext, and \p dstContext (use ::cuMemcpyPeerAsync
- + * to avoid this synchronization).
- + *
- + * \param dstDevice - Destination device pointer
- + * \param dstContext - Destination context
- + * \param srcDevice - Source device pointer
- + * \param srcContext - Source context
- + * \param ByteCount - Size of memory copy in bytes
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuMemcpyDtoD, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
- + * ::cuMemcpy3DPeerAsync
- + */
- +CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
- +
- +#endif /* __CUDA_API_VERSION >= 4000 */
- +
- +#if __CUDA_API_VERSION >= 3020
- +/**
- + * \brief Copies memory from Host to Device
- + *
- + * Copies from host memory to device memory. \p dstDevice and \p srcHost are
- + * the base addresses of the destination and source, respectively. \p ByteCount
- + * specifies the number of bytes to copy. Note that this function is
- + * synchronous.
- + *
- + * \param dstDevice - Destination device pointer
- + * \param srcHost - Source host pointer
- + * \param ByteCount - Size of memory copy in bytes
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
- +
- +/**
- + * \brief Copies memory from Device to Host
- + *
- + * Copies from device to host memory. \p dstHost and \p srcDevice specify the
- + * base pointers of the destination and source, respectively. \p ByteCount
- + * specifies the number of bytes to copy. Note that this function is
- + * synchronous.
- + *
- + * \param dstHost - Destination host pointer
- + * \param srcDevice - Source device pointer
- + * \param ByteCount - Size of memory copy in bytes
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
- +
- +/**
- + * \brief Copies memory from Device to Device
- + *
- + * Copies from device memory to device memory. \p dstDevice and \p srcDevice
- + * are the base pointers of the destination and source, respectively.
- + * \p ByteCount specifies the number of bytes to copy. Note that this function
- + * is asynchronous.
- + *
- + * \param dstDevice - Destination device pointer
- + * \param srcDevice - Source device pointer
- + * \param ByteCount - Size of memory copy in bytes
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
- +
- +/**
- + * \brief Copies memory from Device to Array
- + *
- + * Copies from device memory to a 1D CUDA array. \p dstArray and \p dstOffset
- + * specify the CUDA array handle and starting index of the destination data.
- + * \p srcDevice specifies the base pointer of the source. \p ByteCount
- + * specifies the number of bytes to copy.
- + *
- + * \param dstArray - Destination array
- + * \param dstOffset - Offset in bytes of destination array
- + * \param srcDevice - Source device pointer
- + * \param ByteCount - Size of memory copy in bytes
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount);
- +
- +/**
- + * \brief Copies memory from Array to Device
- + *
- + * Copies from one 1D CUDA array to device memory. \p dstDevice specifies the
- + * base pointer of the destination and must be naturally aligned with the CUDA
- + * array elements. \p srcArray and \p srcOffset specify the CUDA array handle
- + * and the offset in bytes into the array where the copy is to begin.
- + * \p ByteCount specifies the number of bytes to copy and must be evenly
- + * divisible by the array element size.
- + *
- + * \param dstDevice - Destination device pointer
- + * \param srcArray - Source array
- + * \param srcOffset - Offset in bytes of source array
- + * \param ByteCount - Size of memory copy in bytes
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
- +
- +/**
- + * \brief Copies memory from Host to Array
- + *
- + * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset
- + * specify the CUDA array handle and starting offset in bytes of the destination
- + * data. \p srcHost specifies the base address of the source. \p ByteCount specifies
- + * the number of bytes to copy.
- + *
- + * \param dstArray - Destination array
- + * \param dstOffset - Offset in bytes of destination array
- + * \param srcHost - Source host pointer
- + * \param ByteCount - Size of memory copy in bytes
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
- +
- +/**
- + * \brief Copies memory from Array to Host
- + *
- + * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base
- + * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA
- + * array handle and starting offset in bytes of the source data.
- + * \p ByteCount specifies the number of bytes to copy.
- + *
- + * \param dstHost - Destination host pointer
- + * \param srcArray - Source array
- + * \param srcOffset - Offset in bytes of source array
- + * \param ByteCount - Size of memory copy in bytes
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
- +
- +/**
- + * \brief Copies memory from Array to Array
- + *
- + * Copies from one 1D CUDA array to another. \p dstArray and \p srcArray
- + * specify the handles of the destination and source CUDA arrays for the copy,
- + * respectively. \p dstOffset and \p srcOffset specify the destination and
- + * source offsets in bytes into the CUDA arrays. \p ByteCount is the number of
- + * bytes to be copied. The elements in the CUDA arrays need not be
- + * the same format, but the elements must be the same size; and \p ByteCount must be
- + * evenly divisible by that size.
- + *
- + * \param dstArray - Destination array
- + * \param dstOffset - Offset in bytes of destination array
- + * \param srcArray - Source array
- + * \param srcOffset - Offset in bytes of source array
- + * \param ByteCount - Size of memory copy in bytes
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
- +
- +/**
- + * \brief Copies memory for 2D arrays
- + *
- + * Perform a 2D memory copy according to the parameters specified in \p pCopy.
- + * The ::CUDA_MEMCPY2D structure is defined as:
- + *
- + * \code
- + typedef struct CUDA_MEMCPY2D_st {
- + unsigned int srcXInBytes, srcY;
- + CUmemorytype srcMemoryType;
- + const void *srcHost;
- + CUdeviceptr srcDevice;
- + CUarray srcArray;
- + unsigned int srcPitch;
- +
- + unsigned int dstXInBytes, dstY;
- + CUmemorytype dstMemoryType;
- + void *dstHost;
- + CUdeviceptr dstDevice;
- + CUarray dstArray;
- + unsigned int dstPitch;
- +
- + unsigned int WidthInBytes;
- + unsigned int Height;
- + } CUDA_MEMCPY2D;
- + * \endcode
- + * where:
- + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
- + * source and destination, respectively; ::CUmemorytype_enum is defined as:
- + *
- + * \code
- + typedef enum CUmemorytype_enum {
- + CU_MEMORYTYPE_HOST = 0x01,
- + CU_MEMORYTYPE_DEVICE = 0x02,
- + CU_MEMORYTYPE_ARRAY = 0x03,
- + CU_MEMORYTYPE_UNIFIED = 0x04
- + } CUmemorytype;
- + * \endcode
- + *
- + * \par
- + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
- + * specify the (unified virtual address space) base address of the source data
- + * and the bytes per row to apply. ::srcArray is ignored.
- + * This value may be used only if unified addressing is supported in the calling
- + * context.
- + *
- + * \par
- + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
- + * specify the (host) base address of the source data and the bytes per row to
- + * apply. ::srcArray is ignored.
- + *
- + * \par
- + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
- + * specify the (device) base address of the source data and the bytes per row
- + * to apply. ::srcArray is ignored.
- + *
- + * \par
- + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
- + * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
- + * ignored.
- + *
- + * \par
- + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
- + * specify the (host) base address of the destination data and the bytes per
- + * row to apply. ::dstArray is ignored.
- + *
- + * \par
- + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
- + * specify the (unified virtual address space) base address of the destination data
- + * and the bytes per row to apply. ::dstArray is ignored.
- + * This value may be used only if unified addressing is supported in the calling
- + * context.
- + *
- + * \par
- + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
- + * specify the (device) base address of the destination data and the bytes per
- + * row to apply. ::dstArray is ignored.
- + *
- + * \par
- + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
- + * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
- + * ignored.
- + *
- + * - ::srcXInBytes and ::srcY specify the base address of the source data for
- + * the copy.
- + *
- + * \par
- + * For host pointers, the starting address is
- + * \code
- + void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
- + * \endcode
- + *
- + * \par
- + * For device pointers, the starting address is
- + * \code
- + CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
- + * \endcode
- + *
- + * \par
- + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
- + * element size.
- + *
- + * - ::dstXInBytes and ::dstY specify the base address of the destination data
- + * for the copy.
- + *
- + * \par
- + * For host pointers, the base address is
- + * \code
- + void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
- + * \endcode
- + *
- + * \par
- + * For device pointers, the starting address is
- + * \code
- + CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
- + * \endcode
- + *
- + * \par
- + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
- + * element size.
- + *
- + * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
- + * the 2D copy being performed.
- + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
- + * ::srcXInBytes, and ::dstPitch must be greater than or equal to
- + * ::WidthInBytes + ::dstXInBytes.
- + *
- + * \par
- + * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum
- + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
- + * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
- + * (device to device, CUDA array to device, CUDA array to CUDA array),
- + * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch().
- + * ::cuMemcpy2DUnaligned() does not have this restriction, but may run
- + * significantly slower in the cases where ::cuMemcpy2D() would have returned
- + * an error code.
- + *
- + * \param pCopy - Parameters for the memory copy
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy);
- +
- +/**
- + * \brief Copies memory for 2D arrays
- + *
- + * Perform a 2D memory copy according to the parameters specified in \p pCopy.
- + * The ::CUDA_MEMCPY2D structure is defined as:
- + *
- + * \code
- + typedef struct CUDA_MEMCPY2D_st {
- + unsigned int srcXInBytes, srcY;
- + CUmemorytype srcMemoryType;
- + const void *srcHost;
- + CUdeviceptr srcDevice;
- + CUarray srcArray;
- + unsigned int srcPitch;
- + unsigned int dstXInBytes, dstY;
- + CUmemorytype dstMemoryType;
- + void *dstHost;
- + CUdeviceptr dstDevice;
- + CUarray dstArray;
- + unsigned int dstPitch;
- + unsigned int WidthInBytes;
- + unsigned int Height;
- + } CUDA_MEMCPY2D;
- + * \endcode
- + * where:
- + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
- + * source and destination, respectively; ::CUmemorytype_enum is defined as:
- + *
- + * \code
- + typedef enum CUmemorytype_enum {
- + CU_MEMORYTYPE_HOST = 0x01,
- + CU_MEMORYTYPE_DEVICE = 0x02,
- + CU_MEMORYTYPE_ARRAY = 0x03,
- + CU_MEMORYTYPE_UNIFIED = 0x04
- + } CUmemorytype;
- + * \endcode
- + *
- + * \par
- + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
- + * specify the (unified virtual address space) base address of the source data
- + * and the bytes per row to apply. ::srcArray is ignored.
- + * This value may be used only if unified addressing is supported in the calling
- + * context.
- + *
- + * \par
- + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
- + * specify the (host) base address of the source data and the bytes per row to
- + * apply. ::srcArray is ignored.
- + *
- + * \par
- + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
- + * specify the (device) base address of the source data and the bytes per row
- + * to apply. ::srcArray is ignored.
- + *
- + * \par
- + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
- + * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
- + * ignored.
- + *
- + * \par
- + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
- + * specify the (unified virtual address space) base address of the destination data
- + * and the bytes per row to apply. ::dstArray is ignored.
- + * This value may be used only if unified addressing is supported in the calling
- + * context.
- + *
- + * \par
- + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
- + * specify the (host) base address of the destination data and the bytes per
- + * row to apply. ::dstArray is ignored.
- + *
- + * \par
- + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
- + * specify the (device) base address of the destination data and the bytes per
- + * row to apply. ::dstArray is ignored.
- + *
- + * \par
- + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
- + * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
- + * ignored.
- + *
- + * - ::srcXInBytes and ::srcY specify the base address of the source data for
- + * the copy.
- + *
- + * \par
- + * For host pointers, the starting address is
- + * \code
- + void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
- + * \endcode
- + *
- + * \par
- + * For device pointers, the starting address is
- + * \code
- + CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
- + * \endcode
- + *
- + * \par
- + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
- + * element size.
- + *
- + * - ::dstXInBytes and ::dstY specify the base address of the destination data
- + * for the copy.
- + *
- + * \par
- + * For host pointers, the base address is
- + * \code
- + void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
- + * \endcode
- + *
- + * \par
- + * For device pointers, the starting address is
- + * \code
- + CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
- + * \endcode
- + *
- + * \par
- + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
- + * element size.
- + *
- + * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
- + * the 2D copy being performed.
- + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
- + * ::srcXInBytes, and ::dstPitch must be greater than or equal to
- + * ::WidthInBytes + ::dstXInBytes.
- + *
- + * \par
- + * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum
- + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
- + * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
- + * (device to device, CUDA array to device, CUDA array to CUDA array),
- + * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch().
- + * ::cuMemcpy2DUnaligned() does not have this restriction, but may run
- + * significantly slower in the cases where ::cuMemcpy2D() would have returned
- + * an error code.
- + *
- + * \param pCopy - Parameters for the memory copy
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);
- +
- +/**
- + * \brief Copies memory for 3D arrays
- + *
- + * Perform a 3D memory copy according to the parameters specified in
- + * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as:
- + *
- + * \code
- + typedef struct CUDA_MEMCPY3D_st {
- +
- + unsigned int srcXInBytes, srcY, srcZ;
- + unsigned int srcLOD;
- + CUmemorytype srcMemoryType;
- + const void *srcHost;
- + CUdeviceptr srcDevice;
- + CUarray srcArray;
- + unsigned int srcPitch; // ignored when src is array
- + unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1
- +
- + unsigned int dstXInBytes, dstY, dstZ;
- + unsigned int dstLOD;
- + CUmemorytype dstMemoryType;
- + void *dstHost;
- + CUdeviceptr dstDevice;
- + CUarray dstArray;
- + unsigned int dstPitch; // ignored when dst is array
- + unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1
- +
- + unsigned int WidthInBytes;
- + unsigned int Height;
- + unsigned int Depth;
- + } CUDA_MEMCPY3D;
- + * \endcode
- + * where:
- + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
- + * source and destination, respectively; ::CUmemorytype_enum is defined as:
- + *
- + * \code
- + typedef enum CUmemorytype_enum {
- + CU_MEMORYTYPE_HOST = 0x01,
- + CU_MEMORYTYPE_DEVICE = 0x02,
- + CU_MEMORYTYPE_ARRAY = 0x03,
- + CU_MEMORYTYPE_UNIFIED = 0x04
- + } CUmemorytype;
- + * \endcode
- + *
- + * \par
- + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
- + * specify the (unified virtual address space) base address of the source data
- + * and the bytes per row to apply. ::srcArray is ignored.
- + * This value may be used only if unified addressing is supported in the calling
- + * context.
- + *
- + * \par
- + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and
- + * ::srcHeight specify the (host) base address of the source data, the bytes
- + * per row, and the height of each 2D slice of the 3D array. ::srcArray is
- + * ignored.
- + *
- + * \par
- + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and
- + * ::srcHeight specify the (device) base address of the source data, the bytes
- + * per row, and the height of each 2D slice of the 3D array. ::srcArray is
- + * ignored.
- + *
- + * \par
- + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
- + * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and
- + * ::srcHeight are ignored.
- + *
- + * \par
- + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
- + * specify the (unified virtual address space) base address of the destination data
- + * and the bytes per row to apply. ::dstArray is ignored.
- + * This value may be used only if unified addressing is supported in the calling
- + * context.
- + *
- + * \par
- + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost, ::dstPitch and
- + * ::dstHeight specify the (host) base address of the destination data, the
- + * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray
- + * is ignored.
- + *
- + * \par
- + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice, ::dstPitch and
- + * ::dstHeight specify the (device) base address of the destination data, the
- + * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray
- + * is ignored.
- + *
- + * \par
- + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
- + * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and
- + * ::dstHeight are ignored.
- + *
- + * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source
- + * data for the copy.
- + *
- + * \par
- + * For host pointers, the starting address is
- + * \code
- + void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes);
- + * \endcode
- + *
- + * \par
- + * For device pointers, the starting address is
- + * \code
- + CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes;
- + * \endcode
- + *
- + * \par
- + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
- + * element size.
- + *
- + * - ::dstXInBytes, ::dstY and ::dstZ specify the base address of the
- + * destination data for the copy.
- + *
- + * \par
- + * For host pointers, the base address is
- + * \code
- + void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes);
- + * \endcode
- + *
- + * \par
- + * For device pointers, the starting address is
- + * \code
- + CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes;
- + * \endcode
- + *
- + * \par
- + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
- + * element size.
- + *
- + * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height
- + * and depth of the 3D copy being performed.
- + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
- + * ::srcXInBytes, and ::dstPitch must be greater than or equal to
- + * ::WidthInBytes + ::dstXInBytes.
- + * - If specified, ::srcHeight must be greater than or equal to ::Height +
- + * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
- + *
- + * \par
- + * ::cuMemcpy3D() returns an error if any pitch is greater than the maximum
- + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH).
- + *
- + * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be
- + * set to 0.
- + *
- + * \param pCopy - Parameters for the memory copy
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy);
- +#endif /* __CUDA_API_VERSION >= 3020 */
- +
- +#if __CUDA_API_VERSION >= 4000
- +/**
- + * \brief Copies memory between contexts
- + *
- + * Perform a 3D memory copy according to the parameters specified in
- + * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure
- + * for documentation of its parameters.
- + *
- + * Note that this function is synchronous with respect to the host only if
- + * the source or destination memory is of type ::CU_MEMORYTYPE_HOST.
- + * Note also that this copy is serialized with respect to all pending and future
- + * asynchronous work in the current context, the copy's source context,
- + * and the copy's destination context (use ::cuMemcpy3DPeerAsync to avoid
- + * this synchronization).
- + *
- + * \param pCopy - Parameters for the memory copy
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
- + * ::cuMemcpy3DPeerAsync
- + */
- +CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy);
- +
- +/**
- + * \brief Copies memory asynchronously
- + *
- + * Copies data between two pointers.
- + * \p dst and \p src are base pointers of the destination and source, respectively.
- + * \p ByteCount specifies the number of bytes to copy.
- + * Note that this function infers the type of the transfer (host to host, host to
- + * device, device to device, or device to host) from the pointer values. This
- + * function is only allowed in contexts which support unified addressing.
- + * Note that this function is asynchronous and can optionally be associated to
- + * a stream by passing a non-zero \p hStream argument.
- + *
- + * \param dst - Destination unified virtual address space pointer
- + * \param src - Source unified virtual address space pointer
- + * \param ByteCount - Size of memory copy in bytes
- + * \param hStream - Stream identifier
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- + * ::cuMemsetD32, ::cuMemsetD32Async
- + */
- +CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream);
- +
- +/**
- + * \brief Copies device memory between two contexts asynchronously.
- + *
- + * Copies from device memory in one context to device memory in another
- + * context. \p dstDevice is the base device pointer of the destination memory
- + * and \p dstContext is the destination context. \p srcDevice is the base
- + * device pointer of the source memory and \p srcContext is the source context.
- + * \p ByteCount specifies the number of bytes to copy. Note that this function
- + * is asynchronous with respect to the host and all work in other streams in
- + * other devices.
- + *
- + * \param dstDevice - Destination device pointer
- + * \param dstContext - Destination context
- + * \param srcDevice - Source device pointer
- + * \param srcContext - Source context
- + * \param ByteCount - Size of memory copy in bytes
- + * \param hStream - Stream identifier
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpy3DPeerAsync
- + */
- +CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
- +#endif /* __CUDA_API_VERSION >= 4000 */
- +
- +#if __CUDA_API_VERSION >= 3020
- +/**
- + * \brief Copies memory from Host to Device
- + *
- + * Copies from host memory to device memory. \p dstDevice and \p srcHost are
- + * the base addresses of the destination and source, respectively. \p ByteCount
- + * specifies the number of bytes to copy.
- + *
- + * ::cuMemcpyHtoDAsync() is asynchronous and can optionally be associated to a
- + * stream by passing a non-zero \p hStream argument. It only works on
- + * page-locked memory and returns an error if a pointer to pageable memory is
- + * passed as input.
- + *
- + * \param dstDevice - Destination device pointer
- + * \param srcHost - Source host pointer
- + * \param ByteCount - Size of memory copy in bytes
- + * \param hStream - Stream identifier
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- + * ::cuMemsetD32, ::cuMemsetD32Async
- + */
- +CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
- +
- +/**
- + * \brief Copies memory from Device to Host
- + *
- + * Copies from device to host memory. \p dstHost and \p srcDevice specify the
- + * base pointers of the destination and source, respectively. \p ByteCount
- + * specifies the number of bytes to copy.
- + *
- + * ::cuMemcpyDtoHAsync() is asynchronous and can optionally be associated to a
- + * stream by passing a non-zero \p hStream argument. It only works on
- + * page-locked memory and returns an error if a pointer to pageable memory is
- + * passed as input.
- + *
- + * \param dstHost - Destination host pointer
- + * \param srcDevice - Source device pointer
- + * \param ByteCount - Size of memory copy in bytes
- + * \param hStream - Stream identifier
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- + * ::cuMemsetD32, ::cuMemsetD32Async
- + */
- +CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
- +
- +/**
- + * \brief Copies memory from Device to Device
- + *
- + * Copies from device memory to device memory. \p dstDevice and \p srcDevice
- + * are the base pointers of the destination and source, respectively.
- + * \p ByteCount specifies the number of bytes to copy. Note that this function
- + * is asynchronous and can optionally be associated to a stream by passing a
- + * non-zero \p hStream argument.
- + *
- + * \param dstDevice - Destination device pointer
- + * \param srcDevice - Source device pointer
- + * \param ByteCount - Size of memory copy in bytes
- + * \param hStream - Stream identifier
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- + * ::cuMemsetD32, ::cuMemsetD32Async
- + */
- +CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
- +
- +/**
- + * \brief Copies memory from Host to Array
- + *
- + * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset
- + * specify the CUDA array handle and starting offset in bytes of the
- + * destination data. \p srcHost specifies the base address of the source.
- + * \p ByteCount specifies the number of bytes to copy.
- + *
- + * ::cuMemcpyHtoAAsync() is asynchronous and can optionally be associated to a
- + * stream by passing a non-zero \p hStream argument. It only works on
- + * page-locked memory and returns an error if a pointer to pageable memory is
- + * passed as input.
- + *
- + * \param dstArray - Destination array
- + * \param dstOffset - Offset in bytes of destination array
- + * \param srcHost - Source host pointer
- + * \param ByteCount - Size of memory copy in bytes
- + * \param hStream - Stream identifier
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- + * ::cuMemsetD32, ::cuMemsetD32Async
- + */
- +CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
- +
- +/**
- + * \brief Copies memory from Array to Host
- + *
- + * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base
- + * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA
- + * array handle and starting offset in bytes of the source data.
- + * \p ByteCount specifies the number of bytes to copy.
- + *
- + * ::cuMemcpyAtoHAsync() is asynchronous and can optionally be associated to a
- + * stream by passing a non-zero \p hStream argument. It only works on
- + * page-locked host memory and returns an error if a pointer to pageable
- + * memory is passed as input.
- + *
- + * \param dstHost - Destination pointer
- + * \param srcArray - Source array
- + * \param srcOffset - Offset in bytes of source array
- + * \param ByteCount - Size of memory copy in bytes
- + * \param hStream - Stream identifier
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- + * ::cuMemsetD32, ::cuMemsetD32Async
- + */
- +CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
- +
- +/**
- + * \brief Copies memory for 2D arrays
- + *
- + * Perform a 2D memory copy according to the parameters specified in \p pCopy.
- + * The ::CUDA_MEMCPY2D structure is defined as:
- + *
- + * \code
- + typedef struct CUDA_MEMCPY2D_st {
- + unsigned int srcXInBytes, srcY;
- + CUmemorytype srcMemoryType;
- + const void *srcHost;
- + CUdeviceptr srcDevice;
- + CUarray srcArray;
- + unsigned int srcPitch;
- + unsigned int dstXInBytes, dstY;
- + CUmemorytype dstMemoryType;
- + void *dstHost;
- + CUdeviceptr dstDevice;
- + CUarray dstArray;
- + unsigned int dstPitch;
- + unsigned int WidthInBytes;
- + unsigned int Height;
- + } CUDA_MEMCPY2D;
- + * \endcode
- + * where:
- + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
- + * source and destination, respectively; ::CUmemorytype_enum is defined as:
- + *
- + * \code
- + typedef enum CUmemorytype_enum {
- + CU_MEMORYTYPE_HOST = 0x01,
- + CU_MEMORYTYPE_DEVICE = 0x02,
- + CU_MEMORYTYPE_ARRAY = 0x03,
- + CU_MEMORYTYPE_UNIFIED = 0x04
- + } CUmemorytype;
- + * \endcode
- + *
- + * \par
- + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
- + * specify the (host) base address of the source data and the bytes per row to
- + * apply. ::srcArray is ignored.
- + *
- + * \par
- + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
- + * specify the (unified virtual address space) base address of the source data
- + * and the bytes per row to apply. ::srcArray is ignored.
- + * This value may be used only if unified addressing is supported in the calling
- + * context.
- + *
- + * \par
- + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
- + * specify the (device) base address of the source data and the bytes per row
- + * to apply. ::srcArray is ignored.
- + *
- + * \par
- + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
- + * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
- + * ignored.
- + *
- + * \par
- + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
- + * specify the (unified virtual address space) base address of the destination data
- + * and the bytes per row to apply. ::dstArray is ignored.
- + * This value may be used only if unified addressing is supported in the calling
- + * context.
- + *
- + * \par
- + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
- + * specify the (host) base address of the destination data and the bytes per
- + * row to apply. ::dstArray is ignored.
- + *
- + * \par
- + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
- + * specify the (device) base address of the destination data and the bytes per
- + * row to apply. ::dstArray is ignored.
- + *
- + * \par
- + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
- + * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
- + * ignored.
- + *
- + * - ::srcXInBytes and ::srcY specify the base address of the source data for
- + * the copy.
- + *
- + * \par
- + * For host pointers, the starting address is
- + * \code
- + void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
- + * \endcode
- + *
- + * \par
- + * For device pointers, the starting address is
- + * \code
- + CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
- + * \endcode
- + *
- + * \par
- + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
- + * element size.
- + *
- + * - ::dstXInBytes and ::dstY specify the base address of the destination data
- + * for the copy.
- + *
- + * \par
- + * For host pointers, the base address is
- + * \code
- + void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
- + * \endcode
- + *
- + * \par
- + * For device pointers, the starting address is
- + * \code
- + CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
- + * \endcode
- + *
- + * \par
- + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
- + * element size.
- + *
- + * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
- + * the 2D copy being performed.
- + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
- + * ::srcXInBytes, and ::dstPitch must be greater than or equal to
- + * ::WidthInBytes + ::dstXInBytes.
- + * - If specified, ::srcHeight must be greater than or equal to ::Height +
- + * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
- + *
- + * \par
- + * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum
- + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
- + * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
- + * (device to device, CUDA array to device, CUDA array to CUDA array),
- + * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch().
- + * ::cuMemcpy2DUnaligned() does not have this restriction, but may run
- + * significantly slower in the cases where ::cuMemcpy2D() would have returned
- + * an error code.
- + *
- + * ::cuMemcpy2DAsync() is asynchronous and can optionally be associated to a
- + * stream by passing a non-zero \p hStream argument. It only works on
- + * page-locked host memory and returns an error if a pointer to pageable
- + * memory is passed as input.
- + *
- + * \param pCopy - Parameters for the memory copy
- + * \param hStream - Stream identifier
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- + * ::cuMemsetD32, ::cuMemsetD32Async
- + */
- +CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
- +
- +/**
- + * \brief Copies memory for 3D arrays
- + *
- + * Perform a 3D memory copy according to the parameters specified in
- + * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as:
- + *
- + * \code
- + typedef struct CUDA_MEMCPY3D_st {
- +
- + unsigned int srcXInBytes, srcY, srcZ;
- + unsigned int srcLOD;
- + CUmemorytype srcMemoryType;
- + const void *srcHost;
- + CUdeviceptr srcDevice;
- + CUarray srcArray;
- + unsigned int srcPitch; // ignored when src is array
- + unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1
- +
- + unsigned int dstXInBytes, dstY, dstZ;
- + unsigned int dstLOD;
- + CUmemorytype dstMemoryType;
- + void *dstHost;
- + CUdeviceptr dstDevice;
- + CUarray dstArray;
- + unsigned int dstPitch; // ignored when dst is array
- + unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1
- +
- + unsigned int WidthInBytes;
- + unsigned int Height;
- + unsigned int Depth;
- + } CUDA_MEMCPY3D;
- + * \endcode
- + * where:
- + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
- + * source and destination, respectively; ::CUmemorytype_enum is defined as:
- + *
- + * \code
- + typedef enum CUmemorytype_enum {
- + CU_MEMORYTYPE_HOST = 0x01,
- + CU_MEMORYTYPE_DEVICE = 0x02,
- + CU_MEMORYTYPE_ARRAY = 0x03,
- + CU_MEMORYTYPE_UNIFIED = 0x04
- + } CUmemorytype;
- + * \endcode
- + *
- + * \par
- + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
- + * specify the (unified virtual address space) base address of the source data
- + * and the bytes per row to apply. ::srcArray is ignored.
- + * This value may be used only if unified addressing is supported in the calling
- + * context.
- + *
- + * \par
- + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and
- + * ::srcHeight specify the (host) base address of the source data, the bytes
- + * per row, and the height of each 2D slice of the 3D array. ::srcArray is
- + * ignored.
- + *
- + * \par
- + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and
- + * ::srcHeight specify the (device) base address of the source data, the bytes
- + * per row, and the height of each 2D slice of the 3D array. ::srcArray is
- + * ignored.
- + *
- + * \par
- + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
- + * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and
- + * ::srcHeight are ignored.
- + *
- + * \par
- + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
- + * specify the (unified virtual address space) base address of the destination
- + * data and the bytes per row to apply. ::dstArray is ignored.
- + * This value may be used only if unified addressing is supported in the calling
- + * context.
- + *
- + * \par
- + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost, ::dstPitch and
- + * ::dstHeight specify the (host) base address of the destination data, the bytes
- + * per row, and the height of each 2D slice of the 3D array. ::dstArray is ignored.
- + *
- + * \par
- + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice, ::dstPitch and
- + * ::dstHeight specify the (device) base address of the destination data, the bytes
- + * per row, and the height of each 2D slice of the 3D array. ::dstArray is ignored.
- + *
- + * \par
- + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
- + * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and
- + * ::dstHeight are ignored.
- + *
- + * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source
- + * data for the copy.
- + *
- + * \par
- + * For host pointers, the starting address is
- + * \code
- + void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes);
- + * \endcode
- + *
- + * \par
- + * For device pointers, the starting address is
- + * \code
- + CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes;
- + * \endcode
- + *
- + * \par
- + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
- + * element size.
- + *
- + * - ::dstXInBytes, ::dstY and ::dstZ specify the base address of the
- + * destination data for the copy.
- + *
- + * \par
- + * For host pointers, the base address is
- + * \code
- + void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes);
- + * \endcode
- + *
- + * \par
- + * For device pointers, the starting address is
- + * \code
- + CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes;
- + * \endcode
- + *
- + * \par
- + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
- + * element size.
- + *
- + * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height
- + * and depth of the 3D copy being performed.
- + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
- + * ::srcXInBytes, and ::dstPitch must be greater than or equal to
- + * ::WidthInBytes + ::dstXInBytes.
- + * - If specified, ::srcHeight must be greater than or equal to ::Height +
- + * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
- + *
- + * \par
- + * ::cuMemcpy3DAsync() returns an error if any pitch is greater than the maximum
- + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH).
- + *
- + * ::cuMemcpy3DAsync() is asynchronous and can optionally be associated to a
- + * stream by passing a non-zero \p hStream argument. It only works on
- + * page-locked host memory and returns an error if a pointer to pageable
- + * memory is passed as input.
- + *
- + * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be
- + * set to 0.
- + *
- + * \param pCopy - Parameters for the memory copy
- + * \param hStream - Stream identifier
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- + * ::cuMemsetD32, ::cuMemsetD32Async
- + */
- +CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
- +#endif /* __CUDA_API_VERSION >= 3020 */
- +
- +#if __CUDA_API_VERSION >= 4000
- +/**
- + * \brief Copies memory between contexts asynchronously.
- + *
- + * Perform a 3D memory copy according to the parameters specified in
- + * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure
- + * for documentation of its parameters.
- + *
- + * \param pCopy - Parameters for the memory copy
- + * \param hStream - Stream identifier
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
- + * ::cuMemcpy3DPeer
- + */
- +CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream);
- +#endif /* __CUDA_API_VERSION >= 4000 */
- +
- +#if __CUDA_API_VERSION >= 3020
- +/**
- + * \brief Initializes device memory
- + *
- + * Sets the memory range of \p N 8-bit values to the specified value
- + * \p uc.
- + *
- + * Note that this function is asynchronous with respect to the host unless
- + * \p dstDevice refers to pinned host memory.
- + *
- + * \param dstDevice - Destination device pointer
- + * \param uc - Value to set
- + * \param N - Number of elements
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- + * ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- + * ::cuMemsetD32, ::cuMemsetD32Async
- + */
- +CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N);
- +
- +/**
- + * \brief Initializes device memory
- + *
- + * Sets the memory range of \p N 16-bit values to the specified value
- + * \p us. The \p dstDevice pointer must be two byte aligned.
- + *
- + * Note that this function is asynchronous with respect to the host unless
- + * \p dstDevice refers to pinned host memory.
- + *
- + * \param dstDevice - Destination device pointer
- + * \param us - Value to set
- + * \param N - Number of elements
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16Async,
- + * ::cuMemsetD32, ::cuMemsetD32Async
- + */
- +CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N);
- +
- +/**
- + * \brief Initializes device memory
- + *
- + * Sets the memory range of \p N 32-bit values to the specified value
- + * \p ui. The \p dstDevice pointer must be four byte aligned.
- + *
- + * Note that this function is asynchronous with respect to the host unless
- + * \p dstDevice refers to pinned host memory.
- + *
- + * \param dstDevice - Destination device pointer
- + * \param ui - Value to set
- + * \param N - Number of elements
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- + * ::cuMemsetD32Async
- + */
- +CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N);
- +
- +/**
- + * \brief Initializes device memory
- + *
- + * Sets the 2D memory range of \p Width 8-bit values to the specified value
- + * \p uc. \p Height specifies the number of rows to set, and \p dstPitch
- + * specifies the number of bytes between each row. This function performs
- + * fastest when the pitch is one that has been passed back by
- + * ::cuMemAllocPitch().
- + *
- + * Note that this function is asynchronous with respect to the host unless
- + * \p dstDevice refers to pinned host memory.
- + *
- + * \param dstDevice - Destination device pointer
- + * \param dstPitch - Pitch of destination device pointer
- + * \param uc - Value to set
- + * \param Width - Width of row
- + * \param Height - Number of rows
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8Async,
- + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- + * ::cuMemsetD32, ::cuMemsetD32Async
- + */
- +CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
- +
- +/**
- + * \brief Initializes device memory
- + *
- + * Sets the 2D memory range of \p Width 16-bit values to the specified value
- + * \p us. \p Height specifies the number of rows to set, and \p dstPitch
- + * specifies the number of bytes between each row. The \p dstDevice pointer
- + * and \p dstPitch offset must be two byte aligned. This function performs
- + * fastest when the pitch is one that has been passed back by
- + * ::cuMemAllocPitch().
- + *
- + * Note that this function is asynchronous with respect to the host unless
- + * \p dstDevice refers to pinned host memory.
- + *
- + * \param dstDevice - Destination device pointer
- + * \param dstPitch - Pitch of destination device pointer
- + * \param us - Value to set
- + * \param Width - Width of row
- + * \param Height - Number of rows
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- + * ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- + * ::cuMemsetD32, ::cuMemsetD32Async
- + */
- +CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
- +
- +/**
- + * \brief Initializes device memory
- + *
- + * Sets the 2D memory range of \p Width 32-bit values to the specified value
- + * \p ui. \p Height specifies the number of rows to set, and \p dstPitch
- + * specifies the number of bytes between each row. The \p dstDevice pointer
- + * and \p dstPitch offset must be four byte aligned. This function performs
- + * fastest when the pitch is one that has been passed back by
- + * ::cuMemAllocPitch().
- + *
- + * Note that this function is asynchronous with respect to the host unless
- + * \p dstDevice refers to pinned host memory.
- + *
- + * \param dstDevice - Destination device pointer
- + * \param dstPitch - Pitch of destination device pointer
- + * \param ui - Value to set
- + * \param Width - Width of row
- + * \param Height - Number of rows
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32Async,
- + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- + * ::cuMemsetD32, ::cuMemsetD32Async
- + */
- +CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
- +
- +/**
- + * \brief Sets device memory
- + *
- + * Sets the memory range of \p N 8-bit values to the specified value
- + * \p uc.
- + *
- + * ::cuMemsetD8Async() is asynchronous and can optionally be associated to a
- + * stream by passing a non-zero \p hStream argument.
- + *
- + * \param dstDevice - Destination device pointer
- + * \param uc - Value to set
- + * \param N - Number of elements
- + * \param hStream - Stream identifier
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- + * ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD16Async,
- + * ::cuMemsetD32, ::cuMemsetD32Async
- + */
- +CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
- +
- +/**
- + * \brief Sets device memory
- + *
- + * Sets the memory range of \p N 16-bit values to the specified value
- + * \p us. The \p dstDevice pointer must be two byte aligned.
- + *
- + * ::cuMemsetD16Async() is asynchronous and can optionally be associated to a
- + * stream by passing a non-zero \p hStream argument.
- + *
- + * \param dstDevice - Destination device pointer
- + * \param us - Value to set
- + * \param N - Number of elements
- + * \param hStream - Stream identifier
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16,
- + * ::cuMemsetD32, ::cuMemsetD32Async
- + */
- +CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream);
- +
- +/**
- + * \brief Sets device memory
- + *
- + * Sets the memory range of \p N 32-bit values to the specified value
- + * \p ui. The \p dstDevice pointer must be four byte aligned.
- + *
- + * ::cuMemsetD32Async() is asynchronous and can optionally be associated to a
- + * stream by passing a non-zero \p hStream argument.
- + *
- + * \param dstDevice - Destination device pointer
- + * \param ui - Value to set
- + * \param N - Number of elements
- + * \param hStream - Stream identifier
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
- +
- +/**
- + * \brief Sets device memory
- + *
- + * Sets the 2D memory range of \p Width 8-bit values to the specified value
- + * \p uc. \p Height specifies the number of rows to set, and \p dstPitch
- + * specifies the number of bytes between each row. This function performs
- + * fastest when the pitch is one that has been passed back by
- + * ::cuMemAllocPitch().
- + *
- + * ::cuMemsetD2D8Async() is asynchronous and can optionally be associated to a
- + * stream by passing a non-zero \p hStream argument.
- + *
- + * \param dstDevice - Destination device pointer
- + * \param dstPitch - Pitch of destination device pointer
- + * \param uc - Value to set
- + * \param Width - Width of row
- + * \param Height - Number of rows
- + * \param hStream - Stream identifier
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8,
- + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- + * ::cuMemsetD32, ::cuMemsetD32Async
- + */
- +CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
- +
- +/**
- + * \brief Sets device memory
- + *
- + * Sets the 2D memory range of \p Width 16-bit values to the specified value
- + * \p us. \p Height specifies the number of rows to set, and \p dstPitch
- + * specifies the number of bytes between each row. The \p dstDevice pointer
- + * and \p dstPitch offset must be two byte aligned. This function performs
- + * fastest when the pitch is one that has been passed back by
- + * ::cuMemAllocPitch().
- + *
- + * ::cuMemsetD2D16Async() is asynchronous and can optionally be associated to a
- + * stream by passing a non-zero \p hStream argument.
- + *
- + * \param dstDevice - Destination device pointer
- + * \param dstPitch - Pitch of destination device pointer
- + * \param us - Value to set
- + * \param Width - Width of row
- + * \param Height - Number of rows
- + * \param hStream - Stream identifier
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- + * ::cuMemsetD2D16, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- + * ::cuMemsetD32, ::cuMemsetD32Async
- + */
- +CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
- +
- +/**
- + * \brief Sets device memory
- + *
- + * Sets the 2D memory range of \p Width 32-bit values to the specified value
- + * \p ui. \p Height specifies the number of rows to set, and \p dstPitch
- + * specifies the number of bytes between each row. The \p dstDevice pointer
- + * and \p dstPitch offset must be four byte aligned. This function performs
- + * fastest when the pitch is one that has been passed back by
- + * ::cuMemAllocPitch().
- + *
- + * ::cuMemsetD2D32Async() is asynchronous and can optionally be associated to a
- + * stream by passing a non-zero \p hStream argument.
- + *
- + * \param dstDevice - Destination device pointer
- + * \param dstPitch - Pitch of destination device pointer
- + * \param ui - Value to set
- + * \param Width - Width of row
- + * \param Height - Number of rows
- + * \param hStream - Stream identifier
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32,
- + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- + * ::cuMemsetD32, ::cuMemsetD32Async
- + */
- +CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
- +
- +/**
- + * \brief Creates a 1D or 2D CUDA array
- + *
- + * Creates a CUDA array according to the ::CUDA_ARRAY_DESCRIPTOR structure
- + * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle.
- + * The ::CUDA_ARRAY_DESCRIPTOR is defined as:
- + *
- + * \code
- + typedef struct {
- + unsigned int Width;
- + unsigned int Height;
- + CUarray_format Format;
- + unsigned int NumChannels;
- + } CUDA_ARRAY_DESCRIPTOR;
- + * \endcode
- + * where:
- + *
- + * - \p Width, and \p Height are the width, and height of the CUDA array (in
- + * elements); the CUDA array is one-dimensional if height is 0, two-dimensional
- + * otherwise;
- + * - ::Format specifies the format of the elements; ::CUarray_format is
- + * defined as:
- + * \code
- + typedef enum CUarray_format_enum {
- + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
- + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
- + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
- + CU_AD_FORMAT_SIGNED_INT8 = 0x08,
- + CU_AD_FORMAT_SIGNED_INT16 = 0x09,
- + CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
- + CU_AD_FORMAT_HALF = 0x10,
- + CU_AD_FORMAT_FLOAT = 0x20
- + } CUarray_format;
- + * \endcode
- + * - \p NumChannels specifies the number of packed components per CUDA array
- + * element; it may be 1, 2, or 4;
- + *
- + * Here are examples of CUDA array descriptions:
- + *
- + * Description for a CUDA array of 2048 floats:
- + * \code
- + CUDA_ARRAY_DESCRIPTOR desc;
- + desc.Format = CU_AD_FORMAT_FLOAT;
- + desc.NumChannels = 1;
- + desc.Width = 2048;
- + desc.Height = 1;
- + * \endcode
- + *
- + * Description for a 64 x 64 CUDA array of floats:
- + * \code
- + CUDA_ARRAY_DESCRIPTOR desc;
- + desc.Format = CU_AD_FORMAT_FLOAT;
- + desc.NumChannels = 1;
- + desc.Width = 64;
- + desc.Height = 64;
- + * \endcode
- + *
- + * Description for a \p width x \p height CUDA array of 64-bit, 4x16-bit
- + * float16's:
- + * \code
- + CUDA_ARRAY_DESCRIPTOR desc;
- + desc.Format = CU_AD_FORMAT_HALF;
- + desc.NumChannels = 4;
- + desc.Width = width;
- + desc.Height = height;
- + * \endcode
- + *
- + * Description for a \p width x \p height CUDA array of 16-bit elements, each
- + * of which is two 8-bit unsigned chars:
- + * \code
- + CUDA_ARRAY_DESCRIPTOR desc;
- + desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
- + desc.NumChannels = 2;
- + desc.Width = width;
- + desc.Height = height;
- + * \endcode
- + *
- + * \param pHandle - Returned array
- + * \param pAllocateArray - Array descriptor
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_OUT_OF_MEMORY,
- + * ::CUDA_ERROR_UNKNOWN
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
- +
- +/**
- + * \brief Get a 1D or 2D CUDA array descriptor
- + *
- + * Returns in \p *pArrayDescriptor a descriptor containing information on the
- + * format and dimensions of the CUDA array \p hArray. It is useful for
- + * subroutines that have been passed a CUDA array, but need to know the CUDA
- + * array parameters for validation or other purposes.
- + *
- + * \param pArrayDescriptor - Returned array descriptor
- + * \param hArray - Array to get descriptor of
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_INVALID_HANDLE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
- +#endif /* __CUDA_API_VERSION >= 3020 */
- +
- +
- +/**
- + * \brief Destroys a CUDA array
- + *
- + * Destroys the CUDA array \p hArray.
- + *
- + * \param hArray - Array to destroy
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_HANDLE,
- + * ::CUDA_ERROR_ARRAY_IS_MAPPED
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuArrayDestroy(CUarray hArray);
- +
- +#if __CUDA_API_VERSION >= 3020
- +/**
- + * \brief Creates a 3D CUDA array
- + *
- + * Creates a CUDA array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure
- + * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle.
- + * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as:
- + *
- + * \code
- + typedef struct {
- + unsigned int Width;
- + unsigned int Height;
- + unsigned int Depth;
- + CUarray_format Format;
- + unsigned int NumChannels;
- + unsigned int Flags;
- + } CUDA_ARRAY3D_DESCRIPTOR;
- + * \endcode
- + * where:
- + *
- + * - \p Width, \p Height, and \p Depth are the width, height, and depth of the
- + * CUDA array (in elements); the following types of CUDA arrays can be allocated:
- + * - A 1D array is allocated if \p Height and \p Depth extents are both zero.
- + * - A 2D array is allocated if only \p Depth extent is zero.
- + * - A 3D array is allocated if all three extents are non-zero.
- + * - A 1D layered CUDA array is allocated if only \p Height is zero and the
- + * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number
- + * of layers is determined by the depth extent.
- + * - A 2D layered CUDA array is allocated if all three extents are non-zero and
- + * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number
- + * of layers is determined by the depth extent.
- + * - A cubemap CUDA array is allocated if all three extents are non-zero and the
- + * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and
- + * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array,
- + * where the six layers represent the six faces of a cube. The order of the six
- + * layers in memory is the same as that listed in ::CUarray_cubemap_face.
- + * - A cubemap layered CUDA array is allocated if all three extents are non-zero,
- + * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set.
- + * \p Width must be equal to \p Height, and \p Depth must be a multiple of six.
- + * A cubemap layered CUDA array is a special type of 2D layered CUDA array that
- + * consists of a collection of cubemaps. The first six layers represent the first
- + * cubemap, the next six layers form the second cubemap, and so on.
- + *
- + * - ::Format specifies the format of the elements; ::CUarray_format is
- + * defined as:
- + * \code
- + typedef enum CUarray_format_enum {
- + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
- + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
- + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
- + CU_AD_FORMAT_SIGNED_INT8 = 0x08,
- + CU_AD_FORMAT_SIGNED_INT16 = 0x09,
- + CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
- + CU_AD_FORMAT_HALF = 0x10,
- + CU_AD_FORMAT_FLOAT = 0x20
- + } CUarray_format;
- + * \endcode
- + *
- + * - \p NumChannels specifies the number of packed components per CUDA array
- + * element; it may be 1, 2, or 4;
- + *
- + * - ::Flags may be set to
- + * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA arrays. If this flag is set,
- + * \p Depth specifies the number of layers, not the depth of a 3D array.
- + * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to the CUDA array.
- + * If this flag is not set, ::cuSurfRefSetArray will fail when attempting to bind the CUDA array
- + * to a surface reference.
- + * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of cubemaps. If this flag is set, \p Width must be
- + * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set,
- + * then \p Depth must be a multiple of six.
- + * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA array will be used for texture gather.
- + * Texture gather can only be performed on 2D CUDA arrays.
- + *
- + * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table.
- + * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute
- + * is not specified. For ex., TEXTURE1D_WIDTH refers to the device attribute
- + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH.
- + *
- + * Note that 2D CUDA arrays have different size requirements if the ::CUDA_ARRAY3D_TEXTURE_GATHER flag
- + * is set. \p Width and \p Height must not be greater than ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH
- + * and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT respectively, in that case.
- + *
- + * <table>
- + * <tr><td><b>CUDA array type</b></td>
- + * <td><b>Valid extents that must always be met<br>{(width range in elements), (height range),
- + * (depth range)}</b></td>
- + * <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br>
- + * {(width range in elements), (height range), (depth range)}</b></td></tr>
- + * <tr><td>1D</td>
- + * <td><small>{ (1,TEXTURE1D_WIDTH), 0, 0 }</small></td>
- + * <td><small>{ (1,SURFACE1D_WIDTH), 0, 0 }</small></td></tr>
- + * <tr><td>2D</td>
- + * <td><small>{ (1,TEXTURE2D_WIDTH), (1,TEXTURE2D_HEIGHT), 0 }</small></td>
- + * <td><small>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</small></td></tr>
- + * <tr><td>3D</td>
- + * <td><small>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
- + * <br>OR<br>{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE),
- + * (1,TEXTURE3D_DEPTH_ALTERNATE) }</small></td>
- + * <td><small>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT),
- + * (1,SURFACE3D_DEPTH) }</small></td></tr>
- + * <tr><td>1D Layered</td>
- + * <td><small>{ (1,TEXTURE1D_LAYERED_WIDTH), 0,
- + * (1,TEXTURE1D_LAYERED_LAYERS) }</small></td>
- + * <td><small>{ (1,SURFACE1D_LAYERED_WIDTH), 0,
- + * (1,SURFACE1D_LAYERED_LAYERS) }</small></td></tr>
- + * <tr><td>2D Layered</td>
- + * <td><small>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT),
- + * (1,TEXTURE2D_LAYERED_LAYERS) }</small></td>
- + * <td><small>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT),
- + * (1,SURFACE2D_LAYERED_LAYERS) }</small></td></tr>
- + * <tr><td>Cubemap</td>
- + * <td><small>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</small></td>
- + * <td><small>{ (1,SURFACECUBEMAP_WIDTH),
- + * (1,SURFACECUBEMAP_WIDTH), 6 }</small></td></tr>
- + * <tr><td>Cubemap Layered</td>
- + * <td><small>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH),
- + * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</small></td>
- + * <td><small>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH),
- + * (1,SURFACECUBEMAP_LAYERED_LAYERS) }</small></td></tr>
- + * </table>
- + *
- + * Here are examples of CUDA array descriptions:
- + *
- + * Description for a CUDA array of 2048 floats:
- + * \code
- + CUDA_ARRAY3D_DESCRIPTOR desc;
- + desc.Format = CU_AD_FORMAT_FLOAT;
- + desc.NumChannels = 1;
- + desc.Width = 2048;
- + desc.Height = 0;
- + desc.Depth = 0;
- + * \endcode
- + *
- + * Description for a 64 x 64 CUDA array of floats:
- + * \code
- + CUDA_ARRAY3D_DESCRIPTOR desc;
- + desc.Format = CU_AD_FORMAT_FLOAT;
- + desc.NumChannels = 1;
- + desc.Width = 64;
- + desc.Height = 64;
- + desc.Depth = 0;
- + * \endcode
- + *
- + * Description for a \p width x \p height x \p depth CUDA array of 64-bit,
- + * 4x16-bit float16's:
- + * \code
- + CUDA_ARRAY3D_DESCRIPTOR desc;
- + desc.Format = CU_AD_FORMAT_HALF;
- + desc.NumChannels = 4;
- + desc.Width = width;
- + desc.Height = height;
- + desc.Depth = depth;
- + * \endcode
- + *
- + * \param pHandle - Returned array
- + * \param pAllocateArray - 3D array descriptor
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_OUT_OF_MEMORY,
- + * ::CUDA_ERROR_UNKNOWN
- + * \notefnerr
- + *
- + * \sa ::cuArray3DGetDescriptor, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
- +
- +/**
- + * \brief Get a 3D CUDA array descriptor
- + *
- + * Returns in \p *pArrayDescriptor a descriptor containing information on the
- + * format and dimensions of the CUDA array \p hArray. It is useful for
- + * subroutines that have been passed a CUDA array, but need to know the CUDA
- + * array parameters for validation or other purposes.
- + *
- + * This function may be called on 1D and 2D arrays, in which case the \p Height
- + * and/or \p Depth members of the descriptor struct will be set to 0.
- + *
- + * \param pArrayDescriptor - Returned 3D array descriptor
- + * \param hArray - 3D array to get descriptor of
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_INVALID_HANDLE
- + * \notefnerr
- + *
- + * \sa ::cuArray3DCreate, ::cuArrayCreate,
- + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- + */
- +CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
- +#endif /* __CUDA_API_VERSION >= 3020 */
- +
- +/** @} */ /* END CUDA_MEM */
- +
- +/**
- + * \defgroup CUDA_UNIFIED Unified Addressing
- + *
- + * This section describes the unified addressing functions of the
- + * low-level CUDA driver application programming interface.
- + *
- + * @{
- + *
- + * \section CUDA_UNIFIED_overview Overview
- + *
- + * CUDA devices can share a unified address space with the host.
- + * For these devices there is no distinction between a device
- + * pointer and a host pointer -- the same pointer value may be
- + * used to access memory from the host program and from a kernel
- + * running on the device (with exceptions enumerated below).
- + *
- + * \section CUDA_UNIFIED_support Supported Platforms
- + *
- + * Whether or not a device supports unified addressing may be
- + * queried by calling ::cuDeviceGetAttribute() with the device
- + * attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.
- + *
- + * Unified addressing is automatically enabled in 64-bit processes
- + * on devices with compute capability greater than or equal to 2.0.
- + *
- + * Unified addressing is not yet supported on Windows Vista or
- + * Windows 7 for devices that do not use the TCC driver model.
- + *
- + * \section CUDA_UNIFIED_lookup Looking Up Information from Pointer Values
- + *
- + * It is possible to look up information about the memory which backs a
- + * pointer value. For instance, one may want to know if a pointer points
- + * to host or device memory. As another example, in the case of device
- + * memory, one may want to know on which CUDA device the memory
- + * resides. These properties may be queried using the function
- + * ::cuPointerGetAttribute()
- + *
- + * Since pointers are unique, it is not necessary to specify information
- + * about the pointers specified to the various copy functions in the
- + * CUDA API. The function ::cuMemcpy() may be used to perform a copy
- + * between two pointers, ignoring whether they point to host or device
- + * memory (making ::cuMemcpyHtoD(), ::cuMemcpyDtoD(), and ::cuMemcpyDtoH()
- + * unnecessary for devices supporting unified addressing). For
- + * multidimensional copies, the memory type ::CU_MEMORYTYPE_UNIFIED may be
- + * used to specify that the CUDA driver should infer the location of the
- + * pointer from its value.
- + *
- + * \section CUDA_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory
- + *
- + * All host memory allocated in all contexts using ::cuMemAllocHost() and
- + * ::cuMemHostAlloc() is always directly accessible from all contexts on
- + * all devices that support unified addressing. This is the case regardless
- + * of whether or not the flags ::CU_MEMHOSTALLOC_PORTABLE and
- + * ::CU_MEMHOSTALLOC_DEVICEMAP are specified.
- + *
- + * The pointer value through which allocated host memory may be accessed
- + * in kernels on all devices that support unified addressing is the same
- + * as the pointer value through which that memory is accessed on the host,
- + * so it is not necessary to call ::cuMemHostGetDevicePointer() to get the device
- + * pointer for these allocations.
- + *
- + * Note that this is not the case for memory allocated using the flag
- + * ::CU_MEMHOSTALLOC_WRITECOMBINED, as discussed below.
- + *
- + * \section CUDA_UNIFIED_autopeerregister Automatic Registration of Peer Memory
- + *
- + * Upon enabling direct access from a context that supports unified addressing
- + * to another peer context that supports unified addressing using
- + * ::cuCtxEnablePeerAccess() all memory allocated in the peer context using
- + * ::cuMemAlloc() and ::cuMemAllocPitch() will immediately be accessible
- + * by the current context. The device pointer value through
- + * which any peer memory may be accessed in the current context
- + * is the same pointer value through which that memory may be
- + * accessed in the peer context.
- + *
- + * \section CUDA_UNIFIED_exceptions Exceptions, Disjoint Addressing
- + *
- + * Not all memory may be accessed on devices through the same pointer
- + * value through which they are accessed on the host. These exceptions
- + * are host memory registered using ::cuMemHostRegister() and host memory
- + * allocated using the flag ::CU_MEMHOSTALLOC_WRITECOMBINED. For these
- + * exceptions, there exists a distinct host and device address for the
- + * memory. The device address is guaranteed to not overlap any valid host
- + * pointer range and is guaranteed to have the same value across all
- + * contexts that support unified addressing.
- + *
- + * This device address may be queried using ::cuMemHostGetDevicePointer()
- + * when a context using unified addressing is current. Either the host
- + * or the unified device pointer value may be used to refer to this memory
- + * through ::cuMemcpy() and similar functions using the
- + * ::CU_MEMORYTYPE_UNIFIED memory type.
- + *
- + */
- +
- +#if __CUDA_API_VERSION >= 4000
- +/**
- + * \brief Returns information about a pointer
- + *
- + * The supported attributes are:
- + *
- + * - ::CU_POINTER_ATTRIBUTE_CONTEXT:
- + *
- + * Returns in \p *data the ::CUcontext in which \p ptr was allocated or
- + * registered.
- + * The type of \p data must be ::CUcontext *.
- + *
- + * If \p ptr was not allocated by, mapped by, or registered with
- + * a ::CUcontext which uses unified virtual addressing then
- + * ::CUDA_ERROR_INVALID_VALUE is returned.
- + *
- + * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE:
- + *
- + * Returns in \p *data the physical memory type of the memory that
- + * \p ptr addresses as a ::CUmemorytype enumerated value.
- + * The type of \p data must be unsigned int *.
- + *
- + * If \p ptr addresses device memory then \p *data is set to
- + * ::CU_MEMORYTYPE_DEVICE. The particular ::CUdevice on which the
- + * memory resides is the ::CUdevice of the ::CUcontext returned by the
- + * ::CU_POINTER_ATTRIBUTE_CONTEXT attribute of \p ptr.
- + *
- + * If \p ptr addresses host memory then \p *data is set to
- + * ::CU_MEMORYTYPE_HOST.
- + *
- + * If \p ptr was not allocated by, mapped by, or registered with
- + * a ::CUcontext which uses unified virtual addressing then
- + * ::CUDA_ERROR_INVALID_VALUE is returned.
- + *
- + * If the current ::CUcontext does not support unified virtual
- + * addressing then ::CUDA_ERROR_INVALID_CONTEXT is returned.
- + *
- + * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER:
- + *
- + * Returns in \p *data the device pointer value through which
- + * \p ptr may be accessed by kernels running in the current
- + * ::CUcontext.
- + * The type of \p data must be CUdeviceptr *.
- + *
- + * If there exists no device pointer value through which
- + * kernels running in the current ::CUcontext may access
- + * \p ptr then ::CUDA_ERROR_INVALID_VALUE is returned.
- + *
- + * If there is no current ::CUcontext then
- + * ::CUDA_ERROR_INVALID_CONTEXT is returned.
- + *
- + * Except in the exceptional disjoint addressing cases discussed
- + * below, the value returned in \p *data will equal the input
- + * value \p ptr.
- + *
- + * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER:
- + *
- + * Returns in \p *data the host pointer value through which
- + * \p ptr may be accessed by the host program.
- + * The type of \p data must be void **.
- + * If there exists no host pointer value through which
- + * the host program may directly access \p ptr then
- + * ::CUDA_ERROR_INVALID_VALUE is returned.
- + *
- + * Except in the exceptional disjoint addressing cases discussed
- + * below, the value returned in \p *data will equal the input
- + * value \p ptr.
- + *
- + *
- + * \par
- + *
- + * Note that for most allocations in the unified virtual address space
- + * the host and device pointer for accessing the allocation will be the
- + * same. The exceptions to this are
- + * - user memory registered using ::cuMemHostRegister
- + * - host memory allocated using ::cuMemHostAlloc with the
- + * ::CU_MEMHOSTALLOC_WRITECOMBINED flag
- + * For these types of allocation there will exist separate, disjoint host
- + * and device addresses for accessing the allocation. In particular
- + * - The host address will correspond to an invalid unmapped device address
- + * (which will result in an exception if accessed from the device)
- + * - The device address will correspond to an invalid unmapped host address
- + * (which will result in an exception if accessed from the host).
- + * For these types of allocations, querying ::CU_POINTER_ATTRIBUTE_HOST_POINTER
- + * and ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER may be used to retrieve the host
- + * and device addresses from either address.
- + *
- + * \param data - Returned pointer attribute value
- + * \param attribute - Pointer attribute to query
- + * \param ptr - Pointer
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_INVALID_DEVICE
- + * \notefnerr
- + *
- + * \sa ::cuMemAlloc,
- + * ::cuMemFree,
- + * ::cuMemAllocHost,
- + * ::cuMemFreeHost,
- + * ::cuMemHostAlloc,
- + * ::cuMemHostRegister,
- + * ::cuMemHostUnregister
- + */
- +CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr);
- +#endif /* __CUDA_API_VERSION >= 4000 */
- +
- +/** @} */ /* END CUDA_UNIFIED */
- +
- +/**
- + * \defgroup CUDA_STREAM Stream Management
- + *
- + * This section describes the stream management functions of the low-level CUDA
- + * driver application programming interface.
- + *
- + * @{
- + */
- +
- +/**
- + * \brief Create a stream
- + *
- + * Creates a stream and returns a handle in \p phStream. \p Flags is required
- + * to be 0.
- + *
- + * \param phStream - Returned newly created stream
- + * \param Flags - Parameters for stream creation (must be 0)
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_OUT_OF_MEMORY
- + * \notefnerr
- + *
- + * \sa ::cuStreamDestroy,
- + * ::cuStreamWaitEvent,
- + * ::cuStreamQuery,
- + * ::cuStreamSynchronize
- + */
- +CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags);
- +
- +/**
- + * \brief Make a compute stream wait on an event
- + *
- + * Makes all future work submitted to \p hStream wait until \p hEvent
- + * reports completion before beginning execution. This synchronization
- + * will be performed efficiently on the device. The event \p hEvent may
- + * be from a different context than \p hStream, in which case this function
- + * will perform cross-device synchronization.
- + *
- + * The stream \p hStream will wait only for the completion of the most recent
- + * host call to ::cuEventRecord() on \p hEvent. Once this call has returned,
- + * any functions (including ::cuEventRecord() and ::cuEventDestroy()) may be
- + * called on \p hEvent again, and subsequent calls will not have any
- + * effect on \p hStream.
- + *
- + * If \p hStream is 0 (the NULL stream) any future work submitted in any stream
- + * will wait for \p hEvent to complete before beginning execution. This
- + * effectively creates a barrier for all future work submitted to the context.
- + *
- + * If ::cuEventRecord() has not been called on \p hEvent, this call acts as if
- + * the record has already completed, and so is a functional no-op.
- + *
- + * \param hStream - Stream to wait
- + * \param hEvent - Event to wait on (may not be NULL)
- + * \param Flags - Parameters for the operation (must be 0)
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_HANDLE
- + * \notefnerr
- + *
- + * \sa ::cuStreamCreate,
- + * ::cuEventRecord,
- + * ::cuStreamQuery,
- + * ::cuStreamSynchronize,
- + * ::cuStreamDestroy
- + */
- +CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
- +
- +/**
- + * \brief Determine status of a compute stream
- + *
- + * Returns ::CUDA_SUCCESS if all operations in the stream specified by
- + * \p hStream have completed, or ::CUDA_ERROR_NOT_READY if not.
- + *
- + * \param hStream - Stream to query status of
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_HANDLE,
- + * ::CUDA_ERROR_NOT_READY
- + * \notefnerr
- + *
- + * \sa ::cuStreamCreate,
- + * ::cuStreamWaitEvent,
- + * ::cuStreamDestroy,
- + * ::cuStreamSynchronize
- + */
- +CUresult CUDAAPI cuStreamQuery(CUstream hStream);
- +
- +/**
- + * \brief Wait until a stream's tasks are completed
- + *
- + * Waits until the device has completed all operations in the stream specified
- + * by \p hStream. If the context was created with the
- + * ::CU_CTX_SCHED_BLOCKING_SYNC flag, the CPU thread will block until the
- + * stream is finished with all of its tasks.
- + *
- + * \param hStream - Stream to wait for
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_HANDLE
- + * \notefnerr
- + *
- + * \sa ::cuStreamCreate,
- + * ::cuStreamDestroy,
- + * ::cuStreamWaitEvent,
- + * ::cuStreamQuery
- + */
- +CUresult CUDAAPI cuStreamSynchronize(CUstream hStream);
- +
- +#if __CUDA_API_VERSION >= 4000
- +/**
- + * \brief Destroys a stream
- + *
- + * Destroys the stream specified by \p hStream.
- + *
- + * In case the device is still doing work in the stream \p hStream
- + * when ::cuStreamDestroy() is called, the function will return immediately
- + * and the resources associated with \p hStream will be released automatically
- + * once the device has completed all work in \p hStream.
- + *
- + * \param hStream - Stream to destroy
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuStreamCreate,
- + * ::cuStreamWaitEvent,
- + * ::cuStreamQuery,
- + * ::cuStreamSynchronize
- + */
- +CUresult CUDAAPI cuStreamDestroy(CUstream hStream);
- +#endif /* __CUDA_API_VERSION >= 4000 */
- +
- +/** @} */ /* END CUDA_STREAM */
- +
- +
- +/**
- + * \defgroup CUDA_EVENT Event Management
- + *
- + * This section describes the event management functions of the low-level CUDA
- + * driver application programming interface.
- + *
- + * @{
- + */
- +
- +/**
- + * \brief Creates an event
- + *
- + * Creates an event *phEvent with the flags specified via \p Flags. Valid flags
- + * include:
- + * - ::CU_EVENT_DEFAULT: Default event creation flag.
- + * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking
- + * synchronization. A CPU thread that uses ::cuEventSynchronize() to wait on
- + * an event created with this flag will block until the event has actually
- + * been recorded.
- + * - ::CU_EVENT_DISABLE_TIMING: Specifies that the created event does not need
- + * to record timing data. Events created with this flag specified and
- + * the ::CU_EVENT_BLOCKING_SYNC flag not specified will provide the best
- + * performance when used with ::cuStreamWaitEvent() and ::cuEventQuery().
- + *
- + * \param phEvent - Returns newly created event
- + * \param Flags - Event creation flags
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_OUT_OF_MEMORY
- + * \notefnerr
- + *
- + * \sa
- + * ::cuEventRecord,
- + * ::cuEventQuery,
- + * ::cuEventSynchronize,
- + * ::cuEventDestroy,
- + * ::cuEventElapsedTime
- + */
- +CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags);
- +
- +/**
- + * \brief Records an event
- + *
- + * Records an event. If \p hStream is non-zero, the event is recorded after all
- + * preceding operations in \p hStream have been completed; otherwise, it is
- + * recorded after all preceding operations in the CUDA context have been
- + * completed. Since this operation is asynchronous, ::cuEventQuery() and/or
- + * ::cuEventSynchronize() must be used to determine when the event has actually
- + * been recorded.
- + *
- + * If ::cuEventRecord() has previously been called on \p hEvent, then this
- + * call will overwrite any existing state in \p hEvent. Any subsequent calls
- + * which examine the status of \p hEvent will only examine the completion of
- + * this most recent call to ::cuEventRecord().
- + *
- + * It is necessary that \p hEvent and \p hStream be created on the same context.
- + *
- + * \param hEvent - Event to record
- + * \param hStream - Stream to record event for
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_HANDLE,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuEventCreate,
- + * ::cuEventQuery,
- + * ::cuEventSynchronize,
- + * ::cuStreamWaitEvent,
- + * ::cuEventDestroy,
- + * ::cuEventElapsedTime
- + */
- +CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream);
- +
- +/**
- + * \brief Queries an event's status
- + *
- + * Query the status of all device work preceding the most recent
- + * call to ::cuEventRecord() (in the appropriate compute streams,
- + * as specified by the arguments to ::cuEventRecord()).
- + *
- + * If this work has successfully been completed by the device, or if
- + * ::cuEventRecord() has not been called on \p hEvent, then ::CUDA_SUCCESS is
- + * returned. If this work has not yet been completed by the device then
- + * ::CUDA_ERROR_NOT_READY is returned.
- + *
- + * \param hEvent - Event to query
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_HANDLE,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_NOT_READY
- + * \notefnerr
- + *
- + * \sa ::cuEventCreate,
- + * ::cuEventRecord,
- + * ::cuEventSynchronize,
- + * ::cuEventDestroy,
- + * ::cuEventElapsedTime
- + */
- +CUresult CUDAAPI cuEventQuery(CUevent hEvent);
- +
- +/**
- + * \brief Waits for an event to complete
- + *
- + * Wait until the completion of all device work preceding the most recent
- + * call to ::cuEventRecord() (in the appropriate compute streams, as specified
- + * by the arguments to ::cuEventRecord()).
- + *
- + * If ::cuEventRecord() has not been called on \p hEvent, ::CUDA_SUCCESS is
- + * returned immediately.
- + *
- + * Waiting for an event that was created with the ::CU_EVENT_BLOCKING_SYNC
- + * flag will cause the calling CPU thread to block until the event has
- + * been completed by the device. If the ::CU_EVENT_BLOCKING_SYNC flag has
- + * not been set, then the CPU thread will busy-wait until the event has
- + * been completed by the device.
- + *
- + * \param hEvent - Event to wait for
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_HANDLE
- + * \notefnerr
- + *
- + * \sa ::cuEventCreate,
- + * ::cuEventRecord,
- + * ::cuEventQuery,
- + * ::cuEventDestroy,
- + * ::cuEventElapsedTime
- + */
- +CUresult CUDAAPI cuEventSynchronize(CUevent hEvent);
- +
- +#if __CUDA_API_VERSION >= 4000
- +/**
- + * \brief Destroys an event
- + *
- + * Destroys the event specified by \p hEvent.
- + *
- + * In case \p hEvent has been recorded but has not yet been completed
- + * when ::cuEventDestroy() is called, the function will return immediately and
- + * the resources associated with \p hEvent will be released automatically once
- + * the device has completed \p hEvent.
- + *
- + * \param hEvent - Event to destroy
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_HANDLE
- + * \notefnerr
- + *
- + * \sa ::cuEventCreate,
- + * ::cuEventRecord,
- + * ::cuEventQuery,
- + * ::cuEventSynchronize,
- + * ::cuEventElapsedTime
- + */
- +CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
- +#endif /* __CUDA_API_VERSION >= 4000 */
- +
- +/**
- + * \brief Computes the elapsed time between two events
- + *
- + * Computes the elapsed time between two events (in milliseconds with a
- + * resolution of around 0.5 microseconds).
- + *
- + * If either event was last recorded in a non-NULL stream, the resulting time
- + * may be greater than expected (even if both used the same stream handle). This
- + * happens because the ::cuEventRecord() operation takes place asynchronously
- + * and there is no guarantee that the measured latency is actually just between
- + * the two events. Any number of other different stream operations could execute
- + * in between the two measured events, thus altering the timing in a significant
- + * way.
- + *
- + * If ::cuEventRecord() has not been called on either event then
- + * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called
- + * on both events but one or both of them has not yet been completed (that is,
- + * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the
- + * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with
- + * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return
- + * ::CUDA_ERROR_INVALID_HANDLE.
- + *
- + * \param pMilliseconds - Time between \p hStart and \p hEnd in ms
- + * \param hStart - Starting event
- + * \param hEnd - Ending event
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_HANDLE,
- + * ::CUDA_ERROR_NOT_READY
- + * \notefnerr
- + *
- + * \sa ::cuEventCreate,
- + * ::cuEventRecord,
- + * ::cuEventQuery,
- + * ::cuEventSynchronize,
- + * ::cuEventDestroy
- + */
- +CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
- +
- +/** @} */ /* END CUDA_EVENT */
- +
- +
- +/**
- + * \defgroup CUDA_EXEC Execution Control
- + *
- + * This section describes the execution control functions of the low-level CUDA
- + * driver application programming interface.
- + *
- + * @{
- + */
- +
- +/**
- + * \brief Returns information about a function
- + *
- + * Returns in \p *pi the integer value of the attribute \p attrib on the kernel
- + * given by \p hfunc. The supported attributes are:
- + * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads
- + * per block, beyond which a launch of the function would fail. This number
- + * depends on both the function and the device on which the function is
- + * currently loaded.
- + * - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of
- + * statically-allocated shared memory per block required by this function.
- + * This does not include dynamically-allocated shared memory requested by
- + * the user at runtime.
- + * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated
- + * constant memory required by this function.
- + * - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory
- + * used by each thread of this function.
- + * - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread
- + * of this function.
- + * - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for
- + * which the function was compiled. This value is the major PTX version * 10
- + * + the minor PTX version, so a PTX version 1.3 function would return the
- + * value 13. Note that this may return the undefined value of 0 for cubins
- + * compiled prior to CUDA 3.0.
- + * - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for
- + * which the function was compiled. This value is the major binary
- + * version * 10 + the minor binary version, so a binary version 1.3 function
- + * would return the value 13. Note that this will return a value of 10 for
- + * legacy cubins that do not have a properly-encoded binary architecture
- + * version.
- + *
- + * \param pi - Returned attribute value
- + * \param attrib - Attribute requested
- + * \param hfunc - Function to query attribute of
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_HANDLE,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuCtxGetCacheConfig,
- + * ::cuCtxSetCacheConfig,
- + * ::cuFuncSetCacheConfig,
- + * ::cuLaunchKernel
- + */
- +CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
- +
- +/**
- + * \brief Sets the preferred cache configuration for a device function
- + *
- + * On devices where the L1 cache and shared memory use the same hardware
- + * resources, this sets through \p config the preferred cache configuration for
- + * the device function \p hfunc. This is only a preference. The driver will use
- + * the requested configuration if possible, but it is free to choose a different
- + * configuration if required to execute \p hfunc. Any context-wide preference
- + * set via ::cuCtxSetCacheConfig() will be overridden by this per-function
- + * setting unless the per-function setting is ::CU_FUNC_CACHE_PREFER_NONE. In
- + * that case, the current context-wide setting will be used.
- + *
- + * This setting does nothing on devices where the size of the L1 cache and
- + * shared memory are fixed.
- + *
- + * Launching a kernel with a different preference than the most recent
- + * preference setting may insert a device-side synchronization point.
- + *
- + *
- + * The supported cache configurations are:
- + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
- + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
- + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
- + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
- + *
- + * \param hfunc - Kernel to configure cache for
- + * \param config - Requested cache configuration
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT
- + * \notefnerr
- + *
- + * \sa ::cuCtxGetCacheConfig,
- + * ::cuCtxSetCacheConfig,
- + * ::cuFuncGetAttribute,
- + * ::cuLaunchKernel
- + */
- +CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
- +
- +#if __CUDA_API_VERSION >= 4020
- +/**
- + * \brief Sets the shared memory configuration for a device function.
- + *
- + * On devices with configurable shared memory banks, this function will
- + * force all subsequent launches of the specified device function to have
- + * the given shared memory bank size configuration. On any given launch of the
- + * function, the shared memory configuration of the device will be temporarily
- + * changed if needed to suit the function's preferred configuration. Changes in
- + * shared memory configuration between subsequent launches of functions
- + * may introduce a device-side synchronization point.
- + *
- + * Any per-function setting of shared memory bank size set via
- + * ::cuFuncSetSharedMemConfig will override the context wide setting set with
- + * ::cuCtxSetSharedMemConfig.
- + *
- + * Changing the shared memory bank size will not increase shared memory usage
- + * or affect occupancy of kernels, but may have major effects on performance.
- + * Larger bank sizes will allow for greater potential bandwidth to shared memory,
- + * but will change what kinds of accesses to shared memory will result in bank
- + * conflicts.
- + *
- + * This function will do nothing on devices with fixed shared memory bank size.
- + *
- + * The supported bank configurations are:
- + * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: use the context's shared memory
- + * configuration when launching this function.
- + * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to
- + * be natively four bytes when launching this function.
- + * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to
- + * be natively eight bytes when launching this function.
- + *
- + * \param hfunc - kernel to be given a shared memory config
- + * \param config - requested shared memory configuration
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT
- + * \notefnerr
- + *
- + * \sa ::cuCtxGetCacheConfig,
- + * ::cuCtxSetCacheConfig,
- + * ::cuCtxGetSharedMemConfig,
- + * ::cuCtxSetSharedMemConfig,
- + * ::cuFuncGetAttribute,
- + * ::cuLaunchKernel
- + */
- +CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config);
- +#endif
- +
- +#if __CUDA_API_VERSION >= 4000
- +/**
- + * \brief Launches a CUDA function
- + *
- + * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ
- + * grid of blocks. Each block contains \p blockDimX x \p blockDimY x
- + * \p blockDimZ threads.
- + *
- + * \p sharedMemBytes sets the amount of dynamic shared memory that will be
- + * available to each thread block.
- + *
- + * ::cuLaunchKernel() can optionally be associated with a stream by passing a
- + * non-zero \p hStream argument.
- + *
- + * Kernel parameters to \p f can be specified in one of two ways:
- + *
- + * 1) Kernel parameters can be specified via \p kernelParams. If \p f
- + * has N parameters, then \p kernelParams needs to be an array of N
- + * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1]
- + * must point to a region of memory from which the actual kernel
- + * parameter will be copied. The number of kernel parameters and their
- + * offsets and sizes do not need to be specified as that information is
- + * retrieved directly from the kernel's image.
- + *
- + * 2) Kernel parameters can also be packaged by the application into
- + * a single buffer that is passed in via the \p extra parameter.
- + * This places the burden on the application of knowing each kernel
- + * parameter's size and alignment/padding within the buffer. Here is
- + * an example of using the \p extra parameter in this manner:
- + * \code
- + size_t argBufferSize;
- + char argBuffer[256];
- +
- + // populate argBuffer and argBufferSize
- +
- + void *config[] = {
- + CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
- + CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize,
- + CU_LAUNCH_PARAM_END
- + };
- + status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config);
- + * \endcode
- + *
- + * The \p extra parameter exists to allow ::cuLaunchKernel to take
- + * additional less commonly used arguments. \p extra specifies a list of
- + * names of extra settings and their corresponding values. Each extra
- + * setting name is immediately followed by the corresponding value. The
- + * list must be terminated with either NULL or ::CU_LAUNCH_PARAM_END.
- + *
- + * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra
- + * array;
- + * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next
- + * value in \p extra will be a pointer to a buffer containing all
- + * the kernel parameters for launching kernel \p f;
- + * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next
- + * value in \p extra will be a pointer to a size_t containing the
- + * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER;
- + *
- + * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel
- + * parameters are specified with both \p kernelParams and \p extra
- + * (i.e. both \p kernelParams and \p extra are non-NULL).
- + *
- + * Calling ::cuLaunchKernel() sets persistent function state that is
- + * the same as function state set through the following deprecated APIs:
- + *
- + * ::cuFuncSetBlockShape()
- + * ::cuFuncSetSharedSize()
- + * ::cuParamSetSize()
- + * ::cuParamSeti()
- + * ::cuParamSetf()
- + * ::cuParamSetv()
- + *
- + * When the kernel \p f is launched via ::cuLaunchKernel(), the previous
- + * block shape, shared size and parameter info associated with \p f
- + * is overwritten.
- + *
- + * Note that to use ::cuLaunchKernel(), the kernel \p f must either have
- + * been compiled with toolchain version 3.2 or later so that it will
- + * contain kernel parameter information, or have no kernel parameters.
- + * If neither of these conditions is met, then ::cuLaunchKernel() will
- + * return ::CUDA_ERROR_INVALID_IMAGE.
- + *
- + * \param f - Kernel to launch
- + * \param gridDimX - Width of grid in blocks
- + * \param gridDimY - Height of grid in blocks
- + * \param gridDimZ - Depth of grid in blocks
- + * \param blockDimX - X dimension of each thread block
- + * \param blockDimY - Y dimension of each thread block
- + * \param blockDimZ - Z dimension of each thread block
- + * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes
- + * \param hStream - Stream identifier
- + * \param kernelParams - Array of pointers to kernel parameters
- + * \param extra - Extra options
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_HANDLE,
- + * ::CUDA_ERROR_INVALID_IMAGE,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_LAUNCH_FAILED,
- + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
- + * ::CUDA_ERROR_LAUNCH_TIMEOUT,
- + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
- + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
- + * \notefnerr
- + *
- + * \sa ::cuCtxGetCacheConfig,
- + * ::cuCtxSetCacheConfig,
- + * ::cuFuncSetCacheConfig,
- + * ::cuFuncGetAttribute
- + */
- +CUresult CUDAAPI cuLaunchKernel(CUfunction f,
- + unsigned int gridDimX,
- + unsigned int gridDimY,
- + unsigned int gridDimZ,
- + unsigned int blockDimX,
- + unsigned int blockDimY,
- + unsigned int blockDimZ,
- + unsigned int sharedMemBytes,
- + CUstream hStream,
- + void **kernelParams,
- + void **extra);
- +#endif /* __CUDA_API_VERSION >= 4000 */
- +
- +/**
- + * \defgroup CUDA_EXEC_DEPRECATED Execution Control [DEPRECATED]
- + *
- + * This section describes the deprecated execution control functions of the
- + * low-level CUDA driver application programming interface.
- + *
- + * @{
- + */
- +
- +/**
- + * \brief Sets the block-dimensions for the function
- + *
- + * \deprecated
- + *
- + * Specifies the \p x, \p y, and \p z dimensions of the thread blocks that are
- + * created when the kernel given by \p hfunc is launched.
- + *
- + * \param hfunc - Kernel to specify dimensions of
- + * \param x - X dimension
- + * \param y - Y dimension
- + * \param z - Z dimension
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_HANDLE,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuFuncSetSharedSize,
- + * ::cuFuncSetCacheConfig,
- + * ::cuFuncGetAttribute,
- + * ::cuParamSetSize,
- + * ::cuParamSeti,
- + * ::cuParamSetf,
- + * ::cuParamSetv,
- + * ::cuLaunch,
- + * ::cuLaunchGrid,
- + * ::cuLaunchGridAsync,
- + * ::cuLaunchKernel
- + */
- +CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z);
- +
- +/**
- + * \brief Sets the dynamic shared-memory size for the function
- + *
- + * \deprecated
- + *
- + * Sets through \p bytes the amount of dynamic shared memory that will be
- + * available to each thread block when the kernel given by \p hfunc is launched.
- + *
- + * \param hfunc - Kernel to specify dynamic shared-memory size for
- + * \param bytes - Dynamic shared-memory size per thread block in bytes
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_HANDLE,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuFuncSetBlockShape,
- + * ::cuFuncSetCacheConfig,
- + * ::cuFuncGetAttribute,
- + * ::cuParamSetSize,
- + * ::cuParamSeti,
- + * ::cuParamSetf,
- + * ::cuParamSetv,
- + * ::cuLaunch,
- + * ::cuLaunchGrid,
- + * ::cuLaunchGridAsync,
- + * ::cuLaunchKernel
- + */
- +CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes);
- +
- +/**
- + * \brief Sets the parameter size for the function
- + *
- + * \deprecated
- + *
- + * Sets through \p numbytes the total size in bytes needed by the function
- + * parameters of the kernel corresponding to \p hfunc.
- + *
- + * \param hfunc - Kernel to set parameter size for
- + * \param numbytes - Size of parameter list in bytes
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuFuncSetBlockShape,
- + * ::cuFuncSetSharedSize,
- + * ::cuFuncGetAttribute,
- + * ::cuParamSetf,
- + * ::cuParamSeti,
- + * ::cuParamSetv,
- + * ::cuLaunch,
- + * ::cuLaunchGrid,
- + * ::cuLaunchGridAsync,
- + * ::cuLaunchKernel
- + */
- +CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes);
- +
- +/**
- + * \brief Adds an integer parameter to the function's argument list
- + *
- + * \deprecated
- + *
- + * Sets an integer parameter that will be specified the next time the
- + * kernel corresponding to \p hfunc is invoked. \p offset is a byte offset.
- + *
- + * \param hfunc - Kernel to add parameter to
- + * \param offset - Offset to add parameter to argument list
- + * \param value - Value of parameter
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuFuncSetBlockShape,
- + * ::cuFuncSetSharedSize,
- + * ::cuFuncGetAttribute,
- + * ::cuParamSetSize,
- + * ::cuParamSetf,
- + * ::cuParamSetv,
- + * ::cuLaunch,
- + * ::cuLaunchGrid,
- + * ::cuLaunchGridAsync,
- + * ::cuLaunchKernel
- + */
- +CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value);
- +
- +/**
- + * \brief Adds a floating-point parameter to the function's argument list
- + *
- + * \deprecated
- + *
- + * Sets a floating-point parameter that will be specified the next time the
- + * kernel corresponding to \p hfunc is invoked. \p offset is a byte offset.
- + *
- + * \param hfunc - Kernel to add parameter to
- + * \param offset - Offset to add parameter to argument list
- + * \param value - Value of parameter
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuFuncSetBlockShape,
- + * ::cuFuncSetSharedSize,
- + * ::cuFuncGetAttribute,
- + * ::cuParamSetSize,
- + * ::cuParamSeti,
- + * ::cuParamSetv,
- + * ::cuLaunch,
- + * ::cuLaunchGrid,
- + * ::cuLaunchGridAsync,
- + * ::cuLaunchKernel
- + */
- +CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value);
- +
- +/**
- + * \brief Adds arbitrary data to the function's argument list
- + *
- + * \deprecated
- + *
- + * Copies an arbitrary amount of data (specified in \p numbytes) from \p ptr
- + * into the parameter space of the kernel corresponding to \p hfunc. \p offset
- + * is a byte offset.
- + *
- + * \param hfunc - Kernel to add data to
- + * \param offset - Offset to add data to argument list
- + * \param ptr - Pointer to arbitrary data
- + * \param numbytes - Size of data to copy in bytes
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuFuncSetBlockShape,
- + * ::cuFuncSetSharedSize,
- + * ::cuFuncGetAttribute,
- + * ::cuParamSetSize,
- + * ::cuParamSetf,
- + * ::cuParamSeti,
- + * ::cuLaunch,
- + * ::cuLaunchGrid,
- + * ::cuLaunchGridAsync,
- + * ::cuLaunchKernel
- + */
- +CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
- +
- +/**
- + * \brief Launches a CUDA function
- + *
- + * \deprecated
- + *
- + * Invokes the kernel \p f on a 1 x 1 x 1 grid of blocks. The block
- + * contains the number of threads specified by a previous call to
- + * ::cuFuncSetBlockShape().
- + *
- + * \param f - Kernel to launch
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_LAUNCH_FAILED,
- + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
- + * ::CUDA_ERROR_LAUNCH_TIMEOUT,
- + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
- + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
- + * \notefnerr
- + *
- + * \sa ::cuFuncSetBlockShape,
- + * ::cuFuncSetSharedSize,
- + * ::cuFuncGetAttribute,
- + * ::cuParamSetSize,
- + * ::cuParamSetf,
- + * ::cuParamSeti,
- + * ::cuParamSetv,
- + * ::cuLaunchGrid,
- + * ::cuLaunchGridAsync,
- + * ::cuLaunchKernel
- + */
- +CUresult CUDAAPI cuLaunch(CUfunction f);
- +
- +/**
- + * \brief Launches a CUDA function
- + *
- + * \deprecated
- + *
- + * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of
- + * blocks. Each block contains the number of threads specified by a previous
- + * call to ::cuFuncSetBlockShape().
- + *
- + * \param f - Kernel to launch
- + * \param grid_width - Width of grid in blocks
- + * \param grid_height - Height of grid in blocks
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_LAUNCH_FAILED,
- + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
- + * ::CUDA_ERROR_LAUNCH_TIMEOUT,
- + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
- + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
- + * \notefnerr
- + *
- + * \sa ::cuFuncSetBlockShape,
- + * ::cuFuncSetSharedSize,
- + * ::cuFuncGetAttribute,
- + * ::cuParamSetSize,
- + * ::cuParamSetf,
- + * ::cuParamSeti,
- + * ::cuParamSetv,
- + * ::cuLaunch,
- + * ::cuLaunchGridAsync,
- + * ::cuLaunchKernel
- + */
- +CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height);
- +
- +/**
- + * \brief Launches a CUDA function
- + *
- + * \deprecated
- + *
- + * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of
- + * blocks. Each block contains the number of threads specified by a previous
- + * call to ::cuFuncSetBlockShape().
- + *
- + * ::cuLaunchGridAsync() can optionally be associated with a stream by passing a
- + * non-zero \p hStream argument.
- + *
- + * \param f - Kernel to launch
- + * \param grid_width - Width of grid in blocks
- + * \param grid_height - Height of grid in blocks
- + * \param hStream - Stream identifier
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_HANDLE,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_LAUNCH_FAILED,
- + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
- + * ::CUDA_ERROR_LAUNCH_TIMEOUT,
- + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
- + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
- + * \notefnerr
- + *
- + * \sa ::cuFuncSetBlockShape,
- + * ::cuFuncSetSharedSize,
- + * ::cuFuncGetAttribute,
- + * ::cuParamSetSize,
- + * ::cuParamSetf,
- + * ::cuParamSeti,
- + * ::cuParamSetv,
- + * ::cuLaunch,
- + * ::cuLaunchGrid,
- + * ::cuLaunchKernel
- + */
- +CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream);
- +
- +
- +/**
- + * \brief Adds a texture-reference to the function's argument list
- + *
- + * \deprecated
- + *
- + * Makes the CUDA array or linear memory bound to the texture reference
- + * \p hTexRef available to a device program as a texture. In this version of
- + * CUDA, the texture-reference must be obtained via ::cuModuleGetTexRef() and
- + * the \p texunit parameter must be set to ::CU_PARAM_TR_DEFAULT.
- + *
- + * \param hfunc - Kernel to add texture-reference to
- + * \param texunit - Texture unit (must be ::CU_PARAM_TR_DEFAULT)
- + * \param hTexRef - Texture-reference to add to argument list
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + */
- +CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);
- +/** @} */ /* END CUDA_EXEC_DEPRECATED */
- +
- +/** @} */ /* END CUDA_EXEC */
- +
- +
- +/**
- + * \defgroup CUDA_TEXREF Texture Reference Management
- + *
- + * This section describes the texture reference management functions of the
- + * low-level CUDA driver application programming interface.
- + *
- + * @{
- + */
- +
- +/**
- + * \brief Binds an array as a texture reference
- + *
- + * Binds the CUDA array \p hArray to the texture reference \p hTexRef. Any
- + * previous address or CUDA array state associated with the texture reference
- + * is superseded by this function. \p Flags must be set to
- + * ::CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to \p hTexRef is
- + * unbound.
- + *
- + * \param hTexRef - Texture reference to bind
- + * \param hArray - Array to bind
- + * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT)
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + *
- + * \sa ::cuTexRefSetAddress,
- + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode,
- + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- + */
- +CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
- +
- +#if __CUDA_API_VERSION >= 3020
- +/**
- + * \brief Binds an address as a texture reference
- + *
- + * Binds a linear address range to the texture reference \p hTexRef. Any
- + * previous address or CUDA array state associated with the texture reference
- + * is superseded by this function. Any memory previously bound to \p hTexRef
- + * is unbound.
- + *
- + * Since the hardware enforces an alignment requirement on texture base
- + * addresses, ::cuTexRefSetAddress() passes back a byte offset in
- + * \p *ByteOffset that must be applied to texture fetches in order to read from
- + * the desired memory. This offset must be divided by the texel size and
- + * passed to kernels that read from the texture so that it can be applied to the
- + * ::tex1Dfetch() function.
- + *
- + * If the device memory pointer was returned from ::cuMemAlloc(), the offset
- + * is guaranteed to be 0 and NULL may be passed as the \p ByteOffset parameter.
- + *
- + * The total number of elements (or texels) in the linear address range
- + * cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH.
- + * The number of elements is computed as (\p bytes / bytesPerElement),
- + * where bytesPerElement is determined from the data format and number of
- + * components set using ::cuTexRefSetFormat().
- + *
- + * \param ByteOffset - Returned byte offset
- + * \param hTexRef - Texture reference to bind
- + * \param dptr - Device pointer to bind
- + * \param bytes - Size of memory to bind in bytes
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + *
- + * \sa ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- + */
- +CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
- +
- +/**
- + * \brief Binds an address as a 2D texture reference
- + *
- + * Binds a linear address range to the texture reference \p hTexRef. Any
- + * previous address or CUDA array state associated with the texture reference
- + * is superseded by this function. Any memory previously bound to \p hTexRef
- + * is unbound.
- + *
- + * Using a ::tex2D() function inside a kernel requires a call to either
- + * ::cuTexRefSetArray() to bind the corresponding texture reference to an
- + * array, or ::cuTexRefSetAddress2D() to bind the texture reference to linear
- + * memory.
- + *
- + * Function calls to ::cuTexRefSetFormat() cannot follow calls to
- + * ::cuTexRefSetAddress2D() for the same texture reference.
- + *
- + * It is required that \p dptr be aligned to the appropriate hardware-specific
- + * texture alignment. You can query this value using the device attribute
- + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned \p dptr is
- + * supplied, ::CUDA_ERROR_INVALID_VALUE is returned.
- + *
- + * \p Pitch has to be aligned to the hardware-specific texture pitch alignment.
- + * This value can be queried using the device attribute
- + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. If an unaligned \p Pitch is
- + * supplied, ::CUDA_ERROR_INVALID_VALUE is returned.
- + *
- + * Width and Height, which are specified in elements (or texels), cannot exceed
- + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and
- + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively.
- + * \p Pitch, which is specified in bytes, cannot exceed
- + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH.
- + *
- + * \param hTexRef - Texture reference to bind
- + * \param desc - Descriptor of CUDA array
- + * \param dptr - Device pointer to bind
- + * \param Pitch - Line pitch in bytes
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + *
- + * \sa ::cuTexRefSetAddress,
- + * ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- + */
- +CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
- +#endif /* __CUDA_API_VERSION >= 3020 */
- +
- +/**
- + * \brief Sets the format for a texture reference
- + *
- + * Specifies the format of the data to be read by the texture reference
- + * \p hTexRef. \p fmt and \p NumPackedComponents are exactly analogous to the
- + * ::Format and ::NumChannels members of the ::CUDA_ARRAY_DESCRIPTOR structure:
- + * They specify the format of each component and the number of components per
- + * array element.
- + *
- + * \param hTexRef - Texture reference
- + * \param fmt - Format to set
- + * \param NumPackedComponents - Number of components per array element
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + *
- + * \sa ::cuTexRefSetAddress,
- + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags,
- + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- + */
- +CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
- +
- +/**
- + * \brief Sets the addressing mode for a texture reference
- + *
- + * Specifies the addressing mode \p am for the given dimension \p dim of the
- + * texture reference \p hTexRef. If \p dim is zero, the addressing mode is
- + * applied to the first parameter of the functions used to fetch from the
- + * texture; if \p dim is 1, the second, and so on. ::CUaddress_mode is defined
- + * as:
- + * \code
- + typedef enum CUaddress_mode_enum {
- + CU_TR_ADDRESS_MODE_WRAP = 0,
- + CU_TR_ADDRESS_MODE_CLAMP = 1,
- + CU_TR_ADDRESS_MODE_MIRROR = 2,
- + CU_TR_ADDRESS_MODE_BORDER = 3
- + } CUaddress_mode;
- + * \endcode
- + *
- + * Note that this call has no effect if \p hTexRef is bound to linear memory.
- + * Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES, is not set, the only
- + * supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP.
- + *
- + * \param hTexRef - Texture reference
- + * \param dim - Dimension
- + * \param am - Addressing mode to set
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + *
- + * \sa ::cuTexRefSetAddress,
- + * ::cuTexRefSetAddress2D, ::cuTexRefSetArray,
- + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- + */
- +CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am);
- +
- +/**
- + * \brief Sets the filtering mode for a texture reference
- + *
- + * Specifies the filtering mode \p fm to be used when reading memory through
- + * the texture reference \p hTexRef. ::CUfilter_mode is defined as:
- + *
- + * \code
- + typedef enum CUfilter_mode_enum {
- + CU_TR_FILTER_MODE_POINT = 0,
- + CU_TR_FILTER_MODE_LINEAR = 1
- + } CUfilter_mode;
- + * \endcode
- + *
- + * Note that this call has no effect if \p hTexRef is bound to linear memory.
- + *
- + * \param hTexRef - Texture reference
- + * \param fm - Filtering mode to set
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + *
- + * \sa ::cuTexRefSetAddress,
- + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- + * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- + */
- +CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm);
- +
- +/**
- + * \brief Sets the flags for a texture reference
- + *
- + * Specifies optional flags via \p Flags to specify the behavior of data
- + * returned through the texture reference \p hTexRef. The valid flags are:
- + *
- + * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of
- + * having the texture promote integer data to floating point data in the
- + * range [0, 1]. Note that textures with 32-bit integer format
- + * are not promoted, regardless of whether or not this
- + * flag is specified;
- + * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the
- + * default behavior of having the texture coordinates range
- + * from [0, Dim) where Dim is the width or height of the CUDA
- + * array. Instead, the texture coordinates [0, 1.0) reference
- + * the entire breadth of the array dimension;
- + *
- + * \param hTexRef - Texture reference
- + * \param Flags - Optional flags to set
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + *
- + * \sa ::cuTexRefSetAddress,
- + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- + * ::cuTexRefSetFilterMode, ::cuTexRefSetFormat,
- + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- + */
- +CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags);
- +
- +#if __CUDA_API_VERSION >= 3020
- +/**
- + * \brief Gets the address associated with a texture reference
- + *
- + * Returns in \p *pdptr the base address bound to the texture reference
- + * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
- + * is not bound to any device memory range.
- + *
- + * \param pdptr - Returned device address
- + * \param hTexRef - Texture reference
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + *
- + * \sa ::cuTexRefSetAddress,
- + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- + * ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- + */
- +CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
- +#endif /* __CUDA_API_VERSION >= 3020 */
- +
- +/**
- + * \brief Gets the array bound to a texture reference
- + *
- + * Returns in \p *phArray the CUDA array bound to the texture reference
- + * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
- + * is not bound to any CUDA array.
- + *
- + * \param phArray - Returned array
- + * \param hTexRef - Texture reference
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + *
- + * \sa ::cuTexRefSetAddress,
- + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode,
- + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- + */
- +CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef);
- +
- +/**
- + * \brief Gets the addressing mode used by a texture reference
- + *
- + * Returns in \p *pam the addressing mode corresponding to the
- + * dimension \p dim of the texture reference \p hTexRef. Currently, the only
- + * valid values for \p dim are 0 and 1.
- + *
- + * \param pam - Returned addressing mode
- + * \param hTexRef - Texture reference
- + * \param dim - Dimension
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + *
- + * \sa ::cuTexRefSetAddress,
- + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- + * ::cuTexRefGetAddress, ::cuTexRefGetArray,
- + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- + */
- +CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
- +
- +/**
- + * \brief Gets the filter-mode used by a texture reference
- + *
- + * Returns in \p *pfm the filtering mode of the texture reference
- + * \p hTexRef.
- + *
- + * \param pfm - Returned filtering mode
- + * \param hTexRef - Texture reference
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + *
- + * \sa ::cuTexRefSetAddress,
- + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- + * ::cuTexRefGetFlags, ::cuTexRefGetFormat
- + */
- +CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
- +
- +/**
- + * \brief Gets the format used by a texture reference
- + *
- + * Returns in \p *pFormat and \p *pNumChannels the format and number
- + * of components of the CUDA array bound to the texture reference \p hTexRef.
- + * If \p pFormat or \p pNumChannels is NULL, it will be ignored.
- + *
- + * \param pFormat - Returned format
- + * \param pNumChannels - Returned number of components
- + * \param hTexRef - Texture reference
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + *
- + * \sa ::cuTexRefSetAddress,
- + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags
- + */
- +CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
- +
- +/**
- + * \brief Gets the flags used by a texture reference
- + *
- + * Returns in \p *pFlags the flags of the texture reference \p hTexRef.
- + *
- + * \param pFlags - Returned flags
- + * \param hTexRef - Texture reference
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + *
- + * \sa ::cuTexRefSetAddress,
- + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- + * ::cuTexRefGetFilterMode, ::cuTexRefGetFormat
- + */
- +CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef);
- +
- +/**
- + * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED]
- + *
- + * This section describes the deprecated texture reference management
- + * functions of the low-level CUDA driver application programming interface.
- + *
- + * @{
- + */
- +
- +/**
- + * \brief Creates a texture reference
- + *
- + * \deprecated
- + *
- + * Creates a texture reference and returns its handle in \p *pTexRef. Once
- + * created, the application must call ::cuTexRefSetArray() or
- + * ::cuTexRefSetAddress() to associate the reference with allocated memory.
- + * Other texture reference functions are used to specify the format and
- + * interpretation (addressing, filtering, etc.) to be used when the memory is
- + * read through this texture reference.
- + *
- + * \param pTexRef - Returned texture reference
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + *
- + * \sa ::cuTexRefDestroy
- + */
- +CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef);
- +
- +/**
- + * \brief Destroys a texture reference
- + *
- + * \deprecated
- + *
- + * Destroys the texture reference specified by \p hTexRef.
- + *
- + * \param hTexRef - Texture reference to destroy
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + *
- + * \sa ::cuTexRefCreate
- + */
- +CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef);
- +
- +/** @} */ /* END CUDA_TEXREF_DEPRECATED */
- +
- +/** @} */ /* END CUDA_TEXREF */
- +
- +
- +/**
- + * \defgroup CUDA_SURFREF Surface Reference Management
- + *
- + * This section describes the surface reference management functions of the
- + * low-level CUDA driver application programming interface.
- + *
- + * @{
- + */
- +
- +/**
- + * \brief Sets the CUDA array for a surface reference.
- + *
- + * Sets the CUDA array \p hArray to be read and written by the surface reference
- + * \p hSurfRef. Any previous CUDA array state associated with the surface
- + * reference is superseded by this function. \p Flags must be set to 0.
- + * The ::CUDA_ARRAY3D_SURFACE_LDST flag must have been set for the CUDA array.
- + * Any CUDA array previously bound to \p hSurfRef is unbound.
- + *
- + * \param hSurfRef - Surface reference handle
- + * \param hArray - CUDA array handle
- + * \param Flags - set to 0
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + *
- + * \sa ::cuModuleGetSurfRef, ::cuSurfRefGetArray
- + */
- +CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
- +
- +/**
- + * \brief Passes back the CUDA array bound to a surface reference.
- + *
- + * Returns in \p *phArray the CUDA array bound to the surface reference
- + * \p hSurfRef, or returns ::CUDA_ERROR_INVALID_VALUE if the surface reference
- + * is not bound to any CUDA array.
- + *
- + * \param phArray - Returned CUDA array handle
- + * \param hSurfRef - Surface reference handle
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + *
- + * \sa ::cuModuleGetSurfRef, ::cuSurfRefSetArray
- + */
- +CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);
- +
- +/** @} */ /* END CUDA_SURFREF */
- +
- +#if __CUDA_API_VERSION >= 4000
- +/**
- + * \defgroup CUDA_PEER_ACCESS Peer Context Memory Access
- + *
- + * This section describes the direct peer context memory access functions
- + * of the low-level CUDA driver application programming interface.
- + *
- + * @{
- + */
- +
- +/**
- + * \brief Queries if a device may directly access a peer device's memory.
- + *
- + * Returns in \p *canAccessPeer a value of 1 if contexts on \p dev are capable of
- + * directly accessing memory from contexts on \p peerDev and 0 otherwise.
- + * If direct access of \p peerDev from \p dev is possible, then access may be
- + * enabled on two specific contexts by calling ::cuCtxEnablePeerAccess().
- + *
- + * \param canAccessPeer - Returned access capability
- + * \param dev - Device from which allocations on \p peerDev are to
- + * be directly accessed.
- + * \param peerDev - Device on which the allocations to be directly accessed
- + * by \p dev reside.
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_DEVICE
- + * \notefnerr
- + *
- + * \sa ::cuCtxEnablePeerAccess,
- + * ::cuCtxDisablePeerAccess
- + */
- +CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev);
- +
- +/**
- + * \brief Enables direct access to memory allocations in a peer context.
- + *
- + * If both the current context and \p peerContext are on devices which support unified
- + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING), then
- + * on success all allocations from \p peerContext will immediately be accessible
- + * by the current context. See \ref CUDA_UNIFIED for additional
- + * details.
- + *
- + * Note that access granted by this call is unidirectional and that in order to access
- + * memory from the current context in \p peerContext, a separate symmetric call
- + * to ::cuCtxEnablePeerAccess() is required.
- + *
- + * Returns ::CUDA_ERROR_INVALID_DEVICE if ::cuDeviceCanAccessPeer() indicates
- + * that the ::CUdevice of the current context cannot directly access memory
- + * from the ::CUdevice of \p peerContext.
- + *
- + * Returns ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED if direct access of
- + * \p peerContext from the current context has already been enabled.
- + *
- + * Returns ::CUDA_ERROR_TOO_MANY_PEERS if direct peer access is not possible
- + * because hardware resources required for peer access have been exhausted.
- + *
- + * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, \p peerContext
- + * is not a valid context, or if the current context is \p peerContext.
- + *
- + * Returns ::CUDA_ERROR_INVALID_VALUE if \p Flags is not 0.
- + *
- + * \param peerContext - Peer context to enable direct access to from the current context
- + * \param Flags - Reserved for future use and must be set to 0
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_DEVICE,
- + * ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED,
- + * ::CUDA_ERROR_TOO_MANY_PEERS,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE
- + * \notefnerr
- + *
- + * \sa ::cuDeviceCanAccessPeer,
- + * ::cuCtxDisablePeerAccess
- + */
- +CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags);
- +
- +/**
- + * \brief Disables direct access to memory allocations in a peer context and
- + * unregisters any registered allocations.
- + *
- + * Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has
- + * not yet been enabled from \p peerContext to the current context.
- + *
- + * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, or if
- + * \p peerContext is not a valid context.
- + *
- + * \param peerContext - Peer context to disable direct access to
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED,
- + * ::CUDA_ERROR_INVALID_CONTEXT
- + * \notefnerr
- + *
- + * \sa ::cuDeviceCanAccessPeer,
- + * ::cuCtxEnablePeerAccess
- + */
- +CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext);
- +
- +/** @} */ /* END CUDA_PEER_ACCESS */
- +#endif /* __CUDA_API_VERSION >= 4000 */
- +
- +/**
- + * \defgroup CUDA_GRAPHICS Graphics Interoperability
- + *
- + * This section describes the graphics interoperability functions of the
- + * low-level CUDA driver application programming interface.
- + *
- + * @{
- + */
- +
- +/**
- + * \brief Unregisters a graphics resource for access by CUDA
- + *
- + * Unregisters the graphics resource \p resource so it is not accessible by
- + * CUDA unless registered again.
- + *
- + * If \p resource is invalid then ::CUDA_ERROR_INVALID_HANDLE is
- + * returned.
- + *
- + * \param resource - Resource to unregister
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_HANDLE,
- + * ::CUDA_ERROR_UNKNOWN
- + * \notefnerr
- + *
- + * \sa
- + * ::cuGraphicsD3D9RegisterResource,
- + * ::cuGraphicsD3D10RegisterResource,
- + * ::cuGraphicsD3D11RegisterResource,
- + * ::cuGraphicsGLRegisterBuffer,
- + * ::cuGraphicsGLRegisterImage
- + */
- +CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource);
- +
- +/**
- + * \brief Get an array through which to access a subresource of a mapped graphics resource.
- + *
- + * Returns in \p *pArray an array through which the subresource of the mapped
- + * graphics resource \p resource which corresponds to array index \p arrayIndex
- + * and mipmap level \p mipLevel may be accessed. The value set in \p *pArray may
- + * change every time that \p resource is mapped.
- + *
- + * If \p resource is not a texture then it cannot be accessed via an array and
- + * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned.
- + * If \p arrayIndex is not a valid array index for \p resource then
- + * ::CUDA_ERROR_INVALID_VALUE is returned.
- + * If \p mipLevel is not a valid mipmap level for \p resource then
- + * ::CUDA_ERROR_INVALID_VALUE is returned.
- + * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
- + *
- + * \param pArray - Returned array through which a subresource of \p resource may be accessed
- + * \param resource - Mapped resource to access
- + * \param arrayIndex - Array index for array textures or cubemap face
- + * index as defined by ::CUarray_cubemap_face for
- + * cubemap textures for the subresource to access
- + * \param mipLevel - Mipmap level for the subresource to access
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_INVALID_HANDLE,
- + * ::CUDA_ERROR_NOT_MAPPED,
- + * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY
- + * \notefnerr
- + *
- + * \sa ::cuGraphicsResourceGetMappedPointer
- + */
- +CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
- +
- +#if __CUDA_API_VERSION >= 3020
- +/**
- + * \brief Get a device pointer through which to access a mapped graphics resource.
- + *
- + * Returns in \p *pDevPtr a pointer through which the mapped graphics resource
- + * \p resource may be accessed.
- + * Returns in \p *pSize the size of the memory in bytes which may be accessed from that pointer.
- + * The value set in \p *pDevPtr may change every time that \p resource is mapped.
- + *
- + * If \p resource is not a buffer then it cannot be accessed via a pointer and
- + * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER is returned.
- + * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
- + *
- + * \param pDevPtr - Returned pointer through which \p resource may be accessed
- + * \param pSize - Returned size of the buffer accessible starting at \p *pDevPtr
- + * \param resource - Mapped resource to access
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_INVALID_HANDLE,
- + * ::CUDA_ERROR_NOT_MAPPED,
- + * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER
- + * \notefnerr
- + *
- + * \sa
- + * ::cuGraphicsMapResources,
- + * ::cuGraphicsSubResourceGetMappedArray
- + */
- +CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource);
- +#endif /* __CUDA_API_VERSION >= 3020 */
- +
- +/**
- + * \brief Set usage flags for mapping a graphics resource
- + *
- + * Set \p flags for mapping the graphics resource \p resource.
- + *
- + * Changes to \p flags will take effect the next time \p resource is mapped.
- + * The \p flags argument may be any of the following:
- + *
- + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
- + * resource will be used. It is therefore assumed that this resource will be
- + * read from and written to by CUDA kernels. This is the default value.
- + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA kernels which
- + * access this resource will not write to this resource.
- + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that CUDA kernels
- + * which access this resource will not read from this resource and will
- + * write over the entire contents of the resource, so none of the data
- + * previously stored in the resource will be preserved.
- + *
- + * If \p resource is presently mapped for access by CUDA then
- + * ::CUDA_ERROR_ALREADY_MAPPED is returned.
- + * If \p flags is not one of the above values then ::CUDA_ERROR_INVALID_VALUE is returned.
- + *
- + * \param resource - Registered resource to set flags for
- + * \param flags - Parameters for resource mapping
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_VALUE,
- + * ::CUDA_ERROR_INVALID_HANDLE,
- + * ::CUDA_ERROR_ALREADY_MAPPED
- + * \notefnerr
- + *
- + * \sa
- + * ::cuGraphicsMapResources
- + */
- +CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
- +
- +/**
- + * \brief Map graphics resources for access by CUDA
- + *
- + * Maps the \p count graphics resources in \p resources for access by CUDA.
- + *
- + * The resources in \p resources may be accessed by CUDA until they
- + * are unmapped. The graphics API from which \p resources were registered
- + * should not access any resources while they are mapped by CUDA. If an
- + * application does so, the results are undefined.
- + *
- + * This function provides the synchronization guarantee that any graphics calls
- + * issued before ::cuGraphicsMapResources() will complete before any subsequent CUDA
- + * work issued in \p hStream begins.
- + *
- + * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned.
- + * If any of \p resources are presently mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned.
- + *
- + * \param count - Number of resources to map
- + * \param resources - Resources to map for CUDA usage
- + * \param hStream - Stream with which to synchronize
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_HANDLE,
- + * ::CUDA_ERROR_ALREADY_MAPPED,
- + * ::CUDA_ERROR_UNKNOWN
- + * \notefnerr
- + *
- + * \sa
- + * ::cuGraphicsResourceGetMappedPointer,
- + * ::cuGraphicsSubResourceGetMappedArray,
- + * ::cuGraphicsUnmapResources
- + */
- +CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
- +
- +/**
- + * \brief Unmap graphics resources.
- + *
- + * Unmaps the \p count graphics resources in \p resources.
- + *
- + * Once unmapped, the resources in \p resources may not be accessed by CUDA
- + * until they are mapped again.
- + *
- + * This function provides the synchronization guarantee that any CUDA work issued
- + * in \p hStream before ::cuGraphicsUnmapResources() will complete before any
- + * subsequently issued graphics work begins.
- + *
- + *
- + * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned.
- + * If any of \p resources are not presently mapped for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned.
- + *
- + * \param count - Number of resources to unmap
- + * \param resources - Resources to unmap
- + * \param hStream - Stream with which to synchronize
- + *
- + * \return
- + * ::CUDA_SUCCESS,
- + * ::CUDA_ERROR_DEINITIALIZED,
- + * ::CUDA_ERROR_NOT_INITIALIZED,
- + * ::CUDA_ERROR_INVALID_CONTEXT,
- + * ::CUDA_ERROR_INVALID_HANDLE,
- + * ::CUDA_ERROR_NOT_MAPPED,
- + * ::CUDA_ERROR_UNKNOWN
- + * \notefnerr
- + *
- + * \sa
- + * ::cuGraphicsMapResources
- + */
- +CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
- +
- +/** @} */ /* END CUDA_GRAPHICS */
- +
- +CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId);
- +
- +
- +/** @} */ /* END CUDA_DRIVER */
- +
- +/**
- + * CUDA API versioning support
- + */
- +#if defined(__CUDA_API_VERSION_INTERNAL)
- + #undef cuDeviceTotalMem
- + #undef cuCtxCreate
- + #undef cuModuleGetGlobal
- + #undef cuMemGetInfo
- + #undef cuMemAlloc
- + #undef cuMemAllocPitch
- + #undef cuMemFree
- + #undef cuMemGetAddressRange
- + #undef cuMemAllocHost
- + #undef cuMemHostGetDevicePointer
- + #undef cuMemcpyHtoD
- + #undef cuMemcpyDtoH
- + #undef cuMemcpyDtoD
- + #undef cuMemcpyDtoA
- + #undef cuMemcpyAtoD
- + #undef cuMemcpyHtoA
- + #undef cuMemcpyAtoH
- + #undef cuMemcpyAtoA
- + #undef cuMemcpyHtoAAsync
- + #undef cuMemcpyAtoHAsync
- + #undef cuMemcpy2D
- + #undef cuMemcpy2DUnaligned
- + #undef cuMemcpy3D
- + #undef cuMemcpyHtoDAsync
- + #undef cuMemcpyDtoHAsync
- + #undef cuMemcpyDtoDAsync
- + #undef cuMemcpy2DAsync
- + #undef cuMemcpy3DAsync
- + #undef cuMemsetD8
- + #undef cuMemsetD16
- + #undef cuMemsetD32
- + #undef cuMemsetD2D8
- + #undef cuMemsetD2D16
- + #undef cuMemsetD2D32
- + #undef cuArrayCreate
- + #undef cuArrayGetDescriptor
- + #undef cuArray3DCreate
- + #undef cuArray3DGetDescriptor
- + #undef cuTexRefSetAddress
- + #undef cuTexRefSetAddress2D
- + #undef cuTexRefGetAddress
- + #undef cuGraphicsResourceGetMappedPointer
- + #undef cuCtxDestroy
- + #undef cuCtxPopCurrent
- + #undef cuCtxPushCurrent
- + #undef cuStreamDestroy
- + #undef cuEventDestroy
- +#endif /* __CUDA_API_VERSION_INTERNAL */
- +
- +#if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010)
- +CUresult CUDAAPI cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
- +#endif /* __CUDA_API_VERSION_INTERNAL || (__CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010) */
- +
- +/**
- + * CUDA API made obsolete at API version 3020
- + */
- +#if defined(__CUDA_API_VERSION_INTERNAL)
- + #define CUdeviceptr CUdeviceptr_v1
- + #define CUDA_MEMCPY2D_st CUDA_MEMCPY2D_v1_st
- + #define CUDA_MEMCPY2D CUDA_MEMCPY2D_v1
- + #define CUDA_MEMCPY3D_st CUDA_MEMCPY3D_v1_st
- + #define CUDA_MEMCPY3D CUDA_MEMCPY3D_v1
- + #define CUDA_ARRAY_DESCRIPTOR_st CUDA_ARRAY_DESCRIPTOR_v1_st
- + #define CUDA_ARRAY_DESCRIPTOR CUDA_ARRAY_DESCRIPTOR_v1
- + #define CUDA_ARRAY3D_DESCRIPTOR_st CUDA_ARRAY3D_DESCRIPTOR_v1_st
- + #define CUDA_ARRAY3D_DESCRIPTOR CUDA_ARRAY3D_DESCRIPTOR_v1
- +#endif /* __CUDA_API_VERSION_INTERNAL */
- +
- +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020
- +
- +typedef unsigned int CUdeviceptr;
- +/** Legacy 2D memory copy parameters; offsets, pitches and extents are unsigned int. */
- +typedef struct CUDA_MEMCPY2D_st
- +{
- + unsigned int srcXInBytes; /**< Source X in bytes */
- + unsigned int srcY; /**< Source Y */
- + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
- + const void *srcHost; /**< Source host pointer */
- + CUdeviceptr srcDevice; /**< Source device pointer */
- + CUarray srcArray; /**< Source array reference */
- + unsigned int srcPitch; /**< Source pitch (ignored when src is array) */
- + /* Destination description */
- + unsigned int dstXInBytes; /**< Destination X in bytes */
- + unsigned int dstY; /**< Destination Y */
- + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
- + void *dstHost; /**< Destination host pointer */
- + CUdeviceptr dstDevice; /**< Destination device pointer */
- + CUarray dstArray; /**< Destination array reference */
- + unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */
- + /* Extent of the copy */
- + unsigned int WidthInBytes; /**< Width of 2D memory copy in bytes */
- + unsigned int Height; /**< Height of 2D memory copy */
- +} CUDA_MEMCPY2D;
- +
- +typedef struct CUDA_MEMCPY3D_st
- +{
- + unsigned int srcXInBytes; /**< Source X in bytes */
- + unsigned int srcY; /**< Source Y */
- + unsigned int srcZ; /**< Source Z */
- + unsigned int srcLOD; /**< Source LOD */
- + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
- + const void *srcHost; /**< Source host pointer */
- + CUdeviceptr srcDevice; /**< Source device pointer */
- + CUarray srcArray; /**< Source array reference */
- + void *reserved0; /**< Must be NULL */
- + unsigned int srcPitch; /**< Source pitch (ignored when src is array) */
- + unsigned int srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */
- +
- + unsigned int dstXInBytes; /**< Destination X in bytes */
- + unsigned int dstY; /**< Destination Y */
- + unsigned int dstZ; /**< Destination Z */
- + unsigned int dstLOD; /**< Destination LOD */
- + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
- + void *dstHost; /**< Destination host pointer */
- + CUdeviceptr dstDevice; /**< Destination device pointer */
- + CUarray dstArray; /**< Destination array reference */
- + void *reserved1; /**< Must be NULL */
- + unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */
- + unsigned int dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
- +
- + unsigned int WidthInBytes; /**< Width of 3D memory copy in bytes */
- + unsigned int Height; /**< Height of 3D memory copy */
- + unsigned int Depth; /**< Depth of 3D memory copy */
- +} CUDA_MEMCPY3D;
- +
- +typedef struct CUDA_ARRAY_DESCRIPTOR_st
- +{
- + unsigned int Width; /**< Width of array */
- + unsigned int Height; /**< Height of array */
- +
- + CUarray_format Format; /**< Array format */
- + unsigned int NumChannels; /**< Channels per array element */
- +} CUDA_ARRAY_DESCRIPTOR;
- +
- +typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
- +{
- + unsigned int Width; /**< Width of 3D array */
- + unsigned int Height; /**< Height of 3D array */
- + unsigned int Depth; /**< Depth of 3D array */
- +
- + CUarray_format Format; /**< Array format */
- + unsigned int NumChannels; /**< Channels per array element */
- + unsigned int Flags; /**< Flags */
- +} CUDA_ARRAY3D_DESCRIPTOR;
- +
- +CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev);
- +CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
- +CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
- +CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total);
- +CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, unsigned int bytesize);
- +CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes);
- +CUresult CUDAAPI cuMemFree(CUdeviceptr dptr);
- +CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr);
- +CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize);
- +CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags);
- +CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount);
- +CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount);
- +CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount);
- +CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount);
- +CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
- +CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
- +CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
- +CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
- +CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream);
- +CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream);
- +CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy);
- +CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);
- +CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy);
- +CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream);
- +CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
- +CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
- +CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
- +CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
- +CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, unsigned int N);
- +CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, unsigned int N);
- +CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, unsigned int N);
- +CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
- +CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
- +CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
- +CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
- +CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
- +CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
- +CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
- +CUresult CUDAAPI cuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, unsigned int bytes);
- +CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, unsigned int Pitch);
- +CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
- +CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
- +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION < 3020 */
- +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 4000
- +CUresult CUDAAPI cuCtxDestroy(CUcontext ctx);
- +CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx);
- +CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx);
- +CUresult CUDAAPI cuStreamDestroy(CUstream hStream);
- +CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
- +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION < 4000 */
- +
- +#if defined(__CUDA_API_VERSION_INTERNAL)
- + #undef CUdeviceptr
- + #undef CUDA_MEMCPY2D_st
- + #undef CUDA_MEMCPY2D
- + #undef CUDA_MEMCPY3D_st
- + #undef CUDA_MEMCPY3D
- + #undef CUDA_ARRAY_DESCRIPTOR_st
- + #undef CUDA_ARRAY_DESCRIPTOR
- + #undef CUDA_ARRAY3D_DESCRIPTOR_st
- + #undef CUDA_ARRAY3D_DESCRIPTOR
- +#endif /* __CUDA_API_VERSION_INTERNAL */
- +
- +#ifdef __cplusplus
- +}
- +#endif
- +
- +#undef __CUDA_API_VERSION
- +
- +#endif /* __cuda_cuda_h__ */
- diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/cuda_dynlink.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/cuda_dynlink.h
- new file mode 100644
- index 0000000..24434ee
- --- /dev/null
- +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/cuda_dynlink.h
- @@ -0,0 +1,73 @@
- +/*
- + * Copyright (C) 2011 Hendrik Leppkes
- + * http://www.1f0.de
- + *
- + * This Program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License as published by
- + * the Free Software Foundation; either version 2, or (at your option)
- + * any later version.
- + *
- + * This Program is distributed in the hope that it will be useful,
- + * but WITHOUT ANY WARRANTY; without even the implied warranty of
- + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- + * GNU General Public License for more details.
- + *
- + * You should have received a copy of the GNU General Public License
- + * along with this program; see the file COPYING. If not, write to
- + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
- + * http://www.gnu.org/copyleft/gpl.html
- + *
- + * Assembled from parts of the NVIDIA CUDA SDK, Copyright by NVIDIA, All rights reserved.
- + */
- +
- +#pragma once
- +#include "cuda.h"
- +////////////////////////////////////////////////////
- +/// CUDA functions: one function-type typedef per entry point, named 't' + the API name. Size arguments are `unsigned int`, matching the legacy (pre-3020) prototypes declared in cuda.h. NOTE(review): confirm these match the entry points actually resolved at run time.
- +////////////////////////////////////////////////////
- +typedef CUresult CUDAAPI tcuInit(unsigned int Flags);
- +typedef CUresult CUDAAPI tcuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev );
- +typedef CUresult CUDAAPI tcuCtxDestroy( CUcontext ctx );
- +typedef CUresult CUDAAPI tcuCtxPushCurrent( CUcontext ctx );
- +typedef CUresult CUDAAPI tcuCtxPopCurrent( CUcontext *pctx );
- +typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, unsigned int bytesize);
- +typedef CUresult CUDAAPI tcuMemFreeHost(void *p);
- +typedef CUresult CUDAAPI tcuMemcpyDtoH (void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount );
- +typedef CUresult CUDAAPI tcuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
- +typedef CUresult CUDAAPI tcuStreamCreate(CUstream *phStream, unsigned int Flags);
- +typedef CUresult CUDAAPI tcuStreamDestroy(CUstream hStream);
- +typedef CUresult CUDAAPI tcuStreamQuery(CUstream hStream);
- +typedef CUresult CUDAAPI tcuDeviceGetCount(int *count);
- +typedef CUresult CUDAAPI tcuDriverGetVersion(int *driverVersion);
- +typedef CUresult CUDAAPI tcuDeviceGetName(char *name, int len, CUdevice dev);
- +typedef CUresult CUDAAPI tcuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
- +typedef CUresult CUDAAPI tcuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
- +
- +////////////////////////////////////////////////////
- +/// D3D Interop: function type for cuD3D9CtxCreate. NOTE(review): this header only includes cuda.h, so IDirect3DDevice9 presumably has to be declared by an earlier include (e.g. d3d9.h) - confirm.
- +////////////////////////////////////////////////////
- +typedef CUresult CUDAAPI tcuD3D9CtxCreate( CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, IDirect3DDevice9 *pD3DDevice );
- +
- +////////////////////////////////////////////////////
- +/// CUVID functions: function-type typedefs mirroring the cuvid* prototypes (context lock, video parser, decoder, frame mapping)
- +////////////////////////////////////////////////////
- +typedef CUresult CUDAAPI tcuvidCtxLockCreate(CUvideoctxlock *pLock, CUcontext ctx);
- +typedef CUresult CUDAAPI tcuvidCtxLockDestroy(CUvideoctxlock lck);
- +typedef CUresult CUDAAPI tcuvidCtxLock(CUvideoctxlock lck, unsigned int reserved_flags);
- +typedef CUresult CUDAAPI tcuvidCtxUnlock(CUvideoctxlock lck, unsigned int reserved_flags);
- +
- +typedef CUresult CUDAAPI tcuvidCreateVideoParser(CUvideoparser *pObj, CUVIDPARSERPARAMS *pParams);
- +typedef CUresult CUDAAPI tcuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET *pPacket);
- +typedef CUresult CUDAAPI tcuvidDestroyVideoParser(CUvideoparser obj);
- +
- +// Create/Destroy the decoder object
- +typedef CUresult CUDAAPI tcuvidCreateDecoder(CUvideodecoder *phDecoder, CUVIDDECODECREATEINFO *pdci);
- +typedef CUresult CUDAAPI tcuvidDestroyDecoder(CUvideodecoder hDecoder);
- +
- +// Decode a single picture (field or frame)
- +typedef CUresult CUDAAPI tcuvidDecodePicture(CUvideodecoder hDecoder, CUVIDPICPARAMS *pPicParams);
- +
- +// Post-process and map a video frame for use in cuda. This is the 32-bit device pointer form; cuviddec.h also declares cuvidMapVideoFrame64 (unsigned long long *pDevPtr) on x86-64. NOTE(review): confirm which symbol is resolved on 64-bit builds.
- +typedef CUresult CUDAAPI tcuvidMapVideoFrame(CUvideodecoder hDecoder, int nPicIdx, unsigned int *pDevPtr, unsigned int *pPitch, CUVIDPROCPARAMS *pVPP);
- +// Unmap a previously mapped video frame (32-bit device pointer form, the counterpart of tcuvidMapVideoFrame)
- +typedef CUresult CUDAAPI tcuvidUnmapVideoFrame(CUvideodecoder hDecoder, unsigned int DevPtr);
- diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/cuviddec.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/cuviddec.h
- new file mode 100644
- index 0000000..4c2674e
- --- /dev/null
- +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/cuviddec.h
- @@ -0,0 +1,523 @@
- +/*
- + * Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
- + *
- + * NOTICE TO USER:
- + *
- + * This source code is subject to NVIDIA ownership rights under U.S. and
- + * international Copyright laws. Users and possessors of this source code
- + * are hereby granted a nonexclusive, royalty-free license to use this code
- + * in individual and commercial software.
- + *
- + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
- + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
- + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
- + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
- + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
- + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
- + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
- + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
- + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
- + * OR PERFORMANCE OF THIS SOURCE CODE.
- + *
- + * U.S. Government End Users. This source code is a "commercial item" as
- + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
- + * "commercial computer software" and "commercial computer software
- + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
- + * and is provided to the U.S. Government only as a commercial end item.
- + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
- + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
- + * source code with only those rights set forth herein.
- + *
- + * Any use of this source code in individual and commercial software must
- + * include, in the user documentation and internal comments to the code,
- + * the above Disclaimer and U.S. Government End Users Notice.
- + */
- +
- +#if !defined(__CUDA_VIDEO_H__)
- +#define __CUDA_VIDEO_H__
- +
- +#ifndef __cuda_cuda_h__
- +#include <cuda.h>
- +#endif // __cuda_cuda_h__
- +
- +#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
- +#if (CUDA_VERSION >= 3020) && (!defined(CUDA_FORCE_API_VERSION) || (CUDA_FORCE_API_VERSION >= 3020))
- +#define __CUVID_DEVPTR64
- +#endif
- +#endif
- +
- +#if defined(__cplusplus)
- +extern "C" {
- +#endif /* __cplusplus */
- +
- +typedef void *CUvideodecoder;
- +typedef struct _CUcontextlock_st *CUvideoctxlock;
- +
- +typedef enum cudaVideoCodec_enum {
- + cudaVideoCodec_MPEG1=0,
- + cudaVideoCodec_MPEG2,
- + cudaVideoCodec_MPEG4,
- + cudaVideoCodec_VC1,
- + cudaVideoCodec_H264,
- + cudaVideoCodec_JPEG,
- + cudaVideoCodec_H264_SVC,
- + cudaVideoCodec_H264_MVC,
- + cudaVideoCodec_NumCodecs,
- + // Uncompressed YUV
- + cudaVideoCodec_YUV420 = (('I'<<24)|('Y'<<16)|('U'<<8)|('V')), // Y,U,V (4:2:0)
- + cudaVideoCodec_YV12 = (('Y'<<24)|('V'<<16)|('1'<<8)|('2')), // Y,V,U (4:2:0)
- + cudaVideoCodec_NV12 = (('N'<<24)|('V'<<16)|('1'<<8)|('2')), // Y,UV (4:2:0)
- + cudaVideoCodec_YUYV = (('Y'<<24)|('U'<<16)|('Y'<<8)|('V')), // YUYV/YUY2 (4:2:2)
- + cudaVideoCodec_UYVY = (('U'<<24)|('Y'<<16)|('V'<<8)|('Y')), // UYVY (4:2:2)
- +} cudaVideoCodec;
- +
- +typedef enum cudaVideoSurfaceFormat_enum {
- + cudaVideoSurfaceFormat_NV12=0, // NV12 (currently the only supported output format)
- +} cudaVideoSurfaceFormat;
- +
- +typedef enum cudaVideoDeinterlaceMode_enum {
- + cudaVideoDeinterlaceMode_Weave=0, // Weave both fields (no deinterlacing)
- + cudaVideoDeinterlaceMode_Bob, // Drop one field
- + cudaVideoDeinterlaceMode_Adaptive, // Adaptive deinterlacing
- +} cudaVideoDeinterlaceMode;
- +
- +typedef enum cudaVideoChromaFormat_enum {
- + cudaVideoChromaFormat_Monochrome=0,
- + cudaVideoChromaFormat_420,
- + cudaVideoChromaFormat_422,
- + cudaVideoChromaFormat_444,
- +} cudaVideoChromaFormat;
- +
- +typedef enum cudaVideoCreateFlags_enum {
- + cudaVideoCreate_Default = 0x00, // Default operation mode: use dedicated video engines
- + cudaVideoCreate_PreferCUDA = 0x01, // Use a CUDA-based decoder if faster than dedicated engines (requires a valid vidLock object for multi-threading)
- + cudaVideoCreate_PreferDXVA = 0x02, // Go through DXVA internally if possible (requires D3D9 interop)
- + cudaVideoCreate_PreferCUVID = 0x04, // Use dedicated video engines directly
- +} cudaVideoCreateFlags;
- +
- +
- +typedef struct _CUVIDDECODECREATEINFO
- +{
- + // Decoding
- + unsigned long ulWidth; // Coded Sequence Width
- + unsigned long ulHeight; // Coded Sequence Height
- + unsigned long ulNumDecodeSurfaces; // Maximum number of internal decode surfaces
- + cudaVideoCodec CodecType; // cudaVideoCodec_XXX
- + cudaVideoChromaFormat ChromaFormat; // cudaVideoChromaFormat_XXX (only 4:2:0 is currently supported)
- + unsigned long ulCreationFlags; // Decoder creation flags (cudaVideoCreate_XXX values of the cudaVideoCreateFlags enum)
- + unsigned long Reserved1[5]; // Reserved for future use - set to zero
- + struct { // area of the frame that should be displayed
- + short left;
- + short top;
- + short right;
- + short bottom;
- + } display_area;
- + // Output format
- + cudaVideoSurfaceFormat OutputFormat; // cudaVideoSurfaceFormat_XXX
- + cudaVideoDeinterlaceMode DeinterlaceMode; // cudaVideoDeinterlaceMode_XXX
- + unsigned long ulTargetWidth; // Post-processed Output Width
- + unsigned long ulTargetHeight; // Post-processed Output Height
- + unsigned long ulNumOutputSurfaces; // Maximum number of output surfaces simultaneously mapped
- + CUvideoctxlock vidLock; // If non-NULL, context lock used for synchronizing ownership of the cuda context
- + struct { // target rectangle in the output frame (for aspect ratio conversion)
- + short left;
- + short top;
- + short right;
- + short bottom;
- + } target_rect; // if a null rectangle is specified, {0,0,ulTargetWidth,ulTargetHeight} will be used
- + unsigned long Reserved2[5]; // Reserved for future use - set to zero
- +} CUVIDDECODECREATEINFO;
- +
- +
- +////////////////////////////////////////////////////////////////////////////////////////////////
- +//
- +// H.264 Picture Parameters
- +//
- +
- +typedef struct _CUVIDH264DPBENTRY
- +{
- + int PicIdx; // picture index of reference frame
- + int FrameIdx; // frame_num(short-term) or LongTermFrameIdx(long-term)
- + int is_long_term; // 0=short term reference, 1=long term reference
- + int not_existing; // non-existing reference frame (corresponding PicIdx should be set to -1)
- + int used_for_reference; // 0=unused, 1=top_field, 2=bottom_field, 3=both_fields
- + int FieldOrderCnt[2]; // field order count of top and bottom fields
- +} CUVIDH264DPBENTRY;
- +
- +typedef struct _CUVIDH264MVCEXT
- +{
- + int num_views_minus1;
- + int view_id;
- + unsigned char inter_view_flag;
- + unsigned char num_inter_view_refs_l0;
- + unsigned char num_inter_view_refs_l1;
- + unsigned char MVCReserved8Bits;
- + int InterViewRefsL0[16];
- + int InterViewRefsL1[16];
- +} CUVIDH264MVCEXT;
- +
- +typedef struct _CUVIDH264SVCEXT
- +{
- + unsigned char profile_idc;
- + unsigned char level_idc;
- + unsigned char DQId;
- + unsigned char DQIdMax;
- + unsigned char disable_inter_layer_deblocking_filter_idc;
- + unsigned char ref_layer_chroma_phase_y_plus1;
- + signed char inter_layer_slice_alpha_c0_offset_div2;
- + signed char inter_layer_slice_beta_offset_div2;
- +
- + unsigned short DPBEntryValidFlag;
- + unsigned char inter_layer_deblocking_filter_control_present_flag;
- + unsigned char extended_spatial_scalability_idc;
- + unsigned char adaptive_tcoeff_level_prediction_flag;
- + unsigned char slice_header_restriction_flag;
- + unsigned char chroma_phase_x_plus1_flag;
- + unsigned char chroma_phase_y_plus1;
- +
- + unsigned char tcoeff_level_prediction_flag;
- + unsigned char constrained_intra_resampling_flag;
- + unsigned char ref_layer_chroma_phase_x_plus1_flag;
- + unsigned char store_ref_base_pic_flag;
- + unsigned char Reserved8BitsA;
- + unsigned char Reserved8BitsB;
- + // For the 4 scaled_ref_layer_XX fields below,
- + // if (extended_spatial_scalability_idc == 1), SPS field, G.7.3.2.1.4, add prefix "seq_"
- + // if (extended_spatial_scalability_idc == 2), SLH field, G.7.3.3.4,
- + short scaled_ref_layer_left_offset;
- + short scaled_ref_layer_top_offset;
- + short scaled_ref_layer_right_offset;
- + short scaled_ref_layer_bottom_offset;
- + unsigned short Reserved16Bits;
- + struct _CUVIDPICPARAMS *pNextLayer; // Points to the picparams for the next layer to be decoded. Linked list ends at the target layer.
- + int bRefBaseLayer; // whether to store ref base pic
- +} CUVIDH264SVCEXT;
- +
- +typedef struct _CUVIDH264PICPARAMS
- +{
- + // SPS
- + int log2_max_frame_num_minus4;
- + int pic_order_cnt_type;
- + int log2_max_pic_order_cnt_lsb_minus4;
- + int delta_pic_order_always_zero_flag;
- + int frame_mbs_only_flag;
- + int direct_8x8_inference_flag;
- + int num_ref_frames; // NOTE: shall meet level 4.1 restrictions
- + unsigned char residual_colour_transform_flag;
- + unsigned char bit_depth_luma_minus8; // Must be 0 (only 8-bit supported)
- + unsigned char bit_depth_chroma_minus8; // Must be 0 (only 8-bit supported)
- + unsigned char qpprime_y_zero_transform_bypass_flag;
- + // PPS
- + int entropy_coding_mode_flag;
- + int pic_order_present_flag;
- + int num_ref_idx_l0_active_minus1;
- + int num_ref_idx_l1_active_minus1;
- + int weighted_pred_flag;
- + int weighted_bipred_idc;
- + int pic_init_qp_minus26;
- + int deblocking_filter_control_present_flag;
- + int redundant_pic_cnt_present_flag;
- + int transform_8x8_mode_flag;
- + int MbaffFrameFlag;
- + int constrained_intra_pred_flag;
- + int chroma_qp_index_offset;
- + int second_chroma_qp_index_offset;
- + int ref_pic_flag;
- + int frame_num;
- + int CurrFieldOrderCnt[2];
- + // DPB
- + CUVIDH264DPBENTRY dpb[16]; // List of reference frames within the DPB
- + // Quantization Matrices (raster-order)
- + unsigned char WeightScale4x4[6][16];
- + unsigned char WeightScale8x8[2][64];
- + // FMO/ASO
- + unsigned char fmo_aso_enable;
- + unsigned char num_slice_groups_minus1;
- + unsigned char slice_group_map_type;
- + signed char pic_init_qs_minus26;
- + unsigned int slice_group_change_rate_minus1;
- + union
- + {
- + unsigned long long slice_group_map_addr;
- + const unsigned char *pMb2SliceGroupMap;
- + } fmo;
- + unsigned int Reserved[12];
- + // SVC/MVC
- + union
- + {
- + CUVIDH264MVCEXT mvcext;
- + CUVIDH264SVCEXT svcext;
- + };
- +} CUVIDH264PICPARAMS;
- +
- +
- +////////////////////////////////////////////////////////////////////////////////////////////////
- +//
- +// MPEG-2 Picture Parameters
- +//
- +
- +typedef struct _CUVIDMPEG2PICPARAMS
- +{
- + int ForwardRefIdx; // Picture index of forward reference (P/B-frames)
- + int BackwardRefIdx; // Picture index of backward reference (B-frames)
- + int picture_coding_type;
- + int full_pel_forward_vector;
- + int full_pel_backward_vector;
- + int f_code[2][2];
- + int intra_dc_precision;
- + int frame_pred_frame_dct;
- + int concealment_motion_vectors;
- + int q_scale_type;
- + int intra_vlc_format;
- + int alternate_scan;
- + int top_field_first;
- + // Quantization matrices (raster order)
- + unsigned char QuantMatrixIntra[64];
- + unsigned char QuantMatrixInter[64];
- +} CUVIDMPEG2PICPARAMS;
- +
- +////////////////////////////////////////////////////////////////////////////////////////////////
- +//
- +// MPEG-4 Picture Parameters
- +//
- +
- +// MPEG-4 has VOP types instead of Picture types
- +#define I_VOP 0
- +#define P_VOP 1
- +#define B_VOP 2
- +#define S_VOP 3
- +
- +typedef struct _CUVIDMPEG4PICPARAMS
- +{
- + int ForwardRefIdx; // Picture index of forward reference (P/B-frames)
- + int BackwardRefIdx; // Picture index of backward reference (B-frames)
- + // VOL
- + int video_object_layer_width;
- + int video_object_layer_height;
- + int vop_time_increment_bitcount;
- + int top_field_first;
- + int resync_marker_disable;
- + int quant_type;
- + int quarter_sample;
- + int short_video_header;
- + int divx_flags;
- + // VOP
- + int vop_coding_type;
- + int vop_coded;
- + int vop_rounding_type;
- + int alternate_vertical_scan_flag;
- + int interlaced;
- + int vop_fcode_forward;
- + int vop_fcode_backward;
- + int trd[2];
- + int trb[2];
- + // Quantization matrices (raster order)
- + unsigned char QuantMatrixIntra[64];
- + unsigned char QuantMatrixInter[64];
- + int gmc_enabled;
- +} CUVIDMPEG4PICPARAMS;
- +
- +////////////////////////////////////////////////////////////////////////////////////////////////
- +//
- +// VC1 Picture Parameters
- +//
- +
- +typedef struct _CUVIDVC1PICPARAMS
- +{
- + int ForwardRefIdx; // Picture index of forward reference (P/B-frames)
- + int BackwardRefIdx; // Picture index of backward reference (B-frames)
- + int FrameWidth; // Actual frame width
- + int FrameHeight; // Actual frame height
- + // PICTURE
- + int intra_pic_flag; // Set to 1 for I,BI frames
- + int ref_pic_flag; // Set to 1 for I,P frames
- + int progressive_fcm; // Progressive frame
- + // SEQUENCE
- + int profile;
- + int postprocflag;
- + int pulldown;
- + int interlace;
- + int tfcntrflag;
- + int finterpflag;
- + int psf;
- + int multires;
- + int syncmarker;
- + int rangered;
- + int maxbframes;
- + // ENTRYPOINT
- + int panscan_flag;
- + int refdist_flag;
- + int extended_mv;
- + int dquant;
- + int vstransform;
- + int loopfilter;
- + int fastuvmc;
- + int overlap;
- + int quantizer;
- + int extended_dmv;
- + int range_mapy_flag;
- + int range_mapy;
- + int range_mapuv_flag;
- + int range_mapuv;
- + int rangeredfrm; // range reduction state
- +} CUVIDVC1PICPARAMS;
- +
- +////////////////////////////////////////////////////////////////////////////////////////////////
- +//
- +// JPEG Picture Parameters
- +//
- +
- +typedef struct _CUVIDJPEGPICPARAMS
- +{
- + int Reserved;
- +} CUVIDJPEGPICPARAMS;
- +
- +////////////////////////////////////////////////////////////////////////////////////////////////
- +//
- +// Picture Parameters for Decoding
- +//
- +
- +typedef struct _CUVIDPICPARAMS
- +{
- + int PicWidthInMbs; // Coded frame width (presumably in macroblocks, going by the field name)
- + int FrameHeightInMbs; // Coded frame height (presumably in macroblocks, going by the field name)
- + int CurrPicIdx; // Output index of the current picture
- + int field_pic_flag; // 0=frame picture, 1=field picture
- + int bottom_field_flag; // 0=top field, 1=bottom field (ignored if field_pic_flag=0)
- + int second_field; // Second field of a complementary field pair
- + // Bitstream data
- + unsigned int nBitstreamDataLen; // Number of bytes in bitstream data buffer
- + const unsigned char *pBitstreamData; // Ptr to bitstream data for this picture (slice-layer)
- + unsigned int nNumSlices; // Number of slices in this picture
- + const unsigned int *pSliceDataOffsets; // nNumSlices entries, contains offset of each slice within the bitstream data buffer
- + int ref_pic_flag; // This picture is a reference picture
- + int intra_pic_flag; // This picture is entirely intra coded
- + unsigned int Reserved[30]; // Reserved for future use
- + // Codec-specific data: exactly one member applies per picture; CodecReserved fixes the union's minimum size at 1024 unsigned ints
- + union {
- + CUVIDMPEG2PICPARAMS mpeg2; // Also used for MPEG-1
- + CUVIDH264PICPARAMS h264;
- + CUVIDVC1PICPARAMS vc1;
- + CUVIDMPEG4PICPARAMS mpeg4;
- + CUVIDJPEGPICPARAMS jpeg;
- + unsigned int CodecReserved[1024];
- + } CodecSpecific;
- +} CUVIDPICPARAMS;
- +
- +
- +////////////////////////////////////////////////////////////////////////////////////////////////
- +//
- +// Post-processing
- +//
- +
- +typedef struct _CUVIDPROCPARAMS
- +{
- + int progressive_frame; // Input is progressive (deinterlace_mode will be ignored)
- + int second_field; // Output the second field (ignored if deinterlace mode is Weave)
- + int top_field_first; // Input frame is top field first (1st field is top, 2nd field is bottom)
- + int unpaired_field; // Input only contains one field (2nd field is invalid)
- + // The fields below are used for raw YUV input
- + unsigned int reserved_flags; // Reserved for future use (set to zero)
- + unsigned int reserved_zero; // Reserved (set to zero)
- + unsigned long long raw_input_dptr; // Input CUdeviceptr for raw YUV extensions
- + unsigned int raw_input_pitch; // pitch in bytes of raw YUV input (should be aligned appropriately)
- + unsigned int raw_input_format; // Reserved for future use (set to zero)
- + unsigned long long raw_output_dptr; // Reserved for future use (set to zero)
- + unsigned int raw_output_pitch; // Reserved for future use (set to zero)
- + unsigned int Reserved[48];
- + void *Reserved3[3];
- +} CUVIDPROCPARAMS;
- +
- +////////////////////////////////////////////////////////////////////////////////////////////////
- +//
- +// In order to minimize decode latencies, there should be always at least 2 pictures in the decode
- +// queue at any time, in order to make sure that all decode engines are always busy.
- +//
- +// Overall data flow:
- +// - cuvidCreateDecoder(...)
- +// For each picture:
- +// - cuvidDecodePicture(N)
- +// - cuvidMapVideoFrame(N-4)
- +// - do some processing in cuda
- +// - cuvidUnmapVideoFrame(N-4)
- +// - cuvidDecodePicture(N+1)
- +// - cuvidMapVideoFrame(N-3)
- +// ...
- +// - cuvidDestroyDecoder(...)
- +//
- +// NOTE:
- +// - In the current version, the cuda context MUST be created from a D3D device, using cuD3D9CtxCreate function.
- +// For multi-threaded operation, the D3D device must also be created with the D3DCREATE_MULTITHREADED flag.
- +// - There is a limit to how many pictures can be mapped simultaneously (ulNumOutputSurfaces)
- +// - cuvidDecodePicture may block the calling thread if there are too many pictures pending
- +// in the decode queue
- +//
- +////////////////////////////////////////////////////////////////////////////////////////////////
- +
- +// Create/Destroy the decoder object
- +extern CUresult CUDAAPI cuvidCreateDecoder(CUvideodecoder *phDecoder, CUVIDDECODECREATEINFO *pdci);
- +extern CUresult CUDAAPI cuvidDestroyDecoder(CUvideodecoder hDecoder);
- +
- +// Decode a single picture (field or frame)
- +extern CUresult CUDAAPI cuvidDecodePicture(CUvideodecoder hDecoder, CUVIDPICPARAMS *pPicParams);
- +
- +#if !defined(__CUVID_DEVPTR64) || defined(__CUVID_INTERNAL)
- +// Post-process and map a video frame for use in cuda
- +extern CUresult CUDAAPI cuvidMapVideoFrame(CUvideodecoder hDecoder, int nPicIdx,
- + unsigned int *pDevPtr, unsigned int *pPitch,
- + CUVIDPROCPARAMS *pVPP);
- +// Unmap a previously mapped video frame
- +extern CUresult CUDAAPI cuvidUnmapVideoFrame(CUvideodecoder hDecoder, unsigned int DevPtr);
- +#endif
- +
- +#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
- +extern CUresult CUDAAPI cuvidMapVideoFrame64(CUvideodecoder hDecoder, int nPicIdx, unsigned long long *pDevPtr,
- + unsigned int *pPitch, CUVIDPROCPARAMS *pVPP);
- +extern CUresult CUDAAPI cuvidUnmapVideoFrame64(CUvideodecoder hDecoder, unsigned long long DevPtr);
- +#if defined(__CUVID_DEVPTR64) && !defined(__CUVID_INTERNAL)
- +#define cuvidMapVideoFrame cuvidMapVideoFrame64
- +#define cuvidUnmapVideoFrame cuvidUnmapVideoFrame64
- +#endif
- +#endif
- +
- +// Get the pointer to the d3d9 surface that is the decode RT
- +extern CUresult CUDAAPI cuvidGetVideoFrameSurface(CUvideodecoder hDecoder, int nPicIdx, void **pSrcSurface);
- +
- +////////////////////////////////////////////////////////////////////////////////////////////////
- +//
- +// Context-locking: to facilitate multi-threaded implementations, the following 4 functions
- +// provide a simple mutex-style host synchronization. If a non-NULL context is specified
- +// in CUVIDDECODECREATEINFO, the codec library will acquire the mutex associated with the given
- +// context before making any cuda calls.
- +// A multi-threaded application could create a lock associated with a context handle so that
- +// multiple threads can safely share the same cuda context:
- +// - use cuCtxPopCurrent immediately after context creation in order to create a 'floating' context
- +// that can be passed to cuvidCtxLockCreate.
- +// - When using a floating context, all cuda calls should only be made within a cuvidCtxLock/cuvidCtxUnlock section.
- +//
- +// NOTE: This is a safer alternative to cuCtxPushCurrent and cuCtxPopCurrent, and is not related to video
- +// decoder in any way (implemented as a critical section associated with cuCtx{Push|Pop}Current calls).
- +
- +extern CUresult CUDAAPI cuvidCtxLockCreate(CUvideoctxlock *pLock, CUcontext ctx); // Create a lock associated with the context handle ctx
- +extern CUresult CUDAAPI cuvidCtxLockDestroy(CUvideoctxlock lck); // Destroy a lock (presumably one obtained from cuvidCtxLockCreate - confirm)
- +extern CUresult CUDAAPI cuvidCtxLock(CUvideoctxlock lck, unsigned int reserved_flags); // Enter a locked section; CCtxAutoLock passes 0 for reserved_flags
- +extern CUresult CUDAAPI cuvidCtxUnlock(CUvideoctxlock lck, unsigned int reserved_flags); // Leave a locked section; CCtxAutoLock passes 0 for reserved_flags
- +
- +////////////////////////////////////////////////////////////////////////////////////////////////
- +
- +#if defined(__cplusplus)
- +}
- +
- +// Auto-lock helper for C++ applications: locks in the constructor, unlocks in the destructor
- +class CCtxAutoLock // NOTE(review): no copy constructor/assignment is declared, so copies are possible and each copy's destructor also calls cuvidCtxUnlock
- +{
- +private:
- + CUvideoctxlock m_ctx; // Lock handle given to the constructor, reused by the destructor
- +public:
- + CCtxAutoLock(CUvideoctxlock ctx):m_ctx(ctx) { cuvidCtxLock(m_ctx,0); } // Calls cuvidCtxLock; the returned CUresult is not checked
- + ~CCtxAutoLock() { cuvidCtxUnlock(m_ctx,0); } // Calls cuvidCtxUnlock on the same handle; the returned CUresult is not checked
- +};
- +
- +#endif /* __cplusplus */
- +
- +#endif // __CUDA_VIDEO_H__
- diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/nvcuvid.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/nvcuvid.h
- new file mode 100644
- index 0000000..0b81ee4
- --- /dev/null
- +++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Cuda/nvcuvid.h
- @@ -0,0 +1,228 @@
- +/*
- + * Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
- + *
- + * NOTICE TO USER:
- + *
- + * This source code is subject to NVIDIA ownership rights under U.S. and
- + * international Copyright laws. Users and possessors of this source code
- + * are hereby granted a nonexclusive, royalty-free license to use this code
- + * in individual and commercial software.
- + *
- + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
- + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
- + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
- + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
- + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
- + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
- + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
- + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
- + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
- + * OR PERFORMANCE OF THIS SOURCE CODE.
- + *
- + * U.S. Government End Users. This source code is a "commercial item" as
- + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
- + * "commercial computer software" and "commercial computer software
- + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
- + * and is provided to the U.S. Government only as a commercial end item.
- + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
- + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
- + * source code with only those rights set forth herein.
- + *
- + * Any use of this source code in individual and commercial software must
- + * include, in the user documentation and internal comments to the code,
- + * the above Disclaimer and U.S. Government End Users Notice.
- + */
- +
- +#if !defined(__NVCUVID_H__)
- +#define __NVCUVID_H__
- +
- +#include "cuviddec.h"
- +
- +#if defined(__cplusplus)
- +extern "C" {
- +#endif /* __cplusplus */
- +
- +////////////////////////////////////////////////////////////////////////////////////////////////
- +//
- +// High-level helper APIs for video sources
- +//
- +
- +typedef void *CUvideosource; // Video source handle (see cuvidCreateVideoSource)
- +typedef void *CUvideoparser; // Video parser handle (see cuvidCreateVideoParser)
- +typedef long long CUvideotimestamp; // Timestamp type used by CUVIDSOURCEDATAPACKET and CUVIDPARSERDISPINFO
- +
- +////////////////////////////////////////////////////////////////////////////////////////////////
- +//
- +// video data structures
- +//
- +
- +// Video Source State (argument of cuvidSetVideoSourceState, return type of cuvidGetVideoSourceState)
- +typedef enum {
- + cudaVideoState_Error = -1, // Error state (invalid source)
- + cudaVideoState_Stopped = 0, // Source is stopped (or reached end-of-stream)
- + cudaVideoState_Started = 1, // Source is running and delivering data
- +} cudaVideoState;
- +
- +// Audio compression format (type of CUAUDIOFORMAT::codec)
- +typedef enum {
- + cudaAudioCodec_MPEG1=0, // MPEG-1 Audio
- + cudaAudioCodec_MPEG2, // MPEG-2 Audio
- + cudaAudioCodec_MP3, // MPEG-1 Layer III Audio
- + cudaAudioCodec_AC3, // Dolby Digital (AC3) Audio
- + cudaAudioCodec_LPCM, // PCM Audio
- +} cudaAudioCodec;
- +
- +
- +// Video format (argument type of PFNVIDSEQUENCECALLBACK and cuvidGetSourceVideoFormat)
- +typedef struct
- +{
- + cudaVideoCodec codec; // Compression format
- + struct {
- + unsigned int numerator; // frame rate numerator (0 = unspecified or variable frame rate)
- + unsigned int denominator; // frame rate denominator (0 = unspecified or variable frame rate)
- + } frame_rate; // frame rate = numerator / denominator (for example: 30000/1001)
- + int progressive_sequence; // 0=interlaced, 1=progressive
- + unsigned int coded_width; // coded frame width
- + unsigned int coded_height; // coded frame height
- + struct { // area of the frame that should be displayed
- + int left; // typical example:
- + int top; // coded_width = 1920, coded_height = 1088
- + int right; // display_area = { 0,0,1920,1080 }
- + int bottom; // (initializer order in the example above: left, top, right, bottom)
- + } display_area;
- + cudaVideoChromaFormat chroma_format; // Chroma format
- + unsigned int bitrate; // video bitrate (bps, 0=unknown)
- + struct { // Display Aspect Ratio = x:y (4:3, 16:9, etc)
- + int x; // first term of the ratio (e.g. 16 in 16:9)
- + int y; // second term of the ratio (e.g. 9 in 16:9)
- + } display_aspect_ratio;
- + struct { // NOTE(review): members presumably mirror the bitstream's video signal description fields - confirm against the codec specification
- + unsigned char video_format;
- + unsigned char color_primaries;
- + unsigned char transfer_characteristics;
- + unsigned char matrix_coefficients;
- + } video_signal_description;
- + unsigned int seqhdr_data_length; // Additional bytes following (CUVIDEOFORMATEX); presumably the used length of raw_seqhdr_data - confirm
- +} CUVIDEOFORMAT;
- +
- +// Video format including raw sequence header information
- +typedef struct
- +{
- + CUVIDEOFORMAT format; // Base format description (first member)
- + unsigned char raw_seqhdr_data[1024]; // Raw sequence header bytes, at most 1024; valid length presumably given by format.seqhdr_data_length - confirm
- +} CUVIDEOFORMATEX;
- +
- +
- +// Audio Format (argument type of cuvidGetSourceAudioFormat)
- +typedef struct
- +{
- + cudaAudioCodec codec; // Compression format
- + unsigned int channels; // number of audio channels
- + unsigned int samplespersec; // sampling frequency
- + unsigned int bitrate; // For uncompressed, can also be used to determine bits per sample
- + unsigned int reserved1; // Reserved (no documented use in this header)
- + unsigned int reserved2; // Reserved (no documented use in this header)
- +} CUAUDIOFORMAT;
- +
- +
- +
- +////////////////////////////////////////////////////////////////////////////////////////////////
- +//
- +// video source
- +//
- +
- +// Data packet flags (distinct bit values, combined in CUVIDSOURCEDATAPACKET::flags)
- +typedef enum {
- + CUVID_PKT_ENDOFSTREAM = 0x01, // Set when this is the last packet for this stream
- + CUVID_PKT_TIMESTAMP = 0x02, // Timestamp is valid
- + CUVID_PKT_DISCONTINUITY = 0x04, // Set when a discontinuity has to be signalled
- +} CUvideopacketflags;
- +
- +typedef struct _CUVIDSOURCEDATAPACKET // Data packet (argument of PFNVIDSOURCECALLBACK and cuvidParseVideoData)
- +{ // NOTE(review): 'unsigned long' is 4 bytes on Windows but 8 bytes on LP64 platforms - confirm the layout matches the library being linked
- + unsigned long flags; // Combination of CUVID_PKT_XXX flags
- + unsigned long payload_size; // number of bytes in the payload (may be zero if EOS flag is set)
- + const unsigned char *payload; // Pointer to packet payload data (may be NULL if EOS flag is set)
- + CUvideotimestamp timestamp; // Presentation timestamp (10MHz clock by default, see ulClockRate), only valid if CUVID_PKT_TIMESTAMP flag is set
- +} CUVIDSOURCEDATAPACKET;
- +
- +// Callback for packet delivery (the void* argument is presumably CUVIDSOURCEPARAMS::pUserData - confirm)
- +typedef int (CUDAAPI *PFNVIDSOURCECALLBACK)(void *, CUVIDSOURCEDATAPACKET *);
- +
- +typedef struct _CUVIDSOURCEPARAMS // Parameters for cuvidCreateVideoSource / cuvidCreateVideoSourceW
- +{
- + unsigned int ulClockRate; // Timestamp units in Hz (0=default=10000000Hz)
- + unsigned int uReserved1[7]; // Reserved for future use - set to zero
- + void *pUserData; // Parameter passed in to the data handlers
- + PFNVIDSOURCECALLBACK pfnVideoDataHandler; // Called to deliver video packets
- + PFNVIDSOURCECALLBACK pfnAudioDataHandler; // Called to deliver audio packets
- + void *pvReserved2[8]; // Reserved for future use - set to NULL
- +} CUVIDSOURCEPARAMS;
- +
- +typedef enum { // presumably values for the 'flags' argument of cuvidGetSourceVideoFormat - confirm
- + CUVID_FMT_EXTFORMATINFO = 0x100, // Return extended format structure (CUVIDEOFORMATEX)
- +} CUvideosourceformat_flags;
- +
- +#if !defined(__APPLE__)
- +// Video file source (these functions are not declared when __APPLE__ is defined)
- +CUresult CUDAAPI cuvidCreateVideoSource(CUvideosource *pObj, const char *pszFileName, CUVIDSOURCEPARAMS *pParams); // file name given as a char string
- +CUresult CUDAAPI cuvidCreateVideoSourceW(CUvideosource *pObj, const wchar_t *pwszFileName, CUVIDSOURCEPARAMS *pParams); // file name given as a wchar_t string
- +CUresult CUDAAPI cuvidDestroyVideoSource(CUvideosource obj);
- +CUresult CUDAAPI cuvidSetVideoSourceState(CUvideosource obj, cudaVideoState state); // state: one of the cudaVideoState_XXX values
- +cudaVideoState CUDAAPI cuvidGetVideoSourceState(CUvideosource obj); // returns a cudaVideoState, not a CUresult
- +CUresult CUDAAPI cuvidGetSourceVideoFormat(CUvideosource obj, CUVIDEOFORMAT *pvidfmt, unsigned int flags); // flags: see CUVID_FMT_EXTFORMATINFO
- +CUresult CUDAAPI cuvidGetSourceAudioFormat(CUvideosource obj, CUAUDIOFORMAT *paudfmt, unsigned int flags);
- +#endif // !__APPLE__
- +
- +////////////////////////////////////////////////////////////////////////////////////////////////
- +//
- +// Video parser
- +//
- +
- +typedef struct _CUVIDPARSERDISPINFO // Display information passed to PFNVIDDISPLAYCALLBACK
- +{
- + int picture_index; // presumably the index to use as nPicIdx when mapping the frame - confirm
- + int progressive_frame;
- + int top_field_first;
- + int repeat_first_field; // Number of additional fields (1=ivtc, 2=frame doubling, 4=frame tripling, -1=unpaired field)
- + CUvideotimestamp timestamp; // presumably in the units selected by CUVIDPARSERPARAMS::ulClockRate - confirm
- +} CUVIDPARSERDISPINFO;
- +
- +//
- +// Parser callbacks
- +// The parser will call these synchronously from within cuvidParseVideoData(), whenever a picture is ready to
- +// be decoded and/or displayed.
- +//
- +typedef int (CUDAAPI *PFNVIDSEQUENCECALLBACK)(void *, CUVIDEOFORMAT *); // type of CUVIDPARSERPARAMS::pfnSequenceCallback
- +typedef int (CUDAAPI *PFNVIDDECODECALLBACK)(void *, CUVIDPICPARAMS *); // type of CUVIDPARSERPARAMS::pfnDecodePicture
- +typedef int (CUDAAPI *PFNVIDDISPLAYCALLBACK)(void *, CUVIDPARSERDISPINFO *); // type of CUVIDPARSERPARAMS::pfnDisplayPicture
- +
- +typedef struct _CUVIDPARSERPARAMS // Parameters for cuvidCreateVideoParser
- +{
- + cudaVideoCodec CodecType; // cudaVideoCodec_XXX
- + unsigned int ulMaxNumDecodeSurfaces; // Max # of decode surfaces (parser will cycle through these)
- + unsigned int ulClockRate; // Timestamp units in Hz (0=default=10000000Hz)
- + unsigned int ulErrorThreshold; // % Error threshold (0-100) for calling pfnDecodePicture (100=always call pfnDecodePicture even if picture bitstream is fully corrupted)
- + unsigned int ulMaxDisplayDelay; // Max display queue delay (improves pipelining of decode with display) - 0=no delay (recommended values: 2..4)
- + unsigned int uReserved1[5]; // Reserved for future use - set to 0
- + void *pUserData; // User data for callbacks
- + PFNVIDSEQUENCECALLBACK pfnSequenceCallback; // Called before decoding frames and/or whenever there is a format change
- + PFNVIDDECODECALLBACK pfnDecodePicture; // Called when a picture is ready to be decoded (decode order)
- + PFNVIDDISPLAYCALLBACK pfnDisplayPicture; // Called whenever a picture is ready to be displayed (display order)
- + void *pvReserved2[7]; // Reserved for future use - set to NULL
- + CUVIDEOFORMATEX *pExtVideoInfo; // [Optional] sequence header data from system layer
- +} CUVIDPARSERPARAMS;
- +
- +
- +CUresult CUDAAPI cuvidCreateVideoParser(CUvideoparser *pObj, CUVIDPARSERPARAMS *pParams); // presumably stores the new parser handle in *pObj - confirm
- +CUresult CUDAAPI cuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET *pPacket); // the parser callbacks are called synchronously from within this call
- +CUresult CUDAAPI cuvidDestroyVideoParser(CUvideoparser obj);
- +
- +
- +////////////////////////////////////////////////////////////////////////////////////////////////
- +
- +#if defined(__cplusplus)
- +}
- +#endif /* __cplusplus */
- +
- +#endif // __NVCUVID_H__
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement